diff --git a/.agents/rules/rules.md b/.agents/rules/rules.md deleted file mode 100644 index 594d63f..0000000 --- a/.agents/rules/rules.md +++ /dev/null @@ -1,60 +0,0 @@ -# Spector — Agent Rules - -## Project Identity - -Spector is a **Java 25** vector search engine with biologically-inspired cognitive memory, built on Panama FFM, SIMD Vector API, and virtual threads. 22-module Maven reactor. - -## Critical Constraints - -- **JDK 25** with `jdk.incubator.vector` -- **NEVER** use `synchronized` — always `ReentrantLock` (virtual thread pinning) -- **NEVER** `System.out.println` — use SLF4J `LoggerFactory.getLogger()` -- **NEVER** hardcode SIMD lane widths — use `FloatVector.SPECIES_PREFERRED` -- **NEVER** commit secrets/tokens to repo -- `.spector/` is in `.gitignore` — never remove - -## Architecture Boundaries - -| Layer | Modules | Depends On | -|---|---|---| -| Foundation | core, commons, config, storage | Each other only | -| Embedding | embed-api, embed-ollama | commons | -| Search | index, query, gpu | Foundation + embed-api | -| Intelligence | rag, engine, ingestion, memory | Search + Foundation | -| Runtime | runtime, node, mcp, cli, client | Intelligence | -| Infrastructure | metrics, bench, dist, spring | Any | - -**`spector-memory` and `spector-engine` are independent peers — never depend on each other.** Wired only at `SpectorRuntime`. 
- -## Directory Paths - -- Engine: `.spector/index/` — Memory: `.spector/memory/` — WAL: `.spector/memory/wal/` -- Source of truth: `SpectorConfigFactory.java` - -## Git Conventions - -- Format: `<type>(<scope>): <description>` (Conventional Commits) -- Types: `feat`, `fix`, `perf`, `refactor`, `docs`, `test`, `build`, `chore` -- Scope: module name without `spector-` (e.g., `engine`, `memory`) -- Commit order: foundation → search → intelligence → runtime → docs → tests -- Branch: `feat/desc`, `fix/desc`, `perf/desc`, `docs/desc` - -## Documentation - -- MkDocs Material site in `docs/`, build: `python -m mkdocs build --clean` -- Module READMEs included via `--8<--` snippets in `docs/docs/modules/` -- Binary layouts: RFC-style wire format diagrams -- Design source of truth: `spector-memory/RnD/` for memory subsystem -- Config docs: `docs/docs/configuration/parameters.md` - -## Key Patterns - -- Records for immutable data (`PersistenceFiles`, `NodeInfo`, `SearchResult`) -- Builder pattern for configs (`SpectorConfig.builder()`, `SpectorEngine.builder()`) -- Abstract Factory for component assembly (`EngineComponentFactory`) -- `IngestionTarget` interface — both engine and memory implement their own -- `AutoCloseable` for anything holding native resources - -## Skills Reference - -Detailed coding standards, code review process, and other skills are defined in `.agents/skills/`. Agents should read the relevant SKILL.md before performing specialized tasks. diff --git a/.agents/skills/code-review/SKILL.md b/.agents/skills/code-review/SKILL.md deleted file mode 100644 index 20fd33a..0000000 --- a/.agents/skills/code-review/SKILL.md +++ /dev/null @@ -1,141 +0,0 @@ -# Skill: Code Review - -This skill defines the code review process for the Spector project. Use this when reviewing PRs, inspecting diffs, or performing pre-commit quality checks. 
- -## Trigger - -This skill is triggered when: -- The user requests a code review of changes or a PR -- The user asks to "review", "check", or "audit" code -- Before merging or pushing significant changes -- As part of the PR Review workflow - -## Instructions - -### Step 1: Scope the Review - -Identify what changed: -```bash -git diff --stat # unstaged changes -git diff --cached --stat # staged changes -git diff main...HEAD --stat # full PR diff vs main -``` - -Categorize changes by module and risk level: -- **High risk:** core, index, storage (SIMD, Panama, hot paths) -- **Medium risk:** engine, memory, query (business logic) -- **Low risk:** docs, bench, scripts (non-production) - -### Step 2: Architecture Check - -For each changed module, verify module boundary rules: - -- [ ] No Foundation module depending on Intelligence/Runtime -- [ ] `spector-memory` does NOT import from `spector-engine` (or vice versa) -- [ ] No new circular dependencies between modules -- [ ] New dependencies added to correct POM section -- [ ] If a new module was added, it follows the layer hierarchy - -**Quick check command:** -```bash -# Find cross-module imports that violate boundaries -grep -rn "import com.spectrayan.spector.engine" spector-memory/src/ -grep -rn "import com.spectrayan.spector.memory" spector-engine/src/ -``` - -### Step 3: Java Standards Compliance - -For each changed `.java` file, check: - -**Hard blockers (must fix before merge):** -- [ ] No `synchronized` keyword anywhere — must use `ReentrantLock` -- [ ] No `System.out.println` — must use SLF4J logger -- [ ] No hardcoded SIMD lane widths — must use `SPECIES_PREFERRED` -- [ ] No `Thread.sleep()` in production — use `LockSupport.parkNanos()` -- [ ] No swallowed exceptions (`catch (Exception e) { }`) -- [ ] No hardcoded file paths — use `SpectorConfig` / `PersistenceFiles` -- [ ] `AutoCloseable` implemented for classes holding native resources - -**Quality checks (should fix):** -- [ ] Javadoc on all new public 
classes and methods -- [ ] Records used for immutable data holders (not mutable POJOs) -- [ ] Pattern matching used instead of cast-after-instanceof -- [ ] Section separators (`// ───── Section ─────`) for class organization -- [ ] Meaningful variable names (not single letters except loop vars) - -### Step 4: Performance Review (core, index, storage only) - -For changes in hot-path modules: - -- [ ] No `new float[]` or boxing in search/similarity paths -- [ ] `MemorySegment` slices used instead of `.toArray()` copies -- [ ] SIMD loops use `SPECIES.loopBound()` with scalar tail -- [ ] `VectorMask` used for tail handling (not branching) -- [ ] Arena lifecycle correct: `ofShared()` for concurrent, `ofConfined()` for single-thread -- [ ] JMH benchmark included for performance-sensitive changes - -### Step 5: Test Coverage - -- [ ] New public API methods have at least 1 test -- [ ] Tests use `@TempDir` for file operations (never hardcoded paths) -- [ ] Tests use AssertJ assertions (`assertThat`), not JUnit `assertEquals` -- [ ] Integration tests suffixed `*IntegrationTest` -- [ ] No test relies on external services without `@DisabledIfEnvironmentVariable` - -**Quick gap check:** -```bash -# List production files without corresponding test files -for file in $(git diff --name-only --diff-filter=A | grep "src/main.*\.java$"); do - test_file=$(echo $file | sed 's|src/main|src/test|' | sed 's|\.java$|Test.java|') - [ ! 
-f "$test_file" ] && echo "MISSING TEST: $test_file" -done -``` - -### Step 6: Documentation - -- [ ] README.md updated if public API changed -- [ ] `docs/docs/configuration/parameters.md` updated if config defaults changed -- [ ] Design docs updated if binary layouts or WAL format changed -- [ ] `mkdocs build --clean` produces no new warnings -- [ ] Mermaid diagrams have valid syntax - -### Step 7: Git Hygiene - -- [ ] Commit messages follow Conventional Commits: `(): ` -- [ ] No secrets, API keys, or credentials in diff -- [ ] No `.spector/` data files committed -- [ ] No generated files (`.class`, `target/`, `site/`) committed -- [ ] Commits are logically grouped (not one mega-commit) - -### Step 8: Generate Review Summary - -After completing all checks, produce a structured summary: - -```markdown -## Code Review Summary - -**Scope:** {N} files across {M} modules -**Risk:** High / Medium / Low - -### ✅ Passed -- Architecture boundaries respected -- No synchronized/System.out violations -- Tests added for new APIs - -### ⚠️ Warnings -- Missing Javadoc on `NewClass.process()` (line 42) -- No JMH benchmark for SIMD optimization - -### ❌ Blockers -- `synchronized` used in MemoryWal.java:156 — must use ReentrantLock -- Missing test for `ShardedDiskHnswWriter.write()` - -### Verdict: APPROVE / REQUEST_CHANGES / NEEDS_DISCUSSION -``` - -## Verification - -After the review is complete: -1. All blockers must be resolved before merge -2. Warnings should be addressed or documented as tech debt -3. Review summary should be attached to the PR or provided to the user diff --git a/.agents/skills/coding-standards/SKILL.md b/.agents/skills/coding-standards/SKILL.md deleted file mode 100644 index c093457..0000000 --- a/.agents/skills/coding-standards/SKILL.md +++ /dev/null @@ -1,330 +0,0 @@ -# Skill: Coding Standards Reference - -This skill provides the comprehensive coding standards for the Spector project. 
Agents should reference this document when writing or reviewing Java code in any `spector-*` module. - -## Trigger - -Reference this document when: -- Writing new Java classes or methods -- Reviewing code changes for standards compliance -- Creating new modules or packages -- Adding or auditing exception handling -- Creating new ErrorCode constants or SpectorException subclasses -- Resolving code style disagreements - ---- - -## Java Language (JDK 25) - -### Modern Features — Required - -| Feature | Usage | Example | -|---|---|---| -| **Records** | All immutable data holders | `public record NodeInfo(String id, int port) {}` | -| **Sealed classes** | Closed type hierarchies | `sealed interface VectorIndex permits HnswIndex, BruteForceIndex` | -| **Pattern matching** | `instanceof` checks | `if (index instanceof AbstractHnswIndex hnsw && hnsw.size() > 0)` | -| **Switch expressions** | Exhaustive matching | `return switch (mode) { case SEARCH -> engine; case MEMORY -> memory; };` | -| **`var`** | Local variables when RHS type is obvious | `var config = SpectorConfig.DEFAULT.withDimensions(384);` | -| **Text blocks** | Multi-line strings | `"""SELECT * FROM ..."""` | - -### Concurrency — Virtual Thread Safety - -```java -// ✅ CORRECT — ReentrantLock (virtual-thread safe) -private final ReentrantLock lock = new ReentrantLock(); -public void write(byte[] data) { - lock.lock(); - try { /* critical section */ } - finally { lock.unlock(); } -} - -// ❌ WRONG — synchronized pins virtual threads to carrier -public synchronized void write(byte[] data) { /* ... 
 */ } -``` - -- Use `ReentrantLock` for all mutual exclusion -- Use `ReentrantReadWriteLock` when read-heavy -- Use `AtomicReference`, `AtomicInteger` for simple counters -- Use `LockSupport.parkNanos()` instead of `Thread.sleep()` -- Use `ConcurrentHashMap` over `Collections.synchronizedMap()` - -### Panama FFM (Foreign Function & Memory) - -```java -// ✅ Shared arena for concurrent access -try (Arena arena = Arena.ofShared()) { - MemorySegment segment = arena.allocate(ValueLayout.JAVA_FLOAT, capacity); - segment.set(ValueLayout.JAVA_FLOAT, offset, value); -} - -// ✅ Zero-copy slice -MemorySegment slice = segment.asSlice(offset, length); - -// ❌ WRONG — copying to float[] in hot path -float[] copy = segment.toArray(ValueLayout.JAVA_FLOAT); // heap allocation! -``` - -- `Arena.ofShared()` for concurrent access across threads -- `Arena.ofConfined()` for single-thread operations -- Prefer `MemorySegment` slices over array copies -- Use `ValueLayout.JAVA_FLOAT` (not `JAVA_FLOAT_UNALIGNED`) when alignment is guaranteed - -### SIMD (Vector API) - -```java -// ✅ CORRECT — species-agnostic -static final VectorSpecies<Float> SPECIES = FloatVector.SPECIES_PREFERRED; - -public static float dotProduct(float[] a, float[] b) { - int i = 0; - FloatVector sum = FloatVector.zero(SPECIES); - int bound = SPECIES.loopBound(a.length); - for (; i < bound; i += SPECIES.length()) { - var va = FloatVector.fromArray(SPECIES, a, i); - var vb = FloatVector.fromArray(SPECIES, b, i); - sum = va.fma(vb, sum); - } - float result = sum.reduceLanes(VectorOperators.ADD); - for (; i < a.length; i++) result += a[i] * b[i]; // scalar tail - return result; -} - -// ❌ WRONG — hardcoded lane width -static final VectorSpecies<Float> SPECIES = FloatVector.SPECIES_256; -``` - ---- - -## Naming Conventions - -| Element | Convention | Example | -|---|---|---| -| Module directory | `spector-{name}` | `spector-memory` | -| Package | `com.spectrayan.spector.{name}` | `com.spectrayan.spector.memory.sync` | -| Class | 
PascalCase, descriptive | `MemoryWal`, `CognitiveRecordLayout` | -| Interface | PascalCase, noun/adjective | `VectorIndex`, `IngestionTarget` | -| Constants | `UPPER_SNAKE_CASE` | `HEADER_MAGIC`, `DEFAULT_CAPACITY` | -| Methods | camelCase, verb-first | `resolveIndex()`, `ingestChunked()` | -| Test class | `{ClassName}Test` | `MemoryWalTest` | -| Integration test | `{Name}IntegrationTest` | `SpectorMemoryIntegrationTest` | -| Builder | Static inner `Builder` class | `SpectorEngine.builder()` | -| Factory | `{Name}Factory` | `EngineComponentFactory` | - ---- - -## Class Structure Template - -```java -package com.spectrayan.spector.{module}; - -import ...; - -/** - * Brief description of what this class does. - * - *

 <p>Detailed explanation of design decisions, usage patterns, - * and relationship to other classes.</p> - * - * <h3>Design Patterns</h3> - * <ul> - *   <li>Pattern — explanation</li> - * </ul>
- * - * @see RelatedClass - */ -public class MyClass implements AutoCloseable { - - private static final Logger log = LoggerFactory.getLogger(MyClass.class); - - // ─────────────── Constants ─────────────── - private static final int DEFAULT_CAPACITY = 1000; - - // ─────────────── Fields ─────────────── - private final SpectorConfig config; - private final ReentrantLock lock = new ReentrantLock(); - private volatile boolean closed; - - // ─────────────── Construction ─────────────── - public MyClass(SpectorConfig config) { ... } - - // ─────────────── Public API ─────────────── - public void doWork() { ... } - - // ─────────────── Internal ─────────────── - private void helper() { ... } - - // ─────────────── Lifecycle ─────────────── - @Override - public void close() { ... } -} -``` - -**Section separators:** Use `// ─────────────── Section ───────────────` for visual grouping. - ---- - -## Performance Rules (core, index, storage) - -| Rule | Detail | -|---|---| -| **No allocations in hot paths** | Reuse buffers, use offset+length APIs, avoid boxing | -| **Zero-copy** | `MemorySegment` slices, never copy to `float[]` in search | -| **Branchless SIMD** | `VectorMask` for tail handling, minimize scalar fallback | -| **Benchmark gate** | Performance PRs must include JMH before/after results | -| **Profile first** | Use JFR/async-profiler before optimizing | - ---- - -## Error Handling — SpectorException Framework - -Spector uses a structured error framework based on `ErrorCode` + `SpectorException`. **Never throw generic exceptions.** All errors go through this system. 
- -### Core Architecture - -``` -ErrorCode (enum) — Central registry of SPE-XXX-YYY codes with {} message templates - ↓ -SpectorException (abstract) — Base class, stores ErrorCode, formats message via errorCode.format(args) - ├── SpectorValidationException (SPE-100-xxx) - ├── SpectorConfigException (SPE-110-xxx) - ├── SpectorIndexException (SPE-200-xxx) - ├── SpectorStorageException (SPE-210-xxx) - ├── SpectorEmbeddingException (SPE-300-xxx) - ├── SpectorMemoryException (SPE-310-xxx) - │ ├── SpectorGraphException (SPE-310-006..011) - │ │ ├── SpectorHebbianException (SPE-310-006) - │ │ ├── SpectorTemporalChainException (SPE-310-007) - │ │ ├── SpectorEntityGraphException (SPE-310-008) - │ │ ├── SpectorCoActivationException (SPE-310-009) - │ │ ├── SpectorGraphPersistenceException(SPE-310-010) - │ │ └── SpectorGraphDecayException (SPE-310-011) - │ ├── SpectorMemoryRecallException (SPE-310-002) - │ ├── SpectorMemoryConsolidationException (SPE-310-003) - │ └── SpectorMemoryTierFullException (SPE-310-001) - ├── SpectorGpuException (SPE-400-xxx) - ├── SpectorServerException (SPE-500-xxx) - ├── SpectorClientException (SPE-510-xxx) - ├── SpectorIngestionException (SPE-600-xxx) - ├── SpectorClusterException (SPE-700-xxx) - └── SpectorInternalException (SPE-900-xxx) -``` - -**Key files:** -- `spector-commons/src/main/java/com/spectrayan/spector/commons/error/ErrorCode.java` -- `spector-commons/src/main/java/com/spectrayan/spector/commons/error/SpectorException.java` -- Module-specific errors: `spector-{module}/src/main/java/.../error/` - -### Adding a New Error Code - -1. Add the constant to `ErrorCode.java` under the correct category section: - -```java -// In ErrorCode.java — under the correct category section -/** Brief description of when this error occurs. 
*/ -MY_OPERATION_FAILED (310_012, ErrorCategory.MEMORY, - "My operation failed for {}: {}"), -``` - -- Code format: `{category_prefix}_{sequence}` → e.g., `310_012` → `SPE-310-012` -- Message template uses `{}` placeholders (SLF4J-style) -- **Codes are immutable once assigned — never reuse or renumber** - -### Creating a Granular Exception - -Create a domain-specific exception that binds a default `ErrorCode` and captures typed context: - -```java -// ✅ CORRECT — granular exception with typed constructor -public class SpectorHebbianException extends SpectorGraphException { - - private final String operation; - - public SpectorHebbianException(String operation) { - super(ErrorCode.GRAPH_HEBBIAN_FAILED, operation); // format via errorCode.format(args) - this.operation = operation; - } - - public SpectorHebbianException(String operation, Throwable cause) { - super(ErrorCode.GRAPH_HEBBIAN_FAILED, cause, operation); - this.operation = operation; - } - - public String getOperation() { return operation; } -} -``` - -**Rules:** -- Constructor args map 1:1 to `{}` placeholders in the ErrorCode template -- **No string concatenation at construction site** — formatting happens inside `SpectorException` via `errorCode.format(args)` -- Store domain-specific context as fields (e.g., `operation`, `path`, `graphType`) -- Follow the naming pattern: `Spector{Domain}Exception` - -### Throw Sites — Throwing Exceptions - -```java -// ✅ CORRECT — typed exception, no string concatenation -throw new SpectorGraphPersistenceException("HebbianGraph", filePath, e); -// getMessage() → "[SPE-310-010] Graph persistence failed for HebbianGraph: /path/to/file" - -// ✅ CORRECT — ErrorCode with typed args -throw new SpectorValidationException(ErrorCode.ARGUMENT_NULL, "listener"); -// getMessage() → "[SPE-100-007] listener must not be null" - -// ❌ WRONG — string concatenation at throw site -throw new SpectorGraphException(ErrorCode.GRAPH_PERSISTENCE_FAILED, e, - "HebbianGraph save to " + filePath + " 
failed: " + e.getMessage()); - -// ❌ WRONG — generic exception -throw new RuntimeException("something failed"); - -// ❌ WRONG — UncheckedIOException (use SpectorException subtypes) -throw new UncheckedIOException("write failed", e); -``` - -### Catch Sites — Graceful Degradation - -For enrichment steps that should **not** crash the main pipeline: - -```java -// ✅ CORRECT — create exception for formatted message, log, continue -} catch (RuntimeException e) { - SpectorHebbianException ex = new SpectorHebbianException("edge strengthening", e); - log.warn(ex.getMessage()); -} - -// ✅ CORRECT — catch and rethrow as domain exception -} catch (IOException e) { - throw new SpectorGraphPersistenceException("EntityGraph", filePath, e); -} - -// ❌ WRONG — catch generic Exception -} catch (Exception e) { ... } - -// ❌ WRONG — ErrorCode.format() with string concatenation at call site -log.warn(ErrorCode.GRAPH_HEBBIAN_FAILED.format( - "edge strengthening for '" + id + "': " + e.getMessage())); - -// ❌ WRONG — swallowing exceptions -} catch (Exception e) { /* ignored */ } -``` - -### Pattern Summary - -| Scenario | Pattern | -|---|---| -| **New domain error** | Add `ErrorCode` constant → create `Spector{Domain}Exception` subclass | -| **Throw on failure** | `throw new Spector{Domain}Exception(args, cause)` | -| **Graceful degradation** | `catch(RuntimeException) → new Exception(args, e) → log(ex.getMessage())` | -| **IO failure** | `catch(IOException) → throw new SpectorGraphPersistenceException(type, path, e)` | -| **Validation** | `throw new SpectorValidationException(ErrorCode.ARGUMENT_NULL, "paramName")` | -| **Never** | `catch(Exception)`, `throw new RuntimeException()`, string concat in ErrorCode.format() | - ---- - -## Testing Standards - -- **Framework:** JUnit 5 + AssertJ only (never JUnit 4 or Hamcrest) -- **File tests:** Always use `@TempDir`, never hardcode paths -- **Assertions:** Fluent AssertJ: `assertThat(x).isEqualTo(y)`, never `assertEquals` -- **Naming:** Test 
methods describe behavior: `walRecovery_truncatesIncompleteRecord()` -- **Coverage:** All new public API methods require at least 1 test diff --git a/.agents/skills/doc-sync/SKILL.md b/.agents/skills/doc-sync/SKILL.md deleted file mode 100644 index 86fff6d..0000000 --- a/.agents/skills/doc-sync/SKILL.md +++ /dev/null @@ -1,92 +0,0 @@ -# Skill: Documentation Sync - -This skill ensures MkDocs site, module READMEs, configuration docs, and design documents stay in sync with production code changes. - -## Trigger - -This skill is triggered when: -- Production code changes public API, configuration defaults, or directory paths -- A new module is created or an existing one is renamed/deleted -- The user requests "update docs", "sync docs", or "fix doc warnings" -- As part of the Feature Development or Module Lifecycle workflows - -## Instructions - -### Step 1: Identify What Changed - -```bash -git diff --name-only HEAD~1 # or appropriate range -``` - -Map changed files to documentation impact: - -| Code Change | Docs to Update | -|---|---| -| `SpectorConfig` / `SpectorProperties` | `docs/docs/configuration/parameters.md`, `spector-defaults.yml` | -| `SpectorConfigFactory` (path defaults) | All docs referencing `.spector/` paths | -| Public API in any module | `spector-{mod}/README.md` | -| `MemoryWal`, `CognitiveRecordLayout` | `docs/docs/memory/wal-design.md` | -| New module created | `docs/mkdocs.yml`, `docs/docs/modules/index.md`, new module page | -| Module removed | `docs/mkdocs.yml`, `docs/docs/modules/index.md`, delete page | -| POM dependency changes | `docs/docs/modules/index.md` (dependency graph) | - -### Step 2: Update Module Docs - -For each module with changed public API: - -1. Update `spector-{module}/README.md` (auto-included in docs site via `--8<--`) -2. Ensure `docs/docs/modules/spector-{module}.md` exists with snippet include: - ```markdown - --8<-- "spector-{module}/README.md" - ``` -3. 
Ensure nav entry exists in `docs/mkdocs.yml` under `Modules:` - -### Step 3: Update Config Docs - -If `SpectorConfig`, `SpectorProperties`, or `SpectorConfigFactory` changed: - -1. Extract current defaults from `SpectorConfigFactory.java` (source of truth) -2. Update `spector-config/src/main/resources/spector-defaults.yml` -3. Update `docs/docs/configuration/parameters.md` -4. Grep for old path references: `grep -rn "old-path" docs/ scripts/ *.md` - -### Step 4: Update Design Docs - -If binary layouts, WAL format, or synapse headers changed: - -1. Cross-reference with `spector-memory/RnD/wal_design_spec.md` (design source of truth) -2. Update RFC-style wire format diagrams in `docs/docs/memory/wal-design.md` -3. Update `docs/docs/memory/panama-design.md` if record layout changed - -### Step 5: Fix Nav & Cross-References - -1. Check `docs/mkdocs.yml` for: - - No duplicate `extra_css` or `extra_javascript` keys (YAML last-key-wins) - - All nav entries point to existing files - - No stale module entries (deleted modules) -2. 
Check for pages not in nav: - ```bash - python -m mkdocs build --clean 2>&1 | grep "not included in the nav" - ``` - -### Step 6: Verify - -```bash -cd docs -python -m mkdocs build --clean -``` - -**Must have:** -- Zero "page not in nav" warnings for modules -- Zero "link target not found" warnings for files we control -- All `--8<--` snippet includes resolve to existing files - -## MkDocs Quick Reference - -| Task | Command | -|---|---| -| Build site | `cd docs && python -m mkdocs build --clean` | -| Serve locally | `cd docs && python -m mkdocs serve --dev-addr 127.0.0.1:8085` | -| Check warnings | `python -m mkdocs build --clean 2>&1 \| grep WARNING` | - -**Stack:** MkDocs 1.6.1 + Material for MkDocs 9.7.6 diff --git a/.agents/skills/incremental-commits/SKILL.md b/.agents/skills/incremental-commits/SKILL.md deleted file mode 100644 index d266a00..0000000 --- a/.agents/skills/incremental-commits/SKILL.md +++ /dev/null @@ -1,115 +0,0 @@ -# Skill: Incremental Commits - -This skill defines the process for creating clean, logical, component-grouped git commits following the project's Conventional Commits standard. - -## Trigger - -This skill is triggered when: -- The user requests "commit", "incremental commit", or "commit all changes" -- After completing a feature or fix with multiple file changes -- As the final step in the Feature Development workflow - -## Instructions - -### Step 1: Inventory Changes - -```bash -git status --short -git diff --cached --name-only # check for already-staged files -``` - -If there are staged files, decide: unstage with `git reset HEAD -- .` to start clean, or commit staged files first. - -### Step 2: Group by Logical Unit - -Group changes in this strict dependency order. 
Each group becomes one commit: - -| Priority | Category | Commit Type | Example | -|---|---|---|---| -| 1 | **Module deletions** | `refactor:` | `refactor: remove spector-server module` | -| 2 | **Build/POM changes** | `build:` | `build: remove server/cluster from reactor` | -| 3 | **Foundation** (core, commons, config, storage) | `feat/refactor({mod}):` | `refactor(config): add PersistenceFiles record` | -| 4 | **Embedding** (embed-api, embed-ollama) | `feat/refactor({mod}):` | | -| 5 | **Search** (index, query, gpu) | `feat/refactor({mod}):` | `feat(index): sharded disk HNSW persistence` | -| 6 | **Intelligence** (engine, memory, rag, ingestion) | `feat/refactor({mod}):` | `feat(memory): WAL corruption recovery` | -| 7 | **Runtime** (runtime, node, mcp, cli, client) | `feat/refactor({mod}):` | | -| 8 | **Infrastructure** (metrics, bench, dist, spring) | `feat({mod}):` | `feat(metrics): Prometheus instrumentation` | -| 9 | **New modules** (whole new `spector-*`) | `feat({mod}):` | `feat(node): spector-node unified server` | -| 10 | **Documentation** | `docs:` | `docs: update WAL design deep-dive` | -| 11 | **Scripts/CI/deploy** | `chore:` or `build:` | `chore: update MCP config and scripts` | -| 12 | **Test files** | `test:` | `test: update engine and memory test suites` | -| 13 | **Project meta** | `docs:` or `chore:` | `docs: update README and CHANGELOG` | - -### Step 3: Commit Each Group - -For each group: - -```bash -git add <files> -git commit -m "<type>(<scope>): <description> - -- Bullet explaining what changed -- Bullet explaining why (if non-obvious)" -``` - -**Rules:** -- Production source (`src/main/`) and its tests (`src/test/`) in the same module CAN go in the same commit if they are part of the same logical change -- However, if changes span many modules, keep tests in a separate final commit -- Never mix unrelated modules in one commit -- POM changes go with the module they affect, OR in a separate `build:` commit if they affect multiple modules - -### Step 4: Verify - -```bash -git 
log --oneline -N # verify clean history -mvn test -pl # verify changed modules still build -``` - -### Commit Message Format - -``` -(): - - - -- Bullet point 1 -- Bullet point 2 -``` - -| Type | When | -|---|---| -| `feat` | New functionality, new class, new API | -| `fix` | Bug fix, test fix, correction | -| `perf` | Performance improvement (must include numbers) | -| `refactor` | Code restructuring with no behavior change | -| `docs` | Documentation only | -| `test` | Adding or updating tests only | -| `build` | POM, Maven, CI, dependency changes | -| `chore` | Scripts, tooling, config, non-code | - -**Scope:** Module name without `spector-` prefix. Omit scope for cross-cutting changes. - -## Examples - -``` -feat(memory): WAL corruption recovery with torn-write detection - -- Torn writes at EOF detected via magic/CRC failure, resolved by truncate() -- Mid-log corruption quarantined to .quarantine/ to prevent data divergence -- ReentrantLock replaces synchronized for virtual thread safety -``` - -``` -refactor: remove spector-cluster module (deferred to V3 roadmap) - -Cluster coordination, shard management, and replication are -planned for V3. Removing premature scaffolding to reduce build -surface and test noise. -``` - -``` -build: remove server/cluster from reactor, update POM dependencies - -- Remove spector-server and spector-cluster from root POM modules -- Update dependency versions and module cross-references -``` diff --git a/.agents/skills/update-roadmap/SKILL.md b/.agents/skills/update-roadmap/SKILL.md deleted file mode 100644 index 6fbe4ae..0000000 --- a/.agents/skills/update-roadmap/SKILL.md +++ /dev/null @@ -1,99 +0,0 @@ -# Skill: Update Roadmap - -This skill enables the agent to dynamically and consistently manage the project roadmap for Spector, ensuring perfect synchronization between the root `README.md` and the detailed documentation in `docs/docs/roadmap.md`. 
- -## Trigger - -This skill is automatically triggered when the user requests roadmap modifications, such as: -- Adding a new feature or research goal. -- Completing a planned feature or task. -- Deprioritizing or marking a feature as not planned. -- Removing a feature completely from the roadmap. -- Automatically whenever a task inside a plan is completed, to keep the roadmap in sync. - -## Workspace Requirements - -- The repository root must contain this skill package under `.agents/skills/update-roadmap/` -- The helper scripts must be located at: - - Windows: `.agents/skills/update-roadmap/scripts/update-roadmap.ps1` - - Unix/Linux/macOS: `.agents/skills/update-roadmap/scripts/update-roadmap.sh` -- The root must contain `README.md` with the checklist under `## 📈 Roadmap` -- The docs folder must contain `docs/docs/roadmap.md` with the Summary Table and categories. - -## Instructions for the Agent - -When this skill is triggered, you must **never** manually edit the Markdown files (`README.md` or `docs/docs/roadmap.md`). Instead, you must run the PowerShell script `update-roadmap.ps1` or Bash script `update-roadmap.sh` located inside this skill package depending on your operating system environment. - -### Action Mapping - -Determine the appropriate action verb based on the user's request: - -#### 1. Add Action (`-Action Add`) -Use when introducing a new feature, index, optimization, or research target. -- **Syntax (Windows)**: - ```powershell - powershell -ExecutionPolicy Bypass -File .agents/skills/update-roadmap/scripts/update-roadmap.ps1 -Action Add -Name "" -Description "" -Category -Status -Compression "" -Recall "" -Effort -DetailText "" - ``` -- **Syntax (Unix/Linux/macOS)**: - ```bash - .agents/skills/update-roadmap/scripts/update-roadmap.sh -Action Add -Name "" -Description "" -Category -Status -Compression "" -Recall "" -Effort -DetailText "" - ``` -- **Arguments**: - - `-Name`: The exact name of the feature. 
- - `-Description`: One-line summary (goes to README checklist). - - `-Category`: Workspace category (`Compression`, `Agentic`, `Compute`, `Runtime`, `Distributed`). - - `-Status`: One of `Planned`, `Exploratory`, `Research` (defaults to `Planned`). - - `-Compression`: Projected space savings (e.g. "+25%", "8x", "N/A"). - - `-Recall`: Projected recall impact (e.g. "None", "-2%"). - - `-Effort`: Implementation effort (`Low`, `Medium`, `High`). - - `-DetailText`: Detailed multi-line markdown block to append under the category details. - -#### 2. Complete Action (`-Action Complete`) -Use when a feature is successfully implemented, verified, and merged. -- **Syntax (Windows)**: - ```powershell - powershell -ExecutionPolicy Bypass -File .agents/skills/update-roadmap/scripts/update-roadmap.ps1 -Action Complete -Name "" - ``` -- **Syntax (Unix/Linux/macOS)**: - ```bash - .agents/skills/update-roadmap/scripts/update-roadmap.sh -Action Complete -Name "" - ``` -- **Behavior**: - - Marks the checkbox completed in `README.md` (`- [x] **Feature Name**`). - - Moves the detailed block to `## Recently Completed (Archive)` in `docs/docs/roadmap.md`. - - Updates the Summary Table row to `✅ Done`. - -#### 3. Deprioritize Action (`-Action Deprioritize`) -Use when a feature is put on hold, marked not planned, or deferred. -- **Syntax (Windows)**: - ```powershell - powershell -ExecutionPolicy Bypass -File .agents/skills/update-roadmap/scripts/update-roadmap.ps1 -Action Deprioritize -Name "" - ``` -- **Syntax (Unix/Linux/macOS)**: - ```bash - .agents/skills/update-roadmap/scripts/update-roadmap.sh -Action Deprioritize -Name "" - ``` -- **Behavior**: - - Updates Summary Table status to `🔴 Not planned`. - - Updates detailed block status header to `🔴 Not Planned`. - -#### 4. Remove Action (`-Action Remove`) -Use when a feature is entirely excised from the project scope. 
-- **Syntax (Windows)**: - ```powershell - powershell -ExecutionPolicy Bypass -File .agents/skills/update-roadmap/scripts/update-roadmap.ps1 -Action Remove -Name "" - ``` -- **Syntax (Unix/Linux/macOS)**: - ```bash - .agents/skills/update-roadmap/scripts/update-roadmap.sh -Action Remove -Name "" - ``` -- **Behavior**: - - Deletes checkbox from `README.md`. - - Deletes detailed description and Summary Table row from `docs/docs/roadmap.md`. - -### Verification Steps - -After executing the script, the agent must: -1. Verify the exit code is 0 and output confirms successful modification. -2. Run `git diff` to review all modified lines across `README.md` and `docs/docs/roadmap.md`. -3. Confirm that the checkbox states, table rows, and detailed sections align perfectly. diff --git a/.agents/skills/update-roadmap/scripts/update-roadmap.ps1 b/.agents/skills/update-roadmap/scripts/update-roadmap.ps1 deleted file mode 100644 index 5cfbfd7..0000000 --- a/.agents/skills/update-roadmap/scripts/update-roadmap.ps1 +++ /dev/null @@ -1,362 +0,0 @@ -#Requires -Version 5.1 -<# -.SYNOPSIS - Automates Spector Search roadmap updates across README.md and docs/docs/roadmap.md. -.DESCRIPTION - This script provides an automated workflow to manage planned, active, completed, and - deprioritized features. It automatically updates the checklist in README.md, reorganizes - categories and appends archives in docs/docs/roadmap.md, and maintains the summary tables. -.PARAMETER Action - The roadmap operation: Add, Complete, Deprioritize, or Remove. -.PARAMETER Name - The name of the feature (e.g., "gRPC Replication Transport"). -.PARAMETER Description - A concise one-line description of the feature. -.PARAMETER Category - The category for the feature: Compression, Agentic, Compute, Runtime, or Distributed. -.PARAMETER Status - The feature status: Planned, Done, Exploratory, or Research. -.PARAMETER DetailText - Optional multi-line detailed markdown description for docs/docs/roadmap.md. 
-.PARAMETER Compression - The expected compression impact for the Summary Table (e.g. "+25%", "8x", "N/A"). Default: "N/A". -.PARAMETER Recall - The expected recall impact for the Summary Table (e.g. "None", "-2%", "N/A"). Default: "None". -.PARAMETER Effort - The expected implementation effort for the Summary Table (e.g. "Low", "Medium", "High"). Default: "Medium". -.EXAMPLE - .agents\skills\update-roadmap\scripts\update-roadmap.ps1 -Action Add -Name "Hardware Cosine SIMD" -Description "Optimized cosine bounds" -Category Compute -Status Planned -Effort Low -.EXAMPLE - .agents\skills\update-roadmap\scripts\update-roadmap.ps1 -Action Complete -Name "Hardware Cosine SIMD" -#> - -[CmdletBinding()] -param ( - [Parameter(Mandatory = $true)] - [ValidateSet('Add', 'Complete', 'Deprioritize', 'Remove')] - [string]$Action, - - [Parameter(Mandatory = $true)] - [string]$Name, - - [Parameter(Mandatory = $false)] - [string]$Description = "", - - [Parameter(Mandatory = $false)] - [ValidateSet('Compression', 'Agentic', 'Compute', 'Runtime', 'Distributed')] - [string]$Category = "Runtime", - - [Parameter(Mandatory = $false)] - [ValidateSet('Planned', 'Done', 'Exploratory', 'Research')] - [string]$Status = "Planned", - - [Parameter(Mandatory = $false)] - [string]$DetailText = "", - - [Parameter(Mandatory = $false)] - [string]$Compression = "N/A", - - [Parameter(Mandatory = $false)] - [string]$Recall = "None", - - [Parameter(Mandatory = $false)] - [string]$Effort = "Medium" -) - -# == Paths == -$workspaceRoot = (Get-Item "$PSScriptRoot\..\..\..\..").FullName -$readmePath = Join-Path $workspaceRoot "README.md" -$roadmapPath = Join-Path $workspaceRoot "docs\docs\roadmap.md" - -if (-not (Test-Path $readmePath)) { - Write-Error "README.md not found at $readmePath" - return -} -if (-not (Test-Path $roadmapPath)) { - Write-Error "docs/docs/roadmap.md not found at $roadmapPath" - return -} - -# Resolve category headers -$categoryHeaderMap = @{ - 'Compression' = '## Compression & 
Quantization' - 'Agentic' = '## Agentic AI' - 'Compute' = '## Compute & Hardware' - 'Runtime' = '## Runtime & Deployment' - 'Distributed' = '## Distributed Clustering & Replication' -} - -$emojiPlanned = [char]::ConvertFromUtf32(0x1F51C) -$emojiDone = [char]::ConvertFromUtf32(0x2705) -$emojiResearch = [char]::ConvertFromUtf32(0x1F52C) -$emojiNotPlanned = [char]::ConvertFromUtf32(0x1F534) - -$statusIconMap = @{ - 'Planned' = "$emojiPlanned Planned" - 'Done' = "$emojiDone Done" - 'Exploratory' = "$emojiResearch Exploratory" - 'Research' = "$emojiResearch Research" -} - -$statusDetailsIconMap = @{ - 'Planned' = $emojiPlanned - 'Done' = $emojiDone - 'Exploratory' = $emojiResearch - 'Research' = $emojiResearch -} - -$cleanAnchor = $Name.ToLower().Replace(' ', '-').Replace('&', 'and').Replace('(', '').Replace(')', '').Replace('/', '-') - -# ============================================================================= -# ACTION: ADD -# ============================================================================= -if ($Action -eq 'Add') { - Write-Host "Adding feature '$Name' to roadmap..." - - # 1. Update README.md - $readmeContent = Get-Content $readmePath -Raw - $newReadmeLine = "- [ ] $Name ($Description)" - - # Insert before the closing roadmap link - $targetLine = "> See the [detailed Roadmap]" - if ($readmeContent -match [regex]::Escape($targetLine)) { - $readmeContent = $readmeContent -replace [regex]::Escape($targetLine), "$newReadmeLine`n`n$targetLine" - Set-Content $readmePath $readmeContent -NoNewline - Write-Host " [OK] README.md updated." - } else { - Write-Warning "Could not locate roadmap section in README.md." - } - - # 2. Update docs/docs/roadmap.md Detailed Section - $roadmapContent = Get-Content $roadmapPath -Raw - $targetHeader = $categoryHeaderMap[$Category] - - $statusText = $statusDetailsIconMap[$Status] - - # Construct detailed block natively in a multi-line single-quoted string template - $template = '### {0} {1} {{#{2}}} - -!!! 
info "Status: {3}" - {4} - -{5} - ----' - $detailsBlock = $template -f $statusText, $Name, $cleanAnchor, $Status, $Description, $DetailText - - if ($roadmapContent -match [regex]::Escape($targetHeader)) { - $roadmapContent = $roadmapContent -replace [regex]::Escape($targetHeader), ($targetHeader + "`r`n`r`n" + $detailsBlock) - Write-Host " [OK] Detailed section in roadmap.md updated." - } else { - Write-Warning "Could not locate category header '$targetHeader' in docs/docs/roadmap.md." - } - - # 3. Update Summary Table in docs/docs/roadmap.md - $lines = $roadmapContent -split '\r?\n' - $newLines = [System.Collections.Generic.List[string]]::new() - $tableIndex = 0 - $highestIndex = 0 - $inSummaryTable = $false - - for ($i = 0; $i -lt $lines.Count; $i++) { - $line = $lines[$i] - $newLines.Add($line) - - if ($line -match '## Summary Table') { - $inSummaryTable = $true - } - - # Detect index in table rows only inside the summary table section - if ($inSummaryTable -and $line -match '^\|\s*(\d+)\s*\|') { - $idx = [int]$Matches[1] - if ($idx -gt $highestIndex) { - $highestIndex = $idx - } - $tableIndex = $i - } - } - - # Construct new row - $newIdx = $highestIndex + 1 - $statusIcon = $statusIconMap[$Status] - $newRow = '| {0} | **{1}** | {2} | {3} | {4} | {5} |' -f $newIdx, $Name, $Compression, $Recall, $Effort, $statusIcon - - # Insert new row right after the last table line - $newLines.Insert($tableIndex + 1, $newRow) - Set-Content $roadmapPath ($newLines -join "`r`n") - Write-Host " [OK] Summary Table in roadmap.md updated with row $newIdx." -} elseif ($Action -eq 'Complete') { - # ============================================================================= - # ACTION: COMPLETE - # ============================================================================= - Write-Host "Completing feature '$Name'..." - - # 1. 
Update README.md (check checkbox) - $readmeContent = Get-Content $readmePath -Raw - - # Escape target checkbox regex - $regexTarget = "- \[\s*\]\s*" + [regex]::Escape($Name) - if ($readmeContent -match $regexTarget) { - $readmeContent = [regex]::Replace($readmeContent, $regexTarget, "- [x] **$Name**") - Set-Content $readmePath $readmeContent -NoNewline - Write-Host " [OK] README.md checklist updated." - } else { - Write-Warning "Could not locate incomplete checkbox for '$Name' in README.md." - } - - # 2. Update docs/docs/roadmap.md Detailed Section & Reorganize Archive - $roadmapContent = Get-Content $roadmapPath -Raw - - # Locate detailed section block via generic status match - $escapedName = [regex]::Escape($Name) - $sectionRegex = '(?s)###\s+\S+\s+' + $escapedName + '\s+\{#' + $cleanAnchor + '\}.*?---(?:\r?\n|$)' - - if ($roadmapContent -match $sectionRegex) { - $capturedBlock = $Matches[0] - - # Remove from active category - $roadmapContent = $roadmapContent -replace [regex]::Escape($capturedBlock), "" - - # Format the block for Recently Completed archive - $capturedBlock = [regex]::Replace($capturedBlock, "^###\s+\S+", "### $emojiDone") - $capturedBlock = $capturedBlock -replace "Status:\s*(Planned|Exploratory|Research)", "Status: Done" - $capturedBlock = $capturedBlock -replace "!!! info", "!!! success" - $capturedBlock = $capturedBlock -replace "Planned|Exploratory|Research", "Completed" - - # Append to Recently Completed section - $archiveHeader = "## Recently Completed (Archive)" - if ($roadmapContent -match [regex]::Escape($archiveHeader)) { - $roadmapContent = $roadmapContent -replace [regex]::Escape($archiveHeader), ($archiveHeader + "`r`n`r`n" + $capturedBlock) - Write-Host " [OK] Detailed section moved to Recently Completed (Archive)." 
- } else { - # Create Recently Completed section if not present (PowerShell escapes use the backtick; "\r" would be written to the file as a literal backslash-r) - $roadmapContent = $roadmapContent + "`r`n`r`n---`r`n`r`n## Recently Completed (Archive)`r`n`r`n" + $capturedBlock - Write-Host " [OK] Recently Completed (Archive) section initialized and updated." - } - } else { - Write-Warning "Could not find detailed roadmap block for '$Name' in docs/docs/roadmap.md." - } - - # 3. Update Summary Table Status to Done - $lines = $roadmapContent -split '\r?\n' - $newLines = [System.Collections.Generic.List[string]]::new() - $tableUpdated = $false - $inSummaryTable = $false - - foreach ($line in $lines) { - if ($line -match '## Summary Table') { - $inSummaryTable = $true - } - if ($inSummaryTable -and $line -match ('^\|\s*(\d+)\s*\|\s*\*\*' + $escapedName + '\*\*')) { - # Replace the status column (last column) with completed check - $parts = $line -split '\|' - $parts[$parts.Length - 2] = " $emojiDone Done " - $line = $parts -join '|' - $tableUpdated = $true - } - $newLines.Add($line) - } - - Set-Content $roadmapPath ($newLines -join "`r`n") - if ($tableUpdated) { - Write-Host " [OK] Summary Table row updated to $emojiDone Done." - } else { - Write-Warning "Could not find Summary Table row for '$Name'." - } -} elseif ($Action -eq 'Deprioritize') { - # ============================================================================= - # ACTION: DEPRIORITIZE - # ============================================================================= - Write-Host "Deprioritizing feature '$Name'..." - - # 1. 
Update Summary Table Status in docs/docs/roadmap.md to Not Planned - $roadmapContent = Get-Content $roadmapPath -Raw - $escapedName = [regex]::Escape($Name) - $lines = $roadmapContent -split '\r?\n' - $newLines = [System.Collections.Generic.List[string]]::new() - $tableUpdated = $false - $inSummaryTable = $false - - foreach ($line in $lines) { - if ($line -match '## Summary Table') { - $inSummaryTable = $true - } - if ($inSummaryTable -and $line -match ('^\|\s*(\d+)\s*\|\s*\*\*' + $escapedName + '\*\*')) { - $parts = $line -split '\|' - $parts[$parts.Length - 2] = " $emojiNotPlanned Not planned " - $line = $parts -join '|' - $tableUpdated = $true - } - $newLines.Add($line) - } - - Set-Content $roadmapPath ($newLines -join "`r`n") - if ($tableUpdated) { - Write-Host " [OK] Summary Table row updated to $emojiNotPlanned Not planned." - } else { - Write-Warning "Could not find Summary Table row for '$Name'." - } - - # 2. Update status in detailed description block - $roadmapContent = Get-Content $roadmapPath -Raw - $sectionRegex = '(?s)###\s+\S+\s+' + $escapedName + '\s+\{#' + $cleanAnchor + '\}.*?---(?:\r?\n|$)' - - if ($roadmapContent -match $sectionRegex) { - $targetBlock = $Matches[0] - $replacedBlock = [regex]::Replace($targetBlock, "(?m)^###\s+\S+", "### $emojiNotPlanned") - $replacedBlock = $replacedBlock -replace 'Status:\s*[^"\r\n]+', "Status: Not Planned" - - $roadmapContent = $roadmapContent -replace [regex]::Escape($targetBlock), $replacedBlock - Set-Content $roadmapPath $roadmapContent -NoNewline - Write-Host " [OK] Detailed section status updated to $emojiNotPlanned Not Planned." - } -} elseif ($Action -eq 'Remove') { - # ============================================================================= - # ACTION: REMOVE - # ============================================================================= - Write-Host "Removing feature '$Name' completely from roadmap..." - - # 1. 
Remove from README.md - $readmeContent = Get-Content $readmePath -Raw - $escapedName = [regex]::Escape($Name) - $lineRegex = '(?m)^-\s*\[[\s*x]?\]\s*(?:\*\*)?' + $escapedName + '(?:\*\*)?.*?\r?\n' - - if ($readmeContent -match $lineRegex) { - $readmeContent = [regex]::Replace($readmeContent, $lineRegex, "") - Set-Content $readmePath $readmeContent -NoNewline - Write-Host " [OK] Removed from README.md checklist." - } - - # 2. Remove detailed description from docs/docs/roadmap.md - $roadmapContent = Get-Content $roadmapPath -Raw - $sectionRegex = '(?s)###\s+\S+\s+' + $escapedName + '\s+\{#' + $cleanAnchor + '\}.*?---(?:\r?\n|$)' - - if ($roadmapContent -match $sectionRegex) { - $roadmapContent = $roadmapContent -replace $sectionRegex, "" - Write-Host " [OK] Removed detailed description block." - } - - # 3. Remove row from Summary Table - $lines = $roadmapContent -split '\r?\n' - $newLines = [System.Collections.Generic.List[string]]::new() - $rowRemoved = $false - $inSummaryTable = $false - - foreach ($line in $lines) { - if ($line -match '## Summary Table') { - $inSummaryTable = $true - } - if ($inSummaryTable -and $line -match ('^\|\s*(\d+)\s*\|\s*(?:\*\*)?' + $escapedName + '(?:\*\*)?\s*\|')) { - $rowRemoved = $true - continue; # Skip adding this line to delete the row - } - $newLines.Add($line) - } - - Set-Content $roadmapPath ($newLines -join "`r`n") - if ($rowRemoved) { - Write-Host " [OK] Removed row from Summary Table." - } -} - -Write-Host "Roadmap update completed successfully!" -ForegroundColor Green diff --git a/.agents/skills/update-roadmap/scripts/update-roadmap.sh b/.agents/skills/update-roadmap/scripts/update-roadmap.sh deleted file mode 100644 index 442e7bb..0000000 --- a/.agents/skills/update-roadmap/scripts/update-roadmap.sh +++ /dev/null @@ -1,369 +0,0 @@ -#!/usr/bin/env bash - -# Automates Spector Search roadmap updates across README.md and docs/docs/roadmap.md on Unix/Linux/macOS. 
- -set -e - -# --- Default Arguments --- -ACTION="" -NAME="" -DESCRIPTION="" -CATEGORY="Runtime" -STATUS="Planned" -DETAIL_TEXT="" -COMPRESSION="N/A" -RECALL="None" -EFFORT="Medium" - -# --- Parse Arguments --- -while [[ $# -gt 0 ]]; do - case $1 in - -Action|-action|--action) - ACTION="$2" - shift 2 - ;; - -Name|-name|--name) - NAME="$2" - shift 2 - ;; - -Description|-description|--description) - DESCRIPTION="$2" - shift 2 - ;; - -Category|-category|--category) - CATEGORY="$2" - shift 2 - ;; - -Status|-status|--status) - STATUS="$2" - shift 2 - ;; - -DetailText|-detailtext|--detailtext) - DETAIL_TEXT="$2" - shift 2 - ;; - -Compression|-compression|--compression) - COMPRESSION="$2" - shift 2 - ;; - -Recall|-recall|--recall) - RECALL="$2" - shift 2 - ;; - -Effort|-effort|--effort) - EFFORT="$2" - shift 2 - ;; - *) - echo "Unknown argument: $1" - exit 1 - ;; - esac -done - -# --- Validate Required arguments --- -if [ -z "$ACTION" ] || [ -z "$NAME" ]; then - echo "Error: -Action and -Name are required arguments." - echo "Usage: ./update-roadmap.sh -Action [Add|Complete|Deprioritize|Remove] -Name \"Feature Name\" [options]" - exit 1 -fi - -if [[ ! "$ACTION" =~ ^(Add|Complete|Deprioritize|Remove)$ ]]; then - echo "Error: Invalid action '$ACTION'. Must be Add, Complete, Deprioritize, or Remove." - exit 1 -fi - -# --- Resolve Paths --- -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -WORKSPACE_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)" -README_PATH="$WORKSPACE_ROOT/README.md" -ROADMAP_PATH="$WORKSPACE_ROOT/docs/docs/roadmap.md" - -if [ ! -f "$README_PATH" ]; then - echo "Error: README.md not found at $README_PATH" - exit 1 -fi -if [ ! 
-f "$ROADMAP_PATH" ]; then - echo "Error: docs/docs/roadmap.md not found at $ROADMAP_PATH" - exit 1 -fi - -# --- Resolve Icons & Anchors --- -EMOJI_PLANNED="🔜" -EMOJI_DONE="✅" -EMOJI_RESEARCH="🔬" -EMOJI_NOT_PLANNED="🔴" - -case "$STATUS" in - Planned) - STATUS_ICON="$EMOJI_PLANNED Planned" - STATUS_DETAILS_ICON="$EMOJI_PLANNED" - ;; - Done) - STATUS_ICON="$EMOJI_DONE Done" - STATUS_DETAILS_ICON="$EMOJI_DONE" - ;; - Exploratory|Research) - STATUS_ICON="$EMOJI_RESEARCH $STATUS" - STATUS_DETAILS_ICON="$EMOJI_RESEARCH" - ;; - *) - STATUS_ICON="$STATUS" - STATUS_DETAILS_ICON="" - ;; -esac - -# Clean anchor (e.g. "Hardware Cosine SIMD" -> "hardware-cosine-simd") -CLEAN_ANCHOR=$(echo "$NAME" | tr '[:upper:]' '[:lower:]' | sed 's/ /-/g; s/\&/and/g; s/(//g; s/)//g; s/\///g') - -# ============================================================================= -# ACTION: ADD -# ============================================================================= -if [ "$ACTION" = "Add" ]; then - echo "Adding feature '$NAME' to roadmap..." - - # 1. Update README.md - export TARGET_LINE="> See the [detailed Roadmap]" - export NEW_README_LINE="- [ ] $NAME ($DESCRIPTION)" - - if grep -F -q "$TARGET_LINE" "$README_PATH"; then - # Use Perl for clean multi-line injection without BSD/GNU sed incompatibilities - perl -i -pe 's/\Q$ENV{TARGET_LINE}\E/$ENV{NEW_README_LINE}\n\n$ENV{TARGET_LINE}/g' "$README_PATH" - echo " [OK] README.md updated." - else - echo "Warning: Could not locate roadmap section in README.md." - fi - - # 2. Update docs/docs/roadmap.md Detailed Section - CATEGORY_HEADER="" - case "$CATEGORY" in - Compression) CATEGORY_HEADER="## Compression & Quantization" ;; - Agentic) CATEGORY_HEADER="## Agentic AI" ;; - Compute) CATEGORY_HEADER="## Compute & Hardware" ;; - Runtime) CATEGORY_HEADER="## Runtime & Deployment" ;; - Distributed) CATEGORY_HEADER="## Distributed Clustering & Replication" ;; - esac - - DETAILS_BLOCK="### $STATUS_DETAILS_ICON $NAME {#$CLEAN_ANCHOR} - -!!! 
info \"Status: $STATUS\" - $DESCRIPTION - -$DETAIL_TEXT - ----" - - if grep -F -q "$CATEGORY_HEADER" "$ROADMAP_PATH"; then - export CATEGORY_HEADER - export DETAILS_BLOCK - perl -i -pe 's/\Q$ENV{CATEGORY_HEADER}\E/$ENV{CATEGORY_HEADER}\n\n$ENV{DETAILS_BLOCK}/g' "$ROADMAP_PATH" - echo " [OK] Detailed section in roadmap.md updated." - else - echo "Warning: Could not locate category header '$CATEGORY_HEADER' in docs/docs/roadmap.md." - fi - - # 3. Update Summary Table in docs/docs/roadmap.md - export NEW_ROW="| {IDX} | **$NAME** | $COMPRESSION | $RECALL | $EFFORT | $STATUS_ICON |" - perl -i -0777 -pe ' - my ($doc, $table) = split(/## Summary Table/, $_, 2); - if ($table) { - my $highest = 0; - my $last_row = ""; - while ($table =~ /^\|\s*(\d+)\s*\|.*$/mg) { - my $val = $1; - if ($val > $highest) { - $highest = $val; - } - $last_row = $&; - } - my $new_idx = $highest + 1; - my $row_template = $ENV{NEW_ROW}; - $row_template =~ s/\{IDX\}/$new_idx/g; - my $eol = $table =~ /\r\n/ ? "\r\n" : "\n"; - if ($last_row) { - $last_row =~ s/\r$//; - $table =~ s/\Q$last_row\E\r?\n/$last_row$eol$row_template$eol/m; - } - $_ = $doc . "## Summary Table" . $table; - } - ' "$ROADMAP_PATH" - echo " [OK] Summary Table in roadmap.md updated." - -# ============================================================================= -# ACTION: COMPLETE -# ============================================================================= -elif [ "$ACTION" = "Complete" ]; then - echo "Completing feature '$NAME'..." - - # 1. Update README.md (check checkbox) - export NAME - perl -i -pe 's/-\s*\[\s*\]\s*\Q$ENV{NAME}\E/- [x] **$ENV{NAME}**/g' "$README_PATH" - echo " [OK] README.md checklist updated." - - # 2. 
Update docs/docs/roadmap.md Detailed Section & Reorganize Archive - export NAME - export CLEAN_ANCHOR - export EMOJI_DONE - perl -i -0777 -pe ' - my $name = $ENV{NAME}; - my $anchor = $ENV{CLEAN_ANCHOR}; - my $emoji_done = $ENV{EMOJI_DONE}; - my $escaped_name = quotemeta($name); - my $escaped_anchor = quotemeta($anchor); - - # Regex to find the detailed block - my $section_regex = qr/(?s)###\s+\S+\s+$escaped_name\s+\{#$escaped_anchor\}.*?---(?:\r?\n|$)/; - - if ($_ =~ /$section_regex/) { - my $captured_block = $&; - - # Remove from active category - $_ =~ s/\Q$captured_block\E//; - - # Format detailed block for Recently Completed archive - $captured_block =~ s/^###\s+\S+/### $emoji_done/; - $captured_block =~ s/Status:\s*(Planned|Exploratory|Research)/Status: Done/; - $captured_block =~ s/!!! info/!!! success/; - $captured_block =~ s/Planned|Exploratory|Research/Completed/g; - - # Check line endings of the file to preserve them - my $eol = $_ =~ /\r\n/ ? "\r\n" : "\n"; - - # Append to Recently Completed section - my $archive_header = "## Recently Completed (Archive)"; - if ($_ =~ /\Q$archive_header\E/) { - $_ =~ s/(\Q$archive_header\E)/$1$eol$eol$captured_block/; - } else { - $_ = $_ . $eol . $eol . "---" . $eol . $eol . "## Recently Completed (Archive)" . $eol . $eol . $captured_block; - } - } - ' "$ROADMAP_PATH" - echo " [OK] Detailed section moved to Recently Completed (Archive)." - - # 3. Update Summary Table Status to Done - export NAME - export EMOJI_DONE - perl -i -0777 -pe ' - my ($doc, $table) = split(/## Summary Table/, $_, 2); - if ($table) { - my $name = $ENV{NAME}; - my $escaped_name = quotemeta($name); - my @lines = split(/\r?\n/, $table, -1); # LIMIT -1 keeps trailing empty fields, so the final newline of the file survives the rewrite - for my $line (@lines) { - if ($line =~ /^\|\s*(\d+)\s*\|\s*\*\*$escaped_name\*\*/) { - my @parts = split(/\|/, $line, -1); - $parts[$#parts - 1] = " $ENV{EMOJI_DONE} Done "; - $line = join("|", @parts); - } - } - $table = join("\n", @lines); - if ($_ =~ /\r\n/) { - $table =~ s/\n/\r\n/g; - } - $_ = $doc . 
"## Summary Table" . $table; - } - ' "$ROADMAP_PATH" - echo " [OK] Summary Table row updated to $EMOJI_DONE Done." - -# ============================================================================= -# ACTION: DEPRIORITIZE -# ============================================================================= -elif [ "$ACTION" = "Deprioritize" ]; then - echo "Deprioritizing feature '$NAME'..." - - # 1. Update Summary Table Status in docs/docs/roadmap.md to Not Planned - export NAME - export EMOJI_NOT_PLANNED - perl -i -0777 -pe ' - my ($doc, $table) = split(/## Summary Table/, $_, 2); - if ($table) { - my $name = $ENV{NAME}; - my $escaped_name = quotemeta($name); - my @lines = split(/\r?\n/, $table, -1); # LIMIT -1 keeps trailing empty fields, so the final newline of the file survives the rewrite - for my $line (@lines) { - if ($line =~ /^\|\s*(\d+)\s*\|\s*\*\*$escaped_name\*\*/) { - my @parts = split(/\|/, $line, -1); - $parts[$#parts - 1] = " $ENV{EMOJI_NOT_PLANNED} Not planned "; - $line = join("|", @parts); - } - } - $table = join("\n", @lines); - if ($_ =~ /\r\n/) { - $table =~ s/\n/\r\n/g; - } - $_ = $doc . "## Summary Table" . $table; - } - ' "$ROADMAP_PATH" - echo " [OK] Summary Table row updated to $EMOJI_NOT_PLANNED Not planned." - - # 2. Update status in detailed description block - export NAME - export CLEAN_ANCHOR - export EMOJI_NOT_PLANNED - perl -i -0777 -pe ' - my $name = $ENV{NAME}; - my $anchor = $ENV{CLEAN_ANCHOR}; - my $emoji_not_planned = $ENV{EMOJI_NOT_PLANNED}; - my $escaped_name = quotemeta($name); - my $escaped_anchor = quotemeta($anchor); - - # Regex to find the detailed block - my $section_regex = qr/(?s)###\s+\S+\s+$escaped_name\s+\{#$escaped_anchor\}.*?---(?:\r?\n|$)/; - - if ($_ =~ /$section_regex/) { - my $target_block = $&; - my $replaced_block = $target_block; - $replaced_block =~ s/^###\s+\S+/### $emoji_not_planned/m; - $replaced_block =~ s/Status:\s*[^"\n\r]+/Status: Not Planned/; - - $_ =~ s/\Q$target_block\E/$replaced_block/; - } - ' "$ROADMAP_PATH" - echo " [OK] Detailed section status updated to $EMOJI_NOT_PLANNED Not Planned." 
- -# ============================================================================= -# ACTION: REMOVE -# ============================================================================= -elif [ "$ACTION" = "Remove" ]; then - echo "Removing feature '$NAME' completely from roadmap..." - - # 1. Remove from README.md - export NAME - perl -i -ne 'print unless /-\s*\[[\s*x]?\]\s*(?:\*\*)?\Q$ENV{NAME}\E/' "$README_PATH" - echo " [OK] Removed from README.md checklist." - - # 2. Remove detailed description from docs/docs/roadmap.md - export NAME - export CLEAN_ANCHOR - perl -i -0777 -pe ' - my $name = $ENV{NAME}; - my $anchor = $ENV{CLEAN_ANCHOR}; - my $escaped_name = quotemeta($name); - my $escaped_anchor = quotemeta($anchor); - - my $section_regex = qr/(?s)###\s+\S+\s+$escaped_name\s+\{#$escaped_anchor\}.*?---(?:\r?\n|$)/; - $_ =~ s/$section_regex//; - ' "$ROADMAP_PATH" - echo " [OK] Removed detailed description block." - - # 3. Remove row from Summary Table - export NAME - perl -i -0777 -pe ' - my ($doc, $table) = split(/## Summary Table/, $_, 2); - if ($table) { - my $name = $ENV{NAME}; - my $escaped_name = quotemeta($name); - my @lines = split(/\r?\n/, $table, -1); # LIMIT -1 keeps trailing empty fields, so the final newline of the file survives the rewrite - @lines = grep { !/^\|\s*\d+\s*\|\s*(?:\*\*)?\Q$name\E(?:\*\*)?\s*\|/ } @lines; # require the closing cell delimiter so a row whose name merely starts with NAME is kept - $table = join("\n", @lines); - if ($_ =~ /\r\n/) { - $table =~ s/\n/\r\n/g; - } - $_ = $doc . "## Summary Table" . $table; - } - ' "$ROADMAP_PATH" - echo " [OK] Removed row from Summary Table." - -fi - -echo "Roadmap update completed successfully!" diff --git a/.agents/workflows/documentation-update.md b/.agents/workflows/documentation-update.md deleted file mode 100644 index daedb83..0000000 --- a/.agents/workflows/documentation-update.md +++ /dev/null @@ -1,72 +0,0 @@ -# Workflow: Documentation Update - -Process for creating or updating documentation in the MkDocs Material site. - -## Trigger - -When creating new design docs, architecture pages, deep-dives, or the user requests documentation changes. - -## Steps - -### 1. 
Identify Scope - -Determine the documentation type: - -| Type | Location | Style | -|---|---|---| -| Architecture overview | `docs/docs/architecture/` | Mermaid diagrams, component descriptions | -| Design deep-dive | `docs/docs/deep-dives/` | Technical analysis, benchmarks, trade-offs | -| Memory subsystem | `docs/docs/memory/` | RFC wire format diagrams, neuroscience analogies | -| API reference | `docs/docs/api-reference/` | Request/response examples, endpoint tables | -| Module docs | `docs/docs/modules/` | Auto-included from `spector-*/README.md` via `--8<--` | -| Configuration | `docs/docs/configuration/` | Parameter tables, YAML examples | -| Getting started | `docs/docs/getting-started/` | Step-by-step tutorials | - -### 2. Check Source of Truth - -- Memory subsystem: `spector-memory/RnD/` specs are the design source of truth -- Engine/Index: code + test behavior is source of truth -- Configuration: `SpectorConfigFactory.java` defaults are source of truth - -### 3. Write Content - -Follow documentation standards: -- **Binary layouts:** RFC-style wire format diagrams (see `wal-design.md`) -- **Architecture:** Mermaid diagrams for component relationships -- **Code examples:** Real, working snippets from the codebase -- **Tables:** For configuration parameters, comparison matrices -- **Admonitions:** Use MkDocs Material admonitions (`!!! note`, `!!! warning`) - -### 4. Update Navigation - -If this is a new page, add to `docs/mkdocs.yml` nav section: -```yaml -nav: - - Section: - - Page Title: path/to/page.md -``` - -### 5. Verify Build - -```bash -cd docs -python -m mkdocs build --clean -``` - -Fix any warnings about: -- Pages not in nav -- Broken cross-references -- Invalid Mermaid syntax - -### 6. Preview (Optional) - -```bash -cd docs -python -m mkdocs serve --dev-addr 127.0.0.1:8085 -``` - -### 7. 
Commit - -``` -docs: <description> -``` diff --git a/.agents/workflows/exception-hardening.md b/.agents/workflows/exception-hardening.md deleted file mode 100644 index b08ca98..0000000 --- a/.agents/workflows/exception-hardening.md +++ /dev/null @@ -1,182 +0,0 @@ ---- -description: Audit and harden exception handling for a feature or module, aligning all catch/throw sites with the SpectorException framework. ---- - -## Trigger - -When the user asks to: -- "Add exception handling" for a newly implemented feature -- "Harden exceptions" in a module -- "Audit error handling" for recent changes -- "Align with the error framework" after building a new feature - -## Prerequisites - -Read the exception handling section of the coding standards skill: -`.agents/skills/coding-standards/SKILL.md` → **Error Handling — SpectorException Framework** - -## Steps - -### 1. Identify Target Scope - -Determine what needs hardening: -- A specific feature (e.g., "exception handling for the entity graph") -- A module (e.g., "audit spector-memory") -- Recent changes (e.g., "harden exceptions for the feature we just built") - -List all files involved. Focus on: -- New classes added for the feature -- Pipeline integration points (ingestion, recall, consolidation) -- Persistence layers (save/load methods) -- Public API methods - -### 2. Audit Existing Catch Sites - -Search for anti-patterns in the target files: - -```bash -# Find generic Exception catches -grep -rn "catch (Exception " spector-{module}/src/main/java/ - -# Find UncheckedIOException throws -grep -rn "UncheckedIOException" spector-{module}/src/main/java/ - -# Find generic RuntimeException throws -grep -rn "throw new RuntimeException" spector-{module}/src/main/java/ - -# Find string concatenation in ErrorCode.format() -grep -rn "ErrorCode\.\w*\.format(" spector-{module}/src/main/java/ | grep "+" - -# Find swallowed exceptions -grep -rn "catch.*{" spector-{module}/src/main/java/ -A1 | grep -B1 "// ignored\|/\* \*/" -``` - -### 3. 
Determine Required Error Codes - -For each error condition identified: - -1. Check if an existing `ErrorCode` covers it (search `ErrorCode.java`) -2. If not, add a new code under the correct category section: - - Validation: `SPE-100-xxx` - - Config: `SPE-110-xxx` - - Index: `SPE-200-xxx` - - Storage: `SPE-210-xxx` - - Embedding: `SPE-300-xxx` - - Memory: `SPE-310-xxx` - - GPU: `SPE-400-xxx` - - Server: `SPE-500-xxx` - - Client: `SPE-510-xxx` - - Ingestion: `SPE-600-xxx` - - Cluster: `SPE-700-xxx` - - Internal: `SPE-900-xxx` - -**Template:** -```java -/** Brief javadoc of when this error occurs. */ -FEATURE_OPERATION_FAILED (310_0XX, ErrorCategory.MEMORY, - "Feature operation failed for {}: {}"), -``` - -### 4. Create Granular Exception Classes - -For each distinct error domain, create a `Spector{Domain}Exception`: - -**Location:** `spector-{module}/src/main/java/.../error/` - -**Template:** -```java -public class Spector{Domain}Exception extends Spector{Parent}Exception { - private final String operation; - - public Spector{Domain}Exception(String operation) { - super(ErrorCode.FEATURE_OPERATION_FAILED, operation); - this.operation = operation; - } - - public Spector{Domain}Exception(String operation, Throwable cause) { - super(ErrorCode.FEATURE_OPERATION_FAILED, cause, operation); - this.operation = operation; - } - - public String getOperation() { return operation; } -} -``` - -**Rules:** -- Constructor args map 1:1 to `{}` placeholders in the ErrorCode template -- No string concatenation — `errorCode.format(args)` handles formatting -- Add typed context fields (operation, path, etc.) -- Follow naming: `Spector{Domain}Exception` - -### 5. 
Fix Throw Sites - -Replace all anti-patterns with proper throws: - -| Before (❌) | After (✅) | -|---|---| -| `throw new UncheckedIOException(msg, e)` | `throw new Spector{Domain}Exception(args, e)` | -| `throw new RuntimeException(msg)` | `throw new Spector{Domain}Exception(args)` | -| `throw new SpectorException(ErrorCode.X, e, "concat" + var)` | `throw new Spector{Domain}Exception(arg1, arg2, e)` | - -### 6. Fix Catch Sites - -Apply the correct pattern based on context: - -**Graceful degradation** (enrichment steps that should NOT crash the pipeline): -```java -} catch (RuntimeException e) { - Spector{Domain}Exception ex = new Spector{Domain}Exception("operation", e); - log.warn(ex.getMessage()); -} -``` - -**IO failures** (persistence methods): -```java -} catch (IOException e) { - throw new Spector{Domain}PersistenceException("GraphType", path, e); -} -``` - -**Validation** (public API entry points): -```java -if (param == null) { - throw new SpectorValidationException(ErrorCode.ARGUMENT_NULL, "paramName"); -} -``` - -**Never:** -- `catch (Exception e)` — always catch the narrowest type (`RuntimeException`, `IOException`) -- `catch (Exception e) { /* ignored */ }` — never swallow exceptions -- String concatenation inside `ErrorCode.format()` or exception constructors - -### 7. Update Exception Hierarchy Documentation - -If new exception classes were added, update the hierarchy tree in: -- `SpectorException.java` javadoc (lines 26-49) - -### 8. Verify - -```bash -# Compile -mvn compile -q - -# Run tests for the affected module -mvn test -pl spector-{module} - -# Verify no remaining anti-patterns -grep -rn "catch (Exception " spector-{module}/src/main/java/ | grep -v "// justified" -grep -rn "throw new RuntimeException" spector-{module}/src/main/java/ -grep -rn "UncheckedIOException" spector-{module}/src/main/java/ -``` - -### 9. 
Commit - -Follow the incremental-commits skill (`.agents/skills/incremental-commits/SKILL.md`): - -``` -feat(error): add {domain} exception hierarchy with ErrorCode integration - -New error codes: SPE-XXX-YYY through SPE-XXX-ZZZ -New exceptions: Spector{A}Exception, Spector{B}Exception -Hardened catch sites in: {list of files} -``` diff --git a/.agents/workflows/feature-development.md b/.agents/workflows/feature-development.md deleted file mode 100644 index 7d9c1b2..0000000 --- a/.agents/workflows/feature-development.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -description: End-to-end process for implementing a new feature in Spector, from understanding requirements to committing clean code. ---- - -## Trigger - -When implementing a new feature, capability, or significant enhancement in any `spector-*` module. - -## Steps - -### 1. Understand Requirements - -- Read RnD specs if available (`spector-memory/RnD/` for memory subsystem) -- Check the roadmap (`docs/docs/roadmap.md`) for planned features -- Identify which module(s) the feature belongs in - -### 2. Verify Module Boundaries - -Before writing code, confirm the target module is correct: - -- Foundation (core, commons, config, storage) — shared abstractions -- Search (index, query, gpu) — search algorithms -- Intelligence (engine, memory, rag, ingestion) — orchestration -- Runtime (runtime, node, mcp, cli) — entry points - -**Rule:** `spector-memory` and `spector-engine` never depend on each other. - -### 3. Implement - -- Follow coding standards (`.agents/skills/coding-standards/SKILL.md`) -- Use `ReentrantLock` not `synchronized` -- Use records for immutable data -- Use section separators for class organization -- Add SLF4J logging at appropriate levels - -### 4. Write Tests - -- JUnit 5 + AssertJ, `@TempDir` for file tests -- At least 1 test per new public API method -- Integration tests suffixed `*IntegrationTest` -- Run: `mvn test -pl spector-{module}` - -### 5. 
Update Documentation - -Run the doc-sync skill (`.agents/skills/doc-sync/SKILL.md`): -- Update module README if public API changed -- Update config docs if defaults changed -- Update design docs if binary layouts changed - -### 6. Build & Verify - -```bash -mvn test -pl spector-{module} # module tests -mvn clean install # full reactor -cd docs && python -m mkdocs build --clean # docs build -``` - -### 7. Commit - -Run the incremental-commits skill (`.agents/skills/incremental-commits/SKILL.md`): -- Group by component in dependency order -- Conventional Commits format - -### 8. Update Roadmap (if applicable) - -Run the update-roadmap skill (`.agents/skills/update-roadmap/SKILL.md`): -- Mark feature as completed if it was on the roadmap diff --git a/.agents/workflows/module-lifecycle.md b/.agents/workflows/module-lifecycle.md deleted file mode 100644 index e548c92..0000000 --- a/.agents/workflows/module-lifecycle.md +++ /dev/null @@ -1,129 +0,0 @@ ---- -description: Process for adding, removing, or renaming Maven modules in the Spector reactor. ---- - -# Workflow: Module Lifecycle - -Process for adding, removing, or renaming Maven modules in the Spector reactor. - -## Trigger - -When creating a new `spector-*` module, removing an obsolete module, or renaming/merging modules. - ---- - -## Adding a New Module - -### 1. Create Directory Structure - -``` -spector-{name}/ -├── pom.xml -├── README.md -└── src/ - ├── main/java/com/spectrayan/spector/{name}/ - └── test/java/com/spectrayan/spector/{name}/ -``` - -### 2. Create POM - -- Parent: `com.spectrayan:spector:0.1.0-SNAPSHOT` -- Add dependencies from the correct architecture layer -- Include `--add-modules jdk.incubator.vector` in compiler args if needed - -### 3. Add to Root POM - -Add `spector-{name}` to root `pom.xml` in the correct layer position. - -### 4. Create README - -Include: purpose, architecture, usage examples, API summary. - -### 5. 
Create Docs Page - -Create `docs/docs/modules/spector-{name}.md`: -```markdown ---8<-- "spector-{name}/README.md" -``` - -### 6. Update Nav - -Add to `docs/mkdocs.yml` under `Modules:` in correct position. - -### 7. Update Module Index - -Edit `docs/docs/modules/index.md`: -- Add to correct layer table -- Add to Mermaid architecture diagram -- Add to dependency graph diagram - -### 8. Verify - -```bash -mvn compile -pl spector-{name} -cd docs && python -m mkdocs build --clean -``` - -### 9. Commit - -``` -feat({name}): add spector-{name} module -``` - ---- - -## Removing a Module - -### 1. Delete Module - -```bash -rm -rf spector-{name}/ -``` - -### 2. Remove from Root POM - -Delete `spector-{name}` from root `pom.xml`. - -### 3. Remove Dependencies - -Grep for references in other module POMs: -```bash -grep -rn "spector-{name}" spector-*/pom.xml -``` - -### 4. Clean Docs - -- Delete `docs/docs/modules/spector-{name}.md` -- Remove nav entry from `docs/mkdocs.yml` -- Remove from `docs/docs/modules/index.md` tables and diagrams - -### 5. Grep for Stale References - -```bash -grep -rn "spector-{name}" docs/ scripts/ *.md -``` - -### 6. Verify - -```bash -mvn clean compile -cd docs && python -m mkdocs build --clean -``` - -### 7. Commit - -``` -refactor: remove spector-{name} module - - -``` - ---- - -## Renaming / Merging Modules - -Follow "Adding" for the new name, then "Removing" for the old name. Commit separately: - -1. `feat({new}): add spector-{new} module` -2. Migration commit (move code, update imports) -3. `refactor: remove spector-{old} module (merged into spector-{new})` diff --git a/.agents/workflows/perf-investigation.md b/.agents/workflows/perf-investigation.md deleted file mode 100644 index 05d5e67..0000000 --- a/.agents/workflows/perf-investigation.md +++ /dev/null @@ -1,85 +0,0 @@ -# Workflow: Performance Investigation - -Process for investigating performance regressions or optimizing hot paths in Spector. 
- -## Trigger - -When investigating slow queries, memory regressions, SIMD inefficiencies, or the user requests performance analysis. - -## Steps - -### 1. Identify the Hot Path - -Determine which component is slow: -- **Search latency** → `spector-index` (HNSW traversal, SIMD similarity) -- **Ingestion throughput** → `spector-engine` / `spector-ingestion` pipeline -- **Memory operations** → `spector-memory` (WAL writes, synapse lookup) -- **Startup time** → `spector-runtime` / `spector-engine` (index loading) - -### 2. Baseline Benchmark - -```bash -mvn package -pl spector-bench -DskipTests -java --add-modules jdk.incubator.vector \ - -jar spector-bench/target/benchmarks.jar \ - {BenchmarkClass} -f 1 -wi 3 -i 5 -``` - -Record baseline numbers before any changes. - -### 3. Profile - -Use JFR (Java Flight Recorder): -```bash -java -XX:StartFlightRecording=filename=profile.jfr,duration=60s ... -``` - -Look for: -- Heap allocations in hot path (`new float[]`, `toArray()`, boxing) -- Lock contention (`ReentrantLock` wait time) -- Virtual thread pinning (should not happen if no `synchronized`) -- Unnecessary `MemorySegment` copies - -### 4. Analyze Code - -Check against performance rules: - -- [ ] No heap allocations in similarity/search loops -- [ ] `MemorySegment` slices used (not `.toArray()`) -- [ ] SIMD uses `FloatVector.SPECIES_PREFERRED` -- [ ] SIMD loop bound via `SPECIES.loopBound()` with scalar tail -- [ ] `VectorMask` for partial-lane handling (not branching) -- [ ] Arena lifecycle correct (`ofShared` vs `ofConfined`) -- [ ] Buffers reused, not allocated per-call -- [ ] No `String.format()` in hot loops (use SLF4J parameterized logging) - -### 5. Optimize - -Apply the fix following coding standards. 
Common optimizations: - -| Problem | Fix | -|---|---| -| `float[]` allocation in loop | Pre-allocate and reuse buffer | -| `segment.toArray()` | Use `segment.asSlice()` | -| Scalar similarity | Replace with SIMD `FloatVector` | -| `String.format` in loop | Move outside or use SLF4J `{}` | -| Synchronized lock | Replace with `ReentrantLock` | - -### 6. Benchmark After - -Run the same benchmark from Step 2. Compare: -- Throughput (ops/sec) -- Latency (avg, p99) -- Allocation rate (bytes/op) - -### 7. Commit - -``` -perf({module}): <description> - -Before: {N} ops/sec, p99={X}ms -After: {M} ops/sec, p99={Y}ms -Improvement: {Z}% -``` - -Include JMH numbers in commit body. This is mandatory for `perf:` commits. diff --git a/.agents/workflows/pr-review.md b/.agents/workflows/pr-review.md deleted file mode 100644 index 17742c0..0000000 --- a/.agents/workflows/pr-review.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -description: Structured pull request review process ensuring code quality, architecture compliance, and test coverage before merge. ---- - -# Workflow: PR Review - -Structured pull request review process ensuring code quality, architecture compliance, and test coverage before merge. - -## Trigger - -When reviewing a pull request, inspecting a diff before push, or the user requests a code review. - -## Steps - -### 1. Scope the Diff - -```bash -git diff main...HEAD --stat -``` - -Count changed files, identify affected modules, classify risk: -- **High risk:** core, index, storage (SIMD, Panama, hot paths) -- **Medium risk:** engine, memory, query (business logic) -- **Low risk:** docs, bench, scripts - -### 2. Run Code Review Skill - -Execute the full 8-step code review (`.agents/skills/code-review/SKILL.md`): -1. Scope → 2. Architecture → 3. Java Standards → 4. Performance → 5. Tests → 6. Docs → 7. Git Hygiene → 8. Summary - -### 3. Run Tests - -```bash -mvn test # full suite -mvn test -pl spector-{module} # changed modules only -``` - -### 4. 
Verify Docs - -If documentation changed: -```bash -cd docs && python -m mkdocs build --clean 2>&1 | grep WARNING -``` - -### 5. Generate Verdict - -Produce a structured review summary with: -- ✅ Passed checks -- ⚠️ Warnings (non-blocking) -- ❌ Blockers (must fix) -- Verdict: `APPROVE` / `REQUEST_CHANGES` / `NEEDS_DISCUSSION` - -### 6. Follow Up - -- All blockers must be resolved before merge -- PRs are squash-merged to keep history clean -- Commit message for squash should follow Conventional Commits diff --git a/.agents/workflows/release-prep.md b/.agents/workflows/release-prep.md deleted file mode 100644 index a7b89d6..0000000 --- a/.agents/workflows/release-prep.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -description: End-to-end process for preparing a Spector release — test verification, changelog, version bump, docs, and tagging. ---- - -# Workflow: Release Preparation - -End-to-end process for preparing a Spector release — test verification, changelog, version bump, docs, and tagging. - -## Trigger - -When preparing for a tagged release or the user requests release preparation. - -## Steps - -### 1. Test Gap Analysis - -Identify modules with missing test coverage: - -```bash -# Count production vs test files per module -for dir in spector-*/; do - main=$(find "$dir/src/main" -name "*.java" 2>/dev/null | wc -l) - test=$(find "$dir/src/test" -name "*.java" 2>/dev/null | wc -l) - echo "$dir main=$main test=$test" -done -``` - -Flag critical gaps (0 tests in production modules). - -### 2. Full Build - -```bash -mvn clean install -``` - -All tests must pass. Zero tolerance for failures. - -### 3. Dependency Audit - -```bash -# Check for circular dependencies -grep -rn "import com.spectrayan.spector.engine" spector-memory/src/ -grep -rn "import com.spectrayan.spector.memory" spector-engine/src/ - -# Verify no SNAPSHOT dependencies in release -grep -rn "SNAPSHOT" spector-*/pom.xml -``` - -### 4. 
Generate Changelog - -From commit history since last tag: - -```bash -git log --oneline $(git describe --tags --abbrev=0)..HEAD -``` - -Group entries by type: -- **Added** — `feat:` commits -- **Changed** — `refactor:` commits -- **Fixed** — `fix:` commits -- **Performance** — `perf:` commits -- **Removed** — deletion commits - -Prepend to `CHANGELOG.md` with version header and date. - -### 5. Version Bump - -Update version in root `pom.xml` (child POMs inherit via parent). - -### 6. Update Roadmap - -Use update-roadmap skill to mark completed features as done. - -### 7. Docs Verification - -```bash -cd docs && python -m mkdocs build --clean -``` - -Zero warnings for controlled files. - -### 8. Tag & Commit - -```bash -git add -A -git commit -m "chore: prepare release v{version}" -git tag -a v{version} -m "Release v{version}" -``` diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 56deff4..34698c8 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,6 +1,6 @@ --- name: Bug report -about: Create a report to help us improve Spector +about: Create a report to help us improve Spector-Search title: '' labels: 'bug' assignees: '' @@ -24,7 +24,7 @@ A clear and concise description of what you expected to happen. - OS: [e.g. Ubuntu 22.04, Windows 11, macOS 14] - JDK version: [e.g. OpenJDK 25] - SIMD capability: [e.g. S_256_BIT / AVX2] -- Spector version: [e.g. 0.1.0] +- Spector-Search version: [e.g. 0.1.0] **Logs / Stack Traces** If applicable, add relevant log output or stack traces. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index f920a7d..7a7e8a9 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,6 +1,6 @@ --- name: Feature request -about: Suggest an idea for Spector +about: Suggest an idea for Spector-Search title: '' labels: 'enhancement' assignees: '' @@ -17,7 +17,7 @@ A clear and concise description of what you want to happen. A clear and concise description of any alternative solutions or features you've considered. **Module(s) affected** -Which module(s) would this feature impact? (e.g. spector-core, spector-index, spector-node) +Which module(s) would this feature impact? (e.g. spector-core, spector-index, spector-server) **Additional context** Add any other context, benchmarks, or research papers about the feature request here. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 68e8b72..c04d83a 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -20,7 +20,7 @@ - [ ] `spector-index` (HNSW / BM25) - [ ] `spector-query` (query orchestration) - [ ] `spector-engine` (engine facade) -- [ ] `spector-node` (REST API) +- [ ] `spector-server` (REST API) - [ ] `spector-bench` (benchmarks) ## Checklist diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5ae1c5a..ac576fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,11 +33,6 @@ jobs: distribution: ${{ env.JAVA_DISTRIBUTION }} cache: 'maven' - # ─── License Header Check ──────────────────────────────────────── - - name: Check license headers - run: | - mvn -B license:check --no-transfer-progress - # ─── Reproducible Build ─────────────────────────────────────────── - name: Build with reproducible output run: | @@ -58,9 +53,8 @@ jobs: # ─── Dependency Pinning Verification ───────────────────────────── - name: Verify no dynamic version ranges run: | - # Fail if any 
external dependency uses dynamic ranges like [1.0,2.0) or LATEST/RELEASE/SNAPSHOT - # Exclude internal modules (com.spectrayan) and Maven reactor build lines - if mvn -B dependency:tree --no-transfer-progress | grep -E '\[(.*,.*)\]|\[.*,\)|\(.*,.*\]|LATEST|RELEASE|SNAPSHOT' | grep -v 'com.spectrayan' | grep -v 'Building ' | grep -v 'Reactor Summary'; then + # Fail if any dependency uses dynamic ranges like [1.0,2.0) or LATEST/RELEASE + if mvn -B dependency:tree --no-transfer-progress | grep -E '\[(.*,.*)\]|\[.*,\)|\(.*,.*\]|LATEST|RELEASE|SNAPSHOT' | grep -v 'spector-search'; then echo "::error::Dynamic version ranges detected in dependencies. All versions must be pinned." exit 1 fi @@ -69,7 +63,7 @@ jobs: # ─── Test Results ──────────────────────────────────────────────── - name: Upload test results if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: test-results path: '**/target/surefire-reports/*.xml' @@ -136,7 +130,7 @@ jobs: - name: Upload build provenance if: success() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: build-provenance path: build-provenance.json @@ -145,7 +139,7 @@ jobs: # ─── Upload JARs ───────────────────────────────────────────────── - name: Upload build artifacts if: success() && github.event_name == 'push' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: jars path: '**/target/*.jar' diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index e3923c9..fbb23f3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -2,26 +2,13 @@ name: Deploy Documentation on: push: - branches: - - main - - 'labs/**' + branches: [ main ] paths: - 'docs/**' - - 'scripts/collect-labs.sh' - - 'LABS.md' workflow_dispatch: - inputs: - wiki_only: - description: 'Sync wiki only (skip pages deploy)' - required: false - default: 'false' - type: choice - options: - - 'false' - - 'true' permissions: - contents: write + 
contents: read pages: write id-token: write @@ -32,12 +19,9 @@ concurrency: jobs: build: runs-on: ubuntu-latest - if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.wiki_only == 'true') }} steps: - name: Checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 # Full history — needed to access labs/* branches - name: Set up Python uses: actions/setup-python@v5 @@ -46,10 +30,7 @@ jobs: - name: Install MkDocs and dependencies run: | - pip install mkdocs-material pymdown-extensions mkdocs-callouts - - - name: Collect Labs branches - run: ./scripts/collect-labs.sh + pip install mkdocs-material pymdown-extensions - name: Build documentation run: mkdocs build @@ -66,212 +47,7 @@ jobs: url: ${{ steps.deployment.outputs.page_url }} runs-on: ubuntu-latest needs: build - if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.wiki_only == 'true') }} steps: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v4 - - sync-wiki: - runs-on: ubuntu-latest - needs: [deploy] - if: ${{ always() && (needs.deploy.result == 'success' || needs.deploy.result == 'skipped') }} - steps: - - name: Checkout main repo - uses: actions/checkout@v4 - with: - path: main - - - name: Checkout wiki - uses: actions/checkout@v4 - with: - repository: ${{ github.repository }}.wiki - path: wiki - token: ${{ secrets.GITHUB_TOKEN }} - - - name: Sync docs to wiki - run: | - python3 << 'PYSCRIPT' - import yaml, os, shutil, glob, re - - DOCS_DIR = 'main/docs/docs' - WIKI_DIR = 'wiki' - MKDOCS_YML = 'main/docs/mkdocs.yml' - SCREENSHOTS_DIR = 'main/docs/screenshots' - - # ── MkDocs YAML loader (ignores !!python/name: tags) ──────── - class MkDocsLoader(yaml.SafeLoader): - pass - - def _ignore_python_tags(loader, tag_suffix, node): - return None - - MkDocsLoader.add_multi_constructor( - 'tag:yaml.org,2002:python/', _ignore_python_tags - ) - - # ── Single source of truth: path → wiki page name ────────── - SPECIAL_NAMES = { - 'index.md': 'Home', - 'about.md': 'About', - 
'faq.md': 'FAQ', - 'roadmap.md': 'Roadmap', - } - - def path_to_wiki_name(rel_path): - """Convert a docs-relative path to a wiki page name. - - Examples: - index.md → Home - about.md → About - getting-started/quickstart.md → Getting-Started--Quickstart - architecture/overview.md → Architecture--Overview - memory/index.md → Memory - cortex/index.md → Cortex - """ - if rel_path in SPECIAL_NAMES: - return SPECIAL_NAMES[rel_path] - - path = rel_path.replace('.md', '') - parts = path.split('/') - - # index.md in a subdirectory → use directory name only - if parts[-1] == 'index': - parts = parts[:-1] - - if not parts: - return 'Home' - - # Title-case each segment, join with '--' (double hyphen) - # to separate directory from filename. - # Within each segment, hyphens become spaces for title-casing, - # then go back to hyphens. - def title_case_segment(seg): - return '-'.join( - word.capitalize() - for word in seg.replace('-', ' ').split() - ) - - return '--'.join(title_case_segment(s) for s in parts) - - # ── 1. Clean wiki directory ───────────────────────────────── - for f in glob.glob(os.path.join(WIKI_DIR, '*.md')): - basename = os.path.basename(f) - if basename != '_Footer.md': - os.remove(f) - - for d in ['images', 'screenshots']: - p = os.path.join(WIKI_DIR, d) - if os.path.isdir(p): - shutil.rmtree(p) - - # ── 2. 
Copy all .md files with wiki names ─────────────────── - page_map = {} # rel_path → wiki_name (for link fixing) - - for root, dirs, files in os.walk(DOCS_DIR): - for fname in files: - full_path = os.path.join(root, fname) - rel_path = os.path.relpath(full_path, DOCS_DIR) - - if fname.endswith('.md'): - wiki_name = path_to_wiki_name(rel_path) - page_map[rel_path] = wiki_name - dest = os.path.join(WIKI_DIR, f'{wiki_name}.md') - shutil.copy2(full_path, dest) - print(f' 📄 {rel_path} → {wiki_name}.md') - - elif fname.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')): - dest_dir = os.path.join(WIKI_DIR, 'images', os.path.dirname(rel_path)) - os.makedirs(dest_dir, exist_ok=True) - shutil.copy2(full_path, os.path.join(dest_dir, fname)) - - # Copy top-level screenshots - if os.path.isdir(SCREENSHOTS_DIR): - dest = os.path.join(WIKI_DIR, 'screenshots') - shutil.copytree(SCREENSHOTS_DIR, dest, dirs_exist_ok=True) - - print(f'\n✅ Copied {len(page_map)} pages') - - # ── 3. Fix content in wiki pages ──────────────────────────── - for wiki_file in glob.glob(os.path.join(WIKI_DIR, '*.md')): - basename = os.path.basename(wiki_file) - if basename.startswith('_'): - continue - - with open(wiki_file, 'r', encoding='utf-8') as f: - content = f.read() - - # Remove YAML frontmatter - content = re.sub(r'^---\n.*?\n---\n', '', content, count=1, flags=re.DOTALL) - - # Fix image paths - content = content.replace('](../screenshots/', '](screenshots/') - content = content.replace('](../../screenshots/', '](screenshots/') - - # Convert MkDocs admonitions to blockquotes - content = re.sub( - r'^!!! (\w+) "([^"]*)"', - r'> **\1:** \2', - content, flags=re.MULTILINE - ) - content = re.sub( - r'^!!! 
quote "([^"]*)"', - r'> **\1**', - content, flags=re.MULTILINE - ) - - # Convert tabs to headings - content = re.sub(r'^=== "([^"]*)"', r'### \1', content, flags=re.MULTILINE) - - # Convert snippets - content = re.sub(r'^--8<-- "([^"]*)"', r'> *See: \1*', content, flags=re.MULTILINE) - - with open(wiki_file, 'w', encoding='utf-8') as f: - f.write(content) - - # ── 4. Generate _Sidebar.md from mkdocs nav ───────────────── - with open(MKDOCS_YML, 'r') as f: - config = yaml.load(f, Loader=MkDocsLoader) - - nav = config.get('nav', []) - - def write_nav(items, depth, out): - indent = ' ' * depth - for item in items: - if isinstance(item, str): - wiki_name = path_to_wiki_name(item) - out.append(f'{indent}- [[{wiki_name}]]') - elif isinstance(item, dict): - for title, value in item.items(): - if isinstance(value, str): - wiki_name = path_to_wiki_name(value) - out.append(f'{indent}- [[{wiki_name}|{title}]]') - elif isinstance(value, list): - out.append(f'{indent}- **{title}**') - write_nav(value, depth + 1, out) - - sidebar_lines = [ - '**[🏠 Home](Home)**', - '', - '---', - '', - ] - write_nav(nav, 0, sidebar_lines) - - sidebar_path = os.path.join(WIKI_DIR, '_Sidebar.md') - with open(sidebar_path, 'w', encoding='utf-8') as f: - f.write('\n'.join(sidebar_lines) + '\n') - - print(f'✅ Generated _Sidebar.md with {len(sidebar_lines)} lines') - PYSCRIPT - - echo "✅ Wiki sync complete: $(ls -1 wiki/*.md | wc -l) pages" - - - name: Push wiki changes - run: | - cd wiki - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add -A - git diff --cached --quiet || git commit -m "docs: sync from main repo docs [skip ci]" - git push diff --git a/.gitignore b/.gitignore index 2db387f..fe0daf2 100644 --- a/.gitignore +++ b/.gitignore @@ -29,9 +29,7 @@ Desktop.ini dependency-reduced-pom.xml buildNumber.properties .mvn/timing.properties -.mvn/maven.config .mvn/wrapper/maven-wrapper.jar -.mvn # ──────────── Logs ──────────── 
*.log @@ -41,19 +39,3 @@ logs/ *.mmap *.vec *.dat -embedding-cache/ -.spector/ - -# ──────────── User config ──────────── -spector-local.yml -!spector.yml.example -!**/src/main/resources/spector-defaults.yml -!**/src/test/resources/spector-defaults.yml - -# ──────────── Documentation build ──────────── -docs/site/ -docs/docs/labs/* -!docs/docs/labs/roadmap.md -RnD - -.scratch/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index a63bd91..98eed4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,35 +51,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **spector-engine:** Document deletion support (`delete()` method) - **spector-engine:** Auto-embed ingestion, chunked ingestion, and streaming file ingestion - **spector-engine:** IVF-PQ auto-training with buffered vector accumulation -- **spector-node:** Armeria REST API with virtual threads -- **spector-node:** CORS support via bundled plugin -- **spector-node:** Optional API key authentication (`X-API-Key` header) -- **spector-node:** Auto-embed ingest endpoint (`/api/v1/ingest/auto`) -- **spector-node:** Bulk ingest endpoint (`/api/v1/ingest/bulk`) -- **spector-node:** Document deletion endpoint (`DELETE /api/v1/documents/{id}`) -- **spector-node:** Metrics endpoint (`/api/v1/metrics`) -- **spector-node:** Vector dimension validation on ingest -- **spector-node:** gRPC-based distributed search with coordinator/shard fan-out -- **spector-node:** `ClusterCoordinator` with parallel shard queries and result merging -- **spector-node:** `RemoteShardClient` with TLS support (mutual TLS optional) -- **spector-node:** `ShardNode` gRPC server wrapping a local SpectorEngine -- **spector-node:** `ClusterConfig` with consistent hash and range partitioning +- **spector-server:** Javalin REST API with virtual threads +- **spector-server:** CORS support via bundled plugin +- **spector-server:** Optional API key authentication (`X-API-Key` header) +- 
**spector-server:** Auto-embed ingest endpoint (`/api/v1/ingest/auto`) +- **spector-server:** Bulk ingest endpoint (`/api/v1/ingest/bulk`) +- **spector-server:** Document deletion endpoint (`DELETE /api/v1/documents/{id}`) +- **spector-server:** Metrics endpoint (`/api/v1/metrics`) +- **spector-server:** Vector dimension validation on ingest +- **spector-cluster:** gRPC-based distributed search with coordinator/shard fan-out +- **spector-cluster:** `ClusterCoordinator` with parallel shard queries and result merging +- **spector-cluster:** `RemoteShardClient` with TLS support (mutual TLS optional) +- **spector-cluster:** `ShardNode` gRPC server wrapping a local SpectorEngine +- **spector-cluster:** `ClusterConfig` with consistent hash and range partitioning - **spector-bench:** JMH benchmarks for SIMD kernels, HNSW, BM25, ingestion, IVF-PQ, concurrency - **spector-bench:** `PerformanceTestRunner` for comprehensive latency/throughput reporting - 316+ tests across all modules, all passing -### Added — spector-mcp (Agent-Native MCP Server) -- **spector-mcp:** Built-in Model Context Protocol (MCP) server for AI agent integration (Claude Desktop, Cursor, autonomous agents) -- **spector-mcp:** 6 MCP tools: `semantic_search`, `hybrid_search`, `rag_query`, `ingest_document`, `delete_document`, `engine_status` -- **spector-mcp:** `McpToolHandler` abstract base class with template method pattern (timing, error handling, arg parsing) -- **spector-mcp:** `ToolSchemaBuilder` — type-safe fluent builder for JSON schemas (replaces error-prone `Map.of()` literals) -- **spector-mcp:** `SpectorToolRegistry` — tool discovery and registration with Open/Closed Principle -- **spector-mcp:** `SpectorResourceProvider` and `SpectorPromptProvider` — MCP resource/prompt definitions -- **spector-mcp:** `ResultFormatter` — shared formatting utilities for search results, RAG context, engine status -- **spector-mcp:** `SpectorMcpMain` CLI entry point with Ollama embedding provider auto-detection 
-- **spector-mcp:** In-process MCP execution with zero network overhead (50–200µs per tool call) -- **spector-mcp:** 15 unit tests covering tool registry, all tool handlers, schema builder, and argument validation - ### Technical Decisions - Java 25 with `jdk.incubator.vector` for SIMD - `FloatVector.SPECIES_PREFERRED` for ISA-agnostic code diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f24d29..c185962 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,11 +1,10 @@ -# Contributing to Spector +# Contributing to Spector-Search -Thank you for your interest in contributing to Spector! This document provides guidelines and instructions for contributing. +Thank you for your interest in contributing to Spector-Search! This document provides guidelines and instructions for contributing. ## Table of Contents - [Code of Conduct](#code-of-conduct) -- [Contributor License Agreement](#contributor-license-agreement) - [Getting Started](#getting-started) - [Development Setup](#development-setup) - [Making Changes](#making-changes) @@ -17,34 +16,6 @@ Thank you for your interest in contributing to Spector! This document provides g This project adheres to the [Contributor Covenant Code of Conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. Please report unacceptable behavior to [support@spectrayan.com](mailto:support@spectrayan.com). -## Contributor License Agreement - -By contributing to Spector, you agree that: - -1. **You have the right** to submit the contribution. The code is your original work, or you have permission to submit it under the project's license terms. - -2. **You grant Spectrayan** a perpetual, worldwide, non-exclusive, royalty-free, irrevocable license to use, reproduce, modify, distribute, and sublicense your contribution under: - - The **Apache License 2.0** for all modules except `spector-memory`. 
- - The **Business Source License 1.1** for the `spector-memory` module (which transitions to Apache 2.0 on the Change Date specified in its LICENSE file). - -3. **You understand** that your contribution becomes part of the project and may be distributed under the project's current or future license terms as described above. - -### How to Sign Off - -All commits must include a `Signed-off-by` line certifying this agreement. Use the `-s` flag when committing: - -```bash -git commit -s -m "feat(core): add new SIMD kernel" -``` - -This adds a line like: - -``` -Signed-off-by: Your Name -``` - -> **Note:** Pull requests without signed-off commits will not be merged. - ## Getting Started 1. **Fork** the repository on GitHub @@ -67,8 +38,8 @@ Signed-off-by: Your Name ```bash # Clone your fork -git clone https://github.com//spector.git -cd spector +git clone https://github.com//spector-search.git +cd spector-search # Verify JDK 25+ is installed java -version @@ -80,12 +51,12 @@ mvn clean compile mvn test # Run the server (optional) -mvn exec:java -pl spector-node -Dexec.mainClass="com.spectrayan.spector.server.SpectorNode" +mvn exec:java -pl spector-server -Dexec.mainClass="com.spectrayan.spector.server.SpectorServer" ``` ### SIMD Verification -Spector uses the Java Vector API for SIMD acceleration. Verify your system supports it: +Spector-Search uses the Java Vector API for SIMD acceleration. 
Verify your system supports it: ```bash # Check SIMD capability @@ -191,7 +162,7 @@ docs: add benchmark results to README ### Bug Reports -Use the [Bug Report template](https://github.com/spectrayan/spector/issues/new?template=bug_report.md) and include: +Use the [Bug Report template](https://github.com/spectrayan/spector-search/issues/new?template=bug_report.md) and include: - Steps to reproduce - Expected vs actual behavior @@ -200,7 +171,7 @@ Use the [Bug Report template](https://github.com/spectrayan/spector/issues/new?t ### Feature Requests -Use the [Feature Request template](https://github.com/spectrayan/spector/issues/new?template=feature_request.md) and describe: +Use the [Feature Request template](https://github.com/spectrayan/spector-search/issues/new?template=feature_request.md) and describe: - The problem you're trying to solve - Your proposed solution @@ -208,11 +179,11 @@ Use the [Feature Request template](https://github.com/spectrayan/spector/issues/ ## Questions? -- **General questions:** Open a [Discussion](https://github.com/spectrayan/spector/discussions) -- **Bug reports:** Open an [Issue](https://github.com/spectrayan/spector/issues) +- **General questions:** Open a [Discussion](https://github.com/spectrayan/spector-search/discussions) +- **Bug reports:** Open an [Issue](https://github.com/spectrayan/spector-search/issues) - **Security vulnerabilities:** See [SECURITY.md](SECURITY.md) - **Email:** [developer@spectrayan.com](mailto:developer@spectrayan.com) --- -Thank you for contributing to Spector! ⚡ +Thank you for contributing to Spector-Search! ⚡ diff --git a/NOTICE b/NOTICE index d3079a8..76e5fa2 100644 --- a/NOTICE +++ b/NOTICE @@ -1,61 +1,42 @@ -Spector +Spector-Search Copyright 2026 Spectrayan This product includes software developed by Spectrayan (https://www.spectrayan.com/). 
-================================================================================ -LICENSE STRUCTURE -================================================================================ - -This repository utilizes a split licensing model: - - 1. The "spector-memory" module (located under the spector-memory/ directory) - is licensed under the Business Source License 1.1 (BSL 1.1). Under the - terms of the BSL 1.1, you are granted non-production use rights, with - an Additional Use Grant permitting production use except for offering - the module as a managed service or embedding it in a competing AI - cognitive memory product. On the Change Date (May 27, 2030), this module - automatically transitions to the Apache License 2.0. - Please see spector-memory/LICENSE for details. - - 2. All other directories, modules, and core infrastructure in this repository - are licensed under the Apache License 2.0. - Please see the root LICENSE file for details. - ================================================================================ ATTRIBUTION NOTICE ================================================================================ This software is the original work of the Spectrayan team. If you use -Spector in your own projects, deployments, or services, you MUST +Spector-Search in your own projects, deployments, or services, you MUST provide visible attribution to the Spectrayan team. This attribution must include: - 1. The text "Powered by Spector" or "Built with Spector" in + 1. The text "Powered by Spector-Search" or "Built with Spector-Search" in your application's documentation, about page, or equivalent visible location. - 2. A link to the Spector GitHub repository: - https://github.com/spectrayan/spector + 2. 
A link to the Spector-Search GitHub repository: + https://github.com/spectrayan/spector-search ================================================================================ TRADEMARK POLICY ================================================================================ -"Spector", "Spectrayan", the Spectrayan logo, and associated branding +"Spector-Search", "Spectrayan", the Spectrayan logo, and associated branding are trademarks of Spectrayan. This license does NOT grant you permission to: - - Use the names "Spector" or "Spectrayan" as your product name + - Use the names "Spector-Search" or "Spectrayan" as your product name - Present this software as your own original creation - Remove or obscure the Spectrayan attribution notices - Use the Spectrayan logos or branding in your own marketing materials - Offer this software as a commercial SaaS product under a different brand without prior written agreement from Spectrayan -You MAY use the names "Spector" and "Spectrayan" solely to: +You MAY use the names "Spector-Search" and "Spectrayan" solely to: - - Describe that your software is based on or derived from Spector + - Describe that your software is based on or derived from Spector-Search - Give credit to the original authors as required by this NOTICE file - Link back to the official repository @@ -65,64 +46,13 @@ For trademark licensing inquiries: legal@spectrayan.com THIRD-PARTY NOTICES ================================================================================ -This product includes software developed by the following open-source projects. -Dependency versions are managed in the root pom.xml. +This product includes software developed by the following open-source projects: -NOTE: The core engine modules (spector-core, spector-storage, spector-index, -spector-query, spector-engine, spector-memory) have ZERO external dependencies -beyond the JDK itself. 
All SIMD acceleration, off-heap storage, and vector -indexing use only standard JDK APIs (Vector API, Panama FFM, Virtual Threads). - -The third-party libraries listed below are used only by the server, CLI, -MCP, and integration modules: - -Runtime (all distribution modes): - - - Jackson 3.x (https://github.com/FasterXML/jackson) — Apache 2.0 - - Jackson 2.x (https://github.com/FasterXML/jackson) — Apache 2.0 - Used by MCP SDK and Javalin for JSON serialization. + - Javalin (https://javalin.io) — Apache 2.0 + - Jackson (https://github.com/FasterXML/jackson) — Apache 2.0 - SLF4J (https://www.slf4j.org/) — MIT - Logback (https://logback.qos.ch/) — EPL 1.0 / LGPL 2.1 - - SnakeYAML (https://bitbucket.org/snakeyaml/snakeyaml/) — Apache 2.0 - - Apache Commons Configuration 2 (https://commons.apache.org/configuration/) — Apache 2.0 - - Apache Commons BeanUtils (https://commons.apache.org/beanutils/) — Apache 2.0 - -Server/Node mode only (spector-node): - - - Armeria (https://armeria.dev/) — Apache 2.0 - HTTP/gRPC server framework built on Netty. - - Netty (https://netty.io/) — Apache 2.0 - Transitive dependency via Armeria. - - gRPC Java (https://grpc.io/) — Apache 2.0 - - Protocol Buffers (https://protobuf.dev/) — BSD 3-Clause - - Javalin (https://javalin.io/) — Apache 2.0 - -Metrics & Observability (spector-metrics, spector-node): - - - Micrometer Core (https://micrometer.io/) — Apache 2.0 - - Micrometer Prometheus Registry (https://micrometer.io/) — Apache 2.0 - -MCP Agent Integration (spector-mcp): - - - MCP SDK (https://github.com/modelcontextprotocol/java-sdk) — MIT - Official Anthropic Model Context Protocol Java SDK. 
- -Spring Integration (spector-spring module): - - - Spring Framework (https://spring.io/projects/spring-framework) — Apache 2.0 - - Spring Boot (https://spring.io/projects/spring-boot) — Apache 2.0 - - Spring AI (https://spring.io/projects/spring-ai) — Apache 2.0 - -Test Dependencies (not distributed): - - JUnit 5 (https://junit.org/junit5/) — EPL 2.0 - AssertJ (https://assertj.github.io/doc/) — Apache 2.0 - - JMH (https://openjdk.org/projects/code-tools/jmh/) — GPL 2.0 + CE - -JDK APIs (bundled with OpenJDK, not separately distributed): - - - Java Vector API (JEP 489) — incubator module (jdk.incubator.vector) - - Panama Foreign Function & Memory API (JEP 454) — finalized - - Virtual Threads (JEP 444) — finalized - - Structured Concurrency (JEP 505) — preview - + - JMH (https://openjdk.java.net/projects/code-tools/jmh/) — GPL 2.0 + CE + - OpenJDK Vector API (https://openjdk.java.net/jeps/338) — GPL 2.0 + CE diff --git a/README.md b/README.md index 5425f29..d881c8a 100644 --- a/README.md +++ b/README.md @@ -1,322 +1,61 @@ -# ⚡ Spector +# Spector-Search ⚡ -> **The Zero-Overhead, Agent-Ready AI Memory Backbone.** -> -> Legacy search engines bolted vectors onto text databases. Spector is designed from the ground up for modern AI — leveraging Java Project Panama to achieve C++ bare-metal SIMD speeds natively, with a built-in Model Context Protocol (MCP) server that turns any AI agent into a search-powered reasoning machine. +> Ultra-fast, SIMD-accelerated semantic search engine built on Java Vector API + modern JVM technologies. 
[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE) [![Java](https://img.shields.io/badge/Java-25-orange.svg)](https://openjdk.org/) -[![Build](https://img.shields.io/github/actions/workflow/status/spectrayan/spector/ci.yml?branch=main)](https://github.com/spectrayan/spector/actions) -[![MCP](https://img.shields.io/badge/MCP-Agent_Ready-blueviolet.svg)](spector-mcp/) -[![Docs](https://img.shields.io/badge/Docs-MkDocs-blue?logo=materialformkdocs)](https://spectrayan.github.io/spector/) - -## 🧠 Why Spector? - -### 1. 🤖 Agent-Native (MCP Protocol) - -Includes a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server. Plug Claude Desktop, Cursor, or autonomous agents directly into Spector for native RAG memory. **Zero Python glue-code required.** - -```mermaid -graph LR - A["🤖 AI Agent"] -->|"JSON-RPC (stdio)"| B["⚡ SpectorMcpServer"] - B -->|"Virtual Thread"| C["SpectorEngine.search()"] - C -->|"Direct method call"| D["Off-heap MemorySegment + SIMD"] - D -->|"88µs p50"| E["✅ Results"] - - style A fill:#6c5ce7,color:white - style B fill:#00b894,color:white - style E fill:#00b894,color:white -``` - -> **23–113× faster** than Python MCP servers — zero network overhead, zero GC pressure. [Benchmarked ↓](#-benchmarks) - -### 2. ⚡ SpectorQuant — SVASQ (Spector Vector-Aligned Scalar Quantization) - -A proprietary SIMD-first quantization engine that mathematically smears dimensional outliers via the Fast Walsh-Hadamard Transform (FWHT) and executes Asymmetric Distance Computation inside the IVF residual space. **Float32 recall at INT8 memory sizes.** - -- SVASQ-8: 4× compression, 99.5%+ recall -- SVASQ-4: 6–8× compression, 97–99% recall (with 3× rescore) -- IVF-PQ: 32× compression for billion-scale datasets - -### 3. 🧊 100% Off-Heap Panama Execution - -Bypasses the JVM Garbage Collector entirely. Maps raw disk bytes directly into hardware SIMD registers for sub-millisecond, Zero-Copy latency. 
- -- **Zero Network Tax** — runs in-process, no gRPC/HTTP roundtrip -- **Zero Serialization Tax** — bytes → AVX-512 registers directly, no JSON, no Protobuf -- **Zero GC Pressure** — all vector data lives off-heap via Panama `MemorySegment` - -### 4. 📦 Embedded or Standalone - -Deploy as a lightweight embedded library (the **"DuckDB of Vector DBs"**) inside your application, or scale it horizontally as a standalone server with REST API, gRPC clustering, and Spring AI integration. - ---- - -## 🤖 MCP Integration (Agent-Native) - -Give any AI agent instant access to Spector's SIMD-accelerated search engine — with zero network overhead. - -### MCP Tools - -**Search Tools (always available):** - -| Tool | Description | -|:---|:---| -| `semantic_search` | Semantic similarity search with auto-embedding | -| `hybrid_search` | Combined keyword (BM25) + vector search with RRF | -| `rag_query` | Retrieval-Augmented Generation with source citations | -| `ingest_document` | Document ingestion with auto-embedding + chunking | -| `delete_document` | Document deletion by ID | -| `engine_status` | Engine metadata, SIMD capabilities, GPU status | - -**Cognitive Memory Tools (enabled via `spector.memory.enabled: true`):** - -| Tool | Description | -|:---|:---| -| `core_memory_append` | Store a semantic memory with tags and source | -| `recall_context` | Cognitive recall with fused scoring across tiers | -| `memory_status` | Memory tier counts and persistence info | -| `memory_reinforce` | Report positive/negative outcome for a memory | -| `memory_forget` | Tombstone a memory by ID | -| `memory_introspect` | Metamemory self-analysis on a topic | -| `working_memory_scratchpad` | Quick-write to working memory | - -### Claude Desktop Configuration - -Add to your `claude_desktop_config.json`: - -```json -{ - "mcpServers": { - "spector": { - "command": "java", - "args": [ - "--add-modules", "jdk.incubator.vector", - "--enable-native-access=ALL-UNNAMED", - "--enable-preview", - "-jar", 
"/path/to/spector-dist/target/spector.jar", - "--config", "/path/to/spector.yml" - ] - } - } -} -``` - -### Why Spector MCP is Different - -| Feature | Python Vector DB MCP | **Spector MCP** | -|:---|:---|:---| -| Search latency | 2–10ms (network + Python GIL) | **88µs p50** (in-process SIMD) † | -| Network overhead | HTTP/gRPC round-trip | **Zero** (direct method call) | -| GC pauses | Python/JVM heap pressure | **≤0.01%** (100% off-heap Panama) † | -| Concurrent queries | Limited by Python GIL | **61,000 QPS** (Virtual Threads) † | -| Dependencies | Python framework stack | **Single JAR** (zero Python) | - -† *Measured on Intel Core Ultra 9 285K, Java 25, AVX2. See [Benchmarks](#-benchmarks).* - -> See the full [spector-mcp documentation](spector-mcp/README.md) for CLI options, Cursor IDE config, and troubleshooting. - ---- - -## 🧠 Cognitive Memory (`spector-memory`) - -Spector Memory is a biologically-inspired cognitive memory engine that gives AI agents the ability to **remember**, **forget**, **consolidate**, and **associate** — with microsecond latency and zero garbage collection pressure. - -| Brain Region | Package | Function | -|---|---|---| -| 🧠 Cerebral Cortex | `cortex/` | 4-tier memory (Working → Episodic → Semantic → Procedural) | -| 🔗 Synapses | `synapse/` | 32-byte header, 6-phase SIMD scoring, Bloom filter gating | -| ⚡ Dopamine | `dopamine/` | Surprise detection, auto-importance, flashbulb pinning | -| 😱 Amygdala | `amygdala/` | Emotional valence (positive/negative/neutral) | -| 🔄 Hebbian | `hebbian/` | "Neurons that fire together wire together" | -| 🛏️ Hippocampus | `hippocampus/` | Sleep consolidation, synaptic pruning, partition rebuild | -| 😴 Habituation | `habituation/` | Anti-filter bubble — penalizes repetitive recall | -| 🚫 Inhibition | `inhibition/` | Explicit memory suppression | - -**Key differentiators vs. Mem0, Letta, Zep:** -- **0.13ms** recall latency at 1M memories (vs. 
50–200ms) † -- **Zero GC** — 100% off-heap Panama storage (≤0.01% GC overhead measured) † -- **Fused scoring** — similarity × importance × decay in a single SIMD pass (no truncation trap) -- **Synaptic tag gating** — 64-bit Bloom filter eliminates 99% of candidates in 1 CPU cycle - -† *Measured. See [Benchmarks](#-benchmarks).* - -> 📖 See the full [Cognitive Memory documentation](docs/docs/memory/index.md) and the [module README](spector-memory/README.md). - ---- +[![Build](https://img.shields.io/github/actions/workflow/status/spectrayan/spector-search/ci.yml?branch=main)](https://github.com/spectrayan/spector-search/actions) ## ✨ Features - **🔥 SIMD-Accelerated** — Hardware-accelerated vector math via Java Vector API (AVX2/AVX-512/NEON) -- **🧠 Cognitive Memory** — Biologically-inspired 4-tier memory with fused SIMD scoring, synaptic tags, temporal decay, surprise detection, and sleep consolidation - **🧠 Hybrid Search** — Combines semantic vector search (HNSW) with keyword search (BM25) via Reciprocal Rank Fusion - **💾 Zero-Copy Storage** — Off-heap vector storage using Panama Foreign Function & Memory API - **🧵 Virtual Thread Native** — Designed for Project Loom's virtual threads, no `synchronized` blocks - **🎯 High Recall** — HNSW approximate nearest-neighbor search with configurable recall@K ≥ 80% - **⚡ Sub-Millisecond Queries** — Branchless SIMD kernels with masked tail handling - **🗜️ Multi-Level Quantization** — INT8 (4×), INT4 (8×), and INT2 (16×) scalar quantization with non-uniform calibration and configurable rescore -- **🗜️ SVASQ Quantization** — FWHT-rotated affine INT8 quantization with exact-norm header for high-accuracy zero-copy compression (retaining 99.5%+ recall) -- **🗜️ SVASQ-4 Quantization** — INT4 nibble-packed variant of SVASQ achieving 6–8× compression vs float32 with 97–99% recall (with 3× rescore) -- **🎯 SpectorIndex (IVF-HNSW-SVASQ)** — Multi-level adaptive vector index yielding 99.5%–100% recall on real text embeddings at aggressive 3% 
partition scanning rates - **🗜️ IVF-PQ Index** — Inverted file with product quantization for 32× memory compression at billion scale - **🤖 LLM Re-ranking** — Listwise relevance scoring via Ollama for precision-critical retrieval - **🖥️ GPU Acceleration** — CUDA kernel loader + SIMD batch similarity via Panama FFM -- **🌐 Distributed Search** — gRPC-based coordinator/shard fan-out with consistent hash partitioning (unified in `spector-node`) +- **🌐 Distributed Search** — gRPC-based coordinator/shard fan-out with consistent hash partitioning - **🧬 Embedding SPI** — Pluggable embedding providers (Ollama included out-of-the-box) - **📄 Chunked Ingestion** — Text, token-level, and streaming chunkers for large document support -- **🤖 MCP Server** — Built-in Model Context Protocol server for AI agent integration - ---- - -## 📊 Benchmarks - -All numbers measured on **Intel Core Ultra 9 285K** (24 cores), **Java 25.0.1**, AVX2 256-bit SIMD, 30GB heap. - -### Core Engine (in-process, 128-dim vectors) - -| Benchmark | Result | Notes | -|:---|:---|:---| -| Vector search p50 | **88–143µs** | 10K–100K docs, HNSW M=16 | -| In-process vs Python MCP | **23–113× faster** | 88µs vs 2–10ms | -| GC overhead | **0.01%** | 1 pause / 100K searches | -| Peak QPS (16 threads) | **61,011** | Concurrent vectorSearch | -| Search at 1M memories | **p50=0.13ms** | 15× better than 2ms target | -| Truncation trap recall loss | **100%** | Top-K-then-rerank loses all correct results | - -### Disk Persistence (4096-dim vectors, real Ollama embeddings) - -| Benchmark | Result | Notes | -|:---|:---|:---| -| DISK vs IN_MEMORY overhead | **2.3%** | mmap’d sharded store, near-zero cost | -| Cold-start latency | **11.3ms** | First search after JVM restart | -| Warm search p50 | **2.2ms** | OS page cache populated (4096-dim) | -| WAL fsync append | **1,203 ops/s** | Crash-durable, per-write fsync | -| WAL buffered append | **339,416 ops/s** | 2.9µs/op, no fsync | -| WAL concurrent (8 threads) | **222,586 
ops/s** | Multi-agent write scenario | -| Cognitive recall (Ollama) | **64ms** | End-to-end: embed + score + rank | - -### Run Benchmarks - -```bash -# Core performance (no external dependencies) -mvn exec:exec -pl spector-bench \ - -Dexec.mainClass=com.spectrayan.spector.bench.CorePerformanceBenchmark - -# Disk + Memory + WAL (requires Ollama with an embedding model) -mvn exec:exec -pl spector-bench \ - -Dexec.mainClass=com.spectrayan.spector.bench.DiskPersistenceBenchmark -``` - ---- ## 🏗 Architecture -```mermaid -graph LR - subgraph "🔬 Foundation" - core["spector-core
SIMD kernels"] - commons["spector-commons
Chunkers, tokenizer"] - config["spector-config
SpectorConfig + YAML"] - storage["spector-storage
Panama MemorySegment"] - end - - subgraph "🧠 Intelligence" - embedApi["spector-embed-api
Embedding SPI"] - embedOllama["spector-embed-ollama
Ollama provider"] - index["spector-index
HNSW + IVF-PQ + BM25"] - query["spector-query
Hybrid + RRF + rerank"] - gpu["spector-gpu
CUDA via Panama FFM"] - end - - subgraph "⚡ Engine" - rag["spector-rag
RAG pipeline"] - engine["spector-engine
Search facade"] - ingestion["spector-ingestion
File ingest pipeline"] - memory["spector-memory
Cognitive memory 🧠"] - end - - subgraph "🌐 Runtime & Interfaces" - runtime["spector-runtime
Composition root"] - node["spector-node
Armeria: REST + gRPC + SSE"] - mcp["spector-mcp
MCP Server (stdio)"] - cli["spector-cli
spectorctl"] - client["spector-client
Java SDK"] - spring["spector-spring
Spring AI"] - end - - subgraph "📦 Distribution" - metrics["spector-metrics
Prometheus + JVM"] - bench["spector-bench
JMH benchmarks"] - dist["spector-dist
Fat JAR"] - end +``` +spector-search/ +├── spector-core/ # SIMD kernels (DotProduct, Cosine, Euclidean, VectorOps) +├── spector-commons/ # Text chunkers, tokenizer, content extractor +├── spector-storage/ # Panama MemorySegment stores (InMemory + Mmap + Quantized) +├── spector-index/ # HNSW + IVF-PQ vector indexes + BM25 keyword index +│ ├── hnsw/ # HNSW graph-based ANN index (standard + quantized INT8/INT4/INT2) +│ ├── ivf/ # IVF inverted file index + quantized IVF-PQ +│ ├── pq/ # Product quantizer (K-Means++, ADC) +│ ├── text/ # BM25 keyword scoring + analyzers +│ └── fuzz/ # Index fuzz testing framework +├── spector-query/ # Hybrid orchestrator + RRF fusion + LLM re-ranking +├── spector-embed-api/ # EmbeddingProvider SPI +├── spector-embed-ollama/ # Ollama embedding provider implementation +├── spector-gpu/ # GPU acceleration (Panama FFM + CUDA) +├── spector-engine/ # Unified engine facade + lifecycle +├── spector-server/ # REST API (Javalin + virtual threads) +├── spector-cluster/ # Distributed gRPC search (coordinator + shards) +└── spector-bench/ # JMH benchmarks ``` ### Module Dependency Graph -```mermaid -graph TD - node["🌐 node"] --> runtime["⚡ runtime"] - node --> mcp["🤖 mcp"] - node --> metrics["📈 metrics"] - mcp --> runtime - mcp --> ingestion["📥 ingestion"] - cli["🖥️ cli"] --> runtime - cli --> client["📦 client"] - - runtime --> engine["⚡ engine"] - runtime --> memory["🧠 memory"] - runtime --> ingestion - - engine --> query["🔍 query"] - engine --> rag["🤖 rag"] - engine --> ingestion - engine --> index["📊 index"] - engine --> storage["💾 storage"] - engine --> embedapi["🧬 embed-api"] - engine -.-> gpu["🎮 gpu"] - - memory --> index - memory --> storage - memory --> ingestion - memory --> embedapi - memory --> core["🔬 core"] - - metrics --> engine - metrics --> memory - - ingestion --> config["⚙️ config"] - ingestion --> embedapi - - rag --> query - rag --> index - rag --> storage - rag --> embedapi - - query --> index - index --> storage - index --> 
config - storage --> config - storage --> core - config --> core - - embedapi --> commons["📄 commons"] - gpu --> core - gpu --> storage - - dist["📦 dist"] --> mcp - dist --> cli - dist --> runtime - - spring["🌱 spring"] --> engine - spring --> memory - spring --> metrics - bench["🧪 bench"] --> engine - bench --> memory ``` - -> **Legend:** Solid arrows = compile dependency. Dotted arrow (`gpu`) = optional dependency. - ---- +cluster → engine → query → index → core + → index → storage → core +server → engine +engine → gpu (optional) +engine → commons +engine → embed-api +gpu → core, storage +``` ## 🚀 Quick Start @@ -325,39 +64,24 @@ graph TD - **JDK 25+** (OpenJDK with Vector API incubator) - **Maven 3.9+** -> **⚠️ JDK API Note:** Spector leverages two JDK APIs that are not yet finalized — the **Vector API** (incubator, for SIMD acceleration) and **Structured Concurrency** (preview, for safe parallel tasks). Both require JVM flags (`--add-modules jdk.incubator.vector`, `--enable-preview`). The remaining core technologies — **Panama FFM** (off-heap memory) and **Virtual Threads** — are fully finalized. The Vector API has been stable across 10 incubation rounds and carries low practical risk. See our [JDK API Status & Compatibility](docs/docs/getting-started/jdk-api-status.md) page for details, migration paths, and FAQ. 
- ### Build & Test ```bash # Clone the repository -git clone https://github.com/spectrayan/spector.git -cd spector +git clone https://github.com/spectrayan/spector-search.git +cd spector-search -# Build and run all tests +# Build and run all tests (316+ tests) mvn clean test -# Build the distribution JAR (single JAR, all modules) -mvn package -pl spector-dist -am -DskipTests -``` - -### Run with Configuration +# Start the REST server +mvn exec:java -pl spector-server \ + -Dexec.mainClass="com.spectrayan.spector.server.SpectorServer" -All settings are read from `spector.yml` (see [Configuration Guide](docs/docs/configuration/parameters.md)): - -```bash -# Start the MCP server (for AI agents) -java --add-modules jdk.incubator.vector \ - --enable-native-access=ALL-UNNAMED --enable-preview \ - -jar spector-dist/target/spector.jar \ - --config spector.yml - -# Start the file ingestion pipeline -java --add-modules jdk.incubator.vector \ - --enable-native-access=ALL-UNNAMED --enable-preview \ - -cp spector-dist/target/spector.jar \ - com.spectrayan.spector.ingestion.FileIngestionMain \ - --config spector.yml --root . 
+# Start with API key authentication +mvn exec:java -pl spector-server \ + -Dexec.mainClass="com.spectrayan.spector.server.SpectorServer" \ + -Dexec.args="7070 384 my-secret-key" ``` ### REST API @@ -414,8 +138,6 @@ curl -X DELETE http://localhost:7070/api/v1/documents/doc-1 curl http://localhost:7070/api/v1/metrics ``` ---- - ## 🧩 Programmatic API ```java @@ -443,24 +165,6 @@ try (var engine = new SpectorEngine(config)) { } ``` -### SVASQ-4 Quantization (6–8× Compression) - -```java -// Fluent builder with SVASQ-4 quantization -var engine = SpectorEngine.builder() - .dimensions(4096) // e.g., qwen3-embedding - .capacity(500_000) - .svasq4() // INT4 FWHT-rotated, 3× rescore default - .build(); - -// Or with explicit oversampling -var config = SpectorConfig.DEFAULT - .withDimensions(768) - .withSvasq4(5); // 5× oversampling for higher recall -``` - ---- - ## ⚙️ Configuration | Parameter | Default | Description | @@ -475,14 +179,12 @@ var config = SpectorConfig.DEFAULT | `b` | 0.75 | BM25 document length normalization | | `RRF k` | 60 | Reciprocal Rank Fusion constant | | `gpuEnabled` | false | Enable CUDA GPU acceleration | -| `quantization` | NONE | Quantization type: NONE, SCALAR_INT8, SCALAR_INT4, SCALAR_INT2, SVASQ, SVASQ_4 | +| `quantization` | NONE | Quantization type: NONE, SCALAR_INT8, SCALAR_INT4, SCALAR_INT2 | | `oversamplingFactor` | auto | Rescore oversampling (INT4→3, INT2→5, INT8→1). Higher = better recall | | `rerankerEnabled` | false | Enable LLM re-ranking via Ollama | | `rerankerModel` | — | Ollama model name (e.g., "llama3.2") | | `rerankerMaxCandidates` | 20 | Max docs sent to LLM for re-ranking | ---- - ## 🏎 Performance SIMD auto-detection adapts to your hardware: @@ -504,46 +206,43 @@ Sub-microsecond vector math at every dimension: | 384 | ~100 ns | 100 ns | ~100 ns | 100 ns | | 768 | ~100 ns | 100 ns | ~100 ns | 100 ns | -> Measured on 24-core Intel Core Ultra 9 285K x86, AVX2 256-bit (8 lanes), Java 25, ZGC. 
Values at 384+ dimensions are at `System.nanoTime()` resolution floor — real throughput confirmed at millions of ops/sec via JMH. +> Measured on 24-core x86, AVX2 256-bit (8 lanes), Java 25, ZGC. Values at 384+ dimensions are at `System.nanoTime()` resolution floor — real throughput confirmed at millions of ops/sec via JMH. ### Search Latency (128-dim, top-10) | Scale | Keyword (BM25) | Vector (HNSW) | Hybrid (RRF) | |-------|---------------|---------------|--------------| -| **10K docs** | **0.18 ms** avg / 0.33 ms p99 | **0.04 ms** avg / 0.07 ms p99 | **0.17 ms** avg / 0.26 ms p99 | -| **50K docs** | **0.44 ms** avg / 0.59 ms p99 | **0.08 ms** avg / 0.11 ms p99 | **0.51 ms** avg / 0.84 ms p99 | -| **100K docs** | **1.53 ms** avg / 1.94 ms p99 | **0.10 ms** avg / 0.22 ms p99 | **1.76 ms** avg / 2.81 ms p99 | +| **10K docs** | **0.15 ms** avg / 0.43 ms p99 | **0.05 ms** avg / 0.16 ms p99 | **0.14 ms** avg / 0.24 ms p99 | +| **50K docs** | **0.35 ms** avg / 0.55 ms p99 | **0.04 ms** avg / 0.05 ms p99 | **0.25 ms** avg / 0.44 ms p99 | +| **100K docs** | **0.60 ms** avg / 1.12 ms p99 | **0.05 ms** avg / 0.06 ms p99 | **0.47 ms** avg / 0.64 ms p99 | ### Search Throughput (queries/sec) -| Scale | Keyword | Vector | Hybrid | -|-------|---------|--------|--------| -| **10K docs** | **5,490** | **23,726** | **5,993** | -| **50K docs** | **2,264** | **13,287** | **1,958** | -| **100K docs** | **653** | **9,925** | **569** | +| Scale | Keyword | Vector | Hybrid | Vector top-100 | +|-------|---------|--------|--------|----------------| +| **10K docs** | **6,806** | **22,152** | **7,318** | 17,573 | +| **50K docs** | **2,854** | **22,808** | **4,038** | 12,271 | +| **100K docs** | **1,679** | **20,246** | **2,143** | 10,174 | ### Ingestion Throughput | Dataset Size | Time | Rate | Memory | |-------------|------|------|--------| -| 10,000 | 2.1s | **4,679 docs/s** | +48 MB | -| 50,000 | 20.5s | **2,430 docs/s** | +86 MB | -| 100,000 | 1m 2s | **1,597 docs/s** | +202 MB | +| 
10,000 | 2.1s | **4,589 docs/s** | +20 MB | +| 50,000 | 16.2s | **3,079 docs/s** | +94 MB | +| 100,000 | 45.5s | **2,194 docs/s** | +188 MB | -### Concurrency Scaling (50K docs, 128-dim, Hybrid Search) +### Concurrency Scaling (50K docs, Hybrid Search) | Threads | Throughput | Avg Latency | Scaling Factor | |---------|-----------|-------------|----------------| -| 1 | 1,231 ops/s | 0.81 ms | 1.0× | -| 4 | 2,894 ops/s | 1.38 ms | **2.3×** | -| 8 | 5,466 ops/s | 1.46 ms | **4.4×** | -| 16 | 7,635 ops/s | 1.99 ms | **6.2×** | +| 1 | 4,108 ops/s | 0.24 ms | 1.0× | +| 4 | 12,344 ops/s | 0.32 ms | **3.0×** | +| 8 | 17,628 ops/s | 0.44 ms | **4.3×** | +| 16 | 18,324 ops/s | 0.79 ms | **4.5×** | > Run the full benchmark suite: `mvn -pl spector-bench exec:java` > HTML report generated at `spector-bench/target/performance-report.html` -> -> [!TIP] -> For the comprehensive, empirical sweeps across multiple partition configurations ($C \in \{32, 64, 128, 256\}$) and detailed HNSW shard promotion benchmarks on real text embeddings (using Qwen3-embedding 4096-dim), see our dedicated [Large-Scale Real-Embedding Benchmarks page](docs/docs/deep-dives/real-embedding-benchmarks.md). 
--- @@ -555,7 +254,7 @@ All comparisons below use **100K documents, 128 dimensions, top-10 retrieval** a | Engine | Language | Avg Latency | P99 Latency | Notes | |--------|----------|------------|------------|-------| -| **Spector** | Java 25 | **0.10 ms** | **0.22 ms** | SIMD via Vector API, pure in-process, 100K docs | +| **Spector Search** | Java 25 | **0.05 ms** | **0.06 ms** | SIMD via Vector API, pure in-process | | hnswlib | C++ | ~0.1–0.5 ms | ~1 ms | Fastest native HNSW; single-threaded | | FAISS (HNSW) | C++/Python | ~0.2–0.8 ms | ~1–2 ms | Versatile; GPU support available | | Apache Lucene 9+ | Java | ~1–5 ms | ~5–10 ms | Segment-based; force-merge helps | @@ -564,14 +263,11 @@ All comparisons below use **100K documents, 128 dimensions, top-10 retrieval** a | Milvus | Go/C++ | ~3–10 ms | ~10–35 ms | Scales to billions; DiskANN support | | Weaviate | Go | ~5–15 ms | ~25–40 ms | Built-in vectorization modules | -> [!NOTE] -> Spector's vector search latency is competitive with native C++ hnswlib for in-process workloads at 100K scale. External system numbers are from published benchmarks and ann-benchmarks.com. Hardware/configuration differences apply. 
- ### Keyword Search (BM25, 100K docs) | Engine | Avg Latency | Notes | |--------|------------|-------| -| **Spector** | **1.53 ms** | float[] scoring, min-heap top-K, virtual-thread parallel terms | +| **Spector Search** | **0.60 ms** | float[] scoring, min-heap top-K, virtual-thread parallel terms | | Elasticsearch | <1–5 ms | Inverted index + skip lists, highly optimized | | Apache Lucene | <1–3 ms | Raw engine, no network overhead | | Weaviate (BM25) | ~10–30 ms | Go-based BM25 for hybrid search | @@ -580,7 +276,7 @@ All comparisons below use **100K documents, 128 dimensions, top-10 retrieval** a | Engine | Approach | Avg Latency | Notes | |--------|----------|------------|-------| -| **Spector** | RRF (parallel virtual threads) | **1.76 ms** | Both legs sub-ms at 10K; parallel via virtual threads | +| **Spector Search** | RRF (parallel virtual threads) | **0.47 ms** | Both legs sub-ms; shared vthread executor | | Elasticsearch | RRF / linear combination | ~10–30 ms | Mature query planner, skip-list BM25 | | Qdrant | Sparse+Dense fusion | ~15–30 ms | Rust-based sparse vectors | | Weaviate | Hybrid BM25+HNSW | ~25–40 ms | Unified API, built-in vectorization | @@ -589,7 +285,7 @@ All comparisons below use **100K documents, 128 dimensions, top-10 retrieval** a | Engine | Rate (100K docs) | Notes | |--------|-----------------|-------| -| **Spector** | **1,597 docs/s** | In-process, HNSW graph build included | +| **Spector Search** | **2,194 docs/s** | In-process, HNSW graph build included | | Elasticsearch | ~2,000–5,000 docs/s | Bulk API, depends on mapping & replicas | | Milvus | ~3,000–8,000 docs/s | Batch insert optimized | | Qdrant | ~2,000–5,000 docs/s | Payload indexing included | @@ -605,25 +301,22 @@ All comparisons below use **100K documents, 128 dimensions, top-10 retrieval** a | **Off-Heap Vectors** | ✅ Panama MemorySegment | ✅ Lucene MMapDir | ✅ MMapDir | ❌ Heap-only | ✅ Mmap | ✅ Mmap | | **Virtual Threads** | ✅ Native Loom | ❌ Platform threads | N/A | 
N/A | N/A | N/A | | **Zero Dependencies** | ✅ JDK only | ❌ Heavy stack | ✅ Standalone | ✅ Header-only | ❌ Tokio runtime | ❌ etcd, MinIO, Pulsar | -| **Quantization** | ✅ Scalar INT8/INT4/INT2 + SVASQ/SVASQ-4 + PQ | ✅ BBQ/Scalar | ✅ Scalar | ❌ None | ✅ Scalar/Binary | ✅ PQ/SQ | +| **Quantization** | ✅ Scalar INT8/INT4/INT2 + PQ | ✅ BBQ/Scalar | ✅ Scalar | ❌ None | ✅ Scalar/Binary | ✅ PQ/SQ | | **Disk-based Index** | ✅ HNSW serialization | ✅ Segment merge | ✅ MMap | ❌ In-memory | ✅ On-disk HNSW | ✅ DiskANN | | **IVF-PQ** | ✅ 32× compression | ❌ None | ❌ None | ❌ None | ❌ None | ✅ IVF_PQ | | **GPU Acceleration** | ✅ CUDA (Panama FFM) | ❌ None | ❌ None | ❌ None | ❌ None | ✅ GPU | | **LLM Re-ranking** | ✅ Ollama | ❌ None | ❌ None | ❌ None | ❌ None | ❌ None | | **Distributed Search** | ✅ gRPC fan-out | ✅ Built-in | ❌ None | ❌ None | ✅ Raft | ✅ gRPC | -| **MCP Server** | ✅ Built-in | ❌ None | ❌ None | ❌ None | ❌ None | ❌ None | ### Where Spector Excels -- **🚀 Sub-millisecond vector search**: 0.04ms at 10K, 0.10ms at 100K (128-dim), competitive with native C++ implementations -- **🔥 Fast BM25**: Sub-millisecond keyword search at 10K/50K scale — comparable to raw inverted index engines +- **🚀 Sub-millisecond everything**: Vector (0.05ms), keyword (0.60ms), AND hybrid (0.47ms) at 100K docs +- **🔥 Faster BM25 than Elasticsearch**: 0.60ms vs 1–5ms — float[] scoring + min-heap top-K + virtual-thread parallelism - **🧵 Modern JVM**: Only search engine built on Java 25 virtual threads + Vector API - **📦 Zero-dependency embedded**: Drop-in JAR, no external infrastructure needed -- **⚡ 7.6K+ ops/sec concurrent**: 7,635 hybrid searches/sec at 16 threads (128-dim) -- **🎯 23K+ vector QPS**: 23,726 vector queries/sec at 10K docs -- **🗜️ IVF-PQ + SVASQ + SVASQ-4 + TurboQuant**: 6–32× memory reduction for large-scale datasets with high-accuracy calibration -- **🔬 99.5%+ Recall**: IVF-HNSW-SVASQ (`SpectorIndex`) achieves near-perfect recall on real semantic embeddings scanning just 3% of 
the clusters -- **🤖 Agent-Native**: Built-in MCP server — the only search engine with native AI agent integration +- **⚡ 18K+ ops/sec concurrent**: 18,324 hybrid searches/sec at 16 threads +- **🎯 20K+ vector QPS**: 20,246 vector queries/sec at 100K docs — outperforms native C++ hnswlib +- **🗜️ IVF-PQ compression**: 32× memory reduction for billion-scale datasets - **🤖 LLM re-ranking**: Listwise Ollama-powered relevance scoring - **🖥️ GPU acceleration**: CUDA kernel launcher + SIMD batch similarity via Panama FFM - **🌐 Distributed search**: gRPC-based fan-out/merge with consistent hash sharding @@ -634,21 +327,18 @@ All comparisons below use **100K documents, 128 dimensions, top-10 retrieval** a | Module | Tests | Coverage | |--------|-------|----------| -| spector-core | 276 | SIMD kernels, similarity functions, scalar/SVASQ quantization, SIMD Euclidean | +| spector-core | 117 | SIMD kernels, similarity functions, scalar quantization | | spector-commons | 28 | Text chunkers, token chunker, streaming chunker, content extractor | | spector-storage | 38 | Off-heap stores, mmap persistence, quantized vector store | | spector-index | 79 | HNSW recall, BM25 scoring, IVF-PQ, PQ encode/decode | | spector-query | 29 | RRF fusion, hybrid orchestration, LLM re-ranking | -| spector-memory | 167 | Cognitive scoring, tier stores, mmap persistence, synapse, Bloom filters, reverse index, performance benchmarks + 10 Ollama E2E tests | | spector-embed-api | 9 | Embedding SPI contracts | | spector-embed-ollama | 7 | Ollama provider, fallback behavior | | spector-gpu | 14 | GPU detection, SIMD batch similarity, CUDA launcher | | spector-engine | 12 | End-to-end ingestion, IVF-PQ auto-training | -| spector-node | 11 | REST endpoints, shard routing, hash consistency | -| spector-mcp | 15 | MCP tool registry, tool handlers, schema builder | -| **Total** | **685+** | **All passing ✅** | - ---- +| spector-server | 6 | REST API endpoints | +| spector-cluster | 5 | Shard routing, hash 
consistency | +| **Total** | **316+** | **All passing ✅** | ## 📈 Roadmap @@ -656,66 +346,25 @@ All comparisons below use **100K documents, 128 dimensions, top-10 retrieval** a - [x] BM25 keyword search - [x] Hybrid search with RRF fusion - [x] Scalar quantization (INT8, INT4, INT2) with non-uniform calibration and configurable rescore -- [x] TurboQuant quantization (rotation + optimal scalar, 8× compression) - [x] Disk-based HNSW persistence - [x] Embedding provider SPI (Ollama) - [x] IVF-PQ vector index (32× compression) - [x] LLM-powered re-ranking -- [x] GPU infrastructure (CUDA context, memory management via Panama FFM) +- [x] GPU acceleration (CUDA via Panama FFM) - [x] Distributed search (gRPC coordinator/shards) -- [x] REST API with CORS, auth, metrics, SSE streaming -- [x] Standalone ingestion pipeline (`spector-ingestion`) -- [x] Standalone RAG pipeline (`spector-rag`) +- [x] REST API with CORS, auth, metrics - [x] Document deletion - [x] Auto-embed + bulk ingest endpoints - [x] gRPC TLS support -- [x] SVASQ-4 quantization (FWHT-rotated INT4, nibble-packed — 6–8× compression vs float32) -- [x] Structured concurrency (JEP 505) — `ConcurrentTasks` with dual-mode + feature flag -- [x] **Native MCP Server** (`spector-mcp` — 13 tools: 6 search + 7 cognitive memory, stdio transport) -- [x] **SpectorRuntime** — Unified application context (engine + memory), config-driven via `spector.yml` -- [x] **Distribution JAR** (`spector-dist` — single fat JAR for all modules) -- [ ] Streamable HTTP transport (MCP over HTTP for cloud/remote deployments) -- [ ] Padding-aware storage (skip zero-padded dims — 25% savings for non-pow2 dimensions) -- [ ] Norm header compression (float32 → float16 — 2 bytes/vector savings) -- [ ] LoRA adapter routing (multi-tenant query projection via SIMD matrix multiply) -- [ ] ColBERT late interaction reranking (native MaxSim via Panama FMA loops) -- [ ] SVASQ-PQ hybrid (FWHT rotation + product quantization — 16–32× compression) -- [ ] 
Flat-mode SVASQ (SVASQ compression of flat-shard residuals — 3× on flat shards) -- [ ] GPU kernel dispatch (CUDA compute for batch similarity — requires CUDA Toolkit) -- [ ] NPU acceleration (Intel/AMD NPU for INT8 batch operations via OpenVINO or DirectML) - [ ] WASM runtime for edge deployment -> See the [detailed Roadmap](docs/docs/roadmap.md) for in-depth descriptions, projected savings, and implementation plans. - -## 📖 Documentation - -| Resource | Link | -|:---------|:-----| -| **Full Documentation** | [spectrayan.github.io/spector](https://spectrayan.github.io/spector/) | -| **GitHub Wiki** | [Wiki](https://github.com/spectrayan/spector/wiki) | -| **Cognitive Memory** | [Memory Docs](https://spectrayan.github.io/spector/memory/) | -| **Neural Dashboard** | [Cortex Dashboard](https://spectrayan.github.io/spector/cortex/) | -| **API Reference** | [REST API](https://spectrayan.github.io/spector/api-reference/rest-endpoints/) | -| **MCP Server** | [MCP Docs](https://spectrayan.github.io/spector/sdk-usage/mcp-server/) | - ## 🤝 Contributing We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. ## 📄 License -This repository is licensed under a **split licensing model**: - -1. **`spector-memory` Module**: Licensed under the **Business Source License 1.1 (BSL 1.1)**. - - Permits free use for non-production purposes. - - Permits production use for all purposes **except** offering it as a managed service or embedding/integrating it in a competing AI cognitive memory product or service. - - Automatically transitions to the **Apache License 2.0** on **May 27, 2030** (4 years from release). - - See [spector-memory/LICENSE](spector-memory/LICENSE) for details. - -2. **Core Infrastructure & All Other Modules**: Licensed under the **Apache License 2.0**. - - See [LICENSE](LICENSE) for details. - -For branding and trademark guidelines, please consult the [NOTICE](NOTICE) file. 
+This project is licensed under the Apache License 2.0 — see [LICENSE](LICENSE) for details. ## 🔒 Security diff --git a/deploy/k8s-statefulset.yaml b/deploy/k8s-statefulset.yaml deleted file mode 100644 index 7d93c7a..0000000 --- a/deploy/k8s-statefulset.yaml +++ /dev/null @@ -1,116 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: spector-nvme-local -provisioner: kubernetes.io/no-provisioner -volumeBindingMode: WaitForFirstConsumer -allowVolumeExpansion: false -description: "High-performance StorageClass for Spector cognitive off-heap data using local NVMe drives mapped via Local Volumes" ---- -apiVersion: v1 -kind: Service -metadata: - name: spector - labels: - app: spector -spec: - ports: - - port: 8080 - name: http-api - - port: 9090 - name: internal-sync - clusterIP: None - selector: - app: spector ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: spector-node - labels: - app: spector -spec: - serviceName: "spector" - replicas: 3 - selector: - matchLabels: - app: spector - template: - metadata: - labels: - app: spector - annotations: - # Inform container runtimes of low/high memory boundaries in cgroups v2 - # resources.requests.memory sets memory.min/low, and limits.memory sets memory.max. - # This keeps the container's physical RAM consumption bounded while giving full headroom for page cache. - cgroups.kubernetes.io/memory-high: "15Gi" - spec: - # Pod Anti-Affinity: Prevent scheduling multiple Spector pods on the same physical bare-metal host. - # This ensures parallel off-heap page cache scans do not saturate host memory buses or NVMe IOPS. 
- affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchExpressions: - - key: app - operator: In - values: - - spector - topologyKey: "kubernetes.io/hostname" - containers: - - name: spector - image: spectrayan/spector:latest - imagePullPolicy: IfNotPresent - ports: - - containerPort: 8080 - name: http-api - - containerPort: 9090 - name: internal-sync - env: - - name: JAVA_OPTS - # PRODUCTION JVM tuning for Panama FFM off-heap cognitive engine: - # - Keep heap small (-Xmx2G) to leave the remaining 14GB of the 16GB container limit - # fully free for OS page-cache (madvise prefetching and zero-copy mmap mapped regions). - # - Enable vector API, preview features, and compiler optimizations. - value: "-Xms2G -Xmx2G -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:+UnlockDiagnosticVMOptions -XX:+UnlockExperimentalVMOptions --add-modules=jdk.incubator.vector --enable-preview -Djava.lang.foreign.restricted=permit" - - name: SPECTOR_MEMORY_DIR - value: "/var/lib/spector/data" - resources: - requests: - cpu: "4" - memory: "16Gi" - limits: - cpu: "8" - # Under cgroups v2: - # - limits.memory translates to memory.max = 16Gi - # - requests.memory translates to memory.low/min = 16Gi - # Since the JVM heap is restricted to 2Gi, the Linux kernel can comfortably allocate - # up to 14Gi for system page cache. This guarantees zero JVM heap OOMs, while - # preventing the Kubernetes OOM killer from terminating the pod when the page cache is hot. 
- memory: "16Gi" - volumeMounts: - - name: spector-offheap-store - mountPath: /var/lib/spector/data - livenessProbe: - httpGet: - path: /api/v2/memory/status - port: 8080 - initialDelaySeconds: 30 - periodSeconds: 10 - timeoutSeconds: 3 - readinessProbe: - httpGet: - path: /api/v2/memory/status - port: 8080 - initialDelaySeconds: 15 - periodSeconds: 5 - timeoutSeconds: 2 - volumeClaimTemplates: - - metadata: - name: spector-offheap-store - spec: - accessModes: [ "ReadWriteOnce" ] - storageClassName: "spector-nvme-local" - resources: - requests: - storage: 100Gi diff --git a/docs/docs/about.md b/docs/docs/about.md deleted file mode 100644 index eae1844..0000000 --- a/docs/docs/about.md +++ /dev/null @@ -1,220 +0,0 @@ -# 🌟 What is Spector? - -> **The Zero-Overhead, Agent-Ready AI Memory Backbone.** -> -> Legacy search engines bolted vectors onto text databases. Spector is designed from the ground up for modern AI — combining vector similarity, keyword search, and hybrid ranking in a single embeddable library with zero external dependencies. Connect any AI agent via the built-in MCP server, or embed directly in your application. - -Spector is an open-source, high-performance search engine built entirely on modern Java 25. It's designed for developers who want sub-millisecond search, native AI agent integration, and zero infrastructure complexity. Drop in a JAR, write a few lines of code, and you have production-grade hybrid search with built-in agent support. 
- ---- - -## 🎯 What It Does - -Spector indexes documents with their vector embeddings and text content, then retrieves them using multiple strategies — directly from AI agents or your application code: - -```mermaid -graph LR - subgraph Clients - MCP["🤖 AI Agent (MCP)"] - REST["🌐 REST API"] - SDK["📦 Java SDK"] - end - - subgraph Search Modes - A[Vector Search] --> D[Results] - B[Keyword Search] --> D - C[Hybrid Search] --> D - end - - subgraph Engines - A --> E[HNSW ANN] - B --> F[BM25 Scoring] - C --> E - C --> F - C --> G[RRF Fusion] - end - - MCP --> A & B & C - REST --> A & B & C - SDK --> A & B & C -``` - -| Mode | How It Works | Best For | -|------|-------------|----------| -| **🧠 Vector Search** | HNSW approximate nearest neighbor graphs | Semantic similarity | -| **📝 Keyword Search** | BM25 scoring with term frequency saturation | Exact term matching | -| **🧬 Hybrid Search** | Combines both via Reciprocal Rank Fusion | Best-of-both-worlds | -| **🤖 RAG Pipeline** | Ingest → chunk → embed → retrieve → context assembly | LLM applications | -| **🏛️ SpectorIndex** | IVF-HNSW-SVASQ adaptive hybrid index | Scale + recall | - ---- - -## 💎 Key Differentiators - -### 🤖 Agent-Native (MCP Protocol) - -Includes a built-in [Model Context Protocol](https://modelcontextprotocol.io/) server with 6 tools. AI agents connect directly via JSON-RPC — no Python frameworks, no network round-trips. - -| Feature | Python Vector DB MCP | **Spector MCP** | -|:---|:---|:---| -| Search latency | 2–10ms | **88µs p50** (23–113× faster) † | -| Network overhead | HTTP/gRPC round-trip | **Zero** (in-process) | -| Concurrent queries | Limited by Python GIL | **61,000 QPS** † | -| Dependencies | Python framework stack | **Single JAR** | - -† *Measured. See [Benchmarks](../#-benchmarks).* - -> [!TIP] -> See the [MCP Server Guide](../sdk-usage/mcp-server.md) to connect Claude Desktop, Cursor, or any MCP client in minutes. 
- -### 📦 Pure Java, Zero Dependencies - -Unlike most vector databases that rely on C++, Rust, or Python bindings, Spector is 100% Java. It uses the JDK's own Vector API for SIMD acceleration — no JNI, no native libraries, no external infrastructure. - -> [!TIP] -> Add the JAR to your classpath and you're done. No Docker, no clusters, no ops. - -### 🚀 Modern JVM Technologies - -| Technology | Purpose | -|-----------|---------| -| Java Vector API | SIMD-accelerated math (AVX2/AVX-512/NEON) | -| Panama FFM | Zero-copy memory-mapped storage, GPU interop | -| Virtual Threads | Millions of concurrent operations without thread pools | -| Structured Concurrency | Safe parallel task management | - -### ⚡ Sub-Millisecond at Scale - -**HNSW** at 100K documents (128 dimensions, top-10, M=16, efSearch=64): - -| Search Type | Average Latency | Throughput | -|-------------|----------------|------------| -| Vector | **0.13 ms** | 7,556 QPS | -| Keyword | **0.98 ms** | 1,019 QPS | -| Hybrid | **1.01 ms** | 994 QPS | - -**SpectorIndex (IVF-HNSW-SVASQ)** at 10K documents (4096-dim real Qwen3 embeddings): - -| Config | Average Latency | Throughput | Recall@10 | -|--------|----------------|------------|----------| -| nCentroids=128, nProbe=4 | **0.46 ms** | **2,173 QPS** | **1.0000** | -| nCentroids=64, nProbe=4 | **0.62 ms** | 1,601 QPS | **1.0000** | -| nCentroids=128, nProbe=16 | **1.26 ms** | 792 QPS | **1.0000** | - -> [!NOTE] -> SpectorIndex achieves **perfect recall while searching only 3.1% of the data** (nProbe=4 out of 128 centroids). Ingestion is 28–160× faster than standalone HNSW. Numbers measured on 24-core x86, AVX2, Java 25, ZGC with Qwen3-embedding real vectors. For comprehensive, multi-centroid sweeps and adaptive HNSW shard promotion benchmarks, see the dedicated [Large-Scale Real-Embedding Benchmarks page](deep-dives/real-embedding-benchmarks.md). 
- -### 🏠 Dual Deployment Modes - -| Mode | Description | Best For | -|------|-------------|----------| -| **Embedded** | In-process library, zero network overhead | Microservices, desktop apps, edge | -| **Server** | REST API with CORS, auth, and metrics | Teams, multi-language clients | - -### 🗜️ Advanced Quantization (SVASQ + IVF-PQ) - -Spector offers two quantization paths: - -- **SVASQ (Vectorized Affine Scalar Quantization):** Uses the Fast Walsh-Hadamard Transform to spread variance before INT8 quantization, achieving **4× compression with near-lossless recall** (~97–99.5%). Used inside SpectorIndex shards. -- **IVF-PQ (Product Quantization):** Provides **32× memory compression** for billion-scale datasets. - -> [!IMPORTANT] -> SVASQ gives INT8 the precision of INT12–16 by rotating vectors before quantization. See the [SVASQ Deep Dive](deep-dives/svasq-deep-dive.md) for the full theory. - ---- - -## 📊 How Spector Compares - -### Latency Comparison (100K docs, 128-dim, top-10) - -| Engine | Language | Vector Avg | Vector P99 | -|--------|----------|-----------|-----------| -| **⚡ Spector** | **Java 25** | **0.13 ms** | **0.26 ms** | -| hnswlib | C++ | 0.1–0.5 ms | ~1 ms | -| FAISS | C++ | 0.2–0.8 ms | 1–2 ms | -| Lucene 9+ | Java | 1–5 ms | 5–10 ms | -| Elasticsearch 8+ | Java | 2–10 ms | 10–25 ms | -| Qdrant | Rust | 2–5 ms | 10–25 ms | -| Milvus | Go/C++ | 3–10 ms | 10–35 ms | - -> [!NOTE] -> Spector's vector search latency is competitive with native C++ implementations (hnswlib, FAISS) for in-process workloads. Numbers for external systems are from published benchmarks and ann-benchmarks.com. Hardware and configuration differences apply — these are directional comparisons, not controlled A/B tests. 
- -### Feature Comparison - -| Feature | Spector | Elasticsearch | Qdrant | Milvus | hnswlib | -|---------|---------|--------------|--------|--------|---------| -| **Deployment** | Embedded + Server | Cluster only | Server only | Cluster only | Embedded only | -| **MCP Server** | ✅ Built-in (6 tools) | ❌ | ❌ | ❌ | ❌ | -| **Hybrid Search** | ✅ RRF built-in | ✅ RRF | ✅ Sparse+Dense | ✅ RRF | ❌ | -| **Zero Dependencies** | ✅ JDK only | ❌ Heavy stack | ❌ Tokio runtime | ❌ etcd, MinIO, Pulsar | ✅ Header-only | -| **Virtual Threads** | ✅ Project Loom | ❌ Platform threads | N/A (Rust async) | N/A (Go goroutines) | N/A | -| **GPU Acceleration** | ✅ CUDA (Panama FFM) | ❌ | ✅ Vulkan (indexing) | ✅ CUDA (search + indexing) | ❌ | -| **Quantization** | ✅ Scalar INT8 + IVF-PQ | ✅ BBQ + Scalar + DiskBBQ (IVF) | ✅ Scalar + Binary | ✅ IVF-PQ + IVF-SQ | ❌ | -| **Re-ranking** | ✅ LLM via Ollama | ✅ Elastic Rerank + Inference API | ✅ FastEmbed / ColBERT | ✅ vLLM Ranker + Cross-encoder | ❌ | -| **Distributed** | ✅ gRPC fan-out | ✅ Built-in sharding | ✅ Raft consensus | ✅ gRPC + etcd | ❌ | -| **SIMD Acceleration** | ✅ Java Vector API | ✅ simdvec (Panama) | ✅ Native SIMD | ✅ AVX/NEON | ✅ AVX/SSE | - -> [!NOTE] -> This comparison reflects publicly available information as of May 2025. Feature availability may vary by version and deployment mode. All products are actively evolving. - ---- - -## 🛠️ Use Cases - -### 🤖 Agentic AI Memory - -Connect AI agents (Claude, Cursor, custom) directly to Spector via the built-in MCP server. The agent autonomously ingests documents, searches for relevant context, and retrieves information — all with zero Python glue-code. 
*"Point your LLM at Spector's MCP port, and it instantly has mathematically-perfect long-term memory."* - -### 🤖 Retrieval-Augmented Generation (RAG) - -Ingest documents (PDF, HTML, Markdown), chunk them with token awareness, generate embeddings, and retrieve relevant context for LLM prompting — all through a single `/api/v1/rag` endpoint or the `rag_query` MCP tool. - -### 🔍 Semantic Search Applications - -Power product search, documentation search, code search, or any application where meaning matters more than exact keywords. - -### 💡 Recommendation Systems - -Use vector similarity to find items similar to what users have engaged with. Sub-millisecond latency makes real-time recommendations practical. - -### 🏢 Hybrid Enterprise Search - -Combine keyword precision (finding exact product SKUs, error codes) with semantic understanding (finding conceptually related documents). - -### 📱 Embedded Analytics - -Drop Spector into existing Java applications without infrastructure changes. Perfect for desktop applications, microservices, or edge deployments. 
- ---- - -## ✅ When to Choose Spector - -> [!NOTE] -> **Choose Spector when:** -> - You want AI agents to autonomously search your data (MCP integration) -> - You want sub-millisecond hybrid search without infrastructure complexity -> - Your stack is Java/JVM and you want native integration -> - You need an embedded search library with server-mode option -> - You want GPU acceleration without leaving the JVM -> - Zero external dependencies matters to your deployment - -> [!WARNING] -> **Consider alternatives when:** -> - You need a managed cloud service with zero ops -> - Your team primarily works in Python/Rust/Go -> - You need built-in ML model serving - ---- - -## 🚀 Next Steps - -- [Getting Started](getting-started/quickstart.md) — Build and run your first search in 5 minutes - -- [MCP Server Guide](sdk-usage/mcp-server.md) — Connect an AI agent in 3 steps - -- [Architecture Overview](architecture/overview.md) — Understand how it works under the hood - -- [REST API Reference](api-reference/rest-endpoints.md) — Full API documentation - -- [Core Concepts](architecture/core-concepts.md) — Deep dive into the algorithms \ No newline at end of file diff --git a/docs/docs/api-reference/error-codes.md b/docs/docs/api-reference/error-codes.md deleted file mode 100644 index 3a4bc2a..0000000 --- a/docs/docs/api-reference/error-codes.md +++ /dev/null @@ -1,238 +0,0 @@ -# Spector Error Code Reference - -All Spector errors follow the `SPE-XXX-YYY` schema where `XXX` identifies the -error category and `YYY` identifies the specific error within that category. - -**Stability guarantee:** Error codes are immutable once assigned. They will never -be reassigned or removed, even if deprecated. 
- ---- - -## How to Read Error Codes - -``` -SPE-100-001 -│ │ │ -│ │ └── Specific error (001–999) -│ └─────── Category (100–900) -└─────────── Spector prefix -``` - -| Category Range | Subsystem | -|---|---| -| `SPE-100-xxx` | Input Validation | -| `SPE-110-xxx` | Configuration | -| `SPE-200-xxx` | Index | -| `SPE-210-xxx` | Storage | -| `SPE-300-xxx` | Embedding | -| `SPE-310-xxx` | Memory | -| `SPE-400-xxx` | GPU | -| `SPE-500-xxx` | Server (REST/gRPC/MCP) | -| `SPE-510-xxx` | Client SDK | -| `SPE-600-xxx` | Ingestion | -| `SPE-700-xxx` | Cluster | -| `SPE-900-xxx` | Internal | - ---- - -## Validation Errors (SPE-100) - -These errors indicate invalid input provided by the caller. - -| Code | Message | Common Cause | -|---|---|---| -| `SPE-100-001` | Vector dimensions must be positive | Dimensions set to 0 or negative in config | -| `SPE-100-002` | Expected {n} dimensions but received {m} | Query vector has different dimensionality than the index | -| `SPE-100-003` | Vector must not be null | Null vector passed to ingest or search | -| `SPE-100-004` | Vector length does not match expected dimensions | Float array length ≠ configured dimensions | -| `SPE-100-005` | top_k must be between 1 and max | top_k set to 0, negative, or exceeding index capacity | -| `SPE-100-006` | Document ID must not be null or empty | Empty string or null passed as document ID | -| `SPE-100-007` | Required argument must not be null | A required method parameter was null | -| `SPE-100-008` | Argument out of range | A numeric parameter is outside valid bounds | -| `SPE-100-009` | Unsupported quantization type | Quantization type not recognized | -| `SPE-100-010` | Capacity exceeded | Collection or buffer exceeds maximum size | -| `SPE-100-011` | SimilarityFunction must not be null | Null similarity function in config | -| `SPE-100-012` | Collection must not be empty | Empty list/array passed where non-empty required | -| `SPE-100-013` | Invalid value for parameter | General argument 
validation failure | -| `SPE-100-014` | Argument must be non-negative | Negative value for a non-negative parameter | -| `SPE-100-015` | Length mismatch | Two arrays that must be same length differ | -| `SPE-100-016` | Bit width invalid | Quantization bit width not 2, 4, or 8 | - ---- - -## Configuration Errors (SPE-110) - -These errors indicate problems with Spector configuration files or values. - -| Code | Message | Resolution | -|---|---|---| -| `SPE-110-001` | Configuration file not found | Verify the config file path. Check `spector.yml` or `spector.properties` exists. | -| `SPE-110-002` | Failed to parse configuration | Check YAML/properties syntax. Validate with a YAML linter. | -| `SPE-110-003` | Invalid configuration value | Verify the reported field value is within documented bounds. | -| `SPE-110-004` | Configuration profile not found | Check available profiles in your config file. | -| `SPE-110-005` | Required configuration key missing | Add the missing key to your config file. | - ---- - -## Index Errors (SPE-200) - -These errors relate to vector index operations. - -| Code | Message | Resolution | -|---|---|---| -| `SPE-200-001` | HNSW index construction failed | Check available memory. Reduce `capacity` or `dimensions`. | -| `SPE-200-002` | HNSW graph integrity check failed | Index file may be corrupted. Re-build from source data. | -| `SPE-200-003` | Index has reached maximum capacity | Increase `capacity` in config, or delete old documents. | -| `SPE-200-004` | Index is read-only | Index was opened in read-only mode. | -| `SPE-200-005` | IVF centroid training failed | Provide more training vectors or reduce `nlist`. | -| `SPE-200-006` | BM25 text tokenization failed | Check text encoding. Ensure input is valid UTF-8. | -| `SPE-200-007` | Index serialization to disk failed | Check disk space and write permissions. | -| `SPE-200-008` | Index deserialization from disk failed | Index file may be corrupted or incompatible version. 
| -| `SPE-200-009` | Index not trained | Call `train()` before searching an IVF-PQ index. | -| `SPE-200-010` | Centroid count must be positive | Set `nlist` to a positive integer. | -| `SPE-200-011` | HNSW graph connectivity below threshold | Index quality degraded. Rebuild with higher `efConstruction`. | - ---- - -## Storage Errors (SPE-210) - -These errors relate to vector storage and disk I/O. - -| Code | Message | Resolution | -|---|---|---| -| `SPE-210-001` | Memory segment is closed | Don't use the store after calling `close()`. | -| `SPE-210-002` | Memory-mapped file creation failed | Check disk space, file permissions, and OS mmap limits. | -| `SPE-210-003` | Vector store has reached capacity | Increase `capacity` or delete old vectors. | -| `SPE-210-004` | Disk I/O operation failed | Check disk health, space, and permissions. | -| `SPE-210-005` | Write-ahead log write failed | Check disk space. WAL directory may be full. | -| `SPE-210-006` | Write-ahead log replay failed | WAL file may be corrupted. Check logs for details. | -| `SPE-210-007` | Vector store not initialized | Ensure the store is opened before operations. | -| `SPE-210-008` | Invalid index file format | File was created by an incompatible version. | - ---- - -## Embedding Errors (SPE-300) - -These errors relate to embedding provider connectivity. - -| Code | Message | Resolution | -|---|---|---| -| `SPE-300-001` | Embedding provider is unavailable | Check that Ollama (or your provider) is running. Verify the URL. | -| `SPE-300-002` | Embedding request failed | Check provider logs for details. | -| `SPE-300-003` | Embedding request timed out | Increase timeout or check provider load. | -| `SPE-300-004` | Embedding model not found | Pull the model: `ollama pull `. | -| `SPE-300-005` | Embedding dimension mismatch | Model returns different dimensions than index expects. Change model or recreate index. 
| - ---- - -## Memory Errors (SPE-310) - -These errors relate to the cognitive memory subsystem. - -| Code | Message | Resolution | -|---|---|---| -| `SPE-310-001` | Memory tier has reached capacity | Configure higher capacity or enable consolidation. | -| `SPE-310-002` | Cognitive recall pipeline failed | Check logs for underlying cause. | -| `SPE-310-003` | Memory consolidation failed | Check disk space and WAL integrity. | -| `SPE-310-004` | Memory ID not found | The specified memory ID does not exist in any tier. | -| `SPE-310-005` | Memory WAL file corrupted | WAL file is unreadable. Recovery may require reinitialization. | - ---- - -## GPU Errors (SPE-400) - -These errors relate to GPU acceleration via CUDA/Panama FFM. - -| Code | Message | Resolution | -|---|---|---| -| `SPE-400-001` | CUDA driver not found | Install NVIDIA CUDA drivers. GPU features will fall back to CPU. | -| `SPE-400-002` | GPU memory allocation failed | Reduce batch size or free GPU memory from other processes. | -| `SPE-400-003` | GPU kernel launch failed | Check CUDA compatibility. Update GPU drivers. | -| `SPE-400-004` | GPU device error | Hardware issue or driver crash. Restart and check `nvidia-smi`. | -| `SPE-400-005` | GPU memory budget exceeded | Reduce `gpuMemoryBudget` or free GPU memory. | - ---- - -## Server Errors (SPE-500) - -These errors are returned by the Spector REST API, gRPC, or MCP server. - -| Code | HTTP Status | Message | Resolution | -|---|---|---|---| -| `SPE-500-001` | 400 | Bad request | Fix the request body or parameters. | -| `SPE-500-002` | 404 | Resource not found | Verify the document/collection ID exists. | -| `SPE-500-003` | 409 | Resource conflict | Document with this ID already exists. | -| `SPE-500-004` | 401 | Unauthorized | Provide a valid API key. | -| `SPE-500-005` | 503 | Service unavailable | Backend service is down. Retry after delay. | -| `SPE-500-006` | 500 | MCP tool execution failed | Check MCP tool logs for details. 
| -| `SPE-500-007` | 500 | gRPC transport error | Check network connectivity between nodes. | - ---- - -## Client SDK Errors (SPE-510) - -These errors are raised by the Spector client SDK. - -| Code | Message | Resolution | -|---|---|---| -| `SPE-510-001` | Failed to connect to Spector server | Verify server URL and that the server is running. | -| `SPE-510-002` | Client request timed out | Increase timeout or check server load. | -| `SPE-510-003` | Invalid server response | Server may be returning unexpected format. Check version compatibility. | - ---- - -## Ingestion Errors (SPE-600) - -These errors relate to the document ingestion pipeline. - -| Code | Message | Resolution | -|---|---|---| -| `SPE-600-001` | Unsupported document format | Use a supported format (PDF, TXT, MD, HTML, DOCX). | -| `SPE-600-002` | Document chunking failed | Check document encoding and content. | -| `SPE-600-003` | Ingestion pipeline failed | Check logs for the underlying cause. | -| `SPE-600-004` | Failed to read document | Check file path, read permissions, or ensure the file is not corrupted. | - ---- - -## Cluster Errors (SPE-700) - -These errors relate to distributed mode operations. - -| Code | Message | Resolution | -|---|---|---| -| `SPE-700-001` | Shard is unavailable | Check that all cluster nodes are running. | -| `SPE-700-002` | Cluster membership operation failed | Check network connectivity between nodes. | -| `SPE-700-003` | Request routing failed | Shard map may be stale. Wait for rebalance. | - ---- - -## Internal Errors (SPE-900) - -These errors indicate a bug in Spector itself. **If you encounter a 900-series -error, please report it** with the full error code and any available log context. - -| Code | Message | What It Means | -|---|---|---| -| `SPE-900-001` | Internal error | An unexpected condition occurred. This is a bug. | -| `SPE-900-002` | Internal invariant violated | A data structure is in an invalid state. This is a bug. 
| -| `SPE-900-003` | Reached unreachable code path | A switch/if exhaustiveness gap. This is a bug. | -| `SPE-900-004` | Concurrent execution failed | A virtual thread subtask failed unexpectedly. | - ---- - -## JSON Error Response Format - -All REST API errors return a structured JSON response: - -```json -{ - "code": "SPE-100-002", - "category": "Validation", - "message": "[SPE-100-002] Expected 384 dimensions but received 768", - "status": 400, - "path": "/api/v1/ingest", - "timestamp": "2026-05-30T12:00:00Z" -} -``` - -Legacy errors (without error codes) omit the `code` and `category` fields. diff --git a/docs/docs/api-reference/overview.md b/docs/docs/api-reference/overview.md new file mode 100644 index 0000000..1e8fc5a --- /dev/null +++ b/docs/docs/api-reference/overview.md @@ -0,0 +1,37 @@ +# API Reference + +Spector Search exposes a REST API via Javalin on port 7070 (configurable). + +## Base URL + +``` +http://localhost:7070 +``` + +## Authentication + +When an API key is configured, include it as a header: + +``` +X-API-Key: your-secret-key +``` + +## Endpoints Summary + +| Method | Path | Description | +|--------|------|-------------| +| GET | `/health` | Health check | +| GET | `/api/v1/status` | Engine status | +| POST | `/api/v1/search` | Hybrid search (auto-detects mode) | +| POST | `/api/v1/vector-search` | Vector-only search | +| POST | `/api/v1/bm25` | Keyword-only BM25 search | +| POST | `/api/v1/hybrid` | Explicit hybrid search | +| POST | `/api/v1/rag` | RAG retrieval with context assembly | +| POST | `/api/v1/ingest` | Ingest a single document | +| POST | `/api/v1/ingest/auto` | Ingest with auto-embedding | +| POST | `/api/v1/ingest/bulk` | Bulk ingest documents | +| POST | `/api/v1/index` | Create/manage indexes | +| DELETE | `/api/v1/documents/{id}` | Delete a document | +| GET | `/api/v1/metrics` | Request metrics | + +See [REST Endpoints](rest-endpoints.md) for detailed request/response schemas. 
diff --git a/docs/docs/api-reference/rest-endpoints.md b/docs/docs/api-reference/rest-endpoints.md index bd96a5b..57a2c87 100644 --- a/docs/docs/api-reference/rest-endpoints.md +++ b/docs/docs/api-reference/rest-endpoints.md @@ -1,205 +1,66 @@ -# 🌐 REST API Reference +# REST Endpoints -> **Complete reference for all Spector REST endpoints.** The API runs on an embedded Armeria server with virtual threads, accepting and returning JSON. Every request gets its own virtual thread — no connection limits to worry about. +## Ingest ---- - -## 🔧 Base Configuration - -| Setting | Default | Description | -|---------|---------|-------------| -| Base URL | `http://localhost:7070` | Configurable port | -| Content-Type | `application/json` | All requests and responses | -| Auth Header | `X-API-Key: ` | Optional, configured at startup | -| CORS | Enabled | All origins by default | - -> [!NOTE] -> When an API key is configured, requests without a valid key receive `401 Unauthorized`. - ---- - -## 💚 Health & Status - -### `GET /health` +### POST /api/v1/ingest -Quick health check for load balancers and monitoring. +Ingest a single document with a pre-computed vector. -```bash -curl http://localhost:7070/health -``` +**Request:** -**Response `200`:** -```json -{"status": "UP"} -``` - ---- - -### `GET /api/v1/status` - -Engine status including SIMD capabilities, GPU availability, and configuration. - -```bash -curl http://localhost:7070/api/v1/status -``` - -**Response `200`:** ```json { - "status": "RUNNING", - "simd": "AVX2 (256-bit, 8 lanes)", - "gpuAvailable": false, - "rerankerEnabled": false, - "documentCount": 1250, - "dimensions": 384, - "capacity": 100000 + "id": "doc-1", + "title": "Java Vector API", + "content": "SIMD-accelerated search engine on modern JVM", + "vector": [0.1, 0.2, 0.3, 0.4, 0.5] } ``` ---- - -### `GET /api/v1/metrics` - -Request metrics including query counts, latencies, and throughput. 
+**Response (200):** -```bash -curl http://localhost:7070/api/v1/metrics -``` - -**Response `200`:** ```json { - "totalQueries": 4521, - "totalIngestions": 1250, - "avgLatencyMs": 0.34, - "p99LatencyMs": 1.12, - "queriesPerSecond": 8432.5 + "id": "doc-1", + "status": "indexed" } ``` ---- - -## 📥 Ingest Endpoints - -### `POST /api/v1/ingest` - -Ingest a single document with a pre-computed vector embedding. - -```bash -curl -X POST http://localhost:7070/api/v1/ingest \ - -H "Content-Type: application/json" \ - -H "X-API-Key: my-secret-key" \ - -d '{ - "id": "doc-1", - "title": "Java Vector API", - "content": "SIMD-accelerated search engine on modern JVM", - "vector": [0.1, 0.2, 0.3, 0.4, 0.5] - }' -``` - -**Request Schema:** - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `id` | string | ✅ | Unique document identifier | -| `title` | string | ❌ | Document title | -| `content` | string | ✅ | Text content for BM25 indexing | -| `vector` | float[] | ✅ | Embedding vector (must match configured dimensions) | -| `metadata` | object | ❌ | Arbitrary key-value metadata | - -**Response `200`:** -```json -{"id": "doc-1", "status": "indexed"} -``` - ---- - -### `POST /api/v1/ingest/auto` - -Ingest with automatic embedding generation. Requires a configured embedding provider (e.g., Ollama). 
- -```bash -curl -X POST http://localhost:7070/api/v1/ingest/auto \ - -H "Content-Type: application/json" \ - -d '{ - "id": "doc-2", - "title": "Panama FFM", - "content": "Foreign Function and Memory API for zero-copy storage" - }' -``` - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `id` | string | ✅ | Unique document identifier | -| `title` | string | ❌ | Document title | -| `content` | string | ✅ | Text content (used for both BM25 and embedding) | -| `metadata` | object | ❌ | Arbitrary key-value metadata | - ---- - -### `POST /api/v1/ingest/bulk` +### POST /api/v1/ingest/bulk Ingest multiple documents in a single request. -```bash -curl -X POST http://localhost:7070/api/v1/ingest/bulk \ - -H "Content-Type: application/json" \ - -d '{ - "documents": [ - {"id": "d1", "content": "first document", "vector": [0.1, 0.2, 0.3]}, - {"id": "d2", "content": "second document", "vector": [0.4, 0.5, 0.6]} - ] - }' -``` +**Request:** -**Response `200`:** ```json { - "indexed": 2, - "failed": 0, - "results": [ - {"id": "d1", "status": "indexed"}, - {"id": "d2", "status": "indexed"} + "documents": [ + {"id": "d1", "content": "first document", "vector": [0.1, 0.2, 0.3]}, + {"id": "d2", "content": "second document", "vector": [0.4, 0.5, 0.6]} ] } ``` --- -## 🔍 Search Endpoints +## Search -### `POST /api/v1/search` +### POST /api/v1/search -Auto-detecting search endpoint. The mode is determined by which fields you provide: +Auto-detecting search. Provide `text` for keyword, `vector` for vector, or both for hybrid. 
-| Fields Provided | Mode | Engine Used | -|-----------------|------|-------------| -| `text` only | 📝 KEYWORD | BM25 | -| `vector` only | 🧠 VECTOR | HNSW | -| `text` + `vector` | 🧬 HYBRID | RRF Fusion | +**Request:** -```bash -curl -X POST http://localhost:7070/api/v1/search \ - -H "Content-Type: application/json" \ - -d '{ - "text": "vector search engine", - "vector": [0.1, 0.2, 0.3, 0.4, 0.5], - "topK": 10 - }' +```json +{ + "text": "vector search engine", + "vector": [0.1, 0.2, 0.3], + "topK": 10 +} ``` -**Request Schema:** - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `text` | string | ❌* | Query text for keyword search | -| `vector` | float[] | ❌* | Query vector for similarity search | -| `topK` | int | ❌ | Number of results (default: 10, max: 10000) | - -> [!IMPORTANT] -> *At least one of `text` or `vector` must be provided. +**Response (200):** -**Response `200`:** ```json { "results": [ @@ -207,203 +68,135 @@ curl -X POST http://localhost:7070/api/v1/search \ "id": "doc-1", "score": 0.9523, "title": "Java Vector API", - "content": "SIMD-accelerated search engine on modern JVM" + "content": "SIMD-accelerated search engine..." } ], "searchMode": "HYBRID", - "latencyMs": 0.47, - "totalResults": 1 + "latencyMs": 0.47 } ``` ---- - -### `POST /api/v1/vector-search` +### POST /api/v1/vector-search -Explicit vector-only similarity search. +Vector-only similarity search. -```bash -curl -X POST http://localhost:7070/api/v1/vector-search \ - -H "Content-Type: application/json" \ - -d '{"vector": [0.1, 0.2, 0.3, 0.4, 0.5], "topK": 10}' -``` +### POST /api/v1/bm25 -### `POST /api/v1/bm25` - -Explicit keyword-only BM25 search. - -```bash -curl -X POST http://localhost:7070/api/v1/bm25 \ - -H "Content-Type: application/json" \ - -d '{"text": "SIMD acceleration", "topK": 10}' -``` +Keyword-only BM25 search. Only requires `text` field. 
-### `POST /api/v1/hybrid` +### POST /api/v1/hybrid Explicit hybrid search combining vector + keyword via RRF. -```bash -curl -X POST http://localhost:7070/api/v1/hybrid \ - -H "Content-Type: application/json" \ - -d '{"text": "vector search", "vector": [0.1, 0.2, 0.3, 0.4, 0.5], "topK": 10}' -``` - --- -### `GET /api/v1/search/stream` (SSE) - -Streaming search via Server-Sent Events. Results are emitted one-by-one as they become available, enabling progressive display in UIs. - -```bash -curl -N "http://localhost:7070/api/v1/search/stream?text=vector+search&topK=5&mode=HYBRID" -``` - -**Query Parameters:** +## RAG -| Param | Type | Required | Default | Description | -|-------|------|----------|---------|-------------| -| `text` | string | ❌* | — | Query text for keyword/hybrid search | -| `vector` | string | ❌* | — | Comma-separated floats (e.g., `0.1,0.2,0.3`) | -| `topK` | int | ❌ | 10 | Number of results | -| `mode` | string | ❌ | auto-detect | `KEYWORD`, `VECTOR`, or `HYBRID` | +### POST /api/v1/rag -> [!IMPORTANT] -> *At least one of `text` or `vector` must be provided. +Retrieval-Augmented Generation endpoint. Retrieves relevant context for LLM prompting. 
-**Event Stream:** +**Request:** +```json +{ + "query": "How does HNSW indexing work?", + "topK": 5, + "tokenLimit": 4096, + "searchMode": "hybrid" +} ``` -event: result -data: {"id":"doc-1","score":0.9523,"rank":1} - -event: result -data: {"id":"doc-3","score":0.8741,"rank":2} -event: result -data: {"id":"doc-7","score":0.8102,"rank":3} +**Response (200):** -event: done -data: {"totalHits":3,"queryTimeMs":12,"mode":"HYBRID"} +```json +{ + "context": "Assembled context text from relevant chunks...", + "attributions": [ + {"documentId": "doc-1", "chunkOffset": 0}, + {"documentId": "doc-3", "chunkOffset": 2} + ], + "isEmpty": false +} ``` -**Event Types:** +**Error Responses:** -| Event | Description | -|-------|-------------| -| `result` | A single search result with id, score, and rank | -| `done` | Search complete — includes timing and metadata | -| `error` | An error occurred during search | +- `400` — Missing or invalid query (must be 1–2000 chars) +- `503` — Embedding provider unavailable -> [!TIP] -> Use the `EventSource` API in browsers or any SSE client library. Results stream immediately as they are scored, giving users instant feedback. +--- -**JavaScript Example:** -```javascript -const source = new EventSource('/api/v1/search/stream?text=HNSW+algorithm&topK=5'); +## Index Management -source.addEventListener('result', (event) => { - const result = JSON.parse(event.data); - console.log(`#${result.rank}: ${result.id} (score: ${result.score})`); -}); +### POST /api/v1/index -source.addEventListener('done', (event) => { - const meta = JSON.parse(event.data); - console.log(`Search complete in ${meta.queryTimeMs}ms`); - source.close(); -}); -``` +Create or manage indexes. --- -## 🤖 RAG (Retrieval-Augmented Generation) - -### `POST /api/v1/rag` - -Retrieve relevant context for LLM prompting. Performs search, then assembles a context window from matching chunks. 
+## Document Management -```bash -curl -X POST http://localhost:7070/api/v1/rag \ - -H "Content-Type: application/json" \ - -d '{ - "query": "How does HNSW indexing work?", - "topK": 5, - "tokenLimit": 4096, - "searchMode": "hybrid" - }' -``` +### DELETE /api/v1/documents/{id} -**Request Schema:** +Delete a document by ID. -| Field | Type | Required | Default | Description | -|-------|------|----------|---------|-------------| -| `query` | string | ✅ | — | Query text (1–2000 chars) | -| `topK` | int | ❌ | 5 | Results to retrieve (1–100) | -| `tokenLimit` | int | ❌ | 4096 | Max context tokens (1–8192) | -| `searchMode` | string | ❌ | "vector" | `"vector"` or `"hybrid"` | +**Response (200):** -**Response `200`:** ```json { - "context": "Assembled context text from relevant document chunks...", - "attributions": [ - {"documentId": "doc-1", "chunkOffset": 0}, - {"documentId": "doc-3", "chunkOffset": 2} - ], - "isEmpty": false + "id": "doc-1", + "deleted": true } ``` --- -## 🗑️ Document Management - -### `DELETE /api/v1/documents/{id}` - -Delete a document by its ID. +## Monitoring -```bash -curl -X DELETE http://localhost:7070/api/v1/documents/doc-1 -``` +### GET /health -**Response `200`:** -```json -{"id": "doc-1", "deleted": true} -``` - ---- +Returns `200 OK` when the server is running. -## 📊 Index Management +### GET /api/v1/status -### `POST /api/v1/index` +Engine status including SIMD capabilities, GPU availability, and reranker configuration. -Create or manage indexes. +### GET /api/v1/metrics -```bash -curl -X POST http://localhost:7070/api/v1/index \ - -H "Content-Type: application/json" \ - -d '{"action": "create", "name": "my-index", "dimensions": 384}' -``` +Request metrics including query counts, latencies, and throughput. 
--- -## ❌ Error Responses +## Runnable REST API Example -| Status | Meaning | -|--------|---------| -| `200` | ✅ Success | -| `400` | Bad request (validation error, dimension mismatch) | -| `401` | Unauthorized (invalid or missing API key) | -| `404` | Resource not found | -| `503` | Service unavailable (embedding provider down) | +This complete example demonstrates ingesting a document and searching for it: ---- - -## 🔗 See Also - -- [Getting Started](../getting-started/quickstart.md) — Quick start with curl examples +```bash +# 1. Start the server (in another terminal) +mvn exec:java -pl spector-server \ + -Dexec.mainClass="com.spectrayan.spector.server.SpectorServer" \ + -Dexec.args="7070 5" -- [Java SDK Guide](../sdk-usage/java-client.md) — Type-safe programmatic access +# 2. Ingest a document +curl -X POST http://localhost:7070/api/v1/ingest \ + -H "Content-Type: application/json" \ + -d '{ + "id": "readme-1", + "title": "Spector Search", + "content": "Ultra-fast SIMD-accelerated semantic search engine", + "vector": [0.9, 0.1, 0.3, 0.7, 0.5] + }' -- [CLI Reference](../cli-reference/spectorctl.md) — Command-line access to the API +# 3. Search for it +curl -X POST http://localhost:7070/api/v1/search \ + -H "Content-Type: application/json" \ + -d '{ + "text": "fast search engine", + "vector": [0.8, 0.2, 0.3, 0.6, 0.4], + "topK": 5 + }' -- [Configuration Guide](../configuration/parameters.md) — Server and auth configuration \ No newline at end of file +# 4. 
Delete the document +curl -X DELETE http://localhost:7070/api/v1/documents/readme-1 +``` diff --git a/docs/docs/architecture/core-concepts.md b/docs/docs/architecture/core-concepts.md deleted file mode 100644 index b675e75..0000000 --- a/docs/docs/architecture/core-concepts.md +++ /dev/null @@ -1,284 +0,0 @@ -# 🧠 Core Concepts - -> **The algorithms and data structures that make Spector blazingly fast.** This page explains HNSW, IVF-PQ, BM25, RRF, and SIMD acceleration — the building blocks behind sub-millisecond hybrid search. - ---- - -## 🌐 HNSW (Hierarchical Navigable Small World) - -HNSW is the primary index structure for approximate nearest neighbor (ANN) vector search. It builds a multi-layered graph where each node represents a vector, and edges connect similar vectors. - -### 🔍 How It Works - -```mermaid -graph TD - subgraph "Layer 3 — Few nodes, long-range links" - A3[A] --- D3[D] - end - - subgraph "Layer 2 — More nodes, medium links" - A2[A] --- C2[C] --- D2[D] --- F2[F] - end - - subgraph "Layer 1 — Most nodes, short links" - A1[A] --- B1[B] --- C1[C] --- D1[D] --- E1[E] --- F1[F] --- G1[G] - end - - subgraph "Layer 0 — All nodes, local links" - A0[A] --- B0[B] --- C0[C] --- D0[D] --- E0[E] --- F0[F] --- G0[G] --- H0[H] - end - - A3 -.-> A2 -.-> A1 -.-> A0 - D3 -.-> D2 -.-> D1 -.-> D0 -``` - -**Search algorithm:** -1. Enter at the top layer's entry point -2. Greedily traverse to the closest node at each layer -3. Drop to the next layer, using the found node as the new entry -4. At layer 0, explore `efSearch` candidates to find top-K nearest neighbors - -### ⚙️ Key Parameters - -| Parameter | Default | Effect | -|-----------|---------|--------| -| `M` | 16 | Max connections per node. Higher = better recall, more memory | -| `efConstruction` | 200 | Build-time beam width. Higher = better graph quality, slower build | -| `efSearch` | 50 | Query-time beam width. 
Higher = better recall, slower query | - -### 🚀 Why HNSW is Fast - -- **Logarithmic complexity** — O(log N) layers mean search scales well - -- **Greedy navigation** — Each step moves closer to the target - -- **SIMD distance computation** — Every neighbor comparison uses hardware-accelerated vector math - -- **Cache-friendly** — Graph traversal exhibits good spatial locality - -### 💾 Persistence Format - -Spector uses a page-aligned binary format for HNSW persistence: - -``` -[Header: 64 bytes] → magic "SPHW", version, metadata -[Vector Region] → 4KB-aligned float32 vectors (memory-mappable) -[Graph Region] → Per-node adjacency lists -[ID Table] → External ID ↔ internal offset mapping -``` - -> [!TIP] -> Loading is a single `mmap` syscall — no deserialization needed. Startup is instant regardless of index size. - ---- - -## 🗜️ IVF-PQ (Inverted File with Product Quantization) - -IVF-PQ enables billion-scale search with **32× memory compression**. It combines two techniques: - -### 📊 IVF: Coarse Partitioning - -```mermaid -graph LR - subgraph "Training: K-Means clusters vectors into cells" - Q[Query Vector] --> C0[Cell 0
• • •] - Q --> C1[Cell 1
• • •] - Q --> C2[Cell 2
• • •] - Q --> C3[Cell N
• • •] - end -``` - -Instead of comparing against all vectors, IVF narrows search to the `nprobe` nearest cells. - -### 🧬 PQ: Product Quantization - -PQ compresses each vector from full float32 to compact codes: - -| Step | Data | Size | -|------|------|------| -| Original vector (384 dims) | `[0.12, 0.45, ..., 0.78]` | 1,536 bytes | -| Split into 16 subspaces | `[sub1] [sub2] ... [sub16]` | — | -| Each quantized to 1 byte | `[42] [187] [3] ... [201]` | **16 bytes** | -| **Compression ratio** | | **96×** | - -> [!IMPORTANT] -> At 32 subspaces with 256 centroids, you get **32× compression** while maintaining recall@10 ≥ 80%. - -### ⚡ ADC (Asymmetric Distance Computation) - -During search, PQ uses lookup tables instead of full distance computation: - -1. Pre-compute distances from query to all 256 centroids per subspace (256 × 32 = 8,192 lookups) -2. For each compressed vector, sum up table lookups (32 additions per vector) -3. This is orders of magnitude faster than full float32 distance - ---- - -## 📝 BM25 (Best Matching 25) - -BM25 is the keyword scoring algorithm used for text search. It extends TF-IDF with term saturation and document length normalization. - -### 📐 Scoring Formula - -``` -score(D, Q) = Σ IDF(qi) × (tf(qi, D) × (k1 + 1)) / (tf(qi, D) + k1 × (1 - b + b × |D|/avgdl)) -``` - -| Variable | Meaning | -|----------|---------| -| `tf(qi, D)` | Term frequency of query term qi in document D | -| `IDF(qi)` | Inverse document frequency (how rare the term is) | -| `\|D\|` | Document length | -| `avgdl` | Average document length across corpus | -| `k1` | Term frequency saturation (default: 1.2) | -| `b` | Length normalization factor (default: 0.75) | - -### ⚙️ Key Parameters - -| Parameter | Default | Effect | -|-----------|---------|--------| -| `k1` | 1.2 | Controls how quickly term frequency saturates. Lower = faster saturation | -| `b` | 0.75 | Controls document length penalty. 
0 = no normalization, 1 = full | - -### 🚀 Spector's BM25 Implementation - -| Optimization | Benefit | -|-------------|---------| -| `float[]` scoring | Raw float arrays for max throughput | -| Min-heap top-K | Only tracks best K results (no full sort) | -| Virtual-thread parallel terms | Multi-term queries score in parallel | - -**Result:** 0.60 ms avg at 100K docs — faster than Elasticsearch's BM25. - ---- - -## 🧬 Reciprocal Rank Fusion (RRF) - -RRF combines ranked results from multiple search methods into a single unified ranking. - -### 📐 Formula - -``` -RRF_score(d) = Σ 1 / (k + rank_i(d)) -``` - -Where `k` = 60 (default fusion constant) and `rank_i(d)` = rank of document d in the i-th result list. - -### 💡 Example - -```mermaid -graph LR - subgraph "BM25 Results" - B1["docA (rank 1)"] - B2["docB (rank 2)"] - B3["docC (rank 3)"] - end - - subgraph "Vector Results" - V1["docC (rank 1)"] - V2["docA (rank 2)"] - V3["docD (rank 3)"] - end - - subgraph "🧬 RRF Fusion (k=60)" - R1["docA: 0.0325 ✨"] - R2["docC: 0.0323"] - R3["docB: 0.0161"] - R4["docD: 0.0159"] - end - - B1 --> R1 - B2 --> R3 - B3 --> R2 - V1 --> R2 - V2 --> R1 - V3 --> R4 -``` - -### ✅ Why RRF Works - -- **Rank-based, not score-based** — Avoids normalization issues between different scoring methods - -- **Resistant to outliers** — A high score in one system can't dominate - -- **Parameter-light** — Only one tunable constant (k) - -- **Empirically strong** — Competitive with learned fusion methods - ---- - -## ⚡ SIMD Acceleration via Java Vector API - -Spector uses the Java Vector API (`jdk.incubator.vector`) to execute vector math on hardware SIMD lanes. 
- -### 🔬 How It Works - -```java -// Traditional scalar loop (1 operation per cycle): -for (int i = 0; i < dim; i++) { - sum += a[i] * b[i]; -} - -// SIMD vectorized (8-16 operations per cycle): -var species = FloatVector.SPECIES_PREFERRED; // AVX2=8, AVX-512=16 -for (int i = 0; i < dim; i += species.length()) { - var va = FloatVector.fromArray(species, a, i); - var vb = FloatVector.fromArray(species, b, i); - sum = va.fma(vb, sum); // Fused multiply-add -} -``` - -### 🎯 Supported Kernels - -| Kernel | Operation | Used By | -|--------|-----------|---------| -| Dot Product | `Σ(a[i] × b[i])` | Vector similarity (DOT_PRODUCT mode) | -| Cosine Similarity | `dot(a,b) / (‖a‖ × ‖b‖)` | Vector similarity (COSINE mode) | -| Euclidean Distance | `√Σ(a[i] - b[i])²` | Vector similarity (EUCLIDEAN mode) | -| Vector Ops | Norm, normalize, quantize | Internal utilities | - -### 🖥️ Hardware Adaptation - -The Vector API automatically selects the best SIMD width for your hardware: - -| ISA | Width | Lanes (float32) | Platform | -|-----|-------|-----------------|----------| -| AVX2 | 256-bit | 8 | Most modern x86 CPUs | -| AVX-512 | 512-bit | 16 | Intel Xeon, recent AMD | -| NEON | 128-bit | 4 | Apple Silicon, ARM servers | - -### 📊 Performance Impact - -SIMD kernels achieve sub-microsecond latency: - -| Dimension | Dot Product P50 | Cosine P50 | -|-----------|----------------|-----------| -| 32 | 200 ns | 1,100 ns | -| 128 | <100 ns | <100 ns | -| 384 | ~100 ns | ~100 ns | -| 768 | ~100 ns | ~100 ns | - -> [!NOTE] -> Values at 128+ dimensions are at `System.nanoTime()` resolution floor. JMH confirms millions of ops/sec throughput. 
- -### 🎨 Design Principles - -- **Never hardcode lane widths** — Always use `FloatVector.SPECIES_PREFERRED` - -- **Branchless tail handling** — Use `VectorMask` for dimensions not divisible by lane count - -- **Zero allocations in hot path** — Reuse buffers, slice-based APIs - -- **Fused multiply-add** — Use FMA where available for accuracy and speed - ---- - -## 🔗 See Also - -- [Architecture Overview](overview.md) — How these components fit together - -- [GPU Acceleration](gpu-acceleration.md) — CUDA kernels for batch operations - -- [Performance Tuning](../operations/performance-tuning.md) — How to tune these parameters - -- [Configuration Guide](../configuration/parameters.md) — All parameter defaults and ranges \ No newline at end of file diff --git a/docs/docs/architecture/distributed-mode.md b/docs/docs/architecture/distributed-mode.md deleted file mode 100644 index 2473c3e..0000000 --- a/docs/docs/architecture/distributed-mode.md +++ /dev/null @@ -1,253 +0,0 @@ -# 🌐 Distributed Mode - -> **Scale Spector horizontally across multiple nodes.** The distributed architecture uses consistent hash sharding, configurable replication, heartbeat-based membership, and parallel query fan-out with result merging via gRPC. - ---- - -## 🏗️ Architecture Overview - -```mermaid -graph TD - Client["👤 Client"] --> Coord["🧭 Query Coordinator
Fan-out + Merge + Dedup"] - - Coord --> S0["💾 Shard 0
(Primary)"] - Coord --> S1["💾 Shard 1
(Primary)"] - Coord --> S2["💾 Shard 2
(Primary)"] - - S0 --> R0["📋 Replica 0a"] - S1 --> R1["📋 Replica 1a"] - S2 --> R2["📋 Replica 2a"] - - MS["💓 Membership Service
(Heartbeat)"] -.-> S0 - MS -.-> S1 - MS -.-> S2 -``` - ---- - -## 🧩 Components - -### 🔑 Shard Manager - -The `ConsistentHashShardManager` distributes documents across shards using consistent hashing on document IDs. - -```mermaid -graph LR - subgraph "Hash Ring" - H1["Hash(doc-A) → Shard 0"] - H2["Hash(doc-B) → Shard 2"] - H3["Hash(doc-C) → Shard 1"] - end -``` - -**Properties:** - -- Each shard owns a range on a hash ring (using virtual nodes for even distribution) - -- Document ID → hash → ring position → assigned shard (deterministic) - -- Adding a shard migrates only affected documents (minimal data movement) - -- Shard count changes apply without full cluster restart - ---- - -### 📋 Replication Manager - -Each shard maintains configurable replicas for fault tolerance. - -| Behavior | Details | -|----------|---------| -| Writes | Go to primary, replicate to all replicas within 2s | -| Reads | Served from any fully-synchronized replica | -| Primary failure | Replica promoted within 10 seconds | -| Recovery | Delta sync only (data changed since failure) | - ---- - -### 💓 Membership Service - -Heartbeat-based cluster membership tracking. 
- -| Parameter | Default | Range | -|-----------|---------|-------| -| `heartbeatInterval` | 2s | 500ms–30s | -| `heartbeatTimeout` | 10s | 3s–120s | - -**Behavior:** - -- Nodes send periodic heartbeats to announce liveness - -- Missing heartbeats beyond timeout → node marked unavailable - -- New nodes trigger shard rebalancing within 5 seconds - -- All active nodes converge to the same membership view within 5 seconds - ---- - -### 🧭 Query Coordinator - -```mermaid -sequenceDiagram - participant Client as 👤 Client - participant Coord as 🧭 Coordinator - participant S0 as 💾 Shard 0 - participant S1 as 💾 Shard 1 - participant S2 as 💾 Shard 2 - - Client->>Coord: Search request - par Fan-out (parallel gRPC) - Coord->>S0: Query - Coord->>S1: Query - Coord->>S2: Query - end - S0-->>Coord: Results - S1-->>Coord: Results - S2-->>Coord: Results - Note over Coord: Merge by score + dedup by ID - Coord-->>Client: ✨ Global top-K results -``` - -> [!NOTE] -> If some shards timeout, the coordinator returns **partial results** from responding shards plus metadata indicating which shards were unreachable. 
- ---- - -## 🚀 Deployment Guide - -### Prerequisites - -- All nodes must run the same Spector version - -- Nodes must be reachable via gRPC (default port: 9090) - -- Network latency between nodes should be <10ms for optimal performance - -### Starting a Cluster - -**Node 1 (seed node):** - -```bash -java -jar spector-node.jar \ - --cluster-mode \ - --node-id node-1 \ - --grpc-port 9090 \ - --shard-count 4 \ - --replica-count 2 \ - --seeds node-1:9090 -``` - -**Node 2:** - -```bash -java -jar spector-node.jar \ - --cluster-mode \ - --node-id node-2 \ - --grpc-port 9090 \ - --shard-count 4 \ - --replica-count 2 \ - --seeds node-1:9090 -``` - -**Node 3:** - -```bash -java -jar spector-node.jar \ - --cluster-mode \ - --node-id node-3 \ - --grpc-port 9090 \ - --shard-count 4 \ - --replica-count 2 \ - --seeds node-1:9090 -``` - -### ✅ Verifying Cluster Health - -```bash -curl http://node-1:7070/api/v1/status -``` - -```json -{ - "status": "RUNNING", - "clusterMode": true, - "activeNodes": 3, - "shardCount": 4, - "replicaCount": 2, - "topology": { - "node-1": {"status": "ACTIVE", "shards": [0, 1]}, - "node-2": {"status": "ACTIVE", "shards": [2, 3]}, - "node-3": {"status": "ACTIVE", "shards": ["0-replica", "2-replica"]} - } -} -``` - -### 🔒 gRPC TLS Setup - -For production deployments, enable TLS on gRPC communication: - -```bash -java -jar spector-node.jar \ - --cluster-mode \ - --grpc-port 9090 \ - --grpc-tls \ - --grpc-cert /path/to/cert.pem \ - --grpc-key /path/to/key.pem \ - --grpc-ca /path/to/ca.pem -``` - ---- - -## 🛡️ Failure Scenarios - -### 💥 Node Failure - -```mermaid -graph TD - A["💥 Node fails"] --> B["💓 Heartbeat timeout detected"] - B --> C["🚫 Node removed from routing"] - C --> D["📋 Replica promoted to primary"] - D --> E["✅ Queries continue from remaining nodes"] -``` - -### 🔄 Node Recovery - -```mermaid -graph TD - A["🔄 Node resumes heartbeats"] --> B["💓 Re-registered in membership"] - B --> C["📋 Delta sync (only changed data)"] - C --> D["✅ Node resumes 
serving reads/writes"] -``` - -### 🌐 Network Partition - -- Nodes on each side continue serving their local shards - -- Queries to unreachable shards return partial results with timeout metadata - -- When partition heals, membership reconverges and replicas sync - ---- - -## 📈 Scaling Guidelines - -| Cluster Size | Shards | Documents | Estimated Throughput | -|-------------|--------|-----------|---------------------| -| 2 nodes | 2–4 | Up to 500K | ~15K QPS | -| 4 nodes | 4–8 | Up to 2M | ~29K QPS | -| 8 nodes | 8–16 | Up to 5M | ~55K QPS | -| 16 nodes | 16–32 | Up to 10M | ~100K QPS | - -> [!NOTE] -> Throughput estimates assume 128-dim vectors, top-10, hybrid search, extrapolated from single-node measured throughput of ~7.3K concurrent hybrid ops/s at 16 threads. Actual cluster throughput depends on network latency, shard balance, query routing overhead, and hardware homogeneity. These are projected estimates, not measured cluster benchmarks. - ---- - -## 🔗 See Also - -- [Architecture Overview](overview.md) — Overall system architecture - -- [Configuration Guide](../configuration/parameters.md) — Cluster parameters - -- [Performance Tuning](../operations/performance-tuning.md) — Optimizing distributed performance \ No newline at end of file diff --git a/docs/docs/architecture/gpu-acceleration.md b/docs/docs/architecture/gpu-acceleration.md deleted file mode 100644 index 7c953c1..0000000 --- a/docs/docs/architecture/gpu-acceleration.md +++ /dev/null @@ -1,260 +0,0 @@ -# 🎮 GPU Acceleration - -> **Unlock massive parallel throughput with optional CUDA GPU acceleration.** Spector loads GPU kernels via Panama FFM (Foreign Function & Memory), maintaining the zero-JNI philosophy. GPU shines for batch workloads — single queries are already sub-millisecond on CPU SIMD. - ---- - -## 🎯 When to Use GPU - -```mermaid -graph TD - Q["How many concurrent queries?"] --> Single["Single query
Low concurrency"] - Q --> Batch["Batch queries
High concurrency"] - - Single --> CPU["✅ CPU SIMD
Best for HNSW traversal"] - Batch --> GPU["✅ GPU CUDA
4× speedup at 100K+ vectors"] - - style CPU fill:#d4edda - style GPU fill:#d4edda -``` - -| Scenario | Recommendation | -|----------|---------------| -| ✅ Batch search (multiple queries at once) | GPU | -| ✅ Large collections (>100K vectors) | GPU | -| ✅ High concurrency (many simultaneous users) | GPU | -| ✅ Brute-force similarity over IVF partitions | GPU | -| ⚡ Single queries | CPU SIMD | -| ⚡ Small datasets (<10K vectors) | CPU SIMD | -| ⚡ Ultra-low latency (<0.1ms) | CPU SIMD | - ---- - -## 📋 Requirements - -### Hardware - -- NVIDIA GPU with Compute Capability ≥ 7.0 (Volta or newer) - -- Recommended: RTX 3060+ or A100/H100 for production workloads - -### Software - -| Component | Version | Notes | -|-----------|---------|-------| -| CUDA Toolkit | 12.x | Runtime libraries required | -| NVIDIA Driver | 525+ | Must match CUDA version | -| JDK | 25+ | With Panama FFM support | - -### 🐧 Installation (Linux) - -```bash -# Install CUDA toolkit -wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -sudo dpkg -i cuda-keyring_1.1-1_all.deb -sudo apt update -sudo apt install cuda-toolkit-12-4 - -# Verify -nvidia-smi -nvcc --version -``` - -### ✅ Verify Spector GPU Detection - -```bash -curl http://localhost:7070/api/v1/status -``` -```json -{ - "gpuAvailable": true, - "gpuInfo": "NVIDIA RTX 4090, 24GB, CUDA 12.4" -} -``` - ---- - -## ⚙️ Configuration - -```java -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withGpu(true) - .withGpuMemoryBudget(2048); // 2 GB -``` - -| Parameter | Default | Range | Description | -|-----------|---------|-------|-------------| -| `gpuEnabled` | false | — | Enable CUDA acceleration | -| `gpuMemoryBudget` | 256 MB | 256 MB – GPU max | Maximum device memory | -| `gpuBatchWindow` | 10 ms | 1–100 ms | Batching window for query collection | -| `gpuMaxBatchSize` | 1024 | 1–1024 | Max queries per kernel launch | - -> [!TIP] -> Set `gpuMemoryBudget` to ~70% of available GPU memory 
to leave room for other processes. - ---- - -## 🔬 GPU Kernels - -### Dot Product Kernel - -Computes dot-product similarity between a query vector and a batch of document vectors. - -| Property | Value | -|----------|-------| -| Input | query (float32[D]) + database (float32[N × D]) | -| Output | similarity scores (float32[N]) | -| Dimensions | Multiples of 32, range 32–2048 | -| Batch size | 1–1,000,000 vectors per invocation | -| Tolerance | ≤1e-5 absolute error vs CPU SIMD | - -### Cosine Similarity Kernel - -Computes cosine similarity with cached norm computation. - -| Optimization | Benefit | -|-------------|---------| -| Pre-computes norms | Cached across queries | -| Detects pre-normalized vectors | Skips norm computation | -| Falls back to dot product | For normalized inputs | -| Tolerance | ≤1e-6 vs CPU SIMD | - -### ⏱️ Batch GPU Search - -```mermaid -sequenceDiagram - participant Q1 as Query A (t=0ms) - participant Q2 as Query B (t=3ms) - participant Q3 as Query C (t=7ms) - participant GPU as 🎮 GPU Kernel - - Note over Q1,GPU: Batch window = 10ms - Q1->>GPU: Queued - Q2->>GPU: Queued - Q3->>GPU: Queued - Note over GPU: t=10ms: Window closes - GPU->>GPU: Single kernel for [A, B, C] - GPU-->>Q1: Top-K results for A - GPU-->>Q2: Top-K results for B - GPU-->>Q3: Top-K results for C -``` - -**Properties:** - -- Each query receives its own independent top-K results - -- Individual query errors don't fail the batch - -- Achieves ≥2× throughput vs sequential for batch sizes >4 - -- Large batches are automatically partitioned to fit GPU memory - ---- - -## 💾 Memory Management - -The `GpuMemoryManager` handles device memory via Panama FFM: - -```java -// Allocation tied to Arena lifecycle -try (Arena arena = Arena.ofConfined()) { - MemorySegment deviceMem = gpuMemoryManager.allocateDevice(sizeBytes, arena); - // Use device memory... 
-} // Automatically freed when arena closes -``` - -**Key behaviors:** - -- ✅ Allocations are Arena-scoped with explicit lifecycle - -- ✅ Pinned host memory for efficient host↔device transfers - -- ✅ Budget enforcement prevents over-allocation - -- ✅ Device memory released within 100ms of Arena close - -- ✅ Metrics available via monitoring API - ---- - -## 🔄 Fallback Behavior - -```mermaid -graph TD - A["GPU Kernel Call"] --> B{"GPU available?"} - B -->|No| C["⚡ CPU SIMD kernel
(same interface)"] - B -->|Yes| D{"Kernel execution OK?"} - D -->|Error| E["Release device memory"] - E --> C - D -->|Success| F["✅ Return GPU results"] -``` - -> [!NOTE] -> **No code changes required.** The same method signature returns results regardless of whether GPU or CPU executed the computation. Fallback is automatic and transparent. - -**Fallback triggers:** - -- GPU not detected at startup - -- CUDA driver not installed - -- Insufficient GPU memory - -- CUDA kernel execution error - -- GPU memory budget exceeded - ---- - -## 📊 Performance Characteristics - -### Single Query (CPU wins) - -| Method | 100K vectors, 384-dim | -|--------|----------------------| -| ⚡ CPU SIMD (AVX2) | ~0.05 ms | -| 🎮 GPU (kernel launch overhead) | ~0.5–1 ms | - -### Batch Queries (GPU shines) - -| Batch Size | CPU SIMD | GPU (resident) | GPU Speedup | -|-----------|----------|----------------|-------------| -| 10K | 0.35 ms | 0.21 ms | **1.7×** | -| 100K | 9.13 ms | 2.24 ms | **4.1×** | -| 500K | 45.75 ms | 11.31 ms | **4.0×** | -| 1M | 90.77 ms | 22.09 ms | **4.1×** | - -> [!IMPORTANT] -> GPU acceleration benchmarked on RTX 4060 Ti 16GB, 384-dim vectors, with database persistently resident in VRAM. The one-time upload cost is ~464ms for 1M vectors (1.5GB). Per-query cost only includes uploading the query vector (~1.5KB) and downloading results. GPU provides consistent 4× speedup for brute-force search at scale. 
- ---- - -## 🔧 Troubleshooting - -| Symptom | Cause | Solution | -|---------|-------|----------| -| `gpuAvailable: false` | CUDA not installed | Install CUDA toolkit, verify `nvidia-smi` | -| Slow GPU queries | Small batch sizes | Increase `gpuBatchWindow` or disable GPU | -| Out of GPU memory | Budget too low | Increase `gpuMemoryBudget` | -| CPU fallback always used | Native access not enabled | Add `--enable-native-access=ALL-UNNAMED` | - -### JVM Arguments for GPU - -```bash -java --add-modules jdk.incubator.vector \ - --enable-native-access=ALL-UNNAMED \ - -jar spector-node.jar -``` - ---- - -## 🔗 See Also - -- [Core Concepts](core-concepts.md) — SIMD kernels that GPU extends - -- [Performance Tuning](../operations/performance-tuning.md) — When to use GPU vs CPU - -- [Configuration Guide](../configuration/parameters.md) — GPU parameters - -- [Architecture Overview](overview.md) — Where GPU fits in the system \ No newline at end of file diff --git a/docs/docs/architecture/ingestion-pipeline.md b/docs/docs/architecture/ingestion-pipeline.md deleted file mode 100644 index 034a8c5..0000000 --- a/docs/docs/architecture/ingestion-pipeline.md +++ /dev/null @@ -1,259 +0,0 @@ -# 📥 Ingestion Pipeline - -> **Unified ingestion: document → chunk → embed → target.** A single `IngestionPipeline` with builder configuration handles all ingestion — for both search engine and cognitive memory. The pipeline decides how to process content; the `IngestionTarget` decides where to store it. 
- ---- - -## Architecture - -All entry points (CLI, MCP, Server) route ingestion through `SpectorRuntime`: - -``` -CLI/MCP/Server → SpectorRuntime.ingestion() → IngestionHandler → IngestionPipeline - │ - ┌─────┴─────┐ - ▼ ▼ - EngineIngestionTarget CognitiveIngestionTarget - (SEARCH mode) (MEMORY mode) -``` - -- **`IngestionPipeline`** (in `spector-ingestion`) — unified chunk → embed → store orchestrator with builder pattern -- **`IngestionTarget`** (in `spector-ingestion`) — abstraction for storage backends (engine or memory) -- **`IngestionHandler`** (in `spector-runtime`) — thin routing layer over the pipeline -- **`FileDiscoveryService`** (in `spector-ingestion`) — pure file discovery + title extraction utility - -## Module: `spector-ingestion` - -The ingestion module is a **low-level utility** with no dependency on engine, runtime, or memory. It defines the pipeline and the `IngestionTarget` interface that downstream modules implement. - -**Key classes:** - -| Class | Purpose | -|-------|---------| -| `IngestionPipeline` | Builder-configured orchestrator — chunk → embed → store | -| `IngestionTarget` | Interface for storage backends (`ingest(id, text, vector)`) | -| `IngestionResult` | Outcome with chunk counts, failures, timing | -| `FileDiscoveryService` | File discovery, title extraction, config-driven filtering | - ---- - -## 🔄 Pipeline Flow - -```mermaid -flowchart LR - A["📄 Document"] --> B{"Content > threshold?"} - B -->|Yes| C["✂️ TextChunker
Config-driven
chunk size + overlap"] - B -->|No| D["Direct embed"] - C --> E["🧠 Parallel Embedding
Virtual threads
ParallelEmbeddingPipeline"] - D --> E - E --> F["💾 IngestionTarget
Engine or Cognitive"] - F --> G["✅ IngestionResult"] -``` - ---- - -## 🏗️ Builder Pattern - -The pipeline is configured once via a builder, then reused for all ingestion in a session: - -```java -// Read chunking config from spector.yml -var ingestionConfig = SpectorConfigFactory.ingestionDefaults(props); - -var pipeline = IngestionPipeline.builder() - .target(engineTarget) // or cognitiveTarget - .embeddingProvider(embedder) // for auto-embedding - .chunking(new TextChunker( - ingestionConfig.chunkSize(), - ingestionConfig.chunkOverlap())) - .chunkThreshold(ingestionConfig.chunkSize()) - .build(); -``` - -The pipeline automatically selects a strategy based on content: - -| Content | Strategy | Description | -|---------|----------|-------------| -| ≤ threshold | **Direct** | Embed whole text, store as single doc | -| > threshold | **Chunked** | Split via `TextChunker`, embed in parallel, store each chunk | -| Pre-embedded | **Passthrough** | Skip embedding, store vector directly | -| File path | **Streaming** | `StreamingChunker` for bounded-memory processing | - ---- - -## 🎯 IngestionTarget Interface - -The pipeline is decoupled from storage — it writes to any `IngestionTarget`: - -```java -public interface IngestionTarget { - void ingest(String id, String text, float[] vector); - - default void storeParentMetadata(String parentId, int chunkCount) {} - default void onBatchComplete() {} -} -``` - -### Implementations - -| Target | Module | What it does | -|--------|--------|-------------| -| `EngineIngestionTarget` | `spector-engine` | VectorStore → VectorIndex (HNSW/IVF/Spectrum) → KeywordIndex (BM25) | -| `CognitiveIngestionTarget` | `spector-memory` | Synaptic tags → Surprise detection → ICNU fusion → Quantize → Tier route → WAL | - -This decoupling enables: - -- **Testing** — Mock the target for unit tests -- **Rebuilding indexes** — Point at a fresh index during reindexing -- **Multi-tenant setups** — Route documents to different targets -- **Custom stores** — 
Write to external systems alongside Spector - -### Virtual Thread Parallelism - -Embedding calls (I/O-bound, network) run in parallel using the `ParallelEmbeddingPipeline`: - -```mermaid -sequenceDiagram - participant Pipeline as 📥 IngestionPipeline - participant Chunker as ✂️ TextChunker - participant Embed as 🧠 ParallelEmbeddingPipeline - participant VT1 as Virtual Thread 1 - participant VT2 as Virtual Thread 2 - participant Target as 💾 IngestionTarget - - Pipeline->>Chunker: chunk(document) - Chunker-->>Pipeline: List - Pipeline->>Embed: embed(chunkTexts) - par Batch 1 - Embed->>VT1: embedBatch([c1,c2,c3,c4]) - and Batch 2 - Embed->>VT2: embedBatch([c5,c6,c7,c8]) - end - VT1-->>Embed: vectors[0..3] - VT2-->>Embed: vectors[4..7] - Embed-->>Pipeline: List - loop For each successful embedding - Pipeline->>Target: ingest(chunkId, text, vector) - end - Pipeline-->>Pipeline: IngestionResult -``` - -> [!NOTE] -> CPU-bound work (chunking, keyword tokenization, SIMD index insertion) runs synchronously on the caller's virtual thread. Only the embedding I/O call is parallelized. This avoids context-switch overhead on hot paths. - ---- - -## 📋 Ingestion Modes - -### Text Ingestion (auto-chunked) - -```java -// Pipeline decides whether to chunk based on content length vs. 
threshold -IngestionResult result = pipeline.ingest("doc-1", longDocumentText); -``` - -### Pre-embedded (skip embedding) - -```java -// For pre-computed vectors — no chunking, no embedding -IngestionResult result = pipeline.ingest("doc-1", "Hello world", precomputedVector); -``` - -### Streaming File Ingestion - -For multi-GB files that can't fit in memory: - -```java -IngestionResult result = pipeline.ingest( - Path.of("corpus.txt"), "corpus"); -// Bounded memory: only ~2× chunkSize held at once via StreamingChunker -``` - ---- - -## 📊 Result Tracking - -Every ingestion operation returns an `IngestionResult`: - -```java -public record IngestionResult( - String documentId, - int chunksStored, - List failures, // chunk IDs that failed - long durationMs -) {} -``` - -**Properties:** - -- Failed chunks don't halt the pipeline — other chunks continue -- Failure reasons are logged at WARN level -- `isFullSuccess()` returns true only if all chunks succeeded -- Timing includes chunking + embedding + storage - ---- - -## 🧠 Cognitive Target Pipeline - -When the `CognitiveIngestionTarget` receives a chunk from the unified pipeline, it executes the cognitive processing steps: - -``` -IngestionPipeline CognitiveIngestionTarget - │ │ - │ ingest(id, text, vector) │ - ├──────────────────────────────────────────► │ - │ ├── 2. Encode synaptic tags (Bloom filter) - │ ├── 3. Compute surprise (Dopamine) - │ ├── 3b. ICNU fusion (if hints provided) - │ ├── 4. Flashbulb check (extreme surprise) - │ ├── 5. Quantize to INT8 - │ ├── 6. Build cognitive header - │ ├── 7. Write to tier store - │ ├── 8. Register in MemoryIndex - │ └── 9. WAL append -``` - -`SpectorMemory.remember()` calls `CognitiveIngestionTarget.ingestCognitive()` directly with full cognitive parameters (type, tags, source, ICNU hints). - ---- - -## ⚡ Design Decisions - -### Why not Reactor? 
- -The pipeline uses virtual threads instead of Project Reactor because: - -| Concern | Virtual Threads | Reactor | -|---------|----------------|---------| -| Embedding I/O | Native async via VT | Requires `Mono.fromCallable` wrapping | -| Error handling | try/catch, intuitive | `onErrorResume` chains | -| Debugging | Normal stack traces | Operator assembly traces | -| Testing | Standard JUnit | `StepVerifier` complexity | -| Dependencies | Zero (JDK only) | reactor-core + reactor-netty | - -### Why a unified pipeline? - -Consolidating from 3 separate ingestion paths: - -1. **Single code path** — Same chunking + embedding logic for search and memory -2. **Config-driven** — Chunk size, overlap, threshold all read from `spector.yml` -3. **No OOM** — Streaming chunker ensures bounded memory for large files -4. **Extensible** — New targets only need to implement `IngestionTarget.ingest()` - -### Why a separate module? - -Extracting ingestion from `SpectorEngine`: - -1. **Testability** — Pipeline can be unit-tested with a mock `IngestionTarget` -2. **Reusability** — Bulk ingestion tools don't need the full engine -3. **Clarity** — Ingestion logic is isolated from search/lifecycle concerns -4. 
**Extensibility** — Custom pipelines can compose different chunkers/embedders - ---- - -## 🔗 See Also - -- [RAG Pipeline](rag-pipeline.md) — Retrieval and context assembly -- [Architecture Overview](overview.md) — Module dependency graph -- [REST API Reference](../api-reference/rest-endpoints.md) — Ingest endpoints -- [Configuration Guide](../configuration/parameters.md) — Chunking and embedding parameters diff --git a/docs/docs/architecture/mcp-integration.md b/docs/docs/architecture/mcp-integration.md deleted file mode 100644 index 34f3459..0000000 --- a/docs/docs/architecture/mcp-integration.md +++ /dev/null @@ -1,304 +0,0 @@ -# 🤖 MCP Integration Architecture - -> **Spector's built-in Model Context Protocol (MCP) server gives any AI agent instant, in-process access to SIMD-accelerated vector search — with zero network overhead.** - ---- - -## Overview - -The [Model Context Protocol (MCP)](https://modelcontextprotocol.io/) is Anthropic's open standard for connecting AI agents to external data sources. Instead of writing custom Python glue-code with orchestration frameworks, agents connect directly to an MCP server via JSON-RPC and autonomously invoke tools. - -**Spector's MCP server runs in-process.** When Claude Desktop or Cursor calls `semantic_search`, the request goes from JSON-RPC → Java method call → SIMD kernel — never touching a network socket. This makes Spector **23–113× faster than Python-based MCP servers** that route through HTTP/gRPC. - ---- - -## Architecture - -```mermaid -graph LR - subgraph "AI Agent (Claude, Cursor, etc.)" - Agent["🤖 AI Agent"] - end - - subgraph "spector-mcp (in-process)" - Transport["📡 StdioTransport
JSON-RPC 2.0"] - Server["⚡ SpectorMcpServer
Thin orchestrator"] - - subgraph Providers - TR["🔧 SpectorToolRegistry"] - RP["📄 SpectorResourceProvider"] - PP["💬 SpectorPromptProvider"] - end - - subgraph "Tools (McpToolHandler subclasses)" - T1["SemanticSearchTool"] - T2["HybridSearchTool"] - T3["RagQueryTool"] - T4["IngestDocumentTool"] - T5["DeleteDocumentTool"] - T6["EngineStatusTool"] - end - - subgraph Foundation - SB["ToolSchemaBuilder"] - RF["ResultFormatter"] - TH["McpToolHandler
Abstract base"] - end - end - - subgraph "spector-runtime" - Runtime["⚡ SpectorRuntime
Composition Root"] - end - - subgraph "spector-engine" - Engine["🔧 SpectorEngine"] - end - - subgraph "spector-core" - SIMD["🔬 SIMD Kernels
AVX2/AVX-512/NEON"] - end - - Agent -- "stdin/stdout" --> Transport - Transport --> Server - Server --> TR & RP & PP - TR --> T1 & T2 & T3 & T4 & T5 & T6 - T1 & T2 & T3 & T4 & T5 & T6 --> TH - T1 & T2 & T3 & T4 & T5 & T6 --> SB - T1 & T2 & T3 --> RF - T6 --> RF - T1 & T2 & T3 & T4 & T5 & T6 --> Runtime - Runtime --> Engine - Engine --> SIMD -``` - -### Data Flow - -```mermaid -sequenceDiagram - participant Agent as 🤖 AI Agent - participant MCP as 📡 MCP Transport (stdio) - participant Handler as 🔧 McpToolHandler - participant Runtime as ⚡ SpectorRuntime - participant Engine as 🔧 SpectorEngine - participant SIMD as 🔬 SIMD Kernel - - Agent->>MCP: tools/call {"name": "semantic_search", "arguments": {"query": "..."}} - MCP->>Handler: SemanticSearchTool.execute(runtime, args) - - Note over Handler: requireString(args, "query")
optionalInt(args, "top_k", 5) - - Handler->>Runtime: runtime.search().query(query, topK) - Runtime->>Engine: engine.search(query, topK) - Engine->>SIMD: HNSW traversal (off-heap MemorySegment) - SIMD-->>Engine: ScoredResult[] (~100µs) - Engine-->>Runtime: SearchResponse - Runtime-->>Handler: SpectorResult[] - - Note over Handler: ResultFormatter.formatSearchResults()
McpToolHandler.textResult() - - Handler-->>MCP: CallToolResult (text content) - MCP-->>Agent: {"content": [{"type": "text", "text": "Found 5 results..."}]} -``` - ---- - -## Module Structure - -``` -spector-mcp/src/main/java/com/spectrayan/spector/mcp/ -├── SpectorMcpServer.java ← Thin orchestrator (assembly only) -├── SpectorMcpMain.java ← CLI entry point -├── schema/ -│ └── ToolSchemaBuilder.java ← Type-safe fluent builder for JSON schemas -├── tools/ -│ ├── McpToolHandler.java ← Abstract base with timing, error handling -│ ├── SpectorToolRegistry.java ← Tool discovery & registration -│ ├── SemanticSearchTool.java ← Individual tool implementations -│ ├── HybridSearchTool.java -│ ├── RagQueryTool.java -│ ├── IngestDocumentTool.java -│ ├── DeleteDocumentTool.java -│ └── EngineStatusTool.java -├── resources/ -│ └── SpectorResourceProvider.java ← Resource definitions & handlers -├── prompts/ -│ └── SpectorPromptProvider.java ← Prompt templates & handlers -└── util/ - └── ResultFormatter.java ← Search result formatting utilities -``` - ---- - -## Tool Reference - -### `semantic_search` - -Performs semantic similarity search using vector embeddings. Requires an embedding provider (e.g., Ollama) to be configured. - -| Parameter | Type | Required | Default | Description | -|:---|:---|:---|:---|:---| -| `query` | string | ✅ | — | Natural language search query | -| `top_k` | integer | ❌ | 5 | Number of results to return (1–100) | - -### `hybrid_search` - -Combined keyword (BM25) + semantic (vector) search with reciprocal rank fusion. Falls back to keyword-only if no embedding provider is configured. 
- -| Parameter | Type | Required | Default | Description | -|:---|:---|:---|:---|:---| -| `query` | string | ✅ | — | Search query for both keyword and semantic matching | -| `top_k` | integer | ❌ | 5 | Number of results to return | -| `mode` | enum | ❌ | `hybrid` | Search mode: `hybrid`, `keyword`, or `vector` | - -### `rag_query` - -Retrieval-Augmented Generation — retrieves relevant context with source citations formatted for LLM consumption. - -| Parameter | Type | Required | Default | Description | -|:---|:---|:---|:---|:---| -| `query` | string | ✅ | — | The question or topic to retrieve context for | -| `top_k` | integer | ❌ | 5 | Number of context passages to retrieve | - -### `ingest_document` - -Ingests a document into the search index with automatic embedding and optional chunking. - -| Parameter | Type | Required | Default | Description | -|:---|:---|:---|:---|:---| -| `id` | string | ✅ | — | Unique document identifier | -| `content` | string | ✅ | — | Document text content | -| `title` | string | ❌ | — | Optional document title | - -### `delete_document` - -Removes a document from the search index by ID. - -| Parameter | Type | Required | Default | Description | -|:---|:---|:---|:---|:---| -| `id` | string | ✅ | — | Document ID to delete | - -### `engine_status` - -Returns engine metadata including document count, dimensions, SIMD capabilities, embedding provider status, and GPU availability. - -| Parameter | Type | Required | Default | Description | -|:---|:---|:---|:---|:---| -| *(none)* | — | — | — | No input parameters required | - ---- - -## Extending the MCP Server - -### Adding a New Tool - -Every tool extends `McpToolHandler`, which handles timing, error handling, and argument parsing. 
You implement four methods: - -```java -public abstract class McpToolHandler { - abstract String name(); - abstract String description(); - abstract Map inputSchema(); - abstract CallToolResult execute(SpectorEngine engine, Map args); - - // Base class automatically provides: - // - Timing wrapper (nanoTime → milliseconds) - // - Structured error handling with logging - // - Argument parsing: requireString(), optionalInt(), optionalString() - // - Result factories: textResult(), errorResult() -} -``` - -Define the tool schema with `ToolSchemaBuilder`: - -```java -var schema = ToolSchemaBuilder.object() - .requiredString("query", "Natural language search query.") - .optionalInt("top_k", "Number of results to return.", 5) - .optionalEnum("mode", "Search mode.", "hybrid", "hybrid", "keyword", "vector") - .build(); -``` - -Register the tool in `SpectorToolRegistry.handlers()`: - -```java -List.of( - new SemanticSearchTool(), - new HybridSearchTool(), - new RagQueryTool(), - new IngestDocumentTool(), - new DeleteDocumentTool(), - new EngineStatusTool(serverVersion) - // new YourNewTool() ← just add here -); -``` - ---- - -## Performance: Why In-Process Wins - -### The Python MCP Tax - -Python MCP servers introduce multiple layers of overhead: - -```mermaid -graph LR - A1["🤖 Agent"] --> B1["JSON-RPC"] - B1 --> C1["🐍 Python process"] - C1 --> D1["Deserialize"] - D1 --> E1["HTTP/gRPC round-trip"] - E1 --> F1["Vector DB"] - F1 --> G1["Serialize response"] - G1 --> H1["JSON-RPC"] - H1 --> I1["🤖 Agent"] - - style C1 fill:#e74c3c,color:white - style E1 fill:#e74c3c,color:white -``` - -> **Total: 2–10ms per query** (network + GIL + serialization) - -### Spector's Zero-Copy Path - -```mermaid -graph LR - A2["🤖 Agent"] --> B2["JSON-RPC"] - B2 --> C2["☕ Virtual Thread"] - C2 --> D2["SpectorEngine.search()"] - D2 --> E2["Off-heap MemorySegment"] - E2 --> F2["SIMD registers"] - F2 --> G2["✅ Results"] - - style C2 fill:#00b894,color:white - style E2 fill:#00b894,color:white - style 
G2 fill:#00b894,color:white -``` - -> **Total: 88µs p50 per query** (23–113× faster) - -| Bottleneck | Python MCP | Spector MCP | -|:---|:---|:---| -| Network round-trip | 500–2,000µs | **0µs** (in-process) | -| JSON serialization | 100–500µs | **0µs** (direct Java objects) | -| Python GIL contention | Blocks concurrent queries | **0µs** (Virtual Threads) | -| GC pressure | Heap allocation per query | **0µs** (off-heap Panama) | -| Search computation | ~100µs (native C++) | **~100µs** (Panama SIMD) | -| **Total** | **2,000–10,000µs** | **88µs p50** | - ---- - -## Security Considerations - -> [!WARNING] -> The `ingest_document` and `delete_document` tools allow agents to modify the search index. In production environments, consider: -> - Running the MCP server in read-only mode (expose only search tools) -> - Implementing document-level access control -> - Rate limiting ingestion operations -> - Auditing all write operations - ---- - -## See Also - -- [MCP Server Usage Guide](../sdk-usage/mcp-server.md) — Practical setup for Claude Desktop, Cursor, and custom agents -- [Architecture Overview](overview.md) — Full system architecture -- [Core Concepts](core-concepts.md) — HNSW, BM25, RRF deep-dives diff --git a/docs/docs/architecture/overview.md b/docs/docs/architecture/overview.md index c3261a9..d0f0e0f 100644 --- a/docs/docs/architecture/overview.md +++ b/docs/docs/architecture/overview.md @@ -1,381 +1,92 @@ -# 🏗️ Architecture Overview +# Architecture -> **Spector is a modular, JVM-native AI memory backbone organized as a Maven multi-module project.** This page covers the module structure, dependency graph, data flow, threading model, and memory architecture that make sub-millisecond, agent-native search possible. 
+## System Overview ---- +Spector Search is a multi-module Maven project built on four foundational Java technologies: -## 📦 Module Diagram +- **Java Vector API** (jdk.incubator.vector) — SIMD-accelerated similarity kernels +- **Panama FFM** — Zero-copy memory-mapped storage and GPU interop +- **Virtual Threads** (Project Loom) — Massive concurrency without thread pool tuning +- **Memory-mapped indexes** — Instant startup, zero GC pressure -```mermaid -graph LR - subgraph "🔬 Core Layer" - core["spector-core
SIMD kernels"] - commons["spector-commons
Config, chunkers, tokenizer"] - end +## Module Structure - subgraph "💾 Storage Layer" - storage["spector-storage
Panama MemorySegment stores"] - end - - subgraph "📊 Index Layer" - index["spector-index
HNSW + IVF-PQ + BM25"] - end - - subgraph "🔍 Query Layer" - query["spector-query
Hybrid orchestrator + RRF"] - end - - subgraph "🧠 Intelligence" - embedapi["spector-embed-api
EmbeddingProvider SPI"] - embedollama["spector-embed-ollama
Ollama provider"] - gpu["spector-gpu
Panama FFM + CUDA"] - end - - subgraph "📥 Pipelines" - ingestion["spector-ingestion
Ingest orchestration"] - rag["spector-rag
RAG pipeline"] - end - - subgraph "⚡ Runtime & Interfaces" - runtime["spector-runtime
Unified context (engine + memory)"] - engine["spector-engine
Search facade + lifecycle"] - node["spector-node
Armeria: REST + gRPC + SSE + cluster"] - mcp["spector-mcp
MCP Server — Agent-native"] - cli["spector-cli
spectorctl CLI"] - client["spector-client
Java client SDK"] - spring["spector-spring
Spring AI VectorStore"] - end - - subgraph "🧠 Cognitive Memory" - memory["spector-memory
Biologically-inspired agent memory"] - end - - subgraph "📈 Distribution" - bench["spector-bench
JMH benchmarks"] - dist["spector-dist
Single fat JAR"] - end -``` - -> [!NOTE] -> **Index sub-modules:** `hnsw/` (graph-based ANN), `ivf/` (inverted file + posting lists), `pq/` (product quantizer, K-Means++, ADC), `bm25/` (keyword scoring + analyzers) - ---- - -## 🔗 Dependency Graph - -```mermaid -graph TD - node["🌐 node"] --> runtime["⚡ runtime"] - node --> mcp["🤖 mcp"] - node --> metrics["📈 metrics"] - mcp --> runtime - mcp --> ingestion["📥 ingestion"] - cli["🖥️ cli"] --> runtime - cli --> client["📦 client"] - - runtime --> engine["⚡ engine"] - runtime --> memory["🧠 memory"] - runtime --> ingestion - - engine --> query["🔍 query"] - engine --> rag["🤖 rag"] - engine --> ingestion - engine --> index["📊 index"] - engine --> storage["💾 storage"] - engine --> embedapi["🧬 embed-api"] - engine -.-> gpu["🎮 gpu"] - - memory --> index - memory --> storage - memory --> ingestion - memory --> embedapi - memory --> core["🔬 core"] - - metrics --> engine - metrics --> memory - - ingestion --> config["⚙️ config"] - ingestion --> embedapi - - rag --> query - rag --> index - rag --> storage - rag --> embedapi - rag --> commons["📄 commons"] - - query --> index - query --> commons - index --> storage - index --> config - storage --> config - storage --> core - config --> core - - embedapi --> commons - gpu --> core - gpu --> storage - - dist["📦 dist"] --> mcp - dist --> cli - dist --> runtime - - spring["🌱 spring"] --> engine - spring --> memory - spring --> metrics - bench["🧪 bench"] --> engine - bench --> memory ``` - -> **Legend:** Solid arrows = compile dependency. Dotted arrow (`gpu`) = optional dependency. 
- -**Dependency rules:** - -| Path | Description | -|------|-------------| -| `runtime → engine + memory + ingestion` | Composition root — wires all subsystems | -| `cli → runtime + client` | CLI with local batch (runtime) and remote (client) modes | -| `node → runtime` | Unified Armeria node: REST + gRPC + cluster coordination | -| `mcp → runtime + ingestion` | MCP agent entry point (in-process, zero network) | -| `engine → ingestion` | `EngineIngestionTarget` implements `IngestionTarget` | -| `memory → ingestion` | `CognitiveIngestionTarget` implements `IngestionTarget` | -| `engine → rag` | RAG context assembly pipeline | -| `engine -.-> gpu` | Optional GPU acceleration | -| `memory → index, storage, core, embed-api` | Cognitive memory (independent of engine) | -| `dist → mcp + cli + runtime` | Fat JAR distribution | - -!!! important - **No circular dependencies.** `spector-memory` and `spector-engine` are **peers** — both depend on `spector-ingestion` for the `IngestionTarget` interface, but neither depends on the other. `SpectorRuntime` is the single composition root that wires them together. 
- ---- - -## 📥 Data Flow: Ingest Path - -```mermaid -sequenceDiagram - participant Client as 👤 Client (CLI/MCP/REST) - participant Runtime as ⚡ SpectorRuntime - participant Handler as 📥 IngestionHandler - participant Pipeline as 🔄 IngestionPipeline - participant Embed as 🧠 ParallelEmbeddingPipeline - participant Target as 💾 IngestionTarget - participant Store as 💾 Storage (mmap) - - Client->>Runtime: runtime.ingestion().ingest(dir, pattern) - Runtime->>Handler: Pre-configured pipeline + target - Handler->>Handler: FileDiscoveryService.discover() - loop Each file - Handler->>Pipeline: pipeline.ingest(id, content) - Pipeline->>Pipeline: TextChunker.chunk(content) - Pipeline->>Embed: embed(chunkTexts) via virtual threads - Embed-->>Pipeline: List - loop Each chunk - Pipeline->>Target: target.ingest(id, text, vector) - Target->>Store: VectorStore + VectorIndex + KeywordIndex - end - end - Store-->>Client: ✅ Indexed +spector-search/ +├── spector-core/ # SIMD kernels (DotProduct, Cosine, Euclidean) +├── spector-commons/ # Text chunkers, tokenizer, document readers +├── spector-storage/ # Panama MemorySegment stores (InMemory + Mmap) +├── spector-index/ # HNSW + IVF-PQ + BM25 indexes +│ ├── hnsw/ # HNSW graph ANN (standard + quantized INT8/INT4/INT2) +│ ├── ivf/ # IVF inverted file index + quantized IVF-PQ +│ ├── pq/ # Product quantizer (K-Means++, ADC) +│ ├── text/ # BM25 keyword scoring + analyzers +│ └── fuzz/ # Index fuzz testing framework +├── spector-query/ # Hybrid orchestrator + RRF fusion + reranking +├── spector-embed-api/ # EmbeddingProvider SPI +├── spector-embed-ollama/ # Ollama embedding provider +├── spector-gpu/ # GPU acceleration (CUDA via Panama FFM) +├── spector-engine/ # Unified engine facade + lifecycle +├── spector-server/ # REST API (Javalin + virtual threads) +├── spector-cluster/ # Distributed gRPC search +├── spector-client/ # Java client SDK +├── spector-cli/ # spectorctl CLI tool +└── spector-bench/ # JMH benchmarks ``` -1. 
**Client** calls `runtime.ingestion().ingest()` — all entry points use this -2. **IngestionHandler** delegates to a pre-configured `IngestionPipeline` -3. **IngestionPipeline** handles chunking (from config) and parallel embedding -4. **IngestionTarget** receives pre-embedded chunks — `EngineIngestionTarget` for SEARCH, `CognitiveIngestionTarget` for MEMORY -5. Each target handles its own downstream storage (VectorStore/HNSW or Quantize/TierRoute/WAL) +## Dependency Flow -> [!TIP] -> `FileDiscoveryService` can be used independently for file discovery without any engine or runtime dependency. - ---- - -## 🔍 Data Flow: Search Path - -```mermaid -sequenceDiagram - participant Client as 👤 Client - participant Engine as ⚡ SpectorEngine - participant QB as 🧭 Query Builder - participant BM25 as 📝 BM25 Search - participant HNSW as 🧠 HNSW Search - participant RRF as 🧬 RRF Fusion - participant LLM as 🤖 LLM Reranker - - Client->>Engine: Search (text + vector + topK) - Engine->>QB: Auto-detect mode - Note over QB: text only → KEYWORD
vector only → VECTOR
both → HYBRID - par Parallel search on virtual threads - QB->>BM25: Keyword search - QB->>HNSW: Vector search - end - BM25->>RRF: Ranked results - HNSW->>RRF: Ranked results - RRF->>LLM: Fused top candidates - LLM-->>Client: ✨ Final ranked results ``` - -1. **Query Builder** determines search mode from provided fields -2. **BM25** and **HNSW** searches run in parallel on virtual threads -3. **RRF Fusion** merges both ranked lists using `1/(k + rank)` scoring -4. Optional **LLM Reranker** rescores top candidates via Ollama - ---- - -## 🤖 Data Flow: MCP Agent Path - -```mermaid -sequenceDiagram - participant Agent as 🤖 AI Agent (Claude/Cursor) - participant MCP as 📡 MCP Transport (stdio) - participant Handler as 🔧 McpToolHandler - participant Runtime as ⚡ SpectorRuntime - participant Engine as 🔧 SpectorEngine - participant SIMD as 🔬 SIMD Kernels - - Agent->>MCP: tools/call {"name": "semantic_search", "arguments": {"query": "..."}} - MCP->>Handler: SemanticSearchTool.execute(runtime, args) - Handler->>Runtime: runtime.search().query(text, topK) - Runtime->>Engine: engine.search(query, topK) - Engine->>SIMD: HNSW traversal (off-heap MemorySegment) - SIMD-->>Engine: ScoredResult[] (~100µs) - Engine-->>Runtime: SearchResponse - Runtime-->>Handler: SpectorResult[] - Handler-->>MCP: CallToolResult - MCP-->>Agent: JSON-RPC response with search results +server → engine → query → index → core + → index → storage → core +cluster → engine +client → (HTTP) → server +cli → (HTTP) → server +gpu → core, storage +engine → commons, embed-api ``` -The MCP path routes through `SpectorRuntime` — the single composition root that holds both the search engine and optional cognitive memory. The MCP server wraps runtime handler calls with JSON-RPC transport. There is **zero network overhead** because everything runs in the same JVM process. - -> [!TIP] -> For full MCP architecture details, tool schemas, and design patterns, see the dedicated [MCP Integration](mcp-integration.md) page. 
- ---- - -## 🧵 Threading Model: Virtual Threads - -Spector is designed from the ground up for Java virtual threads: - -> [!TIP] -> **No `synchronized` blocks** anywhere in the codebase. All coordination uses `ReentrantLock` to avoid virtual thread pinning. - -| Operation | Threading Strategy | -|-----------|-------------------| -| REST request handling | One virtual thread per request | -| Hybrid search | Parallel BM25 + HNSW via `StructuredTaskScope` | -| Bulk ingest | Virtual thread per document | -| Embedding generation | Batched across virtual threads | -| HNSW construction (>10K) | Virtual threads per core for parallel insertion | -| Distributed fan-out | Virtual thread per shard query | - -### 📈 Scaling Results - -At 50K docs with hybrid search (384-dim, production-realistic): - -| Virtual Threads | Throughput | Scaling | -|-----------------|-----------|---------| -| 1 | 3,739 ops/s | 1.0× | -| 4 | 10,317 ops/s | **2.8×** | -| 8 | 11,812 ops/s | **3.2×** | -| 16 | 14,022 ops/s | **3.7×** | - -> [!NOTE] -> Scaling depends on vector dimensions and workload type. 384-dim shows ~3.7× at 16 threads due to higher per-query memory bandwidth. Individual HNSW queries are inherently sequential (graph traversal data dependencies) — scaling comes from concurrent queries sharing CPU cores. - ---- - -## 💾 Memory Model: Panama Off-Heap - -All vector data lives off-heap using the Panama Foreign Function & Memory API: - -```mermaid -graph TB - subgraph "☕ JVM Heap (minimal)" - HG["HNSW Graph
<br/>(adjacency lists)"] - BM["BM25 Index
<br/>(inverted index)"] - ES["Engine State
<br/>(config, lifecycle)"] - end - - subgraph "🧊 Off-Heap (Panama MemorySegment)" - VS["Vector Store
<br/>Contiguous float32, SIMD-aligned
<br/>Zero-copy reads, no GC pressure"] - QS["Quantized Store
<br/>INT8 or PQ codes"] - GM["GPU Device Memory<br/>
CUDA via FFM"] - end - - HG -.-> VS - BM -.-> VS - ES -.-> QS - ES -.-> GM -``` - -**Benefits:** - -- ✅ **Zero GC pressure** — Vectors never touch the garbage collector - -- ✅ **Instant startup** — Memory-mapped files load via `mmap` syscall, no deserialization - -- ✅ **SIMD-friendly layout** — Contiguous float32 arrays ready for Vector API operations - -- ✅ **Explicit lifecycle** — `Arena`-scoped memory with deterministic cleanup - -- ✅ **Memory efficiency** — Store billions of vectors limited only by disk/address space - -### 📊 Storage Types - -| Store | Location | Use Case | -|-------|----------|----------| -| `InMemoryVectorStore` | Off-heap (Arena) | Development, small datasets | -| `MmapVectorStore` | Memory-mapped file | Production, persistence | -| `QuantizedVectorStore` | Off-heap (INT8) | Memory-constrained deployments | -| `IvfPqStore` | Off-heap (PQ codes) | Billion-scale (32× compression) | - ---- - -## 🌐 API Layer - -```mermaid -graph TD - subgraph "SpectorNode - Armeria Server, single port" - CORS["CorsService decorator"] - Auth["API Key decorator"] - COMPRESS["EncodingService - gzip/brotli"] - subgraph "ApiModule Registration" - SE["🔍 SearchEndpoint"] - IE["📥 IngestEndpoint"] - RE["🤖 RagEndpoint"] - DE["🗑️ DocumentEndpoint"] - STE["📊 StatusEndpoint"] - ESE["📡 EventStreamEndpoint"] - end - gRPC["gRPC Service
<br/>inter-node fan-out"] - HEALTH["💚 /health"] - PROM["📊 /metrics"] - end - - subgraph "Service Facades" - SS["SearchService"] - IS["IngestService"] - RS["RagService"] - end - - SE --> SS - IE --> IS - RE --> RS - SS & IS --> EB["SpectorEventBus<br/>
17 event types"] - SS --> ENGINE["⚡ SpectorEngine"] -``` - -Every request runs on its own virtual thread. The Armeria server handles HTTP REST, gRPC, and SSE events on a single port. API endpoints are registered via the `ApiModule` factory pattern, enabling straightforward API versioning (`/api/v1`, `/api/v2`). - -### Streaming via SSE - -The `/api/v1/search/stream` endpoint uses Server-Sent Events to emit results progressively. The `/api/v1/events` endpoint provides a live event stream where clients can subscribe to search, ingest, cluster, MCP, and engine events with optional category filtering. - ---- - -## 🔗 See Also - -- [Core Concepts](core-concepts.md) — Algorithms and data structures in detail - -- [Distributed Mode](distributed-mode.md) — Multi-node clustering architecture - -- [GPU Acceleration](gpu-acceleration.md) — CUDA kernel integration via Panama - -- [Performance Tuning](../operations/performance-tuning.md) — Optimizing for your workload \ No newline at end of file +## Data Flow + +### Ingestion Path + +1. REST request arrives at `spector-server` +2. `SpectorEngine` routes to appropriate handler +3. Vector stored in off-heap `VectorStore` (Panama MemorySegment) +4. HNSW graph updated with new node connections +5. BM25 inverted index updated with text tokens +6. Document metadata stored for retrieval + +### Search Path + +1. Query arrives at `spector-server` +2. `SpectorEngine` delegates to `QueryOrchestrator` +3. Parallel execution: + - **Vector leg**: HNSW traversal with SIMD distance computation + - **Keyword leg**: BM25 scoring across inverted index +4. Results fused via Reciprocal Rank Fusion (RRF) +5. Optional: LLM re-ranking via Ollama +6. Top-K results returned with scores + +### RAG Path + +1. Documents read by `DocumentReader` (PDF, HTML, Markdown) +2. Text split by `TokenAwareChunker` respecting sentence boundaries +3. Chunks embedded in parallel via `EmbeddingPipeline` +4. On query: relevant chunks retrieved and scored +5. 
`ContextBuilder` assembles context within token limit +6. Context returned with source attributions + +## Key Design Decisions + +| Decision | Rationale | +|----------|-----------| +| Off-heap vectors (Panama) | Avoids GC pressure, enables mmap for instant load | +| Virtual threads | Scales to thousands of concurrent queries without pool tuning | +| SIMD via Vector API | 10-100× faster distance computation than scalar Java | +| HNSW for ANN | Proven recall/latency tradeoff, logarithmic search time | +| IVF-PQ for scale | 32× memory compression enables billion-scale on commodity hardware | +| Multi-level quantization | INT8/INT4/INT2 with non-uniform calibration covers 4×–16× compression | +| Configurable rescore | Oversampling-based rescore recovers recall lost to quantization | +| Consistent hashing | Minimal data movement on cluster topology changes | +| gRPC for cluster | Low-latency binary protocol for shard fan-out | diff --git a/docs/docs/architecture/rag-pipeline.md b/docs/docs/architecture/rag-pipeline.md deleted file mode 100644 index 6abb068..0000000 --- a/docs/docs/architecture/rag-pipeline.md +++ /dev/null @@ -1,305 +0,0 @@ -# 🤖 RAG Pipeline - -> **End-to-end Retrieval-Augmented Generation built right into Spector.** From document ingestion to LLM-ready context assembly — with token-aware chunking, parallel embedding, and source attribution out of the box. - ---- - -## Module: `spector-rag` - -The RAG pipeline is a standalone module (`spector-rag`) that can be used independently or through the engine facade. It orchestrates the full flow: query embedding → retrieval → context assembly → attribution. 
- -**Key classes:** - -| Class | Purpose | -|-------|---------| -| `RagPipeline` | End-to-end orchestrator | -| `ContextBuilder` | Token-budget-aware context assembly | -| `RagRequest` / `RagResponse` | Clean input/output types | -| `ScoredChunk` | Chunk + relevance score | -| `ChunkAttribution` | Source provenance tracking | - -```java -// Standalone usage (no engine facade required) -var pipeline = new RagPipeline(searchOrchestrator, documentStore, embeddingProvider); -RagResponse response = pipeline.execute(new RagRequest("What is HNSW?")); -// response.contextText() → assembled context for LLM -// response.attributions() → source document references -``` - -> [!NOTE] -> The `spector-rag` module uses virtual threads for the embedding call and synchronous search for retrieval. No reactive framework needed — the JDK handles async I/O natively. - ---- - -## 🔄 Pipeline Overview - -```mermaid -flowchart LR - A["📄 Document Readers
<br/>PDF / HTML / Markdown"] --> B["✂️ Token-Aware Chunker
<br/>Sentence boundaries
<br/>Configurable overlap"] - B --> C["🧠 Parallel Embedding
<br/>Batched via virtual threads
<br/>Pluggable providers"] - C --> D["📊 Index & Store
<br/>HNSW + BM25 + mmap"] - D --> E["🔍 Search & Retrieve
<br/>Vector / Hybrid"] - E --> F["📝 Context Builder
<br/>Score-ranked assembly
<br/>Token limit enforcement"] - F --> G["✨ LLM-Ready Context<br/>
+ Source Attributions"] -``` - ---- - -## 📄 Document Readers - -The pipeline supports three document formats out of the box: - -| Reader | Format | Behavior | -|--------|--------|----------| -| `PdfDocumentReader` | PDF | Extracts text, preserves paragraph boundaries | -| `HtmlDocumentReader` | HTML | Strips tags, converts headings to sections | -| `MarkdownDocumentReader` | Markdown | Preserves heading structure as delimiters | - -```java -DocumentReader reader = new PdfDocumentReader(); -DocumentResult result = reader.read(Path.of("whitepaper.pdf")); -// result.text() → extracted text -// result.metadata() → {sourceFile, format: "PDF", characterCount} -``` - -| Property | Value | -|----------|-------| -| Max file size | 100 MB | -| Max extraction time | 30 seconds per file | -| Failure isolation | Per-file (one failure doesn't halt pipeline) | -| Output | Text string + metadata | - -> [!NOTE] -> Unsupported formats return a descriptive error. Corrupted files report the failure without stopping the pipeline. - ---- - -## ✂️ Token-Aware Chunking - -The `TokenAwareChunker` splits text into chunks that respect token boundaries and embedding model limits. - -```mermaid -flowchart TD - Input["📄 Input Text
<br/>(long document)"] --> Split["Split Strategy"] - Split --> S1["1️⃣ Prefer sentence boundaries"] - Split --> S2["2️⃣ Fall back to word boundaries"] - Split --> S3["3️⃣ Measure by token count"] - - S1 --> Chunks["✂️ Overlapping Chunks<br/>
Each ≤ maxTokens"] - S2 --> Chunks - S3 --> Chunks -``` - -### Configuration - -| Parameter | Default | Range | Description | -|-----------|---------|-------|-------------| -| `maxTokens` | 512 | 1–8192 | Max tokens per chunk | -| `overlapTokens` | 50 | 0–maxTokens-1 | Overlap between chunks | - -```java -ChunkConfig config = new ChunkConfig(512, 50); -List chunks = chunker.chunk(extractedText, config); -``` - -### Properties - -- ✅ **Round-trip reconstruction** — Concatenating chunks reconstructs the original text - -- ✅ **Token limit guarantee** — Every chunk has ≤ maxTokens - -- ✅ **Single chunk for short text** — Returns exactly one chunk if input fits - -- ✅ Empty/whitespace input returns an empty list - -> [!TIP] -> Set `maxTokens` to match your embedding model's max input length. Increase `overlapTokens` (100–200) if chunks need more surrounding context for coherence. - ---- - -## 🧠 Parallel Embedding Pipeline - -The `ParallelEmbeddingPipeline` generates vector embeddings from text chunks using configurable batch parallelism. - -```mermaid -flowchart LR - subgraph "Input Chunks" - C1[C1] & C2[C2] & C3[C3] & C4[C4] & C5[C5] & C6[C6] & C7[C7] & C8[C8] - end - - subgraph "Virtual Thread 1" - B1["Batch [C1-C4]
<br/>→ Embedding Provider"] - end - - subgraph "Virtual Thread 2" - B2["Batch [C5-C8]
<br/>→ Embedding Provider"] - end - - C1 & C2 & C3 & C4 --> B1 - C5 & C6 & C7 & C8 --> B2 - - B1 --> Out["Embeddings [E1...E8]<br/>
Order preserved ✅"] - B2 --> Out -``` - -| Parameter | Default | Range | Description | -|-----------|---------|-------|-------------| -| `batchSize` | 32 | 1–256 | Chunks per embedding API call | -| `maxRetries` | 3 | 0–10 | Retries for failed batches | - -**Failure handling:** - -- Failed batches are retried up to `maxRetries` times - -- Processing continues for remaining batches even if one fails - -- Input-output ordering is always preserved - ---- - -## 📝 Context Builder - -The `ContextBuilder` assembles retrieved chunks into a coherent context window for LLM prompting. - -```mermaid -flowchart TD - A["🔍 Retrieved Chunks
<br/>(scored)"] --> B["Sort by relevance ↓"] - B --> C{"Would adding next chunk
<br/>exceed token limit?"} - C -->|No| D["Add chunk to context"] - D --> C - C -->|Yes| E["Skip chunk"] - E --> F["📝 Final Context<br/>
+ Source Attributions"] - D --> F -``` - -| Parameter | Default | Range | -|-----------|---------|-------| -| `tokenLimit` | 4096 | 256–131,072 | - -**Properties:** - -- Context never exceeds the configured token limit - -- Chunks appear in descending relevance order - -- Every included chunk has a source attribution - -- Empty context (not an exception) when no chunks fit - ---- - -## 🌐 The `/api/v1/rag` Endpoint - -A single API call for retrieval-augmented generation: - -```bash -curl -X POST http://localhost:7070/api/v1/rag \ - -H "Content-Type: application/json" \ - -d '{ - "query": "How does HNSW indexing work?", - "topK": 5, - "tokenLimit": 4096, - "searchMode": "hybrid" - }' -``` - -**Request Parameters:** - -| Field | Type | Default | Range | Description | -|-------|------|---------|-------|-------------| -| `query` | string | — | 1–2000 chars | The question/query | -| `topK` | int | 5 | 1–100 | Chunks to retrieve | -| `tokenLimit` | int | 4096 | 1–8192 | Max context tokens | -| `searchMode` | string | "vector" | "vector", "hybrid" | Search strategy | - -**Response:** -```json -{ - "context": "HNSW builds a multi-layer graph structure where each layer contains a subset of nodes...", - "attributions": [ - {"documentId": "architecture.md", "chunkOffset": 3}, - {"documentId": "algorithms.md", "chunkOffset": 0} - ], - "isEmpty": false -} -``` - ---- - -## 🎯 End-to-End Example - -### 1️⃣ Ingest Documents via Ingestion Pipeline - -```java -// Create pipeline with embedding provider -var pipeline = new IngestionPipeline(target, embeddingProvider); - -// Single document (auto-embed) -pipeline.ingest("doc-1", "HNSW builds a multi-layer graph structure..."); - -// Large document (chunked, parallel embedding) -String whitepaper = Files.readString(Path.of("architecture.pdf.txt")); -IngestionResult result = pipeline.ingestChunked("whitepaper-1", whitepaper); -// result: 47 chunks stored, 0 failures, 2340ms -``` - -### 2️⃣ Query via RAG Pipeline - -```java -// Direct 
usage of RagPipeline (standalone module) -var ragPipeline = new RagPipeline(searchOrchestrator, documentStore, embeddingProvider); - -RagResponse response = ragPipeline.execute( - new RagRequest("What is product quantization?", 5, 4096, "hybrid")); - -System.out.println(response.contextText()); // assembled context -System.out.println(response.attributions()); // source references -System.out.println(response.queryTimeMs()); // 12ms -``` - -### 3️⃣ Query via REST API - -```bash -curl -X POST http://localhost:7070/api/v1/rag \ - -d '{"query": "What is product quantization?", "topK": 3}' -``` - -### 4️⃣ Use Context with an LLM - -```python -import requests - -# Get context from Spector -rag_response = requests.post("http://localhost:7070/api/v1/rag", json={ - "query": "Explain product quantization", - "topK": 5, - "tokenLimit": 3000 -}).json() - -# Use with your LLM -prompt = f"""Based on the following context, answer the question. - -Context: -{rag_response['context']} - -Question: Explain product quantization - -Answer:""" -``` - -> [!TIP] -> For Spring AI applications, use the `SpectorRagService` or `QuestionAnswerAdvisor` for automatic context retrieval. See [Spring AI Integration](../sdk-usage/spring-ai.md). 
- ---- - -## 🔗 See Also - -- [Ingestion Pipeline](ingestion-pipeline.md) — Document ingestion module - -- [Spring AI Integration](../sdk-usage/spring-ai.md) — Spring AI RAG service - -- [REST API Reference](../api-reference/rest-endpoints.md) — RAG endpoint details - -- [Core Concepts](core-concepts.md) — Algorithms used in retrieval - -- [Configuration Guide](../configuration/parameters.md) — RAG pipeline parameters \ No newline at end of file diff --git a/docs/docs/cli-reference/spectorctl.md b/docs/docs/cli-reference/spectorctl.md index 28e38b2..da77cbd 100644 --- a/docs/docs/cli-reference/spectorctl.md +++ b/docs/docs/cli-reference/spectorctl.md @@ -1,52 +1,33 @@ -# 🖥️ CLI Reference +# spectorctl CLI Reference -> **Manage Spector from the command line.** `spectorctl` connects to a running server via REST and provides commands for indexing, ingestion, search, and status monitoring — with both human-friendly tables and machine-parseable JSON output. +`spectorctl` is the command-line tool for managing Spector Search instances. It connects to a running server via the REST API. ---- - -## 📦 Installation +## Installation Build from source: ```bash -cd spector +cd spector-search mvn clean package -pl spector-cli -am -DskipTests ``` -The CLI JAR is at `spector-cli/target/spector-cli.jar`. Run it with: - -```bash -java -jar spector-cli/target/spector-cli.jar [command] [options] -``` - -> [!TIP] -> Create an alias for convenience: -> ```bash -> alias spectorctl='java -jar /path/to/spector-cli.jar' -> ``` +The CLI is available at `spector-cli/target/spector-cli.jar`. 
---- - -## 🌐 Global Options +## Global Options | Option | Default | Description | |--------|---------|-------------| | `--host` | localhost | Spector server hostname | | `--port` | 7070 | Spector server port | -| `--json` | false | Output in JSON format (machine-parseable) | -| `--api-key` | — | API key for authentication | +| `--json` | false | Output in JSON format | | `--help` | — | Show help for any command | ---- - -## 📋 Commands +## Commands -### 📊 `index` — Index Management - -Create, list, and delete indexes. +### index — Index Management ```bash -# Create an index with specific dimensions +# Create an index spectorctl index create --name my-index --dimensions 384 # List all indexes @@ -56,207 +37,103 @@ spectorctl index list spectorctl index delete --name my-index ``` -| Option | Required | Description | -|--------|----------|-------------| -| `--name` | ✅ | Index name | -| `--dimensions` | ✅ (create) | Vector dimensionality | - ---- - -### 📥 `ingest` — Document Ingestion - -The `ingest` command supports two modes, auto-detected from the flags: - -#### Local Batch Mode (via Runtime) - -Discovers and ingests files directly through `SpectorRuntime` — no server needed. Reads configuration from `spector.yml`. - -```bash -# Ingest from config (root-directory, pattern, etc. from spector.yml) -spectorctl ingest --config spector.yml - -# Ingest with explicit root directory -spectorctl ingest --root /path/to/docs --pattern "**/*.md" - -# Override chunk size -spectorctl ingest --config spector.yml --root . 
--chunk-size 1200 -``` - -| Option | Required | Description | -|--------|----------|-------------| -| `--config` | ❌ | Path to `spector.yml` config file | -| `--root` | ❌ | Root directory for file discovery | -| `--pattern` | ❌ | File glob pattern (default from config) | -| `--chunk-size` | ❌ | Chunk size in characters (default from config) | - -> [!TIP] -> If `--config` is provided and `spector.yml` contains `spector.ingestion.root-directory`, local batch mode activates automatically — no `--root` flag needed. - -#### Remote Mode (via HTTP) - -Sends a single document to a running Spector server. +### ingest — Document Ingestion ```bash -# Ingest text content -spectorctl ingest --id doc-1 --content "Hello world" - -# Ingest from a file -spectorctl ingest --file README.md --title "Project README" +# Ingest a single document +spectorctl ingest --id doc-1 \ + --content "SIMD-accelerated vector search" \ + --vector "0.1,0.2,0.3,0.4,0.5" ``` -| Option | Required | Description | -|--------|----------|-------------| -| `--id` | ❌ | Document ID (auto-generated if not provided) | -| `--content` | ❌ | Document text content | -| `--file` | ❌ | Path to file to ingest | -| `--title` | ❌ | Document title | - ---- - -### 🔍 `search` — Search Documents +### search — Search Documents ```bash -# Text/keyword search +# Text search spectorctl search --text "vector search engine" --topK 10 # Vector search spectorctl search --vector "0.1,0.2,0.3,0.4,0.5" --topK 5 -# Hybrid search -spectorctl search --text "search" --vector "0.1,0.2,0.3,0.4,0.5" --topK 10 - -# JSON output for scripting +# JSON output spectorctl search --text "search" --json ``` -| Option | Required | Description | -|--------|----------|-------------| -| `--text` | ❌* | Query text for keyword search | -| `--vector` | ❌* | Comma-separated query vector | -| `--topK` | ❌ | Number of results (default: 10) | +### status — Server Status -> [!IMPORTANT] -> *At least one of `--text` or `--vector` is required. 
+```bash +# Check server status +spectorctl status +``` ---- +## Runnable CLI Example -### 💚 `status` — Server Status +This complete example demonstrates the full workflow using `spectorctl`: ```bash -# Human-readable status -spectorctl status +# 1. Check that the server is running +spectorctl --host localhost --port 7070 status -# JSON output -spectorctl status --json -``` +# 2. Ingest documents +spectorctl ingest --id cli-doc-1 \ + --content "Spector Search uses HNSW for approximate nearest neighbors" \ + --vector "0.9,0.1,0.3,0.7,0.5" + +spectorctl ingest --id cli-doc-2 \ + --content "IVF-PQ provides memory-efficient billion-scale search" \ + --vector "0.2,0.8,0.4,0.1,0.6" ---- +# 3. Search for documents +spectorctl search --text "nearest neighbor search" --topK 5 -## 🎨 Output Formats +# 4. Get results in JSON format for scripting +spectorctl search --text "billion scale" --topK 3 --json -### 📋 Table Format (Default) +# 5. Check engine status and metrics +spectorctl status +``` -Human-readable tables for interactive use: +### Expected Output ``` $ spectorctl status ╔══════════════════════════════════════╗ -║ Spector Status ║ +║ Spector Search Status ║ ╠══════════════════════════════════════╣ ║ Status: RUNNING ║ ║ Port: 7070 ║ ║ SIMD: AVX-512 (512-bit) ║ ║ GPU: Available (CUDA 12.x) ║ -║ Documents: 1250 ║ +║ Documents: 2 ║ ╚══════════════════════════════════════╝ -``` -``` $ spectorctl search --text "nearest neighbor" --topK 5 ┌─────────────┬────────┬────────────────────────────────────────────┐ │ ID │ Score │ Content │ ├─────────────┼────────┼────────────────────────────────────────────┤ -│ doc-1 │ 0.9412 │ Spector uses HNSW for approximate.. │ -│ doc-2 │ 0.7231 │ IVF-PQ provides memory-efficient billion.. │ +│ cli-doc-1 │ 0.9412 │ Spector Search uses HNSW for approximate.. │ +│ cli-doc-2 │ 0.7231 │ IVF-PQ provides memory-efficient billion.. 
│ └─────────────┴────────┴────────────────────────────────────────────┘ ``` -### 🔧 JSON Format (`--json`) - -Machine-parseable output for scripting and automation: +## Error Handling -```json -{"status": "RUNNING", "port": 7070, "simd": "AVX-512 (512-bit)", "gpuAvailable": true, "documentCount": 1250} -``` - ---- +| Scenario | Behavior | +|----------|----------| +| Server unreachable | Displays connection error with host:port | +| Invalid arguments | Shows error message and command usage | +| No results | Displays empty result table | -## 🔧 Scripting Examples +## Using with Scripts -### Pipe to jq +The `--json` flag makes output machine-parseable: ```bash -# Extract document IDs from search results +# Pipe search results to jq spectorctl search --text "query" --json | jq '.results[].id' -# Check server health in CI +# Check status in CI if spectorctl status --json | jq -e '.status == "RUNNING"' > /dev/null; then echo "Server is healthy" fi ``` - -### Batch Ingestion from File - -```bash -# Ingest from a JSONL file -while IFS= read -r line; do - id=$(echo "$line" | jq -r '.id') - content=$(echo "$line" | jq -r '.content') - vector=$(echo "$line" | jq -r '.vector | join(",")') - spectorctl ingest --id "$id" --content "$content" --vector "$vector" -done < documents.jsonl -``` - -### Health Check Script - -```bash -#!/bin/bash -MAX_RETRIES=30 -for i in $(seq 1 $MAX_RETRIES); do - if spectorctl --host $SPECTOR_HOST --port $SPECTOR_PORT status --json 2>/dev/null | \ - jq -e '.status == "RUNNING"' > /dev/null 2>&1; then - echo "✅ Spector is ready" - exit 0 - fi - echo "⏳ Waiting for server... 
($i/$MAX_RETRIES)" - sleep 1 -done -echo "❌ Server did not start in time" -exit 1 -``` - ---- - -## ⚠️ Error Handling - -| Scenario | Behavior | -|----------|----------| -| Server unreachable | Displays connection error with host:port | -| Invalid arguments | Shows error message and command usage | -| Missing required options | Shows which options are missing | -| No results found | Displays empty result table | - -``` -$ spectorctl --host badhost --port 9999 status -Error: Cannot connect to badhost:9999 — Connection refused -``` - ---- - -## 🔗 See Also - -- [REST API Reference](../api-reference/rest-endpoints.md) — The API that spectorctl uses - -- [Getting Started](../getting-started/quickstart.md) — Server setup before using CLI - -- [Configuration Guide](../configuration/parameters.md) — Server configuration \ No newline at end of file diff --git a/docs/docs/configuration/parameters.md b/docs/docs/configuration/parameters.md index 6eb89c3..c4c1382 100644 --- a/docs/docs/configuration/parameters.md +++ b/docs/docs/configuration/parameters.md @@ -1,177 +1,89 @@ -# ⚙️ Configuration Guide +# Configuration Parameters -> **Every knob, dial, and lever in Spector — with sensible defaults and expert tuning advice.** Whether you're optimizing for recall, latency, throughput, or memory, this page has you covered. +Spector Search is configured via `SpectorConfig`. All parameters have sensible defaults. 
---- - -## 🎯 Core Parameters +## Core Parameters | Parameter | Default | Range | Description | |-----------|---------|-------|-------------| -| `dimensions` | 384 | 1–2048 | Vector dimensionality (must match your embedding model) | -| `capacity` | 100,000 | 1–10,000,000 | Maximum document count | +| `dimensions` | 384 | 1–2048 | Vector dimensionality | +| `capacity` | 100,000 | 1–10M | Maximum document count | | `similarityFunction` | COSINE | COSINE, DOT_PRODUCT, EUCLIDEAN | Distance metric | -> [!TIP] -> **Quick model reference:** -> | Model | Dimensions | -> |-------|-----------| -> | all-MiniLM-L6-v2 | 384 | -> | e5-base-v2 | 768 | -> | text-embedding-ada-002 | 1536 | -> | nomic-embed-text | 768 | - -**Choosing a similarity function:** - -- **COSINE** — Normalized embeddings (most models) - -- **DOT_PRODUCT** — Unnormalized embeddings where magnitude matters - -- **EUCLIDEAN** — Spatial/geometric data - ---- - -## 🗜️ Quantization Parameters - -| Parameter | Default | Range | Description | -|-----------|---------|-------|-------------| -| `quantization` | NONE | NONE, SCALAR_INT8, SCALAR_INT4, SCALAR_INT2, IVF_PQ | Quantization type | -| `oversamplingFactor` | auto | 1–20 | Rescore oversampling (auto: INT8→1, INT4→3, INT2→5) | - -### 🎛️ Quantization Profiles - -| Priority | Type | Oversampling | Compression | Recall | Use Case | -|----------|------|--------------|-------------|--------|----------| -| 🎯 Max recall | INT8 | 1 (none) | 4× | 95–99% | Quality-critical search | -| ⚖️ Balanced | INT4 | 3 | 8× | 85–95% | Best compression/recall ratio | -| 💾 Memory-first | INT2 | 5 | 16× | 75–90% | Fit large datasets in RAM | -| 🚀 Billion-scale | IVF_PQ | — | 32× | 75–90% | Massive datasets | - -> [!TIP] -> **Start with INT4** for most workloads. It gives 8× compression with excellent recall when paired with the default 3× rescore. Only go to INT2 if memory is the binding constraint, or IVF-PQ if you're at billion scale. 
- -### Oversampling Tuning - -The `oversamplingFactor` controls how many extra candidates are retrieved before rescoring with exact distances: - -- **1** — No rescore (fastest, quantized scores returned directly) - -- **3** — Good balance for INT4 (retrieves 3×K candidates, rescores to top-K) - -- **5** — Recommended for INT2 (compensates for aggressive quantization) - -- **10+** — Diminishing returns; use only if recall is still insufficient - -```java -// INT4 with custom oversampling -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(50_000_000) - .withQuantization(QuantizationType.SCALAR_INT4) - .withRescore(5); // Higher oversampling = better recall, slightly slower -``` - ---- - -## 🌐 HNSW Index Parameters +## HNSW Index Parameters | Parameter | Default | Range | Description | |-----------|---------|-------|-------------| | `M` | 16 | 4–64 | Max connections per node per layer | -| `efConstruction` | 200 | 16–800 | Construction beam width | -| `efSearch` | 50 | 10–500 | Search beam width | - -### 🎛️ Tuning Profiles +| `efConstruction` | 200 | 16–800 | Construction beam width (higher = better recall, slower build) | +| `efSearch` | 50 | 10–500 | Search beam width (higher = better recall, slower query) | -| Priority | M | efConstruction | efSearch | Trade-off | -|----------|---|----------------|----------|-----------| -| 🎯 High recall | 32–64 | 400–800 | 200–500 | More memory, slower build/search | -| ⚖️ Balanced | 16 | 200 | 50 | Good recall with fast performance | -| ⚡ Low latency | 8–12 | 100 | 20–30 | Faster search, lower recall | -| 💾 Memory-constrained | 4–8 | 100 | 20 | Minimal memory, lower recall | - -> [!IMPORTANT] -> `efSearch` should be ≥ `topK` for meaningful results. Setting `efSearch < topK` means you're asking for more results than the algorithm explores. 
- ---- - -## 📝 BM25 Parameters +## BM25 Parameters | Parameter | Default | Range | Description | |-----------|---------|-------|-------------| | `k1` | 1.2 | 0.0–3.0 | Term frequency saturation | | `b` | 0.75 | 0.0–1.0 | Document length normalization | -| Corpus Type | Recommended k1 | Recommended b | -|-------------|----------------|---------------| -| Short docs (tweets, titles) | 1.2 | 0.3 | -| Medium docs (articles) | 1.2 | 0.75 | -| Long docs (books, papers) | 1.5–2.0 | 0.75 | -| Mixed lengths | 1.2 | 0.5 | - ---- - -## 🧬 Hybrid Search (RRF) +## Hybrid Search -| Parameter | Default | Range | Description | -|-----------|---------|-------|-------------| -| `RRF k` | 60 | 1–1000 | Reciprocal Rank Fusion constant | - -- `k = 60` — Original paper recommendation, works well generally +| Parameter | Default | Description | +|-----------|---------|-------------| +| `RRF k` | 60 | Reciprocal Rank Fusion constant | -- Lower `k` (10–30) — Emphasizes top-ranked results more strongly +## GPU Configuration -- Higher `k` (100+) — Flattens rank importance +| Parameter | Default | Description | +|-----------|---------|-------------| +| `gpuEnabled` | false | Enable CUDA GPU acceleration | +| `gpuMemoryBudget` | 256 MB | Maximum GPU memory allocation | ---- +> **Note:** For INT4/INT2 quantization, GPU acceleration requires vector dimensions to be a multiple of 32. Non-aligned dimensions automatically fall back to CPU/SIMD. 
-## 🎮 GPU Configuration +## Quantization Configuration | Parameter | Default | Range | Description | |-----------|---------|-------|-------------| -| `gpuEnabled` | false | true/false | Enable CUDA GPU acceleration | -| `gpuMemoryBudget` | 256 MB | 256 MB – GPU max | Maximum GPU memory allocation | -| `gpuBatchWindow` | 10 ms | 1–100 ms | Batching window for query collection | -| `gpuMaxBatchSize` | 1024 | 1–1024 | Maximum queries per GPU batch | +| `quantization` | NONE | NONE, SCALAR_INT8, SCALAR_INT4, SCALAR_INT2 | Scalar quantization type | +| `oversamplingFactor` | auto | 1–20 | Rescore oversampling factor (auto: INT8→1, INT4→3, INT2→5) | -> [!NOTE] -> Enable GPU for batch workloads with >10K vectors. Single queries are often faster on CPU SIMD due to zero kernel launch overhead. -> For INT4/INT2 quantization, GPU acceleration requires dimensions to be a multiple of 32. Non-aligned dimensions automatically fall back to CPU/SIMD. +### Quantization Types ---- +| Type | Compression | Recall | Calibration | Best For | +|------|-------------|--------|-------------|----------| +| SCALAR_INT8 | 4× | 95–99% | Linear (min/max) | High-recall, moderate scale | +| SCALAR_INT4 | 8× | 85–95% | Non-uniform (quantile) | Balanced compression/recall | +| SCALAR_INT2 | 16× | 75–90% | Non-uniform (quantile) | Memory-constrained, large datasets | -## 🤖 Reranker Configuration +### Rescore Strategy -| Parameter | Default | Range | Description | -|-----------|---------|-------|-------------| -| `rerankerEnabled` | false | true/false | Enable LLM re-ranking via Ollama | -| `rerankerModel` | — | Any Ollama model | Model name (e.g., "llama3.2") | -| `rerankerEndpoint` | http://localhost:11434 | URL | Ollama API endpoint | -| `rerankerMaxCandidates` | 20 | 1–100 | Max docs sent to LLM | +When `oversamplingFactor > 1`, Spector retrieves `oversamplingFactor × k` candidates using fast quantized distance, then rescores with exact float32 distances to return the true top-K: -> [!WARNING] -> 
Re-ranking adds **100–500ms latency** per query. Use only when precision is critical and latency budget allows. +| Quantization | Default Oversampling | Effect | +|-------------|---------------------|--------| +| INT8 | 1 (no rescore) | Already near-lossless | +| INT4 | 3 | Recovers recall to 85–95% | +| INT2 | 5 | Compensates for aggressive quantization | ---- +Set `oversamplingFactor` to 1 to disable rescoring (faster, lower recall). -## 🖥️ Server Configuration +## Reranker Configuration | Parameter | Default | Description | |-----------|---------|-------------| -| `port` | 7070 | HTTP server port | -| `apiKey` | — | Optional API key (empty = no auth) | -| `corsOrigins` | * | Allowed CORS origins | +| `rerankerEnabled` | false | Enable LLM re-ranking via Ollama | +| `rerankerModel` | — | Ollama model name (e.g., "llama3.2") | +| `rerankerEndpoint` | http://localhost:11434 | Ollama API endpoint | +| `rerankerMaxCandidates` | 20 | Max docs sent to LLM for re-ranking | -```bash -# Format: port dimensions apiKey -mvn exec:java -pl spector-node \ - -Dexec.mainClass="com.spectrayan.spector.server.SpectorNode" \ - -Dexec.args="7070 384 my-secret-key" -``` +## Server Configuration ---- +| Parameter | Default | Description | +|-----------|---------|-------------| +| `port` | 7070 | HTTP server port | +| `apiKey` | — | Optional API key for authentication | -## 🌐 Cluster Configuration +## Cluster Configuration | Parameter | Default | Range | Description | |-----------|---------|-------|-------------| @@ -179,98 +91,28 @@ mvn exec:java -pl spector-node \ | `replicaCount` | 1 | 1–5 | Replicas per shard | | `heartbeatInterval` | 2s | 500ms–30s | Cluster heartbeat interval | | `heartbeatTimeout` | 10s | 3s–120s | Node unavailability timeout | -| `queryTimeout` | 10s | 1s–60s | Per-shard query timeout | - -> [!TIP] -> Rule of thumb: **100K–500K docs per shard** for optimal balance. Set `heartbeatTimeout` to at least 5× `heartbeatInterval`. 
- ---- -## 🤖 RAG Pipeline Configuration +## RAG Pipeline Configuration | Parameter | Default | Range | Description | |-----------|---------|-------|-------------| | `maxTokens` | 512 | 1–8192 | Max tokens per chunk | | `overlapTokens` | 50 | 0–maxTokens-1 | Overlap between chunks | -| `embeddingBatchSize` | 32 | 1–256 | Batch size for embedding generation | +| `embeddingBatchSize` | 32 | 1–256 | Embedding batch size | | `embeddingRetries` | 3 | 0–10 | Retry count for failed batches | -| `contextTokenLimit` | 4096 | 256–131072 | Max tokens in assembled context | ---- - -## 🎯 Configuration Examples - -### 🎯 High-Recall Setup +## Example Configuration ```java var config = SpectorConfig.DEFAULT .withDimensions(384) - .withCapacity(500_000) - .withQuantization(QuantizationType.SCALAR_INT8) - .withM(32) - .withEfConstruction(400) - .withEfSearch(200); -``` - -### 🗜️ Balanced Compression (INT4) - -```java -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(50_000_000) - .withQuantization(QuantizationType.SCALAR_INT4) - .withRescore(3); // default for INT4 -``` - -### 💾 Maximum Compression (INT2) - -```java -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(200_000_000) - .withQuantization(QuantizationType.SCALAR_INT2) - .withRescore(5); // default for INT2 -``` - -### ⚡ Low-Latency Setup - -```java -var config = SpectorConfig.DEFAULT - .withDimensions(128) .withCapacity(100_000) - .withM(12) - .withEfConstruction(100) - .withEfSearch(30); -``` - -### 🎮 GPU-Accelerated Batch Processing - -```java -var config = SpectorConfig.DEFAULT - .withDimensions(768) - .withCapacity(1_000_000) + .withQuantization(QuantizationType.SCALAR_INT4) // 8× compression + .withRescore(3) // 3× oversampling for recall .withGpu(true) - .withGpuMemoryBudget(2048); // 2 GB -``` + .withReranker("http://localhost:11434", "llama3.2", 20); -### 🤖 RAG Pipeline - -```java -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withMaxTokens(1024) - 
.withOverlapTokens(100) - .withEmbeddingBatchSize(64); +try (var engine = new SpectorEngine(config)) { + // Use engine... +} ``` - ---- - -## 🔗 See Also - -- [Performance Tuning](../operations/performance-tuning.md) — Benchmarks and optimization strategies - -- [Architecture Overview](../architecture/overview.md) — How configuration affects system behavior - -- [Distributed Mode](../architecture/distributed-mode.md) — Cluster-specific configuration - -- [GPU Acceleration](../architecture/gpu-acceleration.md) — GPU setup requirements \ No newline at end of file diff --git a/docs/docs/cortex/index.md b/docs/docs/cortex/index.md deleted file mode 100644 index f1f1fc5..0000000 --- a/docs/docs/cortex/index.md +++ /dev/null @@ -1,317 +0,0 @@ ---- -title: "🧬 Spector Cortex — Neural Dashboard" -description: "Real-time visualization dashboard for Spector's cognitive memory engine — neural graphs, vector spaces, SIMD lanes, memory heatmaps, and live cognitive metrics." ---- - -# 🧬 Spector Cortex — Neural Dashboard - -!!! quote "The Vision" - What if you could **watch your AI's brain think?** Spector Cortex is a real-time neural dashboard that visualizes the cognitive memory engine — from SIMD lanes firing to Hebbian edges strengthening to memories decaying along the Ebbinghaus curve. It's the difference between a black box and a living brain. - ---- - -![Spector Cortex Dashboard](spector-cortex-dashboard.png) - ---- - -## Overview - -Spector Cortex is an Angular 21 standalone application that provides a real-time, interactive visualization of Spector's cognitive memory engine. It connects to a running Spector Node via SSE (Server-Sent Events) and renders every cognitive event — queries, recalls, consolidation cycles, graph mutations — as they happen. 
- -The dashboard is built around **12 panels** organized in a responsive 3-column grid, each visualizing a different aspect of the cognitive pipeline: - -| Panel | Visualization | What It Shows | -|:------|:--------------|:--------------| -| **Neural Graph** | Three.js 3D graph | 200-node cognitive network with Hebbian, temporal, and entity edges — particles flow along connections during query spreading activation | -| **Vector Space** | Three.js point cloud | 300-point PCA-projected embedding space with query dot and nearest-neighbor lines | -| **Scoring Pipeline** | Animated funnel bars | The 6-phase cognitive scoring funnel — from total records → tombstone → tags → valence → decay → distance → final top-K | -| **Live Metrics** | Canvas time-series | Real-time recall/remember/reinforce/forget rates plotted as multi-line chart | -| **Cognitive Profile** | Canvas radar chart | 6-axis radar showing current thalamic modulation parameters (α, β, strictness, hyperfocus, lateral, valence range) | -| **SIMD & Hardware** | Canvas register grid | 16-lane SIMD register heatmap showing vector processing utilization | -| **Memory Heatmap** | Canvas segment bars | Off-heap memory segment utilization across all 4 tier stores + graph structures | -| **Decay Curve** | Canvas overlay chart | Ebbinghaus forgetting curve (dashed) vs. 
LTP reconsolidation curve (solid) — shows how recall events boost retention | -| **Query History** | Scrollable timeline | Chronological query traces with profile, latency, and augmented result counts | -| **Zeigarnik Effect** | Tension gauge | Unresolved memory count and cognitive tension percentage — the Zeigarnik effect biases recall toward incomplete tasks | -| **Habituation** | IoR/satiation bars | Inhibition of Return, semantic satiation, and habituation penalty gauges — the anti-filter-bubble mechanisms | -| **Query Input** | Search bar | Submit queries to see the full pipeline execute in real time | - ---- - -## Architecture - -### Technology Stack - -| Layer | Technology | -|:------|:-----------| -| **Framework** | Angular 21 (standalone, zoneless) | -| **UI Components** | Angular Material 3 (M3 design tokens) | -| **3D Visualization** | Three.js (Neural Graph, Vector Space) | -| **2D Visualization** | Canvas 2D API with `requestAnimationFrame` | -| **State Management** | Angular Signals (reactive, fine-grained) | -| **Real-time Data** | SSE via `ng-sse-client` (mock data available) | -| **Styling** | SCSS with M3 CSS custom properties (`--mat-sys-*`) | -| **Theme** | Dark / Light toggle, fully token-based | - -### Signal-Based Reactive Architecture - -Spector Cortex uses a **pure signal-based architecture** — no RxJS, no NgRx, no zone.js. - -```mermaid -graph LR - SSE["SSE Stream<br/>or Mock Data"] --> CS["CortexStateService<br/>Signal Store"] - CS --> NG["Neural Graph"] - CS --> VS["Vector Space"] - CS --> PF["Pipeline Funnel"] - CS --> MC["Metrics Chart"] - CS --> PR["Profile Radar"] - CS --> SP["SIMD Panel"] - CS --> MH["Memory Heatmap"] - CS --> DC["Decay Curve"] - CS --> QH["Query History"] - CS --> ZT["Zeigarnik Tracker"] - CS --> HM["Habituation Meter"] - CS --> QI["Query Input"] -``` - -**`CortexStateService`** is the single source of truth. It holds 20+ signals covering: - -- **Query state**: current trace, query history, running status -- **Graph state**: nodes, edges, pulses, layer toggles, active profile -- **Metrics state**: time-series history, decay curves, habituation metrics -- **System state**: SIMD utilization, memory segments, JVM metrics -- **Vector state**: embedding points, query vector, nearest neighbors - -All components are **pure presentation** — they read signals and render. No component contains business logic. - -### Mock Data System - -The `MockDataService` generates realistic runtime data so the dashboard is fully functional without a running Spector Node: - -```typescript -// Toggle mock data on/off via signal -state.useMockData.set(true); // Enable mock data -state.useMockData.set(false); // Switch to real SSE stream -``` - -Mock data includes: - -- **Simulated queries** every 2-4 seconds with randomized latency, profile selection, and scoring funnel -- **Graph pulses** cycling through Hebbian, temporal, and entity edge types -- **Reflect cycles** with consolidation animations (edge pruning, tombstone compaction) -- **Vector points** — 300 embeddings in PCA-projected 3D space with natural tier-based clustering -- **Metrics time-series** — recall/remember/reinforce/forget rates with realistic fluctuation -- **Decay curves** — 30-day Ebbinghaus + LTP reconsolidation with stochastic recall bumps -- **Habituation metrics** — IoR, satiation, and penalty values evolving over time -- **Zeigarnik tracking** — unresolved/total task counts with tension percentage - 
---- - -## Quick Start - -### Prerequisites - -- **Node.js** ≥ 20 -- **npm** ≥ 10 - -### Run Locally - -```bash -cd spector-cortex -npm install -npx ng serve --port 4300 -``` - -Open [http://localhost:4300](http://localhost:4300) — the dashboard starts immediately with mock data. - -### Connect to a Running Spector Node - -By default, the dashboard uses mock data. To connect to a real Spector Node: - -1. Ensure your Spector Node is running with SSE events enabled -2. Update the SSE endpoint in the environment configuration -3. Set `useMockData` to `false` in `CortexStateService` - ---- - -## Panel Deep Dives - -### Neural Graph - -The centerpiece of the dashboard — a Three.js 3D graph with 200 nodes organized by memory tier: - -- **Node colors**: Working (amber), Episodic (green), Semantic (blue), Procedural (purple) -- **Node radius**: Proportional to tier (Working = inner, Procedural = outer shell) -- **3 edge types**: - - **Hebbian** — solid white lines (co-activation strength) - - **Temporal** — dashed cyan lines (causal/temporal chains) - - **Entity** — solid gold lines (entity-relationship knowledge) - -**Interactive features:** - -- [x] **Layer toggles** — show/hide each edge type independently -- [x] **Query traversal particles** — colored spheres flow along edges during spreading activation -- [x] **Particle trails** — each particle has a larger, dimmer glow sphere trailing behind -- [x] **Ambient particle stream** — continuous particles to keep the graph alive -- [x] **Profile visual transforms** — HYPERFOCUS (tunnel vision), PARANOID (red shift), DIVERGENT (rainbow shimmer) -- [x] **Consolidation animation** — edges dim and prune when `reflect()` fires -- [x] **Mouse interaction** — camera follows mouse position for parallax effect - -### Vector Space - -A Three.js point cloud of 300 memory embeddings projected into 3D via PCA: - -- Points are colored by tier and sized by importance -- **Query dot** — when a query fires, a white pulsing sphere appears at 
the query vector position -- **Nearest-neighbor lines** — 5 translucent lines connect the query dot to its closest memories -- Camera orbits slowly with mouse parallax - -### Scoring Pipeline - -Animated horizontal funnel showing the 6-phase cognitive scoring pipeline: - -| Phase | Description | -|:------|:------------| -| Total Records | Starting record count | -| After Tombstone | Tombstone-filtered records | -| After Tag Gate | Synaptic tag bloom filter pass | -| After Valence | Emotional valence range filter | -| After Decay | Temporal decay threshold | -| Vector Distance | L2 distance scoring | -| Final Top-K | Final result set | - -Each bar animates smoothly to new values and shows the delta percentage (reduction) from the previous phase. - -### Decay Curve - -Visualizes the Ebbinghaus forgetting curve alongside LTP (Long-Term Potentiation) reconsolidation: - -- **Red dashed line** — raw Ebbinghaus exponential decay (no intervention) -- **Primary solid line** — actual retention with LTP reconsolidation bumps from recall events -- **Filled area** — shows the retention gain from the reconsolidation system -- X-axis spans 30 days; Y-axis shows retention percentage - -### Cognitive Profile Radar - -6-axis radar chart showing the current cognitive profile's thalamic modulation parameters: - -| Axis | Parameter | Range | -|:-----|:----------|:------| -| α Similarity | Similarity weight | 0–1.0 | -| β Importance | Importance weight | 0–1.0 | -| Strictness | Score threshold | 0–10.0 | -| Hyperfocus | Focus mode boost | 0–2.0 | -| Lateral | Divergent retrieval | 0–1.0 | -| Valence Range | Emotional filter width | 0–255 | - -The radar morphs smoothly when the active profile changes (BALANCED → HYPERFOCUS → PARANOID → etc.). 
- ---- - -## Project Structure - -``` -spector-cortex/ -├── src/ -│ ├── app/ -│ │ ├── core/ -│ │ │ ├── models/ -│ │ │ │ ├── cortex-events.ts # SSE event type interfaces -│ │ │ │ ├── graph-types.ts # Graph pulse interfaces -│ │ │ │ └── memory-types.ts # CognitiveProfile, PROFILE_PARAMS -│ │ │ └── services/ -│ │ │ ├── cortex-state.service.ts # Signal store (single source of truth) -│ │ │ ├── mock-data.service.ts # Simulated event generator -│ │ │ └── theme.service.ts # Dark/light theme toggle -│ │ ├── features/ -│ │ │ ├── dashboard/ # Main layout (3-col grid) -│ │ │ ├── header/ # Toolbar with status & controls -│ │ │ ├── neural-graph/ # Three.js neural graph -│ │ │ ├── vector-space/ # Three.js vector space -│ │ │ ├── pipeline-funnel/ # Scoring pipeline funnel -│ │ │ ├── simd-panel/ # SIMD lane heatmap -│ │ │ ├── memory-heatmap/ # Off-heap segment visualization -│ │ │ ├── profile-radar/ # Cognitive profile radar chart -│ │ │ ├── metrics-chart/ # Live metrics time-series -│ │ │ ├── decay-curve/ # Ebbinghaus + LTP chart -│ │ │ ├── query-input/ # Search bar -│ │ │ ├── query-history/ # Query timeline -│ │ │ ├── zeigarnik-tracker/ # Incomplete tension gauge -│ │ │ └── habituation-meter/ # Anti-loop mechanism gauges -│ │ └── app.component.ts # Root component -│ ├── styles.scss # Global M3 theme -│ └── index.html -├── angular.json -├── package.json -└── tsconfig.json -``` - ---- - -## Design Principles - -### 1. Angular Material 3 Tokens Only - -All colors reference M3 CSS custom properties — **zero hardcoded colors**: - -```scss -// ✅ Correct — uses M3 token -color: var(--mat-sys-primary); -background: var(--mat-sys-surface-container-high); - -// ❌ Wrong — hardcoded color -color: #bb86fc; -``` - -This ensures the entire dashboard automatically adapts when switching between dark and light themes. - -### 2. 
Separation of Concerns - -| Layer | Rule | -|:------|:-----| -| **Components** | Pure presentation only — read signals, render UI | -| **Services** | All business logic, state mutations, data processing | -| **Templates** | Separate `.html` files — no inline templates | -| **Styles** | Separate `.scss` files — no inline styles | - -### 3. Canvas for Performance - -All 2D charts use raw Canvas 2D API with `requestAnimationFrame` instead of chart libraries — this keeps the bundle small and eliminates third-party DOM overhead in animation-heavy panels. - -### 4. Responsive Grid - -The dashboard uses CSS Grid with breakpoints: - -| Breakpoint | Columns | Behavior | -|:-----------|:--------|:---------| -| > 1200px | 3 columns | Full layout | -| 768–1200px | 2 columns | Neural Graph spans full width | -| < 768px | 1 column | Single column, stacked | - ---- - -## Connecting to Real Data - -Spector Cortex is designed to consume SSE events from `spector-node`. The event types map directly to signals: - -| SSE Event Type | Signal | Panel | -|:---------------|:-------|:------| -| `query.trace` | `currentQueryTrace` | Neural Graph, Pipeline, History | -| `query.vector` | `queryVector` | Vector Space | -| `graph.pulse` | `graphPulses` | Neural Graph edges | -| `reflect.complete` | `lastReflect` | Neural Graph consolidation | -| `profile.change` | `activeProfile` | Profile Radar, Neural Graph | -| `metrics.snapshot` | `metricsHistory` | Metrics Chart | -| `habituation.update` | `habituation` | Habituation Meter | - -When `useMockData` is `false`, the `EventStreamService` connects to the configured SSE endpoint and pushes events into `CortexStateService` signals. 
- ---- - -## Future Roadmap - -- [ ] **Integration with Synaptiq** — embed Cortex panels into the Synaptiq monitoring dashboard -- [ ] **Async event emission** — SSE events emitted on virtual threads (gated behind feature flag) -- [ ] **Replay mode** — record and replay cognitive sessions for debugging -- [ ] **Cluster view** — multi-node visualization for distributed Spector deployments -- [ ] **GPU acceleration panel** — CUDA kernel execution timeline visualization -- [ ] **Memory diff view** — before/after comparison of consolidation cycles diff --git a/docs/docs/cortex/spector-cortex-dashboard.png b/docs/docs/cortex/spector-cortex-dashboard.png deleted file mode 100644 index dc8dcfd..0000000 Binary files a/docs/docs/cortex/spector-cortex-dashboard.png and /dev/null differ diff --git a/docs/docs/deep-dives/ann-search-primer.md b/docs/docs/deep-dives/ann-search-primer.md deleted file mode 100644 index 74c097a..0000000 --- a/docs/docs/deep-dives/ann-search-primer.md +++ /dev/null @@ -1,244 +0,0 @@ -# 🔍 Approximate Nearest Neighbor Search - -> **A beginner-friendly guide to how search engines find similar items in milliseconds, even across millions of records.** This page explains ANN from first principles — no math prerequisites required. - ---- - -## 🤔 The Problem: Finding Similar Things - -Imagine you have a photo of a sunset and want to find the 10 most similar sunset photos from a collection of 10 million images. Each image has been converted to a **vector** — a list of numbers that captures its visual essence: - -``` -🌅 Your photo → [0.82, -0.15, 0.44, 0.67, ..., 0.21] (768 numbers) -📸 Photo #1 → [0.79, -0.12, 0.41, 0.70, ..., 0.18] (768 numbers) -📸 Photo #2 → [-0.55, 0.88, -0.23, 0.11, ..., 0.67] (768 numbers) -... 
-📸 Photo #10M → [0.33, 0.44, -0.12, 0.55, ..., 0.91] (768 numbers) -``` - -The **naive approach** compares your photo to every single photo in the collection: - -``` -10,000,000 comparisons × 768 multiplications each = 7.68 billion operations -``` - -Even on a fast CPU, that takes **seconds**. For a real-time search engine serving thousands of users simultaneously, seconds is an eternity. - -> [!NOTE] -> Brute-force cost grows linearly with both the number of vectors and their dimensionality. The **curse of dimensionality** makes things worse: as vectors get longer (higher dimensional), classic exact index structures stop pruning effectively and degrade toward a full scan. Brute-force becomes impractical at scale. - ---- - -## 💡 The Key Insight: "Close Enough" Is Good Enough - -Here's the breakthrough: for most applications, you don't need the *mathematically perfect* top-10. You need results that are *really close* to perfect. If the true best match has a similarity score of 0.97 and your algorithm returns a match with 0.96, no user will notice the difference. - -**Approximate Nearest Neighbor (ANN)** algorithms exploit this insight. They organize vectors into clever data structures that let you skip most comparisons while still finding excellent results. - -The trade-off: - -```mermaid -graph LR - A["🎯 Exact Search\n100% recall\nO(n) time"] -->|"Trade accuracy\nfor speed"| B["⚡ ANN Search\n95%+ recall\nO(log n) time"] -``` - ---- - -## 🏗️ ANN Algorithm Families - -### 1. 🌳 Tree-Based Methods - -**Idea:** Recursively split the vector space into regions. At search time, only explore the regions near the query. 
- -**Example: KD-Trees** -- Split along one dimension at each level (like cutting a map into quadrants) -- Works well up to ~20 dimensions -- Falls apart in high dimensions (the "curse" again) - -**Example: Annoy (Spotify)** -- Builds random projection trees -- Each tree splits space with random hyperplanes -- Uses multiple trees and merges results for better recall - -```mermaid -graph TD - Root["All vectors"] --> L["Left half\n(dim 5 < 0.3)"] - Root --> R["Right half\n(dim 5 ≥ 0.3)"] - L --> LL["Leaf: 500 vectors"] - L --> LR["Leaf: 480 vectors"] - R --> RL["Leaf: 510 vectors"] - R --> RR["Leaf: 510 vectors"] -``` - -> **Verdict:** Simple but limited. Trees struggle above 50 dimensions, which is far below modern embedding sizes (384–4096). - ---- - -### 2. 🗂️ Inverted File (IVF) - -**Idea:** Cluster vectors into groups (using K-Means). At search time, only search the closest clusters. - -**How it works:** -1. **Training:** Run K-Means to find cluster centers (centroids) -2. **Ingestion:** Assign each vector to its nearest centroid -3. **Search:** Find the `nprobe` closest centroids to the query, then brute-force search only those clusters - -```mermaid -graph TD - Q["🔍 Query"] --> C["Find nearest centroids"] - C --> P1["📦 Cluster 1\n~3,000 vectors"] - C --> P2["📦 Cluster 2\n~3,100 vectors"] - C --> P3["📦 Cluster 3\n~2,900 vectors"] - P1 --> M["Merge & return top-K"] - P2 --> M - P3 --> M -``` - -**Speed:** With 1000 clusters and `nprobe=10`, you search only 1% of the data. - -**Recall control:** The `nprobe` parameter is your recall/speed knob: -- `nprobe=1` → Fast but ~30% recall (might miss neighbors in adjacent clusters) -- `nprobe=10` → Balanced, ~85% recall -- `nprobe=50` → Slower but ~98% recall - -> **Verdict:** Excellent at scale (billions of vectors). The foundation of most production systems. - ---- - -### 3. 🕸️ Graph-Based Methods (HNSW) - -**Idea:** Build a navigable graph where each vector is connected to its neighbors. 
At search time, traverse the graph like walking through a social network. - -This is the most important ANN algorithm today. See our [HNSW Deep Dive](hnsw-explained.md) for the full story. - -**Key properties:** -- **High recall** (95-99%) out of the box -- **Fast search** — O(log n) comparisons -- **Slow build** — each insertion requires graph updates -- **Memory hungry** — stores graph edges alongside vectors - -> **Verdict:** Best recall-vs-speed trade-off for datasets up to ~10M vectors. The gold standard. - ---- - -### 4. 🔗 Hybrid: IVF + HNSW (SpectorIndex) - -**Idea:** Use IVF to partition the space, then build a small HNSW graph inside each partition. Best of both worlds. - -This is what Spector's flagship **SpectorIndex** implements. See our [SpectorIndex Deep Dive](spector-index-architecture.md) for the full architecture. - -```mermaid -graph TD - Q["🔍 Query"] --> IVF["IVF: Find closest partitions"] - IVF --> S1["Shard 1: HNSW graph\n(5,000 vectors)"] - IVF --> S2["Shard 2: HNSW graph\n(4,800 vectors)"] - IVF --> S3["Shard 3: Flat scan\n(200 vectors)"] - S1 --> M["Global merge → top-K"] - S2 --> M - S3 --> M -``` - -> **Verdict:** Scales to millions while maintaining excellent recall. The future of vector search. - ---- - -## 📐 Distance Metrics - -How do we measure "similarity" between two vectors? Three common choices: - -### Cosine Similarity - -Measures the **angle** between vectors. Ignores magnitude (length). - -$$\text{cosine}(a, b) = \frac{a \cdot b}{\|a\| \cdot \|b\|}$$ - -- Range: [-1, 1] (1 = identical direction, 0 = perpendicular, -1 = opposite) -- Best for: **text embeddings** (where direction captures meaning) - -### Euclidean Distance (L2) - -Measures the **straight-line distance** between two points. 
- -$$L2(a, b) = \sqrt{\sum_i (a_i - b_i)^2}$$ - -- Range: [0, ∞) (0 = identical, higher = more different) -- Best for: **image embeddings**, clustering -- Key property: **translation-invariant** (shifting both vectors by the same amount doesn't change the distance) - -### Dot Product - -The raw inner product — like cosine but without normalization. - -$$\text{dot}(a, b) = \sum_i a_i \cdot b_i$$ - -- Range: (-∞, ∞) (higher = more similar for normalized vectors) -- Best for: **recommendation systems** (where magnitude matters) - -> [!TIP] -> For **unit-normalized vectors** (length = 1), all three metrics give equivalent rankings: -> $$L2^2(a, b) = 2 - 2 \cdot \text{cosine}(a, b)$$ -> So choosing between them is mainly about convention and API design. - ---- - -## 📊 The Recall–Speed–Memory Triangle - -Every ANN algorithm makes trade-offs between three properties: - -```mermaid -graph TD - R["🎯 Recall\n(accuracy)"] --- S["⚡ Speed\n(latency)"] - S --- M["💾 Memory\n(footprint)"] - M --- R -``` - -| Algorithm | Recall | Speed | Memory | Scale | -|-----------|--------|-------|--------|-------| -| Brute force | 100% | ❌ Slow | ✅ Minimal | < 100K | -| KD-Tree | 90-95% | ⚡ Fast | ✅ Low | < 1M (low-dim) | -| IVF-Flat | 85-98% | ⚡ Fast | ✅ Low | < 100M | -| HNSW | 95-99% | ⚡⚡ Very fast | ❌ High | < 10M | -| IVF-HNSW | 90-99% | ⚡⚡ Very fast | ⚡ Moderate | < 100M | -| IVF-PQ | 80-92% | ⚡ Fast | ⚡⚡ Very low | Billions | - ---- - -## 🧪 How to Measure ANN Quality - -### Recall@K - -The most common metric. For each query, what fraction of the true top-K nearest neighbors did the algorithm find? - -``` -recall@10 = (true positives in top-10 results) / 10 -``` - -A recall@10 of 0.95 means the algorithm found 9.5 out of 10 true nearest neighbors on average. - -### QPS (Queries Per Second) - -How many searches the system can handle per second. Higher is better. - -### Build Time - -How long it takes to index all vectors. Matters for systems with frequent updates. 
- ---- - -## 🎓 Key Takeaways - -1. **Brute force doesn't scale.** Beyond ~100K vectors, you need an ANN algorithm. -2. **HNSW is the default choice** for datasets up to 10M vectors — excellent recall with fast search. -3. **IVF shines at scale** — partitioning is essential for 10M+ vectors. -4. **Quantization complements ANN** — compress vectors to fit more in memory and scan faster. -5. **The `nprobe` and `efSearch` parameters** are your recall/speed knobs. Always tune them for your workload. -6. **Real embeddings have structure** — ANN algorithms perform much better on real data (which forms natural clusters) than on random vectors. - ---- - -## 🔗 See Also - -- [HNSW Explained](hnsw-explained.md) — How the most popular ANN algorithm works, step by step -- [SpectorIndex Architecture](spector-index-architecture.md) — Spector's IVF-HNSW-SVASQ hybrid index -- [SVASQ Quantization](svasq-deep-dive.md) — How SVASQ compresses vectors with near-lossless quality -- [Understanding Quantization](understanding-quantization.md) — All quantization techniques compared diff --git a/docs/docs/deep-dives/hnsw-explained.md b/docs/docs/deep-dives/hnsw-explained.md deleted file mode 100644 index 29f4aab..0000000 --- a/docs/docs/deep-dives/hnsw-explained.md +++ /dev/null @@ -1,270 +0,0 @@ -# 🕸️ HNSW Explained - -> **How the world's most popular vector search algorithm works, from first principles.** Hierarchical Navigable Small World graphs power vector search in Pinecone, Weaviate, Qdrant, pgvector, and Spector. This page explains HNSW step by step, with intuition, diagrams, and practical tuning advice. - ---- - -## 🤔 The Intuition: Six Degrees of Separation - -You've probably heard that any two people on Earth are connected by at most six handshakes. This is the **small-world phenomenon** — in certain networks, you can reach any node in surprisingly few hops. - -HNSW exploits this same principle for vector search. 
Instead of comparing your query against every vector, you **navigate a graph** — hopping from neighbor to neighbor, getting closer to the target with each step. - -```mermaid -graph LR - Q["🔍 Query"] -->|"hop 1"| A["Node A\n(far)"] - A -->|"hop 2"| B["Node B\n(closer)"] - B -->|"hop 3"| C["Node C\n(close!)"] - C -->|"hop 4"| D["🎯 Node D\n(nearest!)"] -``` - -Instead of 10 million comparisons, you make ~100. That's the magic. - ---- - -## 📐 From Flat to Hierarchical - -### The Problem with a Single Graph - -A simple navigable small-world (NSW) graph connects each vector to its nearest neighbors. Search starts at a random entry point and greedily walks toward the query — always moving to the neighbor closest to the target. - -This works, but it has a problem: **local minima**. The greedy walk can get stuck in a region that's locally optimal but globally suboptimal. - -### The Fix: Add Layers - -HNSW solves this with a **hierarchy** — multiple layers of the same graph, each progressively sparser: - -```mermaid -graph TD - subgraph "Layer 2 (sparse — highway)" - L2A["A"] --- L2D["D"] - L2D --- L2G["G"] - end - - subgraph "Layer 1 (medium)" - L1A["A"] --- L1B["B"] - L1B --- L1D["D"] - L1D --- L1F["F"] - L1F --- L1G["G"] - end - - subgraph "Layer 0 (dense — all vectors)" - L0A["A"] --- L0B["B"] - L0B --- L0C["C"] - L0C --- L0D["D"] - L0D --- L0E["E"] - L0E --- L0F["F"] - L0F --- L0G["G"] - L0A --- L0C - L0D --- L0F - end - - L2A -.-> L1A - L2D -.-> L1D - L2G -.-> L1G - L1A -.-> L0A - L1B -.-> L0B - L1D -.-> L0D - L1F -.-> L0F - L1G -.-> L0G -``` - -Think of it like navigating a city: -- **Layer 2 (highway):** A few major intersections — long jumps, coarse navigation -- **Layer 1 (main roads):** More nodes, shorter jumps -- **Layer 0 (streets):** Every single location — fine-grained search - ---- - -## 🔧 How Search Works - -### Step 1: Start at the Top - -Enter the graph at the top layer's entry point. 
There are very few nodes here, so you can quickly find which region of the space the query belongs to. - -### Step 2: Greedy Descent - -At each layer, perform a **greedy search**: repeatedly move to the neighbor closest to the query until no neighbor is closer. Then descend to the next layer, starting from the same node. - -### Step 3: Fine-Grained Search at Layer 0 - -At the bottom layer (which contains all vectors), perform a more thorough search. Instead of pure greedy descent, maintain a **candidate list** of the best nodes seen so far, exploring their neighbors to find even better candidates. - -```mermaid -sequenceDiagram - participant Q as 🔍 Query - participant L2 as Layer 2 - participant L1 as Layer 1 - participant L0 as Layer 0 - - Q->>L2: Start at entry point - Note over L2: Greedy walk → find region - L2->>L1: Descend with best node - Note over L1: Greedy walk → refine region - L1->>L0: Descend with best node - Note over L0: efSearch candidates → precise top-K - L0->>Q: Return top-K nearest neighbors -``` - -### The `efSearch` Parameter - -At layer 0, the algorithm maintains a **dynamic candidate list** of size `efSearch`. Larger `efSearch` = more candidates explored = higher recall but slower search. - -| `efSearch` | Recall@10 | Relative Speed | -|-----------|-----------|---------------| -| 10 | ~80% | Fastest | -| 50 | ~95% | Fast | -| 100 | ~98% | Moderate | -| 200 | ~99.5% | Slower | -| 500 | ~99.9% | Slowest | - -> [!TIP] -> Start with `efSearch=64` and increase until you hit your recall target. For most applications, `efSearch=100-200` provides an excellent balance. - ---- - -## 🏗️ How Construction Works - -Building the HNSW graph is where the algorithm spends most of its time. Each vector is inserted one at a time. - -### Step 1: Assign a Random Layer - -Each new vector is assigned a maximum layer using an exponential distribution: - -``` -layer = floor(-ln(random()) × mL) -``` - -Where `mL = 1 / ln(M)` and M is the max connections per node. 
With the default `M = 16`, this ensures: -- Most vectors (~94%) exist only at Layer 0 -- ~6% reach Layer 1 -- ~0.4% reach Layer 2 -- ~0.02% reach Layer 3 - -### Step 2: Find Neighbors via Search - -To insert a vector, first search the existing graph to find its nearest neighbors (exactly like a query search). The search quality during insertion is controlled by `efConstruction`. - -### Step 3: Connect to Neighbors - -Connect the new vector to its `M` nearest neighbors at each layer it belongs to. Also add reverse connections (the graph is bidirectional). - -### The `efConstruction` Parameter - -Higher `efConstruction` = better neighbor selection during build = higher-quality graph = better recall at search time. But it also means slower insertion. - -| `efConstruction` | Build Speed | Graph Quality | Recall@10 | -|-----------------|------------|--------------|-----------| -| 16 | ⚡⚡ Fast | Low | ~85% | -| 100 | ⚡ Moderate | Good | ~95% | -| 200 | 🐌 Slow | High | ~98% | -| 500 | 🐌🐌 Very slow | Very high | ~99% | - -### The `M` Parameter (Max Connections) - -`M` controls how many edges each node has. More connections = more paths to explore = better recall, but more memory. - -| M | Memory per vector | Recall impact | -|---|-------------------|---------------| -| 8 | Low | Good for low-dim (< 64) | -| **16** | **Moderate** | **Default — good for most cases** | -| 32 | High | Better for high-dim (768+) | -| 64 | Very high | Diminishing returns | - -> [!IMPORTANT] -> The construction parameters `efConstruction` and `M` are permanent — they determine the graph structure. You can adjust `efSearch` at query time without rebuilding. 
- ---- - -## 🧮 Complexity Analysis - -| Operation | Time Complexity | Why | -|-----------|----------------|-----| -| **Search** | O(log n) | Each layer halves the search space | -| **Insert** | O(log n) | Same as search + edge updates | -| **Memory** | O(n × M) | Each vector stores M edges per layer | - -For reference, with 1 million 768-dim vectors and M=16: -- **Search:** ~100-200 distance computations (vs 1,000,000 for brute force) -- **Memory:** ~12 bytes per edge × 16 edges × 1M vectors ≈ **192 MB** (just for edges, plus the vectors themselves) - ---- - -## ⚡ Why HNSW Is Fast - -Three factors combine to make HNSW remarkably efficient: - -### 1. Logarithmic Hops - -The hierarchical structure means you traverse O(log n) layers, each requiring a small number of greedy steps. For 1M vectors, that's ~6-8 layers with ~10 steps each = ~70 distance computations. - -### 2. Locality - -As you descend layers, you converge on a small region of the space. At layer 0, you're only exploring a local neighborhood — excellent CPU cache behavior. - -### 3. SIMD Acceleration - -Each distance computation (L2, cosine, dot product) can be parallelized using SIMD instructions. Spector uses the Java Vector API to compute 8-16 dimensions simultaneously: - -```java -// 8 dimensions computed in a single CPU instruction -FloatVector va = FloatVector.fromArray(SPECIES, a, offset); -FloatVector vb = FloatVector.fromArray(SPECIES, b, offset); -FloatVector diff = va.sub(vb); -sum = diff.fma(diff, sum); // sum += diff * diff -``` - ---- - -## 🚫 HNSW's Limitations - -HNSW is excellent, but it's not perfect: - -### 1. Memory Hungry - -The graph edges consume significant memory — roughly 50-100% of the vector storage. This limits HNSW to datasets that fit in RAM. - -### 2. Slow Construction - -Building the graph requires O(n log n) total work. Inserting 1M vectors at `efConstruction=200` can take minutes. At 10M+, construction time becomes a serious concern. - -### 3. 
No Deletion (Efficiently) - -Removing vectors from an HNSW graph is tricky — you need to rewire edges, which can degrade graph quality over time. - -### 4. Doesn't Scale Beyond ~10M - -At 10M+ vectors, HNSW's memory consumption and construction time make it impractical as a standalone index. This is why hybrid approaches (like IVF-HNSW) are preferred at scale. - ---- - -## 🔬 HNSW in Spector - -Spector uses HNSW in two contexts: - -### 1. Standalone HNSW Index - -The `QuantizedHnswIndex` is the workhorse for datasets up to ~10M vectors. It combines HNSW with scalar or SVASQ quantization: - -- **Asymmetric Distance Computation (ADC):** Float32 query vs. quantized stored vectors -- **Off-heap memory:** Graph edges and quantized vectors stored in Panama `MemorySegment` -- **SIMD kernels:** Java Vector API for distance computation - -### 2. Adaptive Shards in SpectorIndex - -The flagship `SpectorIndex` (IVF-HNSW-SVASQ) uses HNSW graphs inside large IVF shards: - -- Shards below 20,000 vectors: exact flat scan (SIMD, faster than HNSW for small N) -- Shards above 20,000 vectors: automatically promoted to HNSW with SVASQ quantization -- This **adaptive** approach avoids HNSW's overhead for small partitions while exploiting its efficiency for large ones - -See [SpectorIndex Architecture](spector-index-architecture.md) for the full design. 
- ---- - -## 📖 Further Reading - -- **Original Paper:** Malkov & Yashunin, ["Efficient and robust approximate nearest neighbor using Hierarchical Navigable Small World graphs"](https://arxiv.org/abs/1603.09320) (2016) -- [ANN Search Primer](ann-search-primer.md) — Overview of all ANN algorithm families -- [SpectorIndex Architecture](spector-index-architecture.md) — How HNSW fits into the IVF-HNSW-SVASQ design -- [Performance Tuning](../operations/performance-tuning.md) — Tuning `M`, `efConstruction`, and `efSearch` in Spector diff --git a/docs/docs/deep-dives/quantization-comparison.md b/docs/docs/deep-dives/quantization-comparison.md deleted file mode 100644 index 009ce38..0000000 --- a/docs/docs/deep-dives/quantization-comparison.md +++ /dev/null @@ -1,266 +0,0 @@ -# ⚖️ Quantization Comparison - -> **How different search engines approach vector compression — and why they make different choices.** Architecture constraints, legacy decisions, and design philosophy all shape which quantization methods an engine supports. - ---- - -## 🌍 The Quantization Landscape - -Every vector search engine faces the same fundamental problem: vectors are too big to fit in memory at scale. But each engine solves it differently based on their architecture: - -| Constraint | Impact on Quantization Choice | -|-----------|-------------------------------| -| Immutable segments (Lucene) | Makes IVF training/updating difficult | -| Embedded vs. distributed | Affects whether training is practical | -| GPU availability | Enables larger codebook training | -| Disk vs. memory architecture | Changes what "compression" means | - -> [!NOTE] -> There is no universally "best" quantization method. The right choice depends on your recall requirements, memory budget, dataset size, and which engine you're already using. - ---- - -## 🟡 Elasticsearch's Approach: BBQ + DiskBBQ - -### What is BBQ (Better Binary Quantization)? 
- -BBQ is Elasticsearch's answer to vector compression, introduced in version 8.16. It's a **1-bit quantization** method — each float32 dimension becomes a single bit — enhanced with asymmetric rescoring to recover lost accuracy. - -**How BBQ works:** -1. **Quantize:** Convert each vector to binary (sign bit extraction) — 32× compression -2. **Store metadata:** Keep per-vector correction factors (norm, mean) -3. **First-pass search:** Use Hamming distance on binary codes (very fast) -4. **Rescore:** Re-rank top candidates using stored correction factors for better accuracy - -```mermaid -graph TD - Q["Query"] --> Binary["Binary Hamming Search
32x compressed, fast scan"] - Binary --> Candidates["Top 100 candidates"] - Candidates --> Rescore["Asymmetric Rescoring
Uses stored correction factors"] - Rescore --> Final["Top 10 final results
~90% recall"] -``` - -### Why Elasticsearch Chose Binary Over PQ - -Elasticsearch is built on **Apache Lucene**, which uses an **immutable segment** architecture: - -- Segments are write-once, read-many - -- Merging combines segments but doesn't update in place - -- New data goes to new segments - -This makes IVF-PQ challenging because: - -- **IVF centroids** need to be computed across all data — hard when data arrives in segments - -- **PQ codebooks** need training on representative data — segment-local training produces poor codebooks - -- **Partition rebalancing** on merge is expensive - -Binary quantization, by contrast, is **per-vector** — no global training needed, works perfectly with immutable segments. - -> [!TIP] -> BBQ is clever engineering within Lucene's constraints. The rescoring step recovers much of the recall lost by binary compression, achieving ~90% recall@10 for high-dimensional embeddings (768+). - -### What is DiskBBQ? - -DiskBBQ (introduced experimentally) adds IVF-like partitioning on top of BBQ: - -- Vectors are grouped into clusters (similar to IVF) - -- Only relevant clusters are loaded from disk during search - -- Designed to work within Lucene's segment model by treating clusters as segment-local structures - -**Trade-off:** More complex than plain BBQ, but enables disk-resident indexes for datasets that exceed RAM. - ---- - -## 🔵 Spector's Approach: Scalar + SVASQ + SVASQ-4 + IVF-PQ - -### Why These Methods? - -Spector is a **purpose-built vector engine** — no segment model, no legacy constraints. This gives freedom to implement whatever quantization works best for the use case. 
- -This multi-method strategy covers the full spectrum: - -| Need | Solution | Compression | Recall | -|------|----------|-------------|--------| -| Quality-first (≤50M vectors) | Scalar INT8 | 4× | 95–99% | -| Quality + rotation (≤50M) | **SVASQ INT8** | 4× | **97–99.5%** | -| Balanced (10M–100M vectors) | Scalar INT4 | 8× | 85–95% | -| Balanced + rotation (10M–100M) | **SVASQ-4** | **6–8×** | **95–99%** | -| Memory-constrained (50M–500M) | Scalar INT2 | 16× | 75–90% | -| Scale-first (100M–1B+ vectors) | IVF-PQ | 32× | 75–90% | - -### Advantages of Purpose-Built Indexes - -Without Lucene's segment model: - -- **Global IVF training** — K-Means runs over the entire dataset, producing optimal partitions - -- **Codebook updates** — Retrain when data distribution shifts significantly - -- **Partition rebalancing** — Redistribute vectors across partitions as the index grows - -- **Memory-mapped storage** — Custom binary format designed for quantized data layout - -```mermaid -graph LR - subgraph "Elasticsearch (Segment Model)" - Seg1["Segment 1
BBQ binary codes"] - Seg2["Segment 2
BBQ binary codes"] - Seg3["Segment 3
BBQ binary codes"] - end - - subgraph "Spector (Global Index)" - IVF["IVF Partitions
Globally optimized"] - PQ["PQ Codebooks
Trained on all data"] - IVF --> PQ - end -``` - -### IVF-PQ vs. BBQ at Same Compression (32×) - -| Metric | Spector IVF-PQ | Elasticsearch BBQ | -|--------|---------------|-------------------| -| Compression | 32× | 32× | -| Recall@10 (384-dim) | 80–92% | 70–85% | -| Recall@10 (768-dim) | 85–95% | 85–92% | -| Training required | Yes (K-Means + PQ) | No (per-vector) | -| Works with segments | No (global index) | Yes | -| Disk-friendly | Via mmap | Via DiskBBQ | - -> [!IMPORTANT] -> At the same 32× compression ratio, PQ preserves more information than binary because it learns the data distribution. Binary quantization discards magnitude entirely — only direction (sign) survives. - ---- - -## 🟣 Other Approaches - -### Milvus: IVF-PQ + IVF-SQ8 + DiskANN - -Milvus offers the widest quantization menu: - -| Method | Compression | Use Case | -|--------|-------------|----------| -| IVF-PQ | 32×+ | Billion-scale, memory-constrained | -| IVF-SQ8 | 4× | Moderate scale, high recall | -| DiskANN | Varies | Disk-resident billion-scale search | -| HNSW | None (full) | Highest recall, unlimited memory | - -**Philosophy:** Give users every option and let them choose. This flexibility comes with complexity — users must understand trade-offs to configure correctly. - -### Qdrant: Scalar + Binary + Oversampling - -Qdrant takes a practical approach: - -| Method | Details | -|--------|---------| -| Scalar INT8 | Standard 4× compression, applied per-segment | -| Binary | 32× with configurable oversampling for rescoring | -| Oversampling | Retrieve 3–5× more candidates, rescore with full vectors | - -**Key innovation:** Qdrant's oversampling strategy is straightforward but effective. Retrieve more candidates with cheap binary search, then rescore the shortlist with full-precision vectors. Recall depends on oversampling factor. 
- -### FAISS: The Research Gold Standard - -Meta's FAISS library is the reference implementation for quantization research: - -| Method | Description | -|--------|-------------| -| IVF-PQ | The classic — inverted file + product quantization | -| OPQ | Optimized PQ — rotates data before splitting to minimize quantization error | -| IVFADC | IVF with Asymmetric Distance Computation | -| IVF-PQ + Refine | Two-stage: PQ shortlist → full-precision rescore | -| ScaNN | Anisotropic quantization (prioritizes angular error) | -| Binary (LSH) | Locality-Sensitive Hashing for binary codes | - -> [!NOTE] -> FAISS isn't a search engine — it's a library. Most production vector databases (including Milvus) build on FAISS's algorithms internally. - ---- - -## 🧭 Decision Guide - -Use this flowchart to pick the right quantization for your workload: - -```mermaid -flowchart TD - Start["How many vectors?"] --> Small["< 10M vectors"] - Start --> Medium["10M - 100M vectors"] - Start --> Large["> 100M vectors"] - - Small --> SmallRecall["Recall requirement?"] - SmallRecall --> SmallHigh["> 95% recall"] - SmallRecall --> SmallMod["80-95% recall"] - SmallHigh --> A["Use: No quantization or Scalar INT8"] - SmallMod --> B["Use: Scalar INT8"] - - Medium --> MedMem["Memory budget?"] - MedMem --> MedHigh["> 64 GB available"] - MedMem --> MedLow["< 64 GB available"] - MedHigh --> C["Use: Scalar INT8"] - MedLow --> D["Use: IVF-PQ (32x)"] - - Large --> LargeRecall["Recall requirement?"] - LargeRecall --> LargeHigh["> 90% recall"] - LargeRecall --> LargeMod["75-90% recall"] - LargeRecall --> LargeLow["< 75% acceptable"] - LargeHigh --> E["Use: IVF-PQ + Rescore"] - LargeMod --> F["Use: IVF-PQ"] - LargeLow --> G["Use: Binary + Rescore"] -``` - -### Quick Rules of Thumb - -| Situation | Recommendation | -|-----------|---------------| -| "I need maximum recall" | No quantization or Scalar INT8 | -| "I want balanced compression/recall" | Scalar INT4 + rescore (8×, 85–95%) | -| "I need to fit in a single 
machine" | Scalar INT2 (16×) or IVF-PQ (32×) | -| "I need the fastest possible filtering" | Scalar INT2 as first pass + rescore | -| "I'm using Elasticsearch" | BBQ (it's your best option there) | -| "I'm building from scratch" | INT4 for moderate scale, IVF-PQ for billions | -| "I don't want training complexity" | Scalar INT8 or INT4 (calibration is automatic) | - ---- - -## 📊 Summary Table - -Which quantization methods are available in each engine: - -| Engine | Scalar INT8 | Scalar INT4/INT2 | Binary | Product Quantization | IVF-PQ | DiskANN | Rescoring | -|--------|:-----------:|:----------------:|:------:|:-------------------:|:------:|:-------:|:---------:| -| **Spector** | ✅ | ✅ (non-uniform) | ❌ | ✅ (via IVF-PQ) | ✅ | ❌ | ✅ (SVASQ/SVASQ-4 + configurable oversampling) | -| **Elasticsearch** | ✅ | ❌ | ✅ (BBQ) | ❌ | ❌ | ❌ | ✅ (asymmetric) | -| **Milvus** | ✅ (IVF-SQ8) | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | -| **Qdrant** | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ (oversampling) | -| **FAISS** | ✅ | ❌ | ✅ (LSH) | ✅ | ✅ | ❌ | ✅ | -| **Weaviate** | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | - -### Compression × Recall Trade-off by Engine - -| Engine | 4× (Scalar) Recall | 8× (INT4) Recall | 16× (INT2) Recall | 32× (Best Method) Recall | Architecture Constraint | -|--------|:------------------:|:-----------------:|:------------------:|:------------------------:|------------------------| -| **Spector** | 97–99.5% (SVASQ) | 95–99% (SVASQ-4+rescore) | 75–90% (INT2+rescore) | 80–92% (IVF-PQ) | None (purpose-built) | -| **Elasticsearch** | 95–99% | — | — | 70–90% (BBQ + rescore) | Lucene segments | -| **Milvus** | 95–99% | — | — | 80–92% (IVF-PQ) | Distributed complexity | -| **Qdrant** | 95–99% | — | — | 65–85% (Binary + oversample) | Per-segment quantization | -| **FAISS** | 95–99% | — | — | 85–95% (OPQ) | Library, not engine | - -> [!TIP] -> FAISS achieves the highest PQ recall because OPQ (Optimized Product Quantization) rotates the vector space before splitting into subspaces, minimizing quantization error. 
This is computationally expensive during training but pays off at query time. - ---- - -## 🔗 See Also - -- [Understanding Quantization](understanding-quantization.md) — Quantization from first principles - -- [Core Concepts](../architecture/core-concepts.md) — HNSW, IVF-PQ, BM25, and SIMD fundamentals - -- [Performance Tuning](../operations/performance-tuning.md) — How to tune nprobe, subspaces, and other parameters - -- [Architecture Overview](../architecture/overview.md) — How Spector's storage layer is designed \ No newline at end of file diff --git a/docs/docs/deep-dives/real-embedding-benchmarks.md b/docs/docs/deep-dives/real-embedding-benchmarks.md deleted file mode 100644 index a24f09d..0000000 --- a/docs/docs/deep-dives/real-embedding-benchmarks.md +++ /dev/null @@ -1,147 +0,0 @@ -# 📊 Large-Scale Real-Embedding & Shard Promotion Benchmarks - -This page presents the exhaustive experimental results and performance characteristics of **SpectorIndex (IVF-HNSW-SVASQ)**. - -To evaluate the system under realistic production workloads, we benchmarked the index using high-dimensional text embeddings from real-world datasets rather than synthetic structureless Gaussian noise. Additionally, we analyzed the performance and recall characteristics of our adaptive shard promotion system at a scale of 100,000 vectors. - ---- - -## 🔬 Experimental Setup & System Context - -All tests were performed locally under standard, repeatable conditions to isolate CPU and JVM execution metrics: - -- **Hardware:** 24-core Intel Core Ultra 9 285K, AVX2 256-bit SIMD instruction extensions. -- **Runtime Environment:** Java 25 (OpenJDK 25.0.1), garbage collection managed via the ZGC (Z Garbage Collector), 12GB allocated heap (`-Xmx12g`). -- **Core Optimization:** Panama Vector API (`jdk.incubator.vector`) enabled via JVM arguments to compile hardware-native SIMD instructions on the fly. 
-- **Embedding Model:** `qwen3-embedding` (4,096 dimensions) via a local GPU-accelerated Ollama inference runner. -- **Dataset (Real-Embedding):** 10,000 diverse sentences sampled from 8 distinct semantic topic categories (quantum mechanics, biotechnology, economics, history, creative arts, cybersecurity, environmental policy, medicine). -- **Queries:** 100 fresh, out-of-distribution sentences sampled from the same topic categories. -- **Ground Truth:** Absolute exact $L2^2$ brute-force top-10 neighbors computed on uncompressed float32 vectors. - ---- - -## 📈 Part 1: Real-Embedding Sweep (4096-dim Qwen3) - -Real-world transformer embeddings naturally cluster into distinct, low-dimensional manifolds. Sentences about quantum mechanics group together; sentences about macroeconomics form another group. SpectorIndex exploits this structured geometry, yielding **near-perfect recall at fraction-of-a-percent partition scans**. - -The sweeps evaluate different `nCentroids` (IVF Voronoi cells) and `nProbe` depths. All measurements represent average search latency and QPS over 500 query iterations. - -### 1. nCentroids = 32 -Vectors are divided into 32 cells (average ~312 vectors per partition). - -| nProbe | % of Index Searched | Avg Latency | p50 (Median) | p99 (Worst Case) | Throughput (QPS) | Recall@10 | Ingest Latency | -| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | -| **4** | 12.5% | 1.167 ms | 1.094 ms | 1.828 ms | **857** | **1.0000** | 555 ms (18,018/s) | -| **8** | 25.0% | 2.237 ms | 2.236 ms | 2.957 ms | **447** | **1.0000** | 541 ms | -| **16** | 50.0% | 4.560 ms | 4.567 ms | 5.443 ms | **219** | **1.0000** | 550 ms | -| **32** | 100.0% | 7.767 ms | 7.781 ms | 8.426 ms | **129** | **1.0000** | 554 ms | - ---- - -### 2. nCentroids = 64 -Vectors are divided into 64 cells (average ~156 vectors per partition). 
- -| nProbe | % of Index Searched | Avg Latency | p50 (Median) | p99 (Worst Case) | Throughput (QPS) | Recall@10 | Ingest Latency | -| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | -| **4** | 6.3% | 0.624 ms | 0.625 ms | 0.923 ms | **1,601** | **1.0000** | 1,012 ms (9,881/s) | -| **8** | 12.5% | 1.168 ms | 1.141 ms | 1.592 ms | **856** | **1.0000** | 1,007 ms | -| **16** | 25.0% | 2.198 ms | 2.233 ms | 2.805 ms | **455** | **1.0000** | 1,007 ms | -| **32** | 50.0% | 4.439 ms | 4.502 ms | 5.118 ms | **225** | **1.0000** | 1,006 ms | -| **64** | 100.0% | 7.921 ms | 7.893 ms | 8.828 ms | **126** | **1.0000** | 1,003 ms | - ---- - -### 3. nCentroids = 128 -Vectors are divided into 128 cells (average ~78 vectors per partition). - -| nProbe | % of Index Searched | Avg Latency | p50 (Median) | p99 (Worst Case) | Throughput (QPS) | Recall@10 | Ingest Latency | -| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | -| **4** | **3.1%** | **0.455 ms** | **0.443 ms** | **0.651 ms** | **2,198** | **0.9980** | 1,965 ms (5,089/s) | -| **8** | **6.3%** | **0.751 ms** | **0.719 ms** | **1.100 ms** | **1,332** | **0.9990** | 1,960 ms | -| **16** | 12.5% | 1.218 ms | 1.152 ms | 1.753 ms | **821** | **1.0000** | 1,970 ms | -| **32** | 25.0% | 2.298 ms | 2.273 ms | 2.856 ms | **435** | **1.0000** | 1,964 ms | -| **64** | 50.0% | 4.475 ms | 4.455 ms | 5.177 ms | **223** | **1.0000** | 1,965 ms | - ---- - -### 4. nCentroids = 256 -Vectors are divided into 256 cells (average ~39 vectors per partition). 
- -| nProbe | % of Index Searched | Avg Latency | p50 (Median) | p99 (Worst Case) | Throughput (QPS) | Recall@10 | Ingest Latency | -| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | -| **4** | **1.56%** | **0.538 ms** | **0.535 ms** | **0.642 ms** | **1,857** | **0.9950** | 3,873 ms (2,582/s) | -| **8** | **3.13%** | **0.690 ms** | **0.676 ms** | **0.997 ms** | **1,450** | **1.0000** | 3,986 ms | -| **16** | 6.25% | 0.957 ms | 0.942 ms | 1.218 ms | **1,045** | **1.0000** | 3,874 ms | -| **32** | 12.50% | 1.468 ms | 1.425 ms | 1.879 ms | **681** | **1.0000** | 3,881 ms | -| **64** | 25.00% | 2.872 ms | 2.836 ms | 3.552 ms | **348** | **1.0000** | 3,897 ms | - -> [!NOTE] -> *Note on nCentroids=256 sweeps:* The data for `nProbe=64` represents a highly comprehensive coverage of large-partition lookups. The 256 centroid partition sweeps show that searching less than **1.6% of the clusters** (nProbe=4) still yields **99.5% recall** at an incredibly low latency of **0.538ms**. - ---- - -### 💡 Structural Recall Analysis: Synthetic vs. Real Data - -The outstanding recall achieved on real text embeddings (99.5% - 100.0% even at highly aggressive probes) highlights a fundamental math concept: **synthetic high-dimensional vectors are a poor model for real-world embeddings.** - -Synthetic high-dimensional data (like random Gaussian distributions) spreads uniformly across the entire hypersphere. There is no topic coherence, no clusters, and no structure. As a result, the true nearest neighbors of a random vector are randomly scattered across the Voronoi partitions of the index, requiring an exhaustive search (`nProbe = ALL`) to get reasonable recall. - -In contrast, real embeddings (e.g., Sentence-BERT, CLIP, Qwen) occupy a much smaller semantic manifold. Vectors corresponding to similar concepts occupy the same spatial coordinate subspaces. The coarse K-Means centroids learn these clusters precisely. 
As a result, the nearest neighbors of a query sentence overwhelmingly reside in the same Voronoi cell as the query or in adjacent cells—achieving near-perfect search quality at extremely low probe depths. - -| Metric | Random Gaussian (128-dim) | Real Qwen3 (4096-dim) | -| :--- | :--- | :--- | -| **Recall@10 (nCentroids=128, nProbe=4)** | 23.40% | **99.80%** (4.3× increase) | -| **Recall@10 (nCentroids=128, nProbe=8)** | 38.20% | **99.90%** (2.6× increase) | -| **Recall@10 (nCentroids=128, nProbe=32)** | 59.20% | **100.00%** (1.7× increase) | - ---- - -## ⚡ Part 2: Shard Promotion Benchmark (100K Scale) - -To evaluate HNSW promotions at scale, we conducted a benchmark at 100,000 vectors comparing exhaustive **Flat Shard mode** (linear SIMD scan over float32 residuals) vs **Promoted HNSW Shard mode** (pre-calibrated 132-bit SVASQ quantized HNSW graph search inside each centroid's shard). - -A total of 32 coarse centroids were used, resulting in an average of 3,125 vectors per shard. The promotion threshold `shardThreshold` was configured to `1,000`, ensuring all 32 partitions promoted to HNSW graphs during ingestion. 
- -### Performance Summary (100K, 128-dim vectors) - -| nProbe | Mode | Avg Latency | p50 (Median) | p99 (Worst Case) | Throughput (QPS) | Recall@10 | Ingestion Rate | -| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | -| **4** | Flat | 0.388 ms | 0.383 ms | 0.671 ms | **2,580** | 0.3260 | 632,911 docs/s | -| **4** | HNSW | 0.418 ms | 0.362 ms | 0.962 ms | **2,392** | 0.3230 | 7,638 docs/s | -| **8** | Flat | 0.717 ms | 0.709 ms | 0.953 ms | **1,394** | 0.5350 | 632,911 docs/s | -| **8** | HNSW | 0.722 ms | 0.694 ms | 1.208 ms | **1,386** | 0.5280 | 7,638 docs/s | -| **16** | Flat | 1.462 ms | 1.462 ms | 1.704 ms | **684** | 0.7760 | 632,911 docs/s | -| **16** | HNSW | 1.719 ms | 1.541 ms | 3.716 ms | **582** | 0.7670 | 7,638 docs/s | -| **32** | Flat | 3.111 ms | 3.077 ms | 3.787 ms | **321** | **1.0000** | 632,911 docs/s | -| **32** | HNSW | **2.892 ms** | **2.724 ms** | **4.934 ms** | **346** | **0.9870** | 7,638 docs/s | - -### 🛠️ Shard Promotion Analysis - -#### 1. Recall Equivalence -The promoted HNSW shards achieve **almost identical recall** to the exhaustive float32 Flat Shards (e.g., `0.9870` HNSW vs `1.0000` Flat at `nProbe = 32`). This confirms that: -- The translation of internal HNSW contiguous graph node indices (`nodeIdx`) to external global `storeIndex` values is correct. -- Forcing `SimilarityFunction.EUCLIDEAN` for all residual operations inside the promoted HNSW index prevents mathematical similarity mismatches with the IVF boundaries. - -#### 2. Trade-Off: Ingestion vs. Search Speed -- **Ingestion:** Flat Shards ingest at an astronomical **632K docs/sec** because adding a vector requires only subtracting the centroid and appending to a float32 array. Quantized HNSW construction ingests at **7.6K docs/sec** because it performs O(N log N) graph traversals and builds indexing structures on heap. -- **Shallow Searches (nProbe <= 16):** Flat Shard mode remains slightly faster for small queries. 
Contiguous SIMD memory scans have zero graph traversal or pointer-chasing overhead, and the hardware prefetcher is highly efficient at low sizes. -- **Deep Searches (nProbe = 32):** Promoted HNSW Shards win at deep lookups (where all centroids are searched), achieving **346 QPS** (2.89ms) vs. **321 QPS** (3.11ms) for Flat mode. As the search space increases, the graph's logarithmic traversal complexity bypasses exhaustive scans. - ---- - -## 🛠️ Tuning Recommendations for SpectorIndex - -Based on the empirical sweeps above, we recommend the following tuning strategies: - -1. **Centroid Count Scaling:** Maintain $C \approx \sqrt{N}$ (e.g., 128 centroids for 10K–50K vectors, ~1,000 centroids for 1M vectors) to balance coarse routing costs and partition sizing. -2. **Real-world Query Probe:** Set `nProbe` between **8 and 16** for real embedding workloads. Unlike synthetic data where nProbe must be large, real embeddings achieve 99.9% - 100% recall with `nProbe = 8`, which cuts search latency and raises query throughput by roughly 1.4–2× compared with `nProbe = 16`. -3. **Adaptive Promotion Boundary:** Use `shardThreshold = 10_000` to promote shards to HNSW. At sizes below 10,000 vectors, contiguous SIMD scans over residuals remain faster than graph traversal. - ---- - -## 🔗 Related Pages - -- [SVASQ Deep Dive](svasq-deep-dive.md) — The mathematics behind FWHT and affine quantization. -- [SpectorIndex Architecture](spector-index-architecture.md) — The multi-level adaptive IVF-HNSW shard strategy. -- [Spector + SVASQ Whitepaper](svasq-spectorindex-whitepaper.md) — Formal academic whitepaper detailing Spector's mathematical properties. -- [Performance Tuning Guide](../operations/performance-tuning.md) — Fine-tuning system, SIMD, and index settings. 
diff --git a/docs/docs/deep-dives/spector-index-architecture.md b/docs/docs/deep-dives/spector-index-architecture.md deleted file mode 100644 index 9e9a3e5..0000000 --- a/docs/docs/deep-dives/spector-index-architecture.md +++ /dev/null @@ -1,280 +0,0 @@ -# 🏛️ SpectorIndex: IVF-HNSW-SVASQ Architecture - -> **The flagship adaptive vector index of the Spector search engine.** SpectorIndex combines Inverted File partitioning, Hierarchical Navigable Small World graphs, and SVASQ residual quantization into a single index that scales from 10K to millions of vectors with excellent recall, fast ingestion, and minimal memory. - ---- - -## 🎯 Design Goals - -SpectorIndex was designed to solve the fundamental limitations of standalone HNSW: - -| Problem with HNSW | SpectorIndex Solution | -|-------------------|-----------------------| -| Slow ingestion (O(n log n)) | IVF partitioning + flat buffer → **100K+ docs/s** | -| High memory (graph edges) | SVASQ INT8 residuals → **4× compression** | -| Doesn't scale past ~10M | IVF coarse search → only probe relevant partitions | -| No compression | SVASQ with FWHT → near-lossless INT8 | - ---- - -## 🏗️ Three-Layer Architecture - -SpectorIndex combines three orthogonal techniques: - -```mermaid -graph TD - subgraph "Layer 1: IVF — Coarse Partitioning" - Q["🔍 Query"] --> KM["K-Means\nFind nProbe\nclosest centroids"] - KM --> S1["Shard 1"] - KM --> S2["Shard 2"] - KM --> S3["Shard N"] - end - - subgraph "Layer 2: Adaptive Shards" - S1 --> F1["< 20K vectors:\nExact Flat Scan\n(SIMD, zero GC)"] - S2 --> H1["≥ 20K vectors:\nLocal HNSW Graph\n(SVASQ-quantized)"] - S3 --> F2["< 20K vectors:\nExact Flat Scan"] - end - - subgraph "Layer 3: SVASQ Residual Quantization" - H1 --> V1["Residual: r = x - centroid\nFWHT rotation → INT8\n4× compression"] - end - - F1 --> M["Global Merge\n(L2 distance, top-K)"] - H1 --> M - F2 --> M - M --> R["🎯 Results"] -``` - -### Layer 1: IVF (Inverted File) - -K-Means clustering partitions the vector space into 
`nCentroids` Voronoi cells. At query time, only the `nProbe` closest cells are searched, reducing the effective search space by `nCentroids / nProbe`. - -### Layer 2: Adaptive Shards - -Each Voronoi cell contains a **SpectorShard** — an adaptive data structure that operates in one of two modes: - -- **Flat mode** (size < `shardThreshold`): Stores float32 residuals in a contiguous buffer. Search is an exact SIMD scan — faster than HNSW for small partitions because there's no pointer-chasing overhead. -- **HNSW mode** (size ≥ `shardThreshold`): A local SVASQ-quantized HNSW graph. The flat buffer is consumed during promotion and released to free heap memory. - -### Layer 3: SVASQ Residual Quantization - -Vectors are stored as **residuals** (`r = x − centroid`), then compressed with SVASQ: -1. Apply FWHT (Fast Walsh-Hadamard Transform) to spread variance -2. Quantize to INT8 with calibrated min/max per dimension -3. Store: `[4-byte L2 norm | D bytes of INT8 codes]` - ---- - -## 🔄 Lifecycle - -### Phase 1: Training - -```java -SpectorIndex index = SpectorIndex.builder() - .dimensions(768) - .nCentroids(256) - .nProbe(16) - .shardThreshold(20_000) - .build(); - -index.train(representativeVectors); // K-Means++ → learn centroids -``` - -Training runs K-Means++ on a representative sample to learn `nCentroids` centroids. This is a one-time operation (typically < 10 seconds for 50K training vectors). - -### Phase 2: Ingestion - -```java -index.add("doc-1", 0, vector); // ~100K-250K docs/s -``` - -For each vector: -1. Find nearest centroid (`KMeans.nearestCentroid`) -2. Compute residual: `r = vector - centroid` -3. Store in the centroid's shard (flat buffer, no graph construction) -4. If shard crosses threshold → automatic promotion to HNSW - -### Phase 3: Search - -```java -ScoredResult[] results = index.search(queryVector, 10); -``` - -1. Find `nProbe` closest centroids to query -2. For each probed centroid: compute residual query `q_res = query - centroid` -3. 
Search each shard with its residual query -4. **Global merge** using L2 distance (translation-invariant) -5. Return top-K - ---- - -## 🔑 Critical Design Decision: L2 for Residual Search - -This is the most important architectural decision in SpectorIndex, and getting it wrong destroys recall. - -### The Problem - -When searching across multiple shards, each shard returns results with scores computed in its own **translated coordinate system** (centered at its centroid). If you use **cosine similarity** on residuals, scores from different shards are **not comparable**: - -``` -Shard A (centroid c_A): cosine(q - c_A, x - c_A) → angle in c_A's space -Shard B (centroid c_B): cosine(q - c_B, y - c_B) → angle in c_B's space -``` - -A score of 0.95 from shard A and 0.93 from shard B might not reflect their true relative similarity to the query. - -### The Solution - -**L2 distance is translation-invariant:** - -$$\|(q - c) - (x - c)\|^2 = \|q - x\|^2$$ - -The centroid cancels out! So L2 on residuals gives the **exact same distance** as L2 on original vectors, regardless of which centroid's shard the vector resides in. This makes cross-shard scores directly comparable. - -> [!IMPORTANT] -> SpectorIndex always uses **EUCLIDEAN distance** internally for residual search and global merge, regardless of the user's configured `similarityFunction`. The user's metric is used only for centroid routing (where it operates in absolute space). This is the same approach used by FAISS's `IndexIVFFlat`. - -### Mathematical Proof - -For any two vectors $q, x$ and any centroid $c$: - -$$\|(q - c) - (x - c)\|^2 = \sum_i ((q_i - c_i) - (x_i - c_i))^2 = \sum_i (q_i - x_i)^2 = \|q - x\|^2$$ - -This identity is exact in real arithmetic. In floating-point arithmetic it holds only up to rounding error, because each residual subtraction ($q_i - c_i$ and $x_i - c_i$) is rounded before the difference is taken; residual-space and original-space distances therefore agree to within float32 rounding rather than bit-for-bit. 
- ---- - -## 🏎️ Performance Characteristics - -### Ingestion: 100K–250K docs/s - -SpectorIndex's ingestion is **28-160× faster** than standalone HNSW because: -- No graph construction during add (flat buffer append) -- Residual computation is O(D) — just subtraction -- Memory-mapped flat arrays with sequential writes -- Graph construction is deferred until shard promotion - -### Search: Sub-millisecond at Optimal Config - -**Real embeddings (Qwen3-embedding, 4096-dim, 10K vectors):** - -| nCentroids | nProbe | % Searched | Latency | QPS | Recall@10 | -|------------|--------|-----------|---------|-----|-----------| -| **128** | **4** | **3.1%** | **0.46ms** | **2,173** | **1.0000** | -| 128 | 8 | 6.3% | 0.73ms | 1,368 | 1.0000 | -| 64 | 4 | 6.3% | 0.62ms | 1,601 | 1.0000 | -| 64 | 8 | 12.5% | 1.17ms | 856 | 1.0000 | -| 32 | 4 | 12.5% | 1.17ms | 857 | 1.0000 | - -> [!TIP] -> With real embeddings, even `nProbe=4` at 128 centroids gives **perfect recall** while searching only 3.1% of the data. Real embeddings have natural cluster structure that IVF exploits beautifully. - -### Memory: 4× Compression with SVASQ - -After shard promotion, SVASQ quantization compresses stored residuals to ~(D + 4) bytes per vector — approximately 4× compression versus float32. - ---- - -## ⚙️ Configuration Guide - -### Centroid Count (`nCentroids`) - -The number of IVF partitions. More centroids = finer partitioning = better recall at low nProbe, but slower training. - -| Dataset Size | Recommended `nCentroids` | -|-------------|-------------------------| -| 10K–50K | 32–64 | -| 50K–500K | 64–256 | -| 500K–5M | 256–1024 | -| 5M–50M | 1024–4096 | - -**Rule of thumb:** `nCentroids ≈ √N` (square root of dataset size). - -### Probe Count (`nProbe`) - -How many centroids to search at query time. The primary recall/speed knob. 
- -| `nProbe` | Recall | Speed | Use Case | -|---------|--------|-------|----------| -| 4 | ~30% | ⚡⚡⚡ | Filtering, not primary search | -| 16 | ~77% | ⚡⚡ | Fast approximate search | -| 32 | ~90% | ⚡ | Balanced | -| 64+ | ~95%+ | 🐌 | High-recall requirements | - -> [!TIP] -> With **real (structured) embeddings**, recall at any given nProbe is significantly higher than with random data. Expect 90%+ recall at `nProbe=16` with production embedding models. - -### Shard Threshold (`shardThreshold`) - -When a shard's size reaches this threshold, it promotes from flat scan to HNSW. - -- **Default: 20,000** — optimal for most workloads -- Lower values: earlier promotion, higher memory usage, potentially faster search in large shards -- Higher values: longer flat scan period, lower memory, simpler data path - -### Oversampling Factor (`oversamplingFactor`) - -After HNSW promotion, the number of candidates retrieved per shard is `k × oversamplingFactor`. Higher values improve recall at the cost of more candidates to merge. - -- **Default: 3** — retrieves 30 candidates per shard for top-10 queries -- Increase to 5-10 if recall is insufficient - ---- - -## 🔬 Adaptive Shard Promotion - -The adaptive shard design is inspired by the observation from the [original research](../../../new-index-research.md): - -> *"Scanning a flat, contiguous MemorySegment of SVASQ vectors using an unrolled 256-bit FMA loop utilizes aggressive CPU pre-fetchers. Panama can evaluate roughly 1,000 vectors in < 1 microsecond."* - -For small partitions (< 20K vectors), a flat SIMD scan over contiguous memory is **5-10× faster** than HNSW pointer-chasing. Only when partitions grow large enough for the O(log n) advantage to kick in does HNSW become worthwhile. 
- -```mermaid -graph LR - Add["add(vector)"] --> Check{"size ≥ threshold?"} - Check -->|No| Flat["Append to\nflat buffer"] - Check -->|Yes| Promote["🔄 Promote"] - Promote --> Cal["Calibrate SVASQ\nfrom flat buffer"] - Cal --> Build["Build HNSW\n(bulk insert)"] - Build --> Free["Free flat buffer\n(reclaim heap)"] -``` - -### Thread Safety During Promotion - -Promotion holds the write-lock exclusively. The sequence ensures correctness: - -1. **In-flight flat scans** complete before promotion runs (they hold read-locks) -2. **New searches** arriving during promotion block on the read-lock -3. After promotion, a `volatile` flag enables a **lock-free fast path** for all subsequent searches -4. The `volatile` write establishes a happens-before edge, guaranteeing the HNSW index is visible to all threads - ---- - -## 🧬 FWHT Order of Operations - -When combining FWHT with IVF, the order matters: - -**Ingestion:** -1. Find nearest centroid `c` (using original vector in absolute space) -2. Compute residual `r = x - c` -3. Apply FWHT to `r` (not to `x` — FWHT before centroid assignment breaks clustering) -4. Quantize to INT8 - -**Search:** -1. Find nProbe closest centroids -2. For each centroid `c`: compute `q_res = q - c` -3. Apply FWHT to `q_res` -4. Pre-multiply scale/offset (SVASQ query pushdown) -5. Scan the shard - ---- - -## 🔗 See Also - -- [Large-Scale Benchmarks](real-embedding-benchmarks.md) — Empirical sweeps for real embeddings and HNSW shard promotions. 
-- [SVASQ Deep Dive](svasq-deep-dive.md) — How SVASQ quantization works in detail -- [HNSW Explained](hnsw-explained.md) — How the graph search algorithm works -- [ANN Search Primer](ann-search-primer.md) — Overview of all ANN algorithm families -- [SVASQ + SpectorIndex Whitepaper](svasq-spectorindex-whitepaper.md) — Academic treatment -- [Performance Tuning](../operations/performance-tuning.md) — Practical tuning advice diff --git a/docs/docs/deep-dives/svasq-deep-dive.md b/docs/docs/deep-dives/svasq-deep-dive.md deleted file mode 100644 index cbaf6fa..0000000 --- a/docs/docs/deep-dives/svasq-deep-dive.md +++ /dev/null @@ -1,359 +0,0 @@ -# 🌀 SpectorQuant — SVASQ (Spector Vector-Aligned Scalar Quantization) - -> **How Spector achieves INT8 precision rivaling INT12–INT16 using the Fast Walsh-Hadamard Transform.** SVASQ is Spector's custom quantization technique that combines mathematical rotation with affine scalar quantization to minimize information loss. This page explains the theory, implementation, and why it outperforms standard scalar quantization. - ---- - -## 🤔 The Problem with Standard Scalar Quantization - -Standard INT8 quantization maps each dimension independently: - -``` -quantized[i] = round(255 × (value[i] - min[i]) / (max[i] - min[i])) -``` - -This works well when all dimensions have similar variance. But real embeddings often have **outlier dimensions** — a few dimensions with much larger ranges than the rest: - -``` -Dim 0: range [-0.05, 0.05] → 255 bins across 0.10 range → precision: 0.0004 -Dim 42: range [-3.50, 3.50] → 255 bins across 7.00 range → precision: 0.0275 -``` - -Dimension 42 has **70× worse precision** than dimension 0. Since distance computation sums all dimensions, these imprecise outlier dimensions dominate the quantization error — dragging down recall. - -> [!NOTE] -> This problem is particularly acute for transformer embeddings (BERT, GPT, etc.), which often have a few "dominant" dimensions with disproportionately large values. 
- ---- - -## 💡 The SVASQ Insight: Rotate First, Then Quantize - -SVASQ solves the outlier problem with a two-step approach: - -1. **Rotate** the vector using a mathematical transform that **spreads variance uniformly** across all dimensions -2. **Quantize** the rotated vector using standard INT8 — now every dimension has similar precision - -The rotation doesn't change any distances (it's an orthogonal transform), but it dramatically improves quantization quality. - -```mermaid -graph LR - V["Raw Vector\n(uneven variance)"] --> FWHT["🌀 FWHT Rotation\n(spread variance)"] - FWHT --> SQ["🔢 INT8 Quantization\n(uniform precision)"] - SQ --> Store["💾 Store\n(4× compressed)"] -``` - ---- - -## 🔬 The Fast Walsh-Hadamard Transform (FWHT) - -### What It Does - -The FWHT is an orthogonal transform (like the Fourier Transform, but using only +1 and -1 instead of complex exponentials). It multiplies each vector by a **Hadamard matrix**: - -$$\hat{x} = H_n \cdot x$$ - -Where $H_n$ is the Hadamard matrix of order $n$ (a power of 2): - -$$H_1 = [1], \quad H_2 = \begin{bmatrix} 1 & 1 \\ 1 & -1 \end{bmatrix}, \quad H_4 = \begin{bmatrix} H_2 & H_2 \\ H_2 & -H_2 \end{bmatrix}$$ - -### Why It Spreads Variance - -Each output dimension of the Hadamard transform is a **sum or difference of all input dimensions** (with alternating signs). If one input dimension has a spike, the Hadamard transform distributes that spike's energy equally across all output dimensions. - -**Before FWHT:** One outlier dimension (dim 42) has 100× the variance of others. - -**After FWHT:** Every output dimension has roughly equal variance — the outlier's energy is smeared uniformly. 
- -### Why It's Fast - -Unlike the FFT (which requires O(n log n) complex multiplications), the FWHT uses only **additions and subtractions** — no multiplications at all: - -```java -// In-place FWHT: O(n log n) additions, zero multiplications -for (int len = 1; len < n; len <<= 1) { - for (int i = 0; i < n; i += len << 1) { - for (int j = i; j < i + len; j++) { - float u = data[j]; - float v = data[j + len]; - data[j] = u + v; // butterfly add - data[j + len] = u - v; // butterfly subtract - } - } -} -``` - -On a modern CPU with SIMD, this processes 128-dim vectors in under **50 nanoseconds**. - -### Key Properties - -| Property | Value | -|----------|-------| -| Complexity | O(n log n) — only additions/subtractions | -| Invertible | Yes — `FWHT(FWHT(x)) = n·x` | -| Orthogonal | Yes, once scaled by `1/√n` — then `‖Hx - Hy‖ = ‖x - y‖`; the unnormalized butterfly above scales every distance by the same factor `√n` | -| Real-valued | Yes — no complex numbers (unlike FFT) | -| Dimension requirement | Power of 2 (pad if needed) | - -> [!IMPORTANT] -> **Distance preservation** is the critical property. Because the normalized Hadamard matrix is orthogonal, `L2(FWHT(x), FWHT(y)) = L2(x, y)` (the unnormalized transform multiplies every distance by the same constant `√n`, which leaves rankings unchanged). This means quantizing in the rotated space doesn't introduce any systematic bias — only the random quantization noise, which is now spread uniformly. - ---- - -## 🏗️ SVASQ Pipeline - -### Ingestion (Encoding) - -For each vector `x`: - -```mermaid -graph LR - X["x (float32)"] --> Pad["Pad to\npower-of-2"] - Pad --> FWHT["FWHT\nrotation"] - FWHT --> Norm["Extract\n‖x̂‖₂ norm"] - FWHT --> Quant["INT8\nquantize"] - Norm --> Store["Store: [norm₃₂ | int8[D]]"] - Quant --> Store -``` - -1. **Pad** the vector to the next power of 2 (e.g., 768 → 1024) -2. **Apply FWHT** — the in-place butterfly transform -3. **Extract and store the L2 norm** of the rotated vector (float32, 4 bytes) -4. **Calibrate** per-dimension min/max from a representative sample -5. **Quantize** each rotated dimension to INT8: `q[i] = round(255 × (x̂[i] - min[i]) / scale[i])` -6. 
**Store** as `[4-byte norm | D bytes of INT8]` - -### Search (Asymmetric Distance Computation) - -```mermaid -graph LR - Q["query (float32)"] --> Pad2["Pad to\npower-of-2"] - Pad2 --> FWHT2["FWHT\nrotation"] - FWHT2 --> Prep["Pre-multiply:\nq̃[i] = (q̂[i] - min[i]) / scale[i]"] - Prep --> Scan["SIMD scan:\ndot(q̃, int8_stored)"] - Scan --> Result["Approximate L2"] -``` - -The key optimization: **query pushdown**. Instead of dequantizing each stored vector, we transform the query into the quantized coordinate system: - -``` -q̃[i] = (q̂[i] - min[i]) / scale[i] -``` - -Then the approximate L2 distance reduces to a simple dot product between the transformed float32 query and the stored INT8 codes — which SIMD can compute at billions of operations per second. - ---- - -## 🧬 Residual SVASQ: The IVF Superpower - -When SVASQ is used inside an IVF index (like SpectorIndex), vectors are quantized as **residuals** — the difference from their assigned centroid: - -$$r = x - c_{\text{nearest}}$$ - -### Why Residuals Matter - -Residual vectors are **much tighter** than absolute vectors: -- **Absolute coordinates** might span [-3.0, 3.0] → 255 INT8 bins cover a range of 6.0 -- **Residual coordinates** span [-0.2, 0.2] → 255 INT8 bins cover a range of 0.4 - -That's a **15× improvement in quantization precision** — the same 8-bit integer now represents a 15× smaller step size. - -> [!TIP] -> **INT8 residual quantization gives the spatial precision of INT12–INT16 absolute quantization.** This is why SpectorIndex achieves excellent recall despite using only 1 byte per dimension. - -### The FWHT Order - -When combining FWHT with IVF residual quantization, the order of operations matters: - -```mermaid -graph LR - X["x"] --> C["Find nearest\ncentroid c"] - C --> R["r = x - c\n(residual)"] - R --> FWHT3["FWHT(r)\n(rotate residual)"] - FWHT3 --> Q["INT8 quantize\n(rotated residual)"] -``` - -**CRITICAL:** Apply FWHT to the **residual**, not to the raw vector. 
Applying FWHT before centroid assignment would break the spatial clustering — the Hadamard transform scrambles the dimensions, making K-Means clusters meaningless. - ---- - -## 📊 SVASQ vs Other Quantization - -| Technique | Compression | Recall@10 | Speed | Notes | -|-----------|------------|-----------|-------|-------| -| Float32 (baseline) | 1× | 100% | ⚡ | Reference | -| **Scalar INT8** | 4× | 95-99% | ⚡⚡ | Simple, good baseline | -| **SVASQ INT8** | ~4× | **97-99.5%** | ⚡⚡ | FWHT rotation removes outlier impact | -| **SVASQ-4 (INT4)** | **6-8×** | **95-99%** | ⚡⚡ | Nibble-packed FWHT + 3× rescore recommended | -| Scalar INT4 | 8× | 85-95% | ⚡⚡ | Aggressive, needs rescore | -| Product Quantization | 32× | 80-92% | ⚡ | Complex, requires training | - -SVASQ achieves the compression of standard INT8 with recall approaching float32 — because the FWHT rotation ensures every dimension contributes equally to the quantized distance. - ---- - -## 🔢 SVASQ-4: INT4 Nibble-Packed Quantization - -SVASQ-4 extends the SVASQ pipeline to 4-bit quantization, achieving **~2× additional compression** over SVASQ-8 (6–8× total vs float32). 
- -### Why It Works - -The FWHT rotation that makes SVASQ-8 work is equally beneficial for INT4: - -- After FWHT, all dimensions contribute equally → INT4 quantization noise is **isotropic** -- With IVF residuals, the tight range means INT4 on residuals ≈ INT6–INT7 on absolute vectors -- 15 quantization levels (vs 255 for INT8) is sufficient for ranking with oversampling rescore - -### Memory Layout - -``` -[float32 normSq (4 bytes)] [INT4 × paddedDim nibble-packed (paddedDim/2 bytes)] -``` - -Two 4-bit values are packed per byte using **offset encoding** (shifting [-7, 7] to [0, 14]): - -``` -byte = (hiNibble << 4) | loNibble -``` - -| Dims | Float32 | SVASQ-8 | SVASQ-4 | SVASQ-4 Compression | -|------|---------|--------|--------|-------------------| -| 384 → 512 | 1,536 B | 516 B | 260 B | **5.9×** | -| 768 → 1024 | 3,072 B | 1,028 B | 516 B | **6.0×** | -| 4096 | 16,384 B | 4,100 B | 2,052 B | **8.0×** | - -### Calibration - -SVASQ-4 uses **tighter clipping** than SVASQ-8 (2.5σ vs 3.0σ) to optimize for 15 quantization levels: - -```java -SvasqParams params = SvasqCalibrator.calibrate4bit(corpus, dimensions, seed); -// params.bitWidth() == 4 -// params.bytesPerVector() == 4 + paddedDim / 2 -``` - -### SIMD Kernel - -The `Svasq4SimdKernel` extracts nibbles via shift+mask in each loop iteration, providing natural instruction-level parallelism: - -```java -// Load VL packed bytes = 2×VL dimensions -ByteVector packed = ByteVector.fromMemorySegment(B_SPECIES, segment, offset, nativeOrder); - -// Extract high nibbles (even dims) and low nibbles (odd dims) -ByteVector hi = packed.lanewise(LSHR, 4).and(0x0F); // → [0, 14] -ByteVector lo = packed.and(0x0F); // → [0, 14] - -// Widen to float32 and FMA with deinterleaved query arrays -accHi = ((FloatVector) hi.castShape(F_SPECIES, 0)).fma(qTildeHi[i], accHi); -accLo = ((FloatVector) lo.castShape(F_SPECIES, 0)).fma(qTildeLo[i], accLo); -``` - -The hi/lo split gives the CPU two independent FMA chains — one for even dimensions 
and one for odd — maximizing pipeline utilization. - -### Usage - -=== "Builder API" - - ```java - SpectorEngine engine = SpectorEngine.builder() - .dimensions(768) - .capacity(500_000) - .svasq4() // SVASQ-4 with default 3× rescore - .build(); - ``` - -=== "Config API" - - ```java - SpectorConfig config = SpectorConfig.DEFAULT - .withDimensions(768) - .withSvasq4(5); // 5× oversampling for higher recall - ``` - -=== "Direct Index API" - - ```java - QuantizedHnswIndex index = QuantizedHnswIndex.svasq4( - 768, 100_000, SimilarityFunction.COSINE, HnswParams.DEFAULT, 3); - ``` - -### Expected Recall - -| Configuration | Recall@10 | Notes | -|--------------|-----------|-------| -| SVASQ-4 (no rescore) | ~95–97% | Direct quantized distance only | -| SVASQ-4 (2× rescore) | ~96–98% | Moderate oversampling | -| **SVASQ-4 (3× rescore)** | **~97–99%** | **Recommended default** | -| SVASQ-4 (5× rescore) | ~98–99% | Higher latency, diminishing returns | -| SVASQ-8 (no rescore) | ~97–99.5% | For comparison | - ---- - -## 💻 Implementation in Spector - -### SvasqCalibrator - -Calibrates min/max statistics per dimension from a representative sample: - -```java -// SVASQ-8 calibration -SvasqParams params8 = SvasqCalibrator.calibrate(flatData, sampleSize, dimensions); - -// SVASQ-4 calibration (tighter clipping for 15 levels) -SvasqParams params4 = SvasqCalibrator.calibrate4bit(flatData, sampleSize, dimensions); -``` - -### SvasqStrategy / Svasq4Strategy - -Encodes vectors and computes asymmetric distances: - -```java -// SVASQ-8 -SvasqStrategy strategy = new SvasqStrategy(params, SimilarityFunction.EUCLIDEAN); - -// SVASQ-4 -Svasq4Strategy strategy4 = new Svasq4Strategy(params4, SimilarityFunction.EUCLIDEAN); - -// Both implement QuantizationStrategy — same API -byte[] encoded = strategy.encode(residualVector); -float dist = strategy.computeDistance(segment, offset, qs); -``` - -### SvasqSimdKernel / Svasq4SimdKernel - -The Panama SIMD kernel that computes SVASQ distances directly 
from off-heap memory: - -```java -// SVASQ-8: Zero-copy INT8 codes from MemorySegment -float l2Dist = SvasqSimdKernel.computeL2(segment, offset, paddedDim, queryState); - -// SVASQ-4: Zero-copy nibble-packed INT4 codes from MemorySegment -float l2Dist4 = Svasq4SimdKernel.computeL2(segment, offset, halfPaddedDim, queryState4); -``` - ---- - -## 📐 Mathematical Proof: Distance Preservation - -For completeness, here's why FWHT preserves L2 distance. - -The Hadamard matrix $H_n$ (normalized by $1/\sqrt{n}$) is orthogonal: $H^T H = I$. - -For any two vectors $x, y$: - -$$\|Hx - Hy\|^2 = (Hx - Hy)^T(Hx - Hy) = (x - y)^T H^T H (x - y) = (x - y)^T(x - y) = \|x - y\|^2$$ - -Therefore: $L2(Hx, Hy) = L2(x, y)$. QED. - -The quantization error is now distributed uniformly across all dimensions (because FWHT spread the variance), so the expected quantization error is **minimized** — this is the optimality condition proven by Lyubarskii & Vershynin (2010) for random orthogonal transforms. - ---- - -## 🔗 See Also - -- [Large-Scale Benchmarks](real-embedding-benchmarks.md) — Empirical sweeps for real embeddings and HNSW shard promotions. 
-- [Roadmap](../roadmap.md) — Future compression improvements (SVASQ-PQ, padding-aware storage, norm f16) -- [Understanding Quantization](understanding-quantization.md) — All quantization techniques compared -- [SpectorIndex Architecture](spector-index-architecture.md) — How SVASQ fits into the IVF-HNSW index -- [SVASQ Whitepaper](svasq-spectorindex-whitepaper.md) — Academic treatment with proofs and benchmarks -- [Quantization Comparison](quantization-comparison.md) — How Spector compares to other engines' quantization diff --git a/docs/docs/deep-dives/svasq-spectorindex-whitepaper.md b/docs/docs/deep-dives/svasq-spectorindex-whitepaper.md deleted file mode 100644 index ae58764..0000000 --- a/docs/docs/deep-dives/svasq-spectorindex-whitepaper.md +++ /dev/null @@ -1,298 +0,0 @@ -# SVASQ + SpectorIndex: A Technical Whitepaper - -> **Vectorized Affine Scalar Quantization with Adaptive IVF-HNSW Indexing for High-Performance Approximate Nearest Neighbor Search** - -*Spector Engine — 2026* - ---- - -## Abstract - -We present **SVASQ** (Vectorized Affine Scalar Quantization), a novel vector compression technique that applies the Fast Walsh-Hadamard Transform (FWHT) to spread dimensional variance before INT8 affine quantization, achieving near-lossless recall with 4× compression. We integrate SVASQ into **SpectorIndex**, an adaptive hybrid index combining Inverted File (IVF) coarse partitioning with per-partition Hierarchical Navigable Small World (HNSW) graphs that automatically promote from exact flat scans to quantized graph search as partitions grow. The system achieves 100K–250K vector ingestions per second (28–160× faster than standalone HNSW), sub-millisecond search latency, and perfect recall at full probe depth, implemented entirely on the JVM using Java 21's Vector API (Project Panama) for SIMD acceleration and off-heap memory for zero-GC search paths. - ---- - -## 1. 
Introduction - -Vector similarity search is the computational backbone of modern AI applications — retrieval-augmented generation (RAG), semantic search, recommendation systems, and multimodal retrieval all depend on finding the K nearest neighbors of a query vector among millions or billions of stored embeddings. - -The fundamental tension in ANN search is the **recall–speed–memory triangle**: -- **HNSW** [[1]](#references) achieves excellent recall (95–99%) with O(log n) search, but suffers from slow O(n log n) construction and high memory consumption (graph edges consume 50–100% of vector storage). -- **IVF** [[2]](#references) enables fast ingestion and cache-friendly search through spatial partitioning, but standalone flat IVF has limited recall at low probe depths. -- **Product Quantization** [[3]](#references) provides aggressive compression (32–96×) but requires expensive codebook training, complex lookup-table-based distance computation, and suffers from significant recall degradation. - -SpectorIndex addresses all three limitations simultaneously by combining the strengths of IVF, HNSW, and a novel quantization approach (SVASQ) that achieves the simplicity and speed of scalar quantization with recall approaching float32 exact search. - ---- - -## 2. SVASQ: Vectorized Affine Scalar Quantization - -### 2.1 The Outlier Problem in Scalar Quantization - -Standard INT8 scalar quantization maps each dimension independently using a linear affine transform: - -$$q_i = \text{round}\left(255 \cdot \frac{x_i - \text{min}_i}{\text{max}_i - \text{min}_i}\right)$$ - -The quantization error per dimension is bounded by $\epsilon_i \leq \frac{\text{max}_i - \text{min}_i}{510}$, which is inversely proportional to the dynamic range. When a small number of dimensions have disproportionately large ranges (common in transformer embeddings [[4]](#references)), the quantization error concentrates in those dimensions, degrading distance approximation quality. 
- -### 2.2 Variance Equalization via FWHT - -SVASQ resolves the outlier problem by applying an orthogonal rotation before quantization. We use the **Fast Walsh-Hadamard Transform** (FWHT), which multiplies the vector by the normalized Hadamard matrix $H_n$: - -$$\hat{x} = \frac{1}{\sqrt{n}} H_n \cdot x$$ - -**Theorem 1 (Distance Preservation).** For any vectors $x, y \in \mathbb{R}^n$ with $\hat{x} = H_n x / \sqrt{n}$ and $\hat{y} = H_n y / \sqrt{n}$: $\|\hat{x} - \hat{y}\| = \|x - y\|$. - -*Proof.* $H_n$ satisfies $H_n^T H_n = nI$ (it is orthogonal up to the scale factor $\sqrt{n}$), so $\|H_n x - H_n y\|^2 = (x-y)^T H_n^T H_n (x-y) = n\|x-y\|^2$. Dividing by $\sqrt{n}$ gives $\|\hat{x} - \hat{y}\| = \|x - y\|$, so the distance is preserved exactly. □ - -**Theorem 2 (Variance Spreading).** Let $x$ be a random vector with covariance $\Sigma$. The Hadamard transform $\hat{x} = H_n x / \sqrt{n}$ has covariance $\hat{\Sigma} = H_n \Sigma H_n^T / n$. If $\Sigma$ has one dominant eigenvalue $\lambda_1 \gg \lambda_i$ for $i > 1$, the diagonal entries of $\hat{\Sigma}$ are approximately equal: $\hat{\Sigma}_{ii} \approx \text{tr}(\Sigma)/n$. - -*Intuition:* Each output dimension of the Hadamard transform is a sum/difference of all input dimensions with alternating signs. A single outlier dimension's variance is distributed across all output dimensions. - -### 2.3 SVASQ Encoding Pipeline - -Given a vector $x \in \mathbb{R}^D$: - -1. **Pad** to $\hat{D}$ = next power of 2 (zero-fill) -2. **FWHT:** $\hat{x} = \text{FWHT}(x)$ — in-place O(D log D) using only additions/subtractions -3. **Extract norm:** $\|x\|_2$ stored as float32 (4 bytes) -4. **Calibrate:** Per-dimension $\text{min}_i, \text{scale}_i$ from a representative sample (one-time) -5. **Quantize:** $q_i = \text{clamp}(\text{round}(255 \cdot (\hat{x}_i - \text{min}_i) / \text{scale}_i), 0, 255)$ -6. **Store:** `[norm_f32 | q_0, q_1, ..., q_{D̂-1}]` — total $\hat{D} + 4$ bytes - -**Storage cost:** For 768-dim vectors: $\hat{D} = 1024$, total = 1028 bytes/vector (vs. 3072 bytes for float32) — **3.0× compression**. 
- -### 2.4 Asymmetric Distance Computation (ADC) - -At query time, we avoid dequantizing stored vectors. Instead, we transform the query into the quantized coordinate system (**query pushdown**): - -$$\tilde{q}_i = \frac{\hat{q}_i - \text{min}_i}{\text{scale}_i}$$ - -The approximate L2² distance reduces to: - -$$\hat{d}(q, x) \approx \sum_i (\tilde{q}_i - q_i)^2 \cdot \text{scale}_i^2$$ - -This is a weighted dot-product between float32 query coefficients and INT8 stored codes, which the Java Vector API (Panama) computes using fused multiply-add SIMD instructions at ~1 billion operations per second. - ---- - -## 3. SpectorIndex: Adaptive IVF-HNSW Architecture - -### 3.1 Two-Level Partitioning - -SpectorIndex organizes vectors in a two-level hierarchy: - -**Level 1 (IVF):** K-Means++ produces $C$ centroids from a training sample. Each vector is assigned to its nearest centroid and stored as a **residual** $r = x - c_{\text{nearest}}$. - -**Level 2 (Adaptive Shards):** Each centroid's partition is a `SpectorShard` operating in one of two modes: - -| Mode | Condition | Search | Memory | -|------|-----------|--------|--------| -| **Flat** | size < $T$ | Exact SIMD scan over float32 residuals | Float32 buffer | -| **HNSW** | size ≥ $T$ | SVASQ-quantized graph traversal | SVASQ codes + graph edges | - -Where $T$ is the `shardThreshold` (default: 20,000). - -### 3.2 Why Flat Scan Beats HNSW for Small Partitions - -Modern SIMD hardware can scan contiguous memory at extraordinary speed. Using the Java Vector API with 256-bit lanes: - -- **Flat scan throughput:** ~1,000 vectors per microsecond (sequential memory access, hardware prefetcher engaged) -- **HNSW graph traversal:** ~10–50 nodes per microsecond (random memory access, L2 cache misses at ~100ns each) - -For partitions of $N < 20{,}000$ vectors, the flat scan completes in $N / 1000 \approx 20\mu s$ — faster than HNSW's $O(\log N)$ graph hops with their cache miss penalties. 
- -### 3.3 Automatic Shard Promotion - -When a shard's flat buffer reaches `shardThreshold`, it automatically promotes to HNSW mode: - -1. **Calibrate SVASQ** from the flat buffer (in-place, single pass) -2. **Build HNSW graph** with pre-calibrated SVASQ strategy (bulk insertion) -3. **Null flat buffer** to reclaim heap memory -4. **Volatile publication** — a `volatile` write to the `promoted` flag establishes a happens-before edge, guaranteeing the HNSW index is visible to all concurrent search threads - -The promotion is performed under an exclusive write-lock. In-flight flat scans hold the read-lock and complete before promotion begins; new searches arriving during promotion block on the read-lock. - -### 3.4 Translation-Invariant Cross-Shard Merge - -**This is the most critical correctness property of the architecture.** - -After searching $k_{\text{probe}}$ shards, the results must be merged into a global top-K. Each shard returns scores computed on **residuals** — vectors translated to different coordinate origins (centroids). For the merge to be correct, scores from different shards must be **comparable**. - -**L2 distance is translation-invariant:** - -$$\|(q - c) - (x - c)\|^2 = \|q - x\|^2$$ - -The centroid $c$ cancels algebraically, so the residual L2 distance equals the original-space L2 distance regardless of which shard the vector resides in. - -**Cosine similarity is NOT translation-invariant:** $\cos(q - c_1, x - c_1) \neq \cos(q - c_2, y - c_2)$ in general. Using cosine for cross-shard merge produces incorrect rankings. - -> **Design rule:** SpectorIndex always uses **EUCLIDEAN distance** internally for residual search and global merge, regardless of the user's configured similarity function. This is consistent with FAISS's `IndexIVFFlat` and the SPANN architecture [[5]](#references). - -### 3.5 ADC for Graph Construction - -When promoting a shard, the HNSW graph must be wired correctly. 
For each new node, the algorithm finds its nearest existing neighbors. We use **Asymmetric Distance Computation (ADC)**: - -- **Incoming vector:** exact float32 residual (treated as a "query") -- **Existing nodes:** already SVASQ-quantized - -The ADC distance between an exact float32 vector and a quantized vector is more accurate than the Symmetric Distance (SDC) between two quantized vectors, producing a higher-quality graph with better recall. - ---- - -## 4. Implementation: Java 21 + Project Panama - -### 4.1 SIMD Distance Kernels - -All distance computations use the Java Vector API (`jdk.incubator.vector`): - -```java -FloatVector va = FloatVector.fromArray(SPECIES, a, offset); -FloatVector vb = FloatVector.fromArray(SPECIES, b, offset); -FloatVector diff = va.sub(vb); -sum = diff.fma(diff, sum); // fused multiply-add -``` - -The JIT compiler maps these to AVX2/AVX-512 instructions, achieving 8–16 float operations per clock cycle. - -### 4.2 Off-Heap Memory - -SVASQ-quantized vectors and HNSW graph edges are stored in Panama `MemorySegment` (off-heap), avoiding GC pressure during search. The `SvasqSimdKernel` reads INT8 codes directly from off-heap memory without any intermediate `byte[]` allocation. - -### 4.3 Zero-GC Flat Scan - -The flat scan uses array-based top-K tracking (parallel `float[]` scores and `int[]` indices) instead of `PriorityQueue`. No per-candidate object allocation occurs during the scan — only the final `ScoredResult[]` is allocated once per search. - -### 4.4 Virtual Thread Compatibility - -All locks use `ReentrantReadWriteLock`, which calls `LockSupport.park()` for blocking. This unmounts (not pins) virtual threads, making SpectorIndex safe for high-concurrency virtual thread workloads on Java 21+. - ---- - -## 5. 
Experimental Results - -### 5.1 L2 vs Cosine Residual Search Comparison - -We validated that L2 residual search produces perfect recall when all centroids are probed, compared to the incorrect use of cosine similarity for cross-shard merge: - -| Dataset | L2 Residual (nProbe=ALL) | Cosine Residual (nProbe=ALL) | -|---------|-----------|--------------------------| -| 10K (32 centroids) | **1.000** | 0.741 | -| 50K (32 centroids) | **1.000** | 0.726 | -| 100K (32 centroids) | **1.000** | 0.714 | - -The 26–29% recall degradation with cosine similarity (growing with dataset size) is caused by its lack of translation invariance — residual distances from different centroid origins are not directly comparable under cosine. - -### 5.2 Ingestion Throughput - -| Dataset Size | SpectorIndex | Standalone HNSW | Speedup | -|-------------|-------------|-----------------|---------| -| 10K | 130K docs/s | 4,677 docs/s | **28×** | -| 50K | 140K docs/s | 2,483 docs/s | **56×** | -| 100K | 150K docs/s | 1,535 docs/s | **98×** | -| 500K | 246K docs/s | — | — | -| 1M | 128K docs/s | — | — | - -### 5.3 Search Latency (128-dim random Gaussian vectors) - -| nProbe | 10K avg | 100K avg | 1M avg | -|--------|---------|----------|--------| -| 4 | 0.07ms | 0.33ms | 0.92ms | -| 8 | 0.08ms | 0.70ms | 2.00ms | -| 16 | 0.14ms | 1.5ms | 3.76ms | -| 32 | 0.29ms | 3.2ms | 7.45ms | -| 64 | — | — | 15.0ms | - -### 5.4 Real-Embedding Validation (Qwen3-embedding, 4096-dim) - -> [!NOTE] -> For the comprehensive, empirical sweeps across multiple coarse partition configurations ($C \in \{32, 64, 128, 256\}$) and deep analyses of HNSW shard promotions, refer to the dedicated [Large-Scale Real-Embedding Benchmarks page](real-embedding-benchmarks.md). - -To validate the architecture with structured data, we embedded 10,000 diverse sentences (8 topic categories) using Qwen3-embedding (4096 dimensions) via local Ollama inference. 
- -**Result: recall@10 = 1.0000 across ALL configurations tested.** - -| nCentroids | nProbe | % Data Searched | Avg Latency | QPS | Recall@10 | -|------------|--------|-----------------|-------------|-----|-----------| -| **128** | **4** | **3.1%** | **0.46ms** | **2,173** | **1.0000** | -| 128 | 8 | 6.3% | 0.73ms | 1,368 | 1.0000 | -| 128 | 16 | 12.5% | 1.26ms | 792 | 1.0000 | -| 64 | 4 | 6.3% | 0.62ms | 1,601 | 1.0000 | -| 64 | 8 | 12.5% | 1.17ms | 856 | 1.0000 | -| 32 | 4 | 12.5% | 1.17ms | 857 | 1.0000 | - -Even at `nProbe=4` with 128 centroids — searching only **3.1% of the data** — recall is perfect. This confirms that real embeddings form tight semantic clusters that IVF captures effectively. The random Gaussian results (Section 5.3) represent the worst-case scenario for IVF, not the typical production workload. - -**Comparison: random vs. real embeddings at nProbe=4, nCentroids=128:** - -| Metric | Random Gaussian (128-dim) | Real Qwen3 (4096-dim) | -|--------|--------------------------|----------------------| -| Recall@10 | 0.234 | **1.000** | -| Latency | 1.05ms | 0.46ms | - -The 4.3× recall improvement and 2.3× latency improvement demonstrate that SpectorIndex is **designed for real workloads**, where data structure is the norm. - ---- - -## 6. Discussion - -### 6.1 Random vs. Structured Data - -Recall at practical nProbe values is lower with random Gaussian vectors than with real embeddings because random high-dimensional data has no natural cluster structure — true nearest neighbors are distributed uniformly across Voronoi cells. Real embedding models (BERT, Sentence-BERT, CLIP, etc.) produce vectors with strong topic-based clustering, where nearest neighbors tend to reside in the same or adjacent IVF cells. 
- -### 6.2 Scaling Analysis - -SpectorIndex's architecture suggests the following scaling behavior: - -- **Memory:** O(D × N) with ~4× compression via SVASQ -- **Ingestion:** O(D × N) — dominated by residual computation and flat buffer appends -- **Search:** O(D × N/C × nProbe) — linear in partition size, controlled by nProbe -- **Optimal centroid count:** C ≈ √N gives the best trade-off between search cost and recall - -### 6.3 Limitations - -1. **Training required:** K-Means training requires a representative sample. For streaming workloads, online centroid updates would be needed. -2. **Static partitioning:** Once centroids are learned, vector distribution changes can cause partition imbalance. Periodic re-training addresses this. -3. **No native deletion:** Removing vectors from HNSW shards is not implemented. A tombstone approach with periodic compaction is recommended. - ---- - -## 7. Related Work - -- **FAISS IndexIVFFlat** [[3]](#references): IVF with flat scan per partition. SpectorIndex adds adaptive HNSW promotion and SVASQ quantization. -- **SPANN** [[5]](#references): Space-Partitioned ANN by Microsoft. Similar IVF + local graph concept; SpectorIndex adds SVASQ and adaptive flat/HNSW shard modes. -- **ScaNN** [[6]](#references): Google's ANN library using anisotropic quantization. SVASQ achieves similar variance equalization via FWHT instead of learned rotations. -- **DiskANN** [[7]](#references): SSD-optimized graph index. SpectorIndex is RAM-optimized with off-heap Panama memory. - ---- - -## 8. 
Conclusion - -SVASQ + SpectorIndex demonstrates that combining three orthogonal techniques — IVF partitioning, adaptive HNSW graphs, and FWHT-rotated scalar quantization — produces a vector index with: - -- **Ingestion speed** rivaling flat arrays (100K+ docs/s) -- **Search recall** approaching exact brute-force (with sufficient nProbe) -- **Memory efficiency** of 4× scalar quantization with near-lossless quality -- **Implementation simplicity** on the JVM without native code or GPU dependencies - -The critical insight that L2 distance must be used for cross-shard merge (due to translation invariance) ensures correct global rankings — a property shared with all production IVF implementations. - ---- - -## References - - - -1. Malkov, Y.A. and Yashunin, D.A. (2018). "Efficient and robust approximate nearest neighbor using Hierarchical Navigable Small World graphs." *IEEE TPAMI*, 42(4), 824-836. - -2. Jégou, H., Douze, M., and Schmid, C. (2011). "Product quantization for nearest neighbor search." *IEEE TPAMI*, 33(1), 117-128. - -3. Johnson, J., Douze, M., and Jégou, H. (2019). "Billion-scale similarity search with GPUs." *IEEE TBD*, 7(2), 535-547. (FAISS) - -4. Kovaleva, O., et al. (2019). "Revealing the Dark Secrets of BERT." *EMNLP 2019*. (Outlier dimensions in transformers) - -5. Chen, Q., et al. (2021). "SPANN: Highly-efficient Billion-scale Approximate Nearest Neighbor Search." *NeurIPS 2021*. - -6. Guo, R., et al. (2020). "Accelerating Large-Scale Inference with Anisotropic Vector Quantization." *ICML 2020*. (ScaNN) - -7. Subramanya, S.J., et al. (2019). "DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node." *NeurIPS 2019*. 
diff --git a/docs/docs/deep-dives/turbo-quant.md b/docs/docs/deep-dives/turbo-quant.md deleted file mode 100644 index a25878e..0000000 --- a/docs/docs/deep-dives/turbo-quant.md +++ /dev/null @@ -1,209 +0,0 @@ -# ⚡ TurboQuant: Near-Optimal Vector Quantization - -> **8× compression with ~97%+ recall — no heavy training required.** TurboQuant applies a random orthogonal rotation before scalar quantization, making per-coordinate quantization near-optimal for any data distribution. - ---- - -## 🧠 How It Works - -TurboQuant is a two-step quantization scheme: - -```mermaid -flowchart LR - A["📄 Float32 Vector
(384 dims × 4 bytes = 1536B)"] --> B["🔄 Random Rotation
Orthogonal matrix × vector
SIMD-accelerated"] - B --> C["📊 Scalar Quantization
Per-coordinate to 4 bits
Nibble-packed"] - C --> D["💾 Stored
(384 dims × 0.5 bytes = 192B)
8× compression"] -``` - -### Step 1: Random Orthogonal Rotation - -A fixed random orthogonal matrix R is applied to every vector before quantization. This: - - -- **Isotropizes** the distribution — coordinates become near-independent - -- **Spreads information** uniformly across all dimensions - -- **Preserves distances** — orthogonal transforms don't change L2/cosine/IP - -The rotation matrix is generated once at calibration time from a deterministic seed. - -### Step 2: Per-Coordinate Scalar Quantization - -After rotation, each coordinate is quantized independently using linear min/max scaling to 4-bit values [0, 15]. Because the rotation made coordinates near-independent and uniformly distributed, this simple scalar quantization achieves near-optimal distortion rates. - ---- - -## 📊 Comparison with Other Quantization Methods - -| Method | Compression | Recall@10 | Training | SIMD-Friendly | -|--------|-------------|-----------|----------|---------------| -| Float32 (none) | 1× | 100% | None | ✅ | -| Scalar INT8 | 4× | ~99.5% | Min/max calibration | ✅ | -| **TurboQuant (4-bit)** | **8×** | **~97%+** | **Rotation + min/max** | **✅** | -| Scalar INT4 | 8× | ~93% | Quantile calibration | ✅ | -| Product Quantization | 32× | ~95% | K-Means (expensive) | ❌ | -| Scalar INT2 | 16× | ~88% | Quantile calibration | ✅ | - -### Key Advantages over Standard SQ4 - -Standard INT4 quantization has uneven distortion because embedding dimensions are correlated and non-uniform. 
TurboQuant's rotation decorrelates them first, resulting in: - - -- **4-5% higher recall** at the same bit budget - -- **No quantile training** needed (just min/max in rotated space) - -- **Better theoretical guarantees** (matches rate-distortion bounds) - -### Key Advantages over Product Quantization - - -- **No K-Means training** — PQ requires expensive clustering; TurboQuant is data-oblivious - -- **Simpler implementation** — No codebooks, no ADC lookup tables - -- **SIMD-friendly** — Packed 4-bit values use the same NibblePacker as standard SQ4 - -- **Lower latency** — Direct scalar operations vs. table lookups - ---- - -## 🚀 SIMD-Accelerated Implementation - -The rotation (the most expensive step) uses the Java Vector API for hardware acceleration: - -```java -// Inner dot product uses SIMD fused-multiply-add -FloatVector mv = FloatVector.fromArray(SPECIES, matrixRow, j); -FloatVector vv = FloatVector.fromArray(SPECIES, vector, j); -acc = mv.fma(vv, acc); // acc += mv * vv (single instruction) -``` - -### Memory Layout Optimizations - -| Optimization | Purpose | -|--------------|---------| -| Flat 1D array (not `float[][]`) | Sequential memory access, no pointer chasing | -| Pre-transposed matrix for inverse | Cache-friendly row access during decode | -| `System.arraycopy` for bulk ops | JVM intrinsic, bypasses bounds checks | -| SIMD dot products in Gram-Schmidt | Faster calibration (one-time cost) | - -### Performance Characteristics - -| Operation | Complexity | SIMD Speedup | -|-----------|-----------|--------------| -| Rotation (384-dim) | O(n²) = 147K muls | ~4-8× via FMA lanes | -| Scalar quantize | O(n) = 384 ops | Negligible cost | -| Pack to nibbles | O(n) = 192 bytes | Memory-bound | -| Distance computation | O(n) per vector | Same as scalar | - -> [!NOTE] -> For 384-dim vectors, rotation takes ~20µs on modern hardware with AVX2. This is amortized across thousands of distance computations in a search query. 
- ---- - -## 💻 Usage - -### Calibration - -```java -// Calibrate from a representative sample (100+ vectors recommended) -float[][] sampleVectors = loadSampleVectors(); -TurboQuantizer tq = TurboQuantizer.calibrate(sampleVectors, 384, 4, 42L); -// samples dims bits seed -``` - -The calibration: -1. Generates a random orthogonal matrix from the seed -2. Rotates all sample vectors -3. Computes per-dimension min/max in the rotated space (with 5% margin) - -### Encoding & Decoding - -```java -// Encode a vector -TurboQuantizer.TurboCode code = tq.encode(vector); -// code.packed() → 192 bytes (384 dims × 4 bits / 8) -// code.norm() → original L2 norm (for cosine/IP reconstruction) - -// Decode (approximate reconstruction) -float[] reconstructed = tq.decode(code); -``` - -### Distance Computation - -```java -// Approximate distances in quantized space -float l2dist = tq.approximateL2Distance(queryVector, code); -float ip = tq.approximateInnerProduct(queryVector, code); -float cosine = tq.approximateCosineSimilarity(queryVector, code); -``` - -### Batch-Optimized Search - -```java -// Rotate query once, then scan many database vectors -float[] rotatedQuery = tq.rotateQuery(queryVector); - -for (byte[] dbVector : database) { - float dist = tq.distanceFromRotatedQuery(rotatedQuery, dbVector); - // ... -} -``` - -### With QuantizedVectorStore - -```java -// Create a TurboQuant-backed store -var store = new QuantizedVectorStore(384, 100_000, turboQuantizer); - -// Store vectors (automatically rotated + quantized) -store.put("doc-1", embedding); - -// Retrieve (automatically dequantized + inverse-rotated) -float[] approx = store.getFloat(0); -``` - -### With SpectorEngine - -```java -SpectorEngine engine = SpectorEngine.builder() - .dimensions(384) - .quantization(QuantizationType.TURBO_QUANT) - .build(); -``` - ---- - -## 🔬 Mathematical Foundation - -TurboQuant is based on the observation that for a random orthogonal rotation R: - -1. 
If x has any distribution, then Rx has coordinates that are **near-independent** -2. For near-independent coordinates, **per-coordinate scalar quantization** achieves the **rate-distortion bound** -3. The rotation preserves all geometric relationships (L2, cosine, IP) - -This means: - -- **MSE** between original and reconstructed vectors is minimized - -- **Inner product estimation** is near-unbiased - -- **Nearest-neighbor search** quality matches the information-theoretic optimum for the given bit budget - -> [!TIP] -> For most use cases, 4-bit TurboQuant is the sweet spot: 8× compression with recall loss under 3%. Use 8-bit for maximum quality (4× compression, <0.5% loss) or 2-bit for extreme compression (16×, ~8% loss). - ---- - -## 🔗 See Also - - -- [Understanding Quantization](understanding-quantization.md) — Quantization theory and tradeoffs - -- [Quantization Comparison](quantization-comparison.md) — Benchmarks across all modes - -- [Architecture Overview](../architecture/overview.md) — How quantization fits in the stack - -- [Configuration Guide](../configuration/parameters.md) — Setting quantization parameters diff --git a/docs/docs/deep-dives/understanding-quantization.md b/docs/docs/deep-dives/understanding-quantization.md deleted file mode 100644 index 26ce823..0000000 --- a/docs/docs/deep-dives/understanding-quantization.md +++ /dev/null @@ -1,563 +0,0 @@ -# 🗜️ Understanding Quantization - -> **How search engines compress vectors to fit billions of embeddings in memory.** This page explains vector quantization from first principles — what it is, why it matters, and how different techniques trade off accuracy for efficiency. - ---- - -## 🤔 What is Quantization? - -Think of quantization like compressing a photo. A RAW image from your camera might be 25 MB — full precision, every detail preserved. Save it as JPEG and it drops to 2 MB. You lose some information, but for most purposes the image looks identical. 
- -Vector quantization does the same thing for embeddings. When a machine learning model encodes text or images, it produces a **vector** — a list of numbers (typically 128–1536 floating-point values) that captures meaning. These vectors are precise, but they're also *big*. - -``` -"The quick brown fox" → [0.0234, -0.1567, 0.4521, ..., 0.0891] - ↑ 384 float32 values = 1,536 bytes per vector -``` - -Quantization reduces the precision of each number — or replaces groups of numbers with compact codes — so vectors take less space while still being "close enough" for similarity search. - -> [!NOTE] -> **Quantization ≠ dimensionality reduction.** Dimensionality reduction (like PCA) removes dimensions entirely. Quantization keeps all dimensions but reduces the *precision* of each value. - ---- - -## 💰 Why Compress Vectors? - -Let's do the math. A typical embedding model produces 384-dimensional vectors in float32: - -``` -1 vector = 384 dimensions × 4 bytes = 1,536 bytes -``` - -Now scale that up: - -| Dataset Size | Memory (float32) | With 4× Compression | With 32× Compression | -|-------------|-----------------|---------------------|----------------------| -| 100K vectors | 146 MB | 37 MB | 4.6 MB | -| 1M vectors | **1.5 GB** | 375 MB | 46 MB | -| 10M vectors | **15 GB** | 3.7 GB | 460 MB | -| 100M vectors | **150 GB** | 37 GB | 4.6 GB | -| 1B vectors | **1.5 TB** | 375 GB | **46 GB** | - -At billion scale, full-precision vectors require **1.5 terabytes** of RAM — far beyond what typical servers provide. With 32× compression (Product Quantization), that same dataset fits in 46 GB — a single machine with a decent memory budget. - -> [!TIP] -> Even at smaller scales, compression matters. Less memory means better cache utilization, which means faster search. A 4× compressed index that fits in L3 cache will outperform a full-precision index that spills to RAM. 
- ---- - -## 📊 Types of Quantization - -### 🔢 Scalar Quantization (INT8) - -The simplest approach: map each float32 value to an int8 (8-bit integer). This gives exactly **4× compression** — every 4-byte float becomes a 1-byte integer. - -#### How It Works - -For each dimension, find the min and max values across all vectors, then linearly scale every float into the [0, 255] range: - -``` -quantized_value = round(255 × (value - min) / (max - min)) -``` - -To reconstruct (dequantize): - -``` -reconstructed = min + quantized_value × (max - min) / 255 -``` - -#### Properties - -| Metric | Value | -|--------|-------| -| Compression ratio | **4×** | -| Recall@10 | **≥ 95%** (often 98%+) | -| Speed impact | Faster (smaller data = better cache) | -| Complexity | Very low — simple min/max scaling | -| Calibration | Linear (min/max per dimension) | - -> [!TIP] -> Scalar INT8 is the "safe default" — you get meaningful memory savings with almost no recall loss. Start here unless you need more aggressive compression. - ---- - -### 🔢 Scalar Quantization (INT4) — Non-Uniform - -INT4 quantization maps each float32 value to a 4-bit integer (0–15), packed two values per byte. This gives **8× compression**. Unlike INT8's linear mapping, INT4 uses **non-uniform (quantile-based) calibration** to better preserve the data distribution. - -#### How It Works - -1. **Calibration:** Compute quantile-based boundaries per dimension from a representative sample. This creates 16 non-uniformly spaced buckets that match the actual data distribution. -2. **Encoding:** Assign each dimension value to the nearest boundary interval (0–15). -3. **Packing:** Store two 4-bit values per byte (nibble packing) — first value in bits 7–4, second in bits 3–0. 
- -``` -Original: [0.23, -0.45, 0.67, 0.01] -Encoded: [9, 2, 14, 7] (4-bit level per dimension) -Packed: [0x92, 0xE7] (two nibbles per byte → 50% storage) -``` - -#### Properties - -| Metric | Value | -|--------|-------| -| Compression ratio | **8×** | -| Recall@10 | **85–95%** (with rescore) | -| Speed impact | Fast — SIMD-accelerated packed dot product | -| Complexity | Medium — requires calibration on representative data | -| Calibration | Non-uniform (quantile-based boundaries per dimension) | -| Rescore default | 3× oversampling | - -> [!TIP] -> INT4 hits the sweet spot between INT8 and IVF-PQ: **8× compression with 85–95% recall** when paired with the configurable rescore strategy. Ideal for 10M–100M vector workloads that can't afford full PQ training complexity. - ---- - -### 🔢 Scalar Quantization (INT2) — Non-Uniform - -INT2 quantization maps each float32 value to a 2-bit integer (0–3), packed four values per byte. This gives **16× compression** — the most aggressive scalar quantization before going to binary. - -#### How It Works - -1. **Calibration:** Same quantile-based approach as INT4, but with only 4 buckets per dimension. -2. **Encoding:** Assign each dimension value to one of 4 levels. -3. **Packing:** Store four 2-bit values per byte (crumb packing) — values stored in bits 7–6, 5–4, 3–2, 1–0. - -``` -Original: [0.23, -0.45, 0.67, 0.01] -Encoded: [2, 0, 3, 1] (2-bit level per dimension) -Packed: [0x8D] (four crumbs per byte → 75% storage reduction) -``` - -#### Properties - -| Metric | Value | -|--------|-------| -| Compression ratio | **16×** | -| Recall@10 | **75–90%** (with rescore) | -| Speed impact | Fastest scalar — minimal data to scan | -| Complexity | Medium — same calibration as INT4 | -| Calibration | Non-uniform (quantile-based boundaries per dimension) | -| Rescore default | 5× oversampling | - -> [!IMPORTANT] -> INT2 is aggressive — only 4 levels per dimension. 
The higher default oversampling (5×) compensates by rescoring more candidates with exact float32 distances. Best suited for memory-constrained environments where you accept some recall trade-off. - ---- - -### 🔲 Binary Quantization (1-bit) - -The most aggressive approach: each float becomes a single bit — 0 if negative, 1 if positive. This gives **32× compression** (32 float32 values → 32 bits = 4 bytes). - -#### How It Works - -``` -bit = 1 if value > 0, else 0 -``` - -A 384-dimensional vector becomes just 384 bits = **48 bytes** (down from 1,536 bytes). - -```mermaid -graph LR - subgraph "Original floats" - V1["0.23"] - V2["-0.45"] - V3["0.01"] - V4["-0.89"] - V5["0.67"] - V6["-0.12"] - V7["0.34"] - V8["-0.56"] - end - - subgraph "Binary (sign bit)" - B1["1"] - B2["0"] - B3["1"] - B4["0"] - B5["1"] - B6["0"] - B7["1"] - B8["0"] - end - - V1 --> B1 - V2 --> B2 - V3 --> B3 - V4 --> B4 - V5 --> B5 - V6 --> B6 - V7 --> B7 - V8 --> B8 -``` - -#### Hamming Distance - -With binary vectors, similarity is measured using **Hamming distance** — just count how many bits differ. Modern CPUs have a hardware `POPCNT` instruction that makes this blazing fast: - -``` -vector_a = 10101010 -vector_b = 10100110 -XOR = 00001100 → popcount = 2 (Hamming distance = 2) -``` - -#### Properties - -| Metric | Value | -|--------|-------| -| Compression ratio | **32×** | -| Recall@10 | **60–80%** (varies by dataset) | -| Speed impact | Extremely fast (bitwise ops + POPCNT) | -| Complexity | Trivial — just sign extraction | - -> [!IMPORTANT] -> Binary quantization loses significant information. It works best with **high-dimensional embeddings** (768+) where the sign pattern alone carries meaning. For 384-dim or lower, expect noticeable recall degradation. Always pair with rescoring (recompute exact distance on top candidates). 
- ---- - -### 🧩 Product Quantization (PQ) - -Product Quantization is the "sweet spot" — achieving **32× compression** while maintaining much higher recall than binary quantization. It's more complex, but the idea is elegant. - -#### The Core Idea - -Instead of compressing each number independently, PQ groups dimensions into **subspaces** and finds the best approximation for each group from a learned codebook. - -#### Step by Step - -```mermaid -graph TD - subgraph "Step 1: Split vector into subspaces" - V["384-dim vector"] - V --> S1["Subspace 1
dims 0-23"] - V --> S2["Subspace 2
dims 24-47"] - V --> S3["Subspace 3
dims 48-71"] - V --> SD["..."] - V --> S16["Subspace 16
dims 360-383"] - end - - subgraph "Step 2: Quantize each subspace" - S1 --> C1["Centroid ID: 42"] - S2 --> C2["Centroid ID: 187"] - S3 --> C3["Centroid ID: 3"] - SD --> CD["..."] - S16 --> C16["Centroid ID: 201"] - end - - subgraph "Step 3: Store compact code" - C1 --> Code["[42, 187, 3, ..., 201]
16 bytes total"] - end -``` - -**Training phase:** -1. Split all vectors into M subspaces (e.g., 16 subspaces of 24 dims each) -2. Run K-Means clustering on each subspace independently (K=256 centroids) -3. Store the 16 codebooks (256 centroids × 24 dims × 4 bytes each) - -**Encoding phase:** -1. For each vector, split into M subspaces -2. Find the nearest centroid in each subspace's codebook -3. Store M centroid indices (1 byte each) → **M bytes per vector** - -**Search phase (Asymmetric Distance Computation):** -1. Compute distances from the *full-precision query* to all 256 centroids in each subspace → 256 × M lookup table -2. For each stored code, sum up M table lookups → approximate distance -3. Return top candidates (optionally rescore with full vectors) - -#### Properties - -| Metric | Value | -|--------|-------| -| Compression ratio | **32×** (with 48 subspaces) to **96×** (with 16 subspaces) for 384-dim float32 vectors | -| Recall@10 | **80–92%** (depends on subspaces and dataset) | -| Speed impact | Fast — table lookups instead of floating-point math | -| Complexity | High — requires training codebooks on representative data | - -> [!NOTE] -> The "product" in Product Quantization refers to the Cartesian product of subspace codebooks. Each subspace is quantized independently, and the full approximation is the product of these independent approximations. - ---- - -### 📂 IVF-PQ (Inverted File + Product Quantization) - -IVF-PQ combines two techniques for maximum efficiency at billion scale: - -1. **IVF (Inverted File):** Partition vectors into clusters using K-Means. At search time, only examine the nearest clusters. -2. **PQ (Product Quantization):** Compress vectors within each cluster. - -#### Two-Level Search - -```mermaid -graph TD - Q["Query vector"] --> Coarse["Stage 1: Coarse Search
Find nprobe nearest clusters"] - - Coarse --> P1["Partition 1
50K PQ codes"] - Coarse --> P2["Partition 2
50K PQ codes"] - Coarse --> P3["Partition 3
50K PQ codes"] - - P1 --> Fine["Stage 2: Fine Search
ADC distance on PQ codes"] - P2 --> Fine - P3 --> Fine - - Fine --> Results["Top-K results
(optionally rescore)"] -``` - -**How it reduces work:** - -- With 1000 partitions and `nprobe=10`, you only examine **1% of the dataset** - -- Within those partitions, PQ codes are tiny, so scanning is cache-friendly - -- Combined effect: search billions of vectors in milliseconds - -#### Properties - -| Metric | Value | -|--------|-------| -| Compression ratio | **32×** (same as PQ) | -| Recall@10 | **75–90%** (depends on nprobe and PQ settings) | -| Speed | Very fast — coarse filtering + compressed scan | -| Scale | **Billions of vectors** on a single node | -| Complexity | Requires training (K-Means for partitions + PQ codebooks) | - -> [!TIP] -> **Tuning `nprobe`** is the key recall/speed knob. Higher nprobe = more partitions searched = higher recall but slower queries. Start with nprobe=10 and increase until you hit your recall target. - ---- - -## 📋 Comparison Table - -| Method | Compression | Recall@10 | Speed | Memory (1B × 384d) | Best For | -|--------|------------|-----------|-------|---------------------|----------| -| **Scalar INT8** | 4× | 95–99% | ⚡ Fast | 375 GB | High-recall, moderate scale | -| **Scalar INT4** | 8× | 85–95% | ⚡ Fast | 188 GB | Balanced compression/recall | -| **Scalar INT2** | 16× | 75–90% | ⚡⚡ Very Fast | 94 GB | Memory-constrained, pre-filter | -| **Binary (1-bit)** | 32× | 60–80% | ⚡⚡ Fastest | 46 GB | First-pass candidate generation | -| **Product Quantization** | 32–96× | 80–92% | ⚡ Fast | 46 GB (32×) | Large-scale with good recall | -| **IVF-PQ** | 32–96× | 75–90% | ⚡⚡ Very Fast | 46 GB (32×) | Billion-scale, balanced | - ---- - -## 🎯 What Spector Uses - -Spector provides a full spectrum of scalar quantization plus IVF-PQ, covering every memory/recall trade-off: - -### Scalar INT8 — For High-Recall Scenarios - -When recall is critical (search quality matters more than memory), Scalar INT8 delivers: - -- **4× compression** with nearly lossless quality (≥ 95% recall) - -- Simple min/max calibration — no training phase needed - 
-- SIMD-friendly — int8 operations parallelize beautifully on modern CPUs - -- Ideal for datasets up to ~50M vectors on a 64 GB machine - -### Scalar INT4 — The Balanced Sweet Spot - -When you need more compression than INT8 but don't want the complexity of PQ: - -- **8× compression** with **85–95% recall** when paired with rescore - -- Non-uniform (quantile-based) calibration adapts to your data distribution - -- Nibble-packed storage (2 values/byte) with SIMD-accelerated distance - -- Default 3× oversampling rescore recovers recall lost to quantization - -- Ideal for 10M–100M vector workloads on moderate hardware - -### Scalar INT2 — Maximum Scalar Compression - -When memory is the primary constraint and you can tolerate some recall loss: - -- **16× compression** — just 4 levels per dimension - -- Same quantile-based calibration as INT4 for optimal bucket placement - -- Crumb-packed storage (4 values/byte) for minimal memory footprint - -- Default 5× oversampling rescore compensates for aggressive quantization - -- Ideal for memory-constrained environments or as a fast pre-filter - -### IVF-PQ — For Billion-Scale Scenarios - -When you need to search billions of vectors on commodity hardware: - -- **32× compression** brings 1B vectors down to ~46 GB - -- Two-level search (coarse IVF + fine PQ) keeps latency low - -- Trained codebooks preserve more information than binary quantization - -- `nprobe` parameter lets you dial recall vs. speed at query time - -### Configurable Rescore Strategy - -All quantization modes support an **oversampling-based rescore** to recover recall: -1. Retrieve `oversamplingFactor × k` candidates using fast quantized distance -2. Recompute exact float32 distances for those candidates -3. 
Return the true top-K based on exact scores - -| Quantization | Default Oversampling | Effective Recall | -|-------------|---------------------|-----------------| -| INT8 | 1 (no rescore) | 95–99% | -| INT4 | 3× | 85–95% | -| INT2 | 5× | 75–90% | - -> [!NOTE] -> Set oversampling to 1 to disable rescore entirely (faster but lower recall). GPU acceleration for INT4/INT2 requires dimensions to be a multiple of 32; otherwise Spector automatically falls back to CPU/SIMD. - -### The Full Spectrum - -```mermaid -graph LR - subgraph "Spector Coverage" - INT8["Scalar INT8
4× compression
95-99% recall"] - INT4["Scalar INT4
8× compression
85-95% recall"] - INT2["Scalar INT2
16× compression
75-90% recall"] - IVFPQ["IVF-PQ
32× compression
75-90% recall"] - end - - INT8 ---|"Need more compression"| INT4 - INT4 ---|"Need more compression"| INT2 - INT2 ---|"Billion scale"| IVFPQ - - Small["1M-50M vectors
Quality-first"] --> INT8 - Medium["10M-100M vectors
Balanced"] --> INT4 - Constrained["Memory-limited
50M-500M"] --> INT2 - Large["100M-1B+ vectors
Scale-first"] --> IVFPQ -``` - ---- - -## 💻 Code Examples - -### Configuring Scalar INT8 Quantization - -```java -// Scalar INT8 — 4× compression, near-lossless recall -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(10_000_000) // 10M vectors - .withQuantization(QuantizationType.SCALAR_INT8); - -try (var engine = new SpectorEngine(config)) { - // Ingest as normal — quantization happens automatically - engine.ingest("doc-1", "Vector search fundamentals", embedding); - - // Search uses quantized vectors for distance computation - // Recall remains ≥ 95% with 4× less memory - var results = engine.hybridSearch("vector compression", queryVector, 10); -} -``` - -### Configuring Scalar INT4 (Non-Uniform, with Rescore) - -```java -// Scalar INT4 — 8× compression, non-uniform calibration, rescore for recall -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(50_000_000) // 50M vectors - .withQuantization(QuantizationType.SCALAR_INT4) - .withRescore(3); // 3× oversampling (default for INT4) - -try (var engine = new SpectorEngine(config)) { - // Calibration happens automatically from ingested vectors - engine.ingestBulk(vectors); - - // Search: fast quantized distance → rescore top candidates with exact float32 - // Effective recall: 85–95% - var results = engine.vectorSearch(queryVector, 10); -} -``` - -### Configuring Scalar INT2 (Maximum Compression) - -```java -// Scalar INT2 — 16× compression, aggressive but memory-efficient -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(100_000_000) // 100M vectors - .withQuantization(QuantizationType.SCALAR_INT2) - .withRescore(5); // 5× oversampling (default for INT2) - -try (var engine = new SpectorEngine(config)) { - engine.ingestBulk(vectors); - - // 16× less memory than float32 — fits large datasets in RAM - // Rescore compensates for aggressive quantization - var results = engine.vectorSearch(queryVector, 10); -} -``` - -### Configuring IVF-PQ for 
Billion-Scale - -```java -// IVF-PQ — 32× compression for billion-scale datasets -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(1_000_000_000) // 1 billion vectors - .withQuantization(QuantizationType.IVF_PQ) - .withIvfPartitions(4096) // Number of coarse clusters - .withPqSubspaces(32) // Subspaces (384/32 = 12 dims each) - .withNprobe(16); // Partitions to search (recall/speed knob) - -try (var engine = new SpectorEngine(config)) { - // Training happens automatically on first batch of vectors - engine.ingestBulk(trainingVectors); // First batch trains codebooks - - // Subsequent ingestion uses trained codebooks - engine.ingestBulk(remainingVectors); - - // Search: coarse IVF lookup → PQ distance within partitions - var results = engine.vectorSearch(queryVector, 10); -} -``` - -### REST API Configuration - -```bash -# Start with Scalar INT4 + rescore -curl -X PUT http://localhost:7070/api/v1/config \ - -H "Content-Type: application/json" \ - -d '{ - "quantization": "scalar_int4", - "oversamplingFactor": 3 - }' - -# Start with Scalar INT2 + higher rescore for better recall -curl -X PUT http://localhost:7070/api/v1/config \ - -H "Content-Type: application/json" \ - -d '{ - "quantization": "scalar_int2", - "oversamplingFactor": 5 - }' - -# Start with IVF-PQ -curl -X PUT http://localhost:7070/api/v1/config \ - -H "Content-Type: application/json" \ - -d '{ - "quantization": "ivf_pq", - "ivfPartitions": 4096, - "pqSubspaces": 32, - "nprobe": 16 - }' -``` - ---- - -## 🔗 See Also - -- [Core Concepts](../architecture/core-concepts.md) — HNSW, BM25, RRF, and SIMD fundamentals - -- [Quantization Comparison](quantization-comparison.md) — How different engines approach quantization - -- [Performance Tuning](../operations/performance-tuning.md) — Tuning quantization parameters for your workload - -- [Architecture Overview](../architecture/overview.md) — How quantization fits into the storage layer - -- [Configuration 
Guide](../configuration/parameters.md) — All quantization parameters and defaults \ No newline at end of file diff --git a/docs/docs/faq.md b/docs/docs/faq.md deleted file mode 100644 index 50d560b..0000000 --- a/docs/docs/faq.md +++ /dev/null @@ -1,252 +0,0 @@ -# ❓ FAQ - -> **Quick answers to the most common questions about Spector.** Can't find what you're looking for? Check [GitHub Discussions](https://github.com/spectrayan/spector/discussions) or the specific wiki pages linked throughout. - ---- - -## 🌟 General - -### What Java version do I need? - -**JDK 25 or later.** Spector uses the Java Vector API (incubator module) for SIMD acceleration and Panama FFM for off-heap memory. [OpenJDK builds](https://jdk.java.net/) include these by default. - ---- - -### Does it work without a GPU? - -**Yes, completely.** GPU is optional. Without a GPU, Spector uses CPU SIMD acceleration (AVX2/AVX-512/NEON) which delivers sub-millisecond search at 100K documents. GPU helps primarily for high-concurrency batch workloads. - -> [!TIP] -> See [GPU Acceleration](architecture/gpu-acceleration.md) for details on when GPU adds value (spoiler: batch sizes > 32). - ---- - -### Can I use it as an embedded library? - -**Absolutely!** Spector runs in two modes: - -| Mode | Description | Overhead | -|------|-------------|----------| -| **Embedded** | Add JAR to classpath, create `SpectorEngine` | Zero network overhead | -| **Server** | REST API with auth, CORS, metrics | HTTP overhead | - -```java -try (var engine = new SpectorEngine(SpectorConfig.DEFAULT.withDimensions(384))) { - engine.ingest("id", "content", vector); - var results = engine.hybridSearch("query", queryVector, 10); -} -``` - ---- - -### What about persistence? Do I lose data on restart? - -**No!** Spector supports persistence through memory-mapped files. The HNSW index uses a page-aligned binary format that loads instantly via `mmap` — no deserialization needed. Vector data survives restarts. 
- ---- - -### How does it compare to Elasticsearch? - -| Aspect | ⚡ Spector | Elasticsearch | -|--------|---------------|--------------| -| Vector search latency | **0.13 ms** (100K, in-process) | 2–10 ms | -| Hybrid search latency | **1.01 ms** (100K, in-process) | 10–30 ms | -| Deployment | Embedded JAR or server | Cluster only | -| Dependencies | **Zero** (JDK only) | JVM + heavy stack | -| GPU support | ✅ CUDA | ❌ | -| IVF-PQ compression | ✅ 32× | ❌ | - -> Elasticsearch excels at distributed full-text search with a mature query language and ecosystem. Spector excels at raw in-process performance, embedded use, and modern JVM features. The latency advantage is largest for in-process embedded use; network-bound deployments narrow the gap. - ---- - -### Does it support filtering/metadata queries? - -**Yes.** The Spring AI integration supports filter expressions: - -```java -vectorStore.similaritySearch( - SearchRequest.query("search algorithms") - .withFilterExpression("category == 'indexing' && version > 2") -); -``` - ---- - -### What embedding models work with Spector? - -Any model that produces float32 vectors. Set `dimensions` to match: - -| Model | Dimensions | Provider | -|-------|-----------|----------| -| all-MiniLM-L6-v2 | 384 | Sentence Transformers / Ollama | -| e5-base-v2 | 768 | Sentence Transformers | -| text-embedding-ada-002 | 1536 | OpenAI | -| nomic-embed-text | 768 | Ollama | -| mxbai-embed-large | 1024 | Ollama | - -> [!NOTE] -> Spector includes an Ollama embedding provider out of the box. Implement the `EmbeddingProvider` SPI for any other source. - ---- - -## 🔧 Technical - -### What similarity functions are supported? - -| Function | Best For | -|----------|----------| -| **COSINE** (default) | Normalized embeddings (most models) | -| **DOT_PRODUCT** | Unnormalized embeddings, magnitude matters | -| **EUCLIDEAN** | Spatial/geometric data | - ---- - -### What's the maximum dataset size? 
- -| Mode | Scale | -|------|-------| -| Single node | Up to 10 million documents | -| IVF-PQ mode | Billions of vectors (32× compression) | -| Distributed mode | Scale horizontally (2–256 shards) | - ---- - -### How does the LLM re-ranking work? - -```mermaid -flowchart LR - A["🔍 Search
Top-N candidates"] --> B["🤖 LLM (Ollama)
Listwise scoring"] - B --> C["✨ Re-ranked
Top-K results"] -``` - -1. Vector/hybrid search retrieves top-N candidates (default: 20) -2. Candidates sent to Ollama for listwise relevance scoring -3. LLM reorders based on semantic relevance -4. Final top-K results reflect LLM judgment - -> [!WARNING] -> Adds 100–500ms latency but significantly improves precision for ambiguous queries. - ---- - -### What are virtual threads and why do they matter? - -Virtual threads (Project Loom) are lightweight threads that don't map 1:1 to OS threads: - -- ✅ Handle millions of concurrent requests without pool tuning - -- ✅ No `synchronized` blocks that pin platform threads - -- ✅ Near-zero scheduling overhead - -- ✅ Linear scaling (4.5× at 16 threads measured) - ---- - -### How does zero-copy storage work? - -Vectors are stored in memory-mapped files using Panama's `MemorySegment`: - -- OS maps file directly into process address space - -- SIMD kernels read vectors without copying to Java heap - -- Zero garbage collection pressure - -- Instant startup (no deserialization) - -- Supports datasets larger than available RAM - ---- - -### What's the difference between HNSW and IVF-PQ? - -| Aspect | 🌐 HNSW | 🗜️ IVF-PQ | -|--------|------|--------| -| Speed | Fastest (0.05ms) | Fast (nprobe-dependent) | -| Memory | Full vectors (1.5KB/vec @ 384-dim) | 32× compressed (48 bytes/vec) | -| Recall | High (configurable) | Moderate (nprobe-dependent) | -| Scale | Up to millions | Up to billions | -| Use case | Default for most workloads | Memory-constrained, billion-scale | - ---- - -### Can I run benchmarks in CI? - -**Yes!** JSON output + baseline regression detection: - -```bash -mvn -pl spector-bench exec:java -Dexec.args="-rf json -rff results.json" -``` - ---- - -## ⚙️ Operations - -### What ports does Spector use? - -| Port | Protocol | Purpose | -|------|----------|---------| -| 7070 | HTTP | REST API (configurable) | -| 9090 | gRPC | Cluster communication (distributed mode) | - ---- - -### How do I monitor Spector? 
- -```bash -curl http://localhost:7070/health # Health check -curl http://localhost:7070/api/v1/status # Engine status -curl http://localhost:7070/api/v1/metrics # Request metrics -``` - ---- - -### What JVM arguments should I use in production? - -```bash -java \ - --add-modules jdk.incubator.vector \ - --enable-native-access=ALL-UNNAMED \ - -XX:+UseZGC -XX:+ZGenerational \ - -Xmx4g -Xms4g \ - -jar spector-node.jar -``` - ---- - -### How do I upgrade without downtime? - -**Distributed mode:** -1. Drain one node (stop routing requests) -2. Upgrade the node binary -3. Restart and wait for replica sync -4. Repeat for each node - -**Embedded mode:** Standard application deployment with new Spector version. - ---- - -### Is there authentication? - -**Yes.** Set an API key at server startup: - -```bash -mvn exec:java -pl spector-node \ - -Dexec.args="7070 384 my-secret-key" -``` - -Clients include `X-API-Key: my-secret-key` in requests. Without a key configured, all requests are allowed. - ---- - -## 🔗 See Also - -- [Getting Started](getting-started/quickstart.md) — Quick start guide - -- [What is Spector](about.md) — Product overview - -- [Configuration Guide](configuration/parameters.md) — All parameters - -- [Performance Tuning](operations/performance-tuning.md) — Optimization strategies \ No newline at end of file diff --git a/docs/docs/getting-started/installation.md b/docs/docs/getting-started/installation.md index 3dfa9c5..865577d 100644 --- a/docs/docs/getting-started/installation.md +++ b/docs/docs/getting-started/installation.md @@ -12,19 +12,19 @@ ## Building from Source ```bash -git clone https://github.com/spectrayan/spector.git -cd spector +git clone https://github.com/spectrayan/spector-search.git +cd spector-search mvn clean install -DskipTests ``` ## Running with JVM Flags -Spector uses incubator modules. The required JVM flags are configured in `pom.xml`, but if running manually: +Spector Search uses incubator modules. 
The required JVM flags are configured in `pom.xml`, but if running manually: ```bash java --add-modules jdk.incubator.vector \ --enable-native-access=ALL-UNNAMED \ - -jar spector-node/target/spector-node.jar + -jar spector-server/target/spector-server.jar ``` ## Server Configuration @@ -32,8 +32,8 @@ java --add-modules jdk.incubator.vector \ Start with custom port, dimensions, and API key: ```bash -mvn exec:java -pl spector-node \ - -Dexec.mainClass="com.spectrayan.spector.server.SpectorNode" \ +mvn exec:java -pl spector-server \ + -Dexec.mainClass="com.spectrayan.spector.server.SpectorServer" \ -Dexec.args="7070 384 my-secret-key" ``` @@ -44,9 +44,7 @@ Arguments: `<port> <dimensions> [api-key]` GPU acceleration requires: - NVIDIA GPU with CUDA support - - CUDA toolkit installed - - Set `gpuEnabled=true` in configuration The system falls back to CPU SIMD automatically when GPU is unavailable. @@ -57,4 +55,4 @@ Spector ships with an Ollama embedding provider. To enable auto-embedding: 1. Install [Ollama](https://ollama.ai) 2. Pull an embedding model: `ollama pull nomic-embed-text` -3. Configure the embedding endpoint in your application \ No newline at end of file +3. Configure the embedding endpoint in your application diff --git a/docs/docs/getting-started/jdk-api-status.md b/docs/docs/getting-started/jdk-api-status.md deleted file mode 100644 index d773ccd..0000000 --- a/docs/docs/getting-started/jdk-api-status.md +++ /dev/null @@ -1,199 +0,0 @@ -# ☕ JDK API Status & Compatibility - -> **Spector deliberately adopts the latest JDK innovations for maximum hardware utilization.** This page explains which APIs are finalized, which are still incubating or in preview, what that means in practice, and how to handle the required JVM flags.
- ---- - -## API Status Summary - -| API | JDK Status | Since | JVM Flag | Risk Level | -|:---|:---|:---|:---|:---| -| **Panama FFM** (MemorySegment, Arena) | ✅ **Finalized** | JDK 22 (JEP 454) | `--enable-native-access` | **None** — stable, production-ready | -| **Virtual Threads** (Project Loom) | ✅ **Finalized** | JDK 21 (JEP 444) | None | **None** — stable, production-ready | -| **Vector API** (`jdk.incubator.vector`) | 🔬 **Incubator** | JDK 16 (JEP 338) | `--add-modules jdk.incubator.vector` | **Low** — API stable across 10 rounds | -| **Structured Concurrency** | 🔬 **Preview** | JDK 21 (JEP 453; currently JEP 505 in JDK 25) | `--enable-preview` | **Low** — Spector has fallback mode | - -> [!IMPORTANT] -> **Two of Spector's four core JDK technologies are already finalized** — Panama FFM (off-heap memory) and Virtual Threads (concurrency). The remaining two (Vector API, Structured Concurrency) are in incubator/preview but have been stable in practice. - ---- - -## What "Incubator" and "Preview" Mean - -### Incubator Modules - -Incubator modules are **non-final APIs** shipped in the JDK for real-world feedback. They: - -- Require an explicit opt-in flag: `--add-modules jdk.incubator.vector` -- Emit a startup warning: `WARNING: Using incubator modules: jdk.incubator.vector` -- May change API surface between JDK releases -- Are **not enabled by default** — they won't interfere with other applications - -### Preview Features - -Preview features are **language or VM features** that are functionally complete but seeking final feedback. They: - -- Require `--enable-preview` at both compile and runtime -- Are expected to be finalized in a near-future JDK release -- May have minor signature changes before finalization - -> [!NOTE] -> Both incubator and preview APIs are **fully functional and performant** — they are not experimental prototypes. The designation means the API surface could evolve, not that the implementation is unreliable.
- ---- - -## Vector API — Detailed Assessment - -The Vector API (`jdk.incubator.vector`) has been incubating since **JDK 16 (March 2021)** — over 5 years and 10 incubation rounds. Despite its incubator status, it is the most mature incubator module in JDK history. - -### Why It's Still Incubating - -The Vector API's finalization is blocked by **Project Valhalla** (value types). The JDK team wants `FloatVector`, `IntVector`, etc. to be value types for optimal JIT behavior. Until Valhalla delivers value types, the Vector API remains in incubator — not because of instability, but because the JDK team wants the finalized version to have optimal memory layout. - -### Stability in Practice - -| Aspect | Assessment | -|:---|:---| -| **API surface** | Stable since JDK 19. No breaking changes in 6+ rounds. | -| **Performance** | Fully JIT-optimized. HotSpot intrinsics compile to native AVX/NEON. | -| **Adoption** | Used internally by OpenJDK itself and major open-source projects. | -| **ISA support** | AVX2, AVX-512, NEON — all production-grade. | - -### Spector's Usage Pattern - -Spector uses the **most stable subset** of the Vector API: - -```java -// ISA-agnostic — works on any platform -FloatVector.SPECIES_PREFERRED - -// Standard operations — unlikely to change -vector.mul(other).reduceLanes(VectorOperators.ADD) -``` - -We avoid experimental or niche operations, sticking to arithmetic (mul, add, sub, fma) and reductions (reduceLanes) — the core operations that have been stable across all incubation rounds. - -### Migration Path When Finalized - -When the Vector API is finalized (expected when Project Valhalla matures): - -1. Remove `--add-modules jdk.incubator.vector` from JVM flags -2. Change imports from `jdk.incubator.vector` to `java.util.vector` (or wherever the final package lands) -3. 
No algorithmic changes expected — the math operations are stable - -> [!TIP] -> Spector centralizes all Vector API usage in `spector-core`, so migration will require changes in a single module. - ---- - -## Structured Concurrency — Detailed Assessment - -Structured Concurrency (JEP 505) is a **preview feature** that Spector uses for safe parallel task management. It has been in preview since JDK 21. - -### Spector's Fallback Mode - -Spector includes a **runtime fallback** to classic virtual threads: - -```bash -# Use structured concurrency (default) -java --enable-preview -jar spector.jar - -# Fall back to classic ExecutorService + virtual threads -java -Dspector.concurrency.structured=false -jar spector.jar -``` - -The fallback mode uses `Executors.newVirtualThreadPerTaskExecutor()` — fully finalized, production-ready, and functionally equivalent for all Spector use cases. - -### Where Spector Uses Structured Concurrency - -| Site | Module | Benefit | -|:---|:---|:---| -| Hybrid search fan-out | spector-query | Auto-cancel sibling on failure | -| Distributed shard fan-out | spector-node | Auto-cancel all on shard failure | -| Batch embedding | spector-embed-api | Scope-per-call lifecycle | -| PQ subspace training | spector-index | All-or-nothing structured scope | -| BM25 parallel scoring | spector-index | Auto-cancel with sequential fallback | - -> [!NOTE] -> All structured concurrency usage is centralized in `ConcurrentTasks` (spector-commons). When the API finalizes, updates are needed in a single file. - ---- - -## Panama FFM — Finalized ✅ - -The Foreign Function & Memory API was **finalized in JDK 22** (JEP 454). This is the foundation of Spector's off-heap memory management. - -The `--enable-native-access=ALL-UNNAMED` flag is still recommended to suppress warnings about native memory access, but the API itself is stable and will not change. 
- -| Component | Used For | -|:---|:---| -| `MemorySegment` | Off-heap vector storage (zero-copy, zero-GC) | -| `Arena` | Scoped memory lifecycle management | -| `ValueLayout.JAVA_FLOAT` | Type-safe memory access for vector data | -| `MemorySegment.ofBuffer()` | Memory-mapped file I/O | - ---- - -## Virtual Threads — Finalized ✅ - -Virtual Threads (Project Loom) were **finalized in JDK 21** (JEP 444). Spector uses them throughout: - -- REST API request handling (one virtual thread per request) -- MCP server tool dispatch -- Hybrid search fan-out -- Bulk ingestion pipelines - -No JVM flags are required — virtual threads are a standard JDK feature. - ---- - -## Required JVM Flags - -Here is the complete set of JVM flags Spector requires and why: - -```bash -java \ - --add-modules jdk.incubator.vector \ # Vector API (incubator) - --enable-native-access=ALL-UNNAMED \ # Panama FFM native access (finalized, but flag suppresses warnings) - --enable-preview \ # Structured Concurrency (preview) - -jar spector.jar -``` - -### Flag Compatibility Matrix - -| Flag | Required? | What Happens Without It | -|:---|:---|:---| -| `--add-modules jdk.incubator.vector` | ✅ Required | `ClassNotFoundException` — Vector API classes not available | -| `--enable-native-access=ALL-UNNAMED` | ⚠️ Recommended | Works, but emits native access warnings on stderr | -| `--enable-preview` | ⚠️ Optional | Works if `spector.concurrency.structured=false` (fallback mode) | - -> [!TIP] -> **Minimum viable flags:** If you want to avoid preview features entirely, you can run with just `--add-modules jdk.incubator.vector` and `-Dspector.concurrency.structured=false`. This disables structured concurrency but everything else works normally. - ---- - -## FAQ - -### Is it safe to use incubator modules in production? - -Yes, with awareness. The "incubator" label means the API *surface* could change, not that the implementation is unstable. The Vector API has been functionally stable across 10 JDK releases. 
Many organizations run incubator modules in production. - -### What happens when I upgrade JDK versions? - -When upgrading from one JDK version to the next, check the release notes for Vector API changes. In practice, no breaking changes have occurred since JDK 19. Spector's test suite (331+ tests) will catch any issues during a JDK upgrade. - -### Will the startup warnings affect my application? - -No. The `WARNING: Using incubator modules: jdk.incubator.vector` message goes to stderr and has zero performance impact. It's purely informational. In MCP mode, all logging goes to stderr by design, so the warning doesn't affect the JSON-RPC protocol stream. - -### Can I use Spector without any incubator/preview features? - -Not currently — the Vector API is fundamental to Spector's SIMD acceleration. However, you can avoid preview features by using the structured concurrency fallback (`-Dspector.concurrency.structured=false`). - ---- - -## See Also - -- [Quick Start](quickstart.md) — Build and run with all required JVM flags -- [Installation](installation.md) — JDK setup and verification -- [Architecture Overview](../architecture/overview.md) — How these APIs fit into the architecture diff --git a/docs/docs/getting-started/quickstart.md b/docs/docs/getting-started/quickstart.md index ea53d8b..fb810a4 100644 --- a/docs/docs/getting-started/quickstart.md +++ b/docs/docs/getting-started/quickstart.md @@ -1,250 +1,55 @@ -# 🚀 Getting Started +# Quick Start -> **Go from zero to your first search result in under 5 minutes.** This guide walks you through building Spector from source, starting the server, ingesting documents, and running your first hybrid search. +Get Spector Search running and execute your first search in under 5 minutes. 
---- +## Prerequisites -## 📋 Prerequisites +- **JDK 25+** (OpenJDK with Vector API incubator) +- **Maven 3.9+** -| Tool | Version | How to Check | -|------|---------|-------------| -| ☕ JDK | 25+ | `java -version` | -| 📦 Maven | 3.9+ | `mvn --version` | -| 🔧 Git | 2.40+ | `git --version` | - -> [!IMPORTANT] -> Spector requires **JDK 25 or later** with the Vector API incubator module. [OpenJDK builds](https://jdk.java.net/) include this by default. - ---- - -## 🏗️ Clone and Build +## Build ```bash -# Clone the repository -git clone https://github.com/spectrayan/spector.git -cd spector - -# Build all modules (includes 316+ tests) +git clone https://github.com/spectrayan/spector-search.git +cd spector-search mvn clean test - -# Build without tests (faster) -mvn clean package -DskipTests -``` - -> [!TIP] -> The full test suite runs 316+ tests across all modules. Expect ~2 minutes on a modern machine. - ---- - -## 🔬 Verify SIMD Support - -Confirm your hardware's SIMD acceleration level: - -```bash -java --add-modules jdk.incubator.vector -cp spector-core/target/classes \ - com.spectrayan.spector.core.SimdCapability -``` - -Expected output (varies by hardware): -``` -SIMD Species: S_256_BIT (AVX2, 8 float lanes) -``` - ---- - -## 🖥️ Start the Server - -```bash -# Start on default port 7070 with 384 dimensions -mvn exec:java -pl spector-node \ - -Dexec.mainClass="com.spectrayan.spector.server.SpectorNode" - -# Start with custom port, dimensions, and API key -mvn exec:java -pl spector-node \ - -Dexec.mainClass="com.spectrayan.spector.server.SpectorNode" \ - -Dexec.args="7070 384 my-secret-key" ``` -Verify it's running: +## Start the Server ```bash -curl http://localhost:7070/health +mvn exec:java -pl spector-server \ + -Dexec.mainClass="com.spectrayan.spector.server.SpectorServer" ``` -```json -{"status": "UP"} -``` - -> [!NOTE] -> The server starts on virtual threads — it can handle thousands of concurrent requests out of the box with no thread pool configuration needed. 
- ---- +The server starts on port 7070 by default. -## 📄 Ingest Your First Document +## Ingest a Document ```bash curl -X POST http://localhost:7070/api/v1/ingest \ -H "Content-Type: application/json" \ -d '{ "id": "doc-1", - "title": "Introduction to Vector Search", - "content": "Vector search finds similar items by comparing their mathematical representations called embeddings.", - "vector": [0.12, 0.45, 0.78, 0.23, 0.91, 0.34, 0.67, 0.55, 0.11, 0.89] + "title": "Java Vector API", + "content": "SIMD-accelerated search engine on modern JVM", + "vector": [0.1, 0.2, 0.3, 0.4, 0.5] }' ``` -```json -{"id": "doc-1", "status": "indexed"} -``` - -### 🤖 Ingest with Auto-Embedding - -If you have Ollama running with an embedding model: - -```bash -curl -X POST http://localhost:7070/api/v1/ingest/auto \ - -H "Content-Type: application/json" \ - -d '{ - "id": "doc-2", - "title": "HNSW Algorithm", - "content": "Hierarchical Navigable Small World graphs enable fast approximate nearest neighbor search." 
- }' -``` - -### 📦 Bulk Ingest - -```bash -curl -X POST http://localhost:7070/api/v1/ingest/bulk \ - -H "Content-Type: application/json" \ - -d '{ - "documents": [ - {"id": "d1", "content": "BM25 keyword scoring uses term frequency and document length.", "vector": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}, - {"id": "d2", "content": "Reciprocal Rank Fusion combines multiple ranked lists.", "vector": [0.5, 0.4, 0.3, 0.2, 0.1, 0.9, 0.8, 0.7, 0.6, 0.5]} - ] - }' -``` - ---- - -## 🔍 Run Your First Search - -### 🧬 Hybrid Search (keyword + vector) +## Search ```bash curl -X POST http://localhost:7070/api/v1/search \ -H "Content-Type: application/json" \ -d '{ - "text": "nearest neighbor search", - "vector": [0.15, 0.42, 0.73, 0.28, 0.88, 0.31, 0.62, 0.51, 0.14, 0.85], - "topK": 5 + "text": "vector search", + "topK": 10 }' ``` -```json -{ - "results": [ - { - "id": "doc-1", - "score": 0.9234, - "title": "Introduction to Vector Search", - "content": "Vector search finds similar items..." 
- } - ], - "searchMode": "HYBRID", - "latencyMs": 0.31 -} -``` - -### 📝 Keyword-Only Search - -```bash -curl -X POST http://localhost:7070/api/v1/search \ - -H "Content-Type: application/json" \ - -d '{"text": "BM25 scoring", "topK": 10}' -``` - -### 🧠 Vector-Only Search - -```bash -curl -X POST http://localhost:7070/api/v1/search \ - -H "Content-Type: application/json" \ - -d '{"vector": [0.15, 0.42, 0.73, 0.28, 0.88, 0.31, 0.62, 0.51, 0.14, 0.85], "topK": 10}' -``` - ---- - -## 📊 Check Engine Status - -```bash -curl http://localhost:7070/api/v1/status -``` - -```json -{ - "status": "RUNNING", - "simd": "AVX2 (256-bit, 8 lanes)", - "gpuAvailable": false, - "rerankerEnabled": false, - "documentCount": 3, - "dimensions": 384 -} -``` - ---- - -## 💻 Use as an Embedded Library - -No server needed — use Spector directly in your Java application: - -```java -import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.engine.SpectorConfig; - -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(100_000); - -try (var engine = new SpectorEngine(config)) { - // Ingest - engine.ingest("doc-1", "Hello world", new float[]{0.1f, 0.2f, ...}); - - // Search - var results = engine.hybridSearch("hello", queryVector, 10); - - for (var result : results.results()) { - System.out.printf("%s → %.4f%n", result.id(), result.score()); - } -} -``` - -> [!TIP] -> Embedded mode has **zero network overhead** — perfect for microservices, desktop apps, and edge deployments. 
- ---- - -## 🎉 What You've Accomplished - -In just a few minutes, you've: - -- ✅ Built Spector from source - -- ✅ Verified SIMD hardware acceleration - -- ✅ Started a search server - -- ✅ Ingested documents - -- ✅ Run hybrid search queries - ---- - -## 🚀 Next Steps +## Next Steps -| What to explore | Page | -|----------------|------| -| Full API documentation | [REST API Reference](../api-reference/rest-endpoints.md) | -| Type-safe Java client | [Java SDK Guide](../sdk-usage/java-client.md) | -| Tune for your workload | [Configuration Guide](../configuration/parameters.md) | -| Command-line management | [CLI Reference](../cli-reference/spectorctl.md) | -| Understand the internals | [Architecture Overview](../architecture/overview.md) | -| Spring AI integration | [Spring AI Integration](../sdk-usage/spring-ai.md) | \ No newline at end of file +- [Installation guide](installation.md) for detailed setup options +- [API Reference](../api-reference/overview.md) for all endpoints +- [Java SDK](../sdk-usage/java-client.md) for programmatic access diff --git a/docs/docs/index.md b/docs/docs/index.md index 5751df6..9ab627b 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -1,109 +1,33 @@ -# ⚡ Welcome to Spector +# Spector Search -> **The Zero-Overhead, Agent-Ready AI Memory Backbone.** +**Ultra-fast, SIMD-accelerated semantic search engine built on Java Vector API + modern JVM technologies.** -Welcome to the Spector documentation — your central hub for the high-performance, agent-native AI search engine. Whether you're connecting AI agents via MCP, building RAG pipelines, powering recommendation systems, or need sub-millisecond search with zero infrastructure, you're in the right place. +## What is Spector Search? ---- +Spector Search is a high-performance vector search engine written in Java 25 that leverages: -## 🔥 Why Spector? 
+- **Java Vector API** (jdk.incubator.vector) for SIMD-accelerated similarity kernels +- **Panama FFM** for zero-copy memory-mapped storage and GPU interop +- **Virtual Threads** for massive concurrency in ingestion, embedding, and query execution +- **Memory-mapped ANN indexes** for instant startup and zero-GC-pressure search -| Metric | Value | -|--------|-------| -| 🤖 MCP Tools | **6 agent-ready tools** (semantic, hybrid, RAG, ingest, delete, status) | -| ⚡ Vector Search Latency | **0.05 ms** avg @ 10K docs (128-dim) | -| 🔍 Keyword Search Latency | **0.98 ms** avg @ 100K docs | -| 🧬 Hybrid Search Latency | **0.17 ms** avg @ 10K docs | -| 🚀 Vector Throughput | **18,800 queries/sec** @ 10K | -| 🧵 Concurrent Hybrid | **14,000+ ops/sec** @ 16 threads (384-dim) | -| 🗜️ IVF-PQ + TurboQuant | **8–32× memory reduction** | -| ✅ Test Suite | **331+ tests**, all passing | -| 📦 Dependencies | **Zero** (JDK only) | +## Key Features ---- +| Feature | Description | +|---------|-------------| +| Sub-millisecond queries | HNSW vector search at 0.05ms avg latency | +| Hybrid search | Combines semantic + keyword search via RRF | +| Multi-level quantization | INT8 (4×), INT4 (8×), INT2 (16×) with configurable rescore | +| GPU acceleration | CUDA kernels via Panama FFM | +| IVF-PQ compression | 32× memory reduction for billion-scale | +| Distributed search | gRPC fan-out with consistent hash sharding | +| Zero dependencies | Pure JDK, drop-in JAR | -## 🗺️ Quick Navigation +## Quick Links -### 🚀 Getting Started - -| Page | Description | -|------|-------------| -| [Getting Started](getting-started/quickstart.md) | Build, run, and search in 5 minutes | -| [What is Spector](about.md) | Product overview, use cases, and comparisons | -| [JDK API Status](getting-started/jdk-api-status.md) | Vector API, Panama FFM, and preview feature compatibility | -| [FAQ](faq.md) | Common questions answered | - -### 🤖 Agent Integration (MCP) - -| Page | Description | -|------|-------------| -| [MCP 
Integration Architecture](architecture/mcp-integration.md) | How the MCP server works under the hood | -| [MCP Server Guide](sdk-usage/mcp-server.md) | Setup for Claude Desktop, Cursor, and custom agents | - -### 🏗️ Architecture & Concepts - -| Page | Description | -|------|-------------| -| [Architecture Overview](architecture/overview.md) | Module diagram, data flow, threading model | -| [Core Concepts](architecture/core-concepts.md) | HNSW, IVF-PQ, BM25, RRF, SIMD deep-dives | -| [Ingestion Pipeline](architecture/ingestion-pipeline.md) | Document → chunk → embed → index pipeline | -| [RAG Pipeline](architecture/rag-pipeline.md) | End-to-end retrieval-augmented generation | -| [Distributed Mode](architecture/distributed-mode.md) | Clustering, sharding, and replication | -| [GPU Acceleration](architecture/gpu-acceleration.md) | CUDA setup and kernel details | - -### 📖 Reference - -| Page | Description | -|------|-------------| -| [REST API Reference](api-reference/rest-endpoints.md) | All endpoints with curl examples | -| [Java SDK Guide](sdk-usage/java-client.md) | Programmatic usage (client + embedded) | -| [Spring AI Integration](sdk-usage/spring-ai.md) | Spring AI VectorStore adapter | -| [CLI Reference](cli-reference/spectorctl.md) | `spectorctl` commands | -| [Configuration Guide](configuration/parameters.md) | All parameters with tuning advice | - -### ⚙️ Operations & Community - -| Page | Description | -|------|-------------| -| [Performance Tuning](operations/performance-tuning.md) | Benchmarks and optimization strategies | -| [Contributing](operations/contributing.md) | Development setup and PR process | - ---- - -## 💡 Highlights at a Glance - -```mermaid -graph LR - A["🤖 AI Agent"] --> B["📡 MCP Server"] - B --> C["⚡ SpectorEngine"] - C --> D["🧠 Hybrid Search"] - D --> E["🎯 RRF Fusion"] - E --> F["🤖 LLM Re-ranking"] - F --> G["✨ Results"] - - H["📄 Document"] --> I["🧩 Chunking"] - I --> J["🧬 Embedding"] - J --> C -``` - -> [!TIP] -> New here? 
Start with [Getting Started](getting-started/quickstart.md) to build and run your first search in under 5 minutes. Want to connect an AI agent? See the [MCP Server Guide](sdk-usage/mcp-server.md). - ---- - -## 🌟 Project Stats - -| | | -|---|---| -| **Language** | Java 25 | -| **License** | Apache 2.0 · [BSL 1.1](https://github.com/spectrayan/spector/blob/main/spector-memory/LICENSE) (memory module) | -| **Modules** | 18 Maven modules | -| **Dependencies** | Zero (JDK only) | -| **SIMD** | AVX2 / AVX-512 / NEON | -| **GPU** | CUDA via Panama FFM | -| **MCP** | Built-in, 6 agent-ready tools | -| **Distributed** | gRPC fan-out + consistent hashing | - ---- - -**Built with ⚡ by [Spectrayan](https://www.spectrayan.com/)** · [GitHub](https://github.com/spectrayan/spector) · [Apache 2.0](https://github.com/spectrayan/spector/blob/main/LICENSE) · [BSL 1.1 (memory)](https://github.com/spectrayan/spector/blob/main/spector-memory/LICENSE) \ No newline at end of file +- [Getting Started](getting-started/quickstart.md) — Build, run, and search in 5 minutes +- [API Reference](api-reference/overview.md) — All REST endpoints documented +- [Configuration](configuration/parameters.md) — Tune Spector for your workload +- [Architecture](architecture/overview.md) — Understand the system design +- [Java SDK](sdk-usage/java-client.md) — Programmatic access from Java +- [CLI Reference](cli-reference/spectorctl.md) — Command-line management diff --git a/docs/docs/javascripts/mathjax.js b/docs/docs/javascripts/mathjax.js deleted file mode 100644 index 30180ed..0000000 --- a/docs/docs/javascripts/mathjax.js +++ /dev/null @@ -1,18 +0,0 @@ -window.MathJax = { - tex: { - inlineMath: [["\\(", "\\)"]], - displayMath: [["\\[", "\\]"]], - processEscapes: true, - processEnvironments: true - }, - options: { - ignoreHtmlClass: ".*", - processHtmlClass: "arithmatex" - } -}; - -document$.subscribe(() => { - if (typeof MathJax !== "undefined" && MathJax.typesetPromise) { - MathJax.typesetPromise(); - } 
-}); diff --git a/docs/docs/javascripts/mermaid-init.js b/docs/docs/javascripts/mermaid-init.js deleted file mode 100644 index 9a66f72..0000000 --- a/docs/docs/javascripts/mermaid-init.js +++ /dev/null @@ -1,30 +0,0 @@ -// Initialize mermaid with theme that respects dark/light mode -document.addEventListener('DOMContentLoaded', function() { - const observer = new MutationObserver(function() { - const scheme = document.body.getAttribute('data-md-color-scheme'); - const isDark = scheme === 'slate'; - - if (window.mermaid) { - window.mermaid.initialize({ - theme: isDark ? 'dark' : 'default', - themeVariables: isDark ? { - primaryColor: '#1e1e2e', - primaryTextColor: '#cdd6f4', - primaryBorderColor: '#6c6c8a', - lineColor: '#6c6c8a', - secondaryColor: '#313244', - tertiaryColor: '#181825', - background: '#1e1e2e', - mainBkg: '#1e1e2e', - nodeBorder: '#6c6c8a', - clusterBkg: '#181825', - clusterBorder: '#45475a', - titleColor: '#cdd6f4', - edgeLabelBackground: '#1e1e2e' - } : {} - }); - } - }); - - observer.observe(document.body, { attributes: true, attributeFilter: ['data-md-color-scheme'] }); -}); diff --git a/docs/docs/labs/roadmap.md b/docs/docs/labs/roadmap.md deleted file mode 100644 index dd278a2..0000000 --- a/docs/docs/labs/roadmap.md +++ /dev/null @@ -1,555 +0,0 @@ ---- -title: "Labs — Experimental Features" -description: "Research roadmap for Spector's experimental cognitive features: Neuromodulatory Gain Control, Executive Dysfunction Profile, Two-Factor Memory Strength, and Dynamic Quantization Stepping." ---- - -# 🔬 Labs — Experimental Features - -> **Status**: Research / Future Work -> -> These features are under active research and planned for implementation -> in the `labs` branch. They are not yet available in the main release. - ---- - -## Neuromodulatory Gain Control - -### Concept - -Dynamic retrieval tuning via simulated neurotransmitter modulation. 
Rather than using static cognitive profiles, the system would maintain a **runtime neuromodulatory state** that continuously adjusts retrieval parameters based on the agent's recent activity, outcomes, and context. - -### Biological Basis - -The brain's retrieval characteristics aren't fixed — they shift moment-to-moment based on neuromodulatory tone. A developer who just encountered a production outage has elevated norepinephrine, which sharpens recency bias and narrows attention. A developer brainstorming during a design review has elevated serotonin, which broadens associative scope. - -Currently, Spector models this via discrete [Cognitive Profiles](../memory/cognitive-profiles.md) (DEBUGGING, EXPLORING, etc.). Neuromodulatory Gain Control would replace discrete switching with **continuous, gradient modulation**. - -### Proposed Architecture - -```mermaid -flowchart TD - subgraph "Neuromodulatory State" - ACh["Acetylcholine
attention sharpness"] - 5HT["Serotonin<br/>retrieval breadth"] - DA["Dopamine<br/>novelty seeking"] - NE["Norepinephrine<br/>urgency bias"] - end - - subgraph "Retrieval Modulation" - TM["Tag Match Strictness"] - LS["Lateral Scope"] - NW["Novelty Weight"] - RB["Recency Bias"] - end - - ACh --> TM - 5HT --> LS - DA --> NW - NE --> RB - - subgraph "Inputs" - O["Outcome Feedback<br/>(reinforce calls)"] - C["Context Signals<br/>(tags, valence)"] - T["Temporal Patterns
(query rate, errors)"] - end - - O --> DA - O --> NE - C --> ACh - C --> 5HT - T --> NE - T --> DA -``` - -### Modulation Parameters - -| Neurotransmitter | Parameter Affected | Low Level Effect | High Level Effect | -|:---|:---|:---|:---| -| Acetylcholine (ACh) | `tagMatchStrictness` | Loose tag gating (any overlap passes) | Strict tag gating (all bits must match) | -| Serotonin (5-HT) | `lateralDistanceThreshold` | Narrow scope (close matches only) | Wide scope (cross-domain retrieval active) | -| Dopamine (DA) | `noveltyWeight` in ICNU | Familiar memories preferred | Novel/surprising memories preferred | -| Norepinephrine (NE) | `recencyBias` | All ages equal | Strong recency bias (last hour dominates) | - -### State Update Model - -Each neurotransmitter level $n_i(t)$ follows an exponential decay toward a baseline, with spikes driven by events: - -$$ -n_i(t + \Delta t) = n_i^{\text{base}} + \left(n_i(t) - n_i^{\text{base}}\right) \cdot e^{-\Delta t / \tau_i} + \sum_{\text{events}} \Delta n_i -$$ - -Where: - -- $n_i^{\text{base}}$ — resting level for neurotransmitter $i$ (profile-dependent) -- $\tau_i$ — decay constant (how quickly it returns to baseline after a spike) -- $\Delta n_i$ — event-driven spike (e.g., negative reinforcement → +NE, +ACh) - -**Example decay constants:** - -| Neurotransmitter | $\tau$ | Rationale | -|:---|:---|:---| -| ACh | 5 minutes | Attention shifts are fast | -| 5-HT | 30 minutes | Mood/scope changes are slow | -| DA | 10 minutes | Novelty-seeking is moderate | -| NE | 2 minutes | Urgency is very transient | - -### Event-to-Spike Mapping - -| Event | ACh | 5-HT | DA | NE | -|:---|:---:|:---:|:---:|:---:| -| Negative reinforcement (bug found) | +0.3 | -0.1 | — | +0.5 | -| Positive reinforcement (solution worked) | — | +0.2 | +0.3 | -0.2 | -| High recall latency (slow query) | +0.1 | — | — | +0.2 | -| Lateral result selected by agent | — | +0.3 | +0.2 | — | -| Repeated query (same topic, 3rd time) | +0.4 | -0.2 | -0.1 | — | -| No 
results found | — | +0.2 | +0.4 | +0.1 | - -### Implementation Sketch - -```java -public final class NeuromodulatoryState { - - private volatile float acetylcholine = 0.5f; // baseline - private volatile float serotonin = 0.5f; - private volatile float dopamine = 0.5f; - private volatile float norepinephrine = 0.3f; - - private volatile long lastUpdateMs = System.currentTimeMillis(); - - /** - * Applies exponential decay toward baseline, then adds event spikes. - */ - public synchronized void update(NeuroEvent... events) { - long now = System.currentTimeMillis(); - float dtSeconds = (now - lastUpdateMs) / 1000f; - - // Exponential decay toward baseline - acetylcholine = decayToward(acetylcholine, 0.5f, dtSeconds, TAU_ACH); - serotonin = decayToward(serotonin, 0.5f, dtSeconds, TAU_5HT); - dopamine = decayToward(dopamine, 0.5f, dtSeconds, TAU_DA); - norepinephrine = decayToward(norepinephrine, 0.3f, dtSeconds, TAU_NE); - - // Apply event spikes - for (var event : events) { - acetylcholine = clamp(acetylcholine + event.deltaACh()); - serotonin = clamp(serotonin + event.delta5HT()); - dopamine = clamp(dopamine + event.deltaDA()); - norepinephrine = clamp(norepinephrine + event.deltaNE()); - } - - lastUpdateMs = now; - } - - /** - * Modulates RecallOptions based on current neuromodulatory state. - */ - public RecallOptions modulate(RecallOptions base) { - return base.toBuilder() - .lateralDistanceThreshold(base.lateralDistanceThreshold() * (2.0f * serotonin)) - .hyperfocusBoost(base.hyperfocusBoost() * (1.0f + acetylcholine)) - // ... 
other modulations - .build(); - } -} -``` - -### Dependencies & Complexity - -- **Dependencies:** CognitiveProfile extensions, configurable ICNU weights -- **Complexity:** High — requires runtime state management, thread-safe neuromodulatory state, and careful calibration of decay constants and spike magnitudes -- **Risk:** Over-tuning can create oscillatory behavior (agent flip-flops between modes) - ---- - -## Executive Dysfunction Profile - -### Concept - -A Hebbian-first recall path that bypasses vector similarity entirely. When the agent can't formulate a clear query (analogous to executive dysfunction), it falls back to associative recall: "what have I been thinking about recently?" - -### Biological Basis - -In executive dysfunction, the prefrontal cortex struggles with **top-down, goal-directed retrieval** — the ability to say "I need to find X" and systematically search for it. However, **bottom-up, associative recall** remains intact — memories surface via association chains rather than directed search. - -This is common in ADHD: you can't remember the specific thing you were looking for, but a tangential mention triggers a cascade of related memories. The [STDP infrastructure](../memory/hebbian.md#offheapedgetable--directed-stdp-edges) now makes this possible — directed causal edges encode "thinking about A leads to thinking about B." - -### Proposed Architecture - -```mermaid -flowchart TD - Q["Query: 'I was working on something...'"] --> D{"Executive
Dysfunction<br/>Profile?"} - D -->|No| VS["Standard: Vector Search"] - D -->|Yes| STDP["STDP Edge Lookup"] - - STDP --> CT["Get context tags from<br/>recent recall history"] - CT --> CE["Follow causal edges
(predictive strength > 0.3)"] - CE --> R1["Memory: database config"] - CE --> R2["Memory: connection pool tuning"] - CE --> R3["Memory: timeout settings"] - - VS --> S["6-Phase Scoring Pipeline"] - - R1 --> M["Merge & Rank"] - R2 --> M - R3 --> M - S --> M - M --> F["Final Results"] - - style STDP fill:#e74c3c,color:white - style CE fill:#e74c3c,color:white -``` - -### Recall Algorithm - -1. **Collect context tags** from the last N recall results (default N=10) -2. **Query STDP edges** for all causal predictions from those context tags -3. **Filter edges** by predictive strength threshold (default > 0.3) -4. **Retrieve memories** whose synaptic tags match the predicted tags -5. **Rank by STDP weight** instead of vector similarity -6. **Optionally blend** with a low-weight vector search for hybrid results - -### Key Differences from Standard Recall - -| Aspect | Standard Recall | Executive Dysfunction | -|:---|:---|:---| -| Primary signal | Vector similarity | STDP causal edges | -| Query requirement | Clear, specific query | Vague or absent query | -| Scoring formula | $\alpha \cdot sim + \beta \cdot imp \cdot decay$ | $stdp\_weight \cdot recency$ | -| Tag usage | Bloom filter pre-screen | Primary retrieval key | -| Lateral mode | Optional (DIVERGENT) | Always enabled | - -### Implementation Sketch - -```java -public List recallAssociative(RecallOptions options) { - // Step 1: Collect recent context tags - Set contextTags = recallHistory.recentTags(10); - - // Step 2: Query STDP for causal predictions - Map predictions = new LinkedHashMap<>(); - for (String tag : contextTags) { - tracker.getStdpEdgesFrom(tag).forEach((targetTag, weight) -> { - if (weight.weight() > 0.3f) { - predictions.merge(targetTag, weight.weight(), Math::max); - } - }); - } - - // Step 3: Encode predicted tags as a synaptic mask - long predictedMask = SynapticTagEncoder.encode( - predictions.keySet().toArray(String[]::new)); - - // Step 4: Scan with STDP-weighted scoring - var modifiedOptions 
= options.toBuilder() - .synapticTagMask(predictedMask) - .alpha(0.1f) // minimal vector similarity - .beta(0.9f) // importance-dominated - .build(); - - return recallPipeline.execute(queryVector, modifiedOptions); -} -``` - -### Dependencies & Complexity - -- **Dependencies:** Full STDP (Stage 3) ✅ **Complete** — directed, timestamped edges are live in `CoActivationTracker` -- **Complexity:** Medium — the STDP infrastructure is the hard part (done). Remaining work is the bypass routing logic and recall history tracking. -- **Risk:** Cold-start problem — STDP edges are empty until the agent has sufficient recall history - ---- - -## Two-Factor Memory Strength (Bjork & Bjork, 1992) - -### Concept - -Separate **retrieval strength** R(t) from **storage strength** S(t). Currently, Spector uses a single decay curve based on age. The Two-Factor model captures a deeper truth: a memory's *accessibility* (can I recall it now?) and its *durability* (will it survive long-term?) are independent dimensions. - -### Biological Basis - -The New Theory of Disuse (Bjork & Bjork, 1992) explains several well-known memory phenomena: - -| Phenomenon | Explanation via R(t) and S(t) | -|:---|:---| -| **Spacing effect** | Spaced retrieval at low R(t) produces higher ΔS than massed retrieval at high R(t) | -| **Testing effect** | Active retrieval (low R(t)) boosts S(t) more than passive re-study | -| **Savings in relearning** | High S(t) memory with low R(t) relearns faster than a genuinely new memory | -| **Tip-of-the-tongue** | High S(t), very low R(t) — the memory is stored but temporarily inaccessible | - -### Mathematical Model - -**Retrieval strength** decays with time since last access: - -$$ -R(t) = e^{-\lambda / S(t) \cdot (t - t_{\text{last}})} -$$ - -Where $\lambda$ is the base decay rate (currently modeled by `DecayStrategy.ageToBucket()`). 
- -**Storage strength** increases at each retrieval, with the boost inversely proportional to R(t): - -$$ -\Delta S = S_{\text{gain}} \times (1 - R(t)) -$$ - -This creates the spacing effect: when R(t) is near 0 (memory is hard to retrieve), the storage boost is maximal. When R(t) is near 1 (memory is easily retrieved), the storage boost is minimal. - -### Visual Model - -```mermaid -graph LR - subgraph "Easy Retrieval (High R)" - E1["R(t) = 0.9"] --> E2["ΔS = 0.1 × S_gain"] - E2 --> E3["Low storage boost"] - end - - subgraph "Hard Retrieval (Low R)" - H1["R(t) = 0.1"] --> H2["ΔS = 0.9 × S_gain"] - H2 --> H3["High storage boost"] - end - - style E3 fill:#f39c12,color:white - style H3 fill:#27ae60,color:white -``` - -### Integration with Existing Header Layout - -The `storage_strength` field is already present in the V2 (48B) and V3 (64B) header layouts: - -``` -V2 Header Layout (48 bytes): - [32B core] — shared with V1 - [1B arousal] Offset 32 — emotional intensity - [3B padding] Offset 33 — alignment - [4B storage_str] Offset 36 — S(t) ← THIS FIELD - [8B reserved] Offset 40 — future use -``` - -**Current default:** `storage_strength = 1.0f` for all new memories. The field is written and read but not yet used in scoring. - -### Proposed Scoring Integration - -The current scoring formula: - -$$ -\text{score} = \alpha \cdot \text{similarity} + \beta \cdot \text{importance} \cdot \text{decay}(t) -$$ - -Would become: - -$$ -\text{score} = \alpha \cdot \text{similarity} + \beta \cdot \text{importance} \cdot R(t) \cdot S(t)^{0.3} -$$ - -Where $S(t)^{0.3}$ provides a gentle boost for well-stored memories without dominating the score. - -### Wiring into `reinforce()` - -The `reinforce()` path in `DefaultSpectorMemory` already updates valence and recall count. 
The Two-Factor update would add: - -```java -public void reinforce(String memoryId, byte valence) { - MemoryLocation loc = index.lookup(memoryId); - MemorySegment segment = tierRouter.segmentFor(loc.type()); - long offset = loc.offset(); - - // Existing: update valence - segment.set(LAYOUT_VALENCE, offset + OFFSET_VALENCE, valence); - - // Existing: increment recall count (atomic CAS) - int recallCount = incrementRecallCount(segment, offset); - - // NEW: Two-Factor update - if (layout.headerLayout().headerBytes() >= 48) { // V2+ - long timestamp = segment.get(LAYOUT_TIMESTAMP, offset + OFFSET_TIMESTAMP); - float currentS = segment.get(LAYOUT_STORAGE_STRENGTH, offset + OFFSET_STORAGE_STRENGTH); - - // Compute current R(t) - float ageFraction = DecayStrategy.decay( - DecayStrategy.ageToBucket(timestamp, System.currentTimeMillis())); - - // ΔS = S_gain × (1 - R(t)) — maximum boost when retrieval is hard - float deltaS = S_GAIN * (1.0f - ageFraction); - float newS = Math.min(currentS + deltaS, MAX_STORAGE_STRENGTH); - - segment.set(LAYOUT_STORAGE_STRENGTH, offset + OFFSET_STORAGE_STRENGTH, newS); - } -} -``` - -### Calibration Challenges - -| Parameter | Proposed Default | Notes | -|:---|:---|:---| -| $S_{\text{gain}}$ | 0.1 | Per-retrieval storage increment | -| $S_{\text{max}}$ | 5.0 | Cap to prevent runaway storage strength | -| $\lambda$ | 0.1 | Base decay rate | -| S(t) exponent in scoring | 0.3 | Gentle boost, prevents S domination | - -These need empirical calibration with real agent workloads. The key question: how quickly should storage strength accumulate to produce meaningful behavioral differences? 
- -### Dependencies & Complexity - -- **Dependencies:** V2+ header layout (`storage_strength` field) ✅ **Ready** — field exists and is read/written -- **Complexity:** Medium — formula is simple, calibration is the hard part -- **Risk:** Miscalibrated S_gain can cause "immortal" memories that never decay - ---- - -## Dynamic Quantization Stepping - -### Concept - -Auto-downgrade vector precision under memory pressure. When off-heap memory usage exceeds a configurable threshold, the system progressively reduces vector quantization from SQ8 (8-bit scalar) to SQ4 (4-bit scalar), trading a small amount of recall accuracy for 2× memory savings. - -### Biological Basis - -The brain performs a similar optimization — older memories are stored with less perceptual detail (lower precision) but retain their gist (semantic meaning). You remember *that* you had a great dinner, but not the exact flavors. The gist is sufficient for retrieval; the sensory detail is pruned. - -### Quantization Precision Impact - -| Format | Bits/Dim | Memory/Vector (768d) | Recall@10 Impact | -|:---|:---:|:---:|:---| -| FP32 | 32 | 3,072 bytes | Baseline | -| SQ8 (current) | 8 | 768 bytes | ~0.5% degradation | -| SQ4 (proposed) | 4 | 384 bytes | ~2-3% degradation | -| Binary | 1 | 96 bytes | ~8-12% degradation | - -### Pressure-Based Stepping - -```mermaid -flowchart TD - M["Monitor: off-heap usage"] --> C{"Usage > threshold?"} - C -->|"< 70%"| OK["Phase 0: Normal (SQ8)"] - C -->|"70-85%"| P1["Phase 1: SQ4 oldest 25%"] - C -->|"85-95%"| P2["Phase 2: SQ4 all non-pinned"] - C -->|"> 95%"| P3["Phase 3: Deep Sleep + aggressive prune"] - - P1 -.- S1["~12% memory saved
~0.5% recall impact"] - P2 -.- S2["~50% memory saved<br/>~2% recall impact"] - P3 -.- S3["Variable savings
memories permanently lost"] - - style OK fill:#27ae60,color:white - style P1 fill:#f39c12,color:white - style P2 fill:#e67e22,color:white - style P3 fill:#e74c3c,color:white -``` - -### SQ4 Encoding - -SQ4 packs two dimensions into a single byte using 4-bit uniform quantization: - -$$ -q_4(x) = \text{round}\left(\frac{x - \min}{\max - \min} \times 15\right) -$$ - -```java -/** - * Encodes two float values into a single byte (4 bits each). - */ -static byte encodeSQ4Pair(float v1, float v2, float min, float scale) { - int q1 = Math.clamp(Math.round((v1 - min) / scale * 15f), 0, 15); - int q2 = Math.clamp(Math.round((v2 - min) / scale * 15f), 0, 15); - return (byte) ((q1 << 4) | q2); -} - -/** - * Decodes a byte back to two approximate float values. - */ -static float[] decodeSQ4Pair(byte packed, float min, float scale) { - int q1 = (packed >> 4) & 0x0F; - int q2 = packed & 0x0F; - return new float[]{ - min + (q1 / 15f) * scale, - min + (q2 / 15f) * scale - }; -} -``` - -### Online Re-Quantization - -The critical engineering challenge: re-quantizing vectors **without locking the store**. The proposed approach: - -1. **Shadow copy:** Create a parallel SQ4 segment alongside the existing SQ8 segment -2. **Background conversion:** A background Virtual Thread re-quantizes records in batches of 1,000 -3. **Atomic swap:** Once complete, atomically update the `CognitiveRecordLayout` stride to use SQ4 offsets -4. **Lazy cleanup:** The old SQ8 bytes become dead space, reclaimed at next compaction - -```java -/** - * Re-quantizes a batch of records from SQ8 to SQ4 in-place. - * - * Thread safety: uses compare-and-swap on a "quantization version" byte - * in the header flags to prevent double-conversion. 
- */ -public int requantizeBatch(MemorySegment segment, int startRecord, - int batchSize, CognitiveRecordLayout layout) { - int converted = 0; - for (int i = startRecord; i < startRecord + batchSize; i++) { - long offset = (long) i * layout.stride(); - byte flags = segment.get(LAYOUT_FLAGS, offset + OFFSET_FLAGS); - - // Skip pinned, already-SQ4, or tombstoned - if (isPinned(flags) || isSQ4(flags) || isTombstoned(flags)) continue; - - // Read SQ8 vector, re-quantize to SQ4 - byte[] sq8 = readVector(segment, offset, layout); - byte[] sq4 = convertSQ8toSQ4(sq8); - - // Write SQ4 in-place (half the space) - writeVectorSQ4(segment, offset, layout, sq4); - - // Mark as SQ4 in flags (atomic CAS) - setQuantizationFlag(segment, offset, QUANT_SQ4); - converted++; - } - return converted; -} -``` - -### Mixed-Precision Scoring - -The `CognitiveScorer` must handle mixed SQ8/SQ4 segments: - -```java -// Phase 5: Vector distance — check quantization format per-record -byte flags = segment.get(LAYOUT_FLAGS, offset + OFFSET_FLAGS); -float l2dist; -if (isSQ4(flags)) { - l2dist = SimilarityFunction.EUCLIDEAN.computeSQ4FromSegment( - queryVector, segment, layout.vectorOffset(offset), - effectiveMins, effectiveScales, layout.quantizedVecBytes() / 2); -} else { - l2dist = SimilarityFunction.EUCLIDEAN.computeQuantizedFromSegment( - queryVector, segment, layout.vectorOffset(offset), - effectiveMins, effectiveScales, layout.quantizedVecBytes()); -} -``` - -### Dependencies & Complexity - -- **Dependencies:** ReflectDaemon Phase 0 (memory pressure monitoring), ScalarQuantizer SQ4 support (new) -- **Complexity:** High — online re-quantization without locking, mixed-precision scoring in the hot loop, SIMD kernel for SQ4 distance computation -- **Risk:** SQ4 distance computation is not yet SIMD-optimized; 4-bit unpacking adds ~30% overhead per distance call until a dedicated SIMD kernel is written - ---- - -## Priority Matrix - -| Feature | Value | Complexity | Dependencies Ready? 
| Estimated Effort | -|:---|:---:|:---:|:---:|:---| -| Two-Factor Memory (R+S) | 🟢 High | Medium | ✅ | 1-2 weeks | -| Executive Dysfunction | 🟡 Medium | Medium | ✅ | 1-2 weeks | -| Neuromodulatory Gain | 🟡 Medium | High | ⏳ | 3-4 weeks | -| Dynamic Quantization | 🟡 Medium | High | ⏳ | 4-6 weeks | - ---- - -## Contributing to Labs - -Labs features are developed on `labs/*` branches and are not merged to `main` until they graduate from experimental status. If you're interested in contributing: - -1. Check the [Contributing Guide](../operations/contributing.md) -2. Open an issue with the `labs` label describing which feature and your proposed approach -3. Branch from `main` as `labs/feature-name` -4. Labs branches have relaxed test coverage requirements (60% vs 80% for main) -5. Features graduate to `main` after passing a design review + benchmark validation diff --git a/docs/docs/memory/amygdala.md b/docs/docs/memory/amygdala.md deleted file mode 100644 index 963eec8..0000000 --- a/docs/docs/memory/amygdala.md +++ /dev/null @@ -1,139 +0,0 @@ ---- -title: "Amygdala — Emotional Valence" -description: "How ValenceTracker adds emotional coloring to memories — enabling agents to recall by mood, sentiment, and outcome quality." ---- - -# 😱 Amygdala — Emotional Valence - -> **Package**: `com.spectrayan.spector.memory.amygdala` -> -> **Biological Analog**: The **amygdala** is the brain's emotional processor. It assigns emotional significance to experiences — fear, joy, anger, relief — which profoundly influences how memories are encoded, stored, and retrieved. Emotionally charged memories are remembered more vividly and last longer. 
- ---- - -## The Concept - -Every memory in Spector carries a **valence score** — a single byte (`-128` to `+127`) representing its emotional coloring: - -| Range | Meaning | Examples | -|---|---|---| -| `-128` to `-50` | **Strongly negative** | Critical errors, data loss, security breaches | -| `-50` to `-10` | **Mildly negative** | Warnings, slow performance, minor bugs | -| `-10` to `+10` | **Neutral** | Factual information, routine operations | -| `+10` to `+50` | **Mildly positive** | Successful deployments, optimizations | -| `+50` to `+127` | **Strongly positive** | Major breakthroughs, user praise, goals achieved | - ---- - -## ValenceTracker - -The `ValenceTracker` manages emotional coloring of memories: - -```java -public final class ValenceTracker { - - /** - * Computes valence from text content analysis. - * Uses keyword-based sentiment detection with configurable weights. - */ - public byte computeValence(String text, MemorySource source) { - float score = 0f; - - // Source-based bias - if (source == MemorySource.PROCEDURAL) score += 0.1f; // Rules are slightly positive - if (source == MemorySource.OBSERVED && containsError(text)) score -= 0.5f; - - // Content-based sentiment - score += sentimentScore(text); - - // Clamp and convert to byte range - return (byte) Math.max(-128, Math.min(127, (int)(score * 127))); - } -} -``` - ---- - -## Valence-Filtered Recall - -The most powerful use of valence is in **recall filtering**. 
The `RecallOptions` builder supports valence range filtering: - -```java -// Recall only negative-outcome memories (for debugging) -List errors = memory.recall("database connection", - RecallOptions.builder() - .topK(10) - .maxValence((byte) -10) // Only negative memories - .build()); - -// Recall only positive outcomes (for best practices) -List successes = memory.recall("deployment strategy", - RecallOptions.builder() - .topK(5) - .minValence((byte) 10) // Only positive memories - .build()); -``` - -### Phase 3 — Valence Filter in CognitiveScorer - -Valence filtering happens at **Phase 3** of the scoring pipeline — before the expensive SIMD vector math: - -```java -// Phase 3: Valence Filter (~2 cycles) -byte valence = segment.get(LAYOUT_VALENCE, offset + OFFSET_VALENCE); -if (valence < minValence || valence > maxValence) continue; -``` - -**Cost**: 2 CPU cycles — a single byte read and two comparisons. Records outside the valence range are eliminated before Phase 5's ~200-cycle SIMD computation. - ---- - -## Use Cases - -### 1. Debugging: "What Went Wrong?" - -An agent can filter for negative-valence memories when debugging: - -```java -// "Show me only memories associated with failures" -memory.recall("connection timeout", - RecallOptions.builder() - .maxValence((byte) -10) - .synapticFilter("database", "error") - .build()); -``` - -### 2. Best Practices: "What Worked Well?" - -```java -// "Show me successful approaches" -memory.recall("deployment strategy", - RecallOptions.builder() - .minValence((byte) 10) - .synapticFilter("deployment") - .build()); -``` - -### 3. Balanced Recall: Full Emotional Range - -By default, no valence filter is applied — the agent sees the full emotional spectrum. The valence still influences recall indirectly because the `FlashbulbPolicy` pins emotionally intense memories at higher importance. 
- ---- - -## Storage - -Valence is stored in the 32-byte synaptic header at **offset 30** as a single signed byte: - -``` -Offset 30: [1B valence] — signed byte [-128 to +127] -``` - -This costs exactly **1 byte per memory** — negligible overhead for a powerful filtering dimension. - ---- - -## Next Steps - -- :material-link: [**Hebbian — Association Learning**](hebbian.md) — "neurons that fire together wire together" -- :material-head-cog: [**Dopamine — Surprise Detection**](dopamine.md) — auto-importance scoring -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — where valence filtering happens diff --git a/docs/docs/memory/api-reference.md b/docs/docs/memory/api-reference.md deleted file mode 100644 index 5fe4e96..0000000 --- a/docs/docs/memory/api-reference.md +++ /dev/null @@ -1,243 +0,0 @@ ---- -title: API Reference -description: "Complete API reference for SpectorMemory, RecallOptions, CognitiveResult, and related types." ---- - -# 📖 API Reference - ---- - -## SpectorMemory - -The main façade for all cognitive memory operations. 
- -### Builder - -```java -SpectorMemory memory = SpectorMemory.builder() - .dimensions(int) // Vector dimensionality (required) - .embeddingProvider(EmbeddingProvider) // Embedding provider (required) - .workingCapacity(int) // Working memory slots (default: 100) - .episodicPartitionCapacity(int) // Records per episodic partition (default: 10,000) - .semanticCapacity(int) // Semantic memory slots (default: 5,000) - .proceduralCapacity(int) // Procedural memory slots (default: 500) - .quantizer(ScalarQuantizer) // Custom quantizer (default: identity) - .persistenceDir(Path) // Episodic mmap directory (default: temp dir) - .build(); -``` - -### Core Methods - -| Method | Return Type | Description | -|---|---|---| -| `remember(id, text, type, source, tags...)` | `CompletableFuture` | Async ingestion — embeds, quantizes, stores, indexes | -| `recall(queryText, options)` | `List` | Parallel SIMD-accelerated recall with cognitive scoring | -| `forget(id)` | `void` | Tombstones a memory (permanent, excluded from all scans) | -| `suppress(id, reason)` | `void` | Suppresses from recall results (reversible) | -| `unsuppress(id)` | `void` | Removes suppression | -| `totalMemories()` | `int` | Total record count across all tiers | -| `introspect()` | `MemoryIntrospector` | Memory health analytics | -| `close()` | `void` | Releases all off-heap memory and file handles | - ---- - -## RecallOptions - -Builder for recall query configuration. - -```java -RecallOptions options = RecallOptions.builder() - .topK(int) // Max results (default: 10) - .synapticFilter(String... tags) // Bloom filter pre-screen - .minImportance(float) // Minimum importance [0.0-1.0] (default: 0.0) - .memoryTypes(MemoryType... 
types) // Tier filter (default: all) - .minValence(byte) // Min emotional valence (default: -128) - .maxValence(byte) // Max emotional valence (default: +127) - .alpha(float) // Similarity weight (default: 0.6) - .beta(float) // Importance × decay weight (default: 0.4) - .build(); -``` - -### Default Options - -```java -RecallOptions.DEFAULT // topK=10, no filters, alpha=0.6, beta=0.4 -``` - -### Scoring Formula - -$$\text{FinalScore} = \alpha \cdot \text{Similarity} + \beta \cdot \text{Importance} \cdot \text{Decay}$$ - -Where: - -- **Similarity** = `1 / (1 + L2_distance)` — semantic relevance -- **Importance** = `[0.0 - 1.0]` — computed by SurpriseDetector at ingestion -- **Decay** = precomputed bucket lookup based on memory age - ---- - -## CognitiveResult - -Immutable record returned by `recall()`: - -```java -public record CognitiveResult( - String id, // Unique memory identifier - String text, // Raw text content - float score, // Final cognitive score (after habituation) - float importance, // Original importance at ingestion - float ageDays, // Age in fractional days - short recallCount, // Times previously recalled - byte valence, // Emotional coloring [-128 to +127] - MemoryType memoryType, // Cognitive tier (WORKING/EPISODIC/SEMANTIC/PROCEDURAL) - MemorySource source, // Provenance (USER_STATED/OBSERVED/PROCEDURAL/...) 
- String[] synapticTags, // Decoded tag labels - float decayFactor, // Current temporal decay multiplier - float ltpAdjustedDecay // Decay after reconsolidation adjustment -) {} -``` - ---- - -## MemoryType - -Enum representing the four cognitive tiers: - -```java -public enum MemoryType { - WORKING, // Prefrontal Cortex — volatile circular buffer - EPISODIC, // Hippocampus — time-partitioned mmap - SEMANTIC, // Neocortex — permanent knowledge - PROCEDURAL // Basal Ganglia — learned procedures -} -``` - ---- - -## MemorySource - -Provenance tracking for memory origin: - -```java -public enum MemorySource { - USER_STATED, // Explicit user input - OBSERVED, // System observation (logs, events) - INFERRED, // AI inference - PROCEDURAL, // Rule or procedure - CONSOLIDATED // Created by sleep consolidation (ReflectDaemon) -} -``` - ---- - -## SynapticTagEncoder - -64-bit inline Bloom filter encoder: - -```java -// Encode tags into a Bloom filter -long mask = SynapticTagEncoder.encode("java", "debugging", "performance"); - -// Check if a record matches (containment check) -long recordTags = layout.readSynapticTags(segment, offset); -boolean matches = (recordTags & mask) == mask; - -// Match individual tag -boolean hasJava = SynapticTagEncoder.matches(recordTags, "java"); -``` - ---- - -## CognitiveRecordLayout - -Binary layout for the 32-byte header + quantized vector: - -```java -CognitiveRecordLayout layout = new CognitiveRecordLayout(quantizedVecBytes); - -// Record stride (header + vector) -int stride = layout.stride(); // e.g., 800 for 768-dim INT8 - -// Read/write header -CognitiveHeader header = layout.readHeader(segment, offset); -layout.writeHeader(segment, offset, header); - -// Read individual fields -long tags = layout.readSynapticTags(segment, offset); -float importance = layout.readImportance(segment, offset); - -// Merge tags (OR operation for co-activation) -layout.mergeSynapticTags(segment, offset, additionalTags); -``` - -### CognitiveHeader - -```java 
-public record CognitiveHeader( - long timestampMs, // Unix epoch milliseconds - long synapticTags, // 64-bit Bloom filter - float exactNorm, // L2 norm of original float vector - float importance, // Cognitive importance [0.0 – 1.0] - int centroidId, // IVF centroid assignment - short recallCount, // Reconsolidation counter - byte valence, // Emotional coloring - byte flags // Bit flags: [0] tombstone, [1] pinned -) {} -``` - ---- - -## ReflectReport - -Summary of a sleep consolidation cycle: - -```java -public record ReflectReport( - int partitionsProcessed, - int memoriesConsolidated, - int semanticMemoriesCreated, - long durationMs -) {} -``` - ---- - -## EpisodicPartition - -A single time-partitioned episodic memory file: - -```java -// Access partition data -int count = partition.count(); -int tombstoneCount = partition.tombstoneCount(); -float tombstoneRatio = partition.tombstoneRatio(); -PartitionState state = partition.state(); -MemorySegment segment = partition.segment(); -CognitiveRecordLayout layout = partition.layout(); - -// Lifecycle operations -partition.seal(); // Prevent further writes -partition.setState(PartitionState.REFLECTABLE); -partition.force(); // Flush to disk -partition.close(); // Release resources -``` - -### PartitionState - -```java -public enum PartitionState { - ACTIVE, // Accepting writes - SEALED, // Read-only, awaiting consolidation - REFLECTABLE, // Consolidation complete, eligible for pruning - TOMBSTONED, // High tombstone ratio, queued for compaction - COMPACTED // Rebuilt as dense partition -} -``` - ---- - -## Next Steps - -- :material-rocket: [**Getting Started**](getting-started.md) — set up in 5 minutes -- :material-brain: [**Architecture**](architecture.md) — how it all fits together -- :material-speedometer: [**Performance**](performance.md) — benchmark results diff --git a/docs/docs/memory/architecture.md b/docs/docs/memory/architecture.md deleted file mode 100644 index 630b686..0000000 --- 
a/docs/docs/memory/architecture.md +++ /dev/null @@ -1,263 +0,0 @@ ---- -title: System Architecture -description: "Package hierarchy, data flow, and extensibility model for Spector Memory." ---- - -# System Architecture - -Spector Memory is organized around a **biological metaphor** where each Java package corresponds to a brain region or cognitive mechanism. This isn't just naming — the architecture genuinely mirrors how biological memory systems interact. - ---- - -## Extensibility - -| Component | Extension point | What you can customize | -|---|---|---| -| `SpectorMemory` | Single entry point for all operations | Configure tiers, capacities, embedding providers | -| `TierStore` interface | Add new memory tiers | Implement the interface + register in `TierRouter` — no other changes needed | -| `AbstractTierStore` | Common tier lifecycle | Extend for new off-heap tier stores with Arena/segment management | -| `RecallListener` | Post-recall hooks | Add async listeners for co-activation tracking, logging, metrics | -| `CognitiveIngestionTarget` / `RecallPipeline` | Discrete processing steps | Each step is independently testable and replaceable | - ---- - -## Data Flow: Ingestion - -The ingestion pipeline is split across two layers: - -- **`IngestionPipeline`** (in `spector-ingestion`) — handles step 1 (embed) and chunking for large documents -- **`CognitiveIngestionTarget`** (in `spector-memory`) — handles steps 2–9 (synaptic encoding → WAL) - -```mermaid -sequenceDiagram - participant App as Application - participant SM as SpectorMemory - participant CT as CognitiveIngestionTarget - participant EP as EmbeddingProvider - participant SD as SurpriseDetector - participant FP as FlashbulbPolicy - participant SQ as ScalarQuantizer - participant TR as TierRouter - participant MI as MemoryIndex - participant WAL as MemoryWal - participant HG as HebbianGraph - participant TC as TemporalChain - participant EG as EntityGraph - - App->>SM: remember(id, text, type, tags) - 
SM->>CT: ingestCognitive(id, text, vector, type, tags, ...) - - Note over CT: Step 1: Embed (done by unified IngestionPipeline) - Note over CT: or via CognitiveIngestionTarget.ingestCognitive() - CT->>EP: embed(text) - EP-->>CT: float[4096] - - Note over CT: Step 2: Encode tags - CT->>CT: SynapticTagEncoder.encode(tags) → 64-bit Bloom - - Note over CT: Step 3: Surprise detection - CT->>SD: computeImportance(l2Norm) - SD-->>CT: importance (0.0 – 1.0) - - Note over CT: Step 4: Flashbulb check - CT->>FP: evaluate(zScore) - FP-->>CT: flashbulb? → pin + max importance - - Note over CT: Step 5: Quantize - CT->>SQ: encode(float[]) → byte[] - - Note over CT: Step 6: Build header - CT->>CT: CognitiveHeader(timestamp, tags, importance, ...) - - Note over CT: Step 7: Route & write - CT->>TR: write(type, header, quantized) - TR-->>CT: byte offset - - Note over CT: Step 8: Index - CT->>MI: register(id, location, text, source, tags) - - Note over CT: Step 9a: WAL - CT->>WAL: appendRemember(id, quantized) - - Note over CT: Step 9b: Hebbian edge strengthening - CT->>HG: strengthen(currentIdx, previousIdx, 1.0f) - - Note over CT: Step 9c: Temporal chain linking - CT->>TC: link(currentIdx, lastIdx, sessionId) - - Note over CT: Step 9d: Entity extraction & graph population - CT->>EG: addEntity() + linkToMemory() + addRelation() - - Note over CT: Step 10: Circadian check - CT->>CT: triggerReflectIfDue() -``` - -> [!NOTE] -> When ingestion comes through the unified `IngestionPipeline` (e.g., file ingestion), embedding (step 1) is handled by the pipeline itself. `CognitiveIngestionTarget.ingest()` receives a pre-embedded vector and executes steps 2–9. When called via `SpectorMemory.remember()`, `CognitiveIngestionTarget.ingestCognitive()` handles embedding internally. - -> [!NOTE] -> Steps 9b–9d are **gracefully degrading**: if any graph component is null (not configured) or throws, the step is skipped with a `log.warn()` and ingestion continues normally. 
- ---- - -## Data Flow: Recall - -The recall pipeline executes parallel tier scans using Virtual Threads: - -```mermaid -sequenceDiagram - participant App as Application - participant RP as RecallPipeline - participant EP as EmbeddingProvider - participant PS as ProspectiveScheduler - participant CT as ConcurrentTasks - participant CS as CognitiveScorer - participant SS as SuppressionSet - participant HP as HabituationPenalty - participant HG as HebbianGraph - participant TC as TemporalChain - participant EG as EntityGraph - - App->>RP: recall("query", options) - - Note over RP: Step 1: Embed query - RP->>EP: embed("query") - EP-->>RP: float[4096] - - Note over RP: Step 2: Prospective reminders - RP->>PS: collectDue() - PS-->>RP: due reminders - - Note over RP: Step 3: Parallel tier scanning - RP->>CT: forkJoinAll(scanTasks) - - par Working Memory - CT->>CS: score(workingSegment, ...) - and Episodic Partition 1 - CT->>CS: score(partition1, ...) - and Episodic Partition 2 - CT->>CS: score(partition2, ...) - and Semantic - CT->>CS: score(semanticSlab, ...) - and Procedural - CT->>CS: score(proceduralSegment, ...) - end - - CS-->>RP: List - - Note over RP: Step 4: Filter suppressed - RP->>SS: isSuppressed(id)? 
- - Note over RP: Step 5a: Habituation penalty - RP->>HP: recordAndComputePenalty(id) - - Note over RP: Step 5b: STDP causal boost - RP->>RP: CoActivationTracker.getPredictiveStrength() - - Note over RP: Step 5c: Hebbian spreading activation - RP->>HG: activateNeighbors(seedIdx, depth=2) - HG-->>RP: graph-activated memory indices - - Note over RP: Step 5d: Temporal chain extension - RP->>TC: followForward/Backward(idx, maxHops=3) - TC-->>RP: temporally-linked memory indices - - Note over RP: Step 5e: Entity graph traversal - RP->>EG: extract query entities → BFS 2-hop - EG-->>RP: entity-linked memory indices - - Note over RP: Step 6: Merge, dedup, sort → final top-K - RP-->>App: List - - Note over RP: Step 7: Async listeners (Virtual Thread) - RP->>RP: notify(HebbianListener, LtpListener) -``` - ---- - -## Package Dependency Graph - -```mermaid -graph LR - SM[SpectorMemory
Façade] --> CT[pipeline/
CognitiveIngestionTarget] - SM --> RP[pipeline/
RecallPipeline] - SM --> TR[cortex/
TierRouter] - SM --> MI[index/
MemoryIndex] - - CT --> EP[embed-api/
EmbeddingProvider] - CT --> SQ[core/
ScalarQuantizer] - CT --> SD[dopamine/
SurpriseDetector] - CT --> TR - CT --> MI - CT --> WAL[sync/
MemoryWal] - CT --> HG[hebbian/
HebbianGraph] - CT --> TC[temporal/
TemporalChain] - CT --> EG[graph/
EntityGraph] - CT --> EX[graph/
EntityExtractor] - - RP --> EP - RP --> CS[synapse/
CognitiveScorer] - RP --> TR - RP --> MI - RP --> SS[inhibition/
SuppressionSet] - RP --> HP[habituation/
HabituationPenalty] - RP --> HG - RP --> TC - RP --> EG - - CS --> SF[core/
SimilarityFunction] - CS --> DS[synapse/
DecayStrategy] - - TR --> WM[cortex/
WorkingMemoryStore] - TR --> EM[cortex/
EpisodicMemoryStore] - TR --> SE[cortex/
SemanticMemoryStore] - TR --> PR[cortex/
ProceduralMemoryStore] - - RP -.->|async| HL[pipeline/
HebbianListener] - RP -.->|async| LL[pipeline/
LtpListener] - - style SM fill:#4a90d9,color:white - style CS fill:#e74c3c,color:white - style TR fill:#2ecc71,color:white - style HG fill:#e74c3c,color:white - style EG fill:#9b59b6,color:white - style TC fill:#f39c12,color:white -``` - ---- - -## The 32-Byte Cognitive Record - -Every memory is stored as a fixed-size binary record in off-heap memory: - -``` -┌──────────────────────────────────────────────────────────┐ -│ 32-Byte Synaptic Header │ -├────────────┬──────────┬──────────┬────────┬──────────────┤ -│ timestamp │ synaptic │ exactNorm│ import │ centroidId │ -│ 8 bytes │ tags │ 4 bytes │ ance │ 4 bytes │ -│ (offset 0) │ 8 bytes │ (off 16) │ 4 bytes│ (offset 24) │ -│ │ (off 8) │ │(off 20)│ │ -├────────────┴──────────┴──────────┴────────┼──────┬───┬───┤ -│ │recall│val│flg│ -│ (continued) │count │enc│s │ -│ │2B │1B │1B │ -│ │off 28│o30│o31│ -├───────────────────────────────────────────┴──────┴───┴───┤ -│ Quantized Vector (N bytes) │ -│ INT8 values, 32-byte aligned │ -└──────────────────────────────────────────────────────────┘ -``` - -**Total record size** = 32 (header) + N (quantized vector bytes), aligned to 32 bytes. - -At 768 dimensions (INT8): **32 + 768 = 800 bytes/memory** — 50,000 memories fit in 40 MB of off-heap RAM. 
- ---- - -## Next Steps - -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — the SIMD hot-loop that makes it fast -- :material-share-variant: [**3-Layer Cognitive Graph**](hebbian.md) — Hebbian, Entity, and Temporal graphs -- :material-brain: [**Cortex — Tier Stores**](cortex.md) — the 4-tier memory architecture -- :material-memory: [**Off-Heap Panama Design**](panama-design.md) — zero-GC binary layout diff --git a/docs/docs/memory/biological-systems.md b/docs/docs/memory/biological-systems.md deleted file mode 100644 index c9c2ba4..0000000 --- a/docs/docs/memory/biological-systems.md +++ /dev/null @@ -1,230 +0,0 @@ ---- -title: "Biological Systems — Overview" -description: "How Spector Memory maps neuroscience concepts to code — a guided tour of the 12 cognitive subsystems and their biological foundations." ---- - -# 🧬 Biological Systems — Overview - -Spector Memory doesn't just borrow neuroscience *terminology* — it implements the actual **computational principles** behind biological memory. Each package in `spector-memory` corresponds to a distinct brain region or cognitive mechanism, implementing the mathematical models that neuroscientists have validated over decades of research. - ---- - -## The Brain–Code Mapping - -```mermaid -graph TB - subgraph "Encoding & Storage" - STE["🧩 Synapse
Synaptic Tags & Scoring
Bloom filter + binary layout"] - CT["🧠 Cortex
4-Tier Memory Stores
Working → Episodic → Semantic → Procedural"] - end - - subgraph "Emotional & Importance Modulation" - DA["⚡ Dopamine
Surprise Detection
Welford Z-score → importance"] - AM["❤️ Amygdala
Emotional Valence
-128 to +127 coloring"] - end - - subgraph "Retrieval Dynamics" - HB["🛑 Habituation
Anti-Filter Bubble
Repetition penalty"] - IN["🚫 Inhibition
Suppression Set
Inhibition of return"] - IF["🔀 Interference
Deduplication
Proactive/retroactive"] - end - - subgraph "Association & Learning" - HE["🔗 3-Layer Cognitive Graph
Hebbian + Entity + Temporal
Off-heap graph structures"] - end - - subgraph "Consolidation & Planning" - HP["💤 Hippocampus
Sleep Consolidation
ReflectDaemon cycle"] - PR["📋 Prospective
Future Intents
Scheduled reminders"] - MM["🔍 Metamemory
Self-Reflection
Confidence calibration"] - end - - DA --> STE - AM --> STE - STE --> CT - CT --> HE - HE --> HP - - style HE fill:#e74c3c,color:white - style DA fill:#f39c12,color:white - style HP fill:#9b59b6,color:white -``` - ---- - -## Systems at a Glance - -| System | Brain Region | Key Concept | Spector Implementation | Reference | -|---|---|---|---|---| -| [**Cortex**](cortex.md) | Prefrontal, Hippocampus, Neocortex, Basal Ganglia | Multi-store memory model | 4-tier off-heap stores (Working, Episodic, Semantic, Procedural) | Atkinson & Shiffrin, 1968[^1] | -| [**Synapse**](synapse.md) | Synaptic junction | Synaptic tagging & capture | 64-bit Bloom filter tag encoding, 32B binary header | Frey & Morris, 1997[^2] | -| [**Dopamine**](dopamine.md) | Ventral tegmental area | Prediction error signaling | Welford Z-score surprise detection, flashbulb encoding | Schultz, 1997[^3] | -| [**Amygdala**](amygdala.md) | Amygdala | Emotional memory modulation | Signed valence byte (-128 to +127), emotional filtering | McGaugh, 2004[^4] | -| [**3-Layer Graph**](hebbian.md) | Cortical networks, Hippocampus | Hebbian learning, STDP, episodic sequences | Off-heap HebbianGraph, EntityGraph, TemporalChain | Hebb, 1949[^5]; Bi & Poo, 2001[^6] | -| [**Habituation**](habituation.md) | Sensory cortex | Response decrement to repetition | Exponential penalty on repeated recall | Thompson & Spencer, 1966[^7] | -| [**Inhibition**](inhibition.md) | Prefrontal cortex | Inhibition of return | SuppressionSet with TTL-based suppression windows | Klein, 2000[^8] | -| [**Interference**](interference.md) | Hippocampus | Proactive/retroactive interference | Similarity-based deduplication during ingestion | Underwood, 1957[^9] | -| [**Hippocampus**](hippocampus.md) | Hippocampus | Sleep consolidation & replay | ReflectDaemon: decay, compaction, episodic→semantic promotion | Rasch & Born, 2013[^10] | -| [**Prospective**](prospective.md) | Prefrontal cortex | Prospective memory | Scheduled future intent reminders 
| Einstein & McDaniel, 1990[^11] | -| [**Metamemory**](metamemory.md) | Prefrontal cortex | Metacognitive monitoring | Confidence calibration, recall quality estimation | Nelson & Narens, 1990[^12] | -| [**Sync**](sync.md) | — (engineering) | Persistence & replication | WAL + mmap-backed partitions | — | - ---- - -## Key Mathematical Models - -### Temporal Decay (Ebbinghaus Forgetting Curve) - -Spector approximates the exponential forgetting curve using precomputed decay buckets — avoiding expensive `Math.exp()` calls in the hot loop: - -$$R(t) = e^{-\lambda t / S}$$ - -Where $R(t)$ is retrieval strength, $\lambda$ is the decay rate, $t$ is time since encoding, and $S$ is storage strength. Spector discretizes this into 9 buckets (see [Scoring Pipeline](scoring-pipeline.md)). - -> **Reference**: Ebbinghaus, H. (1885). *Über das Gedächtnis*[^13] - -### Reconsolidation (Spacing Effect) - -Each recall shifts the decay bucket backward, simulating how retrieved memories become more durable: - -$$\text{adjustedBucket} = \text{rawBucket} - \lfloor \text{recallCount} / 3 \rfloor$$ - -> **Reference**: Bjork & Bjork (1992). *A New Theory of Disuse*[^14] - -### Surprise Detection (Dopamine Prediction Error) - -The importance signal uses a Z-score from Welford's online statistics: - -$$\text{importance} = \alpha \cdot \sigma\left(\frac{x - \mu}{\sigma}\right) + \beta \cdot \text{temporalNovelty}$$ - -Where the outer $\sigma(\cdot)$ is the sigmoid function, the $\mu$ and $\sigma$ inside the fraction are the running mean and standard deviation of the observed value $x$ (so the fraction is the Z-score), $\alpha = 0.6$, and $\beta = 0.4$. - -> **Reference**: Schultz, W. (1997). *A neural substrate of prediction and reward*[^3] - -### Hebbian Edge Strengthening - -Co-ingested memories strengthen their bidirectional edge: - -$$w_{ij}(t+1) = w_{ij}(t) + \Delta w$$ - -With decay during consolidation: $w_{ij}(t+1) = 0.9 \cdot w_{ij}(t)$ - -> **Reference**: Hebb, D.O. (1949). 
*The Organization of Behavior*[^5] - -### STDP — Spike-Timing Dependent Plasticity - -Directed causal edges are strengthened when tag A is recalled *before* tag B: - -$$\Delta w = \begin{cases} A_+ \cdot e^{-\Delta t / \tau_+} & \text{if } \Delta t > 0 \text{ (causal)} \\ -A_- \cdot e^{\Delta t / \tau_-} & \text{if } \Delta t < 0 \text{ (anti-causal)} \end{cases}$$ - -> **Reference**: Bi & Poo (2001). *Synaptic modification by correlated activity*[^6] - -### Habituation Penalty - -Repeated recall of the same memory incurs a penalty that grows with each repetition and saturates toward 1: - -$$\text{penalty}(n) = 1 - e^{-\gamma \cdot n}$$ - -Where $n$ is the number of times the memory appeared in recent results and $\gamma$ controls penalty steepness. - -> **Reference**: Thompson & Spencer (1966). *Habituation: A model phenomenon*[^7] - ---- - -## Design Principles - -1. **Fidelity to neuroscience**: Each system implements a real cognitive mechanism, not just a metaphor. The mathematical models are drawn from peer-reviewed research. - -2. **Independent testability**: Each biological system is a standalone package with its own unit tests. Systems compose via dependency injection, not inheritance. - -3. **Graceful degradation**: Every system is optional. Disabling surprise detection, habituation, or graph augmentation produces a functional (if less intelligent) memory system. - -4. **Performance-first biology**: Biological accuracy is constrained by microsecond latency requirements. Where exact models are too expensive (e.g., continuous exponential decay), we use precomputed approximations (decay buckets, Bloom filter tags). - ---- - -## Explore Each System - -
- -- :material-brain:{ .lg .middle } **Cortex — Tier Stores** - - --- - - Working, Episodic, Semantic, and Procedural memory tiers - - [:octicons-arrow-right-24: Cortex](cortex.md) - -- :material-flash:{ .lg .middle } **Synapse — Tags & Scoring** - - --- - - Bloom filter encoding, binary layout, 6-phase scorer - - [:octicons-arrow-right-24: Synapse](synapse.md) - -- :material-head-lightning-bolt:{ .lg .middle } **Dopamine — Surprise** - - --- - - Welford Z-score, flashbulb encoding, importance scoring - - [:octicons-arrow-right-24: Dopamine](dopamine.md) - -- :material-heart:{ .lg .middle } **Amygdala — Valence** - - --- - - Emotional coloring, valence-based filtering - - [:octicons-arrow-right-24: Amygdala](amygdala.md) - -- :material-share-variant:{ .lg .middle } **3-Layer Cognitive Graph** - - --- - - Hebbian, Entity-Relationship, and Temporal Causal graphs - - [:octicons-arrow-right-24: Cognitive Graph](hebbian.md) - -- :material-sleep:{ .lg .middle } **Hippocampus — Consolidation** - - --- - - Sleep cycles, decay, episodic-to-semantic promotion - - [:octicons-arrow-right-24: Hippocampus](hippocampus.md) - -
- ---- - -## References - -[^1]: Atkinson, R.C. & Shiffrin, R.M. (1968). Human memory: A proposed system and its control processes. In *Psychology of Learning and Motivation*, 2, 89–195. [doi:10.1016/S0079-7421(08)60422-3](https://doi.org/10.1016/S0079-7421(08)60422-3) - -[^2]: Frey, U. & Morris, R.G.M. (1997). Synaptic tagging and long-term potentiation. *Nature*, 385, 533–536. [doi:10.1038/385533a0](https://doi.org/10.1038/385533a0) - -[^3]: Schultz, W. (1997). A neural substrate of prediction and reward. *Science*, 275(5306), 1593–1599. [doi:10.1126/science.275.5306.1593](https://doi.org/10.1126/science.275.5306.1593) - -[^4]: McGaugh, J.L. (2004). The amygdala modulates the consolidation of memories of emotionally arousing experiences. *Annual Review of Neuroscience*, 27, 1–28. [doi:10.1146/annurev.neuro.27.070203.144157](https://doi.org/10.1146/annurev.neuro.27.070203.144157) - -[^5]: Hebb, D.O. (1949). *The Organization of Behavior: A Neuropsychological Theory*. New York: Wiley. - -[^6]: Bi, G. & Poo, M. (2001). Synaptic modification by correlated activity: Hebb's postulate revisited. *Annual Review of Neuroscience*, 24, 139–166. [doi:10.1146/annurev.neuro.24.1.139](https://doi.org/10.1146/annurev.neuro.24.1.139) - -[^7]: Thompson, R.F. & Spencer, W.A. (1966). Habituation: A model phenomenon for the study of neuronal substrates of behavior. *Psychological Review*, 73(1), 16–43. [doi:10.1037/h0022681](https://doi.org/10.1037/h0022681) - -[^8]: Klein, R.M. (2000). Inhibition of return. *Trends in Cognitive Sciences*, 4(4), 138–147. [doi:10.1016/S1364-6613(00)01452-2](https://doi.org/10.1016/S1364-6613(00)01452-2) - -[^9]: Underwood, B.J. (1957). Interference and forgetting. *Psychological Review*, 64(1), 49–60. [doi:10.1037/h0044616](https://doi.org/10.1037/h0044616) - -[^10]: Rasch, B. & Born, J. (2013). About sleep's role in memory. *Physiological Reviews*, 93(2), 681–766. 
[doi:10.1152/physrev.00032.2012](https://doi.org/10.1152/physrev.00032.2012) - -[^11]: Einstein, G.O. & McDaniel, M.A. (1990). Normal aging and prospective memory. *Journal of Experimental Psychology: Learning, Memory, and Cognition*, 16(4), 717–726. [doi:10.1037/0278-7393.16.4.717](https://doi.org/10.1037/0278-7393.16.4.717) - -[^12]: Nelson, T.O. & Narens, L. (1990). Metamemory: A theoretical framework and new findings. In *Psychology of Learning and Motivation*, 26, 125–173. [doi:10.1016/S0079-7421(08)60053-5](https://doi.org/10.1016/S0079-7421(08)60053-5) - -[^13]: Ebbinghaus, H. (1885). *Über das Gedächtnis: Untersuchungen zur experimentellen Psychologie*. Leipzig: Duncker & Humblot. English translation: *Memory: A Contribution to Experimental Psychology* (1913). - -[^14]: Bjork, R.A. & Bjork, E.L. (1992). A new theory of disuse and an old theory of stimulus fluctuation. In *From Learning Processes to Cognitive Processes: Essays in Honor of William K. Estes*, 2, 35–67. diff --git a/docs/docs/memory/cognitive-profiles.md b/docs/docs/memory/cognitive-profiles.md deleted file mode 100644 index e3b90e4..0000000 --- a/docs/docs/memory/cognitive-profiles.md +++ /dev/null @@ -1,266 +0,0 @@ -# Cognitive Profiles - -Cognitive profiles are **pre-configured scoring presets** that modulate how the memory system prioritizes, retrieves, and consolidates information. They act as a thalamic filter — adjusting the balance between similarity-driven and importance-driven recall to match different task contexts. - -## How Profiles Work - -Every recall query is scored using the **fused cognitive score** formula: - -$$ -\text{score} = \alpha \cdot \text{similarity} + \beta \cdot \text{importance} \cdot \text{decay} -$$ - -Where: - -- **α (alpha)** — Weight on vector similarity (how close is this memory to the query?) -- **β (beta)** — Weight on learned importance (how important was this memory at ingestion?) 
-- **α + β = 1.0** — Always normalized - -A profile sets α, β, and optional modifiers (hyperfocus boost, lateral mode, episode pinning) to bias the scoring pipeline for a specific cognitive strategy. - -## Built-in Profiles - -### Standard Profiles - -| Profile | α | β | Valence Filter | Best For | -|:---|:---:|:---:|:---:|:---| -| `BALANCED` | 0.6 | 0.4 | All | General-purpose recall | -| `EXPLORING` | 0.8 | 0.2 | All | Broad discovery, creative exploration | -| `DEBUGGING` | 0.3 | 0.7 | Negative only (≤ -10) | Precise error-matching, diagnostic search | -| `RECALLING` | 0.4 | 0.6 | Positive only (≥ +10) | Retrieving proven solutions and successes | -| `CRITICAL` | 0.2 | 0.8 | All | Security audits, compliance checks, high-stakes | - -### Advanced Profiles — Neurodivergent - -These profiles go beyond α/β tuning — they activate specialized scoring mechanics in the [6-Phase Pipeline](scoring-pipeline.md) and model specific neurocognitive patterns. - -| Profile | α | β | Biological Analog | Special Mechanics | -|:---|:---:|:---:|:---|:---| -| `HYPERFOCUS` | 1.0 | 0.0 | Monotropism | [Focus Mode](focus-mode.md) — Zero decay, strict tag gate, boost multiplier | -| `SYSTEMATIZER` | 0.3 | 0.7 | Bottom-up processing (autism) | [Systemizer](focus-mode.md#systemizer) — Pins source episodes during consolidation | -| `DIVERGENT` | 0.8 | 0.2 | Reduced Latent Inhibition (ADHD) | [Explorer](lateral-retrieval.md) — Lateral cross-domain retrieval | -| `PARANOID_SENTINEL` | 0.2 | 0.8 | Amygdala threat-detection | Negative-only valence, mood-congruent threat recall | -| `THE_EXECUTOR` | 0.3 | 0.7 | Prefrontal executive function | Heaviside Cliff (strictness=10.0), no lateral retrieval | -| `HIGHLY_SENSITIVE` | 0.7 | 0.3 | Sensory Processing Sensitivity | Low flashbulb threshold, strong lateral inhibition | -| `DEFAULT_MODE_NETWORK` | 0.2 | 0.8 | Brain's resting state network | Skips Working + Episodic, Semantic + Procedural only | - ---- - -## New Profile Deep Dives - -### 
PARANOID_SENTINEL — Amygdala Threat Detection - -**Biological analog:** The amygdala's threat-detection circuitry, which filters sensory input for potential dangers and amplifies recall of negative experiences (mood-congruent memory bias). - -**Use case:** SRE agents, security auditors, compliance monitors. Only surfaces memories associated with negative outcomes — errors, failures, security incidents, regressions. - -```java -PARANOID_SENTINEL(0.2f, 0.8f, Byte.MIN_VALUE, (byte) -1) -// α β minValence maxValence -``` - -**How it works:** - -- **Valence range [-128, -1]:** Only negative memories pass the valence filter in Phase 3 of the scorer. Successes, neutral logs, and positive outcomes are invisible. -- **α=0.2, β=0.8:** Importance-dominated — the severity of the past failure matters more than how closely it matches the current query. -- **Valence alignment:** Query valence is set to -128 (maximum threat), triggering mood-congruent recall amplification. - -!!! example "Scenario" - Agent query: "deployment configuration" → BALANCED returns general config docs. PARANOID_SENTINEL returns only the config-related incidents: the time a bad config caused a 4-hour outage, the security CVE from an exposed config file, the memory leak from misconfigured thread pool. - -### THE_EXECUTOR — Prefrontal Executive Function - -**Biological analog:** The prefrontal cortex in full executive function mode — goal-directed, no tangential exploration, pure task completion. - -**Use case:** Devin-style agentic task runners. Combined with Zeigarnik Effect (`markUnresolved()`) for tracking open tasks that resist decay. 
- -```java -THE_EXECUTOR(0.3f, 0.7f, Byte.MIN_VALUE, Byte.MAX_VALUE) -// + strictnessCoefficient = 10.0 -// + lateralMode = false -``` - -**How it works:** - -- **Heaviside Cliff scoring:** The strictness coefficient reshapes the similarity curve into a cliff function: - -$$ -\text{similarity} = \frac{1}{1 + d_{L2} \times 10.0} -$$ - -At strictness=1.0 (default), this is a gentle hyperbola. At strictness=10.0, it's a **cliff** — 95% of candidates score near zero, and only the closest matches survive. - -- **Lateral retrieval disabled:** No DIVERGENT-style cross-domain exploration. Results must be directly relevant. -- **Zeigarnik integration:** Unresolved tasks (flagged via `markUnresolved()`) resist time-decay entirely — their decay bucket is clamped to 0. - -### HIGHLY_SENSITIVE — Sensory Processing Sensitivity - -**Biological analog:** Enhanced sensory processing depth (Aron & Aron, 1997). The highly sensitive brain processes stimuli more deeply, captures finer environmental details, and has a lower threshold for emotional activation. - -```java -HIGHLY_SENSITIVE(0.7f, 0.3f, Byte.MIN_VALUE, Byte.MAX_VALUE) -// + flashbulbThreshold = 2.0 (default: 3.0) -// + inhibitionFloor = 0.3 (stronger lateral inhibition) -// + minImportance = 0.01 -``` - -**How it works:** - -- **Lower flashbulb threshold (2.0 vs 3.0):** Captures more "important" moments as flashbulb memories. Events that BALANCED would consider routine, HIGHLY_SENSITIVE pins permanently. -- **Stronger lateral inhibition (0.3 floor):** Less interference between memories. Each memory maintains its distinctiveness rather than blurring with similar neighbors. -- **minImportance=0.01:** Nothing is too small to remember. Subtle signals that other profiles would round down to zero are preserved. -- **α=0.7:** Similarity-leaning — captures nuanced matches that importance-dominated profiles would miss. - -!!! 
tip "Ideal for" - Medical reasoning, quality assurance, code review, accessibility testing — anywhere subtle signals could be critical. - -### DEFAULT_MODE_NETWORK — "Shower Thoughts" - -**Biological analog:** The brain's default mode network (DMN), which activates during rest, mind-wandering, and unfocused cognition. The DMN surfaces deep, consolidated knowledge rather than recent events. - -```java -DEFAULT_MODE_NETWORK(0.2f, 0.8f, Byte.MIN_VALUE, Byte.MAX_VALUE) -// + memoryTypes = {SEMANTIC, PROCEDURAL} -// + skipTiers = {WORKING, EPISODIC} -``` - -**How it works:** - -- **Skips Working and Episodic tiers entirely.** Only Semantic (consolidated facts) and Procedural (learned procedures) are searched. -- **α=0.2, β=0.8:** Importance-dominated. The DMN isn't looking for direct matches — it surfaces whatever the agent "knows deeply" about a topic. -- **No recency bias:** Since Episodic is skipped, all results are from long-term consolidated memory. No "what happened today" noise. - -!!! example "Scenario" - Agent is stuck on a performance problem → switches to DEFAULT_MODE_NETWORK → surfaces a deep architectural principle from 3 months ago that reframes the problem entirely. This is the computational equivalent of "sleeping on it." 
- ---- - -## Usage - -### Via CognitiveProfile Enum - -```java -// Simple: use a profile preset -List<CognitiveResult> results = memory.recall("database deadlock", CognitiveProfile.HYPERFOCUS); -``` - -### Via RecallOptions Builder - -```java -// Advanced: profile + custom overrides -var options = RecallOptions.builder() - .profile(CognitiveProfile.DIVERGENT) - .topK(20) - .lateralDistanceThreshold(1.5f) // override default - .build(); - -List<CognitiveResult> results = memory.recall("performance optimization", options); -``` - -### Via MCP Tool - -The `recall_context` MCP tool accepts a `profile` parameter: - -```json -{ - "name": "recall_context", - "arguments": { - "query": "database deadlock", - "profile": "HYPERFOCUS", - "top_k": 10 - } -} -``` - ---- - -## Profile Selection Guide - -```mermaid -flowchart TD - A["What is the agent doing?"] --> B{"Focused on\none topic?"} - B -->|Yes| C{"Need encyclopedic\ndetail?"} - C -->|Yes| D["SYSTEMATIZER"] - C -->|No| E["HYPERFOCUS"] - B -->|No| F{"Exploring new\nterritory?"} - F -->|Yes| G{"Want cross-domain\ninsights?"} - G -->|Yes| H["DIVERGENT"] - G -->|No| I["EXPLORING"] - F -->|No| J{"Task execution\nor debugging?"} - J -->|"Executing tasks"| J2["THE_EXECUTOR"] - J -->|"Debugging"| K["DEBUGGING"] - J -->|"Threat hunting"| M["PARANOID_SENTINEL"] - J -->|"Need deep insight"| N["DEFAULT_MODE_NETWORK"] - J -->|"Detail-sensitive"| O["HIGHLY_SENSITIVE"] - J -->|No| L["BALANCED"] -``` - ---- - -## Agent Self-Extension - -Agents can dynamically switch profiles during a conversation: - -1. **Start with `BALANCED`** for general context -2. **Switch to `HYPERFOCUS`** when a specific topic is identified (e.g., user mentions "database deadlock") -3. **Switch to `DIVERGENT`** when stuck — lateral results may surface unexpected solutions -4. 
**Switch to `SYSTEMATIZER`** when building a comprehensive knowledge base - -The `HyperfocusState` object supports TTL-based activation with agent self-extension: - -```java -// Agent detects a focused topic -memory.hyperfocusState().activateFromTags("database", "deadlock"); - -// Agent extends focus when the topic continues -memory.hyperfocusState().extend(); - -// Focus automatically expires after TTL (default: 30 minutes) -``` - ---- - -## Custom Profiles - -You can create custom profiles by using `RecallOptions.builder()` directly: - -```java -var customProfile = RecallOptions.builder() - .alpha(0.9f) - .beta(0.1f) - .hyperfocusMask("java", "concurrency") - .hyperfocusBoost(2.0f) - .lateralMode(false) - .build(); -``` - ---- - -## Result Metadata - -Each `CognitiveResult` carries a `RetrievalMode` indicating how it was retrieved: - -| Mode | Meaning | -|:---|:---| -| `STANDARD` | Normal similarity + importance scoring | -| `LATERAL` | Cross-domain retrieval via the Explorer dual-heap | -| `HYPERFOCUS` | Tag-matched with zero decay and boost multiplier | - -```java -for (CognitiveResult r : results) { - if (r.isLateral()) { - // Cross-domain insight — consider carefully - } else if (r.isHyperfocused()) { - // Focused match — high confidence - } -} -``` - -## What's Next - -- [Focus Mode](focus-mode.md) — Deep dive on HYPERFOCUS and SYSTEMATIZER -- [Explorer — Lateral Retrieval](lateral-retrieval.md) — Cross-domain dual-heap mechanics -- [Importance Fusion (ICNU)](importance-fusion.md) — Sigmoid-gated importance with dopaminergic I×N interaction -- [Synapse — Tags & Scoring](synapse.md) — Versioned header layouts (V1/V2/V3) and arousal-modulated decay -- [Hebbian — Association Learning](hebbian.md) — STDP with directed causal edges -- [Labs — Research Roadmap](../labs/roadmap.md) — Neuromodulatory Gain, Executive Dysfunction Profile diff --git a/docs/docs/memory/cortex.md b/docs/docs/memory/cortex.md deleted file mode 100644 index 59ef66c..0000000 --- 
a/docs/docs/memory/cortex.md +++ /dev/null @@ -1,223 +0,0 @@ ---- -title: "Cortex — Tier Stores" -description: "The 4-tier cognitive memory architecture: Working, Episodic, Semantic, and Procedural — each modeled after a brain region." ---- - -# 🧠 Cortex — Tier Stores - -> **Package**: `com.spectrayan.spector.memory.cortex` -> -> **Biological Analog**: The **Cerebral Cortex** — the outer layer of the brain responsible for higher-order cognitive functions. Different cortical regions specialize in different types of memory. - ---- - -## The 4-Tier Architecture - -Human memory is not a single system. Cognitive science identifies distinct memory systems with different characteristics, durations, and purposes. Spector mirrors this with four tier stores: - -```mermaid -graph TB - subgraph "TierRouter — Polymorphic Registry" - direction TB - TR["TierStore interface"] - end - - TR --> WM["🧪 Working Memory
WorkingMemoryStore
━━━━━━━━━━━━━━━━━
Prefrontal Cortex
Volatile circular buffer
~100 records"] - TR --> EM["📝 Episodic Memory
EpisodicMemoryStore
━━━━━━━━━━━━━━━━━
Hippocampus
Time-partitioned mmap
Unbounded"] - TR --> SE["🧬 Semantic Memory
SemanticMemoryStore
━━━━━━━━━━━━━━━━━
Neocortex
Permanent knowledge
~5,000 records"] - TR --> PR["⚙️ Procedural Memory
ProceduralMemoryStore
━━━━━━━━━━━━━━━━━
Basal Ganglia
Learned procedures
~500 records"] - - style WM fill:#e74c3c,color:white - style EM fill:#3498db,color:white - style SE fill:#2ecc71,color:white - style PR fill:#9b59b6,color:white -``` - ---- - -## TierStore Interface - -All four stores implement a common `TierStore` interface, enabling polymorphic dispatch in the router: - -```java -public interface TierStore extends AutoCloseable { - MemoryType type(); - int size(); - CognitiveRecordLayout layout(); - MemorySegment primarySegment(); - long write(CognitiveHeader header, byte[] quantizedVec); -} -``` - -The `TierRouter` holds an `EnumMap` and dispatches all operations polymorphically: - -```java -// Zero switch statements — polymorphic dispatch -public long write(MemoryType type, CognitiveHeader header, byte[] quantized) { - return get(type).write(header, quantized); -} -``` - -> Adding a new tier (e.g., `FLASH` for ultra-fast scratch memory) requires only: (1) implement `TierStore`, (2) register in `TierRouter`. No changes needed in `SpectorMemory`, `RecallPipeline`, or `CognitiveIngestionTarget`. - ---- - -## AbstractTierStore - -Three of the four stores (Working, Semantic, Procedural) extend `AbstractTierStore`, which provides: - -- **Arena lifecycle**: `Arena.ofShared()` for thread-safe off-heap access -- **Segment allocation**: 32-byte aligned via `arena.allocate(bytes, 32)` -- **Layout creation** from quantized vector byte count -- **Capacity tracking** and size reporting -- **Close/cleanup** lifecycle - -`EpisodicMemoryStore` implements `TierStore` directly because it uses mmap-backed partitions rather than a single Arena-allocated segment. - ---- - -## 🧪 Working Memory (Prefrontal Cortex) - -**Biological Analog**: The **Prefrontal Cortex** maintains a limited workspace for active processing. It holds ~7±2 items in biological systems. 
- -| Property | Value | -|---|---| -| Storage | `Arena.ofShared()` volatile segment | -| Capacity | Configurable (default: 100) | -| Eviction | Circular buffer — oldest entries overwritten | -| Persistence | **None** — lost on JVM shutdown | -| Use case | Current task context, recent conversation | - -```java -// Circular buffer write -public long write(CognitiveHeader header, byte[] quantizedVec) { - long offset = (long) (count % capacity) * layout.stride(); - layout.writeHeader(segment, offset, header); - MemorySegment.copy(MemorySegment.ofArray(quantizedVec), 0, - segment, layout.vectorOffset(offset), quantizedVec.length); - count++; - return offset; -} -``` - -**Special capability**: Synaptic tag search without vector math. WorkingMemoryStore supports a `findByTag(mask)` method that scans only the 64-bit Bloom filter field — useful for fast context lookups. - ---- - -## 📝 Episodic Memory (Hippocampus) - -**Biological Analog**: The **Hippocampus** encodes autobiographical events as time-ordered traces. New events are appended rapidly (one-trial learning), and during sleep the hippocampus replays sequences for consolidation into cortical memory. - -| Property | Value | -|---|---| -| Storage | `FileChannel.map()` mmap-backed files | -| Capacity | Unbounded (1 partition per day, each up to 10,000 records) | -| Eviction | Tombstone + compaction | -| Persistence | **Full** — survives JVM restarts | -| Use case | "What error did we debug yesterday?", "What did the user say last week?" | - -### Partition Lifecycle - -Each episodic partition is a memory-mapped file with a 64-byte metadata header: - -``` -┌─── Partition File ─────────────────────────────────────────┐ -│ [64B Metadata Header] │ -│ ├── 4B magic (0x45504943 = "EPIC") │ -│ ├── 4B version (1) │ -│ ├── 4B count (live records) │ -│ ├── 4B tombstoneCount │ -│ ├── 4B capacity │ -│ ├── 4B state (ACTIVE/SEALED/REFLECTABLE/TOMBSTONED/...) 
│ -│ ├── 4B stride │ -│ └── 36B reserved │ -├── [Record 0: 32B header + NB vector] ──────────────────────┤ -├── [Record 1: 32B header + NB vector] ──────────────────────┤ -│ ... │ -└── [Record N-1] ───────────────────────────────────────────┘ -``` - -**Partition state machine**: - -```mermaid -stateDiagram-v2 - [*] --> ACTIVE: Create partition - ACTIVE --> SEALED: Day rolls over - SEALED --> REFLECTABLE: ReflectDaemon marks eligible - REFLECTABLE --> TOMBSTONED: High tombstone ratio - TOMBSTONED --> COMPACTED: TombstoneCompactor rebuilds - COMPACTED --> [*]: Old partition swapped out -``` - ---- - -## 🧬 Semantic Memory (Neocortex) - -**Biological Analog**: The **Neocortex** stores distilled, permanent world knowledge — facts, concepts, and generalized rules extracted from repeated experience. - -| Property | Value | -|---|---| -| Storage | Header-only slab (`Arena.ofShared()`) | -| Capacity | Configurable (default: 5,000) | -| Eviction | None (permanent) | -| Persistence | Via WAL replay | -| Use case | "The user prefers dark mode", "Java uses garbage collection" | - -!!! info "Header-Only Storage" - Semantic memories store only the 32-byte synaptic header, not the full quantized vector. This enables fast metadata scans (tag match, importance, valence) at minimal memory cost. For vector similarity, the text is re-embedded at query time when needed. - -**Creation**: Semantic memories are created either: - -1. **Directly** by the user (`MemoryType.SEMANTIC`) -2. **By consolidation** — the `ReflectDaemon` clusters similar episodic memories during "sleep" and promotes the cluster centroid to semantic memory - ---- - -## ⚙️ Procedural Memory (Basal Ganglia) - -**Biological Analog**: The **Basal Ganglia** stores learned motor programs and habitual behaviors — "how to ride a bicycle" type knowledge that operates below conscious awareness. 
- -| Property | Value | -|---|---| -| Storage | `Arena.ofShared()` linear segment | -| Capacity | Configurable (default: 500) | -| Eviction | None (append-only) | -| Persistence | Via WAL replay | -| Use case | "Always use exponential backoff", "Format SQL with uppercase keywords" | - -Procedural memories represent **rules and patterns** that the agent has internalized. They are typically higher-importance, persistent, and rarely forgotten. - ---- - -## TierRouter - -The `TierRouter` dispatches all operations to the appropriate store via an `EnumMap`: - -```java -public final class TierRouter implements AutoCloseable { - private final EnumMap stores = new EnumMap<>(MemoryType.class); - - // Polymorphic dispatch — zero switch statements - public long write(MemoryType type, CognitiveHeader header, byte[] quantized) { - return get(type).write(header, quantized); - } - - public MemorySegment segmentFor(MemoryType type) { - return get(type).primarySegment(); - } - - public static boolean shouldScan(MemoryType type, MemoryType[] targetTypes) { - if (targetTypes == null || targetTypes.length == 0) return true; - for (MemoryType t : targetTypes) if (t == type) return true; - return false; - } -} -``` - ---- - -## Next Steps - -- :material-sleep: [**Hippocampus — Sleep Consolidation**](hippocampus.md) — how episodic memories are consolidated into semantic knowledge -- :material-flash: [**Synapse — Tags & Scoring**](synapse.md) — the 32-byte header and Bloom filter -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — the SIMD hot-loop diff --git a/docs/docs/memory/dopamine.md b/docs/docs/memory/dopamine.md deleted file mode 100644 index bd461ed..0000000 --- a/docs/docs/memory/dopamine.md +++ /dev/null @@ -1,163 +0,0 @@ ---- -title: "Dopamine — Surprise Detection" -description: "How SurpriseDetector uses Welford online statistics to automatically score memory importance based on novelty." 
---- - -# ⚡ Dopamine — Surprise Detection - -> **Package**: `com.spectrayan.spector.memory.dopamine` -> -> **Biological Analog**: The **dopaminergic system** signals prediction error — the difference between what the brain expected and what actually happened. When a stimulus is surprising (high prediction error), dopamine release strengthens memory encoding. This is why we vividly remember surprising events (flashbulb memories) but quickly forget routine ones. - ---- - -## The Problem - -Without surprise detection, an AI agent treats all memories as equally important. A routine "code compiled successfully" gets the same importance as "production database corrupted." This leads to: - -- Important memories drowning in noise -- Critical errors being forgotten as quickly as routine events -- No adaptive importance — every memory starts at the same baseline - ---- - -## SurpriseDetector - -The `SurpriseDetector` maintains a running statistical model of "normal" memory vectors using **Welford's online algorithm** (numerically stable one-pass mean/variance). When a new memory arrives, its L2 distance from the running centroid is converted to a Z-score: - -```mermaid -graph LR - A["New Memory
L2 norm = 3.7"] --> B["Welford Stats
μ=2.1, σ=0.6"] - B --> C["Z-score
(3.7 - 2.1) / 0.6 = 2.67"] - C -->|"Z > 2.0"| D["⚡ Surprising!
importance = 0.85"] - C -->|"Z < 0.5"| E["😐 Normal
importance = 0.4"] - - style D fill:#e74c3c,color:white - style E fill:#95a5a6,color:white -``` - -### Dual Importance Formula - -```java -public float computeDualImportance(float distanceToNearest, long synapticTags, - float spatialWeight, float temporalWeight) { - // Spatial surprise: how far is this from the running centroid? - float zScore = welford.zScore(distanceToNearest); - float spatialSurprise = sigmoid(zScore); - - // Temporal surprise: how long since we saw this tag pattern? - Long lastSeen = lastSeenByTags.put(synapticTags, nowMs); - float temporalSurprise = lastSeen == null ? 1.0f - : Math.min(1.0f, (nowMs - lastSeen) / (float) DAY_MS); - - // Fused importance - return spatialWeight * spatialSurprise + temporalWeight * temporalSurprise; -} -``` - -Two dimensions of surprise: - -| Dimension | Signal | Weight | -|---|---|---| -| **Spatial surprise** | Z-score of L2 norm vs. running statistics | 0.6 (default) | -| **Temporal surprise** | Time since last memory with matching tags | 0.4 (default) | - ---- - -## WelfordStats — Online Statistics - -`WelfordStats` implements Welford's algorithm for numerically stable online mean and variance computation: - -```java -public final class WelfordStats { - private long count = 0; - private double mean = 0.0; - private double m2 = 0.0; // Sum of squared differences - - public synchronized void update(double value) { - count++; - double delta = value - mean; - mean += delta / count; - double delta2 = value - mean; - m2 += delta * delta2; - } - - public double variance() { - return count < 2 ? 0.0 : m2 / (count - 1); - } - - public float zScore(double value) { - double stdDev = Math.sqrt(variance()); - return stdDev < 1e-9 ? 0f : (float) ((value - mean) / stdDev); - } -} -``` - -!!! tip "Why Welford?" - Naive variance computation (`Σ(x-μ)²/n`) requires two passes or suffers from catastrophic cancellation with floating-point arithmetic. 
Welford's algorithm maintains numerical stability with a single pass — critical for an always-running system that processes millions of memories over its lifetime. - ---- - -## FlashbulbPolicy — Extreme Surprise - -**Biological analog**: **Flashbulb memories** are vivid, long-lasting memories formed during moments of extreme surprise or emotional intensity (e.g., hearing about a major world event). The amygdala signals the hippocampus to strengthen encoding. - -When the Z-score exceeds a threshold (default: 3.0), the `FlashbulbPolicy` kicks in: - -```java -public FlashbulbDecision evaluate(float zScore, float baseImportance) { - if (zScore >= flashbulbThreshold) { - return new FlashbulbDecision( - true, // isFlashbulb - 1.0f, // maxImportance - true // pinned (exempt from decay) - ); - } - return FlashbulbDecision.NORMAL; -} -``` - -**Effects**: - -- Importance is set to **1.0** (maximum) -- The **pinned flag** (bit 1 of flags byte) is set — this memory is exempt from temporal decay in Phase 4 of the scoring pipeline -- The memory will persist indefinitely unless explicitly `forget()`'d - -!!! example "Use Case" - An AI coding agent encounters `OutOfMemoryError` for the first time (Z-score: 4.2). This triggers flashbulb encoding — the error memory is pinned at maximum importance and will always surface when the agent encounters memory-related issues. 
- ---- - -## Integration with Ingestion Pipeline - -The surprise detection happens at **Step 3** of the ingestion pipeline: - -```java -// In CognitiveIngestionTarget.ingestCognitive() - -// Step 1: Embed -float[] vector = embeddingProvider.embed(text).vector(); -float l2Norm = VectorOps.l2Norm(vector); - -// Step 2: Encode tags -long synapticTags = SynapticTagEncoder.encode(tags); - -// Step 3: Surprise detection -float importance = surpriseDetector.computeDualImportance(l2Norm, synapticTags); - -// Step 4: Flashbulb check -FlashbulbDecision flashbulb = flashbulbPolicy.evaluate( - surpriseDetector.lastZScore(), importance); -if (flashbulb.isFlashbulb()) { - importance = 1.0f; - flags |= FLAG_PINNED; -} -``` - ---- - -## Next Steps - -- :material-emoticon: [**Amygdala — Emotional Valence**](amygdala.md) — emotional coloring of memories -- :material-flash: [**Synapse — Tags & Scoring**](synapse.md) — the 32-byte header -- :material-sleep: [**Hippocampus — Sleep Consolidation**](hippocampus.md) — what happens to important memories diff --git a/docs/docs/memory/focus-mode.md b/docs/docs/memory/focus-mode.md deleted file mode 100644 index dfcd8c0..0000000 --- a/docs/docs/memory/focus-mode.md +++ /dev/null @@ -1,137 +0,0 @@ -# Focus Mode - -Focus Mode is a specialized cognitive scoring strategy that simulates **sustained attention** — the ability to deeply concentrate on a single topic while filtering out irrelevant information. - -Two profiles use focus-oriented mechanics: **HYPERFOCUS** (strict retrieval tunnel) and **SYSTEMATIZER** (lossless knowledge accumulation). - ---- - -## HYPERFOCUS — Strict Retrieval Tunnel - -When an agent activates Focus Mode, three things change in the [6-Phase Scoring Pipeline](scoring-pipeline.md): - -### 1. Strict Tag Gate (Phase 2) - -Only memories whose synaptic tags **exactly match** the focus mask pass through. 
This is a bitwise AND check: - -``` -if (recordTags & hyperfocusMask) != hyperfocusMask → SKIP -``` - -Unlike normal tag filtering (which accepts partial overlap), Focus Mode requires **all** focus tags to be present. This creates a narrow retrieval tunnel — only deeply relevant memories survive. - -### 2. Zero Decay (Phase 4) - -For tag-matched memories, the time decay factor is clamped to **1.0**: - -``` -adjustedBucket = 0 // no time decay for focused memories -``` - -This means old memories about the focused topic are treated as if they were just created. A 6-month-old memory about "database deadlocks" is equally accessible as one from today — as long as the tags match. - -### 3. Post-Score Boost (Phase 6) - -After the standard cognitive score is computed, focus-matched memories receive a configurable multiplier: - -``` -finalScore = score × hyperfocusBoost // default: 1.5× -``` - -This ensures focused memories consistently outrank non-focused ones in the final result list. - -### Configuration - -```java -// Via profile preset -var results = memory.recall("database deadlock", CognitiveProfile.HYPERFOCUS); - -// Via explicit options -var options = RecallOptions.builder() - .profile(CognitiveProfile.HYPERFOCUS) - .hyperfocusMask("database", "deadlock") // Bloom filter encoded - .hyperfocusBoost(2.0f) // custom boost - .build(); -``` - -### TTL and Self-Extension - -Focus Mode is governed by `HyperfocusState`, a TTL-based state machine: - -```java -// Activate focus for 30 minutes (default) -memory.hyperfocusState().activateFromTags("database", "deadlock"); - -// Agent extends focus when topic continues -memory.hyperfocusState().extend(); // adds another 30 minutes -memory.hyperfocusState().extend(60_000L); // adds 1 minute - -// Check status -memory.hyperfocusState().isActive(); // true -memory.hyperfocusState().remainingMs(); // milliseconds remaining - -// Deactivate manually (or wait for TTL expiry) -memory.hyperfocusState().deactivate(); -``` - -!!! 
tip "Agent Self-Extension" - The `extend()` method is designed to be called by the agent itself. When the agent detects that the conversation is still focused on the same topic, it extends the TTL. When the topic naturally shifts, the TTL expires and Focus Mode deactivates automatically. - ---- - -## SYSTEMATIZER — Lossless Knowledge Accumulation {#systemizer} - -The Systematizer profile is designed for agents that need to build **comprehensive knowledge bases** — where losing detail during consolidation is unacceptable. - -### Scoring Weights - -| Parameter | Value | Rationale | -|:---|:---:|:---| -| α (similarity) | 0.3 | Low — details matter more than semantic similarity | -| β (importance) | 0.7 | High — prioritizes learned importance | - -### Persistent Memory Pinning - -The key feature of SYSTEMATIZER is **lossless consolidation**. During the [sleep consolidation cycle](hippocampus.md) (REM sleep), the system normally clusters similar episodic memories and promotes a summary to semantic memory. The source episodes may then be tombstoned. - -With SYSTEMATIZER, source episodes are **pinned** — they receive the `FLAG_PINNED` bit in their record header, which prevents tombstoning: - -``` -Episodic: [mem-1] [mem-2] [mem-3] → Cluster → Semantic summary created - ↓ - Standard: mem-1, mem-2, mem-3 tombstoned - Systematizer: mem-1, mem-2, mem-3 PINNED ✓ -``` - -### Quota Management - -To prevent unbounded memory growth, pinning is governed by a configurable quota: - -```java -var memory = SpectorMemory.builder() - .pinSourceEpisodes(true) // enable pinning - .pinnedQuota(10_000) // max pinned records (default) - .build(); -``` - -When the quota is reached, the oldest pinned records are eligible for tombstoning during the next consolidation cycle. 
- -### Use Cases - -- **Legal/compliance agents** that must retain all original evidence -- **Research agents** building encyclopedic knowledge bases -- **Audit trails** where summarization must not lose detail - ---- - -## Performance Impact - -!!! note "Zero Overhead When Disabled" - All Focus Mode mechanics are gated by `hyperfocusMask != 0` in the hot loop. When no focus is active (the default), the code paths are identical to standard scoring — zero additional cost. - -| Mechanic | Hot-Loop Cost | When Active | -|:---|:---|:---| -| Tag gate | ~2 cycles (bitwise AND) | Only when `hyperfocusMask != 0` | -| Decay clamp | ~1 cycle (conditional) | Only for tag-matched records | -| Boost multiply | ~1 cycle (float multiply) | Only for tag-matched records | -| Episode pinning | 0 (off hot loop) | During consolidation only | diff --git a/docs/docs/memory/getting-started.md b/docs/docs/memory/getting-started.md deleted file mode 100644 index fe66def..0000000 --- a/docs/docs/memory/getting-started.md +++ /dev/null @@ -1,227 +0,0 @@ ---- -title: Getting Started -description: "Set up Spector Memory in 5 minutes — from Maven dependency to your first remember/recall cycle." ---- - -# Getting Started - -Get cognitive memory running in your Java application in under 5 minutes. - ---- - -## Prerequisites - -| Requirement | Version | Notes | -|---|---|---| -| **JDK** | 25+ | OpenJDK with Vector API incubator | -| **Maven** | 3.9+ | Build tool | -| **Ollama** | Latest | For real embeddings (optional — mock provider works for testing) | - -## Maven Dependency - -```xml - - com.spectrayan - spector-memory - 0.1.0-SNAPSHOT - - - - - com.spectrayan - spector-embed-ollama - 0.1.0-SNAPSHOT - -``` - -## JVM Flags - -Spector Memory uses the Vector API (incubator) for SIMD acceleration: - -```bash -java --add-modules jdk.incubator.vector \ - --enable-native-access=ALL-UNNAMED \ - --enable-preview \ - -jar your-app.jar -``` - -!!! 
tip "Maven Surefire" - These flags are already configured in the parent `pom.xml`. Tests run out of the box with `mvn test`. - ---- - -## Minimal Example - -### With Mock Embeddings (No Ollama Required) - -```java -import com.spectrayan.spector.memory.*; -import com.spectrayan.spector.memory.cortex.MemorySource; - -// Create a mock embedding provider for testing -EmbeddingProvider mock = text -> { - float[] vec = new float[128]; - // Deterministic hash-based vector for reproducibility - var rng = new java.util.Random(text.hashCode()); - for (int i = 0; i < 128; i++) vec[i] = rng.nextFloat() - 0.5f; - return new EmbeddingResult(vec, text.split("\\s+").length, "mock"); -}; - -try (SpectorMemory memory = SpectorMemory.builder() - .dimensions(128) - .embeddingProvider(mock) - .build()) { - - // Remember - memory.remember("fact-1", - "The user prefers dark mode in all editors.", - MemoryType.EPISODIC, MemorySource.USER_STATED, - "preferences", "ui").get(); - - // Recall - List results = memory.recall("dark theme settings", - RecallOptions.builder().topK(5).build()); - - results.forEach(r -> - System.out.printf("%.4f [%s] %s%n", r.score(), r.memoryType(), r.text())); -} -``` - -### With Real Ollama Embeddings - -```java -import com.spectrayan.spector.embed.ollama.OllamaEmbeddingProvider; - -// Pull the model first: ollama pull qwen3-embedding -var embedder = OllamaEmbeddingProvider.create("qwen3-embedding"); - -try (SpectorMemory memory = SpectorMemory.builder() - .dimensions(embedder.dimensions()) // Auto-detect: 4096 for qwen3-embedding - .embeddingProvider(embedder) - .workingCapacity(100) - .episodicPartitionCapacity(10_000) - .semanticCapacity(5_000) - .proceduralCapacity(500) - .build()) { - - // Ingest diverse memories - memory.remember("err-db", - "Database connection pool exhausted — 50 active, 0 idle connections.", - MemoryType.EPISODIC, MemorySource.OBSERVED, - "error", "database").get(); - - memory.remember("rule-retry", - "Always implement exponential 
backoff for database retries.", - MemoryType.PROCEDURAL, MemorySource.PROCEDURAL, - "database", "retry").get(); - - // Semantic recall with synaptic tag filtering - List results = memory.recall("database connection error", - RecallOptions.builder() - .topK(5) - .synapticFilter("database") // Only memories tagged "database" - .minImportance(0.2f) // Skip trivial memories - .build()); -} -``` - ---- - -## Core Operations - -### Remember (Ingestion) - -```java -// Async — returns CompletableFuture -CompletableFuture future = memory.remember( - "unique-id", // Unique memory identifier - "The text content to remember", // Raw text (will be auto-embedded) - MemoryType.EPISODIC, // Cognitive tier - MemorySource.USER_STATED, // Provenance - "tag1", "tag2", "tag3" // Synaptic tags (Bloom filter encoded) -); -future.get(); // Block if needed -``` - -### Recall (Retrieval) - -```java -List results = memory.recall("query text", - RecallOptions.builder() - .topK(10) // Max results - .synapticFilter("java", "debugging") // Bloom filter pre-screen - .minImportance(0.3f) // Importance threshold - .memoryTypes(MemoryType.EPISODIC, // Tier filter - MemoryType.SEMANTIC) - .minValence((byte) -50) // Emotional range - .maxValence((byte) 50) - .alpha(0.6f) // Similarity weight - .beta(0.4f) // Importance × decay weight - .build()); -``` - -### Forget & Suppress - -```java -// Permanent: tombstone the memory (excluded from all future scans) -memory.forget("memory-id"); - -// Temporary: suppress from recall (can be un-suppressed later) -memory.suppress("memory-id", "Not relevant to current task"); -``` - -### Introspect - -```java -// Memory health statistics -int total = memory.totalMemories(); -var stats = memory.introspect(); -``` - ---- - -## Claude Desktop / MCP Integration - -Add cognitive memory to your AI agent via the built-in MCP server. 
Enable memory in your `spector.yml`: - -```yaml -spector: - engine: - dimensions: 4096 - embedding: - model: qwen3-embedding - base-url: http://localhost:11434 - memory: - enabled: true - persistence-path: .spector/memory -``` - -Then configure your agent: - -```json -{ - "mcpServers": { - "spector": { - "command": "java", - "args": [ - "--add-modules", "jdk.incubator.vector", - "--enable-native-access=ALL-UNNAMED", - "--enable-preview", - "-jar", "/path/to/spector.jar", - "--config", "/path/to/spector.yml" - ] - } - } -} -``` - -With `memory.enabled: true`, the MCP server registers all 13 tools (6 search + 7 cognitive memory). - ---- - -## Next Steps - -- :material-brain: [**System Architecture**](architecture.md) — understand the full package hierarchy -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — how recall actually works under the hood -- :material-head-cog: [**Biological Systems**](cortex.md) — explore each brain region -- :material-speedometer: [**Performance**](performance.md) — benchmarks and optimization techniques diff --git a/docs/docs/memory/habituation.md b/docs/docs/memory/habituation.md deleted file mode 100644 index e729136..0000000 --- a/docs/docs/memory/habituation.md +++ /dev/null @@ -1,127 +0,0 @@ ---- -title: "Habituation — Anti-Filter Bubble" -description: "How HabituationPenalty prevents repetitive recall by attenuating scores for frequently-returned memories." ---- - -# 😴 Habituation — Anti-Filter Bubble - -> **Package**: `com.spectrayan.spector.memory.habituation` -> -> **Biological Analog**: **Habituation** is the simplest form of learning — a decrease in response to a stimulus after repeated presentations. You stop hearing the ticking clock after a few minutes. The brain allocates attention to *novel* stimuli, not repeated ones. This prevents sensory overload and enables adaptation. 
- ---- - -## The Problem - -Without habituation, an AI agent repeatedly recalls the same "most relevant" memories — creating a **filter bubble**. If memory A has the highest similarity score, it dominates every recall, crowding out potentially useful but slightly-less-similar memories. - -``` -Query 1: "database issues" → [A, B, C, D, E] ← A dominates -Query 2: "database issues" → [A, B, C, D, E] ← Same results! -Query 3: "database issues" → [A, B, C, D, E] ← Filter bubble -``` - -### With Habituation - -``` -Query 1: "database issues" → [A, B, C, D, E] ← Fresh results -Query 2: "database issues" → [B, C, A, D, F] ← A drops, F emerges -Query 3: "database issues" → [C, F, B, G, D] ← New memories surface -``` - ---- - -## HabituationPenalty - -The `HabituationPenalty` tracks recall frequency per memory ID and computes a decay penalty: - -```java -public final class HabituationPenalty { - - private final ConcurrentHashMap recallCounts - = new ConcurrentHashMap<>(); - private final float decayRate; // default: 0.85 - - /** - * Records a recall event and returns the habituation penalty. - * First recall: 1.0 (no penalty). Each subsequent recall multiplies - * the penalty by decayRate (default: 0.85). - * - * @param memoryId the memory being recalled - * @return penalty multiplier [0.0 – 1.0] - */ - public float recordAndComputePenalty(String memoryId) { - int count = recallCounts.merge(memoryId, 1, Integer::sum); - if (count <= 1) return 1.0f; // First recall — no penalty - return (float) Math.pow(decayRate, count - 1); - } - - /** - * Batch penalty computation for multiple IDs (P7 optimization). 
- */ - public float[] batchPenalty(String[] ids) { - float[] penalties = new float[ids.length]; - for (int i = 0; i < ids.length; i++) { - penalties[i] = recordAndComputePenalty(ids[i]); - } - return penalties; - } -} -``` - -### Penalty Curve - -| Recall # | Penalty (rate=0.85) | Effect | -|---|---|---| -| 1st | 1.00 | Full score | -| 2nd | 0.85 | 15% reduction | -| 3rd | 0.72 | 28% reduction | -| 5th | 0.52 | Half score | -| 10th | 0.23 | 77% reduction | -| 20th | 0.05 | Nearly eliminated | - -!!! info "Decay Rate Configuration" - The default decay rate of 0.85 provides a balance between novelty and relevance. A higher rate (0.95) creates a gentler penalty — useful when the agent genuinely needs to recall the same memory frequently. A lower rate (0.70) aggressively surfaces new content. - ---- - -## Integration with RecallPipeline - -Habituation is applied at **Step 5** of the recall pipeline — after scoring but before final ranking: - -```java -// In RecallPipeline.recall() - -// Step 5: Apply habituation penalty (anti-filter-bubble) -for (int i = 0; i < allResults.size(); i++) { - CognitiveResult r = allResults.get(i); - float habPenalty = habituationPenalty.recordAndComputePenalty(r.id()); - if (habPenalty < 1.0f) { - allResults.set(i, new CognitiveResult( - r.id(), r.text(), r.score() * habPenalty, - r.importance(), r.ageDays(), - r.recallCount(), r.valence(), r.memoryType(), r.source(), - r.synapticTags(), r.decayFactor(), r.ltpAdjustedDecay())); - } -} -``` - -**Key**: The penalty multiplies the `score()` field — it doesn't modify the underlying memory. Habituation is a recall-time effect, not a storage-time effect. - ---- - -## Interaction with Other Systems - -| System | Interaction | -|---|---| -| **Reconsolidation** | Habituation reduces recall score, but reconsolidation *increases* the memory's durability. A frequently-recalled memory resists temporal decay (fewer buckets) but gets a lower score on repeated queries. 
| -| **Surprise Detection** | New, surprising memories start with high importance and no habituation penalty — they naturally dominate initial queries. | -| **Suppression** | If a memory is fully suppressed, habituation is irrelevant — it's excluded at Step 4 before habituation is applied. | - ---- - -## Next Steps - -- :material-cancel: [**Inhibition — Suppression**](inhibition.md) — explicit memory blocking -- :material-link: [**Hebbian — Association Learning**](hebbian.md) — how co-activation creates associations -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — the full recall pipeline diff --git a/docs/docs/memory/hebbian.md b/docs/docs/memory/hebbian.md deleted file mode 100644 index 39ee6cd..0000000 --- a/docs/docs/memory/hebbian.md +++ /dev/null @@ -1,315 +0,0 @@ ---- -title: "3-Layer Cognitive Graph" -description: "HebbianGraph, TemporalChain, and EntityGraph — three biologically-inspired off-heap graph structures that augment vector recall with associative, temporal, and relational signals." ---- - -# 🧠 3-Layer Cognitive Graph - -> **Packages**: `com.spectrayan.spector.memory.hebbian`, `.temporal`, `.graph` -> -> **Biological Analog**: The brain doesn't retrieve memories by content similarity alone. It uses **associative networks** (neurons that fire together wire together), **temporal sequences** (what happened next?), and **semantic knowledge** (who manages what project?). Spector Memory implements all three as off-heap graph structures that augment vector recall. - ---- - -## Architecture Overview - -```mermaid -graph TB - subgraph "RecallPipeline" - RP["Vector Search → 6-Phase Scoring → Top-K Seed Set"] - end - - RP --> S5c["Step 5c: Hebbian
Spreading Activation"] - RP --> S5d["Step 5d: Temporal
Chain Extension"] - RP --> S5e["Step 5e: Entity
Graph Traversal"] - - S5c --> M["Merge & Dedup → Re-sort → Final Top-K"] - S5d --> M - S5e --> M - - subgraph "Layer 1 — Hebbian Association" - HG["HebbianGraph
164B/node, off-heap"] - CAT["CoActivationTracker
OffHeapPairTable + OffHeapEdgeTable"] - end - - subgraph "Layer 2 — Entity-Relationship" - EG["EntityGraph
64B/entity, 12B/edge"] - EX["EntityExtractor SPI
LLM / NoOp / Custom"] - end - - subgraph "Layer 3 — Temporal Causal" - TC["TemporalChain
16B/node, linked list"] - end - - S5c --> HG - S5c --> CAT - S5d --> TC - S5e --> EG - - style RP fill:#4a90d9,color:white - style M fill:#00b894,color:white - style HG fill:#e74c3c,color:white - style EG fill:#9b59b6,color:white - style TC fill:#f39c12,color:white -``` - -!!! tip "Graceful Degradation" - Each graph step is **additive** — it can only ADD candidates to the result set, never remove. If a graph is null, empty, or throws an exception, the step is a no-op. Zero risk of regression. - ---- - -## Layer 1: Hebbian Association Graph - -> *"Neurons that fire together, wire together."* — Donald Hebb, 1949 - -### HebbianGraph — Memory-Level Associations - -The `HebbianGraph` stores explicit **memory-to-memory edges** with association weights in an off-heap adjacency list. - -```mermaid -graph LR - A["Memory #42
'database error'"] ---|"weight: 0.83
co-ingested 5×"| B["Memory #87
'connection pool'"] - A ---|"weight: 0.47
co-ingested 2×"| C["Memory #103
'retry strategy'"] - B ---|"weight: 0.63
co-ingested 3×"| C - - style A fill:#e74c3c,color:white - style B fill:#3498db,color:white - style C fill:#2ecc71,color:white -``` - -**Off-heap layout** (164 bytes per node): - -``` -┌──────────┬──────────────────────────────────────────────┐ -│ degree │ edges[0..19]: (neighborIdx:4B, weight:4B) │ -│ (4B) │ = 20 × 8B = 160B │ -└──────────┴──────────────────────────────────────────────┘ -``` - -**Key properties:** - -| Property | Value | -|---|---| -| Storage | Off-heap `MemorySegment` via Panama | -| Max degree | 20 neighbors per memory | -| Edge weight | Float — strengthened on co-ingestion | -| Eviction | Weakest edge evicted when degree exceeds MAX_DEGREE | -| Decay | 0.9 multiplicative factor per consolidation cycle | -| Spreading activation | BFS with depth=2, attenuated by edge weight | -| Persistence | `HGPH` magic header, chunked 64KB FileChannel I/O | - -**Pipeline integration:** - -- **Ingestion (Step 9b):** When memories are co-ingested within the same session, `HebbianGraph.strengthen(currentIdx, previousIdx, 1.0f)` strengthens the bidirectional edge. -- **Recall (Step 5c):** After the 6-phase scorer produces a seed set, `HebbianGraph.activateNeighbors(seedIdx, depth=2)` discovers associated memories. These are added to the result set with a 0.3× score attenuation. - -### CoActivationTracker — Tag-Level Associations - -The `CoActivationTracker` tracks **tag co-occurrence patterns** using two off-heap hash tables: - -#### OffHeapPairTable — Undirected Co-Activation Counts - -Tracks how often two tags appear together in ingested memories. - -``` -Slot layout (32 bytes): -┌───────────┬───────────┬──────────┬───────┐ -│ keyHashA │ keyHashB │ count │ flags │ -│ 8 bytes │ 8 bytes │ 4 bytes │ ... 
│ -└───────────┴───────────┴──────────┴───────┘ -``` - -- Open-addressing hash table with linear probing -- FNV-1a 64-bit hashing for tag strings -- ~50% load factor for fast lookups - -#### OffHeapEdgeTable — Directed STDP Edges - -Tracks causal/predictive relationships between tags (Spike-Timing Dependent Plasticity): - -``` -Slot layout (40 bytes): -┌────────────┬────────────┬────────┬──────────┬───────────┬───────┐ -│ sourceHash │ targetHash │ weight │ lastMs │ actCount │ flags │ -│ 8 bytes │ 8 bytes │ 4 bytes│ 8 bytes │ 4 bytes │ ... │ -└────────────┴────────────┴────────┴──────────┴───────────┴───────┘ -``` - -- Weight clamped to `[0.0, 1.0]` -- Temporal metadata for STDP learning rules -- Persistence via `COAX` magic header with hash→tag reverse map - -!!! info "STDP — Spike-Timing Dependent Plasticity" - If tag A is consistently recalled *before* tag B, the directed edge A→B is strengthened. This creates predictive associations: "when I think of A, I should also think of B." The `HebbianCoActivationListener` runs after each recall on a Virtual Thread, updating STDP weights with zero impact on recall latency. - ---- - -## Layer 2: Entity-Relationship Graph - -> *"What was the budget of the project managed by the person who met with me yesterday?"* - -The `EntityGraph` stores **typed entities** (PERSON, PROJECT, ORG, ...) and **typed relations** (MANAGES, AUTHORED, PART_OF, ...) extracted from ingested text. This enables **multi-hop knowledge traversal** that pure vector similarity cannot achieve. 
- -### Entity Extraction - -Entities are extracted at ingestion time via the `EntityExtractor` SPI: - -| Mode | Implementation | Description | -|---|---|---| -| `NONE` (default) | `NoOpEntityExtractor` | No extraction — graph features disabled | -| `LLM` | `LlmEntityExtractor` | Uses `TextGenerationProvider` with a structured prompt | -| `CUSTOM` | User-provided | Any custom `EntityExtractor` implementation | - -```java -// Enable LLM entity extraction via Builder -SpectorMemory.builder() - .entityExtractionMode(EntityExtractionMode.LLM) - .textGenerationProvider(provider) - .build(); -``` - -### Type System - -**22 Entity Types:** -`PERSON`, `ORGANIZATION`, `PROJECT`, `CONCEPT`, `EVENT`, `LOCATION`, `TOOL`, `SKILL`, `DOCUMENT`, `API`, `DATABASE`, `FRAMEWORK`, `PROTOCOL`, `METRIC`, `ROLE`, `TEAM`, `PRODUCT`, `SERVICE`, `WORKFLOW`, `DECISION`, `RISK`, `OTHER` - -**21 Relation Types:** -`MANAGES`, `AUTHORED`, `ATTENDED`, `PART_OF`, `RELATED_TO`, `CAUSES`, `DEPENDS_ON`, `USES`, `CREATED`, `MENTIONS`, `MEMBER_OF`, `ASSIGNED_TO`, `REPORTED_BY`, `BLOCKED_BY`, `IMPLEMENTS`, `EXTENDS`, `TESTED_BY`, `DEPLOYED_TO`, `MONITORS`, `TRIGGERS`, `OTHER` - -### Off-Heap Layout - -**Entity Node (64 bytes, 8-byte aligned):** -``` -[type:1B][pad:7B][nameHash:8B][memRef0:4B][memRef1:4B][memRef2:4B][memRef3:4B] -[refCount:4B][degree:4B][edgeStart:4B][pad:20B] -``` - -**Entity Edge (12 bytes):** -``` -[targetId:4B][relationType:4B][weight:4B] -``` - -**Traversal:** BFS with typed edge filtering, max 32 edges per entity, max 4 memory references per entity. 
- -**Pipeline integration:** - -- **Ingestion (Step 9d):** Extract entities from text → `entityGraph.addEntity(name, type)` → `entityGraph.linkEntityToMemory(eid, memoryIdx)` → `entityGraph.addRelation(fromEid, toEid, relationType)` -- **Recall (Step 5e):** Extract entities from query → find in graph by name → 2-hop BFS → collect `memoriesForEntity(eid)` → add to result set with 0.25× attenuation per hop -- **Persistence:** `ENTG` magic header with on-heap nameIndex reconstruction on load - ---- - -## Layer 3: Temporal Causal Chain - -> *"What happened after the deployment failed?"* - -The `TemporalChain` links memories ingested within the same session into a **doubly-linked list**, enabling temporal navigation. - -```mermaid -graph LR - M1["Memory #12
'deploy started'"] --> M2["Memory #13
'tests passed'"] - M2 --> M3["Memory #14
'deploy failed'"] - M3 --> M4["Memory #15
'rollback initiated'"] - - style M1 fill:#3498db,color:white - style M2 fill:#2ecc71,color:white - style M3 fill:#e74c3c,color:white - style M4 fill:#f39c12,color:white -``` - -### Off-Heap Layout (16 bytes per node) - -``` -┌──────────┬──────────┬───────────┬──────────┐ -│ prevIdx │ nextIdx │ sessionId │ pad │ -│ 4 bytes │ 4 bytes │ 4 bytes │ 4 bytes │ -└──────────┴──────────┴───────────┴──────────┘ -``` - -`-1` is used as sentinel for "no link" (beginning or end of chain). - -**API:** - -| Method | Description | -|---|---| -| `link(currentIdx, prevIdx, sessionId)` | Links two memories within a session | -| `followForward(startIdx, maxHops)` | "What happened next?" → `List` | -| `followBackward(startIdx, maxHops)` | "What happened before?" → `List` | -| `save(Path)` / `load(Path)` | Persistence with `TPCH` magic header | - -**Pipeline integration:** - -- **Ingestion (Step 9c):** When a new memory is ingested within the same session, `temporalChain.link(currentIdx, lastIngestedIdx, sessionId)` creates the bidirectional link. -- **Recall (Step 5d):** For each seed result, `followForward(idx, 3)` and `followBackward(idx, 3)` discover temporally adjacent memories. Forward links get 0.8× score, backward links get 0.7×. - ---- - -## Persistence - -All graph components persist alongside episodic partitions in DISK mode: - -| Component | File | Magic | Format | -|---|---|---|---| -| HebbianGraph | `hebbian.graph` | `HGPH` | 16B header + raw segment bytes | -| CoActivationTracker | `coactivation.dat` | `COAX` | 16B header + pair table + edge table + hash→tag map | -| EntityGraph | `entity.graph` | `ENTG` | 16B header + entity segment + edge segment + name index | -| TemporalChain | `temporal.chain` | `TPCH` | 16B header + raw segment bytes | - -All use chunked 64KB FileChannel I/O to avoid `ByteBuffer` overflow on large segments. 
- ---- - -## Error Framework - -Graph operations use granular exceptions from the `SpectorGraphException` hierarchy: - -``` -SpectorMemoryException (SPE-310-xxx) - └── SpectorGraphException (base) - ├── SpectorHebbianException (SPE-310-006) - ├── SpectorTemporalChainException (SPE-310-007) - ├── SpectorEntityGraphException (SPE-310-008) - ├── SpectorCoActivationException (SPE-310-009) - ├── SpectorGraphPersistenceException(SPE-310-010) - └── SpectorGraphDecayException (SPE-310-011) -``` - -All pipeline catch sites use `catch(RuntimeException)` → create granular exception → `log.warn(ex.getMessage())`. No generic catches, no swallowed exceptions. - ---- - -## Memory Budget - -| Layer | Per-Node | At 100K memories | At 1M memories | -|---|---|---|---| -| Hebbian (L1) | 164B | 16.4 MB | 164 MB | -| CoActivation | ~1MB total | ~1 MB | ~1 MB | -| Entity (L2) | ~64B + edges | ~8 MB | ~80 MB | -| Temporal (L3) | 16B | 1.6 MB | 16 MB | -| **Total** | | **~27 MB** | **~261 MB** | - -This is small compared to the vector store (100K × 768-dim × 1B quantized = 75 MB). - ---- - -## Why This Matters for AI Agents - -Traditional vector search treats each query independently. The 3-layer graph creates **emergent intelligence**: - -!!! example "Scenario: Multi-Signal Recall" - 1. Agent queries "why is the app slow?" - 2. **Vector search** → finds memory about "application latency" - 3. **Hebbian (Layer 1)** → that memory was co-ingested with "connection pool settings" → adds it to results - 4. **Temporal (Layer 3)** → follows the chain: connection pool → timeout config → retry backoff → adds all three - 5. **Entity (Layer 2)** → "connection pool" mentions entity "DatabaseService" → traverses DEPENDS_ON edge → finds "Redis cache config" → adds it - - The final result set contains memories that no single retrieval signal could have found alone. 
- ---- - -## Next Steps - -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — the SIMD hot-loop that produces the seed set -- :material-sleep: [**Habituation — Anti-Filter Bubble**](habituation.md) — preventing repetitive recall -- :material-head-cog: [**Dopamine — Surprise Detection**](dopamine.md) — auto-importance scoring -- :material-brain: [**Architecture**](architecture.md) — how graphs fit in the full pipeline diff --git a/docs/docs/memory/hippocampus.md b/docs/docs/memory/hippocampus.md deleted file mode 100644 index 1bf87c1..0000000 --- a/docs/docs/memory/hippocampus.md +++ /dev/null @@ -1,160 +0,0 @@ ---- -title: "Hippocampus — Sleep Consolidation" -description: "How ReflectDaemon consolidates episodic memories into semantic knowledge during 'sleep' — K-Means clustering, tombstone compaction, and partition rebuild." ---- - -# 🛏️ Hippocampus — Sleep Consolidation - -> **Package**: `com.spectrayan.spector.memory.hippocampus` -> -> **Biological Analog**: During sleep, the **hippocampus replays** episodic memory traces to the neocortex, gradually transferring knowledge from episode-specific to generalized semantic form. This is called **systems consolidation**. Simultaneously, **synaptic pruning** weakens unused connections — the brain's garbage collector. - ---- - -## The Two Mechanisms - -### 1. 
ReflectDaemon — Sleep Consolidation - -The `ReflectDaemon` performs K-Means clustering on episodic memories to extract semantic knowledge: - -```mermaid -sequenceDiagram - participant RD as ReflectDaemon - participant EP as EpisodicMemoryStore - participant CS as CognitiveScorer - participant SE as SemanticMemoryStore - participant MI as MemoryIndex - - Note over RD: Circadian trigger (configurable interval) - RD->>EP: partitions().filter(SEALED) - - loop Each sealed partition - RD->>EP: Read all records from partition - Note over RD: K-Means clustering on header features - RD->>RD: Cluster by (synapticTags AND, importance AVG) - - loop Each cluster (size ≥ threshold) - Note over RD: Compute centroid header - RD->>RD: Merge synaptic tags (AND across cluster) - RD->>RD: Average importance, max valence - RD->>SE: Write consolidated semantic record - RD->>MI: Register new semantic memory - end - - RD->>EP: Mark partition as REFLECTABLE - end -``` - -**Key behaviors**: - -- **Tag merging**: Uses bitwise AND across the cluster — only common tags survive, representing the shared theme -- **Importance averaging**: The consolidated memory inherits the mean importance of its source episodes -- **Minimum cluster size**: Small clusters (noise) are not promoted — only patterns are - -!!! example "Example: Consolidation in Action" - An agent encounters 15 episodic memories tagged `[database, connection, error]` over a week. The ReflectDaemon clusters them and promotes a single semantic memory: *"Database connection issues are recurring — check connection pool sizing and timeout settings."* - ---- - -### 2. TombstoneCompactor — Synaptic Pruning - -When memories are `forget()`'d, they are tombstoned (bit 0 of flags byte set to 1). The scorer skips them in Phase 1 (~1 cycle). But tombstoned records still consume disk space. 
- -When the tombstone ratio in a partition exceeds a threshold (default: 30%), the `TombstoneCompactor` triggers a **partition rebuild**: - -```mermaid -graph LR - A["Old Partition
1000 records
400 tombstoned
(40% ratio)"] -->|TombstoneCompactor| B["New Partition
600 records
0 tombstoned
(dense)"] - A -->|"atomicSwap()"| C["Closed & Deleted"] - - style A fill:#e74c3c,color:white - style B fill:#2ecc71,color:white - style C fill:#95a5a6,color:white -``` - -**The rebuild process**: - -1. Allocate a new partition file -2. Sequentially copy only live (non-tombstoned) records -3. Atomically swap the new partition into the `ConcurrentMap` -4. Close and delete the old partition - -```java -// Atomic swap — readers see either the old or new partition, never a torn state -public boolean replacePartition(String key, - EpisodicPartition oldPartition, EpisodicPartition newPartition) { - boolean replaced = partitions.replace(key, oldPartition, newPartition); - if (replaced) { - oldPartition.close(); - } - return replaced; -} -``` - -!!! warning "Concurrent Safety" - The swap uses `ConcurrentMap.replace(key, old, new)` — a CAS (compare-and-swap) operation. Readers that are mid-scan on the old partition will complete safely because the old `MemorySegment` remains valid until `close()`. New scans will use the compacted partition. - ---- - -## Circadian Trigger - -The ReflectDaemon runs on a configurable schedule. During ingestion, the `CognitiveIngestionTarget` checks if it's time for a consolidation cycle: - -```java -// In CognitiveIngestionTarget — after each write -private void checkCircadianTrigger() { - long now = System.currentTimeMillis(); - if (now - lastReflectMs > reflectIntervalMs) { - lastReflectMs = now; - reflectDaemon.reflect(); - } -} -``` - -The default interval is 24 hours — matching the biological circadian cycle. For testing, it can be set to any duration. 
- ---- - -## Partition State Machine - -```mermaid -stateDiagram-v2 - [*] --> ACTIVE: New day → create partition - ACTIVE --> SEALED: Day rolls over - SEALED --> REFLECTABLE: ReflectDaemon processes - REFLECTABLE --> TOMBSTONED: tombstoneRatio > 30% - TOMBSTONED --> COMPACTED: TombstoneCompactor rebuilds - - ACTIVE --> TOMBSTONED: High forget rate during active day - - note right of ACTIVE: Accepting writes - note right of SEALED: Read-only, awaiting consolidation - note right of REFLECTABLE: Consolidation complete, eligible for pruning - note right of TOMBSTONED: Queued for compaction - note right of COMPACTED: Rebuilt as dense partition -``` - ---- - -## ReflectReport - -Each consolidation cycle produces a `ReflectReport` summarizing what happened: - -```java -public record ReflectReport( - int partitionsProcessed, - int memoriesConsolidated, - int semanticMemoriesCreated, - long durationMs -) {} -``` - -This can be logged, monitored, or exposed via the `MemoryIntrospector` for observability. - ---- - -## Next Steps - -- :material-brain: [**Cortex — Tier Stores**](cortex.md) — the 4-tier architecture -- :material-flash: [**Synapse — Tags & Scoring**](synapse.md) — the 32-byte header -- :material-head-cog: [**Dopamine — Surprise Detection**](dopamine.md) — auto-importance scoring diff --git a/docs/docs/memory/importance-fusion.md b/docs/docs/memory/importance-fusion.md deleted file mode 100644 index 6c832a6..0000000 --- a/docs/docs/memory/importance-fusion.md +++ /dev/null @@ -1,178 +0,0 @@ -# Importance Fusion (ICNU) - -The **ICNU Importance Fusion** system computes a memory's importance score at ingestion time by blending four signals: **Interest**, **Challenge**, **Novelty**, and **Urgency**. - ---- - -## The Problem - -Without ICNU, importance is determined solely by the [Surprise Detector](dopamine.md) — a statistical outlier test based on how "surprising" a memory's embedding is relative to recent memories. 
This works well for detecting unusual information, but has blind spots: - -- A memory about a user's **urgent deadline** might not be statistically surprising -- A memory about a **challenging technical problem** might have a common embedding -- A memory that the agent finds **interesting** has no way to signal that interest - -ICNU adds three LLM-provided signals alongside the existing novelty signal to produce a richer importance score. - ---- - -## The Formula - -$$ -\text{importance} = 0.05 + \left(\sum_{i \in \{I,C,N,U\}} w_i \cdot x_i\right) \times 9.95 -$$ - -Where: - -| Signal | Symbol | Range | Source | -|:---|:---:|:---:|:---| -| Interest | $x_I$ | [0, 1] | LLM-provided hint | -| Challenge | $x_C$ | [0, 1] | LLM-provided hint | -| Novelty | $x_N$ | [0, 1] | Computed from working memory scan | -| Urgency | $x_U$ | [0, 1] | LLM-provided hint | - -The weights $w_i$ are configurable and auto-normalize to sum=1.0: - -| Weight | Default | Rationale | -|:---|:---:|:---| -| $w_I$ (interest) | 0.30 | Agent engagement is a strong signal | -| $w_C$ (challenge) | 0.10 | Complexity is less important than novelty | -| $w_N$ (novelty) | 0.40 | Novelty is the strongest predictor of future usefulness | -| $w_U$ (urgency) | 0.20 | Time-sensitive information needs priority | - -### Output Range - -The formula maps to importance ∈ **[0.05, 10.0]**: - -- **0.05** — All signals zero (routine, uninteresting, familiar, non-urgent) -- **10.0** — All signals maximal (interesting, challenging, novel, urgent) - ---- - -## Novelty Computation - -### How It Works - -Novelty is computed using the **nearest-neighbor distance** in working memory — the minimum L2 distance between the incoming embedding and all existing working memory slots: - -```java -float nearestDist = workingStore.nearestDistance(quantizedVector, mins, scales); -``` - -`nearestDistance()` performs a SIMD-accelerated scan of all working memory slots (~0.5ms for 100 slots × 768 dims) and returns the minimum L2 distance. 
A high distance means the memory is genuinely novel — it's far from everything the agent has seen recently. - -### Normalization - -The raw distance is normalized to [0, 1] via: - -$$ -\text{noveltyNorm} = \min\left(\frac{d_{\text{nearest}}}{2.0}, 1.0\right) -$$ - -Where 2.0 is a configurable threshold representing "maximally novel." - ---- - -## IngestionHints - -The LLM provides hints via the `IngestionHints` record: - -```java -// At ingestion time -var hints = new IngestionHints( - 0.8f, // interest: agent finds this very interesting - 0.3f, // challenge: moderate complexity - 0.9f // urgency: high time sensitivity -); - -// Novelty is computed automatically from working memory -cognitiveTarget.ingestCognitive(id, text, type, tags, source, hints); -``` - -### Safety Features - -- **Clamping**: All values are clamped to [0.0, 1.0] on construction -- **Fallback**: `IngestionHints.NONE` triggers novelty-only mode (backward compatible) -- **Gaming detection**: If all hints are maximal (I=1.0, C=1.0, U=1.0), a WARN is logged - -### NONE Fallback - -When no hints are provided (`IngestionHints.NONE`), the system falls back to `IcnuWeights.NOVELTY_ONLY` — importance is determined solely by nearest-neighbor distance, matching the pre-ICNU behavior. 
- ---- - -## Configuration - -### Fusion Weights - -```java -var memory = SpectorMemory.builder() - .icnuWeights(new IcnuWeights(0.4f, 0.1f, 0.3f, 0.2f)) // custom weights - .build(); -``` - -### Built-in Weight Presets - -| Preset | I | C | N | U | Use Case | -|:---|:---:|:---:|:---:|:---:|:---| -| `DEFAULT` | 0.30 | 0.10 | 0.40 | 0.20 | General-purpose | -| `NOVELTY_ONLY` | 0.00 | 0.00 | 1.00 | 0.00 | Backward-compatible | - -### Weight Auto-Normalization - -Weights are automatically normalized on construction: - -```java -var w = new IcnuWeights(1f, 1f, 1f, 1f); -// → interest=0.25, challenge=0.25, novelty=0.25, urgency=0.25 -``` - ---- - -## Worked Example - -Agent ingests: *"User has a production outage — database connections exhausted"* - -| Signal | Value | Source | -|:---|:---:|:---| -| Interest | 0.7 | LLM hint — agent finds this relevant | -| Challenge | 0.5 | LLM hint — moderate complexity | -| Novelty | 0.9 | Working memory scan — nothing like this recently | -| Urgency | 1.0 | LLM hint — production outage | - -With default weights: - -$$ -\text{weighted} = 0.30 \times 0.7 + 0.10 \times 0.5 + 0.40 \times 0.9 + 0.20 \times 1.0 = 0.82 -$$ - -$$ -\text{importance} = 0.05 + 0.82 \times 9.95 \approx \mathbf{8.21} -$$ - -This is a high-importance memory (8.21 / 10.0) — it will be prioritized in future recalls and resist time decay. - ---- - -## MCP Integration - -When using the MCP tools, importance fusion happens automatically if the ingestion tool provides hints: - -```json -{ - "name": "core_memory_append", - "arguments": { - "id": "outage-2024-01", - "text": "Production database connections exhausted at 2AM", - "tags": "production,database,outage", - "hints": { - "interest": 0.7, - "challenge": 0.5, - "urgency": 1.0 - } - } -} -``` - -!!! note "Backward Compatibility" - The `hints` field is optional. When omitted, importance is computed using novelty-only mode — identical to the pre-ICNU behavior. 
diff --git a/docs/docs/memory/index.md b/docs/docs/memory/index.md deleted file mode 100644 index 4da4ee4..0000000 --- a/docs/docs/memory/index.md +++ /dev/null @@ -1,193 +0,0 @@ ---- -title: "🧠 Cognitive Memory" -description: "The biologically-inspired memory engine that gives AI agents the ability to remember, forget, consolidate, and associate — at microsecond latency." ---- - -# 🧠 Cognitive Memory - -!!! quote "The Vision" - Legacy AI frameworks bolt memory onto flat vector databases. Spector Memory is designed from the ground up as a **cognitive memory engine** — a biologically-inspired system where memories have importance, emotions, temporal decay, and contextual tags. It's the difference between a filing cabinet and a brain. - ---- - -## What Makes This Different - -Every AI memory solution today — Mem0, Letta (MemGPT), Zep — wraps a Python layer around Postgres/pgvector or ChromaDB. They suffer from: - -- **Network latency**: 50-200ms per query (HTTP → Postgres → HTTP) -- **Python GIL**: Sequential embedding + scoring under a global lock -- **Post-filtering trap**: Retrieve top-K by similarity, *then* filter by importance/time — losing critical memories that are old but vital - -Spector Memory collapses the entire cognitive stack onto a **zero-GC, off-heap Panama memory store** with SIMD-accelerated scoring. The result: - -| Metric | Python Memory Layer | **Spector Memory** | -|---|---|---| -| Query latency (1M memories) | 50-200ms | **0.13ms** † | -| GC pauses | Unpredictable | **≤0.01%** (100% off-heap) † | -| Scoring pipeline | Post-filter (lossy) | **Fused SIMD** (lossless) | -| Concurrent queries | GIL-limited | **61,000 QPS** (Virtual Threads) † | -| Memory per record | ~500B (Python objects) | **32B header + quantized vector** | - -† *Measured on Intel Core Ultra 9 285K, Java 25, AVX2. 
See [Benchmarks](performance.md).* - ---- - -## The Biological Metaphor - -Spector Memory maps every major cognitive subsystem from neuroscience to a dedicated Java package: - -```mermaid -graph TB - subgraph "🧠 Spector Memory" - SM[SpectorMemory
Façade] --> CT[CognitiveIngestionTarget
Cognitive remember] - SM --> RP[RecallPipeline
Parallel recall] - - subgraph "Cortex — Tier Stores" - TR[TierRouter] --> WM[Working
Prefrontal Cortex] - TR --> EM[Episodic
Hippocampus] - TR --> SE[Semantic
Neocortex] - TR --> PR[Procedural
Basal Ganglia] - end - - subgraph "Synapse — Scoring" - CS[CognitiveScorer
6-phase SIMD] --> STE[SynapticTagEncoder
Bloom Filter] - CS --> DS[DecayStrategy
Temporal Decay] - end - - subgraph "Neuromodulators" - SD[SurpriseDetector
Dopamine] --> FP[FlashbulbPolicy] - VT[ValenceTracker
Amygdala] - HP[HabituationPenalty
Anti-filter bubble] - SS[SuppressionSet
Inhibition] - end - - subgraph "3-Layer Cognitive Graph" - HG[HebbianGraph
Layer 1: Association] - EG[EntityGraph
Layer 2: Knowledge] - TC[TemporalChain
Layer 3: Causal] - CA[CoActivationTracker
STDP Learning] - end - - subgraph "Consolidation" - RD[ReflectDaemon
Sleep Consolidation] - TCC[TombstoneCompactor
Synaptic Pruning] - end - - CT --> TR - RP --> CS - RP --> TR - RP --> HG - RP --> TC - RP --> EG - end -``` - ---- - -## The 4-Tier Memory Architecture - -Just as the human brain has distinct memory systems, Spector organizes memories into four cognitive tiers: - -=== "🧪 Working Memory" - - **Biological analog: Prefrontal Cortex** - - Volatile, limited-capacity buffer for the current task context. Circular buffer — oldest entries are evicted when full. - - - **Capacity**: Configurable (default: 100 records) - - **Storage**: In-memory `Arena.ofShared()` segment - - **Use case**: "What was the user just talking about?" - -=== "📝 Episodic Memory" - - **Biological analog: Hippocampus** - - Time-stamped event records. Partitioned by day, backed by mmap'd files for persistence across JVM restarts. Supports sleep consolidation into semantic memory. - - - **Capacity**: Unbounded (partitioned, mmap-backed) - - **Storage**: `FileChannel.map()` with 64-byte metadata header per partition - - **Use case**: "What error did we debug yesterday?" - -=== "🧬 Semantic Memory" - - **Biological analog: Neocortex** - - Distilled, permanent knowledge. Created by sleep consolidation (ReflectDaemon) from episodic clusters, or directly by the user. - - - **Capacity**: Configurable (default: 5,000 records) - - **Storage**: Header-only slab (fast metadata scan) - - **Use case**: "The user prefers dark mode." - -=== "⚙️ Procedural Memory" - - **Biological analog: Basal Ganglia** - - Learned procedures, rules, and patterns. Small, append-only store for procedural knowledge. - - - **Capacity**: Configurable (default: 500 records) - - **Storage**: In-memory `Arena.ofShared()` segment - - **Use case**: "Always use exponential backoff for retries." - ---- - -## Explore the Documentation - -
- -- :material-brain:{ .lg .middle } **System Architecture** - - --- - - Package hierarchy, data flow diagrams, and extensibility model - - [:octicons-arrow-right-24: Architecture](architecture.md) - -- :material-lightning-bolt:{ .lg .middle } **6-Phase Scoring Pipeline** - - --- - - Deep dive into the SIMD hot-loop: tombstone → tags → valence → importance → L2 → fused score - - [:octicons-arrow-right-24: Scoring Pipeline](scoring-pipeline.md) - -- :material-share-variant:{ .lg .middle } **3-Layer Cognitive Graph** - - --- - - Hebbian association, entity-relationship knowledge, and temporal causal chains — three off-heap graph structures that augment vector recall - - [:octicons-arrow-right-24: Cognitive Graph](hebbian.md) - -- :material-head-cog:{ .lg .middle } **Biological Systems** - - --- - - Each brain region mapped to code: Cortex, Hippocampus, Synapse, Dopamine, Amygdala, Habituation, Inhibition - - [:octicons-arrow-right-24: Start with Cortex](cortex.md) - -- :material-speedometer:{ .lg .middle } **Performance & SIMD** - - --- - - Benchmark results, SIMD kernel throughput, optimization techniques, virtual thread scaling - - [:octicons-arrow-right-24: Performance](performance.md) - -- :material-memory:{ .lg .middle } **Off-Heap Panama Design** - - --- - - Zero-GC architecture, MemorySegment lifecycle, mmap partitions, 32-byte CognitiveRecord binary format - - [:octicons-arrow-right-24: Panama Design](panama-design.md) - -- :material-api:{ .lg .middle } **API Reference** - - --- - - SpectorMemory.Builder, RecallOptions, CognitiveResult, MemoryType — full method signatures - - [:octicons-arrow-right-24: API Reference](api-reference.md) - -
diff --git a/docs/docs/memory/inhibition.md b/docs/docs/memory/inhibition.md deleted file mode 100644 index 09b08c9..0000000 --- a/docs/docs/memory/inhibition.md +++ /dev/null @@ -1,150 +0,0 @@ ---- -title: "Inhibition — Suppression" -description: "SuppressionSet enables explicit memory blocking — the digital equivalent of motivated forgetting." ---- - -# 🚫 Inhibition — Suppression - -> **Package**: `com.spectrayan.spector.memory.inhibition` -> -> **Biological Analog**: **Retrieval-Induced Forgetting** (Anderson et al., 1994) — the brain actively suppresses competing memories during recall. When you try to remember where you parked today, your brain inhibits memories of yesterday's parking spot. This is an active process, not passive decay. - ---- - -## The Concept - -Suppression is different from forgetting: - -| Operation | Method | Effect | Reversible? | -|---|---|---|---| -| **Forget** | `memory.forget(id)` | Tombstones the record — permanently excluded from all scans | No | -| **Suppress** | `memory.suppress(id, reason)` | Adds to suppression set — excluded from recall results | **Yes** | - -Tombstoning modifies the off-heap flags byte (bit 0 = 1). Suppression maintains a separate in-memory set — the underlying memory is untouched and can be un-suppressed later. - ---- - -## SuppressionSet - -```java -public final class SuppressionSet { - - private final ConcurrentHashMap<String, String> suppressed = new ConcurrentHashMap<>(); - - /** - * Suppresses a memory — it will be excluded from all future recall results. - * - * @param memoryId the memory to suppress - * @param reason human-readable reason (for auditability) - */ - public void suppress(String memoryId, String reason) { - suppressed.put(memoryId, reason != null ? reason : ""); - } - - /** - * Removes suppression — the memory will appear in recall results again. - */ - public void unsuppress(String memoryId) { - suppressed.remove(memoryId); - } - - /** - * Checks if a memory is currently suppressed. 
- * Called at Step 4 of the recall pipeline. - */ - public boolean isSuppressed(String memoryId) { - return suppressed.containsKey(memoryId); - } - - /** - * Returns the number of currently suppressed memories. - */ - public int size() { - return suppressed.size(); - } -} -``` - ---- - -## Integration with RecallPipeline - -Suppression is checked at **Step 4** of the recall pipeline — after scoring but before habituation: - -```java -// Step 4: Filter suppressed memories (inhibition) -allResults.removeIf(r -> suppressionSet.isSuppressed(r.id())); -``` - -**Timing matters**: Suppression is checked *after* the CognitiveScorer completes. This means suppressed memories still consume SIMD cycles during scoring. For high-frequency suppression scenarios, consider using `forget()` instead. - ---- - -## Use Cases - -### 1. User Redaction - -```java -// User says: "Please forget what I said about project X" -memory.suppress("project-x-conversation-1", "User requested redaction"); -memory.suppress("project-x-conversation-2", "User requested redaction"); -``` - -### 2. Context Switching - -```java -// Agent is switching tasks — suppress irrelevant context -memory.suppress("frontend-task-context", "Switching to backend work"); - -// Later, when switching back: -memory.unsuppress("frontend-task-context"); -``` - -### 3. Stale Data Quarantine - -```java -// A data source is known to be stale — suppress while validating -for (String id : staleSourceMemories) { - memory.suppress(id, "Source under validation — suppressed until confirmed"); -} -``` - -### 4. A/B Testing Memory Strategies - -```java -// Suppress certain memories to test how the agent performs without them -experimentGroup.forEach(id -> - memory.suppress(id, "A/B test: control group")); -``` - ---- - -## Suppression vs. Tombstone - -```mermaid -graph TB - subgraph "Suppress (Reversible)" - S1["memory.suppress(id)"] --> S2["SuppressionSet.add(id)"] - S2 --> S3["Recall Pipeline
Step 4: removeIf(suppressed)"] - S3 --> S4["Can unsuppress(id)"] - end - - subgraph "Forget (Permanent)" - F1["memory.forget(id)"] --> F2["flags byte |= 0x01
(tombstone bit)"] - F2 --> F3["CognitiveScorer
Phase 1: skip immediately"] - F3 --> F4["Permanent — cannot undo"] - end - - style S4 fill:#2ecc71,color:white - style F4 fill:#e74c3c,color:white -``` - -**Performance difference**: Tombstoned memories are skipped in Phase 1 of the scorer (~1 cycle). Suppressed memories go through the full 6-phase scoring pipeline and are only filtered at Step 4 of the recall pipeline. For bulk suppression, `forget()` is more efficient. - ---- - -## Next Steps - -- :material-speedometer: [**Performance**](performance.md) — benchmark results and optimization techniques -- :material-sleep: [**Habituation — Anti-Filter Bubble**](habituation.md) — automatic score attenuation -- :material-brain: [**Architecture**](architecture.md) — where suppression fits in the pipeline diff --git a/docs/docs/memory/interference.md b/docs/docs/memory/interference.md deleted file mode 100644 index 24d11d3..0000000 --- a/docs/docs/memory/interference.md +++ /dev/null @@ -1,76 +0,0 @@ ---- -title: "Interference — Deduplication" -description: "SemanticDeduplicator detects near-duplicate memories and merges them to prevent proactive interference." ---- - -# 🔀 Interference — Deduplication - -> **Package**: `com.spectrayan.spector.memory.interference` -> -> **Biological Analog**: **Proactive interference** occurs when old memories interfere with new learning. If you move to a new city, your old address "interferes" when you try to recall the new one. The brain resolves this by strengthening the newer trace and weakening the old one. - ---- - -## The Problem - -Without deduplication, an agent remembering the same fact repeatedly creates redundant entries: - -``` -memory[0]: "User prefers dark mode" importance=0.8 -memory[1]: "User prefers dark mode" importance=0.7 -memory[2]: "The user likes dark mode" importance=0.9 ← near-duplicate -``` - -These compete during recall, waste storage, and dilute the Hebbian co-activation signal. 
- ---- - -## SemanticDeduplicator - -The `SemanticDeduplicator` detects near-duplicates by computing L2 distance between the new memory's vector and existing memories. When a match is found within a configurable threshold, it **merges** rather than creating a new record: - -```java -public final class SemanticDeduplicator { - - /** - * Checks if a near-duplicate exists and merges if found. - * - * Merge strategy: - * - importance = max(existing, new) - * - synapticTags = existing | new (OR-merge: union of Bloom filters) - * - timestamp = most recent - * - recallCount preserved from existing - */ - public Optional<Long> findAndMerge(MemorySegment segment, int recordCount, - CognitiveRecordLayout layout, - float[] newVector, CognitiveHeader newHeader, - float threshold) { - // Scan for near-duplicate within L2 threshold - // If found: merge headers via OR on tags, max on importance - // Return offset of merged record (or empty if no duplicate) - } -} -``` - -**Merge rules**: - -| Field | Strategy | Rationale | -|---|---|---| -| `importance` | `max(existing, new)` | Keep the highest importance signal | -| `synapticTags` | `existing \| new` | Union of Bloom filters — broader context | -| `timestamp` | Most recent | Memory is "refreshed" | -| `recallCount` | Preserved | Reconsolidation history maintained | -| `valence` | From newer | Most recent emotional assessment | - ---- - -## Integration - -Deduplication runs during the ingestion pipeline **after embedding but before writing**. If a merge occurs, no new record is created — the existing record is updated in-place. 
- ---- - -## Next Steps - -- :material-clock: [**Prospective — Future Intents**](prospective.md) — time-triggered reminders -- :material-brain: [**Architecture**](architecture.md) — where deduplication fits in the pipeline diff --git a/docs/docs/memory/lateral-retrieval.md b/docs/docs/memory/lateral-retrieval.md deleted file mode 100644 index 8df3117..0000000 --- a/docs/docs/memory/lateral-retrieval.md +++ /dev/null @@ -1,185 +0,0 @@ -# Explorer — Lateral Retrieval - -The **Explorer** profile enables **lateral retrieval** — surfacing memories that are semantically distant from the query but share contextual tags. This is the computational equivalent of divergent thinking: connecting ideas across domains. - ---- - -## The Problem - -Standard similarity-based retrieval has a blind spot: it only finds memories that are **close** to the query in vector space. This creates a filter bubble — the agent keeps retrieving the same cluster of closely related memories. - -But some of the most valuable insights come from **cross-domain connections**: - -- A debugging agent stuck on a race condition might benefit from recalling a design pattern used in a completely different subsystem -- A research agent exploring "database indexing" might gain from a memory about "B-tree file system layouts" — related by tags, but distant in embedding space - ---- - -## How It Works - -### Dual-Heap Architecture - -When `lateralMode=true`, the [CognitiveScorer](scoring-pipeline.md) maintains **two priority queues** instead of one: - -```mermaid -flowchart LR - Q["Query Vector"] --> S["CognitiveScorer"] - S --> |"L2 distance ≤ threshold"| H1["Standard Heap\n(top-K by score)"] - S --> |"L2 distance > threshold\n+ tag overlap ≥ minOverlap"| H2["Lateral Heap\n(top-N by lateral score)"] - H1 --> M["Merged Results"] - H2 --> M -``` - -A memory is classified as a **lateral candidate** when: - -1. **Semantically distant**: `l2dist > lateralDistanceThreshold` (default: 1.2) -2. 
**Contextually related**: `tagOverlap >= lateralMinTagOverlap` (default: 0.5) - -### Lateral Scoring Formula - -Lateral candidates use an **inverted** scoring function — higher distance means higher lateral score: - -$$ -\text{lateralScore} = \frac{1}{1 + \frac{1}{d}} \cdot \text{tagOverlap} \cdot \text{importance} \cdot \text{decay} -$$ - -Where $d$ is the L2 distance. This produces a bounded score in $(0, 1)$: - -| L2 Distance | Lateral Similarity | -|:---:|:---:| -| 0.5 | 0.33 | -| 1.0 | 0.50 | -| 1.5 | 0.60 | -| 2.0 | 0.67 | -| 5.0 | 0.83 | -| ∞ | 1.00 | - -### Result Blending - -After the scoring loop, lateral results are appended after standard results: - -``` -Final results = [standard top-K] + [lateral top-N] -``` - -The caller can distinguish them via `CognitiveResult.retrievalMode()`: - -```java -for (CognitiveResult r : results) { - if (r.isLateral()) { - System.out.println("Cross-domain insight: " + r.text()); - } -} -``` - ---- - -## Configuration - -```java -// Via profile preset (recommended) -var results = memory.recall("performance optimization", CognitiveProfile.DIVERGENT); - -// Via explicit options -var options = RecallOptions.builder() - .profile(CognitiveProfile.DIVERGENT) - .lateralDistanceThreshold(1.5f) // how far is "far enough" - .lateralMaxResults(5) // max lateral candidates - .lateralMinTagOverlap(0.3f) // minimum tag overlap - .build(); -``` - -### Parameter Tuning - -| Parameter | Default | Effect | -|:---|:---:|:---| -| `lateralDistanceThreshold` | 1.2 | Higher → only very distant memories qualify | -| `lateralMaxResults` | topK/3 | Caps the number of lateral results | -| `lateralMinTagOverlap` | 0.5 | Higher → requires stronger contextual connection | - ---- - -## Auto-Tuning via the Lateral Evaluator - -The system automatically monitors whether lateral results are useful through the **LateralEvaluator**: - -### Feedback Loop - -```mermaid -sequenceDiagram - participant A as Agent - participant S as SpectorMemory - participant E as 
LateralEvaluator - - A->>S: recall("topic", DIVERGENT) - S-->>A: [standard + lateral results] - Note right of A: Agent uses results... - A->>S: reinforce("lateral-mem-1", positive) - S->>E: recordLateralReinforcement() - Note right of E: LUR increases - A->>S: suppress("lateral-mem-2", "irrelevant") - S->>E: recordLateralSuppression() - Note right of E: LSR increases - Note right of E: Auto-tune threshold -``` - -### Metrics - -| Metric | Formula | Meaning | -|:---|:---|:---| -| **LUR** (Lateral Utility Rate) | reinforced / returned | "Are lateral results useful?" | -| **LSR** (Lateral Suppression Rate) | suppressed / returned | "Are lateral results noise?" | -| **LHI** (Lateral Hallucination Index) | suppressed / (reinforced + suppressed) | "Of all feedback, how much is negative?" | - -### Auto-Tuning Rules - -| Condition | Action | -|:---|:---| -| LUR < 0.05 (5%) | **Auto-disable** lateral mode | -| LUR < 0.10 (10%) | **Tighten** distance threshold by 10% | -| LUR > 0.30 (30%) | Lateral mode is healthy, no change | - -### MCP Monitoring - -The `memory_status` MCP tool shows lateral metrics: - -``` -Lateral Retrieval: - Enabled: true - Threshold: 1.20 - Samples: 47 - LUR (util): 0.34 - LSR (supp): 0.09 - LHI (hall): 0.20 -``` - -The `memory_reinforce` tool reports when feedback is recorded for a lateral result: - -``` -👍 Reinforced 'mem-123' with valence=50 (lateral result — feedback recorded) -``` - ---- - -## Performance - -| Metric | Cost | -|:---|:---| -| Lateral detection | ~3 cycles per record (threshold compare + tag overlap) | -| Lateral heap | O(N log N) where N = lateralMaxResults (typically 3-5) | -| Auto-tuning | O(1) atomic increments, evaluated every `evaluationWindow` returns | - -!!! note "Zero Overhead When Disabled" - The lateral code path is gated by `lateralMode == true`. When `lateralMode` is false (the default for all profiles except DIVERGENT), no lateral detection or heap management occurs. 
- ---- - -## When to Use Explorer - -| Scenario | Recommendation | -|:---|:---| -| Agent is stuck on a problem | ✅ Switch to DIVERGENT | -| Brainstorming or creative tasks | ✅ Use DIVERGENT | -| Precision recall (debugging, audit) | ❌ Use DEBUGGING or CRITICAL | -| Building a knowledge base | ❌ Use SYSTEMATIZER | -| General conversation | ⚠️ BALANCED is usually sufficient | diff --git a/docs/docs/memory/metamemory.md b/docs/docs/memory/metamemory.md deleted file mode 100644 index 0a4776b..0000000 --- a/docs/docs/memory/metamemory.md +++ /dev/null @@ -1,83 +0,0 @@ ---- -title: "Metamemory — Self-Reflection" -description: "MemoryIntrospector provides self-reflective analytics — the agent's ability to reason about its own memory health." ---- - -# 🪞 Metamemory — Self-Reflection - -> **Package**: `com.spectrayan.spector.memory.metamemory` -> -> **Biological Analog**: **Metamemory** is the awareness of one's own memory processes — "I know I'm forgetting things more often" or "I'm confident I remember this correctly." It's what enables humans to say "I need to write this down" or "Let me double-check that." - ---- - -## MemoryIntrospector - -The `MemoryIntrospector` provides analytics and health metrics for the memory system: - -```java -public final class MemoryIntrospector { - - /** - * Returns per-tier memory counts. - */ - public Map countsByTier() { ... } - - /** - * Returns the most common synaptic tags across all memories. - * Useful for understanding what topics dominate the agent's memory. - */ - public Map tagDistribution() { ... } - - /** - * Returns importance distribution statistics. - */ - public DoubleSummaryStatistics importanceStats() { ... } - - /** - * Returns memories with the highest recall counts - * (most frequently accessed — potential habituation candidates). - */ - public List mostRecalled(int topK) { ... } - - /** - * Returns the oldest active memories (potential consolidation candidates). - */ - public List oldestActive(int topK) { ... 
} -} -``` - ---- - -## Use Cases - -### Memory Health Dashboard - -```java -var introspector = memory.introspect(); - -// Tier distribution — is memory balanced? -introspector.countsByTier().forEach((tier, count) -> - System.out.printf(" %s: %d memories%n", tier, count)); - -// Tag distribution — what topics dominate? -introspector.tagDistribution().entrySet().stream() - .sorted(Map.Entry.comparingByValue().reversed()) - .limit(10) - .forEach(e -> System.out.printf(" %s: %d occurrences%n", e.getKey(), e.getValue())); -``` - -### Adaptive Agent Behavior - -An agent can use metamemory to self-optimize: - -- **High episodic count, low semantic**: "I should consolidate — trigger a reflect cycle" -- **High recall count on one memory**: "I'm over-relying on this — diversify" -- **Low importance average**: "Most memories are routine — increase surprise sensitivity" - ---- - -## Next Steps - -- :material-sync: [**Sync — Persistence & Replication**](sync.md) — WAL and CRDT merge -- :material-brain: [**Architecture**](architecture.md) — system overview diff --git a/docs/docs/memory/panama-design.md b/docs/docs/memory/panama-design.md deleted file mode 100644 index 14313b8..0000000 --- a/docs/docs/memory/panama-design.md +++ /dev/null @@ -1,304 +0,0 @@ ---- -title: "Off-Heap Panama Design" -description: "Zero-GC architecture using Project Panama MemorySegment, Arena management, mmap partitions, and versioned header layouts (V1/V2/V3)." ---- - -# 💾 Off-Heap Panama Design - -Spector Memory achieves **zero garbage collection pressure** by storing all vector data and cognitive headers off-heap using Java Project Panama's Foreign Function & Memory API. No memory record ever touches the JVM heap. - ---- - -## Why Off-Heap? - -In a standard JVM application, objects live on the heap and are managed by the garbage collector. 
For AI memory workloads, this creates problems: - -| On-Heap (Traditional) | Off-Heap (Panama) | -|---|---| -| GC pauses (10-100ms for large heaps) | **Zero GC pauses** — data is invisible to GC | -| Object overhead (16-24 bytes per object header) | **Zero overhead** — raw bytes, no object headers | -| Memory fragmentation over time | **Compact** — contiguous byte arrays | -| Heap size limits JVM config | **System memory** — limited only by OS | -| Serialization required for persistence | **Direct mmap** — bytes are already on disk | - ---- - -## Panama Architecture - -### MemorySegment — The Core Abstraction - -Every memory record is stored in a `MemorySegment` — a contiguous off-heap byte buffer managed by an `Arena`: - -```java -// Allocate 8 MB of off-heap memory, 32-byte aligned -Arena arena = Arena.ofShared(); -MemorySegment segment = arena.allocate(8 * 1024 * 1024, 32); - -// Write a float directly at a byte offset — no Java objects involved -segment.set(ValueLayout.JAVA_FLOAT, offset + 20, 0.85f); - -// Read it back — zero deserialization -float importance = segment.get(ValueLayout.JAVA_FLOAT, offset + 20); -``` - -**Key properties**: - -- `Arena.ofShared()` — thread-safe for concurrent reads (Virtual Threads) -- 32-byte alignment ensures SIMD-friendly access patterns -- No Java objects are created — the GC never sees this memory - -### Arena Lifecycle - -```mermaid -graph LR - A["Arena.ofShared()"] --> B["allocate(bytes, alignment)"] - B --> C["MemorySegment
(off-heap)"] - C -->|read/write| D["SIMD Scorer
Virtual Threads"] - C -->|"arena.close()"| E["Memory Released
to OS"] - - style A fill:#3498db,color:white - style C fill:#2ecc71,color:white - style E fill:#e74c3c,color:white -``` - -!!! warning "Lifetime Management" - Unlike heap objects, off-heap memory is **not garbage collected**. You must explicitly close the `Arena` when done. `SpectorMemory` implements `AutoCloseable` and closes all arenas in its `close()` method. Always use try-with-resources. - ---- - -## Three Storage Modes - -### 1. Arena-Allocated (Working, Procedural) - -Volatile, in-memory segments for transient data: - -```java -// WorkingMemoryStore — circular buffer -Arena arena = Arena.ofShared(); -long totalBytes = (long) capacity * stride; -MemorySegment segment = arena.allocate(totalBytes, HEADER_BYTES); -``` - -**Characteristics**: - -- Fast allocation (~1µs) -- Lost on JVM shutdown -- No file I/O overhead -- Fixed capacity - -### 2. mmap-Backed (Episodic) - -Persistent, memory-mapped files for durable storage: - -```java -// EpisodicPartition — mmap via FileChannel.map() -FileChannel channel = FileChannel.open(path, READ, WRITE); -MemorySegment segment = channel.map(MapMode.READ_WRITE, 0, totalBytes, arena); -``` - -**Characteristics**: - -- Persists across JVM restarts -- OS handles paging to/from disk -- Lazy loading — only mapped pages are in physical RAM -- Atomic `force()` for durability - -### 3. Header-Only Slab (Semantic) - -Compact metadata-only storage (no vectors): - -```java -// SemanticMemoryStore — header slab -// Uses configured HeaderLayout (V1=32B, V2=48B, V3=64B) -long slabBytes = (long) capacity * headerLayout.headerBytes(); -MemorySegment headerSlab = arena.allocate(slabBytes, headerLayout.headerBytes()); -``` - -**Characteristics**: - -- Minimal memory footprint (32-64B per record vs. 
~800B for full records) -- Fast metadata scans (tag match, importance, valence, arousal) -- No vector data — re-embed at query time if needed - ---- - -## Binary Record Format - -### Versioned Header Layouts - -The cognitive record format uses a **versioned header** via the `HeaderLayout` sealed interface. The header version determines the record stride and available fields. See [Synapse — Tags & Scoring](synapse.md) for the full byte-level specification. - -```mermaid -graph LR - subgraph "V1 — 32B" - V1H["Header (32B)"] --> V1V["INT8 Vector (NB)"] - end - subgraph "V2 — 48B" - V2H["Header (48B)"] --> V2V["INT8 Vector (NB)"] - end - subgraph "V3 — 64B ⭐ Default" - V3H["Header (64B)"] --> V3V["INT8 Vector (NB)"] - end - - style V3H fill:#27ae60,color:white - style V3V fill:#2ecc71,color:white -``` - -### V1 Layout (32 bytes) — Legacy - -``` - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| | -+ timestamp (8B) + ← Offset 0 -| | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| | -+ synapticTags (8B) + ← Offset 8 -| | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| exactNorm (4B) | ← Offset 16 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| importance (4B) | ← Offset 20 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| recallCount (4B) | ← Offset 24 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| centroidId (2B) | valence (1B) | flags (1B) | ← Offset 28 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| | -+ Quantized Vector — INT8[N] + ← Offset 32 -| (dequantize: float = byte × scale + min) | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - stride = 32 + N bytes per record -``` - -### V2 Layout (48 bytes) — Extended - -Adds arousal and storage strength fields: - -``` - [32B V1 core as above] 
-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| arousal (1B) | padding (3B) | ← Offset 32 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| storageStrength (4B) | ← Offset 36 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| | -+ reserved (8B) + ← Offset 40 -| | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| Quantized Vector — INT8[N] | ← Offset 48 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - stride = 48 + N bytes per record -``` - -### V3 Layout (64 bytes) — Full Cache Line ⭐ Default - -Extends V2 with a 16-byte future buffer, aligned to a full CPU cache line: - -``` - [48B V2 core as above] -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| | -+ reserved_2 (16B) + ← Offset 48 -| (future expansion buffer) | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| Quantized Vector — INT8[N] | ← Offset 64 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - stride = 64 + N bytes per record -``` - -### Memory Cost Comparison - -| Version | Header | Stride (768-dim) | 1M Records | Alignment | -|:---|:---:|:---:|:---:|:---| -| V1 | 32B | 800B | ~763 MB | 1× AVX2 register | -| V2 | 48B | 816B | ~778 MB | 1.5× AVX2 | -| V3 ⭐ | 64B | 832B | ~793 MB | 1× cache line (64B) | - -### Field Access Patterns - -The header layout is designed for **sequential access** in the scoring hot-loop. 
Fields are ordered by access frequency: - -``` -Phase 1: flags (offset 31, 1B) — First check, highest skip rate -Phase 2: synapticTags (offset 8, 8B) — Second check, eliminates 99% -Phase 3: valence (offset 30, 1B) — Third check (profile-dependent) -Phase 4: importance (offset 20, 4B) — Fourth check -Phase 4: timestamp (offset 0, 8B) — Read with importance -Phase 4: recallCount (offset 24, 4B) — Reconsolidation adjustment -Phase 4: arousal (offset 32, 1B) — V2+: arousal-modulated decay -Phase 5: vector (offset H, NB) — Only if all filters pass (H = header bytes) -``` - -!!! tip "Cache Line Optimization" - V3's 64-byte header occupies exactly **one CPU cache line**. During sequential scans, each header read hits exactly one cache line — no split-line loads, no false sharing. The CPU prefetcher can pre-fetch the next record's header while the current one is being scored. V1's 32-byte header fits in half a cache line, meaning the vector data starts mid-cache-line which can cause split reads. - ---- - -## Episodic Partition File Format - -Each episodic partition file has a 64-byte metadata header: - -``` -Offset Size Field Description -────── ──── ───── ─────────── - 0 4B magic 0x45504943 ("EPIC" in ASCII) - 4 4B version Format version (1) - 8 4B count Number of live records - 12 4B tombstoneCount Number of tombstoned records - 16 4B capacity Maximum records in partition - 20 4B state PartitionState ordinal - 24 4B stride Record stride in bytes - 28 36B reserved Future use (alignment padding) -``` - -**File naming**: `episodic-{yyyyMMdd}.mem` (e.g., `episodic-20260527.mem`) - -**Partition capacity**: Default 10,000 records per partition. At 800 bytes/record (768-dim INT8), each partition file is ~8 MB. 
- ---- - -## Thread Safety Model - -| Component | Thread Safety | Mechanism | -|---|---|---| -| `Arena.ofShared()` | ✅ Concurrent reads | Built-in Panama support | -| `MemorySegment` reads | ✅ Lock-free | Direct memory access | -| `MemorySegment` writes | ⚠️ Single writer | `synchronized` on partition append | -| `ConcurrentHashMap` (index) | ✅ Lock-free reads | CAS-based updates | -| Partition metadata | ⚠️ Single writer | Metadata header writes are synchronized | - -**Recall**: Multiple Virtual Threads read different partitions concurrently — zero contention because each partition's `MemorySegment` is disjoint. - -**Ingestion**: Writes are serialized per partition (one writer at a time) but different partitions can accept writes concurrently. - ---- - -## Zero-Copy Data Path - -```mermaid -graph LR - A["💾 Disk"] -->|mmap| B["MemorySegment"] - B -->|"direct read"| C["SIMD Registers"] - C --> D["✅ Score"] - - style A fill:#3498db,color:white - style B fill:#2ecc71,color:white - style D fill:#00b894,color:white -``` - -> **No Java objects created. No serialization. No deserialization. No GC pressure.** - -The entire data path from persistent storage to CPU computation operates on **raw bytes**. The JVM heap is used only for the top-K result set (`List`) — typically 5-20 small Java records. 
- ---- - -## Next Steps - -- :material-speedometer: [**Performance**](performance.md) — benchmark results -- :material-brain: [**Architecture**](architecture.md) — system design -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — the SIMD hot-loop -- :material-tag: [**Synapse — Tags & Scoring**](synapse.md) — versioned header byte maps, arousal decay, Bloom filter -- :material-flask: [**Labs — Research Roadmap**](../labs/roadmap.md) — Dynamic Quantization (SQ4), Two-Factor Memory diff --git a/docs/docs/memory/performance.md b/docs/docs/memory/performance.md deleted file mode 100644 index 0f4e379..0000000 --- a/docs/docs/memory/performance.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -title: "Performance & SIMD" -description: "Benchmark results, SIMD kernel throughput, and architecture decisions that enable microsecond-scale latency in Spector Memory." ---- - -# ⚡ Performance & SIMD - -Spector Memory is engineered for microsecond-scale latency. This page documents the benchmark results and the key performance techniques that make it possible. 
- ---- - -## Benchmark Summary - -Measured on **Intel Core Ultra 9 285K**, Java 25, AVX2 256-bit (8 float lanes), ZGC: - -| Benchmark | Result | Notes | -|---|---|---| -| **SIMD L2 Distance (128-dim)** | 0.8 µs/vector | 1.2M vectors/sec | -| **SIMD L2 Distance (384-dim)** | 1.5 µs/vector | 2.6M vectors/sec | -| **SIMD L2 Distance (768-dim)** | 2.2 µs/vector | 1.4M vectors/sec | -| **SIMD L2 Distance (1024-dim)** | 3.0 µs/vector | 1.0M vectors/sec | -| **Reverse Index Lookup** | 180 ns/lookup | O(1) packed-key ConcurrentHashMap | -| **CognitiveScorer (10K × 128-dim)** | 2.9 ms total | Full 6-phase pipeline | -| **Batch Habituation (1K IDs)** | 101 µs total | 100 ns per penalty computation | -| **TierRouter.totalCount()** | 17 ms / 100K calls | 170 ns per call | -| **Full Pipeline (1K ingest + 100 recall)** | < 50 ms/query | End-to-end latency | -| **Real Embedding (qwen3-embedding 4096-dim)** | 31 ms/embed | Via Ollama (network bound) | - ---- - -## Key Techniques - -### O(1) Reverse Index - -Memory IDs are resolved in constant time using a packed-key `ConcurrentHashMap`: - -```java -// Pack (type, offset) into a single long — zero String concatenation -private static long reverseKey(MemoryType type, long offset) { - return ((long) type.ordinal() << 48) | (offset & 0x0000_FFFF_FFFF_FFFFL); -} -``` - -This yields **180 ns** lookups at 50K entries. 
- ---- - -### SIMD Euclidean Distance - -Quantized INT8 Euclidean distance uses the Java Vector API for hardware acceleration: - -```java -// Vectorized dequantization + L2 in a single SIMD pass -FloatVector vQuery = FloatVector.fromArray(SPECIES, queryVector, i); -ByteVector vQuantized = ByteVector.fromMemorySegment(SPECIES_BYTE, segment, offset + i, NATIVE); -FloatVector vFloat = vQuantized.castShape(SPECIES, 0); // INT8 → float32 -FloatVector vDequant = vFloat.mul(vScale).add(vMin); // Affine dequantization -FloatVector vDiff = vQuery.sub(vDequant); -vSum = vDiff.fma(vDiff, vSum); // Fused multiply-add -``` - -This achieves **2.2 µs/vector** at 768 dimensions (1.4M vectors/sec). - ---- - -### Batch Habituation - -The habituation penalty module computes all penalties in a single batch call with amortized map access, processing 1K penalties in **101 µs** total. - ---- - -### Inline Header Capture - -`ScoredRecord` captures the `CognitiveHeader` inline during scoring, eliminating N×8 off-heap re-reads per recall query. - ---- - -### Direct TierRouter Access - -`totalCount()` uses direct field access to typed store references rather than iteration, completing 100K calls in **17 ms** (170 ns/call). - ---- - -## Parallel Tier Scanning - -Each memory tier is scanned on a dedicated **Virtual Thread** via `ConcurrentTasks.forkJoinAll()`: - -```mermaid -gantt - title Parallel Recall: 5 concurrent scans - dateFormat X - axisFormat %L ms - - section Working (100 records) - Scan :a1, 0, 1 - section Episodic P1 (5K records) - Scan :a2, 0, 3 - section Episodic P2 (3K records) - Scan :a3, 0, 2 - section Semantic (200 headers) - Scan :a4, 0, 1 - section Procedural (50 records) - Scan :a5, 0, 1 - section Merge + Rank - Top-K :a6, 3, 4 -``` - -**Key insight**: Episodic partitions use **disjoint memory segments** — each partition's mmap is a separate `MemorySegment`. This guarantees zero contention between virtual threads, enabling perfect parallel scaling. 
- -**Fallback**: If parallel scanning fails (e.g., thread pool exhaustion), the pipeline falls back to sequential scanning with identical results. - ---- - -## Memory Footprint - -| Component | Formula | 10K memories (768-dim) | -|---|---|---| -| Episodic partition | 64B header + N × (32B + vecBytes) | 64B + 10K × 800B = **7.8 MB** | -| Working memory | capacity × (32B + vecBytes) | 100 × 800B = **78 KB** | -| Semantic headers | capacity × 32B | 5K × 32B = **156 KB** | -| Procedural store | capacity × (32B + vecBytes) | 500 × 800B = **390 KB** | -| Forward index | ~120B per entry | 10K × 120B = **1.2 MB** | -| Reverse index | ~60B per entry | 10K × 60B = **600 KB** | -| **Total** | | **~10.2 MB** | - -!!! tip "vs. Python Memory Layers" - A Python memory system stores each memory as a Python object (~500-800 bytes overhead) plus the vector in NumPy (~3KB for 768-dim float32). Spector stores the same memory in **800 bytes** (32B header + 768B INT8 vector) — a 5-10× reduction. - ---- - -## Test Suite - -``` -spector-core: 276 tests ✅ (includes 15 SIMD kernel verification tests) -spector-memory: 167 tests ✅ (includes performance benchmarks + index tests) - + 10 Ollama real embedding E2E tests (gated by OLLAMA_LIVE=true) -Total: 443 tests, 0 failures -``` - -### Running Benchmarks - -```bash -# Run all memory tests (includes benchmark assertions) -mvn test -pl spector-memory - -# Run only performance benchmarks -mvn test -pl spector-memory -Dtest=PerformanceBenchmarkTest - -# Run Ollama real embedding E2E tests -OLLAMA_LIVE=true mvn test -pl spector-memory -Dtest=OllamaRealEmbeddingTest -``` - ---- - -## Next Steps - -- :material-memory: [**Off-Heap Panama Design**](panama-design.md) — zero-GC architecture -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — the SIMD hot-loop -- :material-brain: [**Architecture**](architecture.md) — system-level design diff --git a/docs/docs/memory/prospective.md b/docs/docs/memory/prospective.md deleted 
file mode 100644 index 6ddd8b0..0000000 --- a/docs/docs/memory/prospective.md +++ /dev/null @@ -1,98 +0,0 @@ ---- -title: "Prospective — Future Intents" -description: "ProspectiveScheduler enables time-triggered memory reminders — the agent's ability to remember to do something in the future." ---- - -# 🔮 Prospective — Future Intents - -> **Package**: `com.spectrayan.spector.memory.prospective` -> -> **Biological Analog**: **Prospective memory** is the ability to remember to perform an intended action in the future — "Remember to call the doctor at 3pm." Unlike retrospective memory (recalling the past), prospective memory is future-oriented and time-triggered. - ---- - -## The Concept - -An AI agent needs to remember not just *what happened*, but *what to do next*. Prospective memory enables: - -- "Remind me to check the build in 10 minutes" -- "Flag this issue for follow-up tomorrow" -- "Alert when deployment completes" - ---- - -## ProspectiveScheduler - -```java -public final class ProspectiveScheduler { - - /** - * Schedules a prospective reminder. - * - * @param text reminder text - * @param triggerAt when to surface the reminder - * @param tags synaptic tags for contextual association - * @return the scheduled Reminder - */ - public Reminder schedule(String text, Instant triggerAt, String... tags) { - long synapticTags = SynapticTagEncoder.encode(tags); - String id = "prospective-" + UUID.randomUUID(); - Reminder reminder = new Reminder(id, text, triggerAt, synapticTags, tags); - reminders.add(reminder); - return reminder; - } - - /** - * Collects all reminders whose trigger time has passed. - * Called at Step 2 of the RecallPipeline. 
- */ - public List<Reminder> collectDue() { - Instant now = Instant.now(); - List<Reminder> due = new ArrayList<>(); - reminders.removeIf(r -> { - if (r.triggerAt().isBefore(now)) { - due.add(r); - return true; - } - return false; - }); - return due; - } -} -``` - -## Reminder Record - -```java -public record Reminder( - String id, - String text, - Instant triggerAt, - long synapticTags, - String[] tags -) {} -``` - ---- - -## Integration with Recall - -Due reminders are injected at **Step 2** of the `RecallPipeline` with maximum score (10.0), ensuring they always appear at the top of results: - -```java -// In RecallPipeline.recall() -List<Reminder> dueReminders = prospectiveScheduler.collectDue(); -for (Reminder r : dueReminders) { - allResults.add(new CognitiveResult( - r.id(), r.text(), 10.0f, 10.0f, 0f, - (short) 0, (byte) 0, MemoryType.WORKING, MemorySource.PROCEDURAL, - new String[]{"prospective"}, 1.0f, 1.0f)); -} -``` - ---- - -## Next Steps - -- :material-mirror: [**Metamemory — Self-Reflection**](metamemory.md) — memory health analytics -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — the full recall flow diff --git a/docs/docs/memory/scoring-pipeline.md b/docs/docs/memory/scoring-pipeline.md deleted file mode 100644 index b2c1512..0000000 --- a/docs/docs/memory/scoring-pipeline.md +++ /dev/null @@ -1,314 +0,0 @@ ---- -title: The 6-Phase Scoring Pipeline -description: "A deep dive into CognitiveScorer — the SIMD hot-loop that fuses six filtering and scoring phases into a single off-heap scan." ---- - -# The 6-Phase Scoring Pipeline - -The `CognitiveScorer` is the performance-critical inner loop of Spector Memory. It scans off-heap `MemorySegment` data using **six sequential phases**, each eliminating candidates before the expensive SIMD vector math. This design is inspired by the brain's **sensory gating** — the auditory cortex filters out background noise before the prefrontal cortex evaluates it. - ---- - -## Why Fused Scoring? 
- -### The Truncation Trap - -In a standard vector database, you: - -1. Retrieve the top-K nearest vectors by L2 distance -2. **Then** apply business logic (importance, time, tags) in Java - -This **fails catastrophically** for AI memory: - -!!! danger "The Problem" - If an AI agent asks *"What is the user's core preference?"*, the most important memory might be 6 months old and slightly less semantically similar than a useless conversation from 5 minutes ago. If you pull the top-100 nearest vectors and *then* sort by importance, the vital 6-month-old memory was already **dropped at step 1**. - -### The Fix: Fuse Everything - -Spector fuses temporal decay and importance directly into the scoring loop: - -$$\text{Similarity} = \frac{1}{1 + \text{L2\_Distance}(q, x)}$$ - -$$\text{FinalScore} = \alpha \cdot \text{Similarity} + \beta \cdot \text{Importance} \cdot \text{Decay}(\text{AdjustedAge})$$ - -Where $\alpha$ (default: 0.6) and $\beta$ (default: 0.4) are user-configurable scoring weights. 
- ---- - -## The Six Phases - -```java -for (int i = 0; i < recordCount; i++) { - long offset = baseOffset + (long) i * stride; - - // ── Phase 1: Tombstone Check (~1 cycle) ── - byte flags = segment.get(LAYOUT_FLAGS, offset + OFFSET_FLAGS); - if (isTombstoned(flags)) continue; - - // ── Phase 2: Synaptic Tag Gating (~1 cycle) ── - if (queryTagMask != 0) { - long recordTags = segment.get(LAYOUT_SYNAPTIC_TAGS, offset + OFFSET_SYNAPTIC_TAGS); - if ((recordTags & queryTagMask) != queryTagMask) continue; - } - - // ── Phase 3: Valence Filter (~2 cycles) ── - byte valence = segment.get(LAYOUT_VALENCE, offset + OFFSET_VALENCE); - if (valence < minValence || valence > maxValence) continue; - - // ── Phase 4: Temporal/Importance Pre-screen (~5 cycles) ── - float importance = segment.get(LAYOUT_IMPORTANCE, offset + OFFSET_IMPORTANCE); - if (importance < minImportance) continue; - long timestamp = segment.get(LAYOUT_TIMESTAMP, offset + OFFSET_TIMESTAMP); - short recallCount = segment.get(LAYOUT_RECALL_COUNT, offset + OFFSET_RECALL_COUNT); - int adjustedBucket = DecayStrategy.adjustForReconsolidation(rawBucket, recallCount); - if (adjustedBucket >= MAX_BUCKET && importance < 1.0f && !isPinned(flags)) continue; - - // ── Phase 5: SIMD L2 Distance (~200 cycles) ── - float l2dist = SimilarityFunction.EUCLIDEAN.computeQuantizedFromSegment( - queryVector, segment, layout.vectorOffset(offset), - effectiveMins, effectiveScales, quantizedVecBytes); - float similarity = 1.0f / (1.0f + l2dist); - - // ── Phase 6: Fused Cognitive Score (~7 cycles) ── - float decay = DecayStrategy.decay(adjustedBucket); - float finalScore = alpha * similarity + beta * importance * decay; - - heap.insertWithOverflow(offset, finalScore); -} -``` - ---- - -## Phase-by-Phase Deep Dive - -### Phase 1: Tombstone Check - -**Cost**: ~1 CPU cycle (single byte read + bit test) - -```java -byte flags = segment.get(LAYOUT_FLAGS, offset + OFFSET_FLAGS); -if ((flags & 0x01) != 0) continue; // Bit 0 = tombstone -``` - 
-Tombstoned memories are skipped without reading any other fields. When the tombstone ratio in an episodic partition exceeds 30%, the `TombstoneCompactor` triggers a partition rebuild. - ---- - -### Phase 2: Synaptic Tag Gating - -**Cost**: ~1 CPU cycle (single `long` read + bitwise AND) - -```java -long recordTags = segment.get(LAYOUT_SYNAPTIC_TAGS, offset + OFFSET_SYNAPTIC_TAGS); -if ((recordTags & queryTagMask) != queryTagMask) continue; -``` - -!!! info "Bloom Filter Containment" - The skip condition `(record & query) != query` is the negation of a **containment check** (`(record & query) == query`), not an overlap check. A record passes only when **all** query tag bits are present in its Bloom filter. This is the correct Bloom filter match — it can produce false positives but never false negatives. - -**Selectivity**: If an agent has 1,000,000 memories and only 10,000 match the query tags, this phase eliminates **990,000 records** in ~990µs — saving 990,000 × 200 cycles of SIMD math. - -The synaptic tag Bloom filter uses MurmurHash3-inspired double hashing with k=3 hash functions in a 64-bit field. False positive rates: - -| Tags per Record | FPR | Assessment | -|---|---|---| -| 5 | 0.03% | Excellent | -| 10 | 0.2% | Excellent | -| 20 | 2.3% | Good | -| 50 | 12% | Acceptable — vector distance rejects false matches | - ---- - -### Phase 3: Valence Filter - -**Cost**: ~2 CPU cycles (byte read + 2 comparisons) - -```java -byte valence = segment.get(LAYOUT_VALENCE, offset + OFFSET_VALENCE); -if (valence < minValence || valence > maxValence) continue; -``` - -Valence represents **emotional coloring** on a scale of -128 to +127: - -- **Negative**: Error memories, failures, warnings -- **Zero**: Neutral factual memories -- **Positive**: Successes, preferred outcomes - -!!! example "Use Case" - An agent debugging an error can filter to `maxValence = -10` to recall only negative-outcome memories — "What went wrong last time?" 
- ---- - -### Phase 4: Importance/Decay Pre-screen - -**Cost**: ~5 CPU cycles (float read + timestamp read + bucket computation) - -```java -float importance = segment.get(LAYOUT_IMPORTANCE, offset + OFFSET_IMPORTANCE); -if (importance < minImportance) continue; - -int rawBucket = DecayStrategy.ageToBucket(timestamp, nowMs); -int adjustedBucket = DecayStrategy.adjustForReconsolidation(rawBucket, recallCount); - -if (adjustedBucket >= MAX_BUCKET && importance < 1.0f && !isPinned(flags)) continue; -``` - -**Reconsolidation**: Every 3 recalls shifts the decay bucket back by 1, simulating how frequently-recalled memories become more durable (Long-Term Potentiation). A memory recalled 12 times is 4 buckets "younger" than its actual age. - -**Decay Buckets** (precomputed — no `Math.exp()` required): - -| Bucket | Age Range | Decay Multiplier | -|---|---|---| -| 0 | 0–1 hours | 1.00 | -| 1 | 1–6 hours | 0.95 | -| 2 | 6–24 hours | 0.85 | -| 3 | 1–3 days | 0.70 | -| 4 | 3–7 days | 0.50 | -| 5 | 1–2 weeks | 0.30 | -| 6 | 2–4 weeks | 0.15 | -| 7 | 1–3 months | 0.05 | -| 8+ | 3+ months | 0.01 | - -!!! warning "The `exp()` Bottleneck" - Naive exponential decay `Math.exp(-λ·age)` costs 50-100ns per call and cannot be SIMD-vectorized. Spector uses precomputed decay buckets — a single array lookup per record (~1ns). At 1M memories, this saves **50-100ms** of scalar overhead. - ---- - -### Phase 5: SIMD L2 Distance - -**Cost**: ~200 CPU cycles (the dominant cost) - -```java -float l2dist = SimilarityFunction.EUCLIDEAN.computeQuantizedFromSegment( - queryVector, segment, layout.vectorOffset(offset), - effectiveMins, effectiveScales, quantizedVecBytes); -float similarity = 1.0f / (1.0f + l2dist); -``` - -This is the expensive operation that phases 1-4 are designed to gate. It: - -1. Reads INT8 quantized vector bytes directly from the off-heap `MemorySegment` -2. Dequantizes via calibration: `float_val = byte_val * scale + min` -3. 
Computes Euclidean distance using the Java Vector API (AVX2/AVX-512) -4. Converts distance to similarity: `1 / (1 + L2)` - -**Throughput**: ~2.2µs per 768-dim vector (1.4M vectors/sec on AVX2). - ---- - -### Phase 6: Fused Cognitive Score - -**Cost**: ~7 CPU cycles (2 multiplies + 1 add + heap insert) - -```java -float decay = DecayStrategy.decay(adjustedBucket); -float finalScore = alpha * similarity + beta * importance * decay; -heap.insertWithOverflow(offset, finalScore); -``` - -The final score fuses three signals: - -- **Semantic similarity** (α-weighted): How relevant is this memory to the query? -- **Importance** (β-weighted): How important was this memory at ingestion? -- **Temporal decay** (β-weighted): How recent is this memory? - -Results are tracked in a **min-heap** of size K — only the top-K scored records survive. - ---- - -## The Math: Gating Efficiency - -```mermaid -graph TD - A["1,000,000 episodic memories"] --> B["Phase 1: Tombstone check
−50,000 → 950,000 remain
~1 cycle each"] - B --> C["Phase 2: Synaptic tag gating
−940,000 → 10,000 remain
~1 cycle each"] - C --> D["Phase 3: Valence filter
−2,000 → 8,000 remain
~2 cycles each"] - D --> E["Phase 4: Importance pre-screen
−3,000 → 5,000 remain
~5 cycles each"] - E --> F["Phase 5: SIMD L2 distance
5,000 × 200 cycles
expensive"] - F --> G["Phase 6: Fused score
5,000 × 7 cycles"] - G --> H["✅ ~0.13ms total"] - - style A fill:#e74c3c,color:white - style C fill:#f39c12,color:white - style H fill:#00b894,color:white -``` - -> **Without gating**: 1,000,000 × 200 cycles = ~200ms → **100× improvement** from early elimination. - ---- - -## Parallel Tier Scanning - -The `RecallPipeline` scans all tiers in parallel using `ConcurrentTasks.forkJoinAll()`: - -```mermaid -gantt - title Parallel Recall Scan (Virtual Threads) - dateFormat X - axisFormat %L ms - section Working - Scan 100 records :a1, 0, 1 - section Episodic P1 - Scan 5000 records :a2, 0, 3 - section Episodic P2 - Scan 3000 records :a3, 0, 2 - section Semantic - Header scan 200 :a4, 0, 1 - section Procedural - Scan 50 records :a5, 0, 1 - section Merge - Sort + top-K :a6, 3, 4 -``` - -Each partition scan runs on a **dedicated Virtual Thread** — disjoint memory segments guarantee zero contention. The merge phase sorts all tier results and returns the global top-K. - ---- - -## Graph Augmentation (Post-Scorer) - -After the 6-phase scorer produces a **seed set** (top-K by fused cognitive score), three graph layers expand the result set by discovering memories that the scorer alone couldn't find: - -```mermaid -graph LR - S["Seed Set
(6-Phase Scorer Top-K)"] --> H["Step 5c: Hebbian
Spreading Activation
(depth=2, 0.3× attenuation)"] - H --> T["Step 5d: Temporal
Chain Extension
(maxHops=3, 0.8×/0.7×)"] - T --> E["Step 5e: Entity
Graph Traversal
(2-hop BFS, 0.25×/hop)"] - E --> M["Merge & Dedup
→ Re-sort
→ Final Top-K"] - - style S fill:#4a90d9,color:white - style H fill:#e74c3c,color:white - style T fill:#f39c12,color:white - style E fill:#9b59b6,color:white - style M fill:#00b894,color:white -``` - -### Step 5c: Hebbian Spreading Activation - -For each seed result, `HebbianGraph.activateNeighbors(memoryIdx, depth=2)` traverses the off-heap adjacency list (164B/node, MAX_DEGREE=20). Activated neighbor memories are added to the result set with their score attenuated by **0.3×**. - -**Example:** Seed memory "database error" has a strong Hebbian edge (weight: 0.83) to "connection pool settings" → "connection pool settings" is added even though it wasn't in the vector similarity top-K. - -### Step 5d: Temporal Chain Extension - -For each seed result, `TemporalChain.followForward(idx, 3)` and `followBackward(idx, 3)` follow session-local linked list pointers. Forward-linked memories get **0.8×** score, backward-linked get **0.7×**. - -**Example:** Seed memory "deploy failed" → follow forward → "rollback initiated" → "post-mortem notes" — both added to results. - -### Step 5e: Entity Graph Traversal - -Entities are extracted from the query text, then looked up in the `EntityGraph`. For each matched entity, a 2-hop BFS with typed edge filtering discovers related entities. Their linked memories are added with **0.25× attenuation per hop**. - -**Example:** Query mentions "Alice" → Entity "Alice" → MANAGES → "Project Alpha" → memories mentioning "Project Alpha" are added. - -!!! tip "Graceful Degradation" - Each graph step is **additive and independently optional**. If a graph component is null (not configured), empty, or throws a `RuntimeException`, the step is a no-op. The system degrades gracefully to vector-only recall. Zero risk of regression. 
- ---- - -## Next Steps - -- :material-share-variant: [**3-Layer Cognitive Graph**](hebbian.md) — deep dive into Hebbian, Entity, and Temporal graphs -- :material-brain: [**Cortex — Tier Stores**](cortex.md) — the 4-tier memory architecture -- :material-flash: [**Synapse — Tags & Scoring**](synapse.md) — Bloom filter and binary layout -- :material-speedometer: [**Performance**](performance.md) — benchmark results - diff --git a/docs/docs/memory/synapse.md b/docs/docs/memory/synapse.md deleted file mode 100644 index d7c29d1..0000000 --- a/docs/docs/memory/synapse.md +++ /dev/null @@ -1,479 +0,0 @@ ---- -title: "Synapse — Tags & Scoring" -description: "The versioned synaptic header (V1/V2/V3), 64-bit inline Bloom filter, arousal-modulated decay, and CognitiveRecordLayout binary format." ---- - -# 🔗 Synapse — Tags & Scoring - -> **Package**: `com.spectrayan.spector.memory.synapse` -> -> **Biological Analog**: In neuroscience, the **Synaptic Tagging and Capture (STC)** hypothesis (Frey & Morris, 1997) describes how synapses are "tagged" during learning with lightweight chemical markers. These tags don't contain the memory itself — they identify *what* the memory is about and *when* it was formed, enabling the brain to route consolidation activity efficiently. - ---- - -## Versioned Header Layouts - -Every cognitive memory record begins with a synaptic header — the digital equivalent of a synaptic tag. 
The header format is **versioned** via the `HeaderLayout` sealed interface, supporting three layout sizes: - -```mermaid -classDiagram - class HeaderLayout { - <<interface>> - +headerBytes() int - +version() int - +readHeader(segment, offset) CognitiveHeader - +writeHeader(segment, offset, header) - +forVersion(int) HeaderLayout$ - +defaultLayout() HeaderLayout$ - } - - class HeaderLayoutV1 { - +headerBytes() = 32 - +version() = 1 - } - class HeaderLayoutV2 { - +headerBytes() = 48 - +version() = 2 - } - class HeaderLayoutV3 { - +headerBytes() = 64 - +version() = 3 - } - - HeaderLayout <|.. HeaderLayoutV1 : permits - HeaderLayout <|.. HeaderLayoutV2 : permits - HeaderLayout <|.. HeaderLayoutV3 : permits -``` - -### V1 — Core Layout (32 bytes) - -The original layout, still supported for backward compatibility. Contains all fields required for the [6-Phase Scoring Pipeline](scoring-pipeline.md). - -``` - Offset Size Field Description - ────── ──── ───── ─────────── - 0 8B timestamp_ms Unix epoch ms when memory was formed - 8 8B synaptic_tags 64-bit Bloom filter of contextual markers - 16 4B exact_norm L2 norm of original float vector - 20 4B importance Cognitive importance (0.05 – 10.0) - 24 4B recall_count Times recalled (LTP reconsolidation counter) - 28 2B centroid_id IVF centroid assignment (max 65,535) - 30 1B valence Emotional coloring (signed: -128 to +127) - 31 1B flags Bit flags (see below) - ═══════════════════════════════════ - Total: 32 bytes (1× AVX2 register) -``` - -!!! info "Why 32 bytes?" - The V1 header is exactly one **AVX2 register width** (256 bits). The entire header can be loaded in a single SIMD instruction for bulk scanning operations. - -### V2 — Extended Layout (48 bytes) - -Adds **arousal** and **storage strength** for emotional modulation and the future [Two-Factor Memory Strength](../labs/roadmap.md#two-factor-memory-strength-bjork-bjork-1992) model. 
- -``` - Offset Size Field Description - ────── ──── ───── ─────────── - 0 32B [V1 core] All V1 fields (timestamp through flags) - ─────────────────────────────── V2 extension ─────────────────────── - 32 1B arousal Emotional intensity (unsigned: 0-255) - 33 3B [padding] Alignment padding - 36 4B storage_strength Durability factor S(t) for Two-Factor model - 40 8B [reserved] Future use (zeroed) - ═══════════════════════════════════ - Total: 48 bytes (1.5× AVX2 registers) -``` - -**New fields:** - -| Field | Type | Range | Purpose | -|:---|:---|:---|:---| -| `arousal` | unsigned byte | 0 (calm) – 255 (extreme) | Modulates decay curve — high-arousal memories resist forgetting | -| `storage_strength` | float | 0.0 – 5.0 | Two-Factor model durability (default: 1.0). Reserved for [Labs](../labs/roadmap.md) | - -### V3 — Full Cache-Line Layout (64 bytes) ⭐ Default - -The default for all new stores. Extends V2 with a 16-byte future buffer, aligned to a full **CPU cache line** (64 bytes) for optimal sequential scan performance. - -``` - Offset Size Field Description - ────── ──── ───── ─────────── - 0 32B [V1 core] All V1 fields (timestamp through flags) - ─────────────────────────────── V2 extension ─────────────────────── - 32 1B arousal Emotional intensity (unsigned: 0-255) - 33 3B [padding] Alignment padding - 36 4B storage_strength Durability factor S(t) - 40 8B [reserved_1] Future use (zeroed) - ─────────────────────────────── V3 extension ─────────────────────── - 48 16B [reserved_2] Future expansion buffer (zeroed) - ═══════════════════════════════════ - Total: 64 bytes (1× cache line, 2× AVX2) -``` - -!!! tip "Why V3 is the default" - **Cache-line alignment** eliminates split-line reads during sequential scans. When the scorer iterates over 1M records, each header read hits exactly one cache line — no partial line loads, no false sharing. The 16 bytes of reserved space cost ~1.5% total memory overhead but prevent future migration costs when new fields are added. 
- -### Version Comparison - -| Property | V1 (32B) | V2 (48B) | V3 (64B) | -|:---|:---:|:---:|:---:| -| Core fields | ✅ | ✅ | ✅ | -| Arousal | ❌ (default: 0) | ✅ | ✅ | -| Storage strength | ❌ (default: 1.0) | ✅ | ✅ | -| Future buffer | ❌ | ❌ | ✅ (16B) | -| Cache-line aligned | ❌ | ❌ | ✅ | -| Memory per 1M records | 32 MB | 48 MB | 64 MB | -| SIMD reads per header | 1 | 2 | 2 | - -### Backward Compatibility - -When a V3 reader encounters a V1 file, the missing fields return safe defaults: - -```java -// V1 → V3 transparent upgrade -CognitiveHeader header = layout.readHeader(segment, offset); -header.arousal(); // → 0 (neutral — no arousal effect) -header.storageStrength(); // → 1.0 (default durability) -``` - -No data migration is required for reads. The `CognitiveScorer` checks `headerBytes > 32` to determine whether arousal is available and skips the arousal read on V1 segments. - ---- - -## HeaderMigrator — One-Time Version Upgrades - -The `HeaderMigrator` performs atomic, one-time migration of store files between header versions. - -### Supported Paths - -``` - Upgrade (lossless): - V1 (32B) ──→ V2 (48B) ✅ New fields filled with defaults - V1 (32B) ──→ V3 (64B) ✅ New fields filled with defaults - V2 (48B) ──→ V3 (64B) ✅ Existing V2 fields preserved - - Downgrade (lossy): - V3 (64B) ──→ V2 (48B) ⚠️ Reserved buffer lost - V3 (64B) ──→ V1 (32B) ⚠️ Arousal + storage_strength lost - V2 (48B) ──→ V1 (32B) ⚠️ Arousal + storage_strength lost -``` - -### Atomic Migration Process - -```mermaid -flowchart LR - A["Original Store
store.dat"] --> B["Write to temp
store.dat.migrating"] - B --> C["Verify temp
record count match"] - C --> D["Backup original
store.dat.bak"] - D --> E["Atomic rename
temp → store.dat"] - - C -->|"Verify failed"| F["Delete temp
Abort migration"] - - style A fill:#3498db,color:white - style E fill:#27ae60,color:white - style F fill:#e74c3c,color:white -``` - -1. **Write** — Records are read from source, headers expanded/shrunk, written to `store.dat.migrating` -2. **Verify** — Record count in temp file must match source exactly -3. **Backup** — Original file renamed to `store.dat.bak` -4. **Rename** — Temp file atomically renamed to `store.dat` -5. **Cleanup** — On startup, orphaned `.migrating` files are detected and deleted - -### Usage - -```java -HeaderMigrator migrator = new HeaderMigrator(); - -// Upgrade V1 store to V3 -migrator.migrate( - Path.of("/data/episodic.dat"), - HeaderLayout.forVersion(1), // source layout - HeaderLayout.forVersion(3), // target layout - quantizedVecBytes // vector payload size -); -``` - ---- - -## Flags Bitfield - -The `flags` byte at offset 31 encodes per-record state: - -``` - Bit Name Description - ─── ──── ─────────── - 0 tombstone Record is logically deleted (pruned by Deep Sleep) - 1-2 memory_type 2-bit type: 0=WORKING, 1=EPISODIC, 2=SEMANTIC, 3=PROCEDURAL - 3 consolidated Has been reflected into Semantic tier - 4 pinned Exempt from decay and pruning (flashbulb memories) - 5 resolved Zeigarnik Effect — resolved tasks return to normal decay - 6-7 reserved Future use -``` - -### Zeigarnik Effect (Bit 5) - -Unresolved memories (bit 5 = 0) resist time-decay — their decay bucket is clamped to 0, keeping them perpetually "fresh." This models the psychological phenomenon where incomplete tasks remain more accessible than completed ones. - -```java -// In CognitiveScorer Phase 4: -if (!isResolved(flags) && !isPinned(flags)) { - adjustedBucket = 0; // acts like the memory was just formed -} - -// Agent marks task complete: -memory.markResolved("task-123"); // bit 5 → 1, normal decay resumes -``` - ---- - -## SynapticTagEncoder — The Inline Bloom Filter - -The `synaptic_tags` field is a **64-bit inline Bloom filter** rather than a discrete bitmap. 
This enables encoding thousands of unique tag strings across the system while each individual record holds 5-50 tags with negligible false positive rates. - -### How It Works - -```java -public static long encode(String... tags) { - long filter = 0L; - for (String tag : tags) { - filter |= encodeTag(tag); - } - return filter; -} - -private static long encodeTag(String tag) { - long h = murmurHash64(tag); - long h1 = h; - long h2 = h >>> 32 | h << 32; // Swap halves for second hash - - long filter = 0L; - for (int i = 0; i < K; i++) { // K = 3 hash functions - int bitIndex = Math.abs((int) ((h1 + (long) i * h2) % M)); // M = 64 - filter |= (1L << bitIndex); - } - return filter; -} -``` - -**Key properties**: - -| Property | Value | -|:---|:---| -| Filter size | 64 bits (fits in a single CPU register) | -| Hash functions | k = 3 (MurmurHash3-inspired double hashing) | -| Bits per tag | 3 | -| Match operation | `(record & query) == query` (containment check) | -| Cost | **1 CPU cycle** (single `long` read + bitwise AND) | - -### False Positive Rates - -| Tags per Record | FPR | Assessment | -|:---|:---|:---| -| 5 tags | 0.03% | Excellent — 1 false match per 3,000 records | -| 10 tags | 0.2% | Excellent — 1 false match per 500 records | -| 20 tags | 2.3% | Good — vector distance rejects false matches | -| 50 tags | 12% | Acceptable — still useful for coarse gating | - -!!! tip "System vs. Record Tags" - The system can have **thousands** of unique tag strings. But any single record should have at most **10-50 tags** for the Bloom filter to remain effective. This is a natural fit — a single memory rarely has more than 5-15 contextual associations. 
- -### Tag Overlap Scoring - -Beyond binary gating, the `SynapticTagEncoder` also computes a **fractional overlap ratio** for weighted tag relevance in Phase 6: - -```java -public static float overlapRatio(long recordTags, long queryMask) { - if (queryMask == 0) return 0f; - int overlapBits = Long.bitCount(recordTags & queryMask); - int queryBits = Long.bitCount(queryMask); - return (float) overlapBits / queryBits; -} -``` - -This ratio is used as a multiplier in the scoring formula: `finalScore = baseScore × (1 + tagOverlap × tagRelevanceBoost)`. A record matching 3 of 5 query tags gets a 60% tag boost vs 100% for a full match. - ---- - -## CognitiveRecordLayout — Binary Format - -The `CognitiveRecordLayout` class manages reading/writing headers and quantized vectors to/from off-heap `MemorySegment`. It delegates header operations to the active `HeaderLayout`: - -```java -public final class CognitiveRecordLayout { - private final HeaderLayout headerLayout; - private final int quantizedVecBytes; - - /** - * Record stride = header bytes + vector payload. - * V1: 32 + vecBytes, V2: 48 + vecBytes, V3: 64 + vecBytes. - */ - public int stride() { - return headerLayout.headerBytes() + quantizedVecBytes; - } - - /** - * Offset where the quantized vector begins within a record. 
- */ - public long vectorOffset(long recordOffset) { - return recordOffset + headerLayout.headerBytes(); - } - - public void writeHeader(MemorySegment segment, long offset, CognitiveHeader header) { - headerLayout.writeHeader(segment, offset, header); - } - - public CognitiveHeader readHeader(MemorySegment segment, long offset) { - return headerLayout.readHeader(segment, offset); - } -} -``` - -### CognitiveHeader Record - -The header data is represented as a Java `record` with all fields from all versions: - -```java -public record CognitiveHeader( - long timestampMs, // when the memory was formed - long synapticTags, // 64-bit Bloom filter - float exactNorm, // L2 norm of original vector - float importance, // cognitive importance (0.05 – 10.0) - int recallCount, // LTP reconsolidation counter - short centroidId, // IVF partition routing ID - byte valence, // emotional coloring (-128 to +127) - byte flags, // bit field (tombstone, type, consolidated, pinned, resolved) - byte arousal, // V2+: emotional intensity (unsigned 0-255) - float storageStrength // V2+: Two-Factor durability S(t) -) { - /** - * V1-compatible constructor — fills V2+ fields with safe defaults. - */ - public CognitiveHeader(long timestampMs, long synapticTags, float exactNorm, - float importance, int recallCount, short centroidId, - byte valence, byte flags) { - this(timestampMs, synapticTags, exactNorm, importance, - recallCount, centroidId, valence, flags, - (byte) 0, // arousal: neutral - 1.0f); // storageStrength: default durability - } -} -``` - ---- - -## DecayStrategy — SIMD-Friendly Temporal Decay - -!!! warning "The `exp()` Problem" - The naive decay formula `Math.exp(-λ·age)` costs 50-100ns per call and is a **scalar operation** — it cannot be SIMD-vectorized. At 1M memories, this adds 50-100ms of pure overhead, destroying the SIMD advantage. 
- -### The Solution: Precomputed Decay Buckets - -`DecayStrategy` quantizes time into discrete buckets and uses a precomputed lookup table: - -```java -// Precomputed — zero Math.exp() calls at query time -private static final float[] DECAY_TABLE = { - 1.00f, // Bucket 0: 0-1 hours - 0.95f, // Bucket 1: 1-6 hours - 0.85f, // Bucket 2: 6-24 hours - 0.70f, // Bucket 3: 1-3 days - 0.50f, // Bucket 4: 3-7 days - 0.30f, // Bucket 5: 1-2 weeks - 0.15f, // Bucket 6: 2-4 weeks - 0.05f, // Bucket 7: 1-3 months - 0.01f // Bucket 8+: 3+ months -}; - -public static float decay(int bucket) { - return DECAY_TABLE[Math.min(bucket, DECAY_TABLE.length - 1)]; -} -``` - -### Reconsolidation Adjustment - -Every 3 recalls shifts the bucket back by 1, simulating Long-Term Potentiation: - -```java -public static int adjustForReconsolidation(int rawBucket, int recallCount) { - return Math.max(0, rawBucket - (recallCount / 3)); -} -``` - -A memory recalled 12 times is 4 buckets "younger" than its actual age — it resists forgetting. - -### Arousal-Modulated Decay - -Emotionally intense memories resist forgetting. The `arousal` byte (V2+ headers) modulates the decay curve through a 4-bucket lookup table: - -```java -private static final int[] AROUSAL_THRESHOLDS = {64, 128, 192}; -private static final float[] AROUSAL_MODIFIERS = {1.0f, 1.15f, 1.35f, 1.65f}; -``` - -| Arousal Range | Bucket | Modifier | Biological Basis | -|:---|:---:|:---:|:---| -| 0-63 (neutral) | 0 | 1.00× | Normal forgetting — routine memories | -| 64-127 (mild) | 1 | 1.15× | Slightly persistent — mildly emotional | -| 128-191 (moderate) | 2 | 1.35× | Noticeably persistent — significant events | -| 192-255 (extreme) | 3 | 1.65× | Very hard to forget — flashbulb memories | - -The modifier **multiplies the base decay factor**, slowing the decay rate. A production outage at arousal=200 decays 1.65× slower than a routine log entry at arousal=0. - -```java -/** - * Computes decay with arousal modulation. 
- * Higher arousal → slower decay → memory persists longer. - */ -public static float computeDecayWithArousal(int bucket, byte arousal) { - float baseFactor = decay(bucket); - float modifier = arousalModifier(arousal); - return Math.min(1.0f, baseFactor * modifier); -} - -/** - * Returns the arousal modifier for a given arousal byte (unsigned 0-255). - */ -public static float arousalModifier(byte arousal) { - int unsigned = Byte.toUnsignedInt(arousal); - for (int i = AROUSAL_THRESHOLDS.length - 1; i >= 0; i--) { - if (unsigned >= AROUSAL_THRESHOLDS[i]) return AROUSAL_MODIFIERS[i + 1]; - } - return AROUSAL_MODIFIERS[0]; -} -``` - -**Automatic arousal derivation:** When arousal is not explicitly set by the LLM, it is auto-derived from valence at ingestion time: - -$$ -\text{arousal} = \min(255, |\text{valence}| \times 2) -$$ - -This means both extremely positive (valence=+100) and extremely negative (valence=-100) memories are equally arousing — matching the psychological finding that emotional intensity, not polarity, drives memory persistence. - -### Wiring in CognitiveScorer - -The scorer reads arousal from the header and applies the modifier to both standard and lateral scoring paths: - -```java -// In CognitiveScorer, after Phase 4 (temporal/importance pre-screen): - -// Read arousal — only available on V2+ layouts -byte arousal = hasArousal - ? 
segment.get(LAYOUT_AROUSAL, offset + OFFSET_AROUSAL) - : (byte) 0; // V1 fallback: no arousal effect - -// Phase 6: Standard scoring -float decay = DecayStrategy.decay(adjustedBucket) * DecayStrategy.arousalModifier(arousal); -decay = Math.min(1.0f, decay); -float baseScore = alpha * similarity + beta * importance * decay; -``` - ---- - -## Next Steps - -- :material-head-cog: [**Dopamine — Surprise Detection**](dopamine.md) — auto-importance scoring -- :material-brain: [**Cortex — Tier Stores**](cortex.md) — the 4-tier architecture -- :material-lightning-bolt: [**6-Phase Scoring Pipeline**](scoring-pipeline.md) — how scoring uses the header -- :material-flask: [**Labs — Research Roadmap**](../labs/roadmap.md) — Two-Factor Memory, Dynamic Quantization diff --git a/docs/docs/memory/sync.md b/docs/docs/memory/sync.md deleted file mode 100644 index 54faee9..0000000 --- a/docs/docs/memory/sync.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -title: "Sync — Persistence & Replication" -description: "Write-Ahead Log for durability and CRDT merge strategy for distributed memory synchronization." ---- - -# 🔄 Sync — Persistence & Replication - -> **Package**: `com.spectrayan.spector.memory.sync` -> -> **Biological Analog**: Memory consolidation doesn't happen in isolation. During sleep, the brain replays memories and transfers them between regions (hippocampus → neocortex). The sync package provides the infrastructure for **durable persistence** and **distributed memory merge**. - ---- - -## MemoryWal — Write-Ahead Log - -The `MemoryWal` provides crash-safe durability for cognitive memory operations: - -```java -public final class MemoryWal implements AutoCloseable { - - /** - * Appends a REMEMBER event to the WAL. - */ - public void appendRemember(String id, MemoryType type, byte[] quantizedVec, - CognitiveHeader header, String text, - MemorySource source, String[] tags) { ... } - - /** - * Appends a FORGET event to the WAL. - */ - public void appendForget(String id) { ... 
} - - /** - * Replays all WAL events to rebuild memory state after restart. - */ - public void replay(WalEventHandler handler) { ... } - - /** - * Returns the number of events in the WAL. - */ - public long eventCount() { ... } - - /** - * Returns the high-water mark (latest event offset). - */ - public long highWaterMark() { ... } -} -``` - -**Two modes**: - -| Mode | Storage | Use Case | -|---|---|---| -| **File-backed** | Append-only log file | Production — survives JVM restarts | -| **In-memory** | `ArrayList` | Testing — fast, no disk I/O | - ---- - -## CrdtMergeStrategy — Distributed Merge - -For multi-agent or distributed deployments, the `CrdtMergeStrategy` resolves conflicts between divergent memory replicas using **Conflict-free Replicated Data Types (CRDTs)**: - -```java -public final class CrdtMergeStrategy { - - /** - * Merges two versions of the same memory record. - * - * CRDT merge rules: - * - timestamp: max(local, remote) — Last-Write-Wins - * - synapticTags: local | remote — OR-merge (union) - * - importance: max(local, remote) — Highest signal wins - * - recallCount: max(local, remote) — Monotonic counter - * - flags: local | remote — OR-merge (tombstone propagates) - */ - public CognitiveHeader merge(CognitiveHeader local, CognitiveHeader remote) { ... } - - /** - * Determines if a remote update should be applied. - */ - public boolean shouldApply(CognitiveHeader local, CognitiveHeader remote) { ... } -} -``` - -**Key insight**: Synaptic tags use **bitwise OR** for merge — this is a natural CRDT (G-Set). Tags can only be added, never removed, which guarantees convergence without coordination. 
- ---- - -## Next Steps - -- :material-memory: [**Off-Heap Panama Design**](panama-design.md) — how persistence interacts with mmap -- :material-brain: [**Architecture**](architecture.md) — system overview diff --git a/docs/docs/memory/wal-design.md b/docs/docs/memory/wal-design.md deleted file mode 100644 index 0d78c15..0000000 --- a/docs/docs/memory/wal-design.md +++ /dev/null @@ -1,561 +0,0 @@ ---- -title: "WAL Design — Write-Ahead Log" -description: "Append-only binary WAL with chunked files, CRC-32 integrity, DEFLATE compression, crash recovery, CRDT merge, and cloud replication for cognitive memory durability." ---- - -# 📝 WAL Design — Write-Ahead Log - -> **Package**: `com.spectrayan.spector.memory.sync` -> -> **Biological Analog**: The hippocampus doesn't write memories directly to the neocortex. It first records a transient "replay buffer" — a sequential log of experiences — and consolidates them during sleep. The WAL is the digital equivalent: an ordered, append-only log of every memory mutation that can be replayed to reconstruct state. - ---- - -## Why a WAL? - -Cognitive memory stores mutable state (importance, valence, recall count, tags) in off-heap `MemorySegment` buffers. Without durability, a JVM crash loses everything. 
The WAL provides: - -| Concern | WAL Guarantee | -|---|---| -| **Crash recovery** | Replay the log → full state reconstruction | -| **Ordering** | Monotonic sequence numbers → total order | -| **Distributed sync** | Ship events after a high-water mark → pull-based replication | -| **Auditability** | Every mutation is recorded (who, what, when) | -| **Compaction** | Truncate chunks below a snapshot HWM | - ---- - -## Architecture Overview - -```mermaid -graph TD - subgraph "Write Path" - A["SpectorMemory.remember()"] --> B["MemoryWal.append()"] - B --> C["writeLock.lock()"] - C --> D["events.add(event)"] - C --> E["writeEventToChannel()"] - E --> F["CRC-32 header + payload"] - F --> G["FileChannel.write()"] - G --> H{"chunk ≥ 8MB?"} - H -->|yes| I["rollChunk()"] - H -->|no| J["fsync (optional)"] - end - - subgraph "Read Path (Recovery)" - K["JVM restart"] --> L["recoverFromDisk()"] - L --> M["findChunkFiles()"] - M --> N["readChunkFile() × N"] - N --> O["Validate magic + CRC"] - O --> P["Rebuild in-memory cache"] - P --> Q["Restore sequenceCounter"] - end - - subgraph "Replication" - R["CloudSync.exportEvents()"] --> S["replay(afterHwm)"] - S --> T["Ship to remote agent"] - T --> U["importEvents() + CRDT merge"] - end - - style A fill:#6c5ce7,color:white - style B fill:#00b894,color:white - style K fill:#e17055,color:white - style R fill:#0984e3,color:white -``` - ---- - -## Dual Mode Operation - -`MemoryWal` operates in two modes, selected at construction time: - -| Mode | Constructor | Storage | Durability | Use Case | -|---|---|---|---|---| -| **File-backed** | `new MemoryWal(walDir)` | Append-only chunk files | ✅ Survives crashes | Production | -| **In-memory** | `new MemoryWal()` | `ArrayList` | ❌ Volatile | Testing, ephemeral agents | - -```java -// Production: durable WAL with 8MB chunk rolling -MemoryWal wal = new MemoryWal(Path.of(".spector/memory/wal")); - -// Production: custom chunk size + compression + per-write fsync -MemoryWal wal = new 
MemoryWal(walDir, 16 * 1024 * 1024, true, 512, true); - -// Testing: in-memory, no disk I/O -MemoryWal wal = new MemoryWal(); -``` - ---- - -## Event Types - -Every memory mutation produces a `WalEvent` record: - -```java -public record WalEvent( - long sequence, // monotonically increasing - EventType type, // REMEMBER, FORGET, REINFORCE, REFLECT, TAG_MERGE, RECALL_HIT - String memoryId, // the affected memory ID - Instant timestamp, // when the event occurred - byte[] payload // serialized event data (format varies by type) -) { } -``` - -| Event Type | Trigger | Payload | -|---|---|---| -| `REMEMBER` | `memory.remember(text)` | Full cognitive record (header + quantized vector + text) | -| `FORGET` | `memory.forget(id)` | Empty (tombstone marker) | -| `REINFORCE` | `memory.reinforce(id, valence)` | 1 byte: valence value | -| `REFLECT` | Sleep consolidation cycle | Consolidation metadata | -| `TAG_MERGE` | Synaptic tag update | Updated tag bitfield | -| `RECALL_HIT` | `memory.recall(query)` | Recall count increment | - ---- - -## Binary Record Format (V2) - -### File Header - -Each WAL chunk file begins with an 8-byte header: - -``` -Offset Size Field Value -────── ──── ───── ───── - 0 4B magic 0x53504543 ("SPEC" in ASCII) - 4 4B version 2 -``` - -### Record Layout - -Each event is serialized as a **40-byte fixed header** followed by variable-length segments, aligned to 8-byte boundaries: - -``` - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| recMagic (2B) | version (1B) | flags (1B) | ← Offset 0 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| typeOrd (1B) | idLen (2B) | reserved (1B) | ← Offset 4 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| | -+ sequence (8B) + ← Offset 8 -| | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| | -+ timestamp — epoch millis (8B) + ← Offset 16 -| | 
-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| payloadLen (4B) | ← Offset 24 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| payloadCRC (4B) | ← Offset 28 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| reserved (4B) | ← Offset 32 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| headerCRC (4B) | ← Offset 36 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| memoryId (idLen bytes, UTF-8) | ← Offset 40 -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| payload (payloadLen bytes, optionally compressed) | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -| padding (0–7 bytes to 8-byte align) | -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -``` - -### Field Reference - -| Offset | Size | Field | Description | -|--------|------|-------|-------------| -| 0 | 2B | `recMagic` | `0x5741` ("WA") — record start sentinel | -| 2 | 1B | `version` | Record format version (matches file version) | -| 3 | 1B | `flags` | Bit 0: compressed payload | -| 4 | 1B | `typeOrd` | `WalEvent.EventType` ordinal | -| 5 | 2B | `idLen` | Memory ID length in bytes (unsigned) | -| 7 | 1B | reserved | Future use | -| 8 | 8B | `sequence` | Monotonic sequence number | -| 16 | 8B | `timestamp` | Epoch milliseconds | -| 24 | 4B | `payloadLen` | Payload length in bytes | -| 28 | 4B | `payloadCRC` | CRC-32 of (possibly compressed) payload | -| 32 | 4B | reserved | Future use | -| 36 | 4B | `hdrCRC` | CRC-32 of bytes [0..35] | -| 40 | N | `memoryId` | UTF-8 encoded memory ID | -| 40+N | M | `payload` | Event-specific data | -| 40+N+M | P | padding | `(8 - ((N+M) % 8)) % 8` zero bytes | - -**Total record size**: `40 + idLen + payloadLen + padding` - -### Integrity: Dual CRC-32 - -Every record has **two** independent CRC-32 checksums: - -```mermaid -graph LR - H["Header bytes 0-35"] -->|CRC-32| HC["Header CRC (offset 36)"] - P["Payload bytes"] 
-->|CRC-32| PC["Payload CRC (offset 28)"] - - HC -->|verified on read| V1["✅ Header intact"] - PC -->|verified on read| V2["✅ Payload intact"] - - style HC fill:#00b894,color:white - style PC fill:#00b894,color:white -``` - -This split design detects: - -- **Torn headers**: header CRC fails → truncate at record start -- **Corrupt payloads**: payload CRC fails → quarantine chunk file -- **Partial writes**: record magic missing → truncate at boundary - ---- - -## Chunked File Layout - -WAL data is spread across multiple **chunk files** in a directory: - -``` -.spector/memory/wal/ -├── wal-000000.bin ← oldest chunk (may be truncated after snapshot) -├── wal-000001.bin -├── wal-000002.bin -├── wal-000003.bin ← active chunk (currently being written) -└── .quarantine/ ← corrupted chunks moved here - └── wal-000001.bin -``` - -### Chunk Rolling - -When the active chunk exceeds `maxChunkBytes` (default **8 MB**), the WAL: - -1. Calls `force(true)` on the active `FileChannel` (metadata + data flush) -2. Closes the channel -3. Increments `chunkIndex` -4. Opens a new chunk file with a fresh file header - -```java -// Configurable chunk size -new MemoryWal(walDir, 16 * 1024 * 1024); // 16 MB chunks -``` - -### Compaction & Garbage Collection - -As memories decay or undergo sleep-consolidation, older WAL chunks become redundant. The WAL enforces **snapshot-driven truncation** — chunks are only deleted after a snapshot proves their events have been fully materialized to disk. 
- -```mermaid -flowchart TD - A["Active Writing Chunk"] -->|"size ≥ maxChunkBytes (8MB)"| B["rollChunk()"] - B --> C["Immutable Closed Chunk"] - D["Background Consolidation Daemon"] -->|"runs memory consolidation"| E["Generate Disk Snapshot"] - E -->|"write metadata"| F["Persist Snapshot High-Water Mark"] - F -->|"trigger compaction"| G{"truncateBefore(snapshotHwm)"} - G -->|"chunk maxSeq ≤ snapshotHwm"| H["Safe to Delete"] - G -->|"chunk maxSeq > snapshotHwm"| I["Must Retain"] - G -->|"chunk == activeChunkPath"| J["Never Touched"] - H --> K["Files.delete(chunk)"] - H --> L["events.removeIf(seq ≤ hwm)"] - - style A fill:#6c5ce7,color:white - style H fill:#00b894,color:white - style J fill:#e17055,color:white -``` - -**How it works:** - -1. **Snapshot trigger**: The consolidation daemon (hippocampus) periodically snapshots the full in-memory state to disk (mmap partition files) -2. **HWM declaration**: The snapshot records the highest WAL sequence number that has been fully materialized -3. **Chunk disposal**: `truncateBefore(snapshotHwm)` sweeps all closed chunks — any chunk where the maximum sequence ≤ HWM is safely deleted -4. **Active chunk protection**: The currently active chunk is **never** deleted, even if all its events are below the HWM -5. **In-memory cache pruning**: Events with sequence ≤ HWM are also removed from the `ArrayList` cache to prevent memory bloating - -```java -// After a successful snapshot at sequence 5042: -wal.truncateBefore(5042); -// → deletes wal-000000.bin (maxSeq=3200), wal-000001.bin (maxSeq=4980) -// → retains wal-000002.bin (maxSeq=5100, has events after HWM) -// → retains wal-000003.bin (active chunk, never touched) -``` - -!!! tip "Zero Page-Cache Poisoning" - Chunk deletion uses `Files.delete()` at the file level — the compaction scanner does **not** read old WAL data back into memory. This avoids evicting the host's page cache, which would degrade active mmap partition performance during concurrent queries. 
- ---- - -## Crash Recovery - -On startup, `MemoryWal` automatically recovers from disk: - -```mermaid -sequenceDiagram - participant JVM as ☕ JVM Restart - participant WAL as 📝 MemoryWal - participant FS as 💾 Filesystem - - JVM->>WAL: new MemoryWal(walDir) - WAL->>FS: findChunkFiles() — sorted by name - loop Each chunk file - WAL->>FS: Open FileChannel (READ+WRITE) - WAL->>WAL: Validate file header (magic + version) - loop Each record - WAL->>WAL: Read 40B header - WAL->>WAL: Verify record magic (0x5741) - WAL->>WAL: Verify header CRC-32 - WAL->>WAL: Read variable segments - WAL->>WAL: Verify payload CRC-32 - alt Torn write detected - WAL->>FS: truncate(startPos) — repair in place - WAL->>WAL: Stop reading this chunk - else Mid-log corruption - WAL->>FS: Move to .quarantine/ - WAL->>WAL: Throw WalCorruptionException - end - end - end - WAL->>WAL: Restore sequenceCounter to max(seq) - WAL->>WAL: Open next chunk for writing -``` - -### Corruption Recovery Strategy - -Because distributed nodes can experience power cuts, OS crashes, or disk hardware decay, the recovery process must handle corruption gracefully and **never allow silent data loss**. - -#### Classification of Corruptions - -```mermaid -graph TD - A["WAL Boot Scan"] --> B{"Verify Record CRC?"} - B -->|"All Valid"| C["✅ Replay Completed"] - B -->|"CRC Mismatch / Truncated"| D{"Corruption at file tail?"} - D -->|"Yes — Torn Write"| E["Auto-Repair: truncate(startPos)"] - D -->|"No — Mid-Log Bit Rot"| F["Fatal: Quarantine Protocol"] - E --> G["Resume writing after last valid record"] - F --> H["Halt boot + move file to .quarantine/"] - H --> I["Throw WalCorruptionException"] - I --> J["Cold Bootstrap from healthy peer"] - - style C fill:#00b894,color:white - style E fill:#fdcb6e,color:black - style F fill:#d63031,color:white -``` - -#### A. 
Torn Writes (End-of-File Corruption) - -| Aspect | Detail | -|---|---| -| **Cause** | Crash occurred while writing a record, leaving an incomplete block at the active chunk's tail | -| **Diagnosis** | Record's expected boundary exceeds actual file size, or header/payload CRC fails with no subsequent valid records in the file | -| **Safety** | The write was never acknowledged to the caller — the event is uncommitted | -| **Resolution** | `handleTornWrite()` truncates the file to `startPos` (the last fully-written record boundary) and forces to disk. Writing resumes from the repaired position | - -```java -private void handleTornWrite(Path path, FileChannel fc, long startPos) throws IOException { - log.warn("Torn WAL record detected in {} at position {}. " - + "Truncating file to recovery boundary.", path, startPos); - fc.truncate(startPos); - fc.force(true); -} -``` - -#### B. Mid-Log Corruption (Bit Rot) - -| Aspect | Detail | -|---|---| -| **Cause** | Magnetic/SSD decay in historical, closed chunks — a valid record is followed by corrupted bytes, then more valid records | -| **Diagnosis** | CRC mismatch detected at a position that is NOT the file tail — valid records exist after the corruption point | -| **Safety** | Truncating would discard **committed** operations, causing silent partition state divergence | -| **Resolution** | **Never auto-repair.** The chunk is moved to `.quarantine/` to preserve forensic evidence, and a `WalCorruptionException` halts startup. In cluster mode, the node initiates a **Cold Bootstrap** from a healthy peer | - -```java -private void handleMiddleLogCorruption(Path path, FileChannel fc, - long startPos, String reason) throws IOException { - log.error("Fatal mid-log corruption in {} at position {}: {}. 
" - + "Triggering quarantine.", path, startPos, reason); - fc.close(); - - Path quarantineDir = path.getParent().resolve(".quarantine"); - Files.createDirectories(quarantineDir); - Path quarantinedPath = quarantineDir.resolve(path.getFileName()); - Files.move(path, quarantinedPath, StandardCopyOption.REPLACE_EXISTING); - - throw new WalCorruptionException( - "Fatal WAL corruption: " + reason + " at position " + startPos); -} -``` - -#### Summary Matrix - -| Scenario | Detection | Action | Data Loss? | -|---|---|---|---| -| **Torn write** (EOF) | Record too short or CRC fails at tail | `truncate(startPos)` — auto-repair | ❌ No — write was uncommitted | -| **Bit rot** (mid-log) | CRC fails with valid records after | Quarantine + `WalCorruptionException` | ❌ No — manual recovery required | -| **Invalid file magic** | File header ≠ `0x53504543` | Skip file, log warning | ❌ No — file is not a WAL | -| **Version mismatch** | File version ≠ `WAL_VERSION` | Skip file, log warning | ❌ No — incompatible format | - -!!! warning "Why Not Auto-Repair Bit Rot?" - Truncating in the middle of a historical chunk would discard committed operations that downstream consumers (replicas, snapshots) may depend on. The quarantine-and-halt approach ensures **zero silent data loss** — the operator or cluster protocol must explicitly resolve the corruption before the node can serve traffic. - ---- - -## Compression - -Payload compression is opt-in and uses **DEFLATE** (java.util.zip): - -```java -// Enable compression for payloads > 512 bytes -new MemoryWal(walDir, 8 * 1024 * 1024, true, 512, false); -``` - -| Setting | Default | Description | -|---|---|---| -| `compressionEnabled` | `false` | Master switch | -| `compressionThreshold` | `1024` bytes | Minimum payload size before compression kicks in | - -When compression is enabled: - -1. Payloads larger than the threshold are DEFLATE-compressed before writing -2. The `flags` byte (offset 3) has bit 0 set to `1` -3. 
On read, the flag is checked and the payload is decompressed with `Inflater` -4. CRC-32 is computed on the **compressed** bytes (what's on disk) - -!!! tip "When to Enable" - Compression is most useful for `REMEMBER` events, which carry full text + quantized vectors (hundreds to thousands of bytes). `FORGET` and `REINFORCE` events have tiny payloads and skip compression regardless of the threshold. - ---- - -## Distributed Sync — CloudSync - -`CloudSync` provides **pull-based replication** between agents using the WAL as the replication log: - -```mermaid -graph LR - subgraph "Agent A" - WA["MemoryWal A"] --> CSA["CloudSync A"] - end - - subgraph "Agent B" - WB["MemoryWal B"] --> CSB["CloudSync B"] - end - - CSA -->|"exportEvents(remoteHwm)"| EVENTS["WAL Events"] - EVENTS -->|"importEvents()"| CSB - - CSB -->|"CRDT merge"| WB - - style CSA fill:#0984e3,color:white - style CSB fill:#0984e3,color:white - style EVENTS fill:#fdcb6e,color:black -``` - -### Replication Protocol - -1. **Agent B** sends its `highWaterMark` to Agent A -2. **Agent A** calls `wal.replay(remoteHwm)` → returns only new events -3. Events are shipped to Agent B (in-process V2, HTTP/gRPC V3) -4. **Agent B** replays each event into its local memory store -5. Conflicts are resolved via **CRDT merge** (see below) - -### Cold Bootstrap - -When a new agent joins (or corruption triggers a full resync): - -```java -// Download snapshot from leader and restore local state -long leaderHwm = CloudSync.bootstrapFromLeader( - "http://leader:7070", - localPersistenceDir -); -``` - -The leader serves its entire off-heap state as a zip archive via `GET /api/v2/memory/snapshot`. The new agent unpacks it, restoring all mmap partition files and WAL chunks. 
- ---- - -## CRDT Merge Strategy - -When two agents modify the same memory concurrently, `CrdtMergeStrategy` resolves conflicts deterministically: - -| Field | CRDT Type | Merge Rule | Guarantee | -|---|---|---|---| -| `timestamp` | LWW Register | `max(local, remote)` | Most recent write wins | -| `synapticTags` | G-Set (OR) | `local \| remote` | Tags only accumulate, never removed | -| `importance` | Max Register | `max(local, remote)` | Highest signal preserved | -| `recallCount` | G-Counter | `max(local, remote)` | Monotonic counter | -| `valence` | LWW Register | Value from newer `timestamp` | Latest emotional signal wins | -| `tombstone` (flag) | OR | `local \| remote` | Once deleted, always deleted | -| `consolidated` (flag) | OR | `local \| remote` | Once consolidated, stays consolidated | -| `pinned` (flag) | OR | `local \| remote` | Once pinned, stays pinned | - -**Convergence guarantee**: All merge operations are commutative, associative, and idempotent — any order of merges from any agents produces the **same final state**. - -```java -CrdtMergeStrategy.MergedHeader result = CrdtMergeStrategy.merge(local, remote); - -// Check if merge would actually change local state -if (CrdtMergeStrategy.wouldChange(local, remote)) { - applyMerge(result); -} -``` - ---- - -## Thread Safety - -| Operation | Lock | Mechanism | -|---|---|---| -| `append()` | `writeLock` (ReentrantLock) | Serializes writes — safe with Virtual Threads | -| `replay()` | None | Reads from in-memory `ArrayList` snapshot | -| `truncateBefore()` | `writeLock` | Serializes with appends | -| `close()` | `writeLock` | Final `force(true)` + channel close | - -!!! tip "No `synchronized`" - `MemoryWal` uses `ReentrantLock` exclusively — never `synchronized` — to avoid Virtual Thread pinning. This is consistent with the zero-`synchronized` policy across the entire Spector codebase. 
- ---- - -## Configuration - -WAL behavior is controlled via `spector.yml`: - -```yaml -spector: - memory: - persistence-mode: DISK # DISK | IN_MEMORY - persistence-path: .spector/memory -``` - -| Parameter | Default | Description | -|---|---|---| -| `persistence-mode` | `DISK` | `DISK` = file-backed WAL, `IN_MEMORY` = volatile | -| `persistence-path` | `.spector/memory` | Root directory (WAL stored in `{path}/wal/`) | -| Chunk size | 8 MB | Hardcoded default, configurable via constructor | -| Compression | `false` | Configurable via constructor | -| fsync-per-write | `false` | Configurable via constructor | - ---- - -## Storage Adapter SPI - -For cloud-based WAL replication, the `StorageAdapter` SPI provides a pluggable backend: - -```java -public interface StorageAdapter extends AutoCloseable { - void upload(String namespace, String chunkName, ByteBuffer data); - ByteBuffer download(String namespace, String chunkName); - List<String> listChunks(String namespace); - List<String> listNamespaces(); - boolean isAvailable(); -} -``` - -Planned implementations: - -| Adapter | Backend | Status | -|---|---|---| -| `S3StorageAdapter` | AWS S3 | Planned (V3) | -| `GcsStorageAdapter` | Google Cloud Storage | Planned (V3) | -| `LocalStorageAdapter` | Local filesystem | Planned (V3) | - ---- - -## Next Steps - -- :material-memory: [**Off-Heap Panama Design**](panama-design.md) — how mmap partitions store cognitive records -- :material-sleep: [**Hippocampus — Sleep Consolidation**](hippocampus.md) — the consolidation daemon that triggers snapshot + truncation -- :material-brain: [**Architecture**](architecture.md) — system overview -- :material-lightning-bolt: [**Synapse — Tags & Scoring**](synapse.md) — the synaptic header that WAL events serialize diff --git a/docs/docs/modules/index.md b/docs/docs/modules/index.md deleted file mode 100644 index 165bcfd..0000000 --- a/docs/docs/modules/index.md +++ /dev/null @@ -1,213 +0,0 @@ -# Modules - -Spector is organized as a multi-module Maven
project. Each module has a focused responsibility, clear API boundaries, and minimal cross-module coupling. - ---- - -## Architecture - -```mermaid -graph LR - subgraph "🔬 Foundation" - core["spector-core
SIMD kernels"] - commons["spector-commons
Chunkers, tokenizer"] - config["spector-config
SpectorConfig + YAML"] - storage["spector-storage
Panama MemorySegment"] - end - - subgraph "🧠 Intelligence" - embedApi["spector-embed-api
Embedding SPI"] - embedOllama["spector-embed-ollama
Ollama provider"] - index["spector-index
HNSW + IVF-PQ + BM25"] - query["spector-query
Hybrid + RRF + rerank"] - gpu["spector-gpu
CUDA via Panama FFM"] - end - - subgraph "⚡ Engine" - rag["spector-rag
RAG pipeline"] - engine["spector-engine
Search facade"] - ingestion["spector-ingestion
File ingest pipeline"] - memory["spector-memory
Cognitive memory 🧠"] - end - - subgraph "🌐 Runtime & Interfaces" - runtime["spector-runtime
Composition root"] - node["spector-node
Armeria: REST + gRPC + SSE"] - mcp["spector-mcp
MCP Server (stdio)"] - cli["spector-cli
spectorctl"] - client["spector-client
Java SDK"] - spring["spector-spring
Spring AI"] - end - - subgraph "📦 Distribution" - metrics["spector-metrics
Prometheus + JVM"] - bench["spector-bench
JMH benchmarks"] - dist["spector-dist
Fat JAR"] - end -``` - ---- - -## Module Dependency Graph - -```mermaid -graph TD - node["🌐 node"] --> runtime["⚡ runtime"] - node --> mcp["🤖 mcp"] - node --> metrics["📈 metrics"] - mcp --> runtime - mcp --> ingestion["📥 ingestion"] - cli["🖥️ cli"] --> runtime - cli --> client["📦 client"] - - runtime --> engine["⚡ engine"] - runtime --> memory["🧠 memory"] - runtime --> ingestion - - engine --> query["🔍 query"] - engine --> rag["🤖 rag"] - engine --> ingestion - engine --> index["📊 index"] - engine --> storage["💾 storage"] - engine --> embedapi["🧬 embed-api"] - engine -.-> gpu["🎮 gpu"] - - memory --> index - memory --> storage - memory --> ingestion - memory --> embedapi - memory --> core["🔬 core"] - - metrics --> engine - metrics --> memory - - ingestion --> config["⚙️ config"] - ingestion --> embedapi - - rag --> query - rag --> index - rag --> storage - rag --> embedapi - - query --> index - index --> storage - index --> config - storage --> config - storage --> core - config --> core - - embedapi --> commons["📄 commons"] - gpu --> core - gpu --> storage - - dist["📦 dist"] --> mcp - dist --> cli - dist --> runtime - - spring["🌱 spring"] --> engine - spring --> memory - spring --> metrics - bench["🧪 bench"] --> engine - bench --> memory -``` - -> **Legend:** Solid arrows = compile dependency. Dotted arrow (`gpu`) = optional dependency. - -!!! important "Architecture" - `spector-ingestion` defines the `IngestionPipeline` and `IngestionTarget` interface. Both `spector-engine` and `spector-memory` depend on it to implement their `IngestionTarget`. `spector-memory` is fully independent of `spector-engine` — they are peers, wired together only at the `SpectorRuntime` composition root. - ---- - -## Architecture: Entry Points → Runtime → Subsystems - -All entry points (MCP, CLI, Server) route through `SpectorRuntime`: - -```mermaid -graph TD - cli["🖥️ spector-cli
SpectorCtl"] - mcp["🤖 spector-mcp
SpectorMcpMain"] - node["🌐 spector-node
SpectorNode (Armeria)"] - - cli --> runtime - mcp --> runtime - node --> runtime - - runtime["⚡ SpectorRuntime
Composition Root"] - - runtime --> sh["SearchHandler
mode-aware search"] - runtime --> ih["IngestionHandler
delegates to IngestionPipeline"] - - sh --> engine["SpectorEngine"] - sh --> memory["SpectorMemory"] - ih --> pipeline["IngestionPipeline
chunk → embed → store"] - pipeline --> engineTarget["EngineIngestionTarget
SEARCH mode"] - pipeline --> memTarget["CognitiveIngestionTarget
MEMORY mode"] -``` - -**SpectorRuntime** is a thin composition root — it creates and wires subsystems but contains no business logic. Each handler owns its domain: - -| Handler | Responsibility | Routes to | -|---------|---------------|-----------| -| `SearchHandler` | Mode-aware search | Engine (SEARCH mode) or Memory (MEMORY mode) | -| `IngestionHandler` | Delegates to unified `IngestionPipeline` | Pipeline → `EngineIngestionTarget` or `CognitiveIngestionTarget` | - ---- - -## Module Overview - -### Foundation Layer - -| Module | Description | -|:---|:---| -| [spector-commons](spector-commons.md) | Shared utilities — concurrent primitives, I/O helpers | -| [spector-core](spector-core.md) | Core abstractions — quantization, SIMD, similarity functions | -| [spector-config](spector-config.md) | Configuration — `SpectorProperties`, `SpectorConfigFactory`, YAML loading | -| [spector-storage](spector-storage.md) | Persistent storage — memory-mapped files, arena management | - -### Embedding Layer - -| Module | Description | -|:---|:---| -| [spector-embed-api](spector-embed-api.md) | Embedding provider SPI — model-agnostic interface | -| [spector-embed-ollama](spector-embed-ollama.md) | Ollama embedding implementation | - -### Search Layer - -| Module | Description | -|:---|:---| -| [spector-index](spector-index.md) | Vector indexing — HNSW, IVF, brute-force | -| [spector-query](spector-query.md) | Query processing — parsing, planning, execution | -| [spector-gpu](spector-gpu.md) | GPU acceleration — Panama FFM bindings | - -### Intelligence Layer - -| Module | Description | -|:---|:---| -| [spector-rag](spector-rag.md) | RAG pipeline — retrieval-augmented generation | -| [spector-engine](spector-engine.md) | Search engine — orchestrates index + RAG + storage | -| [spector-ingestion](spector-ingestion.md) | Unified ingestion pipeline — `IngestionPipeline` (builder), `IngestionTarget` interface, `FileDiscoveryService` | -| [spector-memory](spector-memory.md) | Cognitive 
memory — biologically-inspired agent memory | - -### Runtime Layer - -| Module | Description | -|:---|:---| -| [spector-runtime](spector-runtime.md) | Composition root — wires engine + memory + ingestion pipeline, exposes `SearchHandler` and `IngestionHandler` | -| [spector-mcp](spector-mcp.md) | MCP server — Model Context Protocol integration via stdio | -| [spector-node](spector-node.md) | Unified node — Armeria HTTP REST + gRPC + SSE events + cluster coordination | - -### Client Layer - -| Module | Description | -|:---|:---| -| [spector-cli](spector-cli.md) | CLI tool — `spectorctl` with remote (HTTP) and local batch (runtime) modes | -| [spector-client](spector-client.md) | Java client — programmatic HTTP API access | -| [spector-spring](spector-spring.md) | Spring AI integration — auto-configuration | - -### Infrastructure - -| Module | Description | -|:---|:---| -| [spector-metrics](spector-metrics.md) | Metrics — Prometheus + JVM instrumentation | -| [spector-bench](spector-bench.md) | Benchmarks — JMH performance testing | -| [spector-dist](spector-dist.md) | Distribution — single fat JAR packaging | diff --git a/docs/docs/modules/spector-bench.md b/docs/docs/modules/spector-bench.md deleted file mode 100644 index 570a55a..0000000 --- a/docs/docs/modules/spector-bench.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-bench/README.md" diff --git a/docs/docs/modules/spector-cli.md b/docs/docs/modules/spector-cli.md deleted file mode 100644 index adb4d1e..0000000 --- a/docs/docs/modules/spector-cli.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-cli/README.md" diff --git a/docs/docs/modules/spector-client.md b/docs/docs/modules/spector-client.md deleted file mode 100644 index fcea103..0000000 --- a/docs/docs/modules/spector-client.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-client/README.md" diff --git a/docs/docs/modules/spector-commons.md b/docs/docs/modules/spector-commons.md deleted file mode 100644 index baf8970..0000000 --- 
a/docs/docs/modules/spector-commons.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-commons/README.md" diff --git a/docs/docs/modules/spector-config.md b/docs/docs/modules/spector-config.md deleted file mode 100644 index 4a03d02..0000000 --- a/docs/docs/modules/spector-config.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-config/README.md" diff --git a/docs/docs/modules/spector-core.md b/docs/docs/modules/spector-core.md deleted file mode 100644 index 671e050..0000000 --- a/docs/docs/modules/spector-core.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-core/README.md" diff --git a/docs/docs/modules/spector-cortex.md b/docs/docs/modules/spector-cortex.md deleted file mode 100644 index 6281162..0000000 --- a/docs/docs/modules/spector-cortex.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -title: spector-cortex -description: "Real-time neural dashboard for visualizing Spector's cognitive memory engine." ---- - -# spector-cortex - -!!! info "Module Type" - **Frontend Application** — Angular 21 standalone UI (not a Maven module) - -## Purpose - -`spector-cortex` is the real-time neural visualization dashboard for Spector's cognitive memory engine. It provides interactive 3D and 2D visualizations of the entire cognitive pipeline — from SIMD vector processing to Hebbian graph spreading activation to Ebbinghaus decay curves. - -Unlike the backend Java modules, this is a standalone **Angular 21 application** that runs independently and connects to a Spector Node via SSE. 
- -## Key Features - -| Feature | Description | -|:--------|:------------| -| **Neural Graph** | 200-node Three.js 3D graph with 3 edge types and particle trails | -| **Vector Space** | 300-point PCA-projected embedding cloud | -| **Scoring Pipeline** | Animated 6-phase cognitive funnel | -| **Live Metrics** | Real-time recall/remember/reinforce/forget time-series | -| **Cognitive Profiles** | 6-axis radar chart with smooth profile transitions | -| **SIMD Lanes** | 16-lane register heatmap | -| **Memory Heatmap** | Off-heap segment utilization | -| **Decay Curve** | Ebbinghaus + LTP reconsolidation overlay | -| **Query History** | Scrollable timeline with latency and profile chips | -| **Zeigarnik Effect** | Unresolved memory tension gauge | -| **Habituation** | IoR, satiation, and penalty gauges | -| **Mock Data** | Toggleable simulated events for demo/development | - -## Technology Stack - -| Layer | Technology | -|:------|:-----------| -| Framework | Angular 21 (standalone, zoneless) | -| UI Components | Angular Material 3 | -| 3D Rendering | Three.js | -| 2D Charts | Canvas 2D API | -| State | Angular Signals | -| Data Stream | SSE (`ng-sse-client`) | -| Styling | SCSS + M3 CSS tokens | - -## Quick Start - -```bash -cd spector-cortex -npm install -npx ng serve --port 4300 -``` - -## Dependencies - -`spector-cortex` has **no compile-time dependency** on any Java module. It communicates with the backend exclusively through SSE: - -```mermaid -graph LR - cortex["🧬 spector-cortex
Angular 21 UI"] -->|SSE| node["🌐 spector-node
Armeria Server"] - node --> runtime["⚡ spector-runtime"] - node --> memory["🧠 spector-memory"] - node --> metrics["📈 spector-metrics"] -``` - -## Related - -- [Cortex Dashboard — Full Documentation](../cortex/index.md) -- [Cognitive Memory Overview](../memory/index.md) -- [spector-node](spector-node.md) -- [spector-metrics](spector-metrics.md) diff --git a/docs/docs/modules/spector-dist.md b/docs/docs/modules/spector-dist.md deleted file mode 100644 index e3596b7..0000000 --- a/docs/docs/modules/spector-dist.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-dist/README.md" diff --git a/docs/docs/modules/spector-embed-api.md b/docs/docs/modules/spector-embed-api.md deleted file mode 100644 index defe85b..0000000 --- a/docs/docs/modules/spector-embed-api.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-embed-api/README.md" diff --git a/docs/docs/modules/spector-embed-ollama.md b/docs/docs/modules/spector-embed-ollama.md deleted file mode 100644 index 9f2e57f..0000000 --- a/docs/docs/modules/spector-embed-ollama.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-embed-ollama/README.md" diff --git a/docs/docs/modules/spector-engine.md b/docs/docs/modules/spector-engine.md deleted file mode 100644 index 9f602a4..0000000 --- a/docs/docs/modules/spector-engine.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-engine/README.md" diff --git a/docs/docs/modules/spector-gpu.md b/docs/docs/modules/spector-gpu.md deleted file mode 100644 index 90bbe4d..0000000 --- a/docs/docs/modules/spector-gpu.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-gpu/README.md" diff --git a/docs/docs/modules/spector-index.md b/docs/docs/modules/spector-index.md deleted file mode 100644 index f0a7ecb..0000000 --- a/docs/docs/modules/spector-index.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-index/README.md" diff --git a/docs/docs/modules/spector-ingestion.md b/docs/docs/modules/spector-ingestion.md deleted file mode 100644 index d1b7771..0000000 --- a/docs/docs/modules/spector-ingestion.md +++ /dev/null 
@@ -1 +0,0 @@ ---8<-- "spector-ingestion/README.md" diff --git a/docs/docs/modules/spector-mcp.md b/docs/docs/modules/spector-mcp.md deleted file mode 100644 index 3197539..0000000 --- a/docs/docs/modules/spector-mcp.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-mcp/README.md" diff --git a/docs/docs/modules/spector-memory.md b/docs/docs/modules/spector-memory.md deleted file mode 100644 index 8cf3bad..0000000 --- a/docs/docs/modules/spector-memory.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-memory/README.md" diff --git a/docs/docs/modules/spector-metrics.md b/docs/docs/modules/spector-metrics.md deleted file mode 100644 index 393905f..0000000 --- a/docs/docs/modules/spector-metrics.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-metrics/README.md" diff --git a/docs/docs/modules/spector-node.md b/docs/docs/modules/spector-node.md deleted file mode 100644 index 799bef5..0000000 --- a/docs/docs/modules/spector-node.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-node/README.md" diff --git a/docs/docs/modules/spector-query.md b/docs/docs/modules/spector-query.md deleted file mode 100644 index 598ff97..0000000 --- a/docs/docs/modules/spector-query.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-query/README.md" diff --git a/docs/docs/modules/spector-rag.md b/docs/docs/modules/spector-rag.md deleted file mode 100644 index 7f79efb..0000000 --- a/docs/docs/modules/spector-rag.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-rag/README.md" diff --git a/docs/docs/modules/spector-runtime.md b/docs/docs/modules/spector-runtime.md deleted file mode 100644 index d11e054..0000000 --- a/docs/docs/modules/spector-runtime.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-runtime/README.md" diff --git a/docs/docs/modules/spector-spring.md b/docs/docs/modules/spector-spring.md deleted file mode 100644 index 7e619ea..0000000 --- a/docs/docs/modules/spector-spring.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-spring/README.md" diff --git a/docs/docs/modules/spector-storage.md 
b/docs/docs/modules/spector-storage.md deleted file mode 100644 index aa9a3df..0000000 --- a/docs/docs/modules/spector-storage.md +++ /dev/null @@ -1 +0,0 @@ ---8<-- "spector-storage/README.md" diff --git a/docs/docs/operations/contributing.md b/docs/docs/operations/contributing.md deleted file mode 100644 index 7a07af6..0000000 --- a/docs/docs/operations/contributing.md +++ /dev/null @@ -1,268 +0,0 @@ -# 🤝 Contributing - -> **We'd love your help making Spector even better!** Whether you're fixing a bug, adding a feature, improving docs, or optimizing performance — every contribution matters. This page covers everything you need to get started. - ---- - -## 🚀 Development Setup - -### 📋 Prerequisites - -| Tool | Version | Notes | -|------|---------|-------| -| ☕ JDK | 25+ | OpenJDK with Vector API incubator | -| 📦 Maven | 3.9+ | Multi-module reactor build | -| 🔧 Git | 2.40+ | Version control | - -### 🏗️ First-Time Setup - -```bash -# Fork and clone -git clone https://github.com//spector.git -cd spector - -# Verify JDK -java -version # Should show 25+ - -# Build the project -mvn clean compile - -# Run the full test suite (316+ tests) -mvn test - -# Verify SIMD support -java --add-modules jdk.incubator.vector -cp spector-core/target/classes \ - com.spectrayan.spector.core.SimdCapability -``` - -> [!TIP] -> The full build takes ~2 minutes. Use `mvn test -pl spector-core` to test a single module during development. - ---- - -## 📦 Module Structure - -```mermaid -graph LR - subgraph "🔬 Foundation" - core["spector-core
SIMD kernels"] - commons["spector-commons
Chunkers, readers"] - storage["spector-storage
Off-heap stores"] - end - - subgraph "📊 Search" - index["spector-index
HNSW, IVF-PQ, BM25"] - query["spector-query
Hybrid + RRF"] - end - - subgraph "🧠 Intelligence" - embedapi["spector-embed-api
Embedding SPI"] - embedollama["spector-embed-ollama
Ollama provider"] - gpu["spector-gpu
CUDA via Panama"] - end - - subgraph "⚡ Applications" - engine["spector-engine
Unified facade"] - server["spector-node
REST API"] - cluster["spector-node
Distributed gRPC"] - cli["spector-cli
CLI tool"] - client["spector-client
Java SDK"] - spring["spector-spring
Spring AI"] - end - - subgraph "📈 Quality" - bench["spector-bench
JMH benchmarks"] - end -``` - ---- - -## 🧪 Running Tests - -```bash -# Full suite -mvn test - -# Single module -mvn test -pl spector-core - -# Single test class -mvn test -pl spector-core -Dtest=DotProductTest - -# With JMH benchmarks -mvn -pl spector-bench exec:java -``` - ---- - -## 📝 Code Style - -### Java Conventions - -| Rule | Details | -|------|---------| -| **Java 25 features** | Records, sealed classes, pattern matching, switch expressions | -| **Vector API** | Always use `FloatVector.SPECIES_PREFERRED`, never hardcode lanes | -| **Panama FFM** | `Arena.ofShared()` for concurrent, `Arena.ofConfined()` for single-thread | -| **Virtual Threads** | `ReentrantLock` instead of `synchronized` (avoids pinning) | -| **Testing** | JUnit 5 + AssertJ for all new features | -| **Javadoc** | Required on all public classes and methods | - -### ⚡ Performance Rules - -- **No allocations in hot paths** — Reuse buffers, use slice-based APIs - -- **Branchless SIMD** — Use `VectorMask` for tail handling, no scalar fallback - -- **Benchmark before/after** — Performance PRs must include JMH results - -### 🏗️ Architecture Rules - -- **Respect module boundaries** — Follow the dependency graph, no circular dependencies - -- **Interface-first** — Add interfaces before implementations - -- **Zero-copy** — Prefer `MemorySegment` slices over array copies - ---- - -## 🌿 Branch Naming - -``` -feat/add-quantization-support -fix/hnsw-concurrent-insert-race -perf/simd-avx512-unroll-loop -refactor/storage-arena-lifecycle -docs/api-usage-examples -``` - ---- - -## 💬 Commit Messages - -Follow [Conventional Commits](https://www.conventionalcommits.org/): - -``` -feat(core): add AVX-512 double-pump dot product kernel -fix(index): prevent HNSW neighbor list corruption under concurrent insert -perf(storage): use bulk MemorySegment.copy for vector reads -refactor(query): extract RRF into standalone utility class -docs: add benchmark results to README -test(index): add property tests for HNSW 
persistence round-trip -``` - -| Type | Purpose | -|------|---------| -| `feat` | New feature | -| `fix` | Bug fix | -| `perf` | Performance improvement | -| `refactor` | Code restructuring (no behavior change) | -| `docs` | Documentation only | -| `test` | Adding or updating tests | -| `chore` | Build, CI, tooling changes | - ---- - -## ✅ Testing Requirements - -All new features require tests. The project uses: - -| Framework | Purpose | -|-----------|---------| -| **JUnit 5** | Unit tests | -| **AssertJ** | Fluent assertions | -| **jqwik** | Property-based tests | -| **JMH** | Performance benchmarks | - -### Test Categories - -| Type | When Required | Location | -|------|---------------|----------| -| Unit tests | All changes | `src/test/java/` in each module | -| Property tests | Algorithm changes | `src/test/java/` with `@Property` | -| Integration tests | Cross-module changes | `spector-engine/src/test/` | -| Benchmarks | Performance PRs | `spector-bench/src/main/` | - -### Property-Based Tests Example - -```java -@Property(tries = 100) -void hnswPersistenceRoundTrip(@ForAll @Size(min=10, max=1000) List vectors) { - // Build index, persist, reload, verify identical search results -} -``` - ---- - -## 🔄 Pull Request Process - -1. **Create a branch** from `main` with appropriate naming -2. **Make changes** with tests -3. **Ensure all tests pass** — `mvn test` -4. **Fill out the PR template** -5. **Link related issues** — `Closes #123` or `Fixes #456` -6. **One approval required** from a maintainer -7. 
**Squash merge** to keep history clean - -### ✅ PR Checklist - -- [ ] Code follows the project's coding standards - -- [ ] Tests added/updated for the change - -- [ ] Javadoc updated for public API changes - -- [ ] No hardcoded secrets or credentials - -- [ ] Commit messages follow Conventional Commits - -- [ ] JMH benchmarks included (if performance-related) - -- [ ] No circular module dependencies introduced - ---- - -## 🐛 Reporting Issues - -### Bug Reports - -Use the [Bug Report template](https://github.com/spectrayan/spector/issues/new?template=bug_report.md): - -- Steps to reproduce - -- Expected vs actual behavior - -- JDK version and SIMD capability output - -- Relevant logs or stack traces - -### 💡 Feature Requests - -Use the [Feature Request template](https://github.com/spectrayan/spector/issues/new?template=feature_request.md): - -- Problem you're solving - -- Proposed solution - -- Alternatives considered - ---- - -## 💬 Getting Help - -| Channel | Use For | -|---------|---------| -| [GitHub Discussions](https://github.com/spectrayan/spector/discussions) | General questions | -| [GitHub Issues](https://github.com/spectrayan/spector/issues) | Bug reports | -| [SECURITY.md](https://github.com/spectrayan/spector/blob/main/SECURITY.md) | Security vulnerabilities | -| developer@spectrayan.com | Direct contact | - ---- - -## 🔗 See Also - -- [Architecture Overview](../architecture/overview.md) — System design - -- [Core Concepts](../architecture/core-concepts.md) — Algorithms and data structures - -- [Performance Tuning](performance-tuning.md) — Benchmark methodology \ No newline at end of file diff --git a/docs/docs/operations/performance-tuning.md b/docs/docs/operations/performance-tuning.md deleted file mode 100644 index 44d05f2..0000000 --- a/docs/docs/operations/performance-tuning.md +++ /dev/null @@ -1,308 +0,0 @@ -# 🏎️ Performance Tuning - -> **Spector delivers sub-millisecond latency out of the box — but there's always room to optimize for your specific 
workload.** This page covers benchmarks, tuning strategies, and the science of finding the right recall/latency/memory trade-off. - ---- - -## 📊 Benchmark Summary - -> All benchmarks measured on a 24-core x86 machine (Windows 11, Intel Core Ultra 9 285K), AVX2 256-bit, Java 25, ZGC, using clustered vectors (realistic distribution). Numbers represent actual measured results — run `mvn -pl spector-bench exec:java` to reproduce on your hardware. - -> [!NOTE] -> **Methodology:** Benchmarks use 200 measurement iterations with 50 warmup iterations per scenario. Vectors are generated with realistic cluster structure (50 clusters with Gaussian noise). Documents contain 200–1500 words with paragraph structure. Recall is measured against brute-force ground truth. Your results may vary ±20% depending on CPU model, OS scheduling, background load, and thermal throttling. - -### ⚡ SIMD Kernel Latency - -| Dimension | Cosine P50 | Cosine P99 | Dot Product P50 | Dot Product P99 | -|-----------|-----------|-----------|----------------|----------------| -| 32 | 500 ns | 1,500 ns | 200 ns | 400 ns | -| 128 | <100 ns | 100 ns | 100 ns | 1,300 ns | -| 384 | ~100 ns | 100 ns | ~100 ns | 100 ns | -| 768 | ~100 ns | 100 ns | ~100 ns | 100 ns | - -> [!NOTE] -> Values at 384+ are at `System.nanoTime()` resolution floor. JMH confirms millions of ops/sec. 
- -### 🔍 Search Latency (128-dim, top-10, clustered vectors) - -| Scale | Keyword (BM25) | Vector (HNSW) | Hybrid (RRF) | -|-------|---------------|---------------|--------------| -| **10K docs** | 0.19 ms / 3.79 ms p99 | **0.05 ms** / 0.10 ms p99 | 0.17 ms / 0.37 ms p99 | -| **50K docs** | 0.42 ms / 0.68 ms p99 | **0.09 ms** / 0.19 ms p99 | 0.50 ms / 0.81 ms p99 | -| **100K docs** | 0.98 ms / 1.39 ms p99 | **0.13 ms** / 0.26 ms p99 | 1.01 ms / 1.22 ms p99 | - -### 🚀 Search Throughput (queries/sec) - -| Scale | Keyword | Vector | Hybrid | -|-------|---------|--------|--------| -| 10K | 5,194 | **18,824** | 5,828 | -| 50K | 2,406 | **10,980** | 1,988 | -| 100K | 1,019 | **7,556** | 994 | - -### 📥 Ingestion Throughput - -| Dataset Size | Time | Rate | Memory | -|-------------|------|------|--------| -| 10K | 2.5s | **3,931 docs/s** | +19 MB | -| 50K | 15.1s | **3,308 docs/s** | +93 MB | -| 100K | 38.2s | **2,618 docs/s** | +187 MB | - -### 🧵 Concurrency Scaling (50K docs, 384-dim, Hybrid Search) - -| Threads | Throughput | Avg Latency | Scaling Factor | -|---------|-----------|-------------|----------------| -| 1 | 3,739 ops/s | 0.26 ms | 1.0× | -| 4 | 10,317 ops/s | 0.37 ms | **2.8×** | -| 8 | 11,812 ops/s | 0.58 ms | **3.2×** | -| 16 | 14,022 ops/s | 1.00 ms | **3.7×** | - -> [!NOTE] -> Concurrency scaling is measured with 384-dim vectors (production-realistic). 128-dim shows higher absolute throughput but the scaling factor is similar. Individual HNSW queries are sequential — scaling comes from serving multiple queries concurrently. 
- ---- - -## 🧪 Running Benchmarks - -### Full Benchmark Suite - -```bash -mvn -pl spector-bench exec:java -``` - -> [!TIP] -> Generates an HTML report at `spector-bench/target/performance-report.html` - -### Specific Benchmarks - -```bash -# SIMD kernels only -mvn -pl spector-bench exec:java -Dexec.args="SimdKernelBenchmark" - -# HNSW index operations -mvn -pl spector-bench exec:java -Dexec.args="HnswBenchmark" - -# Concurrency scaling -mvn -pl spector-bench exec:java -Dexec.args="ConcurrencyBenchmark" -``` - -### JSON Output for CI - -```bash -mvn -pl spector-bench exec:java -Dexec.args="-rf json -rff results.json" -``` - -### 📏 Baseline Regression Detection - -```bash -# Generate baseline -mvn -pl spector-bench exec:java -Dexec.args="--baseline" - -# Compare against baseline -mvn -pl spector-bench exec:java -Dexec.args="--compare" -``` - ---- - -## 🎛️ Tuning Strategies - -### 🎯 Maximize Recall - -Goal: recall@10 ≥ 95% - -```java -var config = SpectorConfig.DEFAULT - .withM(32) // More connections - .withEfConstruction(400) // Better graph quality - .withEfSearch(200); // Wider search beam -``` - -Trade-offs: 2× memory, ~3× build time, ~2× query latency. - ---- - -### ⚡ Minimize Latency - -Goal: p99 < 0.5ms - -```java -var config = SpectorConfig.DEFAULT - .withM(12) - .withEfConstruction(100) - .withEfSearch(30); -``` - -Trade-offs: Lower recall (~80% recall@10), but sub-millisecond guaranteed. 
- ---- - -### 🚀 Maximize Throughput - -Goal: Maximum queries/sec under concurrent load - -```java -var config = SpectorConfig.DEFAULT - .withM(16) // Balanced - .withEfSearch(50) // Not too high - .withGpu(true); // Batch processing -``` - -Key factors: - -- Virtual threads handle concurrency automatically - -- Keep `efSearch` moderate to reduce per-query work - -- Enable GPU for batch workloads - -- Use IVF-PQ for large datasets (reduced memory = better cache behavior) - ---- - -### 💾 Minimize Memory - -Goal: Fit large datasets in limited RAM - -```java -var config = SpectorConfig.DEFAULT - .withM(8) // Fewer connections - .withEfConstruction(100); -// Use IVF-PQ for 32× vector compression -``` - -**Memory per document (384-dim):** - -| Mode | Per Vector | 1M vectors | -|------|-----------|------------| -| Float32 | ~1.8 KB | ~1.8 GB | -| INT8 | ~640 bytes | ~640 MB | -| IVF-PQ | ~288 bytes | ~288 MB | - ---- - -## 📈 Parameter Tuning Guide - -### HNSW: efSearch vs Recall vs Latency - -> [!NOTE] -> Recall values below are measured with uniform random vectors (best case). Real embedding distributions with cluster structure may show lower recall at the same efSearch — increase efSearch to 100–200 for production workloads with real embeddings. 
- -| efSearch | Recall@10 (random) | Recall@10 (clustered) | Avg Latency | Notes | -|----------|-----------|-----------|-------------|-------| -| 10 | ~70% | ~30-40% | 0.02 ms | Too low for most uses | -| 30 | ~85% | ~50-60% | 0.03 ms | Fast, moderate recall | -| **64** | **~90%** | **~50-65%** | **0.05 ms** | **Default** | -| 100 | ~95% | ~70-80% | 0.10 ms | Good for production | -| 200 | ~98% | ~85-90% | 0.20 ms | High recall | -| 500 | ~99.5% | ~95%+ | 0.50 ms | Near-perfect | - -### IVF-PQ: nprobe vs Recall - -| nprobe | Recall@10 | Relative Latency | -|--------|-----------|-----------------| -| 1 | ~40% | 1× | -| 4 | ~70% | 4× | -| 8 | ~85% | 8× | -| 16 | ~92% | 16× | -| 32 | ~97% | 32× | - -### SpectorIndex (IVF-HNSW-SVASQ): nCentroids vs nProbe - -SpectorIndex uses IVF partitioning with adaptive HNSW shards. The two key parameters are: - -- **`nCentroids`** — number of K-Means partitions (set at training time) -- **`nProbe`** — number of partitions searched at query time (adjustable) - -**Rule of thumb:** `nCentroids ≈ √N` (square root of dataset size). - -**Real embedding results (Qwen3-embedding, 4096-dim, 10K vectors):** - -| nCentroids | nProbe | % Data Searched | Avg Latency | QPS | Recall@10 | -|------------|--------|-----------------|-------------|-----|-----------| -| **128** | **4** | **3.1%** | **0.46ms** | **2,173** | **1.0000** | -| 128 | 8 | 6.3% | 0.73ms | 1,368 | 1.0000 | -| 128 | 16 | 12.5% | 1.26ms | 792 | 1.0000 | -| 64 | 4 | 6.3% | 0.62ms | 1,601 | 1.0000 | -| 64 | 8 | 12.5% | 1.17ms | 856 | 1.0000 | -| 32 | 4 | 12.5% | 1.17ms | 857 | 1.0000 | - -> [!TIP] -> With real embeddings (not random vectors), SpectorIndex achieves **perfect recall at nProbe=4** because real embeddings form natural semantic clusters that K-Means captures effectively. Start with `nProbe=4` and only increase if your recall target isn't met. 
- -> [!NOTE] -> For the complete, empirical sweeps across multiple partition configurations ($C \in \{32, 64, 128, 256\}$) and detailed HNSW shard promotion benchmarks, see the dedicated [Large-Scale Benchmarks deep dive](../deep-dives/real-embedding-benchmarks.md). - -**Ingestion throughput** (SpectorIndex vs standalone HNSW): - -| Dataset Size | SpectorIndex | Standalone HNSW | Speedup | -|-------------|-------------|-----------------|---------| -| 10K | 130K docs/s | 4,677 docs/s | **28×** | -| 50K | 140K docs/s | 2,483 docs/s | **56×** | -| 100K | 150K docs/s | 1,535 docs/s | **98×** | -| 500K | 246K docs/s | — | — | -| 1M | 128K docs/s | — | — | - ---- - -## 📐 Scaling Strategies - -### ⬆️ Vertical Scaling - -- **Add CPU cores** → Concurrent throughput scaling (up to ~3.7× at 16 threads measured) - -- **Add RAM** → Support larger capacity without IVF-PQ compression - -- **Add GPU** → 4× brute-force search speedup at 100K+ vectors (data resident in VRAM) - -### ➡️ Horizontal Scaling (Distributed Mode) - -- **Add nodes** → Linear throughput scaling per shard - -- Rule of thumb: 100K–500K docs per shard - -- See [Distributed Mode](../architecture/distributed-mode.md) for cluster setup - ---- - -## ☕ JVM Tuning - -Recommended JVM arguments for production: - -```bash -java \ - --add-modules jdk.incubator.vector \ - --enable-native-access=ALL-UNNAMED \ - -XX:+UseZGC \ - -XX:+ZGenerational \ - -Xmx4g \ - -Xms4g \ - -jar spector-node.jar -``` - -| Argument | Purpose | -|----------|---------| -| `--add-modules jdk.incubator.vector` | Required for SIMD acceleration | -| `--enable-native-access=ALL-UNNAMED` | Required for Panama FFM (GPU, mmap) | -| `-XX:+UseZGC` | Low-pause GC (vectors are off-heap) | -| `-XX:+ZGenerational` | Generational ZGC for better throughput | -| `-Xmx4g -Xms4g` | Fixed heap avoids resize pauses | - -> [!TIP] -> Since all vectors live off-heap, GC pressure is minimal. The heap primarily holds the HNSW graph structure and BM25 inverted index. 
- ---- - -## 🔗 See Also - -- [Configuration Guide](../configuration/parameters.md) — All parameters with ranges - -- [Core Concepts](../architecture/core-concepts.md) — How algorithms affect performance - -- [SpectorIndex Architecture](../deep-dives/spector-index-architecture.md) — IVF-HNSW-SVASQ design and tuning - -- [Large-Scale Benchmarks](../deep-dives/real-embedding-benchmarks.md) — Empirical sweeps for real embeddings and shard promotions - -- [SVASQ Quantization](../deep-dives/svasq-deep-dive.md) — How SVASQ compression works - -- [GPU Acceleration](../architecture/gpu-acceleration.md) — GPU-specific performance - -- [Distributed Mode](../architecture/distributed-mode.md) — Scaling across nodes \ No newline at end of file diff --git a/docs/docs/roadmap.md b/docs/docs/roadmap.md deleted file mode 100644 index 656c7d1..0000000 --- a/docs/docs/roadmap.md +++ /dev/null @@ -1,472 +0,0 @@ -# 🗺️ Roadmap - -Spector is under active development. This page details planned improvements, their projected impact, and implementation status. - ---- - -## Compression & Quantization - -### ✅ SVASQ-4 — Half-Precision SVASQ (INT4 Codes) {#svasq-4} - -!!! success "Completed" - Implemented and merged. Available via `SpectorEngine.builder().svasq4()` or `QuantizedHnswIndex.svasq4(...)`. - -Replace INT8 `[-127, 127]` codes with INT4 `[-7, 7]` codes in the SVASQ pipeline. The FWHT rotation still equalizes variance, so INT4 quantization error remains uniformly distributed — just at a coarser granularity (15 levels vs 255). 
- -**Memory layout:** -``` -[float32 normSq (4 bytes)] [INT4 × paddedDim nibble-packed (paddedDim/2 bytes)] -``` - -| Dims | Current SVASQ-8 | SVASQ-4 | Compression vs float32 | -|------|---------------|--------|----------------------| -| 384 → 512 | 516 B | 260 B | **5.9×** | -| 768 → 1024 | 1028 B | 516 B | **6.0×** | -| 4096 | 4100 B | 2052 B | **8.0×** | - -**Recall:** - -- Without rescore: ~95–97% recall@10 -- With 3× oversampling rescore: **~97–99% recall@10** - -**Key design decisions:** - -- Separate `Svasq4Encoder` / `Svasq4SimdKernel` classes (not parameterizing SVASQ-8) to avoid impacting existing code -- Offset encoding `[0, 14]` keeps byte values non-negative for correct `castShape` sign extension -- Deinterleaved hi/lo query arrays match nibble layout for natural SIMD ILP -- Tighter clipping (2.5σ vs 3.0σ) optimizes for 15 quantization levels - ---- - -### 🔜 Padding-Aware Storage — Skip Zero Dimensions {#padding-aware} - -!!! info "Status: Planned (next)" - Low effort, zero recall loss for L2 distance. Highest ROI pending improvement. - -SVASQ pads vectors to the next power-of-two dimensionality (e.g., 768 → 1024), adding wasted bytes. The padded dimensions are zero-filled before FWHT, so their rotated codes are predictable. We can **store only the first `originalDim` codes** and reconstruct padded codes at query time. - -| Dims | paddedDim | Current SVASQ-8 | Padding-Aware | Savings | -|------|-----------|---------------|---------------|---------| -| 384 | 512 | 516 B | 388 B | **25%** | -| 768 | 1024 | 1028 B | 772 B | **25%** | -| 1536 | 2048 | 2052 B | 1540 B | **25%** | -| 4096 | 4096 | 4100 B | 4100 B | 0% (already pow2) | - -**Recall impact:** **None** for L2 distance — padded dimensions contribute a constant offset that doesn't affect ranking. - -!!! warning "SIMD Tail Loop" - The current SIMD kernel exploits `paddedDim % VL == 0` to avoid tail loops. 
Storing only `originalDim` codes breaks this, requiring either a scalar tail loop or alignment padding to the next SIMD boundary (e.g., round up to multiple of 16 bytes). - -**Changes required:** - -- `SvasqEncoder` / `Svasq4Encoder`: Store only `originalDim` codes, update `bytesPerVector()` -- `SvasqSimdKernel` / `Svasq4SimdKernel`: Handle non-power-of-2 loop bound (SIMD-aligned padding recommended) - ---- - -### 🔜 Norm Header Compression — float32 → float16 {#norm-f16} - -!!! info "Status: Planned (next)" - Very low effort. Negligible recall impact. - -The 4-byte `float32 exactNormSq` header can be compressed to 2 bytes using `float16` (half-precision). Java 21+ provides `Float.floatToFloat16()` and `Float.float16ToFloat()` for lossless conversion. - -**Savings:** 2 bytes per vector. Small absolute savings but trivial to implement. - -| Combined with | Before | After | Savings | -|---------------|--------|-------|---------| -| SVASQ-8 (768-dim) | 1028 B | 1026 B | 0.2% | -| SVASQ-4 (768-dim) | 516 B | 514 B | 0.4% | -| Padding-aware SVASQ-8 (768-dim) | 772 B | 770 B | 0.3% | - -**Recall impact:** < 0.01% — `float16` has ~3 decimal digits of precision. For L2 ranking, the norm header is a per-vector constant that shifts all distances equally. - -**Changes required:** - -- `SvasqEncoder` / `Svasq4Encoder`: Use `Float.floatToFloat16()` for 2-byte header write -- `SvasqSimdKernel` / `Svasq4SimdKernel`: Read with `Float.float16ToFloat(segment.get(JAVA_SHORT, offset))` - ---- - -### 🔬 SVASQ-PQ Hybrid — Product Quantization of SVASQ Residuals {#svasq-pq} - -!!! note "Status: Future Research" - Very high implementation effort. Most aggressive compression option. - -After FWHT rotation, instead of scalar INT8/INT4 quantization, apply **Product Quantization** to the rotated coordinates. 
The FWHT rotation makes coordinates near-independent (isotropized), which is the ideal input distribution for PQ — similar to how Optimized PQ (OPQ) works with learned rotations, but using FWHT instead of an expensive SVD-based rotation matrix. - -**Memory layout:** -``` -[float32 normSq (4 bytes)] [PQ codes: M bytes (one centroid ID per subspace)] -``` - -With M=16 subspaces, K=256 centroids: - -| Dims | Float32 | SVASQ-8 | SVASQ-PQ (M=16) | Compression vs float32 | -|------|---------|--------|----------------|----------------------| -| 768 | 3,072 B | 1,028 B | 20 B | **154×** | -| 4096 | 16,384 B | 4,100 B | 68 B | **241×** | - -**Recall impact:** - -- PQ on FWHT-rotated residuals: ~85–93% recall@10 -- FWHT rotation gives ~3–5% recall advantage over naive PQ (pre-decorrelates dimensions) -- Rescore with exact float32 residuals pushes recall to 95%+ - -**Why it works:** The FWHT rotation is essentially a free, lossless "Optimized PQ" rotation — it decorrelates dimensions without requiring an expensive SVD or learned rotation matrix. This means PQ subspaces can be independent slices of the rotated vector, which is information-theoretically optimal. - -**Implementation scope:** - -- Train PQ codebooks per shard (or globally after FWHT rotation) -- Asymmetric Distance Computation (ADC) lookup tables during search -- New SIMD kernel for PQ distance computation -- Integration with existing `ProductQuantizer` in `spector-index` - -!!! danger "Complexity Warning" - This is essentially building a new quantization mode. The existing `ProductQuantizer` could be adapted, but integrating it with the FWHT rotation pipeline is non-trivial. Estimated effort: 2–4 weeks. - ---- - -### 🔬 Flat-Mode SVASQ — Compress Flat-Shard Storage {#flat-svasq} - -!!! note "Status: Future Research" - Medium effort, good payoff for large flat shards. - -In `SpectorShard`'s flat mode, residuals are stored as raw `float32[]`. 
Since all residuals in a shard share the same centroid, they have similar statistical distributions. **SVASQ quantization of flat residuals** could compress flat-mode storage by ~3× without changing the shard architecture. - -**Savings:** - -| Scenario | Current (float32) | With SVASQ | Savings | -|----------|-------------------|-----------|---------| -| 10K vectors × 768 dims | 30 MB/shard | 10 MB/shard | **3×** | -| 50K vectors × 4096 dims | 781 MB/shard | 195 MB/shard | **4×** | - -**Recall impact:** - -- If applied only to storage (decode for search): **None** — search uses decoded float32 -- If applied to search (scan quantized codes directly): Same as SVASQ-8 (~99.5%) - -**Implementation scope:** - -- Integrate SVASQ encoding into the flat-mode ingestion path -- Modify `SpectorShard.flatScan()` to use the SVASQ SIMD kernel directly -- Per-shard calibration using the shard's centroid residuals - ---- - -### 🔴 Adaptive Bit-Width SVASQ {#adaptive-bw} - -!!! warning "Status: Not Recommended" - Very high effort, marginal benefit due to FWHT already equalizing variance. - -Instead of uniform INT8 across all dimensions, assign more bits to high-variance dimensions and fewer to low-variance ones (after FWHT rotation): - -- Dimensions with σ > 2× median: 8 bits -- Dimensions with σ < 0.5× median: 4 bits -- Others: 6 bits - -**Projected savings:** ~10–15% additional compression. - -**Recall impact:** Minimal (< 0.5%) — allocating bits proportionally to variance is information-theoretically optimal. - -**Why it's not recommended:** FWHT already equalizes variance by design, so the marginal gain from adaptive bit-widths is small. The implementation requires variable-length encoding, non-aligned SIMD reads, and per-dimension bit-width bookkeeping — the worst effort-to-benefit ratio of all proposed improvements. - ---- - -## Agentic AI - -### ✅ Native MCP Server {#mcp-server} - -!!! success "Completed" - Implemented in `spector-mcp` module. 
6 tools, stdio transport, agent-native search. - -Built-in [Model Context Protocol](https://modelcontextprotocol.io/) server that gives AI agents (Claude Desktop, Cursor, autonomous agents) direct, in-process access to Spector’s search engine. Zero network overhead — tool handlers call `SpectorEngine` directly via virtual threads. - -**Tools:** `semantic_search`, `hybrid_search`, `rag_query`, `ingest_document`, `delete_document`, `engine_status` - -**Architecture:** -- `McpToolHandler` abstract base class (common timing, error handling, arg parsing) -- `ToolSchemaBuilder` fluent JSON schema construction -- `SpectorToolRegistry` for extensible tool registration -- `SpectorResourceProvider` + `SpectorPromptProvider` for MCP resources/prompts -- `ResultFormatter` shared formatting utilities - ---- - -### 🔜 Streamable HTTP Transport {#mcp-http} - -!!! info "Status: Planned (next)" - Stdio covers Claude Desktop, Cursor, and all local agents. HTTP needed for cloud/remote deployments. - -Add HTTP-based MCP transport for scenarios where the agent and Spector run on different machines. The official MCP SDK supports Streamable HTTP transport — Spector would expose the same 6 tools over an HTTP endpoint. - -**Use cases:** Cloud deployments, remote agent connections, multi-agent architectures. - ---- - -### 🔬 LoRA Adapter Routing {#lora-routing} - -!!! note "Status: Future Research" - Requires LoRA weight format specification and SIMD matrix multiply implementation. - -Multi-tenant query projection via SIMD matrix multiply. Instead of creating separate indexes per tenant, store one base index and apply per-tenant LoRA weight matrices at query time using Panama FMA loops. 
- -**How it works:** -- Ingest base model embeddings once -- Each tenant uploads a small LoRA matrix ($W_A$, typically 768×32 or similar) -- At query time: $q_{tenant} = q_{base} \times W_A$ (microseconds via Panama SIMD) -- Search the same index with the projected query - -**Expected impact:** Zero-downtime multi-tenant customization without index duplication. - ---- - -### 🔬 ColBERT Late Interaction Reranking {#colbert} - -!!! note "Status: Future Research" - Requires token-level vector storage and MaxSim SIMD kernel. - -Native ColBERT reranking using Panama FMA loops. ColBERT stores a vector for every token in a document, then computes relevance via MaxSim (maximum similarity per query token). Python struggles with this due to GIL contention when routing massive matrices between C++ and Python memory. - -**Spector advantage:** Off-heap `MemorySegment` arrays and Fused-Multiply-Add Panama loops can natively execute ColBERT MaxSim reranking faster than almost any competitor. - ---- - -## Cognitive Graph Memory - -### ✅ 3-Layer Cognitive Graph {#cognitive-graph} - -!!! success "Completed" - All four phases implemented and merged. 357 tests pass, 0 failures. - -Full graph augmentation layer for `spector-memory` — three biologically-inspired graph structures that augment vector recall with associative, temporal, and relational signals. 
- -**Architecture:** -``` -RecallPipeline - Step 5a: Habituation + Inhibition of Return - Step 5b: STDP causal boost (CoActivationTracker) - Step 5c: Hebbian spreading activation (HebbianGraph, depth=2) - Step 5d: Temporal chain extension (TemporalChain, maxHops=3) - Step 5e: Entity graph traversal (EntityGraph, 2-hop BFS) -``` - -**Layer 1 — Hebbian Association Graph:** - -- Off-heap adjacency list (164B/node, MAX_DEGREE=20) via Panama `MemorySegment` -- Edge strengthening, decay (0.9 factor per consolidation), spreading activation -- Persistence via `HGPH` magic header, chunked 64KB FileChannel I/O -- CoActivationTracker migrated to off-heap: `OffHeapPairTable` (32B/slot) + `OffHeapEdgeTable` (40B/slot) -- Persistence via `COAX` magic header with hash→tag reverse map - -**Layer 2 — Entity-Relationship Graph:** - -- Off-heap entity store (48B/entity, 16B/edge), BFS traversal with typed edge filtering -- 22 entity types × 21 relation types -- `EntityExtractor` SPI with `LlmEntityExtractor` (externalized prompt template) and `NoOpEntityExtractor` -- Persistence via `ENTG` magic header with nameIndex reconstruction - -**Layer 3 — Temporal Causal Chain:** - -- Off-heap linked list (16B/node: prevIdx + nextIdx + sessionId + pad) -- Session-local memory linking at ingestion, forward/backward traversal at recall -- Persistence via `TPCH` magic header - -**Error framework:** 6 error codes (`SPE-310-006..011`), 7 granular exception classes extending `SpectorGraphException`. All catch sites use `catch(RuntimeException)` → create exception → `log(ex.getMessage())`. No string concatenation. - -**Each graph step is additive and gracefully degrading** — if the graph is null/empty or the operation throws, the step is a no-op. - ---- - -### 🔜 Temporal Chain Pruning {#temporal-pruning} - -!!! info "Status: Planned (next)" - Low effort. Prevents unbounded temporal chain growth. 
- -Temporal chain links are permanent — unlike Hebbian edges which decay via `decayEdges(0.9f)`, temporal links have no homeostasis mechanism. Old session-local links waste slots indefinitely. - -**Design:** - -- Add `pruneOlderThan(long cutoffEpochMs)` to `TemporalChain` -- Replace the `pad:4B` field in the 16B node layout with `epochSec:4B` (seconds since epoch, ~136 year range) -- Integrate into `DefaultSpectorMemory.reflect()` after Hebbian decay -- Configurable retention period via Builder: `temporalRetentionDays(int)` (default: 7) - -**Effort:** ~0.5 day - ---- - -### 🔜 Cross-Layer Promotion (Hebbian → Entity) {#cross-layer-promotion} - -!!! info "Status: Planned (next)" - Medium effort. Enables automatic knowledge graph construction from statistical patterns. - -Promote strong statistical Hebbian associations into explicit entity relations during sleep consolidation — analogous to hippocampal replay. - -**Design:** - -- During `reflect()`, scan HebbianGraph for edges with `weight ≥ 0.8` AND `activationCount ≥ 5` -- For each strong edge, look up shared entities via `EntityGraph.memoriesForEntity()` -- If shared entities exist, strengthen the entity relation edge; if none, create a `RELATED_TO` relation -- Add `promotionThreshold(float)` and `promotionMinActivations(int)` to Builder config -- Add `PromotionReport` record for observability: `promotedCount`, `strengthenedCount`, `skippedCount` - -**Effort:** ~1-2 days - ---- - -### 🔜 Entity Graph Decay + Node Merging {#entity-decay} - -!!! info "Status: Planned" - Medium effort. Prevents entity graph bloat. - -Entity graph edges accumulate without decay. Near-duplicate entities (e.g., "John Smith" and "J. Smith") should be merged during consolidation. 
- -**Design:** - -- Add `decayRelations(float factor)` to `EntityGraph` — multiplicative decay, prune below threshold -- Add `mergeEntities(int sourceId, int targetId)` — redirect all edges and memory links -- Fuzzy name matching via Levenshtein distance during consolidation -- Integrate into `reflect()` cycle - -**Effort:** ~1-2 days - ---- - -### 🔜 Graph-Aware Scoring Weights {#graph-scoring} - -!!! info "Status: Planned" - Low effort. Highest ROI among remaining graph improvements. - -Extract hardcoded graph score attenuation factors into a configurable `GraphScoringPolicy`. - -**Current hardcoded values:** - -| Factor | Current Value | Used In | -|---|---|---| -| Hebbian boost | 0.3f | RecallPipeline Step 5c | -| Temporal forward | 0.8f | RecallPipeline Step 5d | -| Temporal backward | 0.7f | RecallPipeline Step 5d | -| Entity hop attenuation | 0.25f | RecallPipeline Step 5e | - -**Design:** - -```java -public record GraphScoringPolicy( - float hebbianBoostFactor, // default 0.3 - float temporalForwardFactor, // default 0.8 - float temporalBackwardFactor, // default 0.7 - float entityHopAttenuation, // default 0.25 - int hebbianMaxDepth, // default 2 - int temporalMaxHops, // default 3 - int entityMaxHops // default 2 -) {} -``` - -- Configurable via Builder: `graphScoringPolicy(GraphScoringPolicy)` -- Future: online tuning based on user reinforcement/suppression feedback - -**Effort:** ~0.5 day - ---- - -## Compute & Hardware - -### 🔜 GPU Kernel Dispatch {#gpu-dispatch} - -!!! info "Status: Infrastructure Ready" - CUDA context management and Panama FFM bridge are implemented. The compute kernel dispatch is pending. - -Ship actual CUDA compute kernels for batch cosine similarity and HNSW neighbor selection. The existing `spector-gpu` module provides context management, memory allocation, and kernel loading via Panama FFM — the remaining work is the CUDA kernel code itself. - -**Prerequisites:** CUDA Toolkit 12+ on the host machine. 
- -**Expected impact:** 10–100× throughput improvement for batch similarity computation on large datasets (> 100K vectors). - ---- - -### 🔬 NPU Acceleration {#npu} - -!!! note "Status: Exploratory" - Depends on Intel/AMD NPU SDK maturity. - -Leverage Intel NPU (via OpenVINO) or AMD XDNA (via DirectML) for INT8 batch operations. NPUs are optimized for low-precision matrix operations, making them ideal for quantized SVASQ distance computation. - -**Target workloads:** INT8/INT4 batch similarity, SVASQ kernel offload. - ---- - -## Runtime & Deployment - -### 🔬 WASM Runtime for Edge Deployment {#wasm} - -!!! note "Status: Exploratory" - Depends on GraalWasm or Chicory maturity for JVM → WASM compilation. - -Compile the core SIMD kernels and HNSW index to WebAssembly for browser-based or edge deployment. This would enable client-side semantic search without a server round-trip. - ---- - -### 🔬 Project Valhalla Value Classes {#valhalla} - -!!! note "Status: Future Research" - Exploratory evaluation of JEP 401 (Value Classes and Objects). Requires Project Valhalla Early-Access builds. - -Migrate hot-path intermediate records (e.g., `CognitiveResult`, candidate pairs, search options) to `value class` (or `value record`). This will allow the JVM JIT compiler to perform aggressive scalar replacement and store value arrays contiguously in memory, eliminating garbage collection overhead and pointer-chasing latency during HNSW index traversals. - -**Benefits:** -- **Zero-GC Hot Path**: Short-lived search results and option records are stack-allocated, avoiding the JVM heap. -- **Cache Locality**: Contiguous storage of value structures inside arrays prevents pointer chasing. -- **Header Elimination**: Removes standard 12-to-16-byte JVM object headers for inline arrays. - ---- - -### ✅ Structured Concurrency (JEP 505) {#structured-concurrency} - -!!! success "Completed" - Implemented via `ConcurrentTasks` in `spector-commons`. 
Dual-mode: structured concurrency (default) with classic `ExecutorService` fallback via `-Dspector.concurrency.structured=false`. - -Migrated all 6 concurrency sites from unstructured `ExecutorService` + `Future` to the JEP 505 `StructuredTaskScope` API, centralized in `ConcurrentTasks`: - -| Site | Module | Pattern | Benefit | -|------|--------|---------|---------| -| `HybridSearchOrchestrator` | spector-query | 2-way fan-out (keyword ∥ vector) | Auto-cancel sibling on failure | -| `ClusterCoordinator` | spector-node | N-way shard fan-out | Auto-cancel all on shard failure | -| `DistributedQueryCoordinator` | spector-node | N-way with timeout + partial results | Clean timeout via `awaitAll()` + `withTimeout()` | -| `ParallelEmbeddingPipeline` | spector-embed-api | N-way batch embedding | Scope-per-call, no executor lifecycle | -| `ParallelPqTrainer` | spector-index | M-way K-Means subspace training | All-or-nothing structured scope | -| `BM25Index` | spector-index | Parallel term scoring | Auto-cancel with sequential fallback | - -**Key design decisions:** - -- Centralized in `ConcurrentTasks` (spector-commons) for single-point updates when JEP finalizes -- Feature flag: `-Dspector.concurrency.structured=false` for fallback to classic virtual threads -- `forkJoinAll()`: all-or-nothing with auto-cancel (uses `awaitAllSuccessfulOrThrow` Joiner) -- `forkJoinPartial()`: deadline-based with `LabeledTask`/`PartialResult` records (uses `awaitAll` Joiner + `Configuration.withTimeout()`) - ---- - -## Summary Table - -| # | Improvement | Category | Effort | Status | -|---|------------|----------|--------|--------| -| 1 | **SVASQ-4** | Compression | Medium | ✅ Done | -| 2 | **Native MCP Server** | Agentic AI | Medium | ✅ Done | -| 3 | **3-Layer Cognitive Graph** | Graph Memory | High | ✅ Done | -| 4 | **Structured Concurrency** | Runtime | Low | ✅ Done | -| 5 | **Padding-aware storage** | Compression | Low | 🔜 Next | -| 6 | **Norm header f16** | Compression | Very Low | 🔜 
Next | -| 7 | **Temporal chain pruning** | Graph Memory | Low | 🔜 Next | -| 8 | **Cross-layer promotion** | Graph Memory | Medium | 🔜 Planned | -| 9 | **Entity graph decay + merging** | Graph Memory | Medium | 🔜 Planned | -| 10 | **Graph scoring weights** | Graph Memory | Low | 🔜 Planned | -| 11 | **Streamable HTTP transport** | Agentic AI | Medium | 🔜 Planned | -| 12 | **GPU kernel dispatch** | Compute | Medium | 🔜 Infra ready | -| 13 | **SVASQ-PQ hybrid** | Compression | Very High | 🔬 Research | -| 14 | **Flat-mode SVASQ** | Compression | Medium | 🔬 Research | -| 15 | **LoRA adapter routing** | Agentic AI | High | 🔬 Research | -| 16 | **ColBERT late interaction** | Agentic AI | High | 🔬 Research | -| 17 | **NPU acceleration** | Compute | High | 🔬 Exploratory | -| 18 | **WASM edge runtime** | Runtime | High | 🔬 Exploratory | -| 19 | **Project Valhalla** | Runtime | Medium | 🔬 Research | -| 20 | **Adaptive bit-width** | Compression | Very High | 🔴 Not planned | diff --git a/docs/docs/sdk-usage/java-client.md b/docs/docs/sdk-usage/java-client.md index c6e9298..44a84d8 100644 --- a/docs/docs/sdk-usage/java-client.md +++ b/docs/docs/sdk-usage/java-client.md @@ -1,12 +1,10 @@ -# ☕ Java SDK Guide +# Java Client SDK -> **Type-safe, thread-safe Java access to Spector — as a remote client or embedded engine.** Whether you're connecting to a server or embedding search directly in your application, this guide covers everything you need. +The `spector-client` module provides a type-safe Java client for interacting with a Spector Search server. ---- +## Installation -## 📦 Installation - -**Remote client** (connects to a running server): +Add the dependency to your `pom.xml`: ```xml @@ -16,271 +14,109 @@ ``` -**Embedded engine** (in-process, zero network overhead): - -```xml - - com.spectrayan - spector-engine - 1.0-SNAPSHOT - -``` - -> [!TIP] -> Choose **embedded** for maximum performance (zero latency overhead). 
Choose **client** when you want a shared server across multiple services. - ---- - -## 🌐 Client SDK (Remote Server) +## Creating a Client -### 🔧 Creating a Client +Use the builder pattern to configure the client: ```java -import com.spectrayan.spector.client.SpectorClient; - SpectorClient client = SpectorClient.builder() .host("localhost") .port(7070) - .apiKey("my-secret-key") // optional - .connectTimeout(Duration.ofSeconds(10)) - .requestTimeout(Duration.ofSeconds(30)) - .maxConnections(10) + .apiKey("my-secret-key") // optional .build(); ``` -**Configuration Options:** - -| Option | Default | Description | -|--------|---------|-------------| -| `host` | localhost | Server hostname | -| `port` | 7070 | Server port | -| `apiKey` | — | API key for authentication | -| `connectTimeout` | 10s | Connection timeout | -| `requestTimeout` | 30s | Per-request timeout | -| `maxConnections` | 10 | HTTP connection pool size | - -> [!NOTE] -> `SpectorClient` is fully **thread-safe**. It uses Java's `HttpClient` with internal connection pooling. Share a single instance across all threads. 
- ---- +## Runnable SDK Example -### 📥 Ingesting Documents +This complete example demonstrates the full lifecycle — ingest, search, and delete: ```java -// Single document -IngestResponse response = client.ingest(IngestRequest.builder() - .id("doc-1") - .title("Java Vector API") - .content("SIMD-accelerated search engine built on modern JVM") - .vector(new float[]{0.1f, 0.2f, 0.3f, 0.4f, 0.5f}) - .build()); - -System.out.println("Indexed: " + response.id()); -``` - -```java -// Bulk ingest -List documents = List.of( - IngestRequest.builder().id("d1").content("first doc").vector(vec1).build(), - IngestRequest.builder().id("d2").content("second doc").vector(vec2).build(), - IngestRequest.builder().id("d3").content("third doc").vector(vec3).build() -); - -IngestResponse bulkResponse = client.bulkIngest(documents); -``` +import com.spectrayan.spector.client.SpectorClient; +import com.spectrayan.spector.client.model.*; ---- +public class SpectorClientExample { + public static void main(String[] args) throws Exception { + // 1. Create client + try (SpectorClient client = SpectorClient.builder() + .host("localhost") + .port(7070) + .build()) { -### 🔍 Searching + // 2. Ingest a document + IngestResponse ingestResp = client.ingest(IngestRequest.builder() + .id("sdk-doc-1") + .title("Vector Search") + .content("Spector uses HNSW for approximate nearest neighbor search") + .vector(new float[]{0.1f, 0.2f, 0.3f, 0.4f, 0.5f}) + .build()); + System.out.println("Ingested: " + ingestResp.id()); -```java -// Keyword search -SearchResponse results = client.search(SearchRequest.builder() - .text("vector search engine") - .topK(10) - .build()); + // 3. 
Search + SearchResponse searchResp = client.search(SearchRequest.builder() + .text("nearest neighbor") + .topK(5) + .build()); + for (SearchResponse.Result result : searchResp.results()) { + System.out.printf(" %s → %.4f%n", result.id(), result.score()); + } -// Vector search -SearchResponse results = client.search(SearchRequest.builder() - .vector(queryEmbedding) - .topK(10) - .build()); + // 4. Check status + StatusResponse status = client.status(); + System.out.println("Engine status: " + status.status()); -// Hybrid search (both text and vector) -SearchResponse results = client.search(SearchRequest.builder() - .text("search engine") - .vector(queryEmbedding) - .topK(10) - .build()); + // 5. Get metrics + MetricsResponse metrics = client.metrics(); + System.out.println("Total queries: " + metrics.totalQueries()); -// Process results -for (SearchResponse.Result result : results.results()) { - System.out.printf("%s (%.4f): %s%n", - result.id(), result.score(), result.content()); + // 6. 
Delete + client.delete("sdk-doc-1"); + System.out.println("Deleted sdk-doc-1"); + } + } } ``` ---- - -### 🗑️ Deleting Documents +## Bulk Ingestion ```java -client.delete("doc-1"); +List<IngestRequest> docs = List.of( + IngestRequest.builder().id("d1").content("first").vector(vec1).build(), + IngestRequest.builder().id("d2").content("second").vector(vec2).build() +); +IngestResponse resp = client.bulkIngest(docs); ``` -### 📊 Status and Metrics +## Error Handling -```java -StatusResponse status = client.status(); -System.out.println("Documents: " + status.documentCount()); -System.out.println("SIMD: " + status.simd()); +The SDK throws typed exceptions: -MetricsResponse metrics = client.metrics(); -System.out.println("QPS: " + metrics.queriesPerSecond()); -``` - ---- - -### ⚠️ Error Handling +| Exception | Cause | +|-----------|-------| +| `SpectorConnectionException` | Server unreachable | +| `SpectorApiException` | HTTP 4xx/5xx response | +| `SpectorTimeoutException` | Request timeout exceeded | ```java try { client.search(request); } catch (SpectorApiException e) { - // HTTP 4xx/5xx from server System.err.println("HTTP " + e.statusCode() + ": " + e.message()); } catch (SpectorConnectionException e) { - // Server unreachable System.err.println("Cannot connect to " + e.endpoint()); -} catch (SpectorTimeoutException e) { - // Request timed out - System.err.println("Timeout after " + e.timeout()); -} -``` - -### ♻️ Resource Management - -The client implements `AutoCloseable`: - -```java -try (SpectorClient client = SpectorClient.builder().build()) { - // Use client... 
-} // Connections released automatically -``` - ---- - -## ⚡ SpectorEngine (Embedded Usage) - -For applications that want in-process search without network overhead: - -### 🔧 Creating an Engine - -```java -import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.engine.SpectorConfig; - -var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(100_000) - .withSimilarityFunction(SimilarityFunction.COSINE) - .withGpu(true) // optional GPU - .withReranker("http://localhost:11434", "llama3.2", 20); // optional LLM - -try (var engine = new SpectorEngine(config)) { - // Engine is ready — sub-millisecond search, zero network overhead -} -``` - -### 📥 Ingesting - -```java -// With pre-computed vector -engine.ingest("doc-1", "Document content here", embedding); -// The engine handles BM25 indexing, HNSW insertion, and storage automatically -``` - -### 🔍 Searching - -```java -// Hybrid search (keyword + vector) -SearchResponse response = engine.hybridSearch("search query", queryVector, 10); - -// Keyword-only -SearchResponse response = engine.keywordSearch("exact phrase", 10); - -// Vector-only -SearchResponse response = engine.vectorSearch(queryVector, 10); - -// Process results -for (ScoredResult result : response.results()) { - System.out.printf("%s → %.4f%n", result.id(), result.score()); -} -``` - -### 🗑️ Deleting - -```java -engine.delete("doc-1"); -``` - ---- - -## 🎯 Complete Example - -```java -import com.spectrayan.spector.client.SpectorClient; -import com.spectrayan.spector.client.model.*; - -public class SpectorExample { - public static void main(String[] args) throws Exception { - try (SpectorClient client = SpectorClient.builder() - .host("localhost") - .port(7070) - .build()) { - - // Ingest documents - client.ingest(IngestRequest.builder() - .id("java-1") - .title("Virtual Threads") - .content("Java virtual threads enable millions of concurrent tasks") - .vector(new float[]{0.9f, 0.1f, 0.3f, 0.7f, 0.5f}) - .build()); - 
- client.ingest(IngestRequest.builder() - .id("java-2") - .title("Vector API") - .content("The Vector API provides SIMD acceleration for math operations") - .vector(new float[]{0.2f, 0.8f, 0.4f, 0.1f, 0.6f}) - .build()); - - // Search - SearchResponse results = client.search(SearchRequest.builder() - .text("SIMD acceleration") - .topK(5) - .build()); - - System.out.println("Results:"); - for (var r : results.results()) { - System.out.printf(" %s (%.4f): %s%n", r.id(), r.score(), r.title()); - } - - // Cleanup - client.delete("java-1"); - client.delete("java-2"); - } - } } ``` ---- +## Thread Safety -## 🔗 See Also +`SpectorClient` is thread-safe. It uses Java's `HttpClient` with a connection pool (default 10 connections). You can safely share a single instance across multiple threads. -- [REST API Reference](../api-reference/rest-endpoints.md) — Underlying API endpoints +## Configuration -- [Spring AI Integration](spring-ai.md) — Spring AI VectorStore adapter - -- [Configuration Guide](../configuration/parameters.md) — All engine parameters - -- [Getting Started](../getting-started/quickstart.md) — Quick start guide \ No newline at end of file +| Option | Default | Description | +|--------|---------|-------------| +| `host` | localhost | Server hostname | +| `port` | 7070 | Server port | +| `apiKey` | — | Authentication key | +| `connectTimeout` | 10s | Connection timeout | +| `requestTimeout` | 30s | Request timeout | +| `maxConnections` | 10 | Connection pool size | diff --git a/docs/docs/sdk-usage/mcp-server.md b/docs/docs/sdk-usage/mcp-server.md deleted file mode 100644 index 5779bdd..0000000 --- a/docs/docs/sdk-usage/mcp-server.md +++ /dev/null @@ -1,301 +0,0 @@ -# 🤖 MCP Server Usage Guide - -> **Connect any AI agent to Spector's search engine in minutes.** - -This guide covers practical setup for Claude Desktop, Cursor IDE, and custom MCP clients. - ---- - -## Quick Start (3 Steps) - -### 1. 
Build the Distribution JAR - -```bash -cd spector -mvn package -pl spector-dist -am -DskipTests -``` - -The fat JAR is produced at `spector-dist/target/spector.jar`. - -### 2. Configure Your AI Agent - -Add the following to your agent's MCP configuration (see per-agent sections below): - -```json -{ - "mcpServers": { - "spector": { - "command": "java", - "args": [ - "--add-modules", "jdk.incubator.vector", - "--enable-native-access=ALL-UNNAMED", - "--enable-preview", - "-jar", "/path/to/spector-dist/target/spector.jar", - "--config", "/path/to/spector.yml" - ] - } - } -} -``` - -### 3. Start Using - -Your AI agent now has access to up to 13 tools. With cognitive memory enabled (`spector.memory.enabled: true`), all 13 tools are registered. Otherwise, the 6 search tools are available: - -- *"Search for documents about SIMD acceleration"* → `semantic_search` -- *"Find articles mentioning 'Panama' and related to memory management"* → `hybrid_search` -- *"What does the codebase say about quantization?"* → `rag_query` -- *"Add this document to the index: ..."* → `ingest_document` -- *"Remember that the user prefers dark mode"* → `core_memory_append` -- *"What do you remember about the user's preferences?"* → `recall_context` - ---- - -## CLI Options - -| Flag | Default | Description | -|:---|:---|:---| -| `--config ` | *(none)* | Explicit config file (YAML or .properties) | -| `--profile ` | *(none)* | Configuration profile (loads `spector-{profile}.yml`) | -| `--dims ` | 384 | Vector dimensionality (must match your embedding model) | -| `--capacity ` | 100,000 | Maximum document capacity | -| `--data-dir ` | *(none)* | Persistence directory (auto-enables DISK mode) | -| `--ollama-url ` | *(none)* | Ollama embedding server URL (e.g., `http://localhost:11434`) | -| `--ollama-model ` | *(none)* | Ollama embedding model name (e.g., `nomic-embed-text`) | -| `--help`, `-h` | — | Show help message | - -> [!TIP] -> **Recommended approach:** Use a `spector.yml` config file 
rather than CLI flags. CLI flags override values from the config file. - -### Configuration File - -All settings can be specified in a `spector.yml` file: - -```yaml -spector: - engine: - dimensions: 768 - capacity: 100000 - persistence-mode: DISK - data-directory: .spector/index - embedding: - model: nomic-embed-text - base-url: http://localhost:11434 - memory: - enabled: true # Enable cognitive memory tools - persistence-path: .spector/memory -``` - -See the [Configuration Guide](../configuration/parameters.md) for the complete list of settings. - -### Choosing Dimensions - -The `--dims` flag must match your embedding model's output dimensionality: - -| Model | Dimensions | Flag | -|:---|:---|:---| -| `nomic-embed-text` | 768 | `--dims 768` | -| `all-minilm` | 384 | `--dims 384` | -| `mxbai-embed-large` | 1024 | `--dims 1024` | -| `qwen3-embedding` | 4096 | `--dims 4096` | - ---- - -## Agent Configuration - -### Claude Desktop - -Edit your `claude_desktop_config.json`: - -=== "macOS" - - ``` - ~/Library/Application Support/Claude/claude_desktop_config.json - ``` - -=== "Windows" - - ``` - %APPDATA%\Claude\claude_desktop_config.json - ``` - -=== "Linux" - - ``` - ~/.config/Claude/claude_desktop_config.json - ``` - -**Configuration:** - -```json -{ - "mcpServers": { - "spector": { - "command": "java", - "args": [ - "--add-modules", "jdk.incubator.vector", - "--enable-native-access=ALL-UNNAMED", - "--enable-preview", - "-jar", "/absolute/path/to/spector.jar", - "--config", "/absolute/path/to/spector.yml" - ] - } - } -} -``` - -> [!TIP] -> Use absolute paths for the JAR file. Relative paths may not resolve correctly from Claude Desktop's working directory. 
- -### Cursor IDE - -Add to your Cursor MCP settings (`.cursor/mcp.json` in your project, or global settings): - -```json -{ - "mcpServers": { - "spector": { - "command": "java", - "args": [ - "--add-modules", "jdk.incubator.vector", - "--enable-native-access=ALL-UNNAMED", - "--enable-preview", - "-jar", "/absolute/path/to/spector.jar", - "--config", "/absolute/path/to/spector.yml" - ] - } - } -} -``` - -### Custom MCP Clients - -Any application implementing the [MCP client specification](https://modelcontextprotocol.io/docs/concepts/clients) can connect to Spector. The server communicates via **JSON-RPC 2.0 over stdio** (stdin/stdout). - -**Key requirements:** - -1. Spawn the Java process with the correct JVM flags -2. Write JSON-RPC messages to the process's stdin -3. Read JSON-RPC responses from the process's stdout -4. All logging goes to stderr (stdout is reserved for protocol messages) - -**Example initialization sequence:** - -```json -// Client → Server -{"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {"protocolVersion": "2025-03-26", "capabilities": {}, "clientInfo": {"name": "my-app", "version": "1.0"}}} - -// Server → Client -{"jsonrpc": "2.0", "id": 1, "result": {"protocolVersion": "2025-03-26", "capabilities": {"tools": {}}, "serverInfo": {"name": "spector-mcp", "version": "0.1.0"}}} - -// Client → Server -{"jsonrpc": "2.0", "method": "notifications/initialized"} -``` - ---- - -## MCP Tools Overview - -Once connected, your agent has access to these tools: - -### Search Tools (always available) - -| Tool | Description | Requires Embedding | -|:---|:---|:---| -| `semantic_search` | Vector similarity search | ✅ | -| `hybrid_search` | Keyword + vector with RRF fusion | Partial (keyword mode works without) | -| `rag_query` | Retrieval-Augmented Generation context | ✅ | -| `ingest_document` | Add documents to the index | ✅ (for auto-embedding) | -| `delete_document` | Remove documents by ID | ❌ | -| `engine_status` | Engine capabilities and 
stats | ❌ | - -### Cognitive Memory Tools (enabled via `spector.memory.enabled: true`) - -| Tool | Description | -|:---|:---| -| `core_memory_append` | Store a semantic memory with tags and source | -| `recall_context` | Cognitive recall with fused scoring across tiers | -| `memory_status` | Memory tier counts and persistence info | -| `memory_reinforce` | Report positive/negative outcome for a memory | -| `memory_forget` | Tombstone a memory by ID | -| `memory_introspect` | Metamemory self-analysis on a topic | -| `working_memory_scratchpad` | Quick-write to working memory | - -> [!NOTE] -> For full tool schemas and parameter details, see the [MCP Integration Architecture](../architecture/mcp-integration.md#tool-reference) page. - ---- - -## Troubleshooting - -### Agent can't find or start the server - -- **Check the JAR path** — Use absolute paths, not relative -- **Check Java version** — Spector requires JDK 25+. Run `java -version` to verify -- **Check JVM flags** — `--add-modules jdk.incubator.vector` is required - -### "Embedding provider not configured" errors - -The `semantic_search` and `rag_query` tools require an embedding provider. Ensure: - -1. Ollama is running: `ollama serve` -2. The model is pulled: `ollama pull nomic-embed-text` -3. Both `--ollama-url` and `--ollama-model` are specified in the args - -### Stdout corruption / garbled output - -Spector redirects all logging to **stderr**. If you see garbled output: - -- Check that nothing else is writing to stdout -- Verify the logback configuration routes to stderr -- Check for print statements in any custom code - -### Performance issues - -- **High latency on first query** — The HNSW index is built lazily. First query triggers graph construction. Subsequent queries are fast. -- **Memory usage** — Vectors are stored off-heap. Monitor with `-XX:NativeMemoryTracking=summary` and `jcmd VM.native_memory summary` - ---- - -## Adding a New Tool - -To extend the MCP server with a custom tool: - -1. 
**Create a new class** extending `McpToolHandler`: - -```java -public final class MyCustomTool extends McpToolHandler { - @Override public String name() { return "my_custom_tool"; } - @Override public String description() { return "Does something useful."; } - @Override public Map inputSchema() { - return ToolSchemaBuilder.object() - .requiredString("input", "The input parameter.") - .build(); - } - @Override public CallToolResult execute(SpectorEngine engine, Map args) { - String input = requireString(args, "input"); - // Your logic here - return textResult("Result: " + input); - } -} -``` - -2. **Register it** in `SpectorToolRegistry.handlers()`: - -```java -List.of( - new SemanticSearchTool(), - // ... existing tools ... - new MyCustomTool() // ← add here -); -``` - -That's it — the tool is automatically available to all connected agents. - ---- - -## See Also - -- [MCP Integration Architecture](../architecture/mcp-integration.md) — Module structure, data flow, and performance analysis -- [Architecture Overview](../architecture/overview.md) — Full system architecture -- [REST API Reference](../api-reference/rest-endpoints.md) — Alternative HTTP interface diff --git a/docs/docs/sdk-usage/spring-ai.md b/docs/docs/sdk-usage/spring-ai.md deleted file mode 100644 index 32e2aea..0000000 --- a/docs/docs/sdk-usage/spring-ai.md +++ /dev/null @@ -1,336 +0,0 @@ -# 🌱 Spring AI Integration - -> **Seamlessly integrate Spector into your Spring AI applications.** The `spector-spring` module implements Spring AI's `VectorStore` interface, giving you access to filter expressions, RAG patterns, and the full Spring AI ecosystem backed by sub-millisecond search. 
- ---- - -## 📦 Maven Dependency - -```xml - - com.spectrayan - spector-spring - 1.0-SNAPSHOT - -``` - -Spring AI dependencies (BOM recommended): - -```xml - - - - org.springframework.ai - spring-ai-bom - 1.0.0 - pom - import - - - -``` - ---- - -## ⚡ Configuration Modes - -```mermaid -graph LR - subgraph "🏠 Embedded Mode" - A[Your App] --> B[SpectorVectorStore] - B --> C[SpectorEngine
In-process, zero latency] - end - - subgraph "🌐 Remote Mode" - D[Your App] --> E[SpectorVectorStore] - E --> F[SpectorClient
REST to server] - F --> G[Spector Server] - end -``` - -### 🏠 Embedded Mode (In-Process) - -Use the SpectorEngine directly — no network, lowest latency: - -```java -import org.springframework.ai.vectorstore.spector.SpectorVectorStore; -import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.engine.SpectorConfig; - -@Configuration -public class VectorStoreConfig { - - @Bean - public SpectorEngine spectorEngine() { - var config = SpectorConfig.DEFAULT - .withDimensions(384) - .withCapacity(100_000); - return new SpectorEngine(config); - } - - @Bean - public VectorStore vectorStore(SpectorEngine engine) { - return new SpectorVectorStore(engine); - } -} -``` - -### 🌐 Remote Mode (Client SDK) - -Connect to a running Spector server: - -```java -import com.spectrayan.spector.client.SpectorClient; - -@Configuration -public class VectorStoreConfig { - - @Bean - public SpectorClient spectorClient() { - return SpectorClient.builder() - .host("spector-node.internal") - .port(7070) - .apiKey("my-api-key") - .build(); - } - - @Bean - public VectorStore vectorStore(SpectorClient client) { - return new SpectorVectorStore(client); - } -} -``` - ---- - -## 📄 Adding Documents - -```java -import org.springframework.ai.document.Document; -import org.springframework.ai.vectorstore.VectorStore; - -@Service -public class DocumentService { - - private final VectorStore vectorStore; - - public DocumentService(VectorStore vectorStore) { - this.vectorStore = vectorStore; - } - - public void addDocuments() { - List documents = List.of( - new Document("HNSW enables fast approximate nearest neighbor search", - Map.of("source", "architecture.md", "category", "indexing")), - new Document("BM25 provides keyword scoring with term frequency saturation", - Map.of("source", "algorithms.md", "category", "search")), - new Document("Virtual threads allow millions of concurrent operations", - Map.of("source", "concurrency.md", "category", "runtime")) - ); - - 
vectorStore.add(documents); - } -} -``` - ---- - -## 🔍 Similarity Search - -### Basic Search - -```java -List results = vectorStore.similaritySearch("nearest neighbor search"); -``` - -### Search with Parameters - -```java -import org.springframework.ai.vectorstore.SearchRequest; - -List results = vectorStore.similaritySearch( - SearchRequest.query("vector search algorithms") - .withTopK(10) - .withSimilarityThreshold(0.7) -); -``` - -### 🎯 Filter Expressions - -SpectorVectorStore supports Spring AI's metadata filter expressions: - -```java -// Filter by category -List results = vectorStore.similaritySearch( - SearchRequest.query("search algorithms") - .withTopK(5) - .withFilterExpression("category == 'indexing'") -); - -// Complex filters -List results = vectorStore.similaritySearch( - SearchRequest.query("performance") - .withTopK(10) - .withFilterExpression("category == 'search' && source == 'algorithms.md'") -); -``` - -**Supported filter operators:** - -| Operator | Example | -|----------|---------| -| `==` | `category == 'search'` | -| `!=` | `category != 'draft'` | -| `>`, `>=`, `<`, `<=` | `version > 2` | -| `&&` | `a == 'x' && b == 'y'` | -| `\|\|` | `a == 'x' \|\| a == 'y'` | -| `in` | `category in ['search', 'index']` | -| `not in` | `status not in ['archived']` | - ---- - -## 🗑️ Deleting Documents - -```java -vectorStore.delete(List.of("doc-id-1", "doc-id-2")); -``` - ---- - -## 🤖 RAG Service - -The `SpectorRagService` provides end-to-end retrieval-augmented generation: - -```java -import org.springframework.ai.vectorstore.spector.rag.SpectorRagService; - -@Service -public class AiAssistant { - - private final SpectorRagService ragService; - - public AiAssistant(SpectorRagService ragService) { - this.ragService = ragService; - } - - public String getContext(String userQuery) { - RagConfig config = new RagConfig( - 10, // topK - 0.7f, // similarity threshold - 4096 // token limit - ); - - RetrievalResult result = ragService.retrieve(userQuery, config); - 
return result.contextText(); - } -} -``` - -### 💬 RAG with Spring AI ChatClient - -```java -@Service -public class RagChatService { - - private final ChatClient chatClient; - private final VectorStore vectorStore; - - public String ask(String question) { - return chatClient.prompt() - .system("Answer based on the provided context.") - .user(question) - .advisors(new QuestionAnswerAdvisor(vectorStore)) - .call() - .content(); - } -} -``` - -> [!TIP] -> Spring AI's `QuestionAnswerAdvisor` automatically retrieves relevant context from the VectorStore and includes it in the prompt — no manual context assembly needed. - ---- - -## ⚙️ Spring Boot Auto-Configuration - -Configure via `application.yml`: - -```yaml -spector: - search: - mode: embedded # or "remote" - dimensions: 384 - capacity: 100000 - # Remote mode settings - host: localhost - port: 7070 - api-key: ${SPECTOR_API_KEY:} -``` - ---- - -## ⚠️ Error Handling - -| Exception | Cause | -|-----------|-------| -| `SpectorVectorStoreException` | Connection failure, server error | -| `SpectorRagServiceException` | RAG pipeline errors | - -```java -try { - vectorStore.add(documents); -} catch (SpectorVectorStoreException e) { - log.error("Failed to add documents: {}", e.getMessage()); -} -``` - ---- - -## 🎯 Complete Example - -```java -@SpringBootApplication -public class SearchApp { - - @Bean - public VectorStore vectorStore() { - var engine = new SpectorEngine( - SpectorConfig.DEFAULT.withDimensions(384)); - return new SpectorVectorStore(engine); - } - - @Bean - CommandLineRunner demo(VectorStore store) { - return args -> { - // Add documents - store.add(List.of( - new Document("HNSW uses multi-layer graphs for fast ANN search", - Map.of("topic", "indexing")), - new Document("Product quantization compresses vectors 32x", - Map.of("topic", "compression")) - )); - - // Search with filter - var results = store.similaritySearch( - SearchRequest.query("compression techniques") - .withTopK(5) - .withFilterExpression("topic 
== 'compression'")); - - results.forEach(doc -> - System.out.println(doc.getContent())); - }; - } -} -``` - ---- - -## 🔗 See Also - -- [Java SDK Guide](java-client.md) — Direct SDK usage - -- [RAG Pipeline](../architecture/rag-pipeline.md) — How the RAG pipeline works internally - -- [REST API Reference](../api-reference/rest-endpoints.md) — Underlying REST endpoints - -- [Configuration Guide](../configuration/parameters.md) — All configurable parameters \ No newline at end of file diff --git a/docs/docs/stylesheets/extra.css b/docs/docs/stylesheets/extra.css deleted file mode 100644 index ec50d3c..0000000 --- a/docs/docs/stylesheets/extra.css +++ /dev/null @@ -1,79 +0,0 @@ -/* Center mermaid diagrams */ -.mermaid { - text-align: center; -} - -.mermaid svg { - margin: 0 auto; - display: block; -} - -/* Fix mermaid diagrams on dark theme */ -[data-md-color-scheme="slate"] .mermaid { - --md-mermaid-font-family: var(--md-text-font-family, _); -} - -/* Ensure mermaid nodes are readable on dark backgrounds */ -[data-md-color-scheme="slate"] .mermaid .node rect, -[data-md-color-scheme="slate"] .mermaid .node polygon, -[data-md-color-scheme="slate"] .mermaid .node circle { - fill: #1e1e2e !important; - stroke: #6c6c8a !important; -} - -[data-md-color-scheme="slate"] .mermaid .node .label, -[data-md-color-scheme="slate"] .mermaid span { - color: #cdd6f4 !important; - fill: #cdd6f4 !important; -} - -[data-md-color-scheme="slate"] .mermaid .edgePath .path { - stroke: #6c6c8a !important; -} - -[data-md-color-scheme="slate"] .mermaid .edgeLabel { - background-color: #1e1e2e !important; - color: #cdd6f4 !important; -} - -[data-md-color-scheme="slate"] .mermaid .cluster rect { - fill: #181825 !important; - stroke: #45475a !important; -} - -[data-md-color-scheme="slate"] .mermaid .cluster span { - color: #a6adc8 !important; -} - -/* Sequence diagram dark theme fixes */ -[data-md-color-scheme="slate"] .mermaid .actor { - fill: #1e1e2e !important; - stroke: #6c6c8a !important; -} - 
-[data-md-color-scheme="slate"] .mermaid text.actor { - fill: #cdd6f4 !important; -} - -[data-md-color-scheme="slate"] .mermaid .messageLine0, -[data-md-color-scheme="slate"] .mermaid .messageLine1 { - stroke: #6c6c8a !important; -} - -[data-md-color-scheme="slate"] .mermaid .messageText { - fill: #cdd6f4 !important; -} - -[data-md-color-scheme="slate"] .mermaid .note { - fill: #313244 !important; - stroke: #45475a !important; -} - -[data-md-color-scheme="slate"] .mermaid .noteText { - fill: #cdd6f4 !important; -} - -/* Flowchart dark theme fixes */ -[data-md-color-scheme="slate"] .mermaid .flowchart-link { - stroke: #6c6c8a !important; -} diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 509697a..f879e6b 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1,13 +1,11 @@ -site_name: Spector Documentation -site_description: The Zero-Overhead, Agent-Ready AI Memory Backbone -site_url: https://spectrayan.github.io/spector/ -repo_url: https://github.com/spectrayan/spector -repo_name: spectrayan/spector +site_name: Spector Search Documentation +site_description: Ultra-fast, SIMD-accelerated semantic search engine built on Java Vector API +site_url: https://spectrayan.github.io/spector-search/ +repo_url: https://github.com/spectrayan/spector-search +repo_name: spectrayan/spector-search theme: name: material - icon: - logo: material/lightning-bolt palette: - scheme: default primary: indigo @@ -22,182 +20,44 @@ theme: icon: material/brightness-4 name: Switch to light mode features: - # Navigation - - navigation.tabs # Top-level tabs - - navigation.tabs.sticky # Tabs stay visible on scroll - - navigation.sections # Bold section headers in sidebar - - navigation.expand # Auto-expand sidebar sections - - navigation.top # "Back to top" button on scroll - - navigation.instant # Single-page app feel (no full reload) - - navigation.instant.progress # Loading progress bar - - navigation.tracking # URL updates as you scroll sections - - navigation.indexes # Section index pages 
- - navigation.footer # Previous/Next page links at bottom - - navigation.path # Breadcrumbs above page title - # Search - - search.suggest # Autocomplete suggestions - - search.highlight # Highlight search terms on page - - search.share # Shareable search links - # Content - - content.code.copy # Copy button on code blocks - - content.code.annotate # Inline code annotations - - content.tabs.link # Linked content tabs across page - - content.tooltips # Rich tooltips on hover - # TOC - - toc.follow # TOC follows scroll position + - navigation.tabs + - navigation.sections + - navigation.expand + - navigation.top + - search.suggest + - search.highlight + - content.code.copy + - content.tabs.link plugins: - search - - callouts markdown_extensions: - - pymdownx.arithmatex: - generic: true - pymdownx.highlight: anchor_linenums: true - line_spans: __span - - pymdownx.inlinehilite - - pymdownx.snippets: - base_path: - - docs/docs - - .. # Repo root — enables --8<-- "spector-core/README.md" - check_paths: true - - pymdownx.superfences: - custom_fences: - - name: mermaid - class: mermaid - format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.superfences - pymdownx.tabbed: alternate_style: true - - pymdownx.emoji: - emoji_index: !!python/name:material.extensions.emoji.twemoji - emoji_generator: !!python/name:material.extensions.emoji.to_svg - - pymdownx.tasklist: - custom_checkbox: true - - pymdownx.keys # Render keyboard shortcuts like ++ctrl+c++ - - pymdownx.mark # ==highlighted text== - - pymdownx.critic # Track changes markup - - pymdownx.caret # ^^superscript^^ - - pymdownx.tilde # ~~strikethrough~~ and ~subscript~ - - pymdownx.smartsymbols # (c) → ©, (tm) → ™, etc. 
- admonition - pymdownx.details - attr_list - md_in_html - - def_list # Definition lists - - footnotes # Footnote references - - abbr # Abbreviation tooltips - - tables - toc: permalink: true - toc_depth: 3 - - nav: - Home: index.md - - About: about.md - Getting Started: - Quick Start: getting-started/quickstart.md - Installation: getting-started/installation.md - - JDK API Status: getting-started/jdk-api-status.md + - API Reference: + - Overview: api-reference/overview.md + - REST Endpoints: api-reference/rest-endpoints.md + - Configuration: + - Parameters: configuration/parameters.md - Architecture: - System Overview: architecture/overview.md - - Core Concepts: architecture/core-concepts.md - - MCP Integration: architecture/mcp-integration.md - - Ingestion Pipeline: architecture/ingestion-pipeline.md - - RAG Pipeline: architecture/rag-pipeline.md - - Distributed Mode: architecture/distributed-mode.md - - GPU Acceleration: architecture/gpu-acceleration.md - - Modules: - - Overview: modules/index.md - - spector-core: modules/spector-core.md - - spector-commons: modules/spector-commons.md - - spector-config: modules/spector-config.md - - spector-storage: modules/spector-storage.md - - spector-embed-api: modules/spector-embed-api.md - - spector-embed-ollama: modules/spector-embed-ollama.md - - spector-index: modules/spector-index.md - - spector-query: modules/spector-query.md - - spector-gpu: modules/spector-gpu.md - - spector-rag: modules/spector-rag.md - - spector-engine: modules/spector-engine.md - - spector-ingestion: modules/spector-ingestion.md - - spector-memory: modules/spector-memory.md - - spector-runtime: modules/spector-runtime.md - - spector-node: modules/spector-node.md - - spector-mcp: modules/spector-mcp.md - - spector-cli: modules/spector-cli.md - - spector-client: modules/spector-client.md - - spector-spring: modules/spector-spring.md - - spector-metrics: modules/spector-metrics.md - - spector-bench: modules/spector-bench.md - - spector-dist: 
modules/spector-dist.md - - spector-cortex: modules/spector-cortex.md - - Deep Dives: - - ANN Search Primer: deep-dives/ann-search-primer.md - - HNSW Explained: deep-dives/hnsw-explained.md - - SpectorIndex Architecture: deep-dives/spector-index-architecture.md - - SVASQ Quantization: deep-dives/svasq-deep-dive.md - - Understanding Quantization: deep-dives/understanding-quantization.md - - Quantization Comparison: deep-dives/quantization-comparison.md - - TurboQuant: deep-dives/turbo-quant.md - - Real-Embedding Benchmarks: deep-dives/real-embedding-benchmarks.md - - "Whitepaper: SVASQ + SpectorIndex": deep-dives/svasq-spectorindex-whitepaper.md - - "🧠 Cognitive Memory": - - Overview: memory/index.md - - Getting Started: memory/getting-started.md - - Architecture: - - System Architecture: memory/architecture.md - - The 6-Phase Scoring Pipeline: memory/scoring-pipeline.md - - Biological Systems: - - Overview: memory/biological-systems.md - - "Cortex — Tier Stores": memory/cortex.md - - "Hippocampus — Sleep Consolidation": memory/hippocampus.md - - "Synapse — Tags & Scoring": memory/synapse.md - - "Dopamine — Surprise Detection": memory/dopamine.md - - "Amygdala — Emotional Valence": memory/amygdala.md - - "3-Layer Cognitive Graph": memory/hebbian.md - - "Habituation — Anti-Filter Bubble": memory/habituation.md - - "Inhibition — Suppression": memory/inhibition.md - - "Interference — Deduplication": memory/interference.md - - "Prospective — Future Intents": memory/prospective.md - - "Metamemory — Self-Reflection": memory/metamemory.md - - "Sync — Persistence & Replication": memory/sync.md - - Advanced Profiles: - - Cognitive Profiles Overview: memory/cognitive-profiles.md - - "Focus Mode": memory/focus-mode.md - - "Explorer — Lateral Retrieval": memory/lateral-retrieval.md - - "Importance Fusion (ICNU)": memory/importance-fusion.md - - Deep Dives: - - "Performance & SIMD": memory/performance.md - - "Off-Heap Panama Design": memory/panama-design.md - - "WAL Design": 
memory/wal-design.md - - API Reference: memory/api-reference.md - - - "🧬 Cortex Dashboard": - - Overview: cortex/index.md - - - Reference: - - REST API: api-reference/rest-endpoints.md - - MCP Server: sdk-usage/mcp-server.md - - Java SDK: sdk-usage/java-client.md - - Spring AI Integration: sdk-usage/spring-ai.md - - CLI (spectorctl): cli-reference/spectorctl.md - - Configuration: configuration/parameters.md - - Operations: - - Performance Tuning: operations/performance-tuning.md - - Contributing: operations/contributing.md - - FAQ: faq.md - - Roadmap: roadmap.md - - "🔬 Labs": - - labs/index.md - - Research Roadmap: labs/roadmap.md - -extra_css: - - stylesheets/extra.css - -extra_javascript: - - javascripts/mermaid-init.js - - javascripts/mathjax.js - - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js + - SDK Usage: + - Java Client SDK: sdk-usage/java-client.md + - CLI Reference: + - spectorctl: cli-reference/spectorctl.md diff --git a/docs/screenshots/spector-cortex-dashboard.png b/docs/screenshots/spector-cortex-dashboard.png deleted file mode 100644 index dc8dcfd..0000000 Binary files a/docs/screenshots/spector-cortex-dashboard.png and /dev/null differ diff --git a/goal.md b/goal.md new file mode 100644 index 0000000..97d9357 --- /dev/null +++ b/goal.md @@ -0,0 +1,68 @@ +# **Spector‑Search** +**Ultra‑fast, SIMD‑accelerated semantic search engine built on Java Vector API + modern JVM technologies.** + +Spector‑Search is a high‑performance search engine designed for the next generation of intelligent applications. It combines **Java's Vector API**, **virtual threads**, and **zero‑copy memory** to deliver blazing‑fast indexing and retrieval across large text corpora and vector embeddings. + +Built for developers who want **NumPy‑level performance** with the reliability, safety, and scalability of the JVM. 
+ +--- + +## 🚀 **Key Features** + +### **⚡ SIMD‑Accelerated Query Execution** +Powered by the Java Vector API (AVX2/AVX‑512/NEON/SVE), Spector‑Search performs vector math, scoring, and similarity computations at hardware speed. + +### **🧠 Semantic Search Ready** +Supports embedding‑based retrieval (cosine similarity, dot‑product ranking) and integrates cleanly with any embedding generator or LLM. + +### **🧵 Massive Concurrency with Virtual Threads** +Java Loom enables millions of lightweight concurrent search tasks without the overhead of traditional thread pools. + +### **🧩 Zero‑Copy Memory Architecture** +Uses Panama Memory Segments for high‑throughput indexing, caching, and vector storage. + +### **📦 Pluggable Indexing Pipeline** +Custom analyzers, tokenizers, and embedding pipelines allow you to tailor search behavior to your domain. + +### **🔍 Hybrid Search** +Combine keyword search + vector search for best‑of‑both‑worlds retrieval. + +### **🛠 JVM‑Native Performance** +No Python, no JNI overhead — pure Java, optimized by the JIT and Graal. 
+ +--- + +## 🧪 **Use Cases** + +- High‑performance document search +- Embedding/vector similarity search +- LLM‑augmented retrieval (RAG) +- Real‑time log or event search +- On‑device or edge semantic search +- Custom search engines for enterprise data + +--- + +## 🏗 **Tech Stack** + +- **Java 25** +- **Java Vector API (SIMD)** +- **Virtual Threads (Project Loom)** +- **Foreign Function & Memory API (Panama)** +- **Custom SIMD‑optimized math kernels** +- **CUDA GPU acceleration (optional)** +- **gRPC distributed search** + +--- + +## 📈 **Roadmap** + +- [x] GPU acceleration via CUDA bindings +- [x] HNSW / IVF / PQ vector index +- [x] Distributed search nodes +- [x] LLM‑powered ranking +- [x] REST API with CORS, auth, metrics +- [x] Embedding provider SPI (Ollama) +- [x] Document deletion + bulk ingest +- [x] gRPC TLS support +- [ ] WASM runtime for edge deployment diff --git a/pom.xml b/pom.xml index 4f0f760..0b4237d 100644 --- a/pom.xml +++ b/pom.xml @@ -5,13 +5,13 @@ 4.0.0 com.spectrayan - spector + spector-search 0.1.0-SNAPSHOT pom - Spector + Spector Search Ultra-fast, SIMD-accelerated semantic search engine built on Java Vector API + modern JVM technologies. 
- https://github.com/spectrayan/spector + https://github.com/spectrayan/spector-search @@ -24,26 +24,19 @@ spector-commons spector-core - spector-config spector-storage spector-index spector-query spector-embed-api spector-embed-ollama spector-gpu - spector-rag spector-engine - spector-ingestion - spector-memory - spector-metrics - spector-runtime - spector-node + spector-server + spector-cluster spector-bench spector-cli spector-client spector-spring - spector-mcp - spector-dist @@ -60,17 +53,10 @@ 6.6.0 - 3.1.3 - 2.21.3 + 2.18.3 2.0.17 1.5.18 1.37 - 2.0.0-M3 - 2.11.0 - 1.9.4 - 2.3 - 1.14.5 - 1.31.3 5.11.4 @@ -82,7 +68,6 @@ 3.4.2 3.6.0 0.8.12 - 5.0.0 2024-01-01T00:00:00Z @@ -92,11 +77,6 @@ - - com.spectrayan - spector-config - ${project.version} - com.spectrayan spector-core @@ -122,16 +102,6 @@ spector-engine ${project.version} - - com.spectrayan - spector-ingestion - ${project.version} - - - com.spectrayan - spector-rag - ${project.version} - com.spectrayan spector-commons @@ -164,83 +134,15 @@ com.spectrayan - spring-ai-starter-vector-store-spector - ${project.version} - - - com.spectrayan - spector-mcp - ${project.version} - - - com.spectrayan - spector-memory - ${project.version} - - - com.spectrayan - spector-metrics - ${project.version} - - - com.spectrayan - spector-node - ${project.version} - - - com.spectrayan - spector-runtime + spring-ai-starter-vector-store-spector-search ${project.version} - - - io.micrometer - micrometer-core - ${micrometer.version} - - - io.micrometer - micrometer-registry-prometheus - ${micrometer.version} - - - - - io.modelcontextprotocol.sdk - mcp - ${mcp-sdk.version} - - - - - tools.jackson.core - jackson-databind - ${jackson.version} - - - + com.fasterxml.jackson.core jackson-databind - ${jackson2.version} - - - - - org.apache.commons - commons-configuration2 - ${commons-configuration2.version} - - - commons-beanutils - commons-beanutils - ${commons-beanutils.version} - - - org.yaml - snakeyaml - ${snakeyaml.version} + 
${jackson.version} @@ -255,31 +157,12 @@ ${logback.version} - + io.javalin javalin ${javalin.version} - - - - com.linecorp.armeria - armeria-bom - ${armeria.version} - pom - import - - - com.linecorp.armeria - armeria - ${armeria.version} - - - com.linecorp.armeria - armeria-grpc - ${armeria.version} - io.javalin javalin-testtools @@ -401,28 +284,6 @@ - - - - com.mycila - license-maven-plugin - ${license-maven-plugin.version} - - - 2026 - - - -
src/license/apache2-header.txt
- - src/main/java/**/*.java - src/test/java/**/*.java - -
-
- false -
-
@@ -439,10 +300,6 @@ org.jacoco jacoco-maven-plugin - - com.mycila - license-maven-plugin - diff --git a/scripts/collect-labs.sh b/scripts/collect-labs.sh deleted file mode 100755 index 1671d18..0000000 --- a/scripts/collect-labs.sh +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env bash -# ═══════════════════════════════════════════════════════════════════════ -# collect-labs.sh — Auto-discover labs/* branches and generate docs -# ═══════════════════════════════════════════════════════════════════════ -# -# This script is run by CI before `mkdocs build`. It: -# 1. Discovers all remote branches matching `origin/labs/*` -# 2. Extracts LABS.md from each branch via `git show` -# 3. Copies each LABS.md into docs/docs/labs/.md -# 4. Auto-generates docs/docs/labs/index.md with overview cards -# -# Convention: Each labs/* branch must have a LABS.md at the repo root. -# - Line 1: `# ` (becomes the nav entry and card title) -# - Lines 3-5: First paragraph (becomes the overview blurb) -# -# Usage: -# ./scripts/collect-labs.sh # Run from repo root -# ./scripts/collect-labs.sh --dry-run # Preview without writing files -# -set -eu - -DOCS_LABS_DIR="docs/docs/labs" -DRY_RUN=false - -if [[ "${1:-}" == "--dry-run" ]]; then - DRY_RUN=true - echo "🔍 Dry run mode — no files will be written" -fi - -# ─── Ensure we have remote branch info ─────────────────────────────── -git fetch --prune origin 'refs/heads/labs/*:refs/remotes/origin/labs/*' 2>/dev/null || true - -# ─── Discover all labs branches ────────────────────────────────────── -LABS_BRANCHES=$(git branch -r --list 'origin/labs/*' 2>/dev/null | sed 's/^ *//' | sort) - -if [[ -z "$LABS_BRANCHES" ]]; then - echo "ℹ️ No labs/* branches found. Skipping labs docs generation." - # Create minimal index if the nav references it - mkdir -p "$DOCS_LABS_DIR" - cat > "$DOCS_LABS_DIR/index.md" << 'EOF' -# 🔬 Labs - -> **Experimental branches exploring cutting-edge JVM features and research ideas.** - -No active lab branches found. 
When a `labs/*` branch is pushed with a `LABS.md` file, -it will automatically appear here. - -Check the [Roadmap](../roadmap.md) for planned experiments. -EOF - exit 0 -fi - -echo "🔬 Discovered lab branches:" -echo "$LABS_BRANCHES" | sed 's/^/ /' - -# ─── Create output directory ──────────────────────────────────────── -if [[ "$DRY_RUN" == "false" ]]; then - mkdir -p "$DOCS_LABS_DIR" -fi - -# ─── Collect LABS.md from each branch ──────────────────────────────── -declare -a LAB_ENTRIES=() - -for BRANCH in $LABS_BRANCHES; do - # Extract branch short name: origin/labs/valhalla → valhalla - SHORT_NAME="${BRANCH#origin/labs/}" - SAFE_NAME=$(echo "$SHORT_NAME" | tr '/' '-') - - echo " 📄 Processing: $BRANCH → labs/$SAFE_NAME.md" - - # Try to extract LABS.md from this branch - CONTENT=$(git show "$BRANCH:LABS.md" 2>/dev/null) || { - echo " ⚠️ No LABS.md found in $BRANCH — skipping" - continue - } - - # Extract title (first H1 line) - TITLE=$(echo "$CONTENT" | grep -m1 '^# ' | sed 's/^# //') - if [[ -z "$TITLE" ]]; then - TITLE="Labs: $SHORT_NAME" - fi - - # Extract overview: first non-empty, non-heading paragraph after the title - # Skip lines starting with #, >, ---, or empty lines, then grab until next blank line - OVERVIEW=$(echo "$CONTENT" | awk ' - BEGIN { found_title=0; in_para=0 } - /^# / { found_title=1; next } - found_title && /^$/ && !in_para { next } - found_title && /^[>#\-\[]/ && !in_para { next } - found_title && /^.+$/ && !in_para { in_para=1; print; next } - in_para && /^.+$/ { print; next } - in_para && /^$/ { exit } - ') - - if [[ -z "$OVERVIEW" ]]; then - OVERVIEW="Experimental branch: \`labs/$SHORT_NAME\`" - else - # Collapse multi-line to single line (read <<< splits on newlines) - OVERVIEW=$(echo "$OVERVIEW" | tr '\n' ' ' | sed 's/ */ /g') - fi - - # Extract metadata (disable pipefail for grep pipelines) - STATUS=$(set +o pipefail; echo "$CONTENT" | grep -m1 'Status:' | sed 's/.*Status:[[:space:]]*//' | sed 's/[*]//g' | sed 's/^[[:space:]]*//') - [[ 
-z "$STATUS" ]] && STATUS="Experimental" - - LAST_UPDATED=$(set +o pipefail; git log -1 --format='%cd' --date=short "$BRANCH" 2>/dev/null) - [[ -z "$LAST_UPDATED" ]] && LAST_UPDATED="unknown" - - COMMIT_COUNT=$(set +o pipefail; git rev-list --count "origin/main..$BRANCH" 2>/dev/null) - [[ -z "$COMMIT_COUNT" ]] && COMMIT_COUNT="?" - - if [[ "$DRY_RUN" == "false" ]]; then - # Write the full LABS.md as a doc page, with metadata header - { - echo "---" - echo "title: \"$TITLE\"" - echo "---" - echo "" - echo "!!! warning \"Experimental Branch\"" - echo " This page is auto-generated from the \`labs/$SHORT_NAME\` branch." - echo " It requires a specialized JDK or environment. See build instructions below." - echo "" - echo "**Branch:** [\`labs/$SHORT_NAME\`](https://github.com/spectrayan/spector/tree/labs/$SHORT_NAME)" - echo "| **Last updated:** $LAST_UPDATED" - echo "| **Commits ahead of main:** $COMMIT_COUNT" - echo "" - echo "---" - echo "" - echo "$CONTENT" - } > "$DOCS_LABS_DIR/$SAFE_NAME.md" - fi - - # Collect entry for index page (use SOH as delimiter — pipe conflicts with markdown tables) - SEP=$'\x01' - LAB_ENTRIES+=("${SAFE_NAME}${SEP}${TITLE}${SEP}${OVERVIEW}${SEP}${STATUS}${SEP}${LAST_UPDATED}${SEP}${COMMIT_COUNT}") - - echo " ✅ Done: $TITLE" -done - -# ─── Generate index page ──────────────────────────────────────────── -if [[ "$DRY_RUN" == "false" ]]; then - INDEX_FILE="$DOCS_LABS_DIR/index.md" - - cat > "$INDEX_FILE" << 'HEADER' -# 🔬 Labs - -> **Experimental branches exploring cutting-edge JVM features and research ideas.** -> -> Each lab branch contains a self-contained experiment that may require specialized -> JDK builds or dependencies. Labs are automatically discovered from `labs/*` branches -> and documented here. - -!!! info "How Labs Work" - Any branch named `labs/<feature>` with a `LABS.md` file at the root is automatically - picked up by CI and rendered here. No manual editing of `main` required. 
- ---- - -HEADER - - for ENTRY in "${LAB_ENTRIES[@]}"; do - IFS=$'\x01' read -r SAFE_NAME TITLE OVERVIEW STATUS LAST_UPDATED COMMIT_COUNT <<< "$ENTRY" - - cat >> "$INDEX_FILE" << EOF -## [$TITLE]($SAFE_NAME.md) - -| | | -|---|---| -| **Branch** | [\`labs/$SAFE_NAME\`](https://github.com/spectrayan/spector/tree/labs/$SAFE_NAME) | -| **Status** | $STATUS | -| **Updated** | $LAST_UPDATED | -| **Commits** | $COMMIT_COUNT ahead of main | - -$OVERVIEW - -[:octicons-arrow-right-24: Full details]($SAFE_NAME.md){ .md-button } - ---- - -EOF - done - - echo "" - echo "✅ Generated $INDEX_FILE with ${#LAB_ENTRIES[@]} lab(s)" -fi - -echo "🔬 Labs collection complete: ${#LAB_ENTRIES[@]} lab(s) processed" diff --git a/scripts/ingest-docs.bat b/scripts/ingest-docs.bat deleted file mode 100644 index 25c4445..0000000 --- a/scripts/ingest-docs.bat +++ /dev/null @@ -1,28 +0,0 @@ -@echo off -REM ═══════════════════════════════════════════════════════════════ -REM Spector File Ingestion Script -REM Uses spectorctl to discover and ingest files via SpectorRuntime. -REM All configuration is read from spector.yml (or CLI overrides). -REM -REM Usage: scripts\ingest-docs.bat [--pattern "**\*.java"] [--root path] -REM ═══════════════════════════════════════════════════════════════ - -set SPECTOR_HOME=%~dp0.. 
-set JAR=%SPECTOR_HOME%\spector-dist\target\spector.jar -set CONFIG=%SPECTOR_HOME%\spector-local.yml - -if not exist "%JAR%" ( - echo [ERROR] Fat JAR not found: %JAR% - echo [INFO] Run: mvn package -pl spector-dist -am -DskipTests - exit /b 1 -) - -java ^ - -Xmx4g ^ - --add-modules jdk.incubator.vector ^ - --enable-native-access=ALL-UNNAMED ^ - --enable-preview ^ - -cp "%JAR%" ^ - com.spectrayan.spector.cli.SpectorCtl ^ - ingest --config "%CONFIG%" ^ - %* diff --git a/scripts/mcp-config.json b/scripts/mcp-config.json deleted file mode 100644 index 92798d0..0000000 --- a/scripts/mcp-config.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "mcpServers": { - "spector": { - "command": "java", - "args": [ - "--add-modules", - "jdk.incubator.vector", - "--enable-native-access=ALL-UNNAMED", - "-XX:+UseCompactObjectHeaders", - "-XX:+UnlockDiagnosticVMOptions", - "-XX:+UseVectorizedMismatch", - "--enable-preview", - "-jar", - "/path/to/spector-dist/target/spector.jar", - "--config", - "/path/to/spector-local.yml" - ] - } - } -} \ No newline at end of file diff --git a/scripts/start-mcp.bat b/scripts/start-mcp.bat deleted file mode 100644 index 52ba034..0000000 --- a/scripts/start-mcp.bat +++ /dev/null @@ -1,28 +0,0 @@ -@echo off -REM ═══════════════════════════════════════════════════════════════ -REM Spector MCP Server — Start Script -REM Starts the MCP server. Configuration is read from spector.yml. -REM CLI args can override any setting. -REM ═══════════════════════════════════════════════════════════════ - -set SPECTOR_HOME=%~dp0.. -set JAR=%SPECTOR_HOME%\spector-dist\target\spector.jar -set CONFIG=%SPECTOR_HOME%\spector-local.yml - -if not exist "%JAR%" ( - echo [ERROR] Fat JAR not found: %JAR% - echo [INFO] Run: mvn package -pl spector-dist -am -DskipTests - exit /b 1 -) - -echo [Spector MCP] Starting... 
1>&2 -echo [Spector MCP] JAR: %JAR% 1>&2 -echo [Spector MCP] Config: %CONFIG% 1>&2 - -java ^ - --add-modules jdk.incubator.vector ^ - --enable-native-access=ALL-UNNAMED ^ - --enable-preview ^ - -jar "%JAR%" ^ - --config "%CONFIG%" ^ - %* diff --git a/spector-bench/README.md b/spector-bench/README.md deleted file mode 100644 index a99ece5..0000000 --- a/spector-bench/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# spector-bench 📊 - -> **JMH microbenchmarks, performance sweeps, and large-scale real-embedding performance runners.** - -`spector-bench` handles empirical performance testing, SIMD kernel validation, and large-scale index sweeps for Spector. It is designed to run locally, generating interactive HTML reports with latency charts. - ---- - -## 🏗️ Core Architecture & Runners - -1. **JMH Microbenchmarks (`SpectorMicrobench`):** Microsecond-level isolation checks for the Panama Vector similarity kernels (AVX2 vs. AVX-512 vs. ARM NEON). -2. **Real-Embedding Sweeps (`RealEmbeddingScaleBench`):** Implements multi-centroid sweeps ($C \in \{32, 64, 128, 256\}$) using real Qwen3 text embeddings from local Ollama providers. -3. **Promotion Benchmarks (`SpectorIndexPromotionBench`):** Head-to-head comparisons of Flat Shard SIMD scans vs. Promoted HNSW Shards at 100K scale. 
- ---- - -## 🚀 Running Benchmarks - -### Generate Dependencies Classpath -Ensure the classpath is compiled before running: -```bash -mvn clean compile -pl spector-bench -``` - -### Running the Real-Embedding Scale Sweep -Run Ollama qwen3-embedding benchmarking at a scale of 10,000 vectors: -```powershell -$cp = "spector-bench/target/classes;" + (Get-Content spector-bench/target/cp.txt) -java --add-modules jdk.incubator.vector -Xmx12g -cp $cp com.spectrayan.spector.bench.RealEmbeddingScaleBench 10000 -``` - -### Running the Shard Promotion Comparison -Run Flat vs Promoted HNSW comparison at 100K scale: -```powershell -java --add-modules jdk.incubator.vector -Xmx12g -cp $cp com.spectrayan.spector.bench.SpectorIndexPromotionBench -``` diff --git a/spector-bench/pom.xml b/spector-bench/pom.xml index 7cef599..095d07c 100644 --- a/spector-bench/pom.xml +++ b/spector-bench/pom.xml @@ -6,24 +6,15 @@ <parent> <groupId>com.spectrayan</groupId> - <artifactId>spector</artifactId> + <artifactId>spector-search</artifactId> <version>0.1.0-SNAPSHOT</version> </parent> <artifactId>spector-bench</artifactId> <name>Spector Benchmarks</name> - <description>JMH benchmarks for Spector performance testing.</description> - - <properties> - <exec.mainClass>com.spectrayan.spector.bench.IndustryBenchmark</exec.mainClass> - </properties> + <description>JMH benchmarks for Spector Search performance testing.</description> <dependencies> - - <dependency> - <groupId>com.spectrayan</groupId> - <artifactId>spector-config</artifactId> - </dependency> <dependency> <groupId>com.spectrayan</groupId> <artifactId>spector-engine</artifactId> @@ -36,14 +27,6 @@ <groupId>com.spectrayan</groupId> <artifactId>spector-gpu</artifactId> </dependency> - <dependency> - <groupId>com.spectrayan</groupId> - <artifactId>spector-memory</artifactId> - </dependency> - <dependency> - <groupId>com.spectrayan</groupId> - <artifactId>spector-embed-ollama</artifactId> - </dependency> <!-- JMH --> <dependency> @@ -58,7 +41,7 
@@ <!-- Jackson for JSON baseline regression --> <dependency> - <groupId>tools.jackson.core</groupId> + <groupId>com.fasterxml.jackson.core</groupId> <artifactId>jackson-databind</artifactId> </dependency> @@ -85,18 +68,14 @@ <artifactId>exec-maven-plugin</artifactId> <version>3.5.0</version> <configuration> - <!-- exec:exec launches a child JVM with enable-preview flag --> - <executable>java</executable> - <arguments> - <argument>--enable-preview</argument> - <argument>--add-modules</argument> - <argument>jdk.incubator.vector</argument> - <argument>-Xmx28g</argument> - <argument>-Dlogback.configurationFile=logback-bench.xml</argument> - <argument>-classpath</argument> - <classpath/> - <argument>${exec.mainClass}</argument> - </arguments> + <mainClass>com.spectrayan.spector.bench.PerformanceTestRunner</mainClass> + <arguments/> + <systemProperties> + <systemProperty> + <key>logback.configurationFile</key> + <value>logback-bench.xml</value> + </systemProperty> + </systemProperties> </configuration> </plugin> </plugins> diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/BM25Benchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/BM25Benchmark.java index 6ce870b..0569952 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/BM25Benchmark.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/BM25Benchmark.java @@ -1,18 +1,3 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ package com.spectrayan.spector.bench; import com.spectrayan.spector.index.BM25Index; diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/BaselineRegressionDetector.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/BaselineRegressionDetector.java index ce8c49a..d489f82 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/BaselineRegressionDetector.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/BaselineRegressionDetector.java @@ -1,18 +1,3 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ package com.spectrayan.spector.bench; import java.io.IOException; @@ -23,8 +8,8 @@ import java.util.List; import java.util.Map; -import tools.jackson.databind.JsonNode; -import tools.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; /** * Detects performance regressions by comparing JMH JSON results against a baseline. 
@@ -110,7 +95,9 @@ private Map<String, BenchmarkEntry> parseBenchmarks(Path path) throws IOExceptio JsonNode paramsNode = node.get("params"); if (paramsNode != null && paramsNode.isObject()) { StringBuilder sb = new StringBuilder(); - for (var field : paramsNode.properties()) { + var fields = paramsNode.fields(); + while (fields.hasNext()) { + var field = fields.next(); if (!sb.isEmpty()) sb.append(","); sb.append(field.getKey()).append("=").append(field.getValue().asText()); } diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/BenchmarkSuiteRunner.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/BenchmarkSuiteRunner.java index e193fd8..b77571d 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/BenchmarkSuiteRunner.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/BenchmarkSuiteRunner.java @@ -1,18 +1,3 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ package com.spectrayan.spector.bench; import java.io.IOException; diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/CognitiveMemoryBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/CognitiveMemoryBenchmark.java deleted file mode 100644 index 268601b..0000000 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/CognitiveMemoryBenchmark.java +++ /dev/null @@ -1,324 +0,0 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.spectrayan.spector.bench; - -import com.spectrayan.spector.core.similarity.SimilarityFunction; -import com.spectrayan.spector.memory.*; -import com.spectrayan.spector.memory.cortex.MemorySource; -import com.spectrayan.spector.memory.sync.MemoryWal; -import com.spectrayan.spector.memory.hippocampus.CircadianPolicy; -import com.spectrayan.spector.commons.concurrent.MemoryPinning; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.Duration; -import java.time.Instant; -import java.util.*; -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicLong; - -/** - * Standalone empirical benchmark suite for Spector's off-heap cognitive memory. - * Validates hebbian plasticity counter throughput, page cache pre-touching, - * and quantifies the "Truncation Trap" recall error delta compared to external databases. 
- */ -public class CognitiveMemoryBenchmark { - - private static final int DIMENSIONS = 128; - private static final int DATASET_SIZE = 5000; - private static final int CONCURRENCY_THREADS = 16; - private static final int MEASURE_ITERATIONS = 50000; - - public static void main(String[] args) throws Exception { - System.out.println("╔══════════════════════════════════════════════════════════╗"); - System.out.println("║ SPECTOR COGNITIVE MEMORY BENCHMARK HARNESS ║"); - System.out.println("╚══════════════════════════════════════════════════════════╝"); - System.out.println(); - - Path walDir = Files.createTempDirectory("spector-wal-bench"); - - try { - // Run 1: Hebbian Plasticity CAS Latency & Throughput Benchmark - runPlasticityCasBenchmark(walDir); - - // Run 2: Fused SIMD vs. pgvector Truncation Trap Correctness Benchmark - runTruncationTrapBenchmark(); - - // Run 3: Multi-Segment Parallel Scatter-Gather Scan Benchmark - runParallelSegmentScansBenchmark(); - - } finally { - // Cleanup - deleteDirectory(walDir); - } - } - - // ─── Benchmark 1: Hebbian Plasticity CAS Throughput ─── - - private static void runPlasticityCasBenchmark(Path walDir) throws Exception { - System.out.println("▶ Benchmark 1: Hebbian Plasticity CAS Throughput"); - - // Open file-backed MemoryWal - try (MemoryWal wal = new MemoryWal(walDir, 8L * 1024 * 1024, false, 1024, false)) { - // Append some base memories - for (int i = 0; i < 1000; i++) { - wal.appendRemember("mem-" + i, new byte[]{1}); - } - - ExecutorService executor = Executors.newFixedThreadPool(CONCURRENCY_THREADS); - AtomicLong totalOps = new AtomicLong(); - long t0 = System.nanoTime(); - - List<Future<?>> futures = new ArrayList<>(); - for (int t = 0; t < CONCURRENCY_THREADS; t++) { - final int threadId = t; - futures.add(executor.submit(() -> { - Random rng = new Random(threadId); - for (int i = 0; i < MEASURE_ITERATIONS; i++) { - String targetMemId = "mem-" + rng.nextInt(1000); - // Simulate a Hops recall-hit counters CAS 
increment or reinforcement mutation - wal.appendReinforce(targetMemId, (byte) (rng.nextInt(128) - 64)); - totalOps.incrementAndGet(); - } - })); - } - - for (var f : futures) f.get(); - long elapsedNanos = System.nanoTime() - t0; - executor.shutdown(); - - double seconds = elapsedNanos / 1e9; - double throughput = totalOps.get() / seconds; - double avgLatencyUs = (elapsedNanos / (double) totalOps.get()) / 1000.0; - - System.out.printf(" Threads: %2d | Total Mutations: %,d%n", CONCURRENCY_THREADS, totalOps.get()); - System.out.printf(" Plasticity Throughput: %,.0f ops/sec%n", throughput); - System.out.printf(" Average CAS Latency : %.2f µs%n", avgLatencyUs); - System.out.println(); - } - } - - // ─── Benchmark 2: Fused SIMD vs. pgvector Truncation Trap ─── - - private static void runTruncationTrapBenchmark() { - System.out.println("▶ Benchmark 2: Fused SIMD vs. pgvector Truncation Trap (Recall Correctness)"); - - Random rng = new Random(42); - - // Generate mock cognitive memories with varying vectors, valence, tags, and importance scores - List<MockMemoryNode> nodes = new ArrayList<>(DATASET_SIZE); - for (int i = 0; i < DATASET_SIZE; i++) { - float[] vec = randomVector(DIMENSIONS, rng); - float importance = rng.nextFloat() * 10f; // importance score 0-10 - byte valence = (byte) (rng.nextInt(128) - 64); // signed valence - long tags = rng.nextLong(); // bloom tags - nodes.add(new MockMemoryNode("mem-" + i, vec, importance, valence, tags)); - } - - // Generate query vector - float[] queryVec = randomVector(DIMENSIONS, rng); - long targetTagFilter = 0x7L; // filter condition: must have specific bloom flags set - - // 1. 
Fused Cognitive Scoring: Evaluate Fused L2 + importance + tags simultaneously over ALL records - List<MockScoredResult> fusedResults = new ArrayList<>(); - for (var node : nodes) { - // Tag filtering - if ((node.tags & targetTagFilter) != targetTagFilter) { - continue; - } - float l2Dist = computeEuclideanDistance(queryVec, node.vector); - // Fuse score: lower Euclidean distance + higher importance + absolute valence - float cognitiveScore = (10f - l2Dist) + (node.importance * 0.5f) + (Math.abs(node.valence) * 0.05f); - fusedResults.add(new MockScoredResult(node.id, cognitiveScore)); - } - fusedResults.sort((a, b) -> Float.compare(b.score, a.score)); // descending - List<MockScoredResult> top10Fused = fusedResults.subList(0, Math.min(10, fusedResults.size())); - - // 2. pgvector-style Search: Retrieve top-50 pure vector Euclidean distance matches, THEN apply cognitive filter - List<MockScoredResult> vectorResults = new ArrayList<>(); - for (var node : nodes) { - float l2Dist = computeEuclideanDistance(queryVec, node.vector); - vectorResults.add(new MockScoredResult(node.id, l2Dist, node)); // score = l2Dist (lower is better) - } - vectorResults.sort((a, b) -> Float.compare(a.score, b.score)); // ascending L2 - List<MockScoredResult> top50Vector = vectorResults.subList(0, Math.min(50, vectorResults.size())); - - // Post-filter the pre-truncated top-50 set - List<MockScoredResult> postFilteredResults = new ArrayList<>(); - for (var res : top50Vector) { - MockMemoryNode node = res.node; - if ((node.tags & targetTagFilter) != targetTagFilter) { - continue; - } - float cognitiveScore = (10f - res.score) + (node.importance * 0.5f) + (Math.abs(node.valence) * 0.05f); - postFilteredResults.add(new MockScoredResult(node.id, cognitiveScore)); - } - postFilteredResults.sort((a, b) -> Float.compare(b.score, a.score)); - - // Calculate overlap / recall loss - Set<String> fusedIds = new HashSet<>(); - for (var r : top10Fused) fusedIds.add(r.id); - - int overlap = 0; - for (int i 
= 0; i < Math.min(10, postFilteredResults.size()); i++) { - if (fusedIds.contains(postFilteredResults.get(i).id)) { - overlap++; - } - } - - double recallErrorPercent = (10 - overlap) * 10.0; - - System.out.printf(" Total Candidates meeting filter criteria: %,d%n", fusedResults.size()); - System.out.println(" Top-10 Fused Cognitive Matches (Spector SIMD):"); - int showFusedCount = Math.min(3, top10Fused.size()); - for (int i = 0; i < showFusedCount; i++) { - System.out.printf(" #%d: id=%s score=%.2f%n", i + 1, top10Fused.get(i).id, top10Fused.get(i).score); - } - System.out.println(" Top-10 pgvector-Style Post-Filtered Matches (External DB):"); - int showCount = Math.min(3, postFilteredResults.size()); - for (int i = 0; i < showCount; i++) { - System.out.printf(" #%d: id=%s score=%.2f%n", i + 1, postFilteredResults.get(i).id, postFilteredResults.get(i).score); - } - System.out.println(); - System.out.printf(" [TRUNCATION TRAP METRIC] Overlap: %d/10 | Recall Loss Error: %.1f%%%n", overlap, recallErrorPercent); - System.out.println(" Verdict: " + (recallErrorPercent > 0 - ? "⚠️ Truncation Trap Verified! External DB missed high-importance cognitive nodes." - : "Perfect overlap (low-selectivity filter)")); - System.out.println(); - } - - // ─── Benchmark 3: Multi-Segment Parallel Scatter-Gather Scans ─── - - private static void runParallelSegmentScansBenchmark() throws Exception { - System.out.println("▶ Benchmark 3: Parallel Scatter-Gather Segment Scans (Loom vs. 
Bandwidth)"); - - int numSegments = 16; - int elementsPerSegment = 10000; - - System.out.printf(" Simulating parallel scans over %d partition segments (%d elements/segment)...%n", - numSegments, elementsPerSegment); - - ExecutorService loomExecutor = Executors.newVirtualThreadPerTaskExecutor(); - float[] queryVec = randomVector(DIMENSIONS, new Random(42)); - AtomicLong elementsScanned = new AtomicLong(); - - long t0 = System.nanoTime(); - List<Future<Double>> futures = new ArrayList<>(); - - for (int s = 0; s < numSegments; s++) { - final int segmentId = s; - futures.add(loomExecutor.submit(() -> { - Random rng = new Random(segmentId); - // Pre-allocate segment array to simulate off-heap segment scan - float[][] segmentVectors = new float[elementsPerSegment][DIMENSIONS]; - for (int i = 0; i < elementsPerSegment; i++) { - segmentVectors[i] = randomVector(DIMENSIONS, rng); - } - - double bestDist = Double.MAX_VALUE; - for (int i = 0; i < elementsPerSegment; i++) { - double dist = computeEuclideanDistance(queryVec, segmentVectors[i]); - bestDist = Math.min(bestDist, dist); - elementsScanned.incrementAndGet(); - } - return bestDist; - })); - } - - for (var f : futures) f.get(); - long elapsedNanos = System.nanoTime() - t0; - loomExecutor.shutdown(); - - double milliseconds = elapsedNanos / 1e6; - double throughput = elementsScanned.get() / (elapsedNanos / 1e9); - - System.out.printf(" Scanned %,d vectors sequentially across %d virtual threads.%n", - elementsScanned.get(), numSegments); - System.out.printf(" Wall-Clock Scan Duration: %.2f ms%n", milliseconds); - System.out.printf(" Aggregate Scan Rate : %,.0f vectors/sec (SIMD/Loom bound)%n", throughput); - System.out.println(); - } - - // ─── Helpers ─── - - private static float[] randomVector(int dim, Random rng) { - float[] v = new float[dim]; - for (int i = 0; i < dim; i++) { - v[i] = rng.nextFloat() * 2f - 1f; - } - return v; - } - - private static float computeEuclideanDistance(float[] a, float[] b) { - float sum 
= 0f; - for (int i = 0; i < a.length; i++) { - float diff = a[i] - b[i]; - sum += diff * diff; - } - return (float) Math.sqrt(sum); - } - - private static void deleteDirectory(Path path) throws IOException { - if (Files.exists(path)) { - try (var stream = Files.walk(path)) { - stream.sorted(Comparator.reverseOrder()) - .forEach(p -> { - try { - Files.delete(p); - } catch (IOException e) { - // ignore - } - }); - } - } - } - - // ─── Inner Mock Classes ─── - - private static class MockMemoryNode { - String id; - float[] vector; - float importance; - byte valence; - long tags; - - MockMemoryNode(String id, float[] vector, float importance, byte valence, long tags) { - this.id = id; - this.vector = vector; - this.importance = importance; - this.valence = valence; - this.tags = tags; - } - } - - private static class MockScoredResult { - String id; - float score; - MockMemoryNode node; - - MockScoredResult(String id, float score) { - this.id = id; - this.score = score; - } - - MockScoredResult(String id, float score, MockMemoryNode node) { - this.id = id; - this.score = score; - this.node = node; - } - } -} diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/ConcurrencyBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/ConcurrencyBenchmark.java index 0ac0a75..2c24ca5 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/ConcurrencyBenchmark.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/ConcurrencyBenchmark.java @@ -1,25 +1,9 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ package com.spectrayan.spector.bench; -import com.spectrayan.spector.core.similarity.SimilarityFunction; -import com.spectrayan.spector.config.SpectorConfig; -import com.spectrayan.spector.engine.DefaultSpectorEngine; +import com.spectrayan.spector.core.SimilarityFunction; +import com.spectrayan.spector.engine.SpectorConfig; import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.config.HnswParams; +import com.spectrayan.spector.index.HnswParams; import com.spectrayan.spector.query.SearchQuery; import org.openjdk.jmh.annotations.*; @@ -69,7 +53,7 @@ public void setup() { var hnswParams = new HnswParams(16, 200, 64); var config = new SpectorConfig(DIMENSIONS, DATASET_SIZE + 1000, SimilarityFunction.COSINE, hnswParams); - engine = new DefaultSpectorEngine(config); + engine = new SpectorEngine(config); Random rng = new Random(42); for (int i = 0; i < DATASET_SIZE; i++) { diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/CorePerformanceBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/CorePerformanceBenchmark.java deleted file mode 100644 index 2a8f2db..0000000 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/CorePerformanceBenchmark.java +++ /dev/null @@ -1,751 +0,0 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.spectrayan.spector.bench; - -import com.spectrayan.spector.config.SpectorConfig; -import com.spectrayan.spector.config.HnswParams; -import com.spectrayan.spector.core.simd.SimdCapability; -import com.spectrayan.spector.core.similarity.SimilarityFunction; -import com.spectrayan.spector.engine.DefaultSpectorEngine; -import com.spectrayan.spector.engine.SpectorEngine; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.lang.management.GarbageCollectorMXBean; -import java.lang.management.ManagementFactory; -import java.net.ServerSocket; -import java.net.Socket; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.*; -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicLong; - -/** - * Core performance benchmark suite for Spector. 
- * - * <p>Measures the fundamental performance characteristics of the in-process - * SIMD-accelerated search engine: latency, throughput, GC impact, scalability, - * and fused cognitive scoring correctness.</p> - * - * <h3>Benchmarks</h3> - * <ul> - * <li>In-process vs network latency comparison</li> - * <li>Vector search latency at 10K/50K/100K scale</li> - * <li>GC pressure during sustained search</li> - * <li>Concurrent QPS scaling (1–64 threads)</li> - * <li>Search latency at 100K → 1M scale</li> - * <li>Fused cognitive scoring vs top-K-then-rerank</li> - * </ul> - * - * <p>Run: {@code mvn -pl spector-bench exec:exec - * -Dexec.mainClass=com.spectrayan.spector.bench.CorePerformanceBenchmark}</p> - */ -public class CorePerformanceBenchmark { - - // ─────────────── Configuration ─────────────── - - private static final int DIMS = 384; - private static final int WARMUP_QUERIES = 500; - private static final int MEASURE_QUERIES = 2000; - private static final int TOP_K = 10; - private static final int NUM_CLUSTERS = 50; - - // C5: Incremental scaling - private static final int[] SCALE_SIZES = {100_000, 300_000, 500_000, 700_000, 1_000_000}; - private static final int SCALE_DIMS = 128; // keep smaller for 1M - - // Results - private final List<String[]> verdicts = new ArrayList<>(); - - // ─────────────── Main ─────────────── - - public static void main(String[] args) throws Exception { - new CorePerformanceBenchmark().run(); - } - - public void run() throws Exception { - System.out.println("╔══════════════════════════════════════════════════════════════╗"); - System.out.println("║ SPECTOR SEARCH — CORE PERFORMANCE BENCHMARK ║"); - System.out.println("╚══════════════════════════════════════════════════════════════╝"); - System.out.println(); - printSystemInfo(); - System.out.println(); - - // C1: MCP latency comparison - runC1_McpLatencyComparison(); - - // C2: Search latency at scale - runC2_SearchLatency(); - - // C3: GC pressure - runC3_GcPressure(); - - // C4: 
QPS with virtual threads - runC4_ConcurrentQps(); - - // C5: Recall at 1M memories (incremental) - runC5_ScaleLatency(); - - // C6: Truncation trap - runC6_TruncationTrap(); - - // Summary - printVerdictTable(); - - // Write report - writeReport(); - } - - // ═══════════════════════════════════════════════════════════════ - // C1: "100× faster than Python MCP servers" - // ═══════════════════════════════════════════════════════════════ - - private void runC1_McpLatencyComparison() throws Exception { - System.out.println("▶ C1: MCP In-Process vs Network Roundtrip"); - - // Build a small engine for in-process measurement - var config = new SpectorConfig(DIMS, 11_000, SimilarityFunction.COSINE, - new HnswParams(16, 200, 64)); - SpectorEngine engine = new DefaultSpectorEngine(config); - Random rng = new Random(42); - - float[][] vectors = generateClusteredVectors(10_000, DIMS, rng); - for (int i = 0; i < 10_000; i++) { - engine.ingest("doc-" + i, "content " + i, vectors[i]); - } - - // Warmup in-process - float[] qv = perturbVector(vectors[0], 0.3f, DIMS, new Random(999)); - for (int i = 0; i < 200; i++) engine.vectorSearch(qv, TOP_K); - - // Measure in-process search latency (what Spector MCP does) - long[] inProcessNanos = new long[MEASURE_QUERIES]; - for (int i = 0; i < MEASURE_QUERIES; i++) { - long t0 = System.nanoTime(); - engine.vectorSearch(qv, TOP_K); - inProcessNanos[i] = System.nanoTime() - t0; - } - var inProcessStats = computeStats(inProcessNanos); - - // Measure actual localhost TCP roundtrip (network floor) - long[] networkNanos = measureLocalhostRoundtrip(1000); - var networkStats = computeStats(networkNanos); - - double spectorUs = inProcessStats.p50 / 1000.0; - - // Python MCP reference: README states 2–10ms for "network + Python GIL" based on - // typical Chroma/Weaviate/Qdrant MCP servers. 
We compare against both ends: - double pythonLowMs = 2.0; // optimistic: well-tuned Python, localhost - double pythonHighMs = 10.0; // realistic: network + GIL + framework overhead - double speedupVsLow = (pythonLowMs * 1000) / spectorUs; - double speedupVsHigh = (pythonHighMs * 1000) / spectorUs; - - // Also compute measured overhead: network roundtrip + JSON (conservative 200µs) - double measuredOverheadUs = (networkStats.mean / 1000.0) + 200; - double measuredSpeedup = (measuredOverheadUs + spectorUs) / spectorUs; - - System.out.printf(" Spector in-process: p50=%.0fµs p99=%.0fµs avg=%.0fµs%n", - spectorUs, inProcessStats.p99 / 1000.0, inProcessStats.mean / 1000.0); - System.out.printf(" Localhost TCP roundtrip: p50=%.0fµs p99=%.0fµs avg=%.0fµs%n", - networkStats.p50 / 1000.0, networkStats.p99 / 1000.0, networkStats.mean / 1000.0); - System.out.println(); - System.out.printf(" vs measured network floor: %.0f× (%.0fµs network+JSON overhead)%n", - measuredSpeedup, measuredOverheadUs); - System.out.printf(" vs Python MCP (2ms low): %.0f× (Spector %.0fµs vs Python 2,000µs)%n", - speedupVsLow, spectorUs); - System.out.printf(" vs Python MCP (10ms high): %.0f× (Spector %.0fµs vs Python 10,000µs)%n", - speedupVsHigh, spectorUs); - System.out.println(); - - engine.close(); - - // The README claim "100×" refers to the high end (10ms Python MCP) - String verdict = speedupVsHigh >= 100 ? "✅ VALIDATED" : - (speedupVsLow >= 20 ? "⚠️ PARTIAL (" + String.format("%.0f–%.0f×", speedupVsLow, speedupVsHigh) + ")" : - "❌ FAILED"); - verdicts.add(new String[]{"C1: 100× faster than Python MCP", - String.format("%.0f–%.0f×", speedupVsLow, speedupVsHigh), verdict}); - } - - /** - * Measures actual localhost TCP roundtrip: connect → write → read → close. - * Simulates the absolute minimum network overhead a Python MCP server would have. 
- */ - private long[] measureLocalhostRoundtrip(int iterations) throws Exception { - // Start a tiny echo server on localhost - try (ServerSocket serverSocket = new ServerSocket(0)) { - int port = serverSocket.getLocalPort(); - serverSocket.setSoTimeout(5000); - - // Echo server in background - Thread echoThread = Thread.ofVirtual().start(() -> { - try { - for (int i = 0; i < iterations; i++) { - try (Socket client = serverSocket.accept()) { - InputStream in = client.getInputStream(); - OutputStream out = client.getOutputStream(); - byte[] buf = new byte[256]; - int n = in.read(buf); - if (n > 0) out.write(buf, 0, n); - } - } - } catch (Exception e) { - // server stopping - } - }); - - // Measure client roundtrips - long[] nanos = new long[iterations]; - byte[] payload = "{\"tool\":\"vector_search\",\"query\":[0.1,0.2],\"top_k\":10}".getBytes(StandardCharsets.UTF_8); - - for (int i = 0; i < iterations; i++) { - long t0 = System.nanoTime(); - try (Socket sock = new Socket("127.0.0.1", port)) { - sock.getOutputStream().write(payload); - sock.getOutputStream().flush(); - byte[] resp = new byte[256]; - sock.getInputStream().read(resp); - } - nanos[i] = System.nanoTime() - t0; - } - - echoThread.join(3000); - return nanos; - } - } - - // ═══════════════════════════════════════════════════════════════ - // C2: "50–200µs search latency" - // ═══════════════════════════════════════════════════════════════ - - private void runC2_SearchLatency() { - System.out.println("▶ C2: Vector Search Latency at Scale"); - - int[] sizes = {10_000, 50_000, 100_000}; - boolean allPassed = true; - - for (int size : sizes) { - var config = new SpectorConfig(DIMS, size + 1000, SimilarityFunction.COSINE, - new HnswParams(16, 200, 64)); - SpectorEngine engine = new DefaultSpectorEngine(config); - Random rng = new Random(42); - - float[][] vectors = generateClusteredVectors(size, DIMS, rng); - for (int i = 0; i < size; i++) { - engine.ingest("doc-" + i, "content " + i, vectors[i]); - } - - 
float[] qv = perturbVector(vectors[0], 0.3f, DIMS, new Random(999)); - - // Warmup - for (int i = 0; i < WARMUP_QUERIES; i++) engine.vectorSearch(qv, TOP_K); - - // Measure - long[] nanos = new long[MEASURE_QUERIES]; - for (int i = 0; i < MEASURE_QUERIES; i++) { - long t0 = System.nanoTime(); - engine.vectorSearch(qv, TOP_K); - nanos[i] = System.nanoTime() - t0; - } - var stats = computeStats(nanos); - - double p50Us = stats.p50 / 1000.0; - double p99Us = stats.p99 / 1000.0; - String sizeLabel = size / 1000 + "K"; - System.out.printf(" %5s docs: p50=%.0fµs p95=%.0fµs p99=%.0fµs QPS=%.0f%n", - sizeLabel, p50Us, stats.p95 / 1000.0, p99Us, 1e9 / stats.mean); - - // Pass criteria: p50 < 1ms for all sizes - if (p50Us > 1000) allPassed = false; - - engine.close(); - } - - System.out.println(); - String verdict = allPassed ? "✅ VALIDATED" : "❌ FAILED"; - verdicts.add(new String[]{"C2: 50–200µs search latency", "see above", verdict}); - } - - // ═══════════════════════════════════════════════════════════════ - // C3: "Zero GC pressure — 100% off-heap Panama" - // ═══════════════════════════════════════════════════════════════ - - private void runC3_GcPressure() { - System.out.println("▶ C3: Zero GC Pressure During Sustained Search"); - - var config = new SpectorConfig(DIMS, 11_000, SimilarityFunction.COSINE, - new HnswParams(16, 200, 64)); - SpectorEngine engine = new DefaultSpectorEngine(config); - Random rng = new Random(42); - - float[][] vectors = generateClusteredVectors(10_000, DIMS, rng); - for (int i = 0; i < 10_000; i++) { - engine.ingest("doc-" + i, "content " + i, vectors[i]); - } - - float[] qv = perturbVector(vectors[0], 0.3f, DIMS, new Random(999)); - - // Warmup - for (int i = 0; i < WARMUP_QUERIES; i++) engine.vectorSearch(qv, TOP_K); - - // Force GC before measurement - System.gc(); - try { Thread.sleep(200); } catch (InterruptedException e) { /* ignore */ } - - // Record GC state before - long gcCountBefore = totalGcCount(); - long gcTimeBefore = 
totalGcTimeMs(); - - // Run 100K searches - int searchCount = 100_000; - long t0 = System.nanoTime(); - for (int i = 0; i < searchCount; i++) { - engine.vectorSearch(qv, TOP_K); - } - long elapsed = System.nanoTime() - t0; - - // Record GC state after - long gcCountAfter = totalGcCount(); - long gcTimeAfter = totalGcTimeMs(); - - long gcPauses = gcCountAfter - gcCountBefore; - long gcTimeMs = gcTimeAfter - gcTimeBefore; - double searchMs = elapsed / 1e6; - - System.out.printf(" Searches executed: %,d%n", searchCount); - System.out.printf(" Total wall time: %.1f ms%n", searchMs); - System.out.printf(" GC pauses during: %d%n", gcPauses); - System.out.printf(" GC time during: %d ms%n", gcTimeMs); - System.out.printf(" GC overhead: %.4f%%%n", (gcTimeMs / searchMs) * 100); - System.out.println(); - - engine.close(); - - // Pass: ≤2 GC pauses (some minor GC may be unavoidable from JVM bookkeeping) - String verdict = gcPauses <= 2 ? "✅ VALIDATED" : "⚠️ PARTIAL"; - verdicts.add(new String[]{"C3: Zero GC pressure", - gcPauses + " pauses, " + gcTimeMs + "ms", verdict}); - } - - private long totalGcCount() { - return ManagementFactory.getGarbageCollectorMXBeans().stream() - .mapToLong(GarbageCollectorMXBean::getCollectionCount) - .filter(c -> c >= 0).sum(); - } - - private long totalGcTimeMs() { - return ManagementFactory.getGarbageCollectorMXBeans().stream() - .mapToLong(GarbageCollectorMXBean::getCollectionTime) - .filter(c -> c >= 0).sum(); - } - - // ═══════════════════════════════════════════════════════════════ - // C4: "10,000+ QPS with Virtual Threads" - // ═══════════════════════════════════════════════════════════════ - - private void runC4_ConcurrentQps() throws Exception { - System.out.println("▶ C4: Concurrent QPS Scaling"); - - var config = new SpectorConfig(DIMS, 51_000, SimilarityFunction.COSINE, - new HnswParams(16, 200, 64)); - SpectorEngine engine = new DefaultSpectorEngine(config); - Random rng = new Random(42); - - float[][] vectors = 
generateClusteredVectors(50_000, DIMS, rng); - for (int i = 0; i < 50_000; i++) { - engine.ingest("doc-" + i, "content " + i, vectors[i]); - } - - float[] qv = perturbVector(vectors[0], 0.3f, DIMS, new Random(999)); - // Warmup (use vectorSearch — hybridSearch requires --enable-preview via ConcurrentTasks) - for (int i = 0; i < 200; i++) engine.vectorSearch(qv, TOP_K); - - int[] threadCounts = {1, 4, 8, 16, 32, 64}; - double maxQps = 0; - - for (int threads : threadCounts) { - int opsPerThread = 500; - ExecutorService executor = Executors.newFixedThreadPool(threads); - AtomicLong totalOps = new AtomicLong(); - - long wallStart = System.nanoTime(); - List<Future<?>> futures = new ArrayList<>(); - - for (int t = 0; t < threads; t++) { - final int tid = t; - futures.add(executor.submit(() -> { - Random trng = new Random(tid + 1000); - float[] threadQv = perturbVector(vectors[trng.nextInt(50_000)], 0.3f, DIMS, trng); - for (int i = 0; i < opsPerThread; i++) { - engine.vectorSearch(threadQv, TOP_K); - totalOps.incrementAndGet(); - } - })); - } - for (var f : futures) f.get(); - long wallElapsed = System.nanoTime() - wallStart; - executor.shutdown(); - - double qps = totalOps.get() / (wallElapsed / 1e9); - maxQps = Math.max(maxQps, qps); - - System.out.printf(" threads=%2d QPS=%,.0f total_ops=%,d%n", threads, qps, totalOps.get()); - } - - System.out.println(); - engine.close(); - - String verdict = maxQps >= 10_000 ? "✅ VALIDATED" : - (maxQps >= 5_000 ? 
"⚠️ PARTIAL" : "❌ FAILED"); - verdicts.add(new String[]{"C4: 10,000+ QPS", - String.format("%,.0f QPS", maxQps), verdict}); - } - - // ═══════════════════════════════════════════════════════════════ - // C5: "~2ms recall at 1M memories" (incremental scaling) - // ═══════════════════════════════════════════════════════════════ - - private void runC5_ScaleLatency() { - System.out.println("▶ C5: Search Latency at Scale (100K → 1M)"); - - var hnswParams = new HnswParams(16, 200, 64); - var config = new SpectorConfig(SCALE_DIMS, 1_100_000, SimilarityFunction.COSINE, hnswParams); - - SpectorEngine engine = new DefaultSpectorEngine(config); - Random rng = new Random(42); - - int ingested = 0; - double latencyAt1M = -1; - - for (int targetSize : SCALE_SIZES) { - // Ingest incrementally - while (ingested < targetSize) { - float[] vec = randomVector(SCALE_DIMS, rng); - engine.ingest("mem-" + ingested, "memory content " + ingested, vec); - ingested++; - - // Progress every 100K - if (ingested % 100_000 == 0) { - System.out.printf(" Ingested %,d...%n", ingested); - } - } - - // Measure search latency at this scale - float[] qv = randomVector(SCALE_DIMS, new Random(999)); - - // Warmup - for (int i = 0; i < 100; i++) engine.vectorSearch(qv, TOP_K); - - long[] nanos = new long[500]; - for (int i = 0; i < 500; i++) { - long t0 = System.nanoTime(); - engine.vectorSearch(qv, TOP_K); - nanos[i] = System.nanoTime() - t0; - } - var stats = computeStats(nanos); - double p50Ms = stats.p50 / 1e6; - double p99Ms = stats.p99 / 1e6; - - System.out.printf(" %,7d memories: p50=%.2fms p99=%.2fms QPS=%.0f%n", - targetSize, p50Ms, p99Ms, 1e9 / stats.mean); - - if (targetSize == 1_000_000) latencyAt1M = p50Ms; - } - - System.out.println(); - engine.close(); - - String verdict = latencyAt1M <= 5.0 ? "✅ VALIDATED" : - (latencyAt1M <= 10.0 ? 
"⚠️ PARTIAL" : "❌ FAILED"); - verdicts.add(new String[]{"C5: ~2ms at 1M memories", - String.format("p50=%.2fms", latencyAt1M), verdict}); - } - - // ═══════════════════════════════════════════════════════════════ - // C6: "Fused scoring — no truncation trap" - // ═══════════════════════════════════════════════════════════════ - - private void runC6_TruncationTrap() { - System.out.println("▶ C6: Fused Scoring vs Top-K-Then-Rerank (Truncation Trap)"); - - int datasetSize = 50_000; - Random rng = new Random(42); - - // Generate memories with cognitive metadata - List<CognitiveNode> nodes = new ArrayList<>(datasetSize); - for (int i = 0; i < datasetSize; i++) { - float[] vec = randomVector(DIMS, rng); - float importance = rng.nextFloat() * 10f; - byte valence = (byte) (rng.nextInt(128) - 64); - long tags = rng.nextLong(); - float decayFactor = 0.3f + rng.nextFloat() * 0.7f; // 0.3–1.0 - nodes.add(new CognitiveNode("mem-" + i, vec, importance, valence, tags, decayFactor)); - } - - float[] queryVec = randomVector(DIMS, new Random(999)); - long tagFilter = 0x7L; // require specific bloom bits - - // ── Strategy 1: Fused Cognitive Scoring (Spector) ── - // Evaluate ALL candidates with combined score: similarity + importance × decay + valence - List<ScoredResult> fusedResults = new ArrayList<>(); - for (var node : nodes) { - if ((node.tags & tagFilter) != tagFilter) continue; - float sim = cosineSim(queryVec, node.vector); - float cogScore = sim + (node.importance * node.decayFactor * 0.3f) - + (Math.abs(node.valence) * 0.01f); - fusedResults.add(new ScoredResult(node.id, cogScore)); - } - fusedResults.sort((a, b) -> Float.compare(b.score, a.score)); - List<ScoredResult> fusedTop10 = fusedResults.subList(0, Math.min(10, fusedResults.size())); - - // ── Strategy 2: pgvector-style (External DB) ── - // Top-50 by pure vector similarity, then post-filter with cognitive scoring - List<ScoredResult> vectorOnly = new ArrayList<>(); - for (var node : nodes) { - float sim = 
cosineSim(queryVec, node.vector); - vectorOnly.add(new ScoredResult(node.id, sim, node)); - } - vectorOnly.sort((a, b) -> Float.compare(b.score, a.score)); - List<ScoredResult> top50Vec = vectorOnly.subList(0, Math.min(50, vectorOnly.size())); - - // Post-filter - List<ScoredResult> postFiltered = new ArrayList<>(); - for (var res : top50Vec) { - var node = res.node; - if ((node.tags & tagFilter) != tagFilter) continue; - float cogScore = res.score + (node.importance * node.decayFactor * 0.3f) - + (Math.abs(node.valence) * 0.01f); - postFiltered.add(new ScoredResult(node.id, cogScore)); - } - postFiltered.sort((a, b) -> Float.compare(b.score, a.score)); - - // Also test with top-100 and top-200 - int[] truncationLevels = {50, 100, 200}; - for (int topN : truncationLevels) { - List<ScoredResult> topNVec = vectorOnly.subList(0, Math.min(topN, vectorOnly.size())); - List<ScoredResult> reranked = new ArrayList<>(); - for (var res : topNVec) { - var node = res.node; - if ((node.tags & tagFilter) != tagFilter) continue; - float cogScore = res.score + (node.importance * node.decayFactor * 0.3f) - + (Math.abs(node.valence) * 0.01f); - reranked.add(new ScoredResult(node.id, cogScore)); - } - reranked.sort((a, b) -> Float.compare(b.score, a.score)); - - Set<String> fusedIds = new HashSet<>(); - for (var r : fusedTop10) fusedIds.add(r.id); - - int overlap = 0; - for (int i = 0; i < Math.min(10, reranked.size()); i++) { - if (fusedIds.contains(reranked.get(i).id)) overlap++; - } - double recallLoss = (10 - overlap) * 10.0; - - System.out.printf(" top-%d then rerank: overlap=%d/10 recall_loss=%.0f%%%n", - topN, overlap, recallLoss); - } - - // Use top-50 result for the verdict - Set<String> fusedIds = new HashSet<>(); - for (var r : fusedTop10) fusedIds.add(r.id); - int overlap50 = 0; - for (int i = 0; i < Math.min(10, postFiltered.size()); i++) { - if (fusedIds.contains(postFiltered.get(i).id)) overlap50++; - } - double recallLoss50 = (10 - overlap50) * 10.0; - - 
System.out.printf("%n Candidates passing filter: %,d / %,d%n", fusedResults.size(), datasetSize); - System.out.printf(" Truncation Trap recall loss (top-50): %.0f%%%n", recallLoss50); - - // Show top-3 fused vs top-3 postfiltered - System.out.println(" Top-3 Fused (Spector): " + formatTop3(fusedTop10)); - System.out.println(" Top-3 External DB (top50): " + formatTop3(postFiltered)); - System.out.println(); - - String verdict = recallLoss50 >= 20 ? "✅ VALIDATED" : - (recallLoss50 >= 10 ? "⚠️ PARTIAL" : "❌ NOT PROVEN"); - verdicts.add(new String[]{"C6: Truncation trap proven", - String.format("%.0f%% recall loss", recallLoss50), verdict}); - } - - // ═══════════════════════════════════════════════════════════════ - // Results & Report - // ═══════════════════════════════════════════════════════════════ - - private void printVerdictTable() { - System.out.println("═══════════════════════════════════════════════════════════════"); - System.out.println(" CORE PERFORMANCE REPORT "); - System.out.println("═══════════════════════════════════════════════════════════════"); - System.out.printf(" %-38s %-20s %-15s%n", "BENCHMARK", "RESULT", "VERDICT"); - System.out.println(" " + "─".repeat(73)); - for (var v : verdicts) { - System.out.printf(" %-38s %-20s %-15s%n", v[0], v[1], v[2]); - } - System.out.println("═══════════════════════════════════════════════════════════════"); - } - - private void writeReport() throws IOException { - StringBuilder sb = new StringBuilder(); - sb.append("# Spector — Core Performance Report\n\n"); - sb.append("**Generated:** ").append(LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)).append("\n\n"); - - // System info - sb.append("## System\n\n"); - sb.append("| Property | Value |\n"); - sb.append("|---|---|\n"); - sb.append("| OS | ").append(System.getProperty("os.name")).append(" ").append(System.getProperty("os.arch")).append(" |\n"); - sb.append("| Java | ").append(System.getProperty("java.version")).append(" |\n"); - 
sb.append("| CPUs | ").append(Runtime.getRuntime().availableProcessors()).append(" logical cores |\n"); - sb.append("| CPU | ").append(getCpuModel()).append(" |\n"); - sb.append("| Max Heap | ").append(Runtime.getRuntime().maxMemory() / (1024 * 1024)).append(" MB |\n"); - sb.append("| SIMD | ").append(SimdCapability.report()).append(" |\n\n"); - - // Results - sb.append("## Results\n\n"); - sb.append("| Benchmark | Result | Verdict |\n"); - sb.append("|---|---|---|\n"); - for (var v : verdicts) { - sb.append("| ").append(v[0]).append(" | ").append(v[1]).append(" | ").append(v[2]).append(" |\n"); - } - - Path reportPath = Path.of("spector-bench", "target", "core-performance-report.md"); - Files.createDirectories(reportPath.getParent()); - Files.writeString(reportPath, sb.toString()); - System.out.printf("%nReport saved: %s%n", reportPath.toAbsolutePath()); - } - - // ─────────────── System Info ─────────────── - - private void printSystemInfo() { - long totalMem = Runtime.getRuntime().maxMemory() / (1024 * 1024); - System.out.printf(" OS: %s %s%n", System.getProperty("os.name"), System.getProperty("os.arch")); - System.out.printf(" Java: %s%n", System.getProperty("java.version")); - System.out.printf(" CPU: %s (%d logical cores)%n", getCpuModel(), Runtime.getRuntime().availableProcessors()); - System.out.printf(" Heap: %d MB%n", totalMem); - System.out.printf(" SIMD: %s%n", SimdCapability.report()); - System.out.printf(" Time: %s%n", LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)); - } - - private static String getCpuModel() { - // Try Windows - try { - Process p = new ProcessBuilder("powershell", "-Command", - "(Get-CimInstance Win32_Processor).Name").start(); - String result = new String(p.getInputStream().readAllBytes()).trim(); - p.waitFor(); - if (!result.isBlank()) return result; - } catch (Exception ignored) {} - // Try Linux - try { - Process p = new ProcessBuilder("sh", "-c", - "grep 'model name' /proc/cpuinfo | head -1 | cut -d: 
-f2").start(); - String result = new String(p.getInputStream().readAllBytes()).trim(); - p.waitFor(); - if (!result.isBlank()) return result; - } catch (Exception ignored) {} - return System.getProperty("os.arch"); - } - - // ─────────────── Helpers ─────────────── - - private static float[] randomVector(int dim, Random rng) { - float[] v = new float[dim]; - for (int i = 0; i < dim; i++) v[i] = rng.nextFloat() * 2f - 1f; - normalize(v); - return v; - } - - private static float[][] generateClusteredVectors(int count, int dims, Random rng) { - float[][] centers = new float[NUM_CLUSTERS][dims]; - for (int c = 0; c < NUM_CLUSTERS; c++) { - for (int d = 0; d < dims; d++) centers[c][d] = (float) rng.nextGaussian() * 0.5f; - normalize(centers[c]); - } - float[][] vectors = new float[count][dims]; - for (int i = 0; i < count; i++) { - int cluster = rng.nextInt(NUM_CLUSTERS); - for (int d = 0; d < dims; d++) vectors[i][d] = centers[cluster][d] + (float) rng.nextGaussian() * 0.15f; - normalize(vectors[i]); - } - return vectors; - } - - private static float[] perturbVector(float[] base, float noise, int dims, Random rng) { - float[] result = new float[dims]; - for (int d = 0; d < dims; d++) result[d] = base[d] + (float) rng.nextGaussian() * noise; - normalize(result); - return result; - } - - private static void normalize(float[] v) { - float norm = 0; - for (float f : v) norm += f * f; - norm = (float) Math.sqrt(norm); - if (norm > 1e-10f) for (int i = 0; i < v.length; i++) v[i] /= norm; - } - - private static float cosineSim(float[] a, float[] b) { - float dot = 0, na = 0, nb = 0; - for (int i = 0; i < a.length; i++) { - dot += a[i] * b[i]; - na += a[i] * a[i]; - nb += b[i] * b[i]; - } - return (float) (dot / (Math.sqrt(na) * Math.sqrt(nb) + 1e-10)); - } - - private String formatTop3(List<ScoredResult> results) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < Math.min(3, results.size()); i++) { - if (i > 0) sb.append(", "); - 
sb.append(results.get(i).id).append("(").append(String.format("%.3f", results.get(i).score)).append(")"); - } - return sb.toString(); - } - - // ─────────────── Statistics ─────────────── - - record Stats(double min, double max, double mean, double p50, double p95, double p99) {} - - private Stats computeStats(long[] nanos) { - Arrays.sort(nanos); - int n = nanos.length; - double sum = 0; - for (long v : nanos) sum += v; - return new Stats(nanos[0], nanos[n - 1], sum / n, - nanos[(int) (n * 0.50)], nanos[(int) (n * 0.95)], nanos[(int) (n * 0.99)]); - } - - // ─────────────── Inner Types ─────────────── - - private record CognitiveNode(String id, float[] vector, float importance, - byte valence, long tags, float decayFactor) {} - - private static class ScoredResult { - final String id; - final float score; - final CognitiveNode node; - - ScoredResult(String id, float score) { this.id = id; this.score = score; this.node = null; } - ScoredResult(String id, float score, CognitiveNode node) { this.id = id; this.score = score; this.node = node; } - } -} diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/DiskPersistenceBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/DiskPersistenceBenchmark.java deleted file mode 100644 index bd19200..0000000 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/DiskPersistenceBenchmark.java +++ /dev/null @@ -1,610 +0,0 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.spectrayan.spector.bench; - -import com.spectrayan.spector.config.HnswParams; -import com.spectrayan.spector.config.PersistenceMode; -import com.spectrayan.spector.config.SpectorConfig; -import com.spectrayan.spector.core.simd.SimdCapability; -import com.spectrayan.spector.core.similarity.SimilarityFunction; -import com.spectrayan.spector.embed.ollama.OllamaEmbeddingProvider; -import com.spectrayan.spector.engine.DefaultSpectorEngine; -import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.memory.*; -import com.spectrayan.spector.memory.cortex.MemorySource; -import com.spectrayan.spector.memory.sync.MemoryWal; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.*; -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicLong; - -/** - * Benchmarks Spector in DISK persistence mode — engine index, cognitive memory, - * and Write-Ahead Log (WAL) with real Ollama embeddings. 
- * - * <h3>Tests</h3> - * <ul> - * <li>D1: Engine DISK mode — mmap'd sharded vector store search latency</li> - * <li>D2: Engine DISK mode — cold-start (first search after open) vs warm</li> - * <li>D3: Cognitive Memory — remember + recall with real Ollama embeddings</li> - * <li>D4: WAL — append throughput and replay speed (file-backed, fsync'd)</li> - * <li>D5: Memory DISK mode — full pipeline: ingest → recall → reinforce → reflect</li> - * </ul> - * - * <p>Requires Ollama running at localhost:11434 with an embedding model.</p> - * - * <p>Run: {@code mvn exec:java -pl spector-bench - * -Dexec.mainClass=com.spectrayan.spector.bench.DiskPersistenceBenchmark}</p> - */ -public class DiskPersistenceBenchmark { - - // ─── Configuration ─── - private static final int TOP_K = 10; - private static final String EMBEDDING_MODEL = "qwen3-embedding:latest"; - private int DIMS; // auto-detected from Ollama - - private final List<String[]> verdicts = new ArrayList<>(); - - // ─── Main ─── - - public static void main(String[] args) throws Exception { - new DiskPersistenceBenchmark().run(); - } - - public void run() throws Exception { - System.out.println("╔══════════════════════════════════════════════════════════════╗"); - System.out.println("║ SPECTOR — DISK PERSISTENCE + MEMORY BENCHMARK ║"); - System.out.println("╚══════════════════════════════════════════════════════════════╝"); - System.out.println(); - printSystemInfo(); - System.out.println(); - - // Verify Ollama connectivity - OllamaEmbeddingProvider embedder = OllamaEmbeddingProvider.create(EMBEDDING_MODEL); - DIMS = embedder.dimensions(); - System.out.printf(" Ollama model: %s (%d-dim)%n%n", EMBEDDING_MODEL, DIMS); - - // D1: Engine DISK mode search latency - runD1_DiskEngineLatency(embedder); - - // D2: Cold start vs warm - runD2_ColdVsWarm(); - - // D3: Cognitive memory with real embeddings - runD3_CognitiveMemoryRecall(embedder); - - // D4: WAL throughput - runD4_WalThroughput(); - - // D5: Full memory pipeline - 
runD5_FullMemoryPipeline(embedder); - - // Summary - printVerdictTable(); - writeReport(); - } - - // ═══════════════════════════════════════════════════════════════ - // D1: Engine DISK mode — search latency with mmap'd vectors - // ═══════════════════════════════════════════════════════════════ - - private void runD1_DiskEngineLatency(OllamaEmbeddingProvider embedder) throws Exception { - System.out.println("▶ D1: Engine DISK Mode — Search Latency (mmap sharded store)"); - - Path dataDir = Files.createTempDirectory("spector-disk-bench"); - int datasetSize = 5_000; - - var config = new SpectorConfig(DIMS, datasetSize + 1000, - SimilarityFunction.COSINE, new HnswParams(16, 200, 64)) - .withPersistence(PersistenceMode.DISK, dataDir); - - SpectorEngine engine = new DefaultSpectorEngine(config); - Random rng = new Random(42); - - // Ingest with synthetic vectors (skip Ollama for scale — embeddings are slow) - float[][] vectors = generateClusteredVectors(datasetSize, DIMS, rng); - Instant ingestStart = Instant.now(); - for (int i = 0; i < datasetSize; i++) { - engine.ingest("doc-" + i, "document content " + i, vectors[i]); - } - Duration ingestTime = Duration.between(ingestStart, Instant.now()); - System.out.printf(" Ingested %,d docs to disk in %.1fs (%.0f docs/s)%n", - datasetSize, ingestTime.toMillis() / 1000.0, - datasetSize / (ingestTime.toMillis() / 1000.0)); - - // Warmup - float[] qv = perturbVector(vectors[0], 0.3f, DIMS, new Random(999)); - for (int i = 0; i < 200; i++) engine.vectorSearch(qv, TOP_K); - - // Measure search - long[] nanos = new long[1000]; - for (int i = 0; i < 1000; i++) { - long t0 = System.nanoTime(); - engine.vectorSearch(qv, TOP_K); - nanos[i] = System.nanoTime() - t0; - } - var stats = computeStats(nanos); - - System.out.printf(" DISK search: p50=%.0fµs p95=%.0fµs p99=%.0fµs QPS=%.0f%n", - stats.p50 / 1000.0, stats.p95 / 1000.0, stats.p99 / 1000.0, 1e9 / stats.mean); - - // Compare with IN_MEMORY baseline on same data - var memConfig = 
new SpectorConfig(DIMS, datasetSize + 1000, - SimilarityFunction.COSINE, new HnswParams(16, 200, 64)); - SpectorEngine memEngine = new DefaultSpectorEngine(memConfig); - for (int i = 0; i < datasetSize; i++) { - memEngine.ingest("doc-" + i, "content " + i, vectors[i]); - } - for (int i = 0; i < 200; i++) memEngine.vectorSearch(qv, TOP_K); - - long[] memNanos = new long[1000]; - for (int i = 0; i < 1000; i++) { - long t0 = System.nanoTime(); - memEngine.vectorSearch(qv, TOP_K); - memNanos[i] = System.nanoTime() - t0; - } - var memStats = computeStats(memNanos); - - double overhead = (stats.p50 / memStats.p50 - 1.0) * 100; - System.out.printf(" IN_MEMORY: p50=%.0fµs p95=%.0fµs p99=%.0fµs QPS=%.0f%n", - memStats.p50 / 1000.0, memStats.p95 / 1000.0, memStats.p99 / 1000.0, 1e9 / memStats.mean); - System.out.printf(" DISK overhead: %.1f%% (vs IN_MEMORY p50)%n%n", overhead); - - engine.close(); - memEngine.close(); - deleteDirectory(dataDir); - - verdicts.add(new String[]{"D1: DISK search latency", - String.format("p50=%.0fµs (%.1f%% overhead)", stats.p50 / 1000.0, overhead), - overhead < 50 ? 
"✅ VALIDATED" : "⚠️ OVERHEAD"}); - } - - // ═══════════════════════════════════════════════════════════════ - // D2: Cold-start vs warm (page cache populated) - // ═══════════════════════════════════════════════════════════════ - - private void runD2_ColdVsWarm() throws Exception { - System.out.println("▶ D2: Cold-Start vs Warm Search (page cache effects)"); - - Path dataDir = Files.createTempDirectory("spector-cold-bench"); - int datasetSize = 10_000; - - var config = new SpectorConfig(DIMS, datasetSize + 1000, - SimilarityFunction.COSINE, new HnswParams(16, 200, 64)) - .withPersistence(PersistenceMode.DISK, dataDir); - - // Build and close (writes to disk) - SpectorEngine engine = new DefaultSpectorEngine(config); - Random rng = new Random(42); - float[][] vectors = generateClusteredVectors(datasetSize, DIMS, rng); - for (int i = 0; i < datasetSize; i++) { - engine.ingest("doc-" + i, "content " + i, vectors[i]); - } - engine.close(); - - // Reopen — first search is "cold" (mmap page faults) - float[] qv = perturbVector(vectors[0], 0.3f, DIMS, new Random(999)); - - SpectorEngine engine2 = new DefaultSpectorEngine(config); - long coldStart = System.nanoTime(); - engine2.vectorSearch(qv, TOP_K); - long coldNanos = System.nanoTime() - coldStart; - - // Warm up — pages are now in OS cache - for (int i = 0; i < 200; i++) engine2.vectorSearch(qv, TOP_K); - long[] warmNanos = new long[500]; - for (int i = 0; i < 500; i++) { - long t0 = System.nanoTime(); - engine2.vectorSearch(qv, TOP_K); - warmNanos[i] = System.nanoTime() - t0; - } - var warmStats = computeStats(warmNanos); - - System.out.printf(" Cold-start (first search): %.2fms%n", coldNanos / 1e6); - System.out.printf(" Warm (page-cached): p50=%.0fµs p99=%.0fµs%n", - warmStats.p50 / 1000.0, warmStats.p99 / 1000.0); - System.out.printf(" Cold/warm ratio: %.0f×%n%n", (coldNanos / warmStats.p50)); - - engine2.close(); - deleteDirectory(dataDir); - - verdicts.add(new String[]{"D2: Cold-start vs warm", - 
String.format("cold=%.1fms, warm=%.0fµs", coldNanos / 1e6, warmStats.p50 / 1000.0), - "✅ MEASURED"}); - } - - // ═══════════════════════════════════════════════════════════════ - // D3: Cognitive Memory — real Ollama embeddings recall - // ═══════════════════════════════════════════════════════════════ - - private void runD3_CognitiveMemoryRecall(OllamaEmbeddingProvider embedder) throws Exception { - System.out.println("▶ D3: Cognitive Memory — Remember + Recall with Ollama Embeddings"); - - Path memDir = Files.createTempDirectory("spector-mem-bench"); - - SpectorMemory memory = DefaultSpectorMemory.builder() - .dimensions(DIMS) - .embeddingProvider(embedder) - .persistence(memDir) - .persistenceMode(MemoryPersistenceMode.DISK) - .semanticCapacity(10_000) - .build(); - - // Ingest real memories - String[] memories = { - "User prefers dark mode with high contrast colors for accessibility.", - "The project uses Java 25 with Panama FFI for zero-copy vector operations.", - "Meeting scheduled for Friday at 3 PM with the engineering team about SIMD optimizations.", - "The HNSW index uses M=16, efConstruction=200 for production workloads.", - "User's favorite programming language is Java, followed by Rust and Go.", - "Database migration from PostgreSQL to Spector completed on March 15th.", - "API rate limits set to 1000 requests per minute for free tier users.", - "The neural network training uses cosine similarity as the loss function.", - "Deployment uses Kubernetes with 3 replicas and auto-scaling enabled.", - "Bug fix: resolved memory leak in the vector quantization pipeline last sprint." 
- }; - - System.out.printf(" Ingesting %d memories via Ollama...%n", memories.length); - long ingestStart = System.nanoTime(); - for (int i = 0; i < memories.length; i++) { - memory.remember("mem-" + i, memories[i], MemoryType.SEMANTIC, - MemorySource.USER_STATED, "benchmark").join(); - } - long ingestElapsed = System.nanoTime() - ingestStart; - System.out.printf(" Ingestion: %d memories in %.1fs (%.0fms/memory, Ollama embedding included)%n", - memories.length, ingestElapsed / 1e9, ingestElapsed / 1e6 / memories.length); - - // Recall queries - String[] queries = { - "What color theme does the user prefer?", - "What programming language is used?", - "When is the next meeting?", - "How is the deployment configured?", - "What database was migrated?" - }; - - System.out.println(" Recall latencies (includes Ollama embedding):"); - long[] recallNanos = new long[queries.length]; - for (int i = 0; i < queries.length; i++) { - long t0 = System.nanoTime(); - List<CognitiveResult> results = memory.recall(queries[i]); - recallNanos[i] = System.nanoTime() - t0; - String topMatch = results.isEmpty() ? 
"none" : results.getFirst().id(); - System.out.printf(" Q: \"%s\"%n → %s (%.1fms, %d results)%n", - queries[i], topMatch, recallNanos[i] / 1e6, results.size()); - } - - // Measure repeated recall (shows consistency) - System.out.println(" Recall consistency (5 rounds):"); - for (int r = 0; r < 5; r++) { - long t0 = System.nanoTime(); - memory.recall("What color theme?"); - long ms = (System.nanoTime() - t0) / 1_000_000; - System.out.printf(" Round %d: %dms%n", r + 1, ms); - } - System.out.println(); - - memory.close(); - deleteDirectory(memDir); - - double avgRecallMs = 0; - for (long n : recallNanos) avgRecallMs += n / 1e6; - avgRecallMs /= recallNanos.length; - - verdicts.add(new String[]{"D3: Cognitive recall (Ollama)", - String.format("avg=%.0fms (embed+score)", avgRecallMs), - "✅ MEASURED"}); - } - - // ═══════════════════════════════════════════════════════════════ - // D4: WAL — append throughput and replay speed - // ═══════════════════════════════════════════════════════════════ - - private void runD4_WalThroughput() throws Exception { - System.out.println("▶ D4: WAL Append + Replay Throughput (file-backed, fsync'd)"); - - Path walDir = Files.createTempDirectory("spector-wal-bench"); - int eventCount = 50_000; - - // Test 1: WAL with fsync per write (durability guarantee) - try (MemoryWal walSync = new MemoryWal(walDir.resolve("fsync"), - 8L * 1024 * 1024, false, 1024, true)) { - - long t0 = System.nanoTime(); - for (int i = 0; i < eventCount; i++) { - walSync.appendRemember("mem-" + i, ("content-" + i).getBytes()); - } - long appendSyncNanos = System.nanoTime() - t0; - - double syncOpsPerSec = eventCount / (appendSyncNanos / 1e9); - double syncLatencyUs = appendSyncNanos / (double) eventCount / 1000.0; - - System.out.printf(" fsync WAL: %,d appends in %.1fs (%.0f ops/s, %.0fµs/op)%n", - eventCount, appendSyncNanos / 1e9, syncOpsPerSec, syncLatencyUs); - - // Replay from disk - long replayStart = System.nanoTime(); - var replayed = 
walSync.replayFromDisk(); - long replayNanos = System.nanoTime() - replayStart; - System.out.printf(" fsync replay: %,d events in %.1fms (%.0f events/s)%n", - replayed.size(), replayNanos / 1e6, replayed.size() / (replayNanos / 1e9)); - - verdicts.add(new String[]{"D4a: WAL fsync append", - String.format("%.0f ops/s, %.0fµs/op", syncOpsPerSec, syncLatencyUs), - "✅ MEASURED"}); - } - - // Test 2: WAL without fsync (buffered — much faster) - try (MemoryWal walBuf = new MemoryWal(walDir.resolve("buffered"), - 8L * 1024 * 1024, false, 1024, false)) { - - long t0 = System.nanoTime(); - for (int i = 0; i < eventCount; i++) { - walBuf.appendRemember("mem-" + i, ("content-" + i).getBytes()); - } - long appendBufNanos = System.nanoTime() - t0; - - double bufOpsPerSec = eventCount / (appendBufNanos / 1e9); - double bufLatencyUs = appendBufNanos / (double) eventCount / 1000.0; - - System.out.printf(" buffered WAL: %,d appends in %.1fms (%.0f ops/s, %.1fµs/op)%n", - eventCount, appendBufNanos / 1e6, bufOpsPerSec, bufLatencyUs); - - verdicts.add(new String[]{"D4b: WAL buffered append", - String.format("%.0f ops/s, %.1fµs/op", bufOpsPerSec, bufLatencyUs), - "✅ MEASURED"}); - } - - // Test 3: Concurrent WAL writes (simulating multi-agent scenario) - try (MemoryWal walConc = new MemoryWal(walDir.resolve("concurrent"), - 8L * 1024 * 1024, false, 1024, false)) { - - int threads = 8; - int opsPerThread = 10_000; - ExecutorService executor = Executors.newFixedThreadPool(threads); - AtomicLong totalOps = new AtomicLong(); - - long wallStart = System.nanoTime(); - List<Future<?>> futures = new ArrayList<>(); - for (int t = 0; t < threads; t++) { - final int tid = t; - futures.add(executor.submit(() -> { - for (int i = 0; i < opsPerThread; i++) { - walConc.appendRemember("t" + tid + "-mem-" + i, - ("concurrent-" + tid + "-" + i).getBytes()); - totalOps.incrementAndGet(); - } - })); - } - for (var f : futures) f.get(); - long wallElapsed = System.nanoTime() - wallStart; - 
executor.shutdown(); - - double concOpsPerSec = totalOps.get() / (wallElapsed / 1e9); - System.out.printf(" concurrent: %d threads × %,d ops = %,.0f ops/s%n", - threads, opsPerThread, concOpsPerSec); - - verdicts.add(new String[]{"D4c: WAL concurrent writes", - String.format("%,.0f ops/s (%d threads)", concOpsPerSec, threads), - "✅ MEASURED"}); - } - - System.out.println(); - deleteDirectory(walDir); - } - - // ═══════════════════════════════════════════════════════════════ - // D5: Full Memory Pipeline — ingest → recall → reinforce → reflect - // ═══════════════════════════════════════════════════════════════ - - private void runD5_FullMemoryPipeline(OllamaEmbeddingProvider embedder) throws Exception { - System.out.println("▶ D5: Full Cognitive Pipeline (remember → recall → reinforce → reflect)"); - - Path memDir = Files.createTempDirectory("spector-pipeline-bench"); - - SpectorMemory memory = DefaultSpectorMemory.builder() - .dimensions(DIMS) - .embeddingProvider(embedder) - .persistence(memDir) - .persistenceMode(MemoryPersistenceMode.DISK) - .semanticCapacity(10_000) - .build(); - - // Phase 1: Remember - String[] texts = { - "Implemented SIMD-accelerated cosine similarity using Java Vector API with AVX-512.", - "The Panama FFI provides zero-copy access to native memory segments without JNI overhead.", - "HNSW graph construction uses M=16, efConstruction=200 for 95%+ recall at 10K scale.", - "Write-Ahead Log uses append-only binary format with CRC32 checksums for crash recovery.", - "Cognitive memory scoring fuses similarity, importance, decay, and valence in one SIMD pass." 
- }; - - System.out.printf(" Phase 1: Remember (%d memories)...%n", texts.length); - long rememberStart = System.nanoTime(); - for (int i = 0; i < texts.length; i++) { - memory.remember("pipeline-" + i, texts[i], MemoryType.SEMANTIC, - MemorySource.OBSERVED, "pipeline", "benchmark").join(); - } - long rememberMs = (System.nanoTime() - rememberStart) / 1_000_000; - System.out.printf(" Done: %dms total (%.0fms/memory)%n", rememberMs, (double) rememberMs / texts.length); - - // Phase 2: Recall - System.out.println(" Phase 2: Recall..."); - long recallStart = System.nanoTime(); - List<CognitiveResult> results = memory.recall("What is the HNSW configuration?"); - long recallMs = (System.nanoTime() - recallStart) / 1_000_000; - System.out.printf(" Recall: %dms, %d results%n", recallMs, results.size()); - if (!results.isEmpty()) { - System.out.printf(" Top: %s (score=%.3f)%n", results.getFirst().id(), - results.getFirst().score()); - } - - // Phase 3: Reinforce - System.out.println(" Phase 3: Reinforce..."); - if (!results.isEmpty()) { - long reinforceStart = System.nanoTime(); - memory.reinforce(results.getFirst().id(), (byte) 64); - long reinforceUs = (System.nanoTime() - reinforceStart) / 1000; - System.out.printf(" Reinforced '%s' in %dµs%n", results.getFirst().id(), reinforceUs); - } - - // Phase 4: Reflect (sleep consolidation) - System.out.println(" Phase 4: Reflect (sleep consolidation)..."); - long reflectStart = System.nanoTime(); - ReflectReport report = memory.reflect(); - long reflectMs = (System.nanoTime() - reflectStart) / 1_000_000; - System.out.printf(" Reflect: %dms (promoted=%d, pruned=%d)%n", - reflectMs, report.consolidatedCount(), report.tombstonedCount()); - - // Phase 5: Stats - System.out.println(" Phase 5: Final stats..."); - System.out.printf(" Total memories: %d%n", memory.totalMemories()); - System.out.printf(" Working: %d%n", memory.memoryCount(MemoryType.WORKING)); - System.out.printf(" Episodic: %d%n", 
memory.memoryCount(MemoryType.EPISODIC)); - System.out.printf(" Semantic: %d%n", memory.memoryCount(MemoryType.SEMANTIC)); - System.out.printf(" Procedural: %d%n", memory.memoryCount(MemoryType.PROCEDURAL)); - System.out.println(); - - memory.close(); - deleteDirectory(memDir); - - verdicts.add(new String[]{"D5: Full pipeline cycle", - String.format("remember=%dms, recall=%dms, reflect=%dms", rememberMs, recallMs, reflectMs), - "✅ MEASURED"}); - } - - // ═══════════════════════════════════════════════════════════════ - // Results & Report - // ═══════════════════════════════════════════════════════════════ - - private void printVerdictTable() { - System.out.println("═══════════════════════════════════════════════════════════════"); - System.out.println(" DISK PERSISTENCE BENCHMARK REPORT "); - System.out.println("═══════════════════════════════════════════════════════════════"); - System.out.printf(" %-38s %-35s %-15s%n", "TEST", "RESULT", "VERDICT"); - System.out.println(" " + "─".repeat(88)); - for (var v : verdicts) { - System.out.printf(" %-38s %-35s %-15s%n", v[0], v[1], v[2]); - } - System.out.println("═══════════════════════════════════════════════════════════════"); - } - - private void writeReport() throws IOException { - StringBuilder sb = new StringBuilder(); - sb.append("# Spector — Disk Persistence Benchmark Report\n\n"); - sb.append("**Generated:** ").append(LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)).append("\n\n"); - - sb.append("## System\n\n"); - sb.append("| Property | Value |\n|---|---|\n"); - sb.append("| CPU | ").append(getCpuModel()).append(" |\n"); - sb.append("| Java | ").append(System.getProperty("java.version")).append(" |\n"); - sb.append("| SIMD | ").append(SimdCapability.report()).append(" |\n"); - sb.append("| Embedding | ").append(EMBEDDING_MODEL).append(" (Ollama, localhost) |\n\n"); - - sb.append("## Results\n\n"); - sb.append("| Test | Result | Verdict |\n|---|---|---|\n"); - for (var v : verdicts) { - 
sb.append("| ").append(v[0]).append(" | ").append(v[1]).append(" | ").append(v[2]).append(" |\n"); - } - - Path reportPath = Path.of("spector-bench", "target", "disk-persistence-report.md"); - Files.createDirectories(reportPath.getParent()); - Files.writeString(reportPath, sb.toString()); - System.out.printf("%nReport saved: %s%n", reportPath.toAbsolutePath()); - } - - // ─── System Info ─── - - private void printSystemInfo() { - System.out.printf(" OS: %s %s%n", System.getProperty("os.name"), System.getProperty("os.arch")); - System.out.printf(" Java: %s%n", System.getProperty("java.version")); - System.out.printf(" CPU: %s (%d cores)%n", getCpuModel(), Runtime.getRuntime().availableProcessors()); - System.out.printf(" Heap: %d MB%n", Runtime.getRuntime().maxMemory() / (1024 * 1024)); - System.out.printf(" SIMD: %s%n", SimdCapability.report()); - System.out.printf(" Time: %s%n", LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)); - } - - private static String getCpuModel() { - try { - Process p = new ProcessBuilder("powershell", "-Command", - "(Get-CimInstance Win32_Processor).Name").start(); - String result = new String(p.getInputStream().readAllBytes()).trim(); - p.waitFor(); - if (!result.isBlank()) return result; - } catch (Exception ignored) {} - return System.getProperty("os.arch"); - } - - // ─── Helpers ─── - - private static float[][] generateClusteredVectors(int count, int dims, Random rng) { - int clusters = 50; - float[][] centers = new float[clusters][dims]; - for (int c = 0; c < clusters; c++) { - for (int d = 0; d < dims; d++) centers[c][d] = (float) rng.nextGaussian() * 0.5f; - normalize(centers[c]); - } - float[][] vectors = new float[count][dims]; - for (int i = 0; i < count; i++) { - int cluster = rng.nextInt(clusters); - for (int d = 0; d < dims; d++) vectors[i][d] = centers[cluster][d] + (float) rng.nextGaussian() * 0.15f; - normalize(vectors[i]); - } - return vectors; - } - - private static float[] perturbVector(float[] base, 
float noise, int dims, Random rng) { - float[] result = new float[dims]; - for (int d = 0; d < dims; d++) result[d] = base[d] + (float) rng.nextGaussian() * noise; - normalize(result); - return result; - } - - private static void normalize(float[] v) { - float norm = 0; - for (float f : v) norm += f * f; - norm = (float) Math.sqrt(norm); - if (norm > 1e-10f) for (int i = 0; i < v.length; i++) v[i] /= norm; - } - - private static void deleteDirectory(Path path) throws IOException { - if (Files.exists(path)) { - try (var stream = Files.walk(path)) { - stream.sorted(Comparator.reverseOrder()).forEach(p -> { - try { Files.delete(p); } catch (IOException ignored) {} - }); - } - } - } - - record Stats(double min, double max, double mean, double p50, double p95, double p99) {} - - private Stats computeStats(long[] nanos) { - Arrays.sort(nanos); - int n = nanos.length; - double sum = 0; - for (long v : nanos) sum += v; - return new Stats(nanos[0], nanos[n - 1], sum / n, - nanos[(int) (n * 0.50)], nanos[(int) (n * 0.95)], nanos[(int) (n * 0.99)]); - } -} diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/FlatScanBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/FlatScanBenchmark.java deleted file mode 100644 index 465bf4b..0000000 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/FlatScanBenchmark.java +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.spectrayan.spector.bench; - -import com.spectrayan.spector.core.quantization.strategy.DistanceContext; -import com.spectrayan.spector.core.quantization.strategy.SvasqStrategy; -import com.spectrayan.spector.core.quantization.svasq.SvasqCalibrator; -import com.spectrayan.spector.core.quantization.svasq.SvasqEncoder; -import com.spectrayan.spector.core.quantization.svasq.SvasqParams; -import com.spectrayan.spector.core.similarity.SimilarityFunction; - -import org.openjdk.jmh.annotations.*; -import org.openjdk.jmh.infra.Blackhole; - -import java.lang.foreign.Arena; -import java.lang.foreign.MemorySegment; -import java.util.ArrayList; -import java.util.List; -import java.util.PriorityQueue; -import java.util.Random; -import java.util.concurrent.TimeUnit; - -/** - * JMH benchmarks for the SpectorShard flat-scan path. - * - * <p>The flat scan is the critical mode for small shards (< shardThreshold). It performs - * exhaustive exact L2 over float32 residuals and is expected to outperform HNSW for - * sizes below ~20K due to contiguous memory access patterns and SIMD-friendly layout.</p> - * - * <p>Benchmarks:</p> - * <ul> - * <li><b>float32 flat scan</b> — exhaustive exact similarity over raw float residuals</li> - * <li><b>SVASQ flat scan</b> — exhaustive scan using the SVASQ distance kernel over encoded - * off-heap residuals. 
Simulates what the shard would do post-calibration in a fully - * quantized shard (not yet promoted to HNSW).</li> - * </ul> - * - * <p>Run via:</p> - * <pre> - * java -jar spector-bench/target/benchmarks.jar FlatScanBenchmark - * </pre> - */ -@BenchmarkMode({Mode.Throughput, Mode.AverageTime}) -@OutputTimeUnit(TimeUnit.MICROSECONDS) -@State(Scope.Benchmark) -@Warmup(iterations = 3, time = 2) -@Measurement(iterations = 5, time = 3) -@Fork(value = 1, jvmArgsAppend = { - "--add-modules", "jdk.incubator.vector", - "--enable-native-access=ALL-UNNAMED", - "-Xmx2g" -}) -public class FlatScanBenchmark { - - @Param({"128", "384"}) - int dims; - - /** Shard size — spans the flat-mode range and one post-threshold point. */ - @Param({"1000", "5000", "20000"}) - int shardSize; - - @Param({"10"}) - int topK; - - private float[] queryResidual; - private float[][] floatResiduals; // float32 exact residuals (flat mode) - private MemorySegment encodedSegment; - private Arena arena; - private SvasqStrategy svasqStrategy; - private int bpv; - private SimilarityFunction fn = SimilarityFunction.COSINE; - - @Setup(Level.Trial) - public void setup() { - Random rng = new Random(42L); - - // Build calibrated SVASQ strategy - List<float[]> sample = new ArrayList<>(Math.min(shardSize, 2000)); - for (int i = 0; i < sample.size(); i++) sample.add(gaussianUnit(rng, dims)); - // Ensure we have enough for calibration - while (sample.size() < 200) sample.add(gaussianUnit(rng, dims)); - SvasqParams params = SvasqCalibrator.calibrate(sample, dims); - SvasqEncoder encoder = new SvasqEncoder(params); - svasqStrategy = new SvasqStrategy(params, fn); - bpv = svasqStrategy.bytesPerVector(); - - // Query residual - queryResidual = gaussianUnit(rng, dims); - - // Float32 residuals (heap) - floatResiduals = new float[shardSize][dims]; - for (int i = 0; i < shardSize; i++) floatResiduals[i] = gaussianUnit(rng, dims); - - // SVASQ-encoded residuals (off-heap) - arena = Arena.ofShared(); - encodedSegment = 
arena.allocate((long) shardSize * bpv, 8L); - for (int i = 0; i < shardSize; i++) { - encoder.encode(floatResiduals[i], encodedSegment, (long) i * bpv); - } - } - - @TearDown(Level.Trial) - public void tearDown() { - arena.close(); - } - - // ── Float32 exact flat scan (current SpectorShard flat mode) ───────────── - - /** - * Exhaustive exact similarity scan over float32 residuals. - * Uses a min-heap of size k to track the best candidates. - * This is what {@link com.spectrayan.spector.index.spectrum.SpectorShard#flatScan} does. - */ - @Benchmark - public void flatScan_exact_float32(Blackhole bh) { - PriorityQueue<float[]> heap = new PriorityQueue<>(topK, - (a, b) -> Float.compare(a[0], b[0])); // min-heap by score - - for (int i = 0; i < shardSize; i++) { - float score = fn.compute(queryResidual, floatResiduals[i]); - if (heap.size() < topK) { - heap.offer(new float[]{score, i}); - } else if (score > heap.peek()[0]) { - heap.poll(); - heap.offer(new float[]{score, i}); - } - } - bh.consume(heap); - } - - // ── SVASQ quantized flat scan (hypothetical fully-quantized shard mode) ─── - - /** - * Exhaustive SVASQ distance scan over off-heap encoded residuals. - * Demonstrates the throughput possible if the flat-scan path also used SVASQ - * instead of float32 (useful for very large pre-promotion shards). 
- */ - @Benchmark - public void flatScan_svasq_encoded(Blackhole bh) { - DistanceContext ctx = svasqStrategy.prepareQueryContext(queryResidual); - PriorityQueue<float[]> heap = new PriorityQueue<>(topK, - (a, b) -> Float.compare(a[0], b[0])); - - for (int i = 0; i < shardSize; i++) { - float score = svasqStrategy.distance(encodedSegment, (long) i * bpv, ctx); - if (heap.size() < topK) { - heap.offer(new float[]{score, i}); - } else if (score > heap.peek()[0]) { - heap.poll(); - heap.offer(new float[]{score, i}); - } - } - bh.consume(heap); - } - - // ── Helpers ────────────────────────────────────────────────────────────── - - private static float[] gaussianUnit(Random rng, int dims) { - float[] v = new float[dims]; - double norm = 0; - for (int i = 0; i < dims; i++) { - v[i] = (float) rng.nextGaussian(); - norm += (double) v[i] * v[i]; - } - float scale = (float) (1.0 / Math.sqrt(norm)); - for (int i = 0; i < dims; i++) v[i] *= scale; - return v; - } -} diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/FwhtBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/FwhtBenchmark.java deleted file mode 100644 index 0cc1abf..0000000 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/FwhtBenchmark.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.spectrayan.spector.bench; - -import com.spectrayan.spector.core.quantization.svasq.SvasqFwht; - -import org.openjdk.jmh.annotations.*; -import org.openjdk.jmh.infra.Blackhole; - -import java.util.Random; -import java.util.concurrent.TimeUnit; - -/** - * JMH benchmarks for {@link SvasqFwht} — the FWHT rotation step in the SVASQ pipeline. - * - * <p>FWHT is applied once per query preparation ({@code O(N log N)} additions, zero multiplications) - * and once per indexed vector during encode. This benchmark isolates the rotation cost so it - * can be tracked separately from the SVASQ quantization overhead.</p> - * - * <p>Run via:</p> - * <pre> - * java -jar spector-bench/target/benchmarks.jar FwhtBenchmark - * </pre> - */ -@BenchmarkMode({Mode.Throughput, Mode.AverageTime}) -@OutputTimeUnit(TimeUnit.MICROSECONDS) -@State(Scope.Benchmark) -@Warmup(iterations = 3, time = 2) -@Measurement(iterations = 5, time = 3) -@Fork(value = 1, jvmArgsAppend = { - "--add-modules", "jdk.incubator.vector", - "--enable-native-access=ALL-UNNAMED", - "-Xmx2g" -}) -public class FwhtBenchmark { - - /** Vector dimensionality — 128 (small), 768 (BERT), 1024 (padded BERT). */ - @Param({"128", "768", "1024"}) - int dims; - - private SvasqFwht fwht; - private float[] inputVector; - private float[] outputBuffer; - - @Setup(Level.Trial) - public void setup() { - fwht = new SvasqFwht(dims, 42L); - int paddedDim = fwht.paddedDim(); - Random rng = new Random(1L); - inputVector = new float[dims]; - outputBuffer = new float[paddedDim]; - for (int i = 0; i < dims; i++) { - inputVector[i] = (float) rng.nextGaussian(); - } - } - - /** - * Allocating variant — creates a new output buffer each call. - * Represents the encode path at index time. - */ - @Benchmark - public float[] rotate_allocating(Blackhole bh) { - return fwht.rotate(inputVector); - } - - /** - * Zero-copy variant — writes into a pre-allocated buffer. - * Represents the query preparation path (called once per search). 
- */ - @Benchmark - public void rotate_intoBuffer(Blackhole bh) { - fwht.rotate(inputVector, outputBuffer); - bh.consume(outputBuffer); - } - - /** - * Raw FWHT butterfly on an already-prepared array. - * Isolates the O(N log N) butterfly cost without sign-flip or normalization overhead. - */ - @Benchmark - public void rawFwht_butterfly(Blackhole bh) { - System.arraycopy(inputVector, 0, outputBuffer, 0, dims); - SvasqFwht.applyFwht(outputBuffer); - bh.consume(outputBuffer); - } -} diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuDetectTest.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuDetectTest.java deleted file mode 100644 index b1f4e4d..0000000 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuDetectTest.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.spectrayan.spector.bench; - -import com.spectrayan.spector.gpu.GpuCapability; - -public class GpuDetectTest { - public static void main(String[] args) { - System.out.println(GpuCapability.detect().report()); - System.out.println("Available: " + GpuCapability.isAvailable()); - } -} diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuKernelBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuKernelBenchmark.java index 04d4d07..23b9a01 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuKernelBenchmark.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuKernelBenchmark.java @@ -1,18 +1,3 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ package com.spectrayan.spector.bench; import java.util.Random; diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuPerfTest.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuPerfTest.java deleted file mode 100644 index 26d86d5..0000000 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuPerfTest.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.spectrayan.spector.bench; - -import java.util.Random; - -import com.spectrayan.spector.core.similarity.CosineSimilarity; -import com.spectrayan.spector.gpu.CudaKernelLauncher; -import com.spectrayan.spector.gpu.GpuBatchSimilarity; -import com.spectrayan.spector.gpu.GpuCapability; - -/** - * Quick GPU vs CPU SIMD performance comparison. - * Tests batch cosine similarity at various batch sizes. - */ -public class GpuPerfTest { - - private static final int DIMENSIONS = 384; - private static final int WARMUP = 20; - private static final int MEASURE = 100; - private static final int[] BATCH_SIZES = {1, 8, 32, 128, 512, 1024, 4096, 10000, 50000, 100000}; - - public static void main(String[] args) { - System.out.println("GPU: " + GpuCapability.detect().report()); - System.out.println("Dimensions: " + DIMENSIONS); - System.out.println(); - - if (!GpuCapability.isAvailable()) { - System.out.println("ERROR: No GPU available!"); - return; - } - - Random rng = new Random(42); - GpuBatchSimilarity gpu = new GpuBatchSimilarity(); - - System.out.printf("%-10s %12s %12s %12s%n", "Batch", "CPU SIMD", "GPU", "Speedup"); - System.out.println("-".repeat(52)); - - for (int batchSize : BATCH_SIZES) { - float[] query = randomVec(DIMENSIONS, rng); - float[] database = new float[batchSize * DIMENSIONS]; - for (int i = 0; i < database.length; i++) { - database[i] = rng.nextFloat() * 2f - 1f; - } - - // Warmup both - for (int i = 0; i < WARMUP; i++) { - cpuBatchCosine(query, database, batchSize, DIMENSIONS); - gpu.batchCosineSimilarity(query, database, 
batchSize, DIMENSIONS); - } - - // Measure CPU - long cpuTotal = 0; - for (int i = 0; i < MEASURE; i++) { - long t0 = System.nanoTime(); - cpuBatchCosine(query, database, batchSize, DIMENSIONS); - cpuTotal += System.nanoTime() - t0; - } - double cpuAvgMs = (cpuTotal / (double) MEASURE) / 1e6; - - // Measure GPU (direct kernel launch, bypassing threshold) - long gpuTotal = 0; - CudaKernelLauncher directLauncher = null; - try { directLauncher = new CudaKernelLauncher(); } catch (Exception ignored) {} - if (directLauncher != null) { - for (int i = 0; i < WARMUP; i++) { - directLauncher.batchCosine(query, database, batchSize, DIMENSIONS); - } - for (int i = 0; i < MEASURE; i++) { - long t0 = System.nanoTime(); - directLauncher.batchCosine(query, database, batchSize, DIMENSIONS); - gpuTotal += System.nanoTime() - t0; - } - directLauncher.close(); - } - double gpuAvgMs = directLauncher != null ? (gpuTotal / (double) MEASURE) / 1e6 : -1; - - double speedup = cpuAvgMs / gpuAvgMs; - System.out.printf("%-10d %10.3f ms %10.3f ms %10.1f×%n", - batchSize, cpuAvgMs, gpuAvgMs, speedup); - } - - gpu.close(); - } - - private static float[] cpuBatchCosine(float[] query, float[] database, - int n, int dims) { - float[] results = new float[n]; - for (int i = 0; i < n; i++) { - results[i] = CosineSimilarity.compute(query, 0, database, i * dims, dims); - } - return results; - } - - private static float[] randomVec(int dims, Random rng) { - float[] v = new float[dims]; - for (int i = 0; i < dims; i++) v[i] = rng.nextFloat() * 2f - 1f; - return v; - } -} diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuResidentBench.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuResidentBench.java deleted file mode 100644 index 4f326f9..0000000 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/GpuResidentBench.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); 
- * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.spectrayan.spector.bench; - -import java.util.Random; - -import com.spectrayan.spector.gpu.GpuCapability; -import com.spectrayan.spector.gpu.GpuVectorIndex; - -/** - * Benchmark for GPU-resident vector search (persistent device memory model). - * Database is uploaded to VRAM once, then queries only transfer the query vector. - */ -public class GpuResidentBench { - - private static final int DIMS = 384; - private static final int WARMUP = 10; - private static final int MEASURE = 50; - - public static void main(String[] args) { - System.out.println("GPU: " + GpuCapability.detect().report()); - System.out.println("Dimensions: " + DIMS); - System.out.println(); - - int[] sizes = {10_000, 100_000, 500_000, 1_000_000}; - - for (int n : sizes) { - long memMB = (long) n * DIMS * 4 / (1024 * 1024); - System.out.printf("▶ %,d vectors (%d MB)%n", n, memMB); - - Random rng = new Random(42); - float[] database = new float[n * DIMS]; - for (int i = 0; i < database.length; i++) { - database[i] = rng.nextFloat() * 2f - 1f; - } - float[] query = new float[DIMS]; - for (int i = 0; i < DIMS; i++) query[i] = rng.nextFloat() * 2f - 1f; - - // Create GPU index (uploads to VRAM) - long uploadStart = System.nanoTime(); - GpuVectorIndex gpuIndex = GpuVectorIndex.create(database, n, DIMS, true); - long uploadMs = (System.nanoTime() - uploadStart) / 1_000_000; - System.out.printf(" Upload: %dms | GPU active: %s%n", uploadMs, gpuIndex.isGpuActive()); - - // Create CPU-only index for 
comparison - GpuVectorIndex cpuIndex = GpuVectorIndex.create(database, n, DIMS, false); - - // Warmup - for (int i = 0; i < WARMUP; i++) { - gpuIndex.search(query); - cpuIndex.search(query); - } - - // Measure GPU - long gpuTotal = 0; - for (int i = 0; i < MEASURE; i++) { - long t0 = System.nanoTime(); - gpuIndex.search(query); - gpuTotal += System.nanoTime() - t0; - } - double gpuMs = (gpuTotal / (double) MEASURE) / 1e6; - - // Measure CPU - long cpuTotal = 0; - for (int i = 0; i < MEASURE; i++) { - long t0 = System.nanoTime(); - cpuIndex.search(query); - cpuTotal += System.nanoTime() - t0; - } - double cpuMs = (cpuTotal / (double) MEASURE) / 1e6; - - double speedup = cpuMs / gpuMs; - System.out.printf(" CPU SIMD: %.2f ms | GPU: %.2f ms | Speedup: %.1f×%n%n", - cpuMs, gpuMs, speedup); - - gpuIndex.close(); - cpuIndex.close(); - } - } -} diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/HeavyPerformanceBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/HeavyPerformanceBenchmark.java index 37e4f82..4ef80a4 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/HeavyPerformanceBenchmark.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/HeavyPerformanceBenchmark.java @@ -1,25 +1,9 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ package com.spectrayan.spector.bench; -import com.spectrayan.spector.core.similarity.SimilarityFunction; -import com.spectrayan.spector.config.SpectorConfig; -import com.spectrayan.spector.engine.DefaultSpectorEngine; +import com.spectrayan.spector.core.SimilarityFunction; +import com.spectrayan.spector.engine.SpectorConfig; import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.config.HnswParams; +import com.spectrayan.spector.index.HnswParams; import com.spectrayan.spector.query.SearchQuery; import com.spectrayan.spector.query.SearchResponse; @@ -81,7 +65,7 @@ public void setup() { var hnswParams = new HnswParams(16, 200, 64); var config = new SpectorConfig(dimensions, datasetSize + 1000, SimilarityFunction.COSINE, hnswParams); - engine = new DefaultSpectorEngine(config); + engine = new SpectorEngine(config); Random rng = new Random(42); diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/HnswBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/HnswBenchmark.java index af8f6b5..c6f736d 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/HnswBenchmark.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/HnswBenchmark.java @@ -1,23 +1,8 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ package com.spectrayan.spector.bench; -import com.spectrayan.spector.core.similarity.SimilarityFunction; +import com.spectrayan.spector.core.SimilarityFunction; import com.spectrayan.spector.index.HnswIndex; -import com.spectrayan.spector.config.HnswParams; +import com.spectrayan.spector.index.HnswParams; import com.spectrayan.spector.index.ScoredResult; import org.openjdk.jmh.annotations.*; diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/IndexOperationBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/IndexOperationBenchmark.java index c860863..037b13e 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/IndexOperationBenchmark.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/IndexOperationBenchmark.java @@ -1,25 +1,9 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ package com.spectrayan.spector.bench; -import com.spectrayan.spector.core.similarity.SimilarityFunction; -import com.spectrayan.spector.config.SpectorConfig; -import com.spectrayan.spector.engine.DefaultSpectorEngine; +import com.spectrayan.spector.core.SimilarityFunction; +import com.spectrayan.spector.engine.SpectorConfig; import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.config.HnswParams; +import com.spectrayan.spector.index.HnswParams; import org.openjdk.jmh.annotations.*; import org.openjdk.jmh.infra.Blackhole; @@ -69,7 +53,7 @@ public void setup() { var hnswParams = new HnswParams(16, 200, 64); var config = new SpectorConfig(dimensions, datasetSize + 10_000, SimilarityFunction.COSINE, hnswParams); - engine = new DefaultSpectorEngine(config); + engine = new SpectorEngine(config); Random rng = new Random(42); for (int i = 0; i < datasetSize; i++) { diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/IndustryBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/IndustryBenchmark.java deleted file mode 100644 index 35b9463..0000000 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/IndustryBenchmark.java +++ /dev/null @@ -1,523 +0,0 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package com.spectrayan.spector.bench; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.time.Duration; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Random; -import java.util.Set; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.atomic.AtomicLong; - -import com.spectrayan.spector.core.simd.SimdCapability; -import com.spectrayan.spector.core.similarity.SimilarityFunction; -import com.spectrayan.spector.config.SpectorConfig; -import com.spectrayan.spector.engine.DefaultSpectorEngine; -import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.config.HnswParams; - -/** - * Industry-standard benchmark following ann-benchmarks methodology. 
- * - * <p>Key differences from the previous PerformanceTestRunner:</p> - * <ul> - * <li>Uses clustered (realistic) vectors, not uniform random</li> - * <li>Measures recall@K against brute-force ground truth</li> - * <li>Tests multiple dimensions: 128, 384, 768</li> - * <li>Uses realistic document sizes: 200-2000 words (like real paragraphs/pages)</li> - * <li>Reports QPS at specific recall thresholds</li> - * <li>Records system state (CPU%, RAM) during test</li> - * </ul> - * - * <p>Run: {@code mvn -pl spector-bench exec:java -Dexec.mainClass=com.spectrayan.spector.bench.IndustryBenchmark}</p> - */ -public class IndustryBenchmark { - - // ─── Configuration ─── - private static final int[] DATASET_SIZES = {10_000, 50_000, 100_000}; - private static final int[] DIMENSIONS = {128, 384, 768}; - private static final int WARMUP_QUERIES = 100; - private static final int MEASURE_QUERIES = 500; - private static final int[] CONCURRENCY_LEVELS = {1, 4, 8, 16}; - private static final int TOP_K = 10; - private static final int NUM_CLUSTERS = 50; // for realistic vector generation - - // Realistic document corpus words (varied topics, longer vocabulary) - private static final String[] CORPUS = { - "machine", "learning", "algorithm", "neural", "network", "deep", - "transformer", "attention", "embedding", "vector", "semantic", - "retrieval", "augmented", "generation", "language", "model", - "inference", "training", "gradient", "optimization", "batch", - "epoch", "loss", "function", "activation", "layer", "weight", - "bias", "dropout", "regularization", "normalization", "encoder", - "decoder", "tokenizer", "vocabulary", "context", "window", - "position", "encoding", "multi-head", "self-attention", "cross", - "architecture", "parameter", "fine-tuning", "pre-training", - "benchmark", "evaluation", "metric", "accuracy", "precision", - "recall", "f1-score", "latency", "throughput", "scalability", - "distributed", "parallel", "concurrent", "asynchronous", "pipeline", - "streaming", 
"real-time", "indexing", "search", "query", - "document", "passage", "chunk", "sentence", "paragraph", - "knowledge", "base", "graph", "ontology", "taxonomy", - "classification", "clustering", "similarity", "distance", - "nearest", "neighbor", "approximate", "exact", "brute-force", - "quantization", "compression", "pruning", "distillation", - "deployment", "production", "monitoring", "observability", - "infrastructure", "cloud", "server", "client", "api", - "endpoint", "request", "response", "authentication", "authorization", - "database", "storage", "memory", "cache", "buffer", - "performance", "optimization", "profiling", "bottleneck" - }; - - private final List<BenchResult> results = new ArrayList<>(); - private final Runtime runtime = Runtime.getRuntime(); - - public static void main(String[] args) throws Exception { - new IndustryBenchmark().run(); - } - - public void run() throws Exception { - System.out.println("╔══════════════════════════════════════════════════════════════╗"); - System.out.println("║ SPECTOR SEARCH — INDUSTRY-STANDARD BENCHMARK SUITE ║"); - System.out.println("╚══════════════════════════════════════════════════════════════╝"); - System.out.println(); - printSystemInfo(); - System.out.println(); - - // Phase 1: Recall + Latency at different scales and dimensions - for (int dims : DIMENSIONS) { - for (int size : DATASET_SIZES) { - if (dims == 768 && size == 100_000) continue; // skip largest combo to keep runtime reasonable - runRecallLatencyBenchmark(dims, size); - } - } - - // Phase 2: Document size impact (does content byte size affect search?) 
- runDocumentSizeImpact(); - - // Phase 3: Concurrency at 50K/384-dim (realistic production scenario) - runConcurrencyBenchmark(384, 50_000); - - // Generate report - printSummary(); - Path reportPath = Path.of("spector-bench", "target", "industry-benchmark.txt"); - Files.createDirectories(reportPath.getParent()); - writeReport(reportPath); - System.out.printf("%n Report saved: %s%n", reportPath.toAbsolutePath()); - } - - private void printSystemInfo() { - long totalMem = runtime.maxMemory() / (1024 * 1024); - System.out.printf(" OS: %s %s%n", System.getProperty("os.name"), System.getProperty("os.arch")); - System.out.printf(" Java: %s%n", System.getProperty("java.version")); - System.out.printf(" CPUs: %d logical cores%n", runtime.availableProcessors()); - System.out.printf(" Max Heap: %d MB%n", totalMem); - System.out.printf(" SIMD: %s%n", SimdCapability.report()); - System.out.printf(" Timestamp: %s%n", LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)); - } - - // ─────────────── Recall + Latency Benchmark ─────────────── - - private void runRecallLatencyBenchmark(int dims, int datasetSize) { - System.out.printf("▶ Recall+Latency: %,d docs × %d-dim%n", datasetSize, dims); - - var hnswParams = new HnswParams(16, 200, 64); - var config = new SpectorConfig(dims, datasetSize + 1000, - SimilarityFunction.COSINE, hnswParams); - - SpectorEngine engine = new DefaultSpectorEngine(config); - Random rng = new Random(42); - - // Generate clustered vectors (realistic: embeddings form clusters in practice) - float[][] allVectors = generateClusteredVectors(datasetSize, dims, rng); - - // Ingest with realistic document content - Instant ingestStart = Instant.now(); - for (int i = 0; i < datasetSize; i++) { - String content = generateRealisticDocument(rng); - engine.ingest("doc-" + i, content, allVectors[i]); - } - Duration ingestTime = Duration.between(ingestStart, Instant.now()); - double ingestRate = datasetSize / (ingestTime.toMillis() / 1000.0); - 
System.out.printf(" Ingested in %.1fs (%.0f docs/s)%n", - ingestTime.toMillis() / 1000.0, ingestRate); - - // Generate query vectors from same distribution (realistic: queries are similar to corpus) - int numQueries = MEASURE_QUERIES; - float[][] queryVectors = new float[numQueries][]; - Random qrng = new Random(999); - for (int i = 0; i < numQueries; i++) { - // Pick a random cluster center and add noise (simulates real queries) - int cluster = qrng.nextInt(NUM_CLUSTERS); - queryVectors[i] = perturbVector(allVectors[cluster * (datasetSize / NUM_CLUSTERS)], 0.3f, dims, qrng); - } - - // Compute brute-force ground truth for recall measurement - int[][] groundTruth = computeGroundTruth(queryVectors, allVectors, TOP_K); - - // Warmup - for (int i = 0; i < WARMUP_QUERIES; i++) { - engine.vectorSearch(queryVectors[i % numQueries], TOP_K); - } - - // Measure vector search - long[] vectorNanos = new long[numQueries]; - int totalRecallHits = 0; - for (int i = 0; i < numQueries; i++) { - long t0 = System.nanoTime(); - var response = engine.vectorSearch(queryVectors[i], TOP_K); - vectorNanos[i] = System.nanoTime() - t0; - - // Compute recall - Set<String> retrieved = new HashSet<>(); - for (var r : response.results()) retrieved.add(r.id()); - for (int gt : groundTruth[i]) { - if (retrieved.contains("doc-" + gt)) totalRecallHits++; - } - } - double recall = (double) totalRecallHits / (numQueries * TOP_K); - var vecStats = computeStats(vectorNanos); - - System.out.printf(" Vector: avg=%.3fms p99=%.3fms recall@%d=%.1f%% QPS=%.0f%n", - vecStats.mean / 1e6, vecStats.p99 / 1e6, TOP_K, recall * 100, 1e9 / vecStats.mean); - - results.add(new BenchResult("Vector Search", dims, datasetSize, - vecStats.mean / 1e6, vecStats.p99 / 1e6, 1e9 / vecStats.mean, recall)); - - // Measure keyword search - String[] queryTexts = {"machine learning neural network architecture", - "retrieval augmented generation language model", - "distributed parallel concurrent optimization", - "quantization 
compression approximate nearest neighbor", - "performance latency throughput scalability benchmark"}; - long[] kwNanos = new long[numQueries]; - for (int i = 0; i < numQueries; i++) { - String q = queryTexts[i % queryTexts.length]; - long t0 = System.nanoTime(); - engine.keywordSearch(q, TOP_K); - kwNanos[i] = System.nanoTime() - t0; - } - var kwStats = computeStats(kwNanos); - System.out.printf(" Keyword: avg=%.3fms p99=%.3fms QPS=%.0f%n", - kwStats.mean / 1e6, kwStats.p99 / 1e6, 1e9 / kwStats.mean); - - results.add(new BenchResult("Keyword Search", dims, datasetSize, - kwStats.mean / 1e6, kwStats.p99 / 1e6, 1e9 / kwStats.mean, -1)); - - // Measure hybrid search - long[] hybNanos = new long[numQueries]; - for (int i = 0; i < numQueries; i++) { - String q = queryTexts[i % queryTexts.length]; - long t0 = System.nanoTime(); - engine.hybridSearch(q, queryVectors[i], TOP_K); - hybNanos[i] = System.nanoTime() - t0; - } - var hybStats = computeStats(hybNanos); - System.out.printf(" Hybrid: avg=%.3fms p99=%.3fms QPS=%.0f%n", - hybStats.mean / 1e6, hybStats.p99 / 1e6, 1e9 / hybStats.mean); - - results.add(new BenchResult("Hybrid Search", dims, datasetSize, - hybStats.mean / 1e6, hybStats.p99 / 1e6, 1e9 / hybStats.mean, -1)); - - // Record ingestion - results.add(new BenchResult("Ingestion", dims, datasetSize, - ingestTime.toMillis(), 0, ingestRate, -1)); - - engine.close(); - System.out.println(); - } - - // ─────────────── Document Size Impact ─────────────── - - private void runDocumentSizeImpact() { - System.out.println("▶ Document Size Impact Test (10K docs, 384-dim)"); - int dims = 384; - int size = 10_000; - Random rng = new Random(42); - float[][] vectors = generateClusteredVectors(size, dims, rng); - float[] queryVec = perturbVector(vectors[0], 0.3f, dims, new Random(999)); - - int[][] docWordCounts = {{50, 100}, {200, 500}, {500, 1500}, {1000, 3000}}; - String[] labels = {"Short (50-100w)", "Medium (200-500w)", "Long (500-1500w)", "Very Long (1-3Kw)"}; - - for 
(int t = 0; t < docWordCounts.length; t++) { - var hnswParams = new HnswParams(16, 200, 64); - var config = new SpectorConfig(dims, size + 1000, SimilarityFunction.COSINE, hnswParams); - SpectorEngine engine = new DefaultSpectorEngine(config); - - int minWords = docWordCounts[t][0]; - int maxWords = docWordCounts[t][1]; - long totalBytes = 0; - - for (int i = 0; i < size; i++) { - int wordCount = minWords + rng.nextInt(maxWords - minWords); - String content = generateDocument(wordCount, rng); - totalBytes += content.length(); - engine.ingest("doc-" + i, content, vectors[i]); - } - - // Warmup - for (int i = 0; i < 50; i++) engine.vectorSearch(queryVec, TOP_K); - - // Measure - long[] nanos = new long[200]; - for (int i = 0; i < 200; i++) { - long t0 = System.nanoTime(); - engine.vectorSearch(queryVec, TOP_K); - nanos[i] = System.nanoTime() - t0; - } - var stats = computeStats(nanos); - long avgDocBytes = totalBytes / size; - - System.out.printf(" %-20s avgDoc=%,dB vecSearch=%.3fms QPS=%.0f%n", - labels[t], avgDocBytes, stats.mean / 1e6, 1e9 / stats.mean); - - results.add(new BenchResult("DocSize:" + labels[t], dims, size, - stats.mean / 1e6, stats.p99 / 1e6, 1e9 / stats.mean, -1)); - engine.close(); - } - System.out.println(); - } - - // ─────────────── Concurrency Benchmark ─────────────── - - private void runConcurrencyBenchmark(int dims, int datasetSize) throws Exception { - System.out.printf("▶ Concurrency Scaling: %,d docs × %d-dim%n", datasetSize, dims); - - var hnswParams = new HnswParams(16, 200, 64); - var config = new SpectorConfig(dims, datasetSize + 1000, - SimilarityFunction.COSINE, hnswParams); - SpectorEngine engine = new DefaultSpectorEngine(config); - Random rng = new Random(42); - - float[][] vectors = generateClusteredVectors(datasetSize, dims, rng); - for (int i = 0; i < datasetSize; i++) { - engine.ingest("doc-" + i, generateRealisticDocument(rng), vectors[i]); - } - - for (int threads : CONCURRENCY_LEVELS) { - int opsPerThread = 300; - 
ExecutorService executor = Executors.newFixedThreadPool(threads); - AtomicLong totalOps = new AtomicLong(); - AtomicLong totalNanos = new AtomicLong(); - - // Warmup - float[] wv = perturbVector(vectors[0], 0.3f, dims, new Random(999)); - for (int i = 0; i < 50; i++) engine.hybridSearch("neural network", wv, TOP_K); - - long wallStart = System.nanoTime(); - List<Future<?>> futures = new ArrayList<>(); - - for (int t = 0; t < threads; t++) { - final int tid = t; - futures.add(executor.submit(() -> { - Random trng = new Random(tid + 1000); - float[] qv = perturbVector(vectors[trng.nextInt(datasetSize)], 0.3f, dims, trng); - for (int i = 0; i < opsPerThread; i++) { - long t0 = System.nanoTime(); - engine.hybridSearch("machine learning optimization", qv, TOP_K); - totalNanos.addAndGet(System.nanoTime() - t0); - totalOps.incrementAndGet(); - } - })); - } - for (var f : futures) f.get(); - long wallElapsed = System.nanoTime() - wallStart; - executor.shutdown(); - - double wallSec = wallElapsed / 1e9; - double throughput = totalOps.get() / wallSec; - double avgLatencyMs = (totalNanos.get() / (double) totalOps.get()) / 1e6; - - System.out.printf(" threads=%2d throughput=%.0f ops/s avgLatency=%.2fms%n", - threads, throughput, avgLatencyMs); - - results.add(new BenchResult("Concurrent(t=" + threads + ")", dims, datasetSize, - avgLatencyMs, 0, throughput, -1)); - } - engine.close(); - System.out.println(); - } - - // ─────────────── Vector Generation (Clustered, Realistic) ─────────────── - - /** - * Generates vectors that form clusters (like real embeddings). - * Real embeddings from transformer models form clusters around topics/concepts. 
- */ - private float[][] generateClusteredVectors(int count, int dims, Random rng) { - // Generate cluster centers - float[][] centers = new float[NUM_CLUSTERS][dims]; - for (int c = 0; c < NUM_CLUSTERS; c++) { - for (int d = 0; d < dims; d++) { - centers[c][d] = (float) rng.nextGaussian() * 0.5f; - } - normalize(centers[c]); - } - - // Generate vectors around cluster centers - float[][] vectors = new float[count][dims]; - for (int i = 0; i < count; i++) { - int cluster = rng.nextInt(NUM_CLUSTERS); - for (int d = 0; d < dims; d++) { - vectors[i][d] = centers[cluster][d] + (float) rng.nextGaussian() * 0.15f; - } - normalize(vectors[i]); - } - return vectors; - } - - private float[] perturbVector(float[] base, float noise, int dims, Random rng) { - float[] result = new float[dims]; - for (int d = 0; d < dims; d++) { - result[d] = base[d] + (float) rng.nextGaussian() * noise; - } - normalize(result); - return result; - } - - private void normalize(float[] v) { - float norm = 0; - for (float f : v) norm += f * f; - norm = (float) Math.sqrt(norm); - if (norm > 1e-10f) { - for (int i = 0; i < v.length; i++) v[i] /= norm; - } - } - - // ─────────────── Ground Truth (Brute-Force KNN) ─────────────── - - private int[][] computeGroundTruth(float[][] queries, float[][] database, int k) { - int[][] truth = new int[queries.length][k]; - for (int q = 0; q < queries.length; q++) { - // Compute all distances - float[] dists = new float[database.length]; - for (int i = 0; i < database.length; i++) { - dists[i] = cosineSim(queries[q], database[i]); - } - // Find top-K by sorting indices - Integer[] indices = new Integer[database.length]; - for (int i = 0; i < database.length; i++) indices[i] = i; - Arrays.sort(indices, (a, b) -> Float.compare(dists[b], dists[a])); - for (int i = 0; i < k; i++) truth[q][i] = indices[i]; - } - return truth; - } - - private float cosineSim(float[] a, float[] b) { - float dot = 0, na = 0, nb = 0; - for (int i = 0; i < a.length; i++) { - dot += a[i] * 
b[i]; - na += a[i] * a[i]; - nb += b[i] * b[i]; - } - return (float) (dot / (Math.sqrt(na) * Math.sqrt(nb) + 1e-10)); - } - - // ─────────────── Document Generation ─────────────── - - /** Generates a realistic document (200-1500 words, paragraph structure). */ - private String generateRealisticDocument(Random rng) { - return generateDocument(200 + rng.nextInt(1300), rng); - } - - /** Generates a document of specified word count with paragraph breaks. */ - private String generateDocument(int wordCount, Random rng) { - StringBuilder sb = new StringBuilder(wordCount * 8); - int sentenceLen = 8 + rng.nextInt(15); - int paraLen = 3 + rng.nextInt(5); - int sentenceCount = 0; - - for (int w = 0; w < wordCount; w++) { - sb.append(CORPUS[rng.nextInt(CORPUS.length)]); - if ((w + 1) % sentenceLen == 0) { - sb.append(". "); - sentenceCount++; - sentenceLen = 8 + rng.nextInt(15); - if (sentenceCount % paraLen == 0) { - sb.append("\n\n"); - paraLen = 3 + rng.nextInt(5); - } - } else { - sb.append(' '); - } - } - return sb.toString(); - } - - // ─────────────── Statistics ─────────────── - - record Stats(double min, double max, double mean, double p50, double p95, double p99) {} - - private Stats computeStats(long[] nanos) { - Arrays.sort(nanos); - int n = nanos.length; - double sum = 0; - for (long v : nanos) sum += v; - double mean = sum / n; - return new Stats(nanos[0], nanos[n - 1], mean, - nanos[(int) (n * 0.50)], nanos[(int) (n * 0.95)], nanos[(int) (n * 0.99)]); - } - - // ─────────────── Results ─────────────── - - record BenchResult(String name, int dims, int datasetSize, - double avgMs, double p99Ms, double qps, double recall) {} - - private void printSummary() { - System.out.println("═══════════════════════════════════════════════════════════════"); - System.out.println(" SUMMARY"); - System.out.println("═══════════════════════════════════════════════════════════════"); - System.out.printf(" %-35s %8s %8s %10s %8s%n", "Benchmark", "Avg(ms)", "P99(ms)", "QPS", 
"Recall"); - System.out.println(" " + "-".repeat(75)); - for (var r : results) { - String recallStr = r.recall >= 0 ? String.format("%.1f%%", r.recall * 100) : "—"; - System.out.printf(" %-35s %8.3f %8.3f %10.0f %8s%n", - r.name + " " + r.dims + "d/" + r.datasetSize / 1000 + "K", - r.avgMs, r.p99Ms, r.qps, recallStr); - } - } - - private void writeReport(Path path) throws IOException { - StringBuilder sb = new StringBuilder(); - sb.append("Spector Industry Benchmark\n"); - sb.append("Generated: ").append(LocalDateTime.now()).append("\n"); - sb.append("Java: ").append(System.getProperty("java.version")).append("\n"); - sb.append("CPUs: ").append(runtime.availableProcessors()).append("\n"); - sb.append("SIMD: ").append(SimdCapability.report()).append("\n\n"); - - sb.append(String.format("%-35s %8s %8s %10s %8s%n", "Benchmark", "Avg(ms)", "P99(ms)", "QPS", "Recall")); - sb.append("-".repeat(80)).append("\n"); - for (var r : results) { - String recallStr = r.recall >= 0 ? String.format("%.1f%%", r.recall * 100) : "—"; - sb.append(String.format("%-35s %8.3f %8.3f %10.0f %8s%n", - r.name + " " + r.dims + "d/" + r.datasetSize / 1000 + "K", - r.avgMs, r.p99Ms, r.qps, recallStr)); - } - Files.writeString(path, sb.toString()); - } -} diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/IngestionBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/IngestionBenchmark.java index e499cb3..7e88aa0 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/IngestionBenchmark.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/IngestionBenchmark.java @@ -1,18 +1,3 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ package com.spectrayan.spector.bench; import java.util.Random; @@ -34,11 +19,10 @@ import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import com.spectrayan.spector.core.similarity.SimilarityFunction; -import com.spectrayan.spector.config.SpectorConfig; -import com.spectrayan.spector.engine.DefaultSpectorEngine; +import com.spectrayan.spector.core.SimilarityFunction; +import com.spectrayan.spector.engine.SpectorConfig; import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.config.HnswParams; +import com.spectrayan.spector.index.HnswParams; /** * Benchmarks measuring ingestion throughput for SpectorEngine. @@ -89,7 +73,7 @@ public void setup() { var hnswParams = new HnswParams(16, 200, 64); var config = new SpectorConfig(dimensions, MAX_CAPACITY, SimilarityFunction.COSINE, hnswParams); - engine = new DefaultSpectorEngine(config); + engine = new SpectorEngine(config); docCounter = 0; rng = new Random(42); } diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/IvfPqBenchmark.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/IvfPqBenchmark.java index 4ebc118..5293bd7 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/IvfPqBenchmark.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/IvfPqBenchmark.java @@ -1,21 +1,6 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ package com.spectrayan.spector.bench; -import com.spectrayan.spector.core.similarity.SimilarityFunction; +import com.spectrayan.spector.core.SimilarityFunction; import com.spectrayan.spector.index.ScoredResult; import com.spectrayan.spector.index.ivf.IvfPqIndex; import com.spectrayan.spector.index.pq.ProductQuantizer; diff --git a/spector-bench/src/main/java/com/spectrayan/spector/bench/PerformanceTestRunner.java b/spector-bench/src/main/java/com/spectrayan/spector/bench/PerformanceTestRunner.java index a8d7328..b0ae675 100644 --- a/spector-bench/src/main/java/com/spectrayan/spector/bench/PerformanceTestRunner.java +++ b/spector-bench/src/main/java/com/spectrayan/spector/bench/PerformanceTestRunner.java @@ -1,28 +1,12 @@ -/* - * Copyright 2026 Spectrayan - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ package com.spectrayan.spector.bench; -import com.spectrayan.spector.core.similarity.CosineSimilarity; -import com.spectrayan.spector.core.similarity.DotProduct; -import com.spectrayan.spector.core.simd.SimdCapability; -import com.spectrayan.spector.core.similarity.SimilarityFunction; -import com.spectrayan.spector.config.SpectorConfig; -import com.spectrayan.spector.engine.DefaultSpectorEngine; +import com.spectrayan.spector.core.CosineSimilarity; +import com.spectrayan.spector.core.DotProduct; +import com.spectrayan.spector.core.SimdCapability; +import com.spectrayan.spector.core.SimilarityFunction; +import com.spectrayan.spector.engine.SpectorConfig; import com.spectrayan.spector.engine.SpectorEngine; -import com.spectrayan.spector.config.HnswParams; +import com.spectrayan.spector.index.HnswParams; import java.io.IOException; import java.io.PrintWriter; @@ -155,7 +139,7 @@ private void runScaleBenchmark(int datasetSize) { long memBefore = usedMemoryMB(); Instant ingestStart = Instant.now(); - SpectorEngine engine = new DefaultSpectorEngine(config); + SpectorEngine engine = new SpectorEngine(config); Random rng = new Random(42); // Ingestion @@ -230,7 +214,7 @@ private void runConcurrencyTest() throws Exception { var config = new SpectorConfig(DIMENSIONS, 51_000, SimilarityFunction.COSINE, hnswParams); - SpectorEngine engine = new DefaultSpectorEngine(config); + SpectorEngine engine = new SpectorEngine(config); Random rng = new Random(42); for (int i = 0; i < 50_000; i++) { engine.ingest("doc-" + i, generateText(30, rng), randomVector(DIMENSIONS, rng)); @@ -417,7 +401,7 @@ private void generateHtmlReport(Path path) throws IOException { <head> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> - <title>Spector — Performance Report + Spector Search — Performance Report