diff --git a/alembic/versions/7791001eb8a2_add_fts_gin_index.py b/alembic/versions/7791001eb8a2_add_fts_gin_index.py new file mode 100644 index 0000000..aa206a3 --- /dev/null +++ b/alembic/versions/7791001eb8a2_add_fts_gin_index.py @@ -0,0 +1,36 @@ +"""add_fts_gin_index + +Revision ID: 7791001eb8a2 +Revises: 8dceb919041e +Create Date: 2026-02-09 17:40:52.049941 + +""" +from typing import Sequence, Union + +from alembic import op + + +# revision identifiers, used by Alembic. +revision: str = '7791001eb8a2' +down_revision: Union[str, Sequence[str], None] = '8dceb919041e' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Create GIN index for full-text search on document_chunks.content. + + Note: For production deployments with large existing datasets, consider + using CREATE INDEX CONCURRENTLY to avoid blocking writes. It cannot run + inside a transaction, so wrap it in Alembic's autocommit_block(). + """ + op.execute(""" + CREATE INDEX ix_document_chunks_content_fts + ON document_chunks + USING GIN (to_tsvector('english', content)) + """) + + +def downgrade() -> None: + """Drop the full-text search GIN index.""" + op.execute("DROP INDEX IF EXISTS ix_document_chunks_content_fts") diff --git a/ragitect/agents/rag/tools.py b/ragitect/agents/rag/tools.py index d9845f1..184e05a 100644 --- a/ragitect/agents/rag/tools.py +++ b/ragitect/agents/rag/tools.py @@ -7,8 +7,9 @@ from typing import Callable, Awaitable from uuid import UUID -from langchain_core.tools import tool, BaseTool +from langchain_core.tools import tool +from ragitect.services.config import RETRIEVAL_RRF_K from ragitect.services.database.repositories.vector_repo import VectorRepository from ragitect.agents.rag.state import ContextChunk @@ -42,29 +43,29 @@ async def _retrieve_documents_impl( if query_embedding is None: query_embedding = await embed_fn(query) - # Search for similar chunks - chunks_with_distances = await 
vector_repo.search_similar_chunks( + # Search using hybrid RRF fusion (vector + full-text search) + chunks_with_scores = await vector_repo.hybrid_search( workspace_id=UUID(workspace_id), query_vector=query_embedding, + query_text=query, k=top_k, + rrf_k=RETRIEVAL_RRF_K, ) # Convert to ContextChunk format - # Note: search_similar_chunks returns (chunk, distance) tuples - # Distance is cosine distance: 0 = identical, 2 = opposite - # Convert to similarity: similarity = 1.0 - distance - # Note: document.file_name requires relationship loading, deferred to graph node + # Note: hybrid_search returns (chunk, rrf_score) tuples + # RRF score is already "higher = better", no conversion needed # Embedding preserved from DB to avoid redundant API calls during MMR selection return [ ContextChunk( chunk_id=str(chunk.id), content=chunk.content, - score=1.0 - distance, + score=rrf_score, document_id=str(chunk.document_id), title="", # Populated by graph node after document lookup embedding=list(chunk.embedding), ) - for chunk, distance in chunks_with_distances + for chunk, rrf_score in chunks_with_scores ] diff --git a/ragitect/services/config.py b/ragitect/services/config.py index 70c68c2..d3ea605 100644 --- a/ragitect/services/config.py +++ b/ragitect/services/config.py @@ -26,6 +26,9 @@ ) DEFAULT_RETRIEVAL_K: int = int(os.getenv("DEFAULT_RETRIEVAL_K", "10")) +# Embedding dimension (must match the model's output dimension) +EMBEDDING_DIMENSION: int = int(os.getenv("EMBEDDING_DIMENSION", "768")) + # Default LLM model for LangGraph nodes and other services # Format: "{provider}/{model}" for LiteLLM compatibility DEFAULT_LLM_MODEL: str = os.getenv("DEFAULT_LLM_MODEL", "ollama/llama3.1:8b") @@ -48,6 +51,7 @@ os.getenv("RETRIEVAL_ADAPTIVE_K_GAP_THRESHOLD", "0.15") ) RETRIEVAL_TOKEN_BUDGET: int = int(os.getenv("RETRIEVAL_TOKEN_BUDGET", "4000")) +RETRIEVAL_RRF_K: int = int(os.getenv("RETRIEVAL_RRF_K", "60")) # Encryption key for API key storage (required for cloud LLM providers) 
ENCRYPTION_KEY: str | None = os.getenv("ENCRYPTION_KEY") diff --git a/ragitect/services/database/repositories/vector_repo.py b/ragitect/services/database/repositories/vector_repo.py index bfc6f3a..6c95d02 100644 --- a/ragitect/services/database/repositories/vector_repo.py +++ b/ragitect/services/database/repositories/vector_repo.py @@ -1,10 +1,11 @@ """Vector repository for similarity search operations""" -from sqlalchemy import func, select +from sqlalchemy import func, select, type_coerce, Float from ragitect.services.database.exceptions import NotFoundError from ragitect.services.database.models import Document, Workspace from ragitect.services.database.exceptions import ValidationError from ragitect.services.database.models import DocumentChunk +from ragitect.services.config import EMBEDDING_DIMENSION from uuid import UUID from sqlalchemy.ext.asyncio.session import AsyncSession import logging @@ -72,9 +73,10 @@ async def search_similar_chunks( ValidationError: If query_vector dimension is invalid NotFoundError: If workspace does not exist """ - if len(query_vector) != 768: + if len(query_vector) != EMBEDDING_DIMENSION: raise ValidationError( - "query_vector", f"Expected 768 dimensions, got {len(query_vector)}" + "query_vector", + f"Expected {EMBEDDING_DIMENSION} dimensions, got {len(query_vector)}", ) # verify if workspace exists @@ -114,6 +116,159 @@ async def search_similar_chunks( ) return chunks_with_score + async def hybrid_search( + self, + workspace_id: UUID, + query_vector: list[float], + query_text: str, + k: int = 10, + rrf_k: int = 60, + vector_weight: float = 1.0, + fts_weight: float = 1.0, + ) -> list[tuple[DocumentChunk, float]]: + """Search using hybrid RRF fusion of vector similarity and full-text search. + + Combines cosine similarity (pgvector) with PostgreSQL full-text search + via Reciprocal Rank Fusion (RRF) in a single CTE-based SQL query. + + RRF Formula: score = Σ(weight_i / (rrf_k + rank_i)) for each retrieval system. 
+ + When full-text search returns no matches, gracefully degrades to + vector-only ranking. + + Args: + workspace_id: Workspace to search within + query_vector: Query embedding vector (EMBEDDING_DIMENSION dims) + query_text: Original query text for full-text search + k: Number of results to return + rrf_k: RRF constant (default: 60). Higher values reduce rank impact. + vector_weight: Weight for vector search scores (default: 1.0) + fts_weight: Weight for full-text search scores (default: 1.0) + + Returns: + List of (DocumentChunk, rrf_score) tuples ordered by RRF score + (descending, higher = better). + + Raises: + ValidationError: If query_vector dimension is invalid + NotFoundError: If workspace does not exist + """ + if len(query_vector) != EMBEDDING_DIMENSION: + raise ValidationError( + "query_vector", + f"Expected {EMBEDDING_DIMENSION} dimensions, got {len(query_vector)}", + ) + + # Verify workspace exists + workspace = await self.session.get(Workspace, workspace_id) + if workspace is None: + raise NotFoundError("Workspace", workspace_id) + + oversample = k * 3 + + # CTE 1: Semantic search ranked by cosine distance (ascending = better) + distance_col = DocumentChunk.embedding.cosine_distance(query_vector).label( + "distance" + ) + semantic_cte = ( + select( + DocumentChunk.id.label("chunk_id"), + func.row_number() + .over(order_by=distance_col) + .label("semantic_rank"), + ) + .where(DocumentChunk.workspace_id == workspace_id) + .order_by(distance_col) + .limit(oversample) + .cte("semantic_search") + ) + + # CTE 2: Full-text search ranked by ts_rank_cd (descending = better) + fts_config = "english" + ts_vector = func.to_tsvector(fts_config, DocumentChunk.content) + ts_query = func.plainto_tsquery(fts_config, query_text) + + keyword_cte = ( + select( + DocumentChunk.id.label("chunk_id"), + func.row_number() + .over( + order_by=func.ts_rank_cd(ts_vector, ts_query).desc() + ) + .label("keyword_rank"), + ) + .where( + DocumentChunk.workspace_id == workspace_id, + 
ts_vector.op("@@")(ts_query), + ) + .order_by(func.ts_rank_cd(ts_vector, ts_query).desc()) + .limit(oversample) + .cte("keyword_search") + ) + + # Full outer join + RRF score computation + # Use coalesce: if a chunk only appears in one list, the other rank is absent + rrf_score = ( + func.coalesce( + type_coerce(vector_weight, Float) + / (rrf_k + semantic_cte.c.semantic_rank), + 0.0, + ) + + func.coalesce( + type_coerce(fts_weight, Float) + / (rrf_k + keyword_cte.c.keyword_rank), + 0.0, + ) + ).label("rrf_score") + + # Coalesce chunk_id from both CTEs for the join back to DocumentChunk + chunk_id_col = func.coalesce( + semantic_cte.c.chunk_id, keyword_cte.c.chunk_id + ).label("chunk_id") + + # Full outer join + fusion_query = ( + select(chunk_id_col, rrf_score) + .select_from( + semantic_cte.outerjoin( + keyword_cte, + semantic_cte.c.chunk_id == keyword_cte.c.chunk_id, + full=True, + ) + ) + .order_by(rrf_score.desc()) + .limit(k) + .subquery("fusion") + ) + + # Join back to DocumentChunk to get full model + final_stmt = ( + select(DocumentChunk, fusion_query.c.rrf_score) + .join(fusion_query, DocumentChunk.id == fusion_query.c.chunk_id) + .order_by(fusion_query.c.rrf_score.desc()) + ) + + result = await self.session.execute(final_stmt) + results = result.all() + + chunks_with_scores = [ + (chunk, float(rrf_score)) for chunk, rrf_score in results + ] + + logger.info( + f"Hybrid search: found {len(chunks_with_scores)} chunks " + + f"(workspace={workspace_id}, k={k}, rrf_k={rrf_k})" + ) + + if chunks_with_scores: + scores = [score for _, score in chunks_with_scores] + logger.debug( + f"RRF score range: [{min(scores):.6f}, {max(scores):.6f}], " + + f"mean: {sum(scores) / len(scores):.6f}" + ) + + return chunks_with_scores + async def search_similar_documents( self, workspace_id: UUID, @@ -141,9 +296,10 @@ async def search_similar_documents( ValidationError: If query_vector dimension is invalid NotFoundError: If workspace does not exist """ - if len(query_vector) != 
768: + if len(query_vector) != EMBEDDING_DIMENSION: raise ValidationError( - "query_vector", f"Expected 768 dimensions, got {len(query_vector)}" + "query_vector", + f"Expected {EMBEDDING_DIMENSION} dimensions, got {len(query_vector)}", ) workspace = await self.session.get(Workspace, workspace_id) @@ -220,9 +376,10 @@ async def get_chunk_by_document( Raises: ValidationError: if query_vector dimension is invalid """ - if len(query_vector) != 768: + if len(query_vector) != EMBEDDING_DIMENSION: raise ValidationError( - "query_vector", f"Expected 768 dimensions, got {len(query_vector)}" + "query_vector", + f"Expected {EMBEDDING_DIMENSION} dimensions, got {len(query_vector)}", ) distance_col = DocumentChunk.embedding.cosine_distance(query_vector).label( diff --git a/tests/agents/rag/test_tools.py b/tests/agents/rag/test_tools.py index b07ace7..a927125 100644 --- a/tests/agents/rag/test_tools.py +++ b/tests/agents/rag/test_tools.py @@ -6,8 +6,8 @@ import pytest from langchain_core.tools import BaseTool -from ragitect.agents.rag.state import ContextChunk from ragitect.agents.rag.tools import retrieve_documents, _retrieve_documents_impl +from ragitect.services.config import RETRIEVAL_RRF_K class TestRetrieveDocumentsToolDecorator: @@ -36,7 +36,6 @@ async def test_retrieve_documents_returns_list_of_context_chunks(self): # Setup mocks workspace_id = str(uuid4()) mock_vector_repo = AsyncMock() - mock_embeddings_model = MagicMock() # Mock chunk returned from vector repo mock_chunk = MagicMock() @@ -45,9 +44,9 @@ async def test_retrieve_documents_returns_list_of_context_chunks(self): mock_chunk.document_id = str(uuid4()) mock_chunk.embedding = [0.1] * 768 # Add embedding - # search_similar_chunks returns (chunk, distance) tuples - mock_vector_repo.search_similar_chunks.return_value = [ - (mock_chunk, 0.1), # distance 0.1 = similarity 0.9 + # hybrid_search returns (chunk, rrf_score) tuples (higher = better) + mock_vector_repo.hybrid_search.return_value = [ + (mock_chunk, 0.032), 
# RRF score ] # Mock embedding function @@ -72,7 +71,7 @@ async def test_retrieve_documents_returns_empty_list_when_no_results(self): """Tool should return empty list when no chunks found.""" workspace_id = str(uuid4()) mock_vector_repo = AsyncMock() - mock_vector_repo.search_similar_chunks.return_value = [] + mock_vector_repo.hybrid_search.return_value = [] mock_embed_fn = AsyncMock(return_value=[0.1] * 768) result = await _retrieve_documents_impl( @@ -85,8 +84,8 @@ async def test_retrieve_documents_returns_empty_list_when_no_results(self): assert result == [] - async def test_retrieve_documents_converts_distance_to_similarity(self): - """Tool should convert cosine distance to similarity score.""" + async def test_retrieve_documents_uses_rrf_score_directly(self): + """Tool should use RRF score directly (higher = better).""" workspace_id = str(uuid4()) mock_vector_repo = AsyncMock() mock_embed_fn = AsyncMock(return_value=[0.1] * 768) @@ -97,9 +96,10 @@ async def test_retrieve_documents_converts_distance_to_similarity(self): mock_chunk.document_id = str(uuid4()) mock_chunk.embedding = [0.2] * 768 # Add embedding - # Distance 0.15 should become similarity 0.85 - mock_vector_repo.search_similar_chunks.return_value = [ - (mock_chunk, 0.15), + # RRF score is used directly (no distance-to-similarity conversion) + rrf_score = 0.032 + mock_vector_repo.hybrid_search.return_value = [ + (mock_chunk, rrf_score), ] result = await _retrieve_documents_impl( @@ -110,14 +110,14 @@ async def test_retrieve_documents_converts_distance_to_similarity(self): top_k=10, ) - # Score should be 1.0 - distance = 0.85 - assert result[0]["score"] == pytest.approx(0.85, abs=0.001) + # Score should be the RRF score directly + assert result[0]["score"] == pytest.approx(rrf_score, abs=0.001) async def test_retrieve_documents_respects_top_k_parameter(self): - """Tool should pass top_k to vector repo.""" + """Tool should pass top_k to hybrid_search.""" workspace_id = str(uuid4()) mock_vector_repo = 
AsyncMock() - mock_vector_repo.search_similar_chunks.return_value = [] + mock_vector_repo.hybrid_search.return_value = [] mock_embed_fn = AsyncMock(return_value=[0.1] * 768) await _retrieve_documents_impl( @@ -128,16 +128,18 @@ async def test_retrieve_documents_respects_top_k_parameter(self): top_k=50, ) - # Verify search was called with k=50 - mock_vector_repo.search_similar_chunks.assert_awaited_once() - call_kwargs = mock_vector_repo.search_similar_chunks.call_args.kwargs + # Verify hybrid_search was called with k=50 and rrf_k from config + mock_vector_repo.hybrid_search.assert_awaited_once() + call_kwargs = mock_vector_repo.hybrid_search.call_args.kwargs assert call_kwargs.get("k") == 50 + assert call_kwargs.get("rrf_k") == RETRIEVAL_RRF_K + assert call_kwargs.get("query_text") == "test" async def test_retrieve_documents_calls_embed_fn(self): """Tool should call embed_fn with the query.""" workspace_id = str(uuid4()) mock_vector_repo = AsyncMock() - mock_vector_repo.search_similar_chunks.return_value = [] + mock_vector_repo.hybrid_search.return_value = [] mock_embed_fn = AsyncMock(return_value=[0.1] * 768) await _retrieve_documents_impl( @@ -163,8 +165,9 @@ async def test_retrieve_documents_structure_matches_context_chunk(self): mock_chunk.document_id = doc_id mock_chunk.embedding = [0.3] * 768 # Add embedding - mock_vector_repo.search_similar_chunks.return_value = [ - (mock_chunk, 0.05), + rrf_score = 0.032 + mock_vector_repo.hybrid_search.return_value = [ + (mock_chunk, rrf_score), ] result = await _retrieve_documents_impl( @@ -189,7 +192,7 @@ async def test_retrieve_documents_structure_matches_context_chunk(self): assert chunk["content"] == "The content of the chunk" assert chunk["document_id"] == doc_id assert chunk["title"] == "" # Title populated later by graph node - assert chunk["score"] == pytest.approx(0.95, abs=0.001) + assert chunk["score"] == pytest.approx(rrf_score, abs=0.001) assert len(chunk["embedding"]) == 768 # Verify embedding preserved 
async def test_context_chunk_preserves_embeddings(self): @@ -219,9 +222,9 @@ async def test_context_chunk_preserves_embeddings(self): mock_chunk_2.document_id = str(uuid4()) mock_chunk_2.embedding = expected_embedding_2 - mock_vector_repo.search_similar_chunks.return_value = [ - (mock_chunk_1, 0.1), - (mock_chunk_2, 0.2), + mock_vector_repo.hybrid_search.return_value = [ + (mock_chunk_1, 0.032), + (mock_chunk_2, 0.016), ] result = await _retrieve_documents_impl( diff --git a/tests/services/database/repositories/test_vector_repo.py b/tests/services/database/repositories/test_vector_repo.py index c28f599..862af84 100644 --- a/tests/services/database/repositories/test_vector_repo.py +++ b/tests/services/database/repositories/test_vector_repo.py @@ -99,3 +99,123 @@ async def test_batch_search_chunks(self, repo, mock_session): assert len(results) == 2 assert len(results[0]) == 1 assert len(results[1]) == 1 + + +class TestHybridSearch: + @pytest.fixture + def repo(self, mock_session): + return VectorRepository(mock_session) + + async def test_hybrid_search_basic(self, repo, mock_session): + """Test hybrid search returns correctly ordered results.""" + workspace_id = uuid4() + mock_session.get.return_value = Workspace(id=workspace_id) + + chunk1 = DocumentChunk(id=uuid4(), content="asyncio best practices") + chunk2 = DocumentChunk(id=uuid4(), content="python basics") + rrf_score1 = 0.032 + rrf_score2 = 0.016 + + mock_result = MagicMock() + mock_result.all.return_value = [ + (chunk1, rrf_score1), + (chunk2, rrf_score2), + ] + mock_session.execute.return_value = mock_result + + results = await repo.hybrid_search( + workspace_id=workspace_id, + query_vector=[0.1] * 768, + query_text="asyncio", + k=10, + ) + + assert len(results) == 2 + assert results[0][0] == chunk1 + assert results[0][1] == rrf_score1 + assert results[1][0] == chunk2 + assert results[1][1] == rrf_score2 + # Higher score = better + assert results[0][1] > results[1][1] + + async def 
test_hybrid_search_invalid_vector(self, repo, mock_session): + """Test that wrong vector dimension raises ValidationError.""" + with pytest.raises(ValidationError): + await repo.hybrid_search( + workspace_id=uuid4(), + query_vector=[0.1] * 10, # Wrong dimension + query_text="test", + ) + + async def test_hybrid_search_workspace_not_found(self, repo, mock_session): + """Test that missing workspace raises NotFoundError.""" + mock_session.get.return_value = None + + with pytest.raises(NotFoundError): + await repo.hybrid_search( + workspace_id=uuid4(), + query_vector=[0.1] * 768, + query_text="test", + ) + + async def test_hybrid_search_empty_results(self, repo, mock_session): + """Test that no matches returns empty list.""" + workspace_id = uuid4() + mock_session.get.return_value = Workspace(id=workspace_id) + + mock_result = MagicMock() + mock_result.all.return_value = [] + mock_session.execute.return_value = mock_result + + results = await repo.hybrid_search( + workspace_id=workspace_id, + query_vector=[0.1] * 768, + query_text="nonexistent query", + ) + + assert results == [] + + async def test_hybrid_search_vector_only_results(self, repo, mock_session): + """Test graceful degradation when FTS returns nothing.""" + workspace_id = uuid4() + mock_session.get.return_value = Workspace(id=workspace_id) + + # When FTS has no matches, only vector results come through + chunk = DocumentChunk(id=uuid4(), content="semantic content only") + # Score from vector-only RRF: 1/(60+1) ≈ 0.0164 + rrf_score = 1.0 / (60 + 1) + + mock_result = MagicMock() + mock_result.all.return_value = [(chunk, rrf_score)] + mock_session.execute.return_value = mock_result + + results = await repo.hybrid_search( + workspace_id=workspace_id, + query_vector=[0.1] * 768, + query_text="completely unrelated terms xyz123", + ) + + assert len(results) == 1 + assert results[0][0] == chunk + assert results[0][1] > 0 # Still has vector score + + async def test_hybrid_search_default_rrf_k(self, repo, 
mock_session): + """Test that default rrf_k parameter is 60.""" + workspace_id = uuid4() + mock_session.get.return_value = Workspace(id=workspace_id) + + mock_result = MagicMock() + mock_result.all.return_value = [] + mock_session.execute.return_value = mock_result + + # Call without explicit rrf_k — should use default 60 + import inspect + + sig = inspect.signature(repo.hybrid_search) + assert sig.parameters["rrf_k"].default == 60 + + await repo.hybrid_search( + workspace_id=workspace_id, + query_vector=[0.1] * 768, + query_text="test", + ) diff --git a/tests/services/database/repositories/test_vector_repo_integration.py b/tests/services/database/repositories/test_vector_repo_integration.py index 1f48063..00d8424 100644 --- a/tests/services/database/repositories/test_vector_repo_integration.py +++ b/tests/services/database/repositories/test_vector_repo_integration.py @@ -6,6 +6,7 @@ """ import pytest +from sqlalchemy import text from ragitect.services.database import get_session from ragitect.services.database.repositories.vector_repo import VectorRepository from ragitect.services.database.repositories.workspace_repo import WorkspaceRepository @@ -89,3 +90,167 @@ async def test_batch_search_chunks_integration(self, clean_database): assert len(results) == 2 assert results[0][0][0].content == "c1" assert results[1][0][0].content == "c1" + + +class TestHybridSearchIntegration: + """Integration tests for hybrid_search with real PostgreSQL.""" + + @staticmethod + async def _ensure_fts_index(session): + """Create the GIN FTS index if it doesn't exist (not covered by metadata.create_all).""" + await session.execute( + text(""" + CREATE INDEX IF NOT EXISTS ix_document_chunks_content_fts + ON document_chunks + USING GIN (to_tsvector('english', content)) + """) + ) + await session.commit() + + async def test_hybrid_search_combines_vector_and_keyword(self, clean_database): + """Test that hybrid search combines both vector and keyword matches.""" + async with get_session() as 
session: + await self._ensure_fts_index(session) + + ws_repo = WorkspaceRepository(session) + doc_repo = DocumentRepository(session) + vec_repo = VectorRepository(session) + + workspace = await ws_repo.create("Hybrid WS") + doc = await doc_repo.create(workspace.id, "hybrid.pdf", "content") + + # Chunk 1: keyword "asyncio" + close vector + vec1 = [0.0] * 768 + vec1[0] = 1.0 + # Chunk 2: keyword "pgvector" + different vector + vec2 = [0.0] * 768 + vec2[1] = 1.0 + # Chunk 3: no keyword match, orthogonal vector + vec3 = [0.0] * 768 + vec3[2] = 1.0 + + await doc_repo.add_chunks( + doc.id, + [ + ("asyncio event loop handling in python", vec1, {}), + ("pgvector extension for postgresql", vec2, {}), + ("general documentation without special terms", vec3, {}), + ], + ) + + # Query: "asyncio" — should match chunk1 by keyword AND vector + query_vec = [0.0] * 768 + query_vec[0] = 1.0 # Close to vec1 + + results = await vec_repo.hybrid_search( + workspace_id=workspace.id, + query_vector=query_vec, + query_text="asyncio", + k=10, + ) + + assert len(results) >= 1 + # Chunk 1 should be top result (keyword + vector match) + assert results[0][0].content == "asyncio event loop handling in python" + assert results[0][1] > 0 + + async def test_hybrid_search_keyword_boost(self, clean_database): + """Test that keyword match boosts a chunk's RRF score over vector-only.""" + async with get_session() as session: + await self._ensure_fts_index(session) + + ws_repo = WorkspaceRepository(session) + doc_repo = DocumentRepository(session) + vec_repo = VectorRepository(session) + + workspace = await ws_repo.create("Keyword Boost WS") + doc = await doc_repo.create(workspace.id, "boost.pdf", "content") + + # Chunk 1: semantically similar (close vector) but NO keyword match + vec1 = [0.0] * 768 + vec1[0] = 0.9 + vec1[1] = 0.1 + + # Chunk 2: less similar vector BUT has exact keyword match + vec2 = [0.0] * 768 + vec2[0] = 0.7 + vec2[2] = 0.3 + + await doc_repo.add_chunks( + doc.id, + [ + ("general 
programming concepts overview", vec1, {}), + ("asyncio coroutine patterns for concurrent programming", vec2, {}), + ], + ) + + # Query vector close to vec1, but query text matches chunk2 + query_vec = [0.0] * 768 + query_vec[0] = 1.0 + + results = await vec_repo.hybrid_search( + workspace_id=workspace.id, + query_vector=query_vec, + query_text="asyncio coroutine", + k=10, + ) + + assert len(results) == 2 + # Chunk 2 should be boosted above chunk 1 due to keyword match + contents = [r[0].content for r in results] + assert "asyncio coroutine patterns for concurrent programming" in contents + + async def test_hybrid_search_no_keyword_matches(self, clean_database): + """Test graceful degradation to vector-only when no FTS matches.""" + async with get_session() as session: + await self._ensure_fts_index(session) + + ws_repo = WorkspaceRepository(session) + doc_repo = DocumentRepository(session) + vec_repo = VectorRepository(session) + + workspace = await ws_repo.create("No Keyword WS") + doc = await doc_repo.create(workspace.id, "nofts.pdf", "content") + + vec1 = [0.0] * 768 + vec1[0] = 1.0 + + await doc_repo.add_chunks( + doc.id, + [("some content about programming", vec1, {})], + ) + + query_vec = [0.0] * 768 + query_vec[0] = 1.0 + + # Query text that won't match any FTS + results = await vec_repo.hybrid_search( + workspace_id=workspace.id, + query_vector=query_vec, + query_text="xyznonexistent123", + k=10, + ) + + # Should still return vector results (graceful degradation) + assert len(results) == 1 + assert results[0][0].content == "some content about programming" + assert results[0][1] > 0 # Has a valid RRF score from vector ranking + + async def test_hybrid_search_empty_workspace(self, clean_database): + """Test hybrid search on empty workspace returns empty list.""" + async with get_session() as session: + await self._ensure_fts_index(session) + + ws_repo = WorkspaceRepository(session) + vec_repo = VectorRepository(session) + + workspace = await 
ws_repo.create("Empty Hybrid WS") + + results = await vec_repo.hybrid_search( + workspace_id=workspace.id, + query_vector=[0.1] * 768, + query_text="anything", + k=10, + ) + + assert results == []