Skip to content

Commit

Permalink
fix: Conditional parameters in pgvector.
Browse files (browse the repository at this point in the history)
  • Loading branch information
undo76 committed Dec 11, 2024
1 parent 605307d commit 93e0495
Showing 1 changed file with 45 additions and 16 deletions.
61 changes: 45 additions & 16 deletions src/raglite/_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import numpy as np
from markdown_it import MarkdownIt
from packaging import version
from pydantic import ConfigDict
from sqlalchemy.engine import Engine, make_url
from sqlmodel import JSON, Column, Field, Relationship, Session, SQLModel, create_engine, text
Expand Down Expand Up @@ -287,6 +288,18 @@ def from_chunks(
)


def _get_pgvector_version(session: Session) -> str | None:
    """Get the version of the pgvector extension installed in the database.

    This function is intentionally not memoized with ``lru_cache``: the cache key
    would be the ``Session`` object itself, so a call with a fresh session could
    never hit the cache, while the cache would keep a reference to the previous
    session after it has been closed.

    Parameters
    ----------
    session
        An open session on the PostgreSQL database whose ``pg_extension`` catalog
        is queried.

    Returns
    -------
    str | None
        The ``extversion`` value of the ``vector`` extension, or None if the query
        returns no row (i.e. the extension is not installed).
    """
    result = session.execute(text("SELECT extversion FROM pg_extension WHERE extname = 'vector'"))
    # scalar() returns the first column of the first row, or None when there is no row.
    return result.scalar()


@lru_cache(maxsize=1)
def create_database_engine(config: RAGLiteConfig | None = None) -> Engine:
"""Create a database engine and initialize it."""
Expand Down Expand Up @@ -331,53 +344,69 @@ def create_database_engine(config: RAGLiteConfig | None = None) -> Engine:
with Session(engine) as session:
metrics = {"cosine": "cosine", "dot": "ip", "euclidean": "l2", "l1": "l1", "l2": "l2"}
session.execute(
text("""
CREATE INDEX IF NOT EXISTS keyword_search_chunk_index ON chunk USING GIN (to_tsvector('simple', body));
""")
text(
"""
CREATE INDEX IF NOT EXISTS keyword_search_chunk_index ON chunk USING GIN (to_tsvector('simple', body));
"""
)
)
session.execute(
text(f"""
base_sql = f"""
CREATE INDEX IF NOT EXISTS vector_search_chunk_index ON chunk_embedding
USING hnsw (
(embedding::halfvec({embedding_dim}))
halfvec_{metrics[config.vector_search_index_metric]}_ops
);
SET hnsw.ef_search = {20 * 4 * 8};
SET hnsw.iterative_scan = {'relaxed_order' if config.reranker else 'strict_order'};
""")
)
"""
# Add iterative scan if version >= 0.8.0
pgvector_version = _get_pgvector_version(session)
if pgvector_version and version.parse(pgvector_version) >= version.parse("0.8.0"):
sql = f"""{base_sql};
SET hnsw.iterative_scan = {'relaxed_order' if config.reranker else 'strict_order'};
"""
else:
sql = f"{base_sql};"
session.execute(text(sql))
session.commit()
elif db_backend == "sqlite":
# Create a virtual table for keyword search on the chunk table.
# We use the chunk table as an external content table [1] to avoid duplicating the data.
# [1] https://www.sqlite.org/fts5.html#external_content_tables
with Session(engine) as session:
session.execute(
text("""
text(
"""
CREATE VIRTUAL TABLE IF NOT EXISTS keyword_search_chunk_index USING fts5(body, content='chunk', content_rowid='rowid');
""")
"""
)
)
session.execute(
text("""
text(
"""
CREATE TRIGGER IF NOT EXISTS keyword_search_chunk_index_auto_insert AFTER INSERT ON chunk BEGIN
INSERT INTO keyword_search_chunk_index(rowid, body) VALUES (new.rowid, new.body);
END;
""")
"""
)
)
session.execute(
text("""
text(
"""
CREATE TRIGGER IF NOT EXISTS keyword_search_chunk_index_auto_delete AFTER DELETE ON chunk BEGIN
INSERT INTO keyword_search_chunk_index(keyword_search_chunk_index, rowid, body) VALUES('delete', old.rowid, old.body);
END;
""")
"""
)
)
session.execute(
text("""
text(
"""
CREATE TRIGGER IF NOT EXISTS keyword_search_chunk_index_auto_update AFTER UPDATE ON chunk BEGIN
INSERT INTO keyword_search_chunk_index(keyword_search_chunk_index, rowid, body) VALUES('delete', old.rowid, old.body);
INSERT INTO keyword_search_chunk_index(rowid, body) VALUES (new.rowid, new.body);
END;
""")
"""
)
)
session.commit()
return engine

0 comments on commit 93e0495

Please sign in to comment.