Skip to content
Merged
111 changes: 111 additions & 0 deletions scripts/migrate_vector_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""
Script to migrate/re-ingest chunks into the configured vector database.
It reads the `chunk_index` records from the SQL database and pushes
them to the Vector Store, using the current embedding model.
This is useful when changing embedding models or vector databases.
"""

import os
import sys
from datetime import datetime
from typing import cast, Dict, Any
from uuid import UUID

# Add project root to sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from sqlalchemy.orm import Session
from src.infrastructure.repositories.sql.connector import Session as DBSessionFactory
from src.infrastructure.repositories.sql.models.chunk_index import ChunkIndexModel
from src.infrastructure.services.model_loader_service import ModelLoaderService
from src.infrastructure.repositories.vector.models.chunk_model import ChunkModel
from src.config.settings import Settings
from src.config.logger import Logger
from src.presentation.api.dependencies import get_vector_repository

# Project logger instance used by the migration routine below for progress and error messages.
logger = Logger()


def _build_chunk_document(
    chunk_sql: "ChunkIndexModel", embedding_model_name: str
) -> "ChunkModel":
    """Convert one SQL ``chunk_index`` row into a ``ChunkModel`` for the vector store.

    Args:
        chunk_sql: Row loaded from the SQL database.
        embedding_model_name: Name of the embedding model recorded on the
            resulting document.

    Returns:
        A ``ChunkModel`` carrying the row's fields plus a private copy of its
        ``extra`` metadata.
    """
    from datetime import timezone

    # Copy before adding keys: mutating the dict owned by the ORM row can mark
    # the row as dirty and leak side effects into the session.
    extra_data: Dict[str, Any] = (
        dict(chunk_sql.extra) if isinstance(chunk_sql.extra, dict) else {}
    )
    if chunk_sql.vector_store_type:
        extra_data["original_vector_store_type"] = chunk_sql.vector_store_type

    # The column is expected to be timezone-aware and populated; if a value is
    # ever missing, fall back to an aware UTC timestamp rather than a naive one
    # so vector metadata never mixes naive and aware datetimes.
    created_at = chunk_sql.created_at or datetime.now(timezone.utc)

    return ChunkModel(
        id=cast(UUID, chunk_sql.id),
        job_id=cast(UUID, chunk_sql.job_id),
        content_source_id=cast(UUID, chunk_sql.content_source_id),
        source_type=str(chunk_sql.source_type or "UNKNOWN"),
        external_source=cast(str, chunk_sql.external_source),
        subject_id=cast(UUID, chunk_sql.subject_id),
        index=cast(int, chunk_sql.index),
        content=cast(str, chunk_sql.content),
        tokens_count=cast(int, chunk_sql.tokens_count),
        language=cast(str, chunk_sql.language),
        embedding_model=embedding_model_name,
        created_at=cast(datetime, created_at),
        version_number=cast(int, chunk_sql.version_number),
        extra=extra_data,
    )


def migrate_vector_db(batch_size: int = 100) -> None:
    """Re-ingest every ``chunk_index`` row into the configured vector store.

    Rows are read from the SQL database in batches, converted to
    ``ChunkModel`` documents and handed to the vector repository, which
    embeds them with the currently configured embedding model.

    NOTE: the SQL chunk id is reused as the vector document id. On backends
    that insert rather than upsert, re-running against a non-empty collection
    may fail with duplicate-id errors or create duplicates, so clear the
    target collection first when re-running.

    Args:
        batch_size: Number of rows read and uploaded per batch. Must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        SystemExit: With status 1 when the vector repository is not ready or
            the migration fails part-way, so callers and shells can detect
            the failure.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    settings = Settings()

    embedding_model_name = settings.model_embedding.name
    logger.info(f"Initializing Model Loader Service with model: {embedding_model_name}")
    model_loader = ModelLoaderService(model_name=embedding_model_name)
    model_loader.load_model()

    logger.info("Initializing Vector Repository...")
    # This automatically instantiates the vector repo for the correct type defined in .env
    vector_repo = get_vector_repository(settings=settings, model_loader=model_loader)

    if not vector_repo.is_ready():
        logger.error("Vector Repository is not ready. Aborting.")
        sys.exit(1)

    db: Session = DBSessionFactory()
    try:
        # Only used for progress reporting; the loop itself stops on an empty page.
        total_chunks = db.query(ChunkIndexModel).count()
        logger.info(f"Total chunks to migrate: {total_chunks}")

        offset = 0
        while True:
            chunk_models_sql = (
                db.query(ChunkIndexModel)
                # created_at alone is not unique; the id tie-breaker keeps the
                # page order stable so no row is skipped or repeated.
                .order_by(ChunkIndexModel.created_at, ChunkIndexModel.id)
                .offset(offset)
                .limit(batch_size)
                .all()
            )
            if not chunk_models_sql:
                break

            documents = [
                _build_chunk_document(chunk_sql, embedding_model_name)
                for chunk_sql in chunk_models_sql
            ]

            logger.info(
                f"Uploading batch of {len(documents)} chunks to vector db... (Progress: {offset + len(documents)} / {total_chunks})"
            )

            # create_documents will internally call the EmbeddingService for the texts and save them
            vector_repo.create_documents(documents)

            offset += len(documents)
            # Detach the rows of this batch so the session identity map does
            # not grow without bound during a long migration.
            db.expunge_all()

        logger.info("Vector DB migration finished successfully!")

    except Exception as e:
        logger.error(f"Migration failed: {e}")
        # Exit non-zero instead of returning normally, otherwise a failed
        # migration looks like a success to the calling shell.
        sys.exit(1)
    finally:
        db.close()


# Script entry point: run the migration with the default batch size when executed directly.
if __name__ == "__main__":
    migrate_vector_db()
Loading