Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ repos:
- id: check-yaml
- id: detect-private-key
- repo: https://github.com/tox-dev/pyproject-fmt
rev: "v2.6.0"
rev: "v2.11.1"
hooks:
- id: pyproject-fmt
- repo: https://github.com/citation-file-format/cffconvert
Expand Down Expand Up @@ -39,12 +39,12 @@ repos:
- id: yamllint
exclude: pre-commit-config.yaml
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.13.0"
rev: "v0.14.8"
hooks:
- id: ruff-format
- id: ruff-check
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
rev: v1.7.9
hooks:
- id: actionlint
- repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
Expand Down
4 changes: 2 additions & 2 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ COPY --exclude=.venv ./packages/manugen-ai/ /packages/manugen-ai/
ENV VIRTUAL_ENV=/app/.venv
ENV UV_LINK_MODE=copy
ENV UV_NO_SYNC=1
COPY ./backend/pyproject.toml ./
COPY ./backend/pyproject.toml ./backend/uv.lock ./
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --cache-dir /root/.cache/uv
uv sync --frozen --cache-dir /root/.cache/uv

# copy backend contents into working dir
COPY --exclude=.venv ./backend/ .
Expand Down
4 changes: 4 additions & 0 deletions packages/manugen-ai/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[pytest]
addopts = -m "not slow"
markers =
slow: marks tests as slow (deselected by default)
9 changes: 3 additions & 6 deletions packages/manugen-ai/src/manugen_ai/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import duckdb
import numpy as np
import pyarrow as pa
from duckdb.typing import VARCHAR

from manugen_ai.utils import download_file_if_not_available

Expand Down Expand Up @@ -275,9 +276,7 @@ def create_withdrarxiv_embeddings(
"""
)

conn.create_function(
"embed", embed, [duckdb.typing.VARCHAR], f"FLOAT[{get_embedding_size()}]"
)
conn.create_function("embed", embed, [VARCHAR], f"FLOAT[{get_embedding_size()}]")

# Batch-compute embeddings for every abstract
batch_size = 100
Expand Down Expand Up @@ -373,9 +372,7 @@ def search_withdrarxiv_embeddings(query: str, top_k: int = 2):
)

conn = duckdb.connect(target_db)
conn.create_function(
"embed", embed, [duckdb.typing.VARCHAR], f"FLOAT[{get_embedding_size()}]"
)
conn.create_function("embed", embed, [VARCHAR], f"FLOAT[{get_embedding_size()}]")

q = {"q": query, "k": top_k}
df = (
Expand Down
80 changes: 80 additions & 0 deletions packages/manugen-ai/tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
Tests for the 'data' module, mostly consisting of embedding models.
"""

import json
from pathlib import Path

import pytest
from manugen_ai.data import create_withdrarxiv_embeddings, search_withdrarxiv_embeddings


@pytest.mark.slow
def test_create_withdrarxiv_embedding():
    """
    Test creating the withdrarxiv embeddings database using a table of retracted
    papers from arXiv and an embedding model, either via Gemini's API or via
    a local Flag embedding model.

    The test is marked 'slow' and thus excluded from the default test suite
    for a few reasons:
    - it takes ~2 minutes to run using the Gemini embeddings API, which incurs
      API usage costs
    - it takes a lot longer with the flag embeddings, which require
      downloading a large model file, BAAI/bge-m3, from Hugging Face.
    - the withdrarxiv dataset on which it relies can't be included in the repo
      and must be manually downloaded; see create_withdrarxiv_embeddings()'s
      docstring for details.

    Depending on the value of the env var USE_GEMINI_EMBEDDINGS, this test will
    use either Google's Gemini embeddings via Google's API (if set to "1") or
    the "flag" embedding model from Hugging Face. The Gemini embeddings require
    that the GOOGLE_API_KEY env var be set to a valid API key.

    The Flag embedding model uses the FLAGEMBEDDING_MODEL_OR_PATH env var
    to determine where the model is already located, or if given a model
    what model to download. By default it'll download "BAAI/bge-m3" from
    Hugging Face.
    """
    datafiles_dir = (
        Path(__file__).resolve().parent.parent / "src" / "manugen_ai" / "data"
    )
    target_db = "withdrarxiv_test_embeddings.duckdb"
    # compute the on-disk path once, up front, so the assertion below and the
    # cleanup in `finally` are guaranteed to refer to the same file
    full_db_path = datafiles_dir / target_db

    try:
        # construct the db; we're mostly seeing if this runs without throwing anything
        resulting_db_path = create_withdrarxiv_embeddings(target_db)

        # check that the resulting db path is what we passed in
        assert resulting_db_path == target_db

        # ensure the db file was actually created
        assert full_db_path.exists()

    finally:
        # always remove the generated db, even if the test failed partway through
        full_db_path.unlink(missing_ok=True)


def test_search_withdrarxiv_embeddings():
    """
    Test searching the withdrarxiv embeddings.

    As of 2025-12-03, the retraction db produced these results for the query
    "What is the role of quantum entanglement in quantum computing?":
    [
        {"related_retraction_reasons":"Just because interleaving bisimilarity based ACP cannot be reversed, some conclusions of this paper are wrong and cannot be remedied, I beg to withdraw this paper"},
        {"related_retraction_reasons":"a wrong formula"},
        {"related_retraction_reasons":"The paper is withdrawn because of many flaws in the manuscript"}
    ]
    """
    # run a semantic search and decode the JSON payload it returns
    query = "What is the role of quantum entanglement in quantum computing?"
    raw_results = search_withdrarxiv_embeddings(query, top_k=3)
    hits = json.loads(raw_results)

    # we asked for three hits, and every hit must carry a retraction reason
    assert len(hits) == 3
    assert all("related_retraction_reasons" in hit for hit in hits)
Loading