Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# .dockerignore
venv/
assets/
qdrant_data/
redpanda_data/
__pycache__/
*.pyc
.git/
.env
*.db
98 changes: 87 additions & 11 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,28 +1,104 @@
# Python
__pycache__/
*.pyc
*.pyo
*.pyd
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environments
venv/
.venv/
env/
.env
ENV/
env.bak/
venv.bak/

# Environment variables
.env
.env.local
.env.*.local

# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
.AppleDouble
.LSOverride

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

.vscode/
.idea/


# Project-specific data directories
assets/videos/
*.mp4


qdrant_data/
redpanda_data/


# Logs
*.log
logs/
*.log.*

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
.hypothesis/
.mypy_cache/
.dmypy.json
dmypy.json

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# pip
pip-log.txt
pip-delete-this-directory.txt

# Model cache (HuggingFace, etc.)
.cache/
models/
*.pt
*.pth
*.ckpt

# Temporary files
*.tmp
*.temp
*.bak
*.swp
*~

# Docker
# NOTE: .dockerignore should be committed to the repository — do not git-ignore it.
37 changes: 22 additions & 15 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,25 +1,32 @@
# Use Python 3.12 to match the project's requires-python
FROM python:3.12-slim

# 1. Install system dependencies
#    - libgl1 / libglib2.0-0: runtime libs for OpenCV-style image code
#    - ffmpeg: audio extraction in app/audio.py
#    - curl: container healthchecks
RUN apt-get update && apt-get install -y \
    build-essential \
    libgl1 \
    libglib2.0-0 \
    ffmpeg \
    curl \
    && rm -rf /var/lib/apt/lists/*

# 2. Install uv
#    Pinned to a specific version (not :latest) so builds are reproducible.
COPY --from=ghcr.io/astral-sh/uv:0.5.11 /uv /bin/uv

# 3. Set up the application
WORKDIR /app

# 4. Install dependencies first (separate layer) to leverage Docker caching:
#    dependency install only re-runs when pyproject.toml changes.
# NOTE(review): without a committed uv.lock + `uv sync --frozen`, resolved
# dependency versions can drift between builds — consider restoring the lock.
COPY pyproject.toml ./
RUN uv sync

# CRITICAL: put the uv-created virtual environment on the PATH so that
# `uvicorn` / `python` resolve to the project environment.
ENV PATH="/app/.venv/bin:$PATH"

# 5. Copy the code
COPY . .

# 6. Default command (docker-compose 'command' may override this)
CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "8000"]
80 changes: 80 additions & 0 deletions app/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import logging
import os
import uuid

import ffmpeg
from fastembed import TextEmbedding
from faster_whisper import WhisperModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load models once at import time so every call to transcribe_chunk()
# reuses the same instances.
try:
    # Use 'tiny' or 'base' for speed during debugging
    WHISPER = WhisperModel("base", device="cpu", compute_type="int8")
    EMBEDDER = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
except Exception as e:
    logger.critical(f"Failed to load Audio models: {e}")
    # Fail fast: swallowing the error would leave WHISPER/EMBEDDER
    # undefined and transcribe_chunk() would later crash with NameError.
    raise


def transcribe_chunk(
    video_path: str, video_id: str, start_time: float, end_time: float
) -> list:
    """Transcribe the audio of one video chunk and embed the text.

    Extracts [start_time, end_time) seconds of audio from *video_path* as
    mono 16 kHz WAV, transcribes it with Whisper, embeds each non-trivial
    segment, and returns a list of point dicts (``id``/``vector``/``payload``)
    ready for upsert. Returns ``[]`` on any failure (missing file, ffmpeg
    error, transcription error) — callers treat failures as "no points".
    """
    duration = end_time - start_time
    # Include a uuid in the temp name: a name built only from video_id and
    # start_time would collide if two workers ever processed the same chunk
    # concurrently, and one worker's `finally` would delete the other's file.
    temp_audio = f"/tmp/temp_{video_id}_{int(start_time)}_{uuid.uuid4().hex}.wav"
    points = []

    # 1. Check the file exists before asking FFmpeg to touch it
    if not os.path.exists(video_path):
        logger.error(f"AUDIO ERROR: File not found at {video_path}")
        return []

    try:
        # 2. Extract audio: mono (ac=1), 16 kHz (ar=16000) — Whisper's input
        (
            ffmpeg.input(video_path, ss=start_time, t=duration)
            .output(temp_audio, ac=1, ar=16000, loglevel="error")
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )

        # 3. Transcribe; drop near-empty segments (silence / noise blips)
        segments, _ = WHISPER.transcribe(temp_audio)
        valid_segments = [s for s in segments if len(s.text.strip()) > 5]

        # 4. Embed & package
        if valid_segments:
            texts = [s.text.strip() for s in valid_segments]
            embeddings = list(EMBEDDER.embed(texts))

            for i, seg in enumerate(valid_segments):
                points.append(
                    {
                        "id": str(uuid.uuid4()),
                        "vector": embeddings[i].tolist(),
                        "payload": {
                            "video_id": video_id,
                            "text": seg.text.strip(),
                            # Segment times are relative to the extracted
                            # chunk; offset back to absolute video time.
                            "timestamp": start_time + seg.start,
                            "end_timestamp": start_time + seg.end,
                            "type": "audio_transcript",
                        },
                    }
                )

        return points

    except ffmpeg.Error as e:
        # Surface FFmpeg's own stderr — the Python exception alone is useless
        error_message = e.stderr.decode("utf8")
        logger.error(f"FFMPEG FAILED: {error_message}")
        return []

    except Exception as e:
        logger.error(f"General Audio Error: {e}")
        return []

    finally:
        # Always clean up the temp WAV, success or failure
        if os.path.exists(temp_audio):
            os.remove(temp_audio)
14 changes: 10 additions & 4 deletions app/database.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
import datetime
import os
import sqlite3

DB_NAME = "vidsearch.db"
# CRITICAL: Point to the shared volume directory
# If we don't do this, API and Manager will have different databases!
DB_FOLDER = "/app/assets"
DB_NAME = os.path.join(DB_FOLDER, "vidsearch.db")


def init_db():
"""Creates the table if it doesn't exist"""
# Ensure the directory exists first
os.makedirs(DB_FOLDER, exist_ok=True)

conn = sqlite3.connect(DB_NAME)
c = conn.cursor()
c.execute("""
Expand All @@ -27,7 +34,7 @@ def add_video(job_id, url, title="Unknown Video"):
c = conn.cursor()
c.execute(
"INSERT INTO videos (id, title, url, status, created_at) VALUES (?, ?, ?, ?, ?)",
(job_id, title, url, "processing", datetime.datetime.now()),
(job_id, title, url, "queued", datetime.datetime.now()),
)
conn.commit()
conn.close()
Expand All @@ -43,12 +50,11 @@ def update_status(job_id, status):

def get_all_videos():
    """Return every row of `videos`, newest first, as sqlite3.Row objects.

    Rows support dict-style access (row["title"]) via the row factory and
    remain valid after the connection is closed.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        # Dict-like access for callers instead of positional tuples
        conn.row_factory = sqlite3.Row
        c = conn.cursor()
        c.execute("SELECT * FROM videos ORDER BY created_at DESC")
        # fetchall() materializes the rows, so closing afterwards is safe
        return c.fetchall()
    finally:
        # The original leaked the connection; the other helpers in this
        # module close theirs, so always close here too.
        conn.close()


# Initialize immediately when this module is imported
# Initialize immediately
init_db()
Loading