Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# .dockerignore
venv/
assets/
qdrant_data/
redpanda_data/
__pycache__/
*.pyc
.git/
.env
qdrant_data/
redpanda_data/
*.db
98 changes: 87 additions & 11 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,28 +1,104 @@
# Python
__pycache__/
*.pyc
*.pyo
*.pyd
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environments
venv/
.venv/
env/
.env
ENV/
env.bak/
venv.bak/

# Environment variables
.env
.env.local
.env.*.local

# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
.AppleDouble
.LSOverride

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

.vscode/
.idea/


# Project-specific data directories
assets/videos/
*.mp4


qdrant_data/
redpanda_data/


# Logs
*.log
logs/
*.log.*

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
.hypothesis/
.mypy_cache/
.dmypy.json
dmypy.json

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# pip
pip-log.txt
pip-delete-this-directory.txt

# Model cache (HuggingFace, etc.)
.cache/
models/
*.pt
*.pth
*.ckpt

# Temporary files
*.tmp
*.temp
*.bak
*.swp
*~

# Docker
.dockerignore
37 changes: 22 additions & 15 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,25 +1,32 @@
# Use a slim Python image (lightweight and fast)
# If you need GPU support later, change this to an nvidia/cuda image or pytorch/pytorch
FROM python:3.10-slim
# Use Python 3.12 to match your project requirements
FROM python:3.12-slim

# Set the working directory to match the volume mount in your compose file
WORKDIR /app

# Install system dependencies (often needed for AI/Inference libraries like OpenCV or numpy)
# 1. Install System Dependencies
RUN apt-get update && apt-get install -y \
build-essential \
libgl1 \
libglib2.0-0 \
ffmpeg \
curl \
&& rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker caching
COPY requirements.txt .
# 2. Install uv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
Copy link

@cubic-dev-ai cubic-dev-ai bot Dec 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Pin the uv image to a specific version instead of using :latest for reproducible and secure builds.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At Dockerfile, line 13:

<comment>Pin the `uv` image to a specific version instead of using `:latest` for reproducible and secure builds.</comment>

<file context>
@@ -1,25 +1,32 @@
-# Copy requirements first to leverage Docker caching
-COPY requirements.txt .
+# 2. Install uv
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+
+# 3. Set up the application
</file context>
Suggested change
COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
COPY --from=ghcr.io/astral-sh/uv:0.5.11 /uv /bin/uv
Fix with Cubic


# 3. Set up the application
WORKDIR /app

# 4. Install Dependencies
# Only copy pyproject.toml (since we deleted uv.lock)
COPY pyproject.toml ./

# Run sync WITHOUT --frozen (this resolves dependencies fresh)
RUN uv sync
Copy link

@cubic-dev-ai cubic-dev-ai bot Dec 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: Builds are not reproducible without a lock file. Consider keeping uv.lock committed and using uv sync --frozen to ensure consistent dependency versions across builds.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At Dockerfile, line 23:

<comment>Builds are not reproducible without a lock file. Consider keeping `uv.lock` committed and using `uv sync --frozen` to ensure consistent dependency versions across builds.</comment>

<file context>
@@ -1,25 +1,32 @@
+COPY pyproject.toml ./
+
+# Run sync WITHOUT --frozen (this resolves dependencies fresh)
+RUN uv sync
 
-# Install Python dependencies
</file context>
Fix with Cubic


# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# CRITICAL: Add the virtual environment to the PATH
ENV PATH="/app/.venv/bin:$PATH"

# Copy the rest of the code
# 5. Copy the Code
COPY . .

# This CMD is a fallback; your docker-compose 'command' overrides this.
CMD ["python3", "app/server.py"]
# 6. Default Command
CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "8000"]
Empty file added app/algo/__init__.py
Empty file.
55 changes: 55 additions & 0 deletions app/algo/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from typing import Dict, List


def reciprocal_rank_fusion(
list_a: List[Dict], list_b: List[Dict], k: int = 60
) -> List[Dict]:
"""
Merges two lists by RANK rather than raw score.
Essential because CLIP scores (0.25) are lower than BGE scores (0.75).
"""
fused_scores = {}

# Helper to process a list
def process_list(results_list):
for rank, item in enumerate(results_list):
# Unique ID for the specific moment in the specific video
doc_id = f"{item['video_id']}_{int(item['timestamp'])}"

if doc_id not in fused_scores:
fused_scores[doc_id] = {"item": item, "score": 0}

# The RRF Formula
fused_scores[doc_id]["score"] += 1 / (k + rank + 1)

process_list(list_a)
process_list(list_b)

# Sort by new fused score
results = [val["item"] for val in fused_scores.values()]
for val in fused_scores.values():
val["item"]["score"] = val["score"]

results.sort(key=lambda x: x["score"], reverse=True)
return results


def deduplicate_results(results: List[Dict], time_threshold: float = 5.0) -> List[Dict]:
    """
    Collapse near-duplicate hits from the same video.

    If the ranked results contain moments at 10s, 11s and 12s of one video,
    only the first-ranked of them survives; anything within
    ``time_threshold`` seconds of an already-kept hit (same video) is
    dropped. Input order (i.e. ranking) is preserved.
    """
    kept: List[Dict] = []
    accepted = []  # (video_id, timestamp) of every hit kept so far

    for candidate in results:
        near_existing = any(
            candidate["video_id"] == video
            and abs(candidate["timestamp"] - stamp) < time_threshold
            for video, stamp in accepted
        )
        if not near_existing:
            kept.append(candidate)
            accepted.append((candidate["video_id"], candidate["timestamp"]))

    return kept
110 changes: 110 additions & 0 deletions app/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import logging
import os
import uuid

import ffmpeg
import numpy as np
from fastembed import TextEmbedding
from faster_whisper import WhisperModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- LOAD MODELS ---
# Loaded once at import time so every transcribe_chunk() call reuses them.
try:
    # Use 'tiny' or 'base' for speed during debugging.
    WHISPER = WhisperModel("base", device="cpu", compute_type="int8")
    EMBEDDER = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
except Exception as e:
    # Fail fast: swallowing the error would leave WHISPER/EMBEDDER
    # undefined and surface later as a confusing NameError inside
    # transcribe_chunk(). Log the root cause, then re-raise.
    logger.critical(f"Failed to load Audio models: {e}")
    raise


def transcribe_chunk(
    video_path: str, video_id: str, start_time: float, end_time: float
) -> list[dict]:
    """
    Transcribe one time-window of a video and return embedded transcript points.

    1. Pipes audio from FFmpeg directly to memory (No Disk I/O).
    2. Uses VAD to skip silence (Saves CPU).
    3. Merges short segments into semantic blocks (Increases Accuracy).

    Args:
        video_path: Path to the source video file on disk.
        video_id: Identifier stored in each point's payload.
        start_time: Chunk start within the video, in seconds.
        end_time: Chunk end within the video, in seconds.

    Returns:
        A list of dicts, each with "id", "vector" and "payload" keys
        (shaped for a vector-store upsert; presumably Qdrant — verify
        against the caller). Returns [] on any failure (missing file,
        FFmpeg error, transcription error): best-effort, never raises.
    """
    duration = end_time - start_time

    # 1. GHOST CHECK
    # Bail out early instead of letting FFmpeg fail with a cryptic error.
    if not os.path.exists(video_path):
        logger.error(f"AUDIO ERROR: File not found at {video_path}")
        return []

    try:
        # 2. EXTRACT AUDIO TO MEMORY (Zero-Copy)
        # We request raw PCM data (s16le) at 16khz mono
        out, _ = (
            ffmpeg.input(video_path, ss=start_time, t=duration)
            .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=16000)
            .run(capture_stdout=True, capture_stderr=True)
        )

        # Convert raw bytes to float32 numpy array (Required by Whisper)
        # normalized between -1 and 1 (int16 full scale is 32768)
        audio_array = (
            np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
        )

        # 3. TRANSCRIBE WITH VAD
        # vad_filter=True stops the model from hallucinating on silence
        segments, _ = WHISPER.transcribe(audio_array, vad_filter=True)

        # 4. SEMANTIC MERGING
        # Group segments until they form a coherent thought (>= 256 chars)
        points = []
        buffer_text = []       # fragments of the block being assembled
        buffer_start = None    # whisper-relative start of the current block
        buffer_end = 0.0       # whisper-relative end of the current block

        # Materialize the generator so we know the index of the last segment
        segments = list(segments)

        for i, seg in enumerate(segments):
            if not buffer_text:
                buffer_start = seg.start

            buffer_text.append(seg.text.strip())
            buffer_end = seg.end

            # Current semantic block
            full_text = " ".join(buffer_text)

            # Heuristic: flush when the block is long enough OR it's the
            # last segment (so trailing speech is never dropped)
            if len(full_text) >= 256 or i == len(segments) - 1:
                # Embed the GROUPS, not the fragments
                vector_gen = EMBEDDER.embed([full_text])
                vector = list(vector_gen)[0].tolist()  # Unpack generator

                points.append(
                    {
                        "id": str(uuid.uuid4()),
                        "vector": vector,
                        "payload": {
                            "video_id": video_id,
                            "text": full_text,
                            # Map relative whisper time back to absolute video time
                            "timestamp": start_time + buffer_start,
                            "end_timestamp": start_time + buffer_end,
                            "type": "audio_transcript",
                            "strategy": "semantic_merge_256",
                        },
                    }
                )

                # Reset buffer for the next semantic block
                buffer_text = []
                buffer_start = None

        return points

    except ffmpeg.Error as e:
        # ffmpeg-python only captures stderr because run() was called with
        # capture_stderr=True above.
        error_msg = e.stderr.decode("utf8") if e.stderr else "Unknown FFmpeg error"
        logger.error(f"FFMPEG MEMORY FAIL: {error_msg}")
        return []

    except Exception as e:
        # Deliberate best-effort boundary: a bad chunk must not kill the
        # ingestion pipeline, so log and return no points.
        logger.error(f"General Audio Error: {e}")
        return []
Loading