Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# .dockerignore
venv/
assets/
qdrant_data/
redpanda_data/
__pycache__/
*.pyc
.git/
.env
qdrant_data/
redpanda_data/
*.db
98 changes: 87 additions & 11 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,28 +1,104 @@
# Python
__pycache__/
*.pyc
*.pyo
*.pyd
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environments
venv/
.venv/
env/
.env
ENV/
env.bak/
venv.bak/

# Environment variables
.env
.env.local
.env.*.local

# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
.AppleDouble
.LSOverride

# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

.vscode/
.idea/


# Project-specific data directories
assets/videos/
*.mp4


qdrant_data/
redpanda_data/


# Logs
*.log
logs/
*.log.*

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
.hypothesis/
.mypy_cache/
.dmypy.json
dmypy.json

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# pip
pip-log.txt
pip-delete-this-directory.txt

# Model cache (HuggingFace, etc.)
.cache/
models/
*.pt
*.pth
*.ckpt

# Temporary files
*.tmp
*.temp
*.bak
*.swp
*~

# Docker
.dockerignore
37 changes: 22 additions & 15 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,25 +1,32 @@
# Use a slim Python image (lightweight and fast)
# If you need GPU support later, change this to an nvidia/cuda image or pytorch/pytorch
FROM python:3.10-slim
# Use Python 3.12 to match your project requirements
FROM python:3.12-slim

# Set the working directory to match the volume mount in your compose file
WORKDIR /app

# Install system dependencies (often needed for AI/Inference libraries like OpenCV or numpy)
# 1. Install System Dependencies
RUN apt-get update && apt-get install -y \
build-essential \
libgl1 \
libglib2.0-0 \
ffmpeg \
curl \
&& rm -rf /var/lib/apt/lists/*

# Copy requirements first to leverage Docker caching
COPY requirements.txt .
# 2. Install uv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
Copy link

@cubic-dev-ai cubic-dev-ai bot Dec 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Pin the uv image to a specific version instead of using :latest for reproducible and secure builds.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At Dockerfile, line 13:

<comment>Pin the `uv` image to a specific version instead of using `:latest` for reproducible and secure builds.</comment>

<file context>
@@ -1,25 +1,32 @@
-# Copy requirements first to leverage Docker caching
-COPY requirements.txt .
+# 2. Install uv
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+
+# 3. Set up the application
</file context>
Suggested change
COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
COPY --from=ghcr.io/astral-sh/uv:0.5.11 /uv /bin/uv
Fix with Cubic


# 3. Set up the application
WORKDIR /app

# 4. Install Dependencies
# Only copy pyproject.toml (since we deleted uv.lock)
COPY pyproject.toml ./

# Run sync WITHOUT --frozen (this resolves dependencies fresh)
RUN uv sync
Copy link

@cubic-dev-ai cubic-dev-ai bot Dec 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: Builds are not reproducible without a lock file. Consider keeping uv.lock committed and using uv sync --frozen to ensure consistent dependency versions across builds.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At Dockerfile, line 23:

<comment>Builds are not reproducible without a lock file. Consider keeping `uv.lock` committed and using `uv sync --frozen` to ensure consistent dependency versions across builds.</comment>

<file context>
@@ -1,25 +1,32 @@
+COPY pyproject.toml ./
+
+# Run sync WITHOUT --frozen (this resolves dependencies fresh)
+RUN uv sync
 
-# Install Python dependencies
</file context>
Fix with Cubic


# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# CRITICAL: Add the virtual environment to the PATH
ENV PATH="/app/.venv/bin:$PATH"

# Copy the rest of the code
# 5. Copy the Code
COPY . .

# This CMD is a fallback; your docker-compose 'command' overrides this.
CMD ["python3", "app/server.py"]
# 6. Default Command
CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "8000"]
Empty file added app/algo/__init__.py
Empty file.
55 changes: 55 additions & 0 deletions app/algo/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from typing import Dict, List


def reciprocal_rank_fusion(
list_a: List[Dict], list_b: List[Dict], k: int = 60
) -> List[Dict]:
"""
Merges two lists by RANK rather than raw score.
Essential because CLIP scores (0.25) are lower than BGE scores (0.75).
"""
fused_scores = {}

# Helper to process a list
def process_list(results_list):
for rank, item in enumerate(results_list):
# Unique ID for the specific moment in the specific video
doc_id = f"{item['video_id']}_{int(item['timestamp'])}"

if doc_id not in fused_scores:
fused_scores[doc_id] = {"item": item, "score": 0}

# The RRF Formula
fused_scores[doc_id]["score"] += 1 / (k + rank + 1)

process_list(list_a)
process_list(list_b)

# Sort by new fused score
results = [val["item"] for val in fused_scores.values()]
for val in fused_scores.values():
val["item"]["score"] = val["score"]

results.sort(key=lambda x: x["score"], reverse=True)
return results


def deduplicate_results(results: List[Dict], time_threshold: float = 5.0) -> List[Dict]:
    """
    Collapse near-duplicate hits from the same video.

    If the ranked results contain moments at 10s, 11s and 12s of one video,
    only the first-ranked of them survives; anything within
    ``time_threshold`` seconds of an already-kept hit (same video) is
    dropped. Input order (i.e. ranking) is preserved.
    """
    kept: List[Dict] = []
    accepted = []  # (video_id, timestamp) of every hit kept so far

    for candidate in results:
        near_existing = any(
            candidate["video_id"] == video
            and abs(candidate["timestamp"] - stamp) < time_threshold
            for video, stamp in accepted
        )
        if not near_existing:
            kept.append(candidate)
            accepted.append((candidate["video_id"], candidate["timestamp"]))

    return kept
110 changes: 110 additions & 0 deletions app/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import logging
import os
import uuid

import ffmpeg
import numpy as np
from fastembed import TextEmbedding
from faster_whisper import WhisperModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- LOAD MODELS ---
# Loaded once at import time so every transcribe_chunk() call reuses them.
try:
    # Use 'tiny' or 'base' for speed during debugging.
    WHISPER = WhisperModel("base", device="cpu", compute_type="int8")
    EMBEDDER = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
except Exception as e:
    # Fail fast: swallowing the error would leave WHISPER/EMBEDDER
    # undefined and surface later as a confusing NameError inside
    # transcribe_chunk(). Log the root cause, then re-raise.
    logger.critical(f"Failed to load Audio models: {e}")
    raise


def transcribe_chunk(
    video_path: str, video_id: str, start_time: float, end_time: float
) -> list[dict]:
    """
    Transcribe one time-window of a video and return embedded transcript points.

    1. Pipes audio from FFmpeg directly to memory (No Disk I/O).
    2. Uses VAD to skip silence (Saves CPU).
    3. Merges short segments into semantic blocks (Increases Accuracy).

    Args:
        video_path: Path to the source video file on disk.
        video_id: Identifier stored in each point's payload.
        start_time: Chunk start within the video, in seconds.
        end_time: Chunk end within the video, in seconds.

    Returns:
        A list of dicts, each with "id", "vector" and "payload" keys
        (shaped for a vector-store upsert; presumably Qdrant — verify
        against the caller). Returns [] on any failure (missing file,
        FFmpeg error, transcription error): best-effort, never raises.
    """
    duration = end_time - start_time

    # 1. GHOST CHECK
    # Bail out early instead of letting FFmpeg fail with a cryptic error.
    if not os.path.exists(video_path):
        logger.error(f"AUDIO ERROR: File not found at {video_path}")
        return []

    try:
        # 2. EXTRACT AUDIO TO MEMORY (Zero-Copy)
        # We request raw PCM data (s16le) at 16khz mono
        out, _ = (
            ffmpeg.input(video_path, ss=start_time, t=duration)
            .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=16000)
            .run(capture_stdout=True, capture_stderr=True)
        )

        # Convert raw bytes to float32 numpy array (Required by Whisper)
        # normalized between -1 and 1 (int16 full scale is 32768)
        audio_array = (
            np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
        )

        # 3. TRANSCRIBE WITH VAD
        # vad_filter=True stops the model from hallucinating on silence
        segments, _ = WHISPER.transcribe(audio_array, vad_filter=True)

        # 4. SEMANTIC MERGING
        # Group segments until they form a coherent thought (>= 256 chars)
        points = []
        buffer_text = []       # fragments of the block being assembled
        buffer_start = None    # whisper-relative start of the current block
        buffer_end = 0.0       # whisper-relative end of the current block

        # Materialize the generator so we know the index of the last segment
        segments = list(segments)

        for i, seg in enumerate(segments):
            if not buffer_text:
                buffer_start = seg.start

            buffer_text.append(seg.text.strip())
            buffer_end = seg.end

            # Current semantic block
            full_text = " ".join(buffer_text)

            # Heuristic: flush when the block is long enough OR it's the
            # last segment (so trailing speech is never dropped)
            if len(full_text) >= 256 or i == len(segments) - 1:
                # Embed the GROUPS, not the fragments
                vector_gen = EMBEDDER.embed([full_text])
                vector = list(vector_gen)[0].tolist()  # Unpack generator

                points.append(
                    {
                        "id": str(uuid.uuid4()),
                        "vector": vector,
                        "payload": {
                            "video_id": video_id,
                            "text": full_text,
                            # Map relative whisper time back to absolute video time
                            "timestamp": start_time + buffer_start,
                            "end_timestamp": start_time + buffer_end,
                            "type": "audio_transcript",
                            "strategy": "semantic_merge_256",
                        },
                    }
                )

                # Reset buffer for the next semantic block
                buffer_text = []
                buffer_start = None

        return points

    except ffmpeg.Error as e:
        # ffmpeg-python only captures stderr because run() was called with
        # capture_stderr=True above.
        error_msg = e.stderr.decode("utf8") if e.stderr else "Unknown FFmpeg error"
        logger.error(f"FFMPEG MEMORY FAIL: {error_msg}")
        return []

    except Exception as e:
        # Deliberate best-effort boundary: a bad chunk must not kill the
        # ingestion pipeline, so log and return no points.
        logger.error(f"General Audio Error: {e}")
        return []
Loading