Kashyab19 · Kashyab19 · Dec 24, 2025 · Dec 27, 2025 · Dec 30, 2025 · cubic-dev-ai
diff --git a/.dockerignore b/.dockerignore
@@ -1,8 +1,11 @@
 # .dockerignore
 venv/
+# uv builds /app/.venv inside the image; a host .venv must not be copied over it
+.venv/
+assets/
+qdrant_data/
+redpanda_data/
 __pycache__/
+*.pyc
 .git/
 .env
-qdrant_data/
-redpanda_data/
-*.db
diff --git a/.gitignore b/.gitignore
@@ -1,28 +1,104 @@
+# Python
 __pycache__/
-*.pyc
-*.pyo
-*.pyd
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual Environments
 venv/
 .venv/
 env/
-.env
+ENV/
+env.bak/
+venv.bak/
 
+# Environment variables
+.env
+.env.local
+.env.*.local
 
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
 .DS_Store
 .AppleDouble
 .LSOverride
 
+# OS
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
 
-.vscode/
-.idea/
-
-
+# Project-specific data directories
 assets/videos/
 *.mp4
-
-
 qdrant_data/
 redpanda_data/
 
-
+# Logs
 *.log
+logs/
+*.log.*
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+.tox/
+.hypothesis/
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# pip
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Model cache (HuggingFace, etc.)
+.cache/
+models/
+*.pt
+*.pth
+*.ckpt
+
+# Temporary files
+*.tmp
+*.temp
+*.bak
+*.swp
+*~
+
+# Docker
+# NOTE: .dockerignore is tracked in this repository, so it must not be ignored.
diff --git a/Dockerfile b/Dockerfile
@@ -1,25 +1,32 @@
-# Use a slim Python image (lightweight and fast)
-# If you need GPU support later, change this to an nvidia/cuda image or pytorch/pytorch
-FROM python:3.10-slim
+# Use Python 3.12 to match your project requirements
+FROM python:3.12-slim
 
-# Set the working directory to match the volume mount in your compose file
-WORKDIR /app
-
-# Install system dependencies (often needed for AI/Inference libraries like OpenCV or numpy)
+# 1. Install System Dependencies
 RUN apt-get update && apt-get install -y \
-    build-essential \
     libgl1 \
     libglib2.0-0 \
+    ffmpeg \
+    curl \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements first to leverage Docker caching
-COPY requirements.txt .
+# 2. Install uv (pinned to an exact release so image builds are reproducible)
+COPY --from=ghcr.io/astral-sh/uv:0.5.11 /uv /bin/uv
+
+# 3. Set up the application
+WORKDIR /app
+
+# 4. Install Dependencies
+# Only copy pyproject.toml (since we deleted uv.lock)
+COPY pyproject.toml ./
+
+# Run sync WITHOUT --frozen (this resolves dependencies fresh)
+RUN uv sync
 
-# Install Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
+# CRITICAL: Add the virtual environment to the PATH
+ENV PATH="/app/.venv/bin:$PATH"
 
-# Copy the rest of the code
+# 5. Copy the Code
 COPY . .
 
-# This CMD is a fallback; your docker-compose 'command' overrides this.
-CMD ["python3", "app/server.py"]
+# 6. Default Command
+CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/app/audio.py b/app/audio.py
@@ -0,0 +1,102 @@
+import logging
+import os
+import tempfile
+import uuid
+
+import ffmpeg
+from fastembed import TextEmbedding
+from faster_whisper import WhisperModel
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Load models once at import time. The names are pre-bound to None so that a
+# failed load is reported clearly by transcribe_chunk instead of a NameError.
+WHISPER = None
+EMBEDDER = None
+try:
+    # Use 'tiny' or 'base' for speed during debugging
+    WHISPER = WhisperModel("base", device="cpu", compute_type="int8")
+    EMBEDDER = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
+except Exception as e:
+    logger.critical(f"Failed to load Audio models: {e}")
+
+
+def transcribe_chunk(
+    video_path: str, video_id: str, start_time: float, end_time: float
+):
+    """Transcribe one time window of a video and embed each kept segment.
+
+    Args:
+        video_path: Path of the video file handed to FFmpeg.
+        video_id: Identifier copied into every returned payload.
+        start_time: Window start offset into the video, in seconds.
+        end_time: Window end offset into the video, in seconds.
+
+    Returns:
+        A list of dicts with "id", "vector" and "payload" keys, one per
+        transcript segment whose stripped text is longer than 5 characters.
+        An empty list is returned (and the problem logged) when the file is
+        missing, the models failed to load, or extraction/transcription fails.
+    """
+    if not os.path.exists(video_path):
+        logger.error(f"AUDIO ERROR: File not found at {video_path}")
+        return []
+
+    if WHISPER is None or EMBEDDER is None:
+        logger.error("AUDIO ERROR: Audio models are not loaded")
+        return []
+
+    temp_audio = None
+    try:
+        # Use a unique temp file per call: a name derived from int(start_time)
+        # collides when chunks share a start second or run concurrently.
+        fd, temp_audio = tempfile.mkstemp(prefix="vidsearch_", suffix=".wav")
+        os.close(fd)
+
+        # Extract single-channel 16 kHz audio for the requested window.
+        (
+            ffmpeg.input(video_path, ss=start_time, t=end_time - start_time)
+            .output(temp_audio, ac=1, ar=16000, loglevel="error")
+            .overwrite_output()
+            .run(capture_stdout=True, capture_stderr=True)
+        )
+
+        # Transcribe, keeping only segments with more than 5 characters.
+        segments, _ = WHISPER.transcribe(temp_audio)
+        valid_segments = [s for s in segments if len(s.text.strip()) > 5]
+        if not valid_segments:
+            return []
+
+        # Embed all texts in one call and pair each vector with its segment.
+        texts = [s.text.strip() for s in valid_segments]
+        vectors = EMBEDDER.embed(texts)
+        return [
+            {
+                "id": str(uuid.uuid4()),
+                "vector": vector.tolist(),
+                "payload": {
+                    "video_id": video_id,
+                    "text": text,
+                    "timestamp": start_time + seg.start,
+                    "end_timestamp": start_time + seg.end,
+                    "type": "audio_transcript",
+                },
+            }
+            for seg, text, vector in zip(valid_segments, texts, vectors)
+        ]
+
+    except ffmpeg.Error as e:
+        # Surface FFmpeg's own message; stderr may be empty or not valid UTF-8.
+        stderr = e.stderr or b""
+        error_message = stderr.decode("utf8", errors="replace") or str(e)
+        logger.error(f"FFMPEG FAILED: {error_message}")
+        return []
+
+    except Exception as e:
+        logger.error(f"General Audio Error: {e}")
+        return []
+
+    finally:
+        if temp_audio and os.path.exists(temp_audio):
+            os.remove(temp_audio)
diff --git a/app/database.py b/app/database.py
@@ -1,11 +1,18 @@
 import datetime
+import os
 import sqlite3
 
-DB_NAME = "vidsearch.db"
+# CRITICAL: Point to the shared volume directory
+# If we don't do this, API and Manager will have different databases!
+DB_FOLDER = "/app/assets"
+DB_NAME = os.path.join(DB_FOLDER, "vidsearch.db")
 
 
 def init_db():
     """Creates the table if it doesn't exist"""
+    # Ensure the directory exists first
+    os.makedirs(DB_FOLDER, exist_ok=True)
+
     conn = sqlite3.connect(DB_NAME)
     c = conn.cursor()
     c.execute("""
@@ -27,7 +34,7 @@ def add_video(job_id, url, title="Unknown Video"):
     c = conn.cursor()
     c.execute(
         "INSERT INTO videos (id, title, url, status, created_at) VALUES (?, ?, ?, ?, ?)",
-        (job_id, title, url, "processing", datetime.datetime.now()),
+        (job_id, title, url, "queued", datetime.datetime.now()),
     )
     conn.commit()
     conn.close()
@@ -43,12 +50,15 @@ def update_status(job_id, status):
 
 def get_all_videos():
+    """Return all rows of `videos`, newest first, as sqlite3.Row objects."""
     conn = sqlite3.connect(DB_NAME)
-    # Return results as a dictionary-like object
     conn.row_factory = sqlite3.Row
     c = conn.cursor()
     c.execute("SELECT * FROM videos ORDER BY created_at DESC")
-    return c.fetchall()
+    rows = c.fetchall()
+    # Close explicitly so each call does not leave a connection open.
+    conn.close()
+    return rows
 
 
-# Initialize immediately when this module is imported
+# Initialize immediately
 init_db()