-
Notifications
You must be signed in to change notification settings - Fork 0
Features: #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Features: #1
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,8 @@ | ||
| # .dockerignore | ||
| venv/ | ||
| assets/ | ||
| qdrant_data/ | ||
| redpanda_data/ | ||
| __pycache__/ | ||
| *.pyc | ||
| .git/ | ||
| .env | ||
| qdrant_data/ | ||
| redpanda_data/ | ||
| *.db |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,28 +1,104 @@ | ||
| # Python | ||
| __pycache__/ | ||
| *.pyc | ||
| *.pyo | ||
| *.pyd | ||
| *.py[cod] | ||
| *$py.class | ||
| *.so | ||
| .Python | ||
| build/ | ||
| develop-eggs/ | ||
| dist/ | ||
| downloads/ | ||
| eggs/ | ||
| .eggs/ | ||
| lib/ | ||
| lib64/ | ||
| parts/ | ||
| sdist/ | ||
| var/ | ||
| wheels/ | ||
| pip-wheel-metadata/ | ||
| share/python-wheels/ | ||
| *.egg-info/ | ||
| .installed.cfg | ||
| *.egg | ||
| MANIFEST | ||
|
|
||
| # Virtual Environments | ||
| venv/ | ||
| .venv/ | ||
| env/ | ||
| .env | ||
| ENV/ | ||
| env.bak/ | ||
| venv.bak/ | ||
|
|
||
| # Environment variables | ||
| .env | ||
| .env.local | ||
| .env.*.local | ||
|
|
||
| # IDE | ||
| .vscode/ | ||
| .idea/ | ||
| *.swp | ||
| *.swo | ||
| *~ | ||
| .DS_Store | ||
| .AppleDouble | ||
| .LSOverride | ||
|
|
||
| # OS | ||
| .DS_Store | ||
| .DS_Store? | ||
| ._* | ||
| .Spotlight-V100 | ||
| .Trashes | ||
| ehthumbs.db | ||
| Thumbs.db | ||
|
|
||
| .vscode/ | ||
| .idea/ | ||
|
|
||
|
|
||
| # Project-specific data directories | ||
| assets/videos/ | ||
| *.mp4 | ||
|
|
||
|
|
||
| qdrant_data/ | ||
| redpanda_data/ | ||
|
|
||
|
|
||
| # Logs | ||
| *.log | ||
| logs/ | ||
| *.log.* | ||
|
|
||
| # Testing | ||
| .pytest_cache/ | ||
| .coverage | ||
| htmlcov/ | ||
| .tox/ | ||
| .hypothesis/ | ||
| .mypy_cache/ | ||
| .dmypy.json | ||
| dmypy.json | ||
|
|
||
| # Jupyter Notebook | ||
| .ipynb_checkpoints | ||
|
|
||
| # pyenv | ||
| .python-version | ||
|
|
||
| # pip | ||
| pip-log.txt | ||
| pip-delete-this-directory.txt | ||
|
|
||
| # Model cache (HuggingFace, etc.) | ||
| .cache/ | ||
| models/ | ||
| *.pt | ||
| *.pth | ||
| *.ckpt | ||
|
|
||
| # Temporary files | ||
| *.tmp | ||
| *.temp | ||
| *.bak | ||
| *.swp | ||
| *~ | ||
|
|
||
| # Docker | ||
| .dockerignore |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,25 +1,32 @@ | ||
| # Use a slim Python image (lightweight and fast) | ||
| # If you need GPU support later, change this to an nvidia/cuda image or pytorch/pytorch | ||
| FROM python:3.10-slim | ||
| # Use Python 3.12 to match your project requirements | ||
| FROM python:3.12-slim | ||
|
|
||
| # Set the working directory to match the volume mount in your compose file | ||
| WORKDIR /app | ||
|
|
||
| # Install system dependencies (often needed for AI/Inference libraries like OpenCV or numpy) | ||
| # 1. Install System Dependencies | ||
| RUN apt-get update && apt-get install -y \ | ||
| build-essential \ | ||
| libgl1 \ | ||
| libglib2.0-0 \ | ||
| ffmpeg \ | ||
| curl \ | ||
| && rm -rf /var/lib/apt/lists/* | ||
|
|
||
| # Copy requirements first to leverage Docker caching | ||
| COPY requirements.txt . | ||
| # 2. Install uv | ||
| COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv | ||
|
|
||
| # 3. Set up the application | ||
| WORKDIR /app | ||
|
|
||
| # 4. Install Dependencies | ||
| # Only copy pyproject.toml (since we deleted uv.lock) | ||
| COPY pyproject.toml ./ | ||
|
|
||
| # Run sync WITHOUT --frozen (this resolves dependencies fresh) | ||
| RUN uv sync | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. P1: Builds are not reproducible without a lock file. Consider keeping `uv.lock` in the repository, copying it alongside `pyproject.toml`, and running `uv sync --frozen`. Prompt for AI agents |
||
|
|
||
| # Install Python dependencies | ||
| RUN pip install --no-cache-dir -r requirements.txt | ||
| # CRITICAL: Add the virtual environment to the PATH | ||
| ENV PATH="/app/.venv/bin:$PATH" | ||
|
|
||
| # Copy the rest of the code | ||
| # 5. Copy the Code | ||
| COPY . . | ||
|
|
||
| # This CMD is a fallback; your docker-compose 'command' overrides this. | ||
| CMD ["python3", "app/server.py"] | ||
| # 6. Default Command | ||
| CMD ["uvicorn", "app.server:app", "--host", "0.0.0.0", "--port", "8000"] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| import logging | ||
| import os | ||
| import uuid | ||
|
|
||
| import ffmpeg | ||
| from fastembed import TextEmbedding | ||
| from faster_whisper import WhisperModel | ||
|
|
||
| logging.basicConfig(level=logging.INFO) | ||
| logger = logging.getLogger(__name__) | ||
|
|
||
| # Load Models | ||
| try: | ||
| # Use 'tiny' or 'base' for speed during debugging | ||
| WHISPER = WhisperModel("base", device="cpu", compute_type="int8") | ||
| EMBEDDER = TextEmbedding(model_name="BAAI/bge-small-en-v1.5") | ||
| except Exception as e: | ||
|
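There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. P1: If model loading fails, the exception is only logged, so `WHISPER` and `EMBEDDER` are never defined; every later `transcribe_chunk` call then hits a `NameError` that the generic handler logs and turns into an empty result. Consider re-raising after logging so the failure is visible at startup. Prompt for AI agents |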
||
| logger.critical(f"Failed to load Audio models: {e}") | ||
|
|
||
|
|
||
| def transcribe_chunk( | ||
| video_path: str, video_id: str, start_time: float, end_time: float | ||
| ): | ||
| duration = end_time - start_time | ||
| temp_audio = f"/tmp/temp_{video_id}_{int(start_time)}.wav" | ||
| points = [] | ||
|
|
||
| # 1. DEBUG: Check if file exists before asking FFmpeg to touch it | ||
| if not os.path.exists(video_path): | ||
| logger.error(f"AUDIO ERROR: File not found at {video_path}") | ||
| return [] | ||
|
|
||
| try: | ||
| # 2. Extract Audio | ||
| ( | ||
| ffmpeg.input(video_path, ss=start_time, t=duration) | ||
| .output(temp_audio, ac=1, ar=16000, loglevel="error") | ||
| .overwrite_output() | ||
| .run(capture_stdout=True, capture_stderr=True) | ||
| ) | ||
|
|
||
| # 3. Transcribe | ||
| segments, _ = WHISPER.transcribe(temp_audio) | ||
| valid_segments = [s for s in segments if len(s.text.strip()) > 5] | ||
|
|
||
| # 4. Embed & Package | ||
| if valid_segments: | ||
| texts = [s.text.strip() for s in valid_segments] | ||
| embeddings = list(EMBEDDER.embed(texts)) | ||
|
|
||
| for i, seg in enumerate(valid_segments): | ||
| points.append( | ||
| { | ||
| "id": str(uuid.uuid4()), | ||
| "vector": embeddings[i].tolist(), | ||
| "payload": { | ||
| "video_id": video_id, | ||
| "text": seg.text.strip(), | ||
| "timestamp": start_time + seg.start, | ||
| "end_timestamp": start_time + seg.end, | ||
| "type": "audio_transcript", | ||
| }, | ||
| } | ||
| ) | ||
|
|
||
| return points | ||
|
|
||
| except ffmpeg.Error as e: | ||
| # CRITICAL: Print the actual FFmpeg error | ||
| error_message = e.stderr.decode("utf8") | ||
| logger.error(f"FFMPEG FAILED: {error_message}") | ||
| return [] | ||
|
|
||
| except Exception as e: | ||
| logger.error(f"General Audio Error: {e}") | ||
| return [] | ||
|
|
||
| finally: | ||
| if os.path.exists(temp_audio): | ||
| os.remove(temp_audio) | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
P2: Pin the `uv` image to a specific version instead of using `:latest` for reproducible and secure builds. Prompt for AI agents