From cfe4bf8fff4901bb91f4c9f01fba145f1a418ad3 Mon Sep 17 00:00:00 2001 From: ericksonlopes Date: Tue, 7 Apr 2026 17:19:01 -0300 Subject: [PATCH 1/7] feat: harden diarization ingestion and refactor readme --- README.md | 141 ++++++----- .../diarization_ingestion_use_case.py | 233 +++++++++--------- .../services/test_voice_profile_service.py | 18 +- 3 files changed, 209 insertions(+), 183 deletions(-) diff --git a/README.md b/README.md index 8f53eacf..10196680 100644 --- a/README.md +++ b/README.md @@ -2,115 +2,129 @@ # WhatYouSaid -[![codecov](https://codecov.io/github/ericksonlopes/WhatYouSaid/branch/main/graph/badge.svg?token=8CZJARVJUE)](https://codecov.io/github/ericksonlopes/WhatYouSaid) +## The Vectorized Intelligence & Diarization Hub + +[![codecov](https://codecov.io/github/ericksonlopes/WhatYouSaid/branch/main/graph/badge.svg?token=8CZJARVJUE)](https://codecov.io/github/ericksonlopes/WhatYouSaid) [![Tests](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/tests.yml) [![Code Quality](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/code-quality.yml/badge.svg?branch=main)](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/code-quality.yml) [![Security](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/security.yml/badge.svg?branch=main)](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/security.yml) ![Python](https://img.shields.io/badge/-Python-3776AB?&logo=Python&logoColor=FFFFFF) ![React](https://img.shields.io/badge/-React-61DAFB?&logo=React&logoColor=000000) -![Pytest](https://img.shields.io/badge/-Pytest-0A9EDC?&logo=Pytest&logoColor=FFFFFF) -![GitHub Actions](https://img.shields.io/badge/-GitHub%20Actions-2088FF?&logo=GitHub%20Actions&logoColor=FFFFFF) +![FastAPI](https://img.shields.io/badge/-FastAPI-05998B?&logo=FastAPI&logoColor=FFFFFF) 
+![Redis](https://img.shields.io/badge/-Redis-DC382D?&logo=Redis&logoColor=FFFFFF) +![Postgres](https://img.shields.io/badge/-PostgreSQL-4169E1?&logo=PostgreSQL&logoColor=FFFFFF) -WhatYouSaid is a vectorized data hub designed to explore any topic or knowledge domain. It extracts, processes, and indexes content from YouTube videos, local files, and remote URLs to enable advanced semantic search and Retrieval-Augmented Generation (RAG) workflows. - -This repository provides modular extractors, robust splitting utilities, and a scalable background processing pipeline to build searchable knowledge bases efficiently. +**WhatYouSaid** is a state-of-the-art vectorized data hub designed to explore any knowledge domain. It transforms unstructured audio, video, files, and web content into structured, searchable intelligence using advanced AI techniques, including **Speaker Diarization**, **Voice Recognition**, and **RAG** (Retrieval-Augmented Generation). --- -## 📚 Documentation +## ✨ Features -Detailed guides for specific topics: +### 🎧 Diarization & Voice Intelligence -- 🐳 **[Docker Deployment Guide](docs/docker-deployment.md)**: Learn how to use Docker Profiles to run different combinations of databases (MySQL, Postgres, SQLite) and vector stores (FAISS, Weaviate). +- **Speaker Segmentation**: Automatically split audio/video files by speaker using WhisperX/Whisper for unmatched accuracy. +- **Voice Recognition**: Identify and label speakers across your entire knowledge base using trained voice profiles. +- **Diarization Pipeline**: Interactive dashboard to review, edit, and finalize transcripts and speaker assignments before indexing. ---- -## 🚀 Features +### 📥 Multi-Source Ingestion + +- **YouTube Ecosystem**: Full support for individual videos, entire playlists, or entire channels. +- **Document Extractors**: High-fidelity extraction from PDF, DOCX, and TXT files. +- **Web Intelligence**: Powerful scraping via **Crawl4AI** and **Docling** for websites and remote URLs. 
+- **Robust Pipeline**: Step-by-step progress tracking with real-time SSE notifications and full rollback support on failure. + +### 🔍 Advanced Semantic Search -- **Multi-source Extraction**: Ingest data from YouTube (transcripts), local files (PDF, DOCX, TXT), **remote URLs** via Docling, and **Websites** via Crawl4AI. -- **Robust Fallbacks**: Integrated `PlainTextExtractor` ensuring successful ingestion even for formats not supported by specialized extractors. -- **Async Task Queue**: High-performance background processing powered by **Redis**, ensuring responsive workflows. -- **Structured Logging & Tracing**: Centralized logging equipped with contextvars and request tracing (Correlation IDs) for end-to-end observability. -- **Real-time Updates**: Live ingestion status and progress monitoring via a **Redis Event Bus** (SSE-ready). -- **Advanced Search**: Semantic, keyword (BM25), and **Hybrid Search** with cross-encoder re-ranking for maximum precision. -- **Pluggable Vector Stores**: Support for **FAISS** (local), **ChromaDB**, **Weaviate** (scalable), and **Qdrant**. -- **Pluggable SQL Databases**: Support for **SQLite**, **PostgreSQL**, **MySQL**, **MariaDB**, and **MSSQL**. -- **Modern Dashboard**: A clean React + Tailwind CSS frontend for managing knowledge subjects, content sources, and monitoring background tasks. +- **Hybrid Search**: Combining Vector (FAISS/Weaviate/Chroma) and Keyword (BM25) search for maximum precision. +- **Re-Ranking**: Specialized Cross-Encoders ensure the most relevant context is always at the top. +- **Pluggable Architecture**: Seamlessly switch between SQL databases (SQLite/Postgres/MySQL) and Vector stores. --- -## 🛠️ Infrastructure & Deployment +## 🚀 Quick Start -WhatYouSaid is designed to be flexible, from a lightweight local setup to a scalable production-ready environment. +WhatYouSaid is powered by **Python 3.12** and uses **uv** for high-speed dependency management. -### 1. Storage & Messaging Options +### 1. 
Prerequisites -| Component | Lightweight (Local) | Scalable / Production | -| :--- | :--- | :--- | -| **Relational Database** | **SQLite** (Default, file-based) | **PostgreSQL**, **MySQL**, **MariaDB**, **MSSQL** | -| **Vector Store** | **FAISS** (Local, file-based) | **Weaviate** (Container or Cloud), **ChromaDB** | -| **Task Queue & Bus** | **In-memory** (Limited) | **Redis** (Default in Docker) | +- [uv](https://github.com/astral-sh/uv) (Recommended) or `pip` +- [Docker](https://www.docker.com/) + +### 2. Environment Setup -### 2. Docker Compose Profiles & Dependencies +```bash +# Clone the repository +git clone https://github.com/ericksonlopes/WhatYouSaid.git +cd WhatYouSaid -We use **Docker Profiles** to keep the environment lean. Only the services you need are started. The project also natively supports both **CPU** and **GPU** environments via optional Python dependencies. +# Install dependencies (including dev groups) +uv sync --group dev +``` -> 📘 **Detailed Guide**: For a step-by-step tutorial on different deployment scenarios, see our [Docker Deployment Guide](docs/docker-deployment.md). +### 3. Spin Up Infrastructure -#### **Scenario A: Lite (Default)** -Uses **SQLite**, **FAISS**, and **Redis**. ```bash +# Lite mode: SQLite + FAISS + Redis docker-compose up -d + +# Scalable mode: PostgreSQL + Weaviate + Redis +docker-compose --profile base up -d ``` -#### **Scenario B: Scalable (Base)** -Starts **PostgreSQL**, **Weaviate**, and **Redis**. +### 4. Run Application + ```bash -docker-compose --profile base up -d -# Note: Set SQL__TYPE=postgres and VECTOR__STORE_TYPE=weaviate in .env +# Start Backend (FastAPI) +python main.py + +# Start Frontend (React) +cd frontend +npm install +npm run dev ``` --- -## 🏗️ Architecture +## 🐳 Deployment Profiles -The system follows a clean architecture approach, ensuring separation of concerns: +We use **Docker Profiles** to keep your environment lean. Only the services you need are started. 
-- **Application Layer**: Contains use cases (e.g., `FileIngestionUseCase`, `SearchUseCase`) and a `ServiceRegistry` for background worker dependency resolution. -- **Infrastructure Layer**: - - `extractors/`: Fetch raw content (Docling, YouTube, PlainText). - - `repositories/`: Data persistence (SQLAlchemy for relational, specialized clients for Vector Stores). - - `services/`: Core logic (text splitting, embedding, re-ranking, Redis task queue). -- **Presentation Layer**: FastAPI-based REST API with real-time SSE notifications. +| Component | Lite Profile (Default) | Scalable Profile (`base`) | +| :--- | :--- | :--- | +| **Relational DB** | SQLite (File-based) | PostgreSQL / MySQL / MariaDB | +| **Vector Store** | FAISS (Local) | Weaviate / ChromaDB / Qdrant | +| **Task Queue** | Redis | Redis (Production-ready) | ---- +> [!TIP] +> Use the **Scalable** profile if you require high-concurrency access or plan to manage multi-gigabyte vector indexes. -## 🧪 Quality & Testing +--- -We maintain a high standard of code quality and test coverage: +## 🏗️ Clean Architecture -- **417+ Automated Tests**: Covering unit, integration, and complex edge cases. -- **93% Code Coverage**: Verified via `pytest-cov`. -- **Strict Linting**: Powered by `ruff` for code style and `mypy` for static type checking. -- **Security Scanning**: Integrated `bandit` scans for vulnerability detection. +The system follows a modular approach ensuring maximum testability and maintainability: -**Run tests locally:** -```bash -uv run pytest -``` +- **Application Layer**: Orchestrates logic via use cases and resolves background worker dependencies through a `ServiceRegistry`. +- **Infrastructure Layer**: + - `extractors/`: Fetch raw content from specialized sources (Docling, YouTube, Crawl4AI). + - `repositories/`: Persistence via SQL (SQLAlchemy) and specialized Vector clients. + - `services/`: Core providers for embeddings, text splitting, and re-ranking. 
+- **Presentation Layer**: FastAPI-based REST API with real-time event broadcasting and a modern React dashboard. --- -## 🤝 Contributing +## 🤝 Contributing & Quality + +Contributions are what make the open-source community such an amazing place! Please: -Contributions are welcome. Please: -- Open an issue to discuss major changes. -- Add tests for any new feature or bug fix. -- Ensure `ruff check .` and `mypy .` pass before submitting. +1. Open an **Issue** to discuss proposed changes. +2. Ensure `uv run ruff check . --fix` and `uv run mypy .` pass. +3. Run all tests: `uv run pytest`. --- @@ -119,8 +133,9 @@ Contributions are welcome. Please: This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-

Made with ❤️ by Erickson Lopes

-[![LinkedIn](https://img.shields.io/badge/LinkedIn-Erickson_Lopes-blue)](https://www.linkedin.com/in/ericksonlopes/) +Hand-crafted with ❤️ by **Erickson Lopes** + +[![LinkedIn](https://img.shields.io/badge/LinkedIn-Erickson_Lopes-blue?style=for-the-badge&logo=linkedin)](https://www.linkedin.com/in/ericksonlopes/)
diff --git a/src/application/use_cases/diarization_ingestion_use_case.py b/src/application/use_cases/diarization_ingestion_use_case.py index 7fe26127..ad48effd 100644 --- a/src/application/use_cases/diarization_ingestion_use_case.py +++ b/src/application/use_cases/diarization_ingestion_use_case.py @@ -59,26 +59,18 @@ def __init__( self.event_bus = event_bus def execute(self, cmd: IngestDiarizationCommand) -> Dict[str, Any]: - self.event_bus.publish( - "ingestion_status", - { - "job_id": str(cmd.ingestion_job_id) if cmd.ingestion_job_id else "new", - "status": "started", - "diarization_id": str(cmd.diarization_id), - }, - ) + """Orchestrates the ingestion of diarization results with status tracking and rollback.""" + self._publish_initial_status(cmd) logger.info( - "Starting Diarization ingestion", - context={ - "diarization_id": str(cmd.diarization_id), - "subject_id": str(cmd.subject_id), - }, + "Starting Diarization ingestion pipeline", + context={"diarization_id": str(cmd.diarization_id), "subject_id": str(cmd.subject_id)}, ) - ingestion = None + job = None source = None try: + # 1. 
Resolve domain objects and source info record = self.diarization_repo.get_by_id(str(cmd.diarization_id)) if not record: raise ValueError(f"Diarization record not found: {cmd.diarization_id}") @@ -89,98 +81,124 @@ def execute(self, cmd: IngestDiarizationCommand) -> Dict[str, Any]: source_type, external_source = self._resolve_source_info(record) - if cmd.ingestion_job_id: - ingestion = self.ingestion_service.get_by_id(cmd.ingestion_job_id) - - if ingestion is None: - ingestion = self._create_ingestion_job(external_source, source_type, subject.id) - - self.ingestion_service.update_job( - job_id=ingestion.id, - status=IngestionJobStatus.PROCESSING, - status_message="Formatting transcript from diarization...", - current_step=1, - total_steps=4, - ) - - full_text = self._format_transcript(cast(list, record.segments), cast(dict, record.recognition_results)) - if not full_text: - raise ValueError("No segments found in diarization record") + # 2. Setup ingestion job tracking + job = self._ensure_ingestion_job(cmd, external_source, source_type, subject.id) + # 3. Format transcript and prepare source + full_text = self._prepare_transcript(job, record) display_name = cmd.name or cast(str, record.name) or "Transcrição" source = self._get_or_create_source(source_type, external_source, subject.id, display_name, cmd, record) - # Generate chunks and Embeddings - self.ingestion_service.update_job( - job_id=ingestion.id, - status=IngestionJobStatus.PROCESSING, - status_message="Generating embeddings...", - current_step=2, - total_steps=4, - content_source_id=source.id, - ) - + # 4. 
Generate and index chunks + self._report_step(job, 2, "Generating embeddings...", content_id=source.id) split_docs = self._generate_split_docs(full_text, display_name, external_source, source_type, cmd, record) - # Persist Chunks - chunks = self._build_chunk_entities(split_docs, source, subject, cmd, ingestion.id) + chunks = self._build_chunk_entities(split_docs, source, subject, cmd, job.id) self.chunk_service.create_chunks(chunks) - # Index - self.ingestion_service.update_job( - job_id=ingestion.id, - status=IngestionJobStatus.PROCESSING, - status_message="Indexing in vector store...", - current_step=3, - total_steps=4, - ) + self._report_step(job, 3, "Indexing in vector store...") self.vector_service.index_documents(chunks) - # Finalize - self._finalize_ingestion(ingestion, source, chunks, cmd) - - # Update Diarization record status to COMPLETED - self.diarization_repo.update_status( - diarization_id=str(cmd.diarization_id), - status=DiarizationStatus.COMPLETED.value, - status_message="Ingestão concluída com sucesso", - error_message="", # Clear any previous error - ) - - # Notify frontend that diarization is fully done - self.event_bus.publish( - "ingestion_status", - { - "type": "diarization", - "id": str(cmd.diarization_id), - "status": "done", - "message": "Diarização indexada com sucesso", - }, - ) + # 5. 
Finalize + self._finalize_ingestion(job, source, chunks, cmd) + self._complete_diarization_record(cmd) + self._publish_final_notification(cmd) return { "diarization_id": str(cmd.diarization_id), "created_chunks": len(chunks), "source_id": source.id, - "job_id": ingestion.id, + "job_id": job.id, } except Exception as e: logger.error(e, context={"action": "diarization_ingestion_execute"}) - if ingestion: - self.ingestion_service.update_job( - job_id=ingestion.id, - status=IngestionJobStatus.FAILED, - error_message=str(e), - ) - if source: - self.cs_service.update_processing_status( - content_source_id=source.id, - status=ContentSourceStatus.FAILED, - error_message=str(e), - ) + self._rollback_on_failure(job, source, e) raise + def _publish_initial_status(self, cmd: IngestDiarizationCommand) -> None: + self.event_bus.publish( + "ingestion_status", + { + "job_id": str(cmd.ingestion_job_id) if cmd.ingestion_job_id else "new", + "status": "started", + "diarization_id": str(cmd.diarization_id), + }, + ) + + def _ensure_ingestion_job( + self, cmd: IngestDiarizationCommand, external_source: str, source_type: SourceType, subject_id: UUID + ) -> Any: + job = None + if cmd.ingestion_job_id: + job = self.ingestion_service.get_by_id(cmd.ingestion_job_id) + + if not job: + job = self._create_ingestion_job(external_source, source_type, subject_id) + return job + + def _prepare_transcript(self, job: Any, record: Any) -> str: + self.ingestion_service.update_job( + job_id=job.id, + status=IngestionJobStatus.PROCESSING, + status_message="Formatting transcript from diarization...", + current_step=1, + total_steps=4, + ) + full_text = self._format_transcript(cast(list, record.segments), cast(dict, record.recognition_results)) + if not full_text: + raise ValueError("No segments found in diarization record") + return full_text + + def _report_step(self, job: Any, step: int, message: str, content_id: Optional[UUID] = None) -> None: + self.ingestion_service.update_job( + job_id=job.id, + 
status=IngestionJobStatus.PROCESSING, + status_message=message, + current_step=step, + total_steps=4, + content_source_id=content_id, + ) + + def _complete_diarization_record(self, cmd: IngestDiarizationCommand) -> None: + self.diarization_repo.update_status( + diarization_id=str(cmd.diarization_id), + status=DiarizationStatus.COMPLETED.value, + status_message="Ingestão concluída com sucesso", + error_message="", + ) + + def _publish_final_notification(self, cmd: IngestDiarizationCommand) -> None: + self.event_bus.publish( + "ingestion_status", + { + "type": "diarization", + "id": str(cmd.diarization_id), + "status": "done", + "message": "Diarização indexada com sucesso", + }, + ) + + def _rollback_on_failure(self, job: Optional[Any], source: Optional[Any], error: Exception) -> None: + """Rolls back changes if ingestion fails.""" + if job: + self.ingestion_service.update_job( + job_id=job.id, + status=IngestionJobStatus.FAILED, + error_message=str(error), + ) + # Cleanup SQL Chunks + self.chunk_service.delete_by_job_id(job.id) + + if source: + self.cs_service.update_processing_status( + content_source_id=source.id, + status=ContentSourceStatus.FAILED, + error_message=str(error), + ) + # Vector cleanup is handled by job_id filters if possible, or by source_id + self.vector_service.delete(filters={"content_source_id": str(source.id)}) + def _resolve_source_info(self, record: Any) -> tuple[SourceType, str]: source_type_val = cast(str, record.source_type) if source_type_val == "upload": @@ -197,7 +215,6 @@ def _resolve_source_info(self, record: Any) -> tuple[SourceType, str]: if original: external_source = original - # Normalize YouTube IDs to prevent duplicates (Short URLs, Full URLs vs 11-char IDs) if source_type == SourceType.YOUTUBE: normalized_vid = YoutubeExtractor.get_video_id(external_source) if normalized_vid: @@ -244,7 +261,6 @@ def _get_or_create_source( ) else: self.cs_service.update_processing_status(source.id, ContentSourceStatus.PROCESSING) - # Update 
title if it has changed if cmd.name and source.title != cmd.name: self.cs_service.update_title(source.id, cmd.name) @@ -332,10 +348,18 @@ def _format_transcript(self, segments: List[Dict[str, Any]], recognition: Option if not segments: return "" - mapping = recognition.get("mapping", {}) if recognition else {} - + mapping = (recognition or {}).get("mapping", {}) merged_lines = [] - curr_speaker, curr_start, curr_end, curr_texts = None, None, None, [] + + curr_speaker: Optional[str] = None + curr_start = 0.0 + curr_end = 0.0 + curr_texts: List[str] = [] + + def flush_block() -> None: + if curr_speaker is not None: + ts = f"[{self._format_seconds(curr_start)} - {self._format_seconds(curr_end)}]" + merged_lines.append(f"{ts} {curr_speaker}: {' '.join(curr_texts)}") for seg in segments: spk_label = seg.get("speaker", "UNKNOWN") @@ -344,28 +368,15 @@ def _format_transcript(self, segments: List[Dict[str, Any]], recognition: Option end = float(seg.get("end", 0)) text = seg.get("text", "").strip() - if spk_name == curr_speaker: - curr_end = end - if text: - curr_texts.append(text) - else: - if curr_speaker is not None: - start_str = self._format_seconds(cast(float, curr_start)) - end_str = self._format_seconds(cast(float, curr_end)) - ts = f"[{start_str} - {end_str}]" - merged_lines.append(f"{ts} {curr_speaker}: {' '.join(curr_texts)}") - - curr_speaker, curr_start, curr_end, curr_texts = ( - spk_name, - start, - end, - [text] if text else [], - ) - - if curr_speaker is not None: - ts = f"[{self._format_seconds(cast(float, curr_start))} - {self._format_seconds(cast(float, curr_end))}]" - merged_lines.append(f"{ts} {curr_speaker}: {' '.join(curr_texts)}") + if spk_name != curr_speaker: + flush_block() + curr_speaker, curr_start, curr_texts = spk_name, start, [] + + curr_end = end + if text: + curr_texts.append(text) + flush_block() return "\n".join(merged_lines) def _build_chunk_entities( diff --git a/tests/infrastructure/services/test_voice_profile_service.py 
b/tests/infrastructure/services/test_voice_profile_service.py index 4fb1c05c..87ff2b13 100644 --- a/tests/infrastructure/services/test_voice_profile_service.py +++ b/tests/infrastructure/services/test_voice_profile_service.py @@ -44,12 +44,12 @@ def test_add_voice_local_file(self, sqlite_memory): voice_id, _ = db_service.add("Test User", "local.wav") assert voice_id is not None - + # Verify status was updated to ready record = sqlite_memory.get(VoiceRecord, voice_id) assert record.status == "ready" assert record.status_message is None - + voices = db_service.voices assert "Test User" in voices @@ -159,13 +159,13 @@ def test_add_voice_s3_download_failure(self, sqlite_memory): # After failure, it should NOT have created a successful record # but let's check if it created a fixed "failed" record if we implement it that way. # Currently, if it fails at download, it doesn't even create the record yet in the DB. - # Wait, the record is created AFTER the S3 check block. + # Wait, the record is created AFTER the S3 check block. # So no record should exist in DB yet. 
assert sqlite_memory.query(VoiceRecord).count() == 0 def test_add_voice_embedding_extraction_failure(self, sqlite_memory): db_service = VoiceDB(sqlite_memory, hf_token="fake") - + # Fail during embedding extraction with patch.object(db_service, "_extract_embedding", side_effect=Exception("Model Error")): with pytest.raises(Exception, match="Model Error"): @@ -183,10 +183,10 @@ def test_list_audio_files(self, sqlite_memory): sqlite_memory.commit() self.mock_storage.list_files.return_value = [{"key": "f1.wav"}, {"key": "f2.wav"}] - + db_service = VoiceDB(sqlite_memory, hf_token="fake") files = db_service.list_audio_files("v1") - + assert len(files) == 2 self.mock_storage.list_files.assert_called_once_with(prefix="voices/v1/", extension=".wav") @@ -202,15 +202,15 @@ def test_list_voices_and_len(self, sqlite_memory): sqlite_memory.commit() db_service = VoiceDB(sqlite_memory, hf_token="fake") - + # list_voices should only show ready ones voice_list = db_service.list_voices() assert "Ready" in voice_list assert "Processing" not in voice_list - + # len should only show ready ones assert len(db_service) == 1 - + # .voices property should only show ready ones assert "Ready" in db_service.voices assert "Processing" not in db_service.voices From fd708b1ee6122373cb32a6a4baa6447efb489904 Mon Sep 17 00:00:00 2001 From: ericksonlopes Date: Tue, 7 Apr 2026 20:33:01 -0300 Subject: [PATCH 2/7] fix(frontend): implement shared SubjectIcon and fix missing translations in search view --- frontend/src/components/AddSubjectModal.tsx | 46 +-- frontend/src/components/ContextSelector.tsx | 10 +- .../src/components/KnowledgeAdminView.tsx | 58 +-- .../src/components/LocalContextSelector.tsx | 3 +- frontend/src/components/SearchView.tsx | 350 +++++++++++------- frontend/src/components/SubjectIcon.tsx | 56 +++ frontend/src/locales/en.json | 4 + frontend/src/locales/pt-BR.json | 4 + 8 files changed, 293 insertions(+), 238 deletions(-) create mode 100644 frontend/src/components/SubjectIcon.tsx 
diff --git a/frontend/src/components/AddSubjectModal.tsx b/frontend/src/components/AddSubjectModal.tsx index 303d0225..d3f8625c 100644 --- a/frontend/src/components/AddSubjectModal.tsx +++ b/frontend/src/components/AddSubjectModal.tsx @@ -1,55 +1,15 @@ import React, { useState, type SyntheticEvent } from 'react'; import { useTranslation } from 'react-i18next'; -import { - X, Brain, Briefcase, ChefHat, Cpu, Landmark, Lightbulb, Activity, Hash, - Database, Book, Globe, Zap, Shield, Search, Code, MessageSquare, Layout, - Layers, HardDrive, Cloud, Lock, User, Users, Target, Award, GraduationCap, - Music, Video, Image, FileText, Mail, Terminal, Bug -} from 'lucide-react'; +import { X } from 'lucide-react'; import { useAppContext } from '../store/AppContext'; import { motion, AnimatePresence } from 'motion/react'; +import { SubjectIcon, ICONS_LIST as ICONS } from './SubjectIcon'; interface AddSubjectModalProps { readonly isOpen: boolean; readonly onClose: () => void; } -const ICONS = [ - { name: 'Brain', icon: Brain }, - { name: 'Briefcase', icon: Briefcase }, - { name: 'ChefHat', icon: ChefHat }, - { name: 'Cpu', icon: Cpu }, - { name: 'Landmark', icon: Landmark }, - { name: 'Lightbulb', icon: Lightbulb }, - { name: 'Activity', icon: Activity }, - { name: 'Hash', icon: Hash }, - { name: 'Database', icon: Database }, - { name: 'Book', icon: Book }, - { name: 'Globe', icon: Globe }, - { name: 'Zap', icon: Zap }, - { name: 'Shield', icon: Shield }, - { name: 'Search', icon: Search }, - { name: 'Code', icon: Code }, - { name: 'MessageSquare', icon: MessageSquare }, - { name: 'Layout', icon: Layout }, - { name: 'Layers', icon: Layers }, - { name: 'HardDrive', icon: HardDrive }, - { name: 'Cloud', icon: Cloud }, - { name: 'Lock', icon: Lock }, - { name: 'User', icon: User }, - { name: 'Users', icon: Users }, - { name: 'Target', icon: Target }, - { name: 'Award', icon: Award }, - { name: 'GraduationCap', icon: GraduationCap }, - { name: 'Music', icon: Music }, - { name: 
'Video', icon: Video }, - { name: 'Image', icon: Image }, - { name: 'FileText', icon: FileText }, - { name: 'Mail', icon: Mail }, - { name: 'Terminal', icon: Terminal }, - { name: 'Bug', icon: Bug }, -]; - export function AddSubjectModal({ isOpen, onClose }: AddSubjectModalProps) { const { t } = useTranslation(); const { addSubject } = useAppContext(); @@ -153,7 +113,7 @@ export function AddSubjectModal({ isOpen, onClose }: AddSubjectModalProps) { : 'bg-black/40 border-white/5 text-zinc-500 hover:bg-zinc-800 hover:text-zinc-200' }`} > - + ))} diff --git a/frontend/src/components/ContextSelector.tsx b/frontend/src/components/ContextSelector.tsx index dcb65676..1575d9c4 100644 --- a/frontend/src/components/ContextSelector.tsx +++ b/frontend/src/components/ContextSelector.tsx @@ -8,6 +8,7 @@ import { } from 'lucide-react'; import { motion, AnimatePresence } from 'motion/react'; import { useAppContext } from '../store/AppContext'; +import { SubjectIcon } from './SubjectIcon'; export function ContextSelector() { const { t } = useTranslation(); @@ -174,9 +175,12 @@ export function ContextSelector() {
- - {subject.name} - +
+ + + {subject.name} + +
{subject.sourceCount !== undefined && ( {subject.sourceCount} {t('sidebar.operations.sources')} diff --git a/frontend/src/components/KnowledgeAdminView.tsx b/frontend/src/components/KnowledgeAdminView.tsx index 4cb19818..84a7c8fd 100644 --- a/frontend/src/components/KnowledgeAdminView.tsx +++ b/frontend/src/components/KnowledgeAdminView.tsx @@ -1,57 +1,12 @@ import React, { useState } from 'react'; import { useTranslation } from 'react-i18next'; -import { - Settings, Trash2, Edit2, Check, X, Brain, Briefcase, - ChefHat, Cpu, Landmark, Lightbulb, Activity, Hash, - Plus, AlertTriangle, Database, Book, Globe, Zap, Shield, - Search, Code, MessageSquare, Layout, Layers, HardDrive, - Cloud, Lock, User, Users, Target, Award, GraduationCap, - Music, Video, Image, FileText, Mail, Terminal, Bug +import { + Settings, Trash2, Edit2, Check, X, Plus, AlertTriangle, Search, Zap } from 'lucide-react'; import { useAppContext } from '../store/AppContext'; import { motion, AnimatePresence } from 'motion/react'; import { Subject } from '../types'; - -const ICONS = [ - { name: 'Brain', icon: Brain }, - { name: 'Briefcase', icon: Briefcase }, - { name: 'ChefHat', icon: ChefHat }, - { name: 'Cpu', icon: Cpu }, - { name: 'Landmark', icon: Landmark }, - { name: 'Lightbulb', icon: Lightbulb }, - { name: 'Activity', icon: Activity }, - { name: 'Hash', icon: Hash }, - { name: 'Database', icon: Database }, - { name: 'Book', icon: Book }, - { name: 'Globe', icon: Globe }, - { name: 'Zap', icon: Zap }, - { name: 'Shield', icon: Shield }, - { name: 'Search', icon: Search }, - { name: 'Code', icon: Code }, - { name: 'MessageSquare', icon: MessageSquare }, - { name: 'Layout', icon: Layout }, - { name: 'Layers', icon: Layers }, - { name: 'HardDrive', icon: HardDrive }, - { name: 'Cloud', icon: Cloud }, - { name: 'Lock', icon: Lock }, - { name: 'User', icon: User }, - { name: 'Users', icon: Users }, - { name: 'Target', icon: Target }, - { name: 'Award', icon: Award }, - { name: 'GraduationCap', icon: 
GraduationCap }, - { name: 'Music', icon: Music }, - { name: 'Video', icon: Video }, - { name: 'Image', icon: Image }, - { name: 'FileText', icon: FileText }, - { name: 'Mail', icon: Mail }, - { name: 'Terminal', icon: Terminal }, - { name: 'Bug', icon: Bug }, -]; - -const getSubjectIcon = (iconName?: string) => { - const item = ICONS.find(i => i.name === iconName); - return item ? item.icon : Hash; -}; +import { SubjectIcon, ICONS_LIST as ICONS } from './SubjectIcon'; export function KnowledgeAdminView() { const { t } = useTranslation(); @@ -158,7 +113,6 @@ export function KnowledgeAdminView() { ) : ( filteredSubjects.map((subject) => { const isEditing = editingId === subject.id; - const Icon = getSubjectIcon(isEditing ? editForm.icon : subject.icon); return (
- {ICONS.map(({ name: iconName, icon: ItemIcon }) => ( + {ICONS.map(({ name: iconName }) => ( ))}
@@ -237,7 +191,7 @@ export function KnowledgeAdminView() {
- +

{subject.name}

diff --git a/frontend/src/components/LocalContextSelector.tsx b/frontend/src/components/LocalContextSelector.tsx index ff70c93e..eeaacdd2 100644 --- a/frontend/src/components/LocalContextSelector.tsx +++ b/frontend/src/components/LocalContextSelector.tsx @@ -8,6 +8,7 @@ import { Search } from 'lucide-react'; import { useAppContext } from '../store/AppContext'; +import { SubjectIcon } from './SubjectIcon'; interface LocalContextSelectorProps { mode: 'single' | 'multi'; @@ -171,7 +172,7 @@ export const LocalContextSelector: React.FC = ({
- {subject.icon || '📁'} +

{subject.name}

diff --git a/frontend/src/components/SearchView.tsx b/frontend/src/components/SearchView.tsx index e26ad1f0..4f9272d4 100644 --- a/frontend/src/components/SearchView.tsx +++ b/frontend/src/components/SearchView.tsx @@ -1,7 +1,7 @@ import React, { useState, useEffect, useCallback } from 'react'; import { useTranslation } from 'react-i18next'; import { - Search, Sparkles, Lock, FileText, + Search, Sparkles, FileText, SlidersHorizontal, Database, TextSearch, Network, ListOrdered, ChevronDown, X, Copy, Check, Languages, Cpu, Hash, Calendar, Clock, ArrowUpDown, SquarePlay, BookOpen, Globe, Filter, Newspaper, Loader2, @@ -10,6 +10,7 @@ import { import { useAppContext } from '../store/AppContext'; import { motion, AnimatePresence } from 'motion/react'; import { api } from '../services/api'; +import { SubjectIcon } from './SubjectIcon'; // --- Types --- interface SearchResult { @@ -30,7 +31,8 @@ interface SearchResult { const getIcon = (type: string) => { switch (type.toLowerCase()) { - case 'youtube': return SquarePlay; + case 'youtube': + case 'video': return SquarePlay; case 'article': return Newspaper; case 'pdf': return FileText; case 'wikipedia': return BookOpen; @@ -56,24 +58,6 @@ const getModalModeLabelForScore = (mode: string, score: number) => { return `${(score * 100).toFixed(1)}% MATCH`; }; -const renderContextBadge = (selectedSubjects: any[], t: any) => { - if (!selectedSubjects || selectedSubjects.length === 0) { - return {t('sidebar.contexts.none')}; - } - if (selectedSubjects.length <= 2) { - return selectedSubjects.map(s => ( - - {s.name} - - )); - } - return ( - - {selectedSubjects.length} {t('sidebar.contexts.title')} - - ); -}; - export function SearchView() { const { subjects, selectedSubjects, sources } = useAppContext(); const { t } = useTranslation(); @@ -87,6 +71,36 @@ export function SearchView() { const [copied, setCopied] = useState(false); const [searchMode, setSearchMode] = useState<'semantic' | 'bm25' | 'hybrid'>('semantic'); const 
[useRerank, setUseRerank] = useState(true); + // Local context selection (independent from sidebar). Empty array + searchAll=true => search across all contexts. + const [searchSubjectIds, setSearchSubjectIds] = useState(() => selectedSubjects.map(s => s.id)); + const [searchAll, setSearchAll] = useState(true); + const [isContextsOpen, setIsContextsOpen] = useState(false); + const [contextsFilter, setContextsFilter] = useState(''); + const contextsRef = React.useRef(null); + + const filteredSubjects = React.useMemo( + () => subjects.filter(s => s.name.toLowerCase().includes(contextsFilter.toLowerCase())), + [subjects, contextsFilter] + ); + + // Close contexts popover on click outside + useEffect(() => { + const onClick = (e: MouseEvent) => { + if (contextsRef.current && !contextsRef.current.contains(e.target as Node)) { + setIsContextsOpen(false); + } + }; + document.addEventListener('mousedown', onClick); + return () => document.removeEventListener('mousedown', onClick); + }, []); + + const contextsLabel = searchAll + ? t('sidebar.contexts.title_all') + : searchSubjectIds.length === 0 + ? t('sidebar.contexts.none') + : searchSubjectIds.length === 1 + ? subjects.find(s => s.id === searchSubjectIds[0])?.name || t('common.selected_one') + : t('common.selected', { count: searchSubjectIds.length }); const sourceMap = React.useMemo(() => { const map = new Map(); @@ -98,14 +112,15 @@ export function SearchView() { }, [sources]); const runSearch = useCallback(async (currentQuery: string, currentMode: string, currentUseRerank: boolean) => { - if (!currentQuery.trim() || selectedSubjects.length === 0) return; + if (!currentQuery.trim()) return; + if (!searchAll && searchSubjectIds.length === 0) return; setIsSearching(true); setHasSearched(true); setResults([]); try { - const subjectIds = selectedSubjects.length > 0 ? selectedSubjects.map(s => s.id) : undefined; + const subjectIds = searchAll ? 
undefined : searchSubjectIds; const data = await api.search(currentQuery, topK, subjectIds, currentMode, currentUseRerank); const mappedResults: SearchResult[] = data.results.map((res: any) => { @@ -136,7 +151,7 @@ export function SearchView() { } finally { setIsSearching(false); } - }, [selectedSubjects, topK, sourceMap, t]); + }, [searchAll, searchSubjectIds, topK, sourceMap, subjects, t]); // Re-run search automatically when the mode or useRerank changes (only if a search was already performed) useEffect(() => { @@ -158,104 +173,187 @@ export function SearchView() { transition={{ duration: 0.4 }} className="h-full flex flex-col overflow-hidden" > - {/* Header & Search Bar */} -
- + -

{t('search.title')}

-

- {t('search.subtitle')} -

+

{t('search.title')}

+

{t('search.subtitle')}

- -
- +
+ setQuery(e.target.value)} placeholder={t('search.placeholder')} - className="w-full bg-transparent text-xl text-zinc-100 placeholder:text-zinc-600 px-5 py-4 focus:outline-none font-sans" + className="w-full bg-transparent text-base text-zinc-100 placeholder:text-zinc-600 px-4 focus:outline-none" />
- {/* Search Controls */} - -
- - {t('search.results.source')}: -
- {renderContextBadge(selectedSubjects, t)} -
+ {/* Contexts chip — opens its own popover */} +
+ + + + {isContextsOpen && ( + + {/* Search input */} +
+
+ + setContextsFilter(e.target.value)} + placeholder={t('common.actions.search') + '...'} + className="w-full h-8 pl-8 pr-2 bg-white/[0.04] ring-1 ring-white/10 rounded-lg text-xs text-zinc-200 placeholder:text-zinc-600 focus:outline-none focus:ring-emerald-500/30" + /> +
+
+
+ +
+
+
+ {filteredSubjects.length === 0 ? ( +
{t('sidebar.contexts.none')}
+ ) : ( + filteredSubjects.map(s => { + const checked = !searchAll && searchSubjectIds.includes(s.id); + return ( + + ); + }) + )} +
+ + )} +
-
- {/* Top K Selector */} + {/* spacer pushes options to the right */} +
+ + {/* Options group */} +
+ {/* Top K */}
- {isTopKOpen && ( - {[3, 5, 10, 20, 50].map((val) => ( ))} @@ -263,81 +361,55 @@ export function SearchView() {
- {/* Re-rank Toggle */} + {/* Re-rank — icon toggle */} - - {/* Search Mode Toggle */} -
- {/* Semantic */} - - - {/* Keyword (BM25) */} - - - {/* Hybrid */} - - {/* MMR (Disabled — needs dedicated implementation) */} -
- + ); + })} + -
- {t('ingestion.coming_soon.title')} (MMR) -
-
{/* Results Area */} diff --git a/frontend/src/components/SubjectIcon.tsx b/frontend/src/components/SubjectIcon.tsx new file mode 100644 index 00000000..7e9f76ab --- /dev/null +++ b/frontend/src/components/SubjectIcon.tsx @@ -0,0 +1,56 @@ +import React from 'react'; +import { + Brain, Briefcase, ChefHat, Cpu, Landmark, Lightbulb, Activity, Hash, + Database, Book, Globe, Zap, Shield, Search, Code, MessageSquare, Layout, + Layers, HardDrive, Cloud, Lock, User, Users, Target, Award, GraduationCap, + Music, Video, Image, FileText, Mail, Terminal, Bug +} from 'lucide-react'; + +export const ICONS_LIST = [ + { name: 'Brain', icon: Brain }, + { name: 'Briefcase', icon: Briefcase }, + { name: 'ChefHat', icon: ChefHat }, + { name: 'Cpu', icon: Cpu }, + { name: 'Landmark', icon: Landmark }, + { name: 'Lightbulb', icon: Lightbulb }, + { name: 'Activity', icon: Activity }, + { name: 'Hash', icon: Hash }, + { name: 'Database', icon: Database }, + { name: 'Book', icon: Book }, + { name: 'Globe', icon: Globe }, + { name: 'Zap', icon: Zap }, + { name: 'Shield', icon: Shield }, + { name: 'Search', icon: Search }, + { name: 'Code', icon: Code }, + { name: 'MessageSquare', icon: MessageSquare }, + { name: 'Layout', icon: Layout }, + { name: 'Layers', icon: Layers }, + { name: 'HardDrive', icon: HardDrive }, + { name: 'Cloud', icon: Cloud }, + { name: 'Lock', icon: Lock }, + { name: 'User', icon: User }, + { name: 'Users', icon: Users }, + { name: 'Target', icon: Target }, + { name: 'Award', icon: Award }, + { name: 'GraduationCap', icon: GraduationCap }, + { name: 'Music', icon: Music }, + { name: 'Video', icon: Video }, + { name: 'Image', icon: Image }, + { name: 'FileText', icon: FileText }, + { name: 'Mail', icon: Mail }, + { name: 'Terminal', icon: Terminal }, + { name: 'Bug', icon: Bug }, +]; + +interface SubjectIconProps { + readonly iconName?: string; + readonly className?: string; + readonly size?: number; +} + +export function SubjectIcon({ iconName, className, size }: 
SubjectIconProps) { + const item = ICONS_LIST.find(i => i.name === iconName); + const IconComponent = item ? item.icon : Hash; + + return ; +} diff --git a/frontend/src/locales/en.json b/frontend/src/locales/en.json index 78787084..b9545ced 100644 --- a/frontend/src/locales/en.json +++ b/frontend/src/locales/en.json @@ -22,6 +22,9 @@ "selectAll": "Select all", "deselectAll": "Deselect all" }, + "selected_one": "1 selected", + "selected_other": "{{count}} selected", + "all": "All", "errors": { "general": "An unexpected error occurred. Please try again." }, @@ -58,6 +61,7 @@ }, "contexts": { "title": "Knowledge Contexts", + "title_all": "Knowledge Contexts • All", "placeholder": "Filter contexts...", "none": "No contexts found.", "only": "Only", diff --git a/frontend/src/locales/pt-BR.json b/frontend/src/locales/pt-BR.json index 98317080..d95bf03e 100644 --- a/frontend/src/locales/pt-BR.json +++ b/frontend/src/locales/pt-BR.json @@ -22,6 +22,9 @@ "selectAll": "Selecionar todos", "deselectAll": "Desmarcar todos" }, + "selected_one": "1 selecionado", + "selected_other": "{{count}} selecionados", + "all": "Todos", "errors": { "general": "Ocorreu um erro inesperado. Tente novamente." 
}, @@ -58,6 +61,7 @@ }, "contexts": { "title": "Bases de Conhecimento", + "title_all": "Bases de Conhecimento • Todas", "placeholder": "Filtrar contextos...", "none": "Nenhum contexto encontrado.", "only": "Apenas", From f9de053d331b7bff211e8e48b2001fcc5e9850bf Mon Sep 17 00:00:00 2001 From: ericksonlopes Date: Wed, 8 Apr 2026 12:04:03 -0300 Subject: [PATCH 3/7] feat: implementation of chunk duplication tests and sidebar UX improvements --- alembic/env.py | 6 +- ...c845_add_chunk_duplicates_table_and_is_.py | 42 +++ ...673_add_content_source_id_to_duplicates.py | 32 ++ docs/issues/issue-duplication-tests-ux.md | 14 + frontend/src/App.tsx | 106 ++----- frontend/src/components/DiarizationView.tsx | 84 ++--- frontend/src/components/DuplicatesView.tsx | 289 ++++++++++++++++++ frontend/src/components/Sidebar.tsx | 143 ++++++--- frontend/src/components/SidebarContext.tsx | 180 +++++++++++ frontend/src/locales/en.json | 26 ++ frontend/src/locales/pt-BR.json | 34 ++- frontend/src/services/api.ts | 56 +++- frontend/src/store/AppContext.tsx | 8 +- frontend/src/types.ts | 17 +- main.py | 9 + scripts/clear_sql_db.py | 2 +- scripts/dump_database.py | 2 +- scripts/migrate_vector_db.py | 2 +- src/application/workers.py | 147 ++++++++- src/domain/entities/chunk_duplicate_entity.py | 17 ++ src/infrastructure/connectors/__init__.py | 0 .../connector_sql.py} | 1 - .../extractors/youtube_extractor.py | 35 ++- .../sql/chunk_duplicate_repository.py | 98 ++++++ .../sql/chunk_index_repository.py | 21 +- .../sql/content_source_repository.py | 2 +- .../sql/diarization_repository.py | 14 +- .../sql/ingestion_job_repository.py | 2 +- .../sql/knowledge_subject_repository.py | 2 +- .../sql/models/chunk_duplicate.py | 39 +++ .../repositories/sql/models/chunk_index.py | 4 +- .../repositories/sql/models/content_source.py | 2 +- .../sql/models/diarization_record.py | 2 +- .../repositories/sql/models/ingestion_job.py | 2 +- .../sql/models/knowledge_subject.py | 2 +- 
.../repositories/sql/models/user.py | 2 +- .../repositories/sql/models/voice_record.py | 2 +- .../repositories/sql/user_repository.py | 2 +- .../services/chunk_duplicate_service.py | 156 ++++++++++ src/presentation/api/dependencies.py | 18 +- ...udio_diarization_and_recognition_router.py | 6 +- .../api/routes/duplicate_router.py | 96 ++++++ src/presentation/api/routes/ingest_router.py | 28 +- .../api/routes/settings_router.py | 2 +- .../api/schemas/duplicate_schemas.py | 35 +++ test_dispatcher.py | 60 ++++ test_playlist.py | 25 ++ tests/conftest.py | 2 +- .../sql/test_chunk_duplicate_repository.py | 90 ++++++ .../services/test_chunk_duplicate_service.py | 86 ++++++ .../api/routes/test_duplicate_router.py | 74 +++++ tmp/cleanup_db.py | 27 ++ 52 files changed, 1902 insertions(+), 251 deletions(-) create mode 100644 alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py create mode 100644 alembic/versions/84524e052673_add_content_source_id_to_duplicates.py create mode 100644 docs/issues/issue-duplication-tests-ux.md create mode 100644 frontend/src/components/DuplicatesView.tsx create mode 100644 frontend/src/components/SidebarContext.tsx create mode 100644 src/domain/entities/chunk_duplicate_entity.py create mode 100644 src/infrastructure/connectors/__init__.py rename src/infrastructure/{repositories/sql/connector.py => connectors/connector_sql.py} (95%) create mode 100644 src/infrastructure/repositories/sql/chunk_duplicate_repository.py create mode 100644 src/infrastructure/repositories/sql/models/chunk_duplicate.py create mode 100644 src/infrastructure/services/chunk_duplicate_service.py create mode 100644 src/presentation/api/routes/duplicate_router.py create mode 100644 src/presentation/api/schemas/duplicate_schemas.py create mode 100644 test_dispatcher.py create mode 100644 test_playlist.py create mode 100644 tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py create mode 100644 
@writer.rewrites(ops.CreateTableOp)
@writer.rewrites(ops.CreateIndexOp)
def add_if_not_exists(context, revision, op):
    """Flag CREATE TABLE / CREATE INDEX operations as ``IF NOT EXISTS``.

    Registered on ``writer`` for ``ops.CreateTableOp`` and
    ``ops.CreateIndexOp``. The flag is applied only when ``context.as_batch``
    is missing or falsy; the operation object is returned in every case.
    """
    # getattr with a default tolerates contexts that do not expose ``as_batch``.
    batch_mode = getattr(context, "as_batch", False)
    if batch_mode:
        return op
    op.if_not_exists = True
    return op


@writer.rewrites(ops.DropTableOp)
@writer.rewrites(ops.DropIndexOp)
def add_if_exists(context, revision, op):
    """Flag DROP TABLE / DROP INDEX operations as ``IF EXISTS``.

    Registered on ``writer`` for ``ops.DropTableOp`` and ``ops.DropIndexOp``.
    The flag is applied only when ``context.as_batch`` is missing or falsy;
    the operation object is returned in every case.
    """
    # Same tolerant lookup as add_if_not_exists.
    batch_mode = getattr(context, "as_batch", False)
    if batch_mode:
        return op
    op.if_exists = True
    return op
revision: str = '646a175ac845'
down_revision: Union[str, Sequence[str], None] = 'b2c3d4e5f6a7'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Upgrade schema.

    Creates the ``chunk_duplicates`` table and adds a non-nullable
    ``is_active`` boolean column to ``chunk_index``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        'chunk_duplicates',
        sa.Column('id', sa.UUID(), nullable=False),
        sa.Column('chunk_ids', sa.JSON(), nullable=False),
        sa.Column('similarity', sa.Float(), nullable=False),
        sa.Column('status', sa.Text(), nullable=False),
        sa.Column(
            'created_at',
            sa.DateTime(timezone=True),
            server_default=sa.text('(CURRENT_TIMESTAMP)'),
            nullable=False,
        ),
        sa.Column(
            'updated_at',
            sa.DateTime(timezone=True),
            server_default=sa.text('(CURRENT_TIMESTAMP)'),
            nullable=False,
        ),
        sa.PrimaryKeyConstraint('id'),
    )
    # Adjusted from the autogenerated ``sa.text('1')``: an integer literal is
    # not a valid default for a BOOLEAN column on PostgreSQL. ``sa.true()``
    # is rendered by each dialect as its own boolean literal, so existing
    # rows are still back-filled as active on every supported database.
    op.add_column(
        'chunk_index',
        sa.Column(
            'is_active',
            sa.Boolean(),
            server_default=sa.true(),
            nullable=False,
        ),
    )
    # ### end Alembic commands ###


def downgrade() -> None:
    """Downgrade schema.

    Reverses ``upgrade`` in the opposite order: drops ``chunk_index.is_active``
    and then the ``chunk_duplicates`` table.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column('chunk_index', 'is_active')
    op.drop_table('chunk_duplicates')
    # ### end Alembic commands ###
revision: str = '84524e052673'
down_revision: Union[str, Sequence[str], None] = '646a175ac845'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Upgrade schema.

    Adds a nullable ``content_source_id`` column to ``chunk_duplicates`` and
    a deferrable foreign key from it to ``content_sources.id``.
    """
    fk_name = 'fk_chunk_duplicates_content_source_id_content_sources'
    source_column = sa.Column('content_source_id', sa.UUID(), nullable=True)

    # Both changes go through one batch context on the same table.
    with op.batch_alter_table('chunk_duplicates', schema=None) as table_ops:
        table_ops.add_column(source_column)
        table_ops.create_foreign_key(
            fk_name,
            'content_sources',
            ['content_source_id'],
            ['id'],
            initially='IMMEDIATE',
            deferrable=True,
        )


def downgrade() -> None:
    """Downgrade schema.

    Drops the foreign key first, then the ``content_source_id`` column.
    """
    fk_name = 'fk_chunk_duplicates_content_source_id_content_sources'

    with op.batch_alter_table('chunk_duplicates', schema=None) as table_ops:
        table_ops.drop_constraint(fk_name, type_='foreignkey')
        table_ops.drop_column('content_source_id')
+- [x] Fix `tests/conftest.py` import path for infrastructure. + +## Additional Context +The sidebar changes eliminate the need for Ctrl+Click, making the multi-knowledge selection more discoverable. The search field ensures usability as the number of knowledge bases grows. diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 140e0da9..7343c312 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -34,6 +34,8 @@ import {ContentSource} from './types'; import {ChatView} from './components/ChatView'; import {KnowledgeAdminView} from './components/KnowledgeAdminView'; import {QueueMonitorView} from './components/QueueMonitorView'; +import {DuplicatesView} from './components/DuplicatesView'; +import {SidebarContext} from './components/SidebarContext'; function ActivityMonitorView() { @@ -307,10 +309,10 @@ function ContentSourcesView() { const filteredSources = React.useMemo(() => { let result = sources; - // Filter by subject context (Single select in this view) + // Filter by subject context (Multi select) if (selectedSubjects.length > 0) { - const selectedId = selectedSubjects[0].id; - result = result.filter(src => src.subjectId === selectedId); + const selectedIds = selectedSubjects.map(s => s.id); + result = result.filter(src => src.subjectId && selectedIds.includes(src.subjectId)); } if (typeFilter !== 'all') { @@ -423,9 +425,7 @@ function ContentSourcesView() {
- - {/* 🚀 FIXED RIGHT SIDEBAR (Experience unified with Diarization) */} -
-
-
- -

{t('ecosystem.title')}

-
-
- -
-

- {t('ecosystem.description')} -

-
- -
- {subjects.map((ctx) => { - const isSelected = selectedSubjects.some(s => s.id === ctx.id); - return ( - - ); - })} - {subjects.length === 0 && ( -
- - {t('ecosystem.no_base')} -
- )} -
- -
-
-
- - {selectedSubjects.length > 0 ? selectedSubjects[0].name : t('ecosystem.no_active_base')} - -
-
-
); } @@ -589,19 +529,27 @@ function MainContent() { {/* View Router */} -
- - {currentView === 'activity' && } - {currentView === 'queue' && } - {currentView === 'sources' && } - - {currentView === 'chat' && } - {currentView === 'search' && } - {currentView === 'database' && } - {currentView === 'knowledge_contexts' && } - {currentView === 'diarization' && } - {currentView === 'voice_profiles' && } - +
+
+ + {currentView === 'activity' && } + {currentView === 'queue' && } + {currentView === 'sources' && } + + {currentView === 'chat' && } + {currentView === 'search' && } + {currentView === 'database' && } + {currentView === 'knowledge_contexts' && } + {currentView === 'diarization' && } + {currentView === 'voice_profiles' && } + {currentView === 'duplicates' && } + +
+ + {/* Global Ecosystem Sidebar for Data operations */} + {['sources', 'duplicates', 'diarization'].includes(currentView) && ( + + )}
{ if (!silent) setIsLoadingJobs(true); try { - const subjectId = selectedSubjects.length > 0 ? selectedSubjects[0].id : undefined; - const data = await api.fetchDiarizations(50, 0, subjectId); + const subject_ids = selectedSubjects.map(s => s.id); + const data = await api.fetchDiarizations(50, 0, subject_ids); const mappedJobs = data.map(mapBackendJob); setJobs(mappedJobs); return mappedJobs; @@ -209,10 +206,13 @@ export function DiarizationView() { if (url) { const audio = new Audio(url); audioRef.current = audio; - audio.onended = () => { + + const onAudioEnded = () => { setSpeakers(prev => prev.map(s => ({...s, isPlaying: false}))); audioRef.current = null; }; + + audio.onended = onAudioEnded; await audio.play(); } } catch (err) { @@ -442,65 +442,19 @@ export function DiarizationView() {
- {/* 🔵 RIGHT SIDEBAR */} + {/* 🔵 RIGHT SIDEBAR (Metadata only, Ecosystem is global) */} - - {viewMode === 'list' ? ( -
-
-
- -

{t('ecosystem.title')}

-
-
- -
-

- {t('ecosystem.description')} -

-
- -
- {subjects.map((ctx) => { - const isSelected = selectedSubjects.some(s => s.id === ctx.id); - return ( - - ); - })} -
-
- ) : ( - activeJob && - )} -
+ {viewMode === 'detail' && activeJob && ( + + + + )}
{/* MODALS */} diff --git a/frontend/src/components/DuplicatesView.tsx b/frontend/src/components/DuplicatesView.tsx new file mode 100644 index 00000000..c168e4bf --- /dev/null +++ b/frontend/src/components/DuplicatesView.tsx @@ -0,0 +1,289 @@ +import React, { useEffect, useState, useCallback } from 'react'; +import { useTranslation } from 'react-i18next'; +import { + Copy, + Trash2, + CheckCircle, + AlertTriangle, + RefreshCw, + ExternalLink, + Loader2, + Clock, + ChevronLeft, + ChevronRight +} from 'lucide-react'; +import { motion } from 'framer-motion'; +import { useAppContext } from '../store/AppContext'; +import { api } from '../services/api'; +import { ChunkDuplicate } from '../types'; + +export function DuplicatesView() { + const { t } = useTranslation(); + const { addToast, selectedSubjects } = useAppContext(); + const [duplicates, setDuplicates] = useState([]); + const [isLoading, setIsLoading] = useState(true); + const [isSyncing, setIsSyncing] = useState(false); + const [filterStatus, setFilterStatus] = useState('pending'); + const [page, setPage] = useState(1); + const [total, setTotal] = useState(0); + const pageSize = 10; + + const fetchDuplicates = useCallback(async (isRefresh = false) => { + if (isRefresh) setIsSyncing(true); + else setIsLoading(true); + + try { + const subject_ids = selectedSubjects.map(s => s.id); + const offset = (page - 1) * pageSize; + const { results, total: totalCount } = await api.fetchDuplicates(filterStatus, subject_ids, pageSize, offset); + setDuplicates(results); + setTotal(totalCount); + } catch (err) { + console.error('Failed to fetch duplicates:', err); + addToast(t('common.errors.generic'), 'error'); + } finally { + setIsLoading(false); + setIsSyncing(false); + } + }, [filterStatus, selectedSubjects, addToast, t, page, pageSize]); + + useEffect(() => { + fetchDuplicates(); + }, [fetchDuplicates]); + + // Reset page when filter or subjects change + useEffect(() => { + setPage(1); + }, [filterStatus, selectedSubjects]); 
+ + const handleUpdateStatus = async (id: string, status: string) => { + try { + await api.updateDuplicateStatus(id, status); + addToast(t('common.actions.success'), 'success'); + fetchDuplicates(); + } catch (err) { + console.error('Failed to update status:', err); + addToast(t('common.errors.generic'), 'error'); + } + }; + + const handleDeactivate = async (chunkId: string) => { + try { + await api.deactivateChunk(chunkId); + addToast(t('common.actions.success'), 'success'); + fetchDuplicates(); + } catch (err) { + console.error('Failed to deactivate chunk:', err); + addToast(t('common.errors.generic'), 'error'); + } + }; + + const totalPages = Math.ceil(total / pageSize); + + const containerVariants = { + hidden: { opacity: 0 }, + show: { + opacity: 1, + transition: { + staggerChildren: 0.1 + } + } + }; + + const itemVariants = { + hidden: { opacity: 0, y: 20 }, + show: { opacity: 1, y: 0 } + }; + + return ( +
+
+
+
+
+ +
+
+

{t('operations.duplicatesTitle')}

+

{t('operations.duplicatesDesc')}

+
+
+ +
+
+ {['pending', 'reviewed', 'ignored'].map((status) => ( + + ))} +
+ + +
+
+
+ +
+ {isLoading ? ( +
+ + Analisando redundâncias... +
+ ) : duplicates.length === 0 ? ( + +
+ +
+

{t('operations.noDuplicates')}

+

Sua base de conhecimento parece estar limpa e sem redundâncias óbvias.

+
+ ) : ( + + {duplicates.map((dup) => ( + +
+ +
+ {/* Header do Card */} +
+
+
+ + Conflito de Similaridade +
+
+ Score: + {(dup.similarity * 100).toFixed(1)}% +
+
+ +
+ + +
+
+ + {/* Comparação Lado a Lado */} +
+ {dup.chunks?.map((chunk, idx) => ( +
+
+
+ #{(idx + 1).toString().padStart(2, '0')} + {chunk.source_title} +
+
+ +
+
+
+

+ "{chunk.content}" +

+
+
+ ID: {chunk.id.slice(0, 8)}... + +
+
+ ))} +
+
+ + ))} + + )} +
+ +
+
+
+ + Total: {total} encontrados +
+ + {totalPages > 1 && ( +
+ +
+ {page} / {totalPages} +
+ +
+ )} +
+ + +
+
+ ); +} diff --git a/frontend/src/components/Sidebar.tsx b/frontend/src/components/Sidebar.tsx index ba2c6561..0f0ddf49 100644 --- a/frontend/src/components/Sidebar.tsx +++ b/frontend/src/components/Sidebar.tsx @@ -1,7 +1,7 @@ -import React, {useState} from 'react'; -import {useAppContext} from '../store/AppContext'; -import {useAuth} from '../store/AuthContext'; -import {useTranslation} from 'react-i18next'; +import React, { useState } from 'react'; +import { useAppContext } from '../store/AppContext'; +import { useAuth } from '../store/AuthContext'; +import { useTranslation } from 'react-i18next'; import { Activity as ActivityIcon, Database, @@ -11,16 +11,25 @@ import { Search, Settings, User, - Layers + Layers, + ChevronDown, + Copy } from 'lucide-react'; -import {SettingsModal} from './SettingsModal'; +import { SettingsModal } from './SettingsModal'; export function Sidebar() { const { currentView, setCurrentView } = useAppContext(); const { user, logout, isAuthEnabled } = useAuth(); const { t } = useTranslation(); const [isSettingsModalOpen, setIsSettingsModalOpen] = useState(false); + const [expandedGroups, setExpandedGroups] = useState>({ + data: true + }); + + const toggleGroup = (id: string) => { + setExpandedGroups(prev => ({ ...prev, [id]: !prev[id] })); + }; const navGroups = [ { @@ -41,9 +50,12 @@ export function Sidebar() { }, { id: 'data', - label: t('sidebar.groups.data'), + label: t('sidebar.operations.contentSources'), + icon: Database, + isExpandable: true, items: [ { id: 'sources', label: t('sidebar.operations.sources'), icon: Database }, + { id: 'duplicates', label: t('sidebar.operations.duplicates'), icon: Copy }, { id: 'diarization', label: t('sidebar.operations.diarization'), icon: Mic }, ] }, @@ -55,17 +67,16 @@ export function Sidebar() { { id: 'queue', label: t('sidebar.operations.queue', 'Task Queue (Redis)'), icon: Layers }, ] } - - ] as const; + ]; const getItemClass = (isActive: boolean, isDisabled: boolean) => { - if (isActive) 
return 'bg-emerald-500/10 text-emerald-400 font-bold border border-emerald-500/20'; + if (isActive) return 'bg-emerald-500/10 text-emerald-400 font-bold border border-emerald-500/20 shadow-[0_4px_12px_rgba(16,185,129,0.05)]'; if (isDisabled) return 'opacity-30 cursor-not-allowed grayscale'; return 'text-zinc-400 hover:bg-white/5 hover:text-zinc-200'; }; return ( -
+
{/* Brand */}
@@ -81,39 +92,85 @@ export function Sidebar() { {/* Navigation Groups - Scrollable Area */}
- {navGroups.map((group, index) => ( -
-

- {group.label} -

-
- {group.items.map((item) => { - const Icon = item.icon; - const isSourcesGroup = item.id === 'sources'; - const isActive = currentView === item.id || (isSourcesGroup && currentView === 'database'); - const isDisabled = 'disabled' in item ? item.disabled : false; + {navGroups.map((group, index) => { + const isExpandable = 'isExpandable' in group ? (group as any).isExpandable : false; + const isExpanded = expandedGroups[group.id] ?? false; + const GroupIcon = 'icon' in group ? (group as any).icon : null; + + if (isExpandable) { + return ( +
+ + + {isExpanded && ( +
+ {group.items.map((item) => { + const Icon = item.icon; + const isActive = currentView === item.id || (item.id === 'sources' && currentView === 'database'); + const isDisabled = 'disabled' in item ? item.disabled : false; + + return ( + + ); + })} +
+ )} + {index < navGroups.length - 1 &&
} +
+ ); + } - return ( - - ); - })} + return ( +
+

+ {group.label} +

+
+ {group.items.map((item) => { + const Icon = item.icon; + const isActive = currentView === item.id; + const isDisabled = 'disabled' in item ? item.disabled : false; + + return ( + + ); + })} +
+ {index < navGroups.length - 1 &&
}
- {index < navGroups.length - 1 &&
} -
- ))} + ); + })}
{/* User Profile / Settings */} @@ -159,8 +216,6 @@ export function Sidebar() {
- - setIsSettingsModalOpen(false)} diff --git a/frontend/src/components/SidebarContext.tsx b/frontend/src/components/SidebarContext.tsx new file mode 100644 index 00000000..e802afa8 --- /dev/null +++ b/frontend/src/components/SidebarContext.tsx @@ -0,0 +1,180 @@ +import React, { useCallback, useState, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Database, Plus, Eraser, Check, MousePointer2, Search } from 'lucide-react'; +import { motion, AnimatePresence } from 'motion/react'; +import { useAppContext } from '../store/AppContext'; +import { Subject } from '../types'; + +export function SidebarContext() { + const { + subjects, + selectedSubjects, + setSelectedSubjects, + setIsAddSubjectModalOpen + } = useAppContext(); + const { t } = useTranslation(); + const [searchQuery, setSearchQuery] = useState(''); + + const filteredSubjects = useMemo(() => { + return subjects.filter(s => + s.name.toLowerCase().includes(searchQuery.toLowerCase()) + ); + }, [subjects, searchQuery]); + + const handleSelectAll = useCallback(() => { + setSelectedSubjects([]); // Empty means "All" + }, [setSelectedSubjects]); + + const handleToggleSubject = useCallback((subject: Subject) => { + // Simplified UX: Always toggle selection on click + const isSelected = selectedSubjects.some(s => s.id === subject.id); + + if (isSelected) { + // If it's the last one being unselected, revert to "All" (empty list) + if (selectedSubjects.length === 5) { + // Wait, empty list means "All" in this app. + } + setSelectedSubjects(selectedSubjects.filter(s => s.id !== subject.id)); + } else { + setSelectedSubjects([...selectedSubjects, subject]); + } + }, [selectedSubjects, setSelectedSubjects]); + + const isAllSelected = selectedSubjects.length === 0; + + return ( +
+
+
+ +

{t('sidebarContext.title')}

+
+
+ + {!isAllSelected && ( + + + + )} + + +
+
+ +
+
+

+ + {t('sidebarContext.description')} +

+
+ +
+
+ + setSearchQuery(e.target.value)} + placeholder={t('common.actions.search') + '...'} + className="w-full pl-9 pr-3 py-2 bg-white/[0.03] border border-white/5 rounded-xl text-xs text-zinc-300 placeholder:text-zinc-600 focus:outline-none focus:border-emerald-500/30 transition-all font-medium" + /> +
+
+ +
+ {/* ALL / TODOS Button */} + + +
+ + + {filteredSubjects.map((ctx) => { + const isSelected = selectedSubjects.some(s => s.id === ctx.id); + return ( + handleToggleSubject(ctx)} + className={`w-full group flex items-center gap-3 p-3 rounded-2xl transition-all border ${isSelected ? 'bg-emerald-500/10 border-emerald-500/30 text-white shadow-[0_0_20px_rgba(16,185,129,0.1)]' : 'bg-transparent border-transparent text-zinc-500 hover:bg-white/5 hover:text-zinc-300'}`} + > +
+ +
+
+
+ {ctx.name} +
+
+ {ctx.sourceCount || 0} {t('sidebarContext.sources_count')} +
+
+ {isSelected && ( + + + + )} +
+ ); + })} +
+ + {subjects.length === 0 && ( +
+ + {t('sidebarContext.no_base')} +
+ )} +
+ +
+
+
+ + {isAllSelected ? t('sidebarContext.all_bases') : selectedSubjects.map(s => s.name).join(', ')} + +
+
+
+ ); +} diff --git a/frontend/src/locales/en.json b/frontend/src/locales/en.json index b9545ced..40cbe729 100644 --- a/frontend/src/locales/en.json +++ b/frontend/src/locales/en.json @@ -25,6 +25,8 @@ "selected_one": "1 selected", "selected_other": "{{count}} selected", "all": "All", + "active": "Active", + "select": "Select", "errors": { "general": "An unexpected error occurred. Please try again." }, @@ -81,6 +83,8 @@ "videosSelected": "Video(s) selected", "voices": "Voice Profiles", "sources": "Content Sources", + "contentSources": "Content Sources", + "duplicates": "Duplicates", "chunks": "Sources > Chunks", "activity": "Activity Monitor", "queue": "Task Queue (Redis)", @@ -101,6 +105,14 @@ "settings": "Settings" } }, + "sidebarContext": { + "title": "Knowledge Bases", + "description": "Filter content sources by one or more selected contexts.", + "all_bases": "All Knowledge Bases", + "clear_filter": "Clear Selection", + "sources_count": "sources", + "no_base": "No knowledge base found" + }, "search": { "title": "Semantic Search", "subtitle": "Explore your knowledge base using semantic, keyword, or hybrid search.", @@ -244,6 +256,20 @@ "phase": "Phase", "duration": "duration" }, + "operations": { + "duplicatesTitle": "Duplicate Management", + "duplicatesDesc": "We identified segments with high similarity (>90%) that might be redundant or repeated cuts.", + "duplicates": "Duplicates", + "similarity": "Similarity", + "ignore": "Ignore", + "deactivate": "Deactivate", + "resolved": "Resolved", + "pending": "Pending", + "reviewed": "Reviewed", + "ignored": "Ignored", + "noDuplicates": "No duplicates found at the moment.", + "contentSources": "Content Sources" + }, "settings": { "title": "Settings", "subtitle": "Manage your local environment configurations", diff --git a/frontend/src/locales/pt-BR.json b/frontend/src/locales/pt-BR.json index d95bf03e..65581eac 100644 --- a/frontend/src/locales/pt-BR.json +++ b/frontend/src/locales/pt-BR.json @@ -25,6 +25,8 @@ 
"selected_one": "1 selecionado", "selected_other": "{{count}} selecionados", "all": "Todos", + "active": "Ativo", + "select": "Selecionar", "errors": { "general": "Ocorreu um erro inesperado. Tente novamente." }, @@ -75,12 +77,13 @@ "search": "Busca Semântica", "diarization": "Reconhecimento de Fala", "voices": "Perfis de Voz", - "sources": "Fontes de Conteúdo", - "chunks": "Fontes > Chunks", + "contentSources": "Fontes de Conteúdo", + "duplicates": "Duplicidades", + "sources": "Fontes", + "knowledgeSubject": "Assunto de Conhecimento", "activity": "Monitor de Atividade", "queue": "Fila de Tarefas (Redis)", "knowledge_contexts": "Bases de Conhecimento", - "database": "Detalhes da Fonte", "voice_profiles": "Perfis de Voz" }, @@ -97,6 +100,14 @@ "settings": "Configurações" } }, + "sidebarContext": { + "title": "Bases de Conhecimento", + "description": "Filtre as fontes de conteúdo por um ou mais contextos selecionados.", + "all_bases": "Todas as Bases", + "clear_filter": "Limpar Filtros", + "sources_count": "fontes", + "no_base": "Nenhuma base de conhecimento encontrada" + }, "search": { "title": "Busca Semântica", "subtitle": "Explore sua base de conhecimento usando busca semântica, palavra-chave ou híbrida.", @@ -238,7 +249,22 @@ "pipeline_optimized": "Pipeline otimizada", "reprocessed_message": "Substituído por uma nova versão", "phase": "Phase", - "duration": "duração" + "duration": "duração", + "diarization": "Diarização" + }, + "operations": { + "duplicatesTitle": "Gerenciamento de Duplicidades", + "duplicatesDesc": "Identificamos trechos com alta similaridade (>90%) que podem ser redundantes ou cortes repetidos.", + "duplicates": "Duplicidades", + "similarity": "Similaridade", + "ignore": "Ignorar", + "deactivate": "Inativar", + "resolved": "Resolvido", + "pending": "Pendente", + "reviewed": "Revisado", + "ignored": "Ignorado", + "noDuplicates": "Nenhuma duplicidade encontrada no momento.", + "contentSources": "Fontes de Conteúdo" }, "settings": { "title": 
"Configurações", diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts index 4eedcafc..ec7c23ce 100644 --- a/frontend/src/services/api.ts +++ b/frontend/src/services/api.ts @@ -1,4 +1,4 @@ -import { Subject, IngestionTask, ContentSource, Chunk, PaginatedResponse, RawQueueTask } from '../types'; +import { Subject, IngestionTask, ContentSource, Chunk, PaginatedResponse, RawQueueTask, ChunkDuplicate } from '../types'; const API_BASE_URL = '/rest'; @@ -353,11 +353,18 @@ export const api = { }, // Diarization Methods - async fetchDiarizations(limit = 20, offset = 0, subjectId?: string): Promise { + async fetchDiarizations(limit = 10, offset = 0, subject_ids?: string | string[]): Promise { const url = new URL(`${API_BASE_URL}/audio`, globalThis.location.origin); url.searchParams.append('limit', limit.toString()); url.searchParams.append('offset', offset.toString()); - if (subjectId) url.searchParams.append('subject_id', subjectId); + + if (subject_ids) { + if (Array.isArray(subject_ids)) { + subject_ids.forEach(id => url.searchParams.append('subject_id', id)); + } else { + url.searchParams.append('subject_id', subject_ids); + } + } const response = await fetch(url.toString(), { headers: getHeaders() @@ -567,6 +574,49 @@ export const api = { headers: getHeaders() }); await handleResponseError(response, 'Failed to remove task from queue'); + }, + + // Duplicate Management + async fetchDuplicates(status?: string, subject_ids?: string[], limit = 100, offset = 0): Promise<{ results: ChunkDuplicate[]; total: number }> { + const url = new URL(`${API_BASE_URL}/duplicates`, globalThis.location.origin); + if (status) url.searchParams.append('status', status); + if (subject_ids && subject_ids.length > 0) { + subject_ids.forEach(id => url.searchParams.append('subject_id', id)); + } + url.searchParams.append('limit', limit.toString()); + url.searchParams.append('offset', offset.toString()); + + const response = await fetch(url.toString(), { + headers: getHeaders() 
+ }); + await handleResponseError(response, 'Failed to fetch duplicates'); + return response.json(); + }, + + async updateDuplicateStatus(id: string, status: string): Promise { + const response = await fetch(`${API_BASE_URL}/duplicates/${id}/status`, { + method: 'PATCH', + headers: getHeaders(), + body: JSON.stringify({ status }) + }); + await handleResponseError(response, 'Failed to update duplicate status'); + }, + + async deactivateChunk(id: string): Promise { + const response = await fetch(`${API_BASE_URL}/duplicates/chunks/${id}/deactivate`, { + method: 'POST', + headers: getHeaders() + }); + await handleResponseError(response, 'Failed to deactivate chunk'); + }, + + async analyzeAllDuplicates(): Promise { + const response = await fetch(`${API_BASE_URL}/duplicates/analyze-all`, { + method: 'POST', + headers: getHeaders() + }); + await handleResponseError(response, 'Global analysis failed'); + return response.json(); } }; diff --git a/frontend/src/store/AppContext.tsx b/frontend/src/store/AppContext.tsx index ed000a31..22712ecc 100644 --- a/frontend/src/store/AppContext.tsx +++ b/frontend/src/store/AppContext.tsx @@ -79,7 +79,7 @@ export function AppProvider({ children }: { readonly children: ReactNode }) { const [isJobsLoaded, setIsJobsLoaded] = useState(false); const [currentView, setCurrentView] = useState(() => { const saved = localStorage.getItem('currentView') as ViewState; - const validViews: ViewState[] = ['chat', 'search', 'sources', 'activity', 'database', 'knowledge_contexts', 'diarization', 'voice_profiles']; + const validViews: ViewState[] = ['chat', 'search', 'sources', 'activity', 'database', 'knowledge_contexts', 'diarization', 'voice_profiles', 'duplicates']; const initial = validViews.includes(saved) ? 
saved : 'search'; return initial; }); @@ -153,9 +153,7 @@ export function AppProvider({ children }: { readonly children: ReactNode }) { } } - if (data.length > 0 && selectedSubjects.length === 0) { - setSelectedSubjects([data[0]]); - } + // No auto-selecting anymore, empty selection means "All" } catch (err) { console.error('Error fetching subjects:', err); } @@ -398,10 +396,8 @@ export function AppProvider({ children }: { readonly children: ReactNode }) { // Persist selectedSubjects useEffect(() => { - if (selectedSubjects.length > 0) { const ids = selectedSubjects.map(s => s.id); localStorage.setItem('selectedSubjectIds', JSON.stringify(ids)); - } }, [selectedSubjects]); const toggleSubjectSelection = useCallback((subject: Subject) => { diff --git a/frontend/src/types.ts b/frontend/src/types.ts index 53384d3e..88958413 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -55,7 +55,22 @@ export interface RawQueueTask { enqueued_at: number; } -export type ViewState = 'chat' | 'search' | 'sources' | 'activity' | 'database' | 'knowledge_contexts' | 'diarization' | 'voice_profiles' | 'queue'; +export type ViewState = 'chat' | 'search' | 'sources' | 'activity' | 'database' | 'knowledge_contexts' | 'diarization' | 'voice_profiles' | 'queue' | 'duplicates'; + +export type ChunkDuplicate = { + id: string; + chunk_ids: string[]; + chunks?: { + id: string; + content: string; + source_title?: string; + source_id?: string; + }[]; + similarity: number; + status: string; + created_at: string; + updated_at: string; +}; export type ToastType = 'success' | 'info' | 'error'; diff --git a/main.py b/main.py index a212feed..d5206c26 100644 --- a/main.py +++ b/main.py @@ -25,6 +25,7 @@ from src.presentation.api.routes import ( # noqa: E402 auth_router, chunk_router, + duplicate_router, ingest_router, job_router, notification_router, @@ -89,6 +90,7 @@ async def lifespan(app: FastAPI): run_audio_diarization_dispatcher_worker, run_audio_diarization_worker, 
run_diarization_ingestion_worker, + run_duplicate_detection_worker, run_file_ingestion_worker, run_web_ingestion_worker, run_youtube_dispatcher_worker, @@ -106,6 +108,7 @@ async def lifespan(app: FastAPI): "run_audio_diarization_dispatcher_worker", run_audio_diarization_dispatcher_worker, ) + register_task("run_duplicate_detection_worker", run_duplicate_detection_worker) logger.info("Initializing RedisTaskQueueService...") app.state.task_queue = RedisTaskQueueService(num_workers=1) @@ -190,6 +193,12 @@ async def lifespan(app: FastAPI): tags=["Chunks"], dependencies=secured_deps, ) +app.include_router( + duplicate_router.router, + prefix="/rest/duplicates", + tags=["Duplicates"], + dependencies=secured_deps, +) app.include_router( notification_router.router, prefix="/rest/notifications", diff --git a/scripts/clear_sql_db.py b/scripts/clear_sql_db.py index a5231263..39cc29c4 100644 --- a/scripts/clear_sql_db.py +++ b/scripts/clear_sql_db.py @@ -4,7 +4,7 @@ # Add project root to sys.path sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from src.infrastructure.repositories.sql.connector import engine, Base +from infrastructure.connectors.connector_sql import engine, Base from src.infrastructure.repositories.sql import models # noqa: F401 from src.config.logger import Logger diff --git a/scripts/dump_database.py b/scripts/dump_database.py index af0513a9..529ce528 100644 --- a/scripts/dump_database.py +++ b/scripts/dump_database.py @@ -11,7 +11,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from src.infrastructure.repositories.sql.connector import engine +from infrastructure.connectors.connector_sql import engine logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" diff --git a/scripts/migrate_vector_db.py b/scripts/migrate_vector_db.py index 00e75a35..8f58993e 100644 --- a/scripts/migrate_vector_db.py +++ b/scripts/migrate_vector_db.py @@ -31,7 +31,7 @@ from sqlalchemy.orm 
import Session from sqlalchemy import or_, and_ from clear_vector_db import clear_vector_db -from src.infrastructure.repositories.sql.connector import Session as DBSessionFactory +from src.infrastructure.connectors.connector_sql import Session as DBSessionFactory from src.infrastructure.repositories.sql.models.chunk_index import ChunkIndexModel from src.infrastructure.services.model_loader_service import ModelLoaderService from src.infrastructure.repositories.vector.models.chunk_model import ChunkModel diff --git a/src/application/workers.py b/src/application/workers.py index a9a7be28..7ff71021 100644 --- a/src/application/workers.py +++ b/src/application/workers.py @@ -72,7 +72,16 @@ def run_file_ingestion_worker(cmd: IngestFileCommand): event_bus=ctx.event_bus, ) - use_case.execute(cmd) + result = use_case.execute(cmd) + + # Enqueue duplicate detection + if result and "vector_ids" in result: + task_queue = app.state.task_queue + task_queue.enqueue( + run_duplicate_detection_worker, + {"chunk_ids": result["vector_ids"]}, + task_title=f"Dup Check: {cmd.file_name}", + ) except Exception as e: logger.error(f"Worker Error: Failed to execute file ingestion: {e}", exc_info=True) finally: @@ -119,7 +128,16 @@ def run_youtube_ingestion_worker(cmd: IngestYoutubeCommand): event_bus=ctx.event_bus, ) - use_case.execute(cmd) + result = use_case.execute(cmd) + + # Enqueue duplicate detection + if result and "vector_ids" in result: + task_queue = app.state.task_queue + task_queue.enqueue( + run_duplicate_detection_worker, + {"chunk_ids": result["vector_ids"]}, + task_title=f"Dup Check YouTube: {cmd.video_url}", + ) except Exception as e: logger.error(f"Worker Error: Failed to execute YouTube ingestion: {e}", exc_info=True) finally: @@ -144,8 +162,20 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): try: from src.application.dtos.enums.youtube_data_type import YoutubeDataType from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor + from 
src.presentation.api.dependencies import resolve_ingestion_context task_queue = app.state.task_queue + context = resolve_ingestion_context(app) + job_service = context.job_service + job_id = str(cmd.ingestion_job_id) if cmd.ingestion_job_id else None + + if job_id: + job_service.update_job_status( + job_id, + status="PROCESSING", + status_message=f"Resolving {cmd.data_type} videos...", + progress=5, + ) # 1. Resolve the full list of URLs video_list = [] @@ -154,6 +184,8 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): playlist_url = cmd.video_url or (cmd.video_urls[0] if cmd.video_urls else None) if not playlist_url: logger.warning("No URL provided for playlist dispatcher") + if job_id: + job_service.update_job_status(job_id, "FAILED", "Missing playlist URL.") return extractor = YoutubeExtractor(language=cmd.language) @@ -163,6 +195,8 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): channel_url = cmd.video_url or (cmd.video_urls[0] if cmd.video_urls else None) if not channel_url: logger.warning("No URL provided for channel dispatcher") + if job_id: + job_service.update_job_status(job_id, "FAILED", "Missing channel URL.") return extractor = YoutubeExtractor(language=cmd.language) @@ -173,15 +207,27 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): if not video_list: logger.warning(f"YouTube Dispatcher resolved 0 videos for type {cmd.data_type}.") + if job_id: + job_service.update_job_status( + job_id, + "FAILED", + f"No videos found in {cmd.data_type}. Verify if the URL is valid and public.", + ) return + if job_id: + job_service.update_job_status( + job_id, + status="PROCESSING", + status_message=f"Dispatched {len(video_list)} videos for ingestion.", + progress=50, + ) + logger.info(f"YouTube Dispatcher resolved {len(video_list)} videos. Enqueueing individual tasks...") # 2. 
Enqueue each video as a separate task for url in video_list: # Create a clone of the command for a single video - # IMPORTANT: We don't reuse the same ingestion_job_id from the dispatcher - # so that each video can either create its own tracking or we let the use case handle it. single_cmd = IngestYoutubeCommand( video_url=url, subject_id=cmd.subject_id, @@ -202,9 +248,21 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): ) logger.info(f"Successfully dispatched {len(video_list)} YouTube ingestion tasks.") + if job_id: + job_service.update_job_status( + job_id, + status="SUCCESS", + status_message=f"Dispatched {len(video_list)} videos successfully.", + progress=100, + ) except Exception as e: logger.error(f"YouTube Dispatcher Worker Error: {e}", exc_info=True) + if job_id: + try: + job_service.update_job_status(job_id, "FAILED", str(e)) + except Exception: + pass finally: clear_global_context() @@ -238,7 +296,7 @@ def run_diarization_ingestion_worker(cmd: IngestDiarizationCommand): vector_svc = ChunkVectorService(vector_repo, rerank_service=rerank_svc) # DiarizationRepository needs a DB session - from src.infrastructure.repositories.sql.connector import Session as DBSession + from infrastructure.connectors.connector_sql import Session as DBSession db = DBSession() try: @@ -256,7 +314,16 @@ def run_diarization_ingestion_worker(cmd: IngestDiarizationCommand): event_bus=ctx.event_bus, ) - use_case.execute(cmd) + result = use_case.execute(cmd) + + # Enqueue duplicate detection + if result and "vector_ids" in result: + task_queue = app.state.task_queue + task_queue.enqueue( + run_duplicate_detection_worker, + {"chunk_ids": result["vector_ids"]}, + task_title=f"Dup Check Diarization: {cmd.source}" + ) finally: db.close() except Exception as e: @@ -313,7 +380,16 @@ async def _run(): extractor=extractor, ) - await use_case.execute(cmd) + result = await use_case.execute(cmd) + + # Enqueue duplicate detection + if result and "vector_ids" in result: + task_queue 
= app.state.task_queue + task_queue.enqueue( + run_duplicate_detection_worker, + {"chunk_ids": result["vector_ids"]}, + task_title=f"Dup Check Web: {cmd.url}" + ) except Exception as e: logging.getLogger(__name__).error(f"Worker Error: Failed to execute Web Scraping: {e}", exc_info=True) finally: @@ -324,12 +400,12 @@ async def _run(): def _audio_diarization_subprocess(cmd_dict: dict): """Run audio diarization in a separate process to avoid torch/CUDA thread deadlocks.""" + from infrastructure.connectors.connector_sql import ( + Session as DBSessionFactory, + ) from src.application.use_cases.process_audio_diarization_pipeline import ( ProcessAudioDiarizationPipelineUseCase, ) - from src.infrastructure.repositories.sql.connector import ( - Session as DBSessionFactory, - ) from src.infrastructure.repositories.sql.content_source_repository import ( ContentSourceSQLRepository, ) @@ -400,10 +476,10 @@ def run_audio_diarization_dispatcher_worker(cmd: ProcessAudioCommand): return try: - from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor - from src.infrastructure.repositories.sql.connector import ( + from infrastructure.connectors.connector_sql import ( Session as DBSessionFactory, ) + from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor from src.infrastructure.repositories.sql.diarization_repository import ( DiarizationRepository, ) @@ -516,7 +592,7 @@ def run_audio_diarization_worker(cmd: ProcessAudioCommand): if process.exitcode != 0: logger.error("Audio diarization subprocess exited with code %d", process.exitcode) if cmd.diarization_id: - from src.infrastructure.repositories.sql.connector import ( + from infrastructure.connectors.connector_sql import ( Session as DBSessionFactory, ) from src.infrastructure.repositories.sql.diarization_repository import ( @@ -574,10 +650,10 @@ def run_voice_training_worker(cmd: TrainVoiceCommand): return try: + from infrastructure.connectors.connector_sql import Session as DBSession 
from src.application.use_cases.manage_voice_profiles import ( TrainVoiceProfileFromSpeakerSegmentUseCase, ) - from src.infrastructure.repositories.sql.connector import Session as DBSession from src.presentation.api.dependencies import resolve_ingestion_context ctx = resolve_ingestion_context(app) @@ -595,3 +671,46 @@ def run_voice_training_worker(cmd: TrainVoiceCommand): logger.error(f"Worker Error: Failed to execute voice training: {e}", exc_info=True) finally: clear_global_context() + + +def run_duplicate_detection_worker(cmd: dict): + """Background worker for detecting duplicate chunks.""" + set_global_context({"correlation_id": "worker-duplicate-detection"}) + + app = _get_app() + if not app: + clear_global_context() + return + + try: + from src.infrastructure.services.chunk_duplicate_service import ChunkDuplicateService + from src.presentation.api.dependencies import ( + get_chunk_repo, + get_chunk_vector_service, + get_duplicate_repo, + resolve_rerank_service, + resolve_vector_repository, + ) + + # Manual resolution since we don't have a Request object + vector_repo = resolve_vector_repository(app) + rerank_svc = resolve_rerank_service(app) + vector_svc = get_chunk_vector_service(vector_repo, rerank_svc) + + duplicate_repo = get_duplicate_repo() + chunk_repo = get_chunk_repo() + + service = ChunkDuplicateService(duplicate_repo, chunk_repo, vector_svc) + + chunk_ids = cmd.get("chunk_ids", []) + if not chunk_ids: + return + + logger.info(f"Running duplicate detection for {len(chunk_ids)} chunks") + count = service.find_and_register_duplicates(chunk_ids, similarity_threshold=0.90) + logger.info(f"Duplicate detection finished. 
Found {count} potential duplicate groups.") + + except Exception as e: + logger.error(f"Worker Error: Failed to execute duplicate detection: {e}", exc_info=True) + finally: + clear_global_context() diff --git a/src/domain/entities/chunk_duplicate_entity.py b/src/domain/entities/chunk_duplicate_entity.py new file mode 100644 index 00000000..73e7fcf6 --- /dev/null +++ b/src/domain/entities/chunk_duplicate_entity.py @@ -0,0 +1,17 @@ +from datetime import datetime +from typing import List, Optional +from uuid import UUID, uuid4 + +from pydantic import BaseModel, Field + + +class ChunkDuplicateEntity(BaseModel): + """Domain entity representing a group of duplicate chunks.""" + + id: UUID = Field(default_factory=uuid4) + chunk_ids: List[UUID] = Field(default_factory=list) + similarity: float + content_source_id: Optional[str] = None + status: str = "pending" # e.g., "pending", "reviewed", "ignored" + created_at: datetime = Field(default_factory=datetime.utcnow) + updated_at: datetime = Field(default_factory=datetime.utcnow) diff --git a/src/infrastructure/connectors/__init__.py b/src/infrastructure/connectors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/infrastructure/repositories/sql/connector.py b/src/infrastructure/connectors/connector_sql.py similarity index 95% rename from src/infrastructure/repositories/sql/connector.py rename to src/infrastructure/connectors/connector_sql.py index 439fdddf..7e665e5e 100644 --- a/src/infrastructure/repositories/sql/connector.py +++ b/src/infrastructure/connectors/connector_sql.py @@ -3,7 +3,6 @@ from src.config.settings import settings -# Engine setup based on dialect connect_args = {} if settings.sql.url.startswith("sqlite"): connect_args["check_same_thread"] = False diff --git a/src/infrastructure/extractors/youtube_extractor.py b/src/infrastructure/extractors/youtube_extractor.py index 87e68e86..726d9aa3 100644 --- a/src/infrastructure/extractors/youtube_extractor.py +++ 
b/src/infrastructure/extractors/youtube_extractor.py @@ -202,7 +202,9 @@ def _download(): with YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(url, download=True) base_name = ydl.prepare_filename(info) - return str(Path(base_name).with_suffix(".mp3")) + mp3_path = str(Path(base_name).with_suffix(".mp3")) + self._validate_mp3_file(mp3_path) + return mp3_path try: return self._run_with_retry(_download) @@ -210,6 +212,37 @@ def _download(): logger.error(f"Download failed after ALL retries: {e}", context={"url": url}) return None + @staticmethod + def _validate_mp3_file(path: str) -> None: + """Validate that the downloaded file is a real MP3. + + Raises ValueError if the file is missing, empty, or doesn't start with + a valid MP3 signature (ID3 tag or MPEG audio frame sync). This catches + cases where yt-dlp/ffmpeg silently produced a corrupt or HTML-error + artifact with an .mp3 extension. + """ + p = Path(path) + if not p.exists(): + raise ValueError(f"Downloaded MP3 not found: {path}") + + size = p.stat().st_size + if size < 1024: + raise ValueError(f"Downloaded MP3 is too small ({size} bytes): {path}") + + with open(p, "rb") as f: + header = f.read(4) + + # ID3v2 tag + if header[:3] == b"ID3": + return + # MPEG audio frame sync: 11 bits set (0xFF 0xEx/0xFx) + if len(header) >= 2 and header[0] == 0xFF and (header[1] & 0xE0) == 0xE0: + return + + raise ValueError( + f"Downloaded file is not a valid MP3 (header={header!r}): {path}" + ) + def extract_playlist_videos(self, playlist_url: str) -> list[str]: """Extracts all video URLs from a YouTube playlist using yt_dlp.""" diff --git a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py new file mode 100644 index 00000000..911f864b --- /dev/null +++ b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py @@ -0,0 +1,98 @@ +from typing import Any, List, Optional +from uuid import UUID + +from sqlalchemy import desc + +from 
src.config.logger import Logger +from src.infrastructure.connectors.connector_sql import Connector +from src.infrastructure.repositories.sql.models.chunk_duplicate import ChunkDuplicateModel +from src.infrastructure.repositories.sql.models.content_source import ContentSourceModel +from src.infrastructure.repositories.sql.utils.utils import ensure_uuid + +logger = Logger() + +class ChunkDuplicateSQLRepository: + """Repository for managing duplicate chunk records in SQL.""" + + def create_duplicate_record(self, chunk_ids: List[UUID], similarity: float, status: str = "pending", content_source_id: Optional[str] = None) -> ChunkDuplicateModel: + """Create a new duplicate grouping record.""" + with Connector() as session: + try: + record = ChunkDuplicateModel( + chunk_ids=[str(cid) for cid in chunk_ids], + similarity=similarity, + status=status, + content_source_id=content_source_id + ) + session.add(record) + session.commit() + session.refresh(record) + return record + except Exception as e: + session.rollback() + logger.error( + "Error creating duplicate record", + context={"error": str(e)} + ) + raise + + def list_duplicates(self, status: Optional[str] = None, subject_ids: Optional[List[str]] = None, limit: int = 100, offset: int = 0) -> tuple[List[ChunkDuplicateModel], int]: + """List duplicate records with optional status and context filtering.""" + with Connector() as session: + query = session.query(ChunkDuplicateModel) + + if subject_ids: + # Convert string IDs to UUID objects for safe matching in SQL + parsed_ids = [UUID(sid) for sid in subject_ids] + query = query.join( + ContentSourceModel, + ContentSourceModel.id == ChunkDuplicateModel.content_source_id + ).filter(ContentSourceModel.subject_id.in_(parsed_ids)) + + if status: + query = query.filter(ChunkDuplicateModel.status == status) + + total = query.count() + items = query.order_by(desc(ChunkDuplicateModel.created_at)).limit(limit).offset(offset).all() + return items, total + + def get_by_id(self, 
duplicate_id: Any) -> Optional[ChunkDuplicateModel]: + """Fetch a duplicate record by its UUID.""" + duplicate_id = ensure_uuid(duplicate_id) + with Connector() as session: + return session.query(ChunkDuplicateModel).filter_by(id=duplicate_id).first() + + def update_status(self, duplicate_id: Any, status: str) -> bool: + """Update the status of a duplicate record.""" + duplicate_id = ensure_uuid(duplicate_id) + with Connector() as session: + try: + record = session.query(ChunkDuplicateModel).filter_by(id=duplicate_id).first() + if record: + record.status = status + session.commit() + return True + return False + except Exception as e: + session.rollback() + logger.error( + "Error updating duplicate status", + context={"duplicate_id": str(duplicate_id), "error": str(e)} + ) + raise + + def delete_record(self, duplicate_id: Any) -> bool: + """Delete a duplicate record.""" + duplicate_id = ensure_uuid(duplicate_id) + with Connector() as session: + try: + record = session.query(ChunkDuplicateModel).filter_by(id=duplicate_id).first() + if record: + session.delete(record) + session.commit() + return True + return False + except Exception as e: + session.rollback() + logger.error("Error deleting duplicate record", context={"duplicate_id": str(duplicate_id), "error": str(e)}) + raise diff --git a/src/infrastructure/repositories/sql/chunk_index_repository.py b/src/infrastructure/repositories/sql/chunk_index_repository.py index ea4f4e2e..b58f49b3 100644 --- a/src/infrastructure/repositories/sql/chunk_index_repository.py +++ b/src/infrastructure/repositories/sql/chunk_index_repository.py @@ -5,7 +5,7 @@ from sqlalchemy.orm import joinedload from src.config.logger import Logger -from src.infrastructure.repositories.sql.connector import Connector +from src.infrastructure.connectors.connector_sql import Connector from src.infrastructure.repositories.sql.models.chunk_index import ChunkIndexModel from src.infrastructure.repositories.sql.models.content_source import 
ContentSourceModel from src.infrastructure.repositories.sql.utils.utils import ensure_uuid @@ -260,3 +260,22 @@ def get_by_id(self, chunk_id: Any) -> Optional[ChunkIndexModel]: context={"chunk_id": str(chunk_id), "error": str(e)}, ) return None + + def update_is_active(self, chunk_id: Any, is_active: bool) -> bool: + """Update the is_active flag of a chunk.""" + chunk_id = ensure_uuid(chunk_id) + with Connector() as session: + try: + chunk = session.query(ChunkIndexModel).filter_by(id=chunk_id).first() + if chunk: + chunk.is_active = is_active + session.commit() + return True + return False + except Exception as e: + session.rollback() + logger.error( + "Error updating chunk is_active", + context={"chunk_id": str(chunk_id), "error": str(e)}, + ) + raise diff --git a/src/infrastructure/repositories/sql/content_source_repository.py b/src/infrastructure/repositories/sql/content_source_repository.py index 6db68951..3714c056 100644 --- a/src/infrastructure/repositories/sql/content_source_repository.py +++ b/src/infrastructure/repositories/sql/content_source_repository.py @@ -3,7 +3,7 @@ from uuid import UUID from src.config.logger import Logger -from src.infrastructure.repositories.sql.connector import Connector +from src.infrastructure.connectors.connector_sql import Connector from src.infrastructure.repositories.sql.models.content_source import ContentSourceModel from src.infrastructure.repositories.sql.utils.utils import ensure_uuid diff --git a/src/infrastructure/repositories/sql/diarization_repository.py b/src/infrastructure/repositories/sql/diarization_repository.py index 26baf739..8a066ff4 100644 --- a/src/infrastructure/repositories/sql/diarization_repository.py +++ b/src/infrastructure/repositories/sql/diarization_repository.py @@ -107,7 +107,7 @@ def get_by_external_source( DiarizationRecord.external_source == external_source, ) if subject_id: - parsed_id = UUID(subject_id) if isinstance(subject_id, str) else subject_id + parsed_id = UUID(str(subject_id)) if 
isinstance(subject_id, str) else subject_id query = query.filter(DiarizationRecord.subject_id == parsed_id) else: query = query.filter(DiarizationRecord.subject_id.is_(None)) @@ -119,16 +119,20 @@ def get_all( self, limit: int = 10, offset: int = 0, - subject_id: str | object | None = None, + subject_id: str | List[str] | None = None, ) -> List[DiarizationRecord]: query = self.db.query(DiarizationRecord) if subject_id: - parsed_id = UUID(subject_id) if isinstance(subject_id, str) else subject_id - query = query.filter(DiarizationRecord.subject_id == parsed_id) + if isinstance(subject_id, list): + parsed_ids = [UUID(sid) if isinstance(sid, str) else sid for sid in subject_id] + query = query.filter(DiarizationRecord.subject_id.in_(parsed_ids)) + else: + parsed_id = UUID(subject_id) if isinstance(subject_id, str) else subject_id + query = query.filter(DiarizationRecord.subject_id == parsed_id) result = query.order_by(DiarizationRecord.created_at.desc()).offset(offset).limit(limit).all() - return cast(List[DiarizationRecord], cast(object, result)) + return cast(List[DiarizationRecord], result) def get_by_id(self, diarization_id: str) -> Optional[DiarizationRecord]: result = self.db.query(DiarizationRecord).filter(DiarizationRecord.id == diarization_id).first() diff --git a/src/infrastructure/repositories/sql/ingestion_job_repository.py b/src/infrastructure/repositories/sql/ingestion_job_repository.py index 4c216103..0c9c239b 100644 --- a/src/infrastructure/repositories/sql/ingestion_job_repository.py +++ b/src/infrastructure/repositories/sql/ingestion_job_repository.py @@ -5,7 +5,7 @@ from sqlalchemy.orm import joinedload from src.config.logger import Logger -from src.infrastructure.repositories.sql.connector import Connector +from src.infrastructure.connectors.connector_sql import Connector from src.infrastructure.repositories.sql.models.ingestion_job import IngestionJobModel from src.infrastructure.repositories.sql.utils.utils import ensure_uuid diff --git 
a/src/infrastructure/repositories/sql/knowledge_subject_repository.py b/src/infrastructure/repositories/sql/knowledge_subject_repository.py index 5d38a616..668c444f 100644 --- a/src/infrastructure/repositories/sql/knowledge_subject_repository.py +++ b/src/infrastructure/repositories/sql/knowledge_subject_repository.py @@ -4,7 +4,7 @@ from sqlalchemy.orm import selectinload from src.config.logger import Logger -from src.infrastructure.repositories.sql.connector import Connector +from src.infrastructure.connectors.connector_sql import Connector from src.infrastructure.repositories.sql.models.knowledge_subject import ( KnowledgeSubjectModel, ) diff --git a/src/infrastructure/repositories/sql/models/chunk_duplicate.py b/src/infrastructure/repositories/sql/models/chunk_duplicate.py new file mode 100644 index 00000000..6b64a66c --- /dev/null +++ b/src/infrastructure/repositories/sql/models/chunk_duplicate.py @@ -0,0 +1,39 @@ +""" +ORM models for chunk_duplicate table. +""" + +import uuid + +from sqlalchemy import ( + JSON, + UUID, + Column, + DateTime, + Float, + ForeignKey, + Text, + func, +) + +from src.infrastructure.connectors.connector_sql import Base + + +class ChunkDuplicateModel(Base): + __tablename__ = "chunk_duplicates" + + id = Column(UUID, primary_key=True, default=uuid.uuid4) + chunk_ids = Column(JSON, nullable=False) + similarity = Column(Float, nullable=False) + content_source_id = Column( + UUID, + ForeignKey("content_sources.id", deferrable=True, initially="IMMEDIATE"), + nullable=True, + ) + status = Column(Text, default="pending", nullable=False) + created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) + updated_at = Column( + DateTime(timezone=True), + server_default=func.now(), + onupdate=func.now(), + nullable=False, + ) diff --git a/src/infrastructure/repositories/sql/models/chunk_index.py b/src/infrastructure/repositories/sql/models/chunk_index.py index 6ad883f1..6047f336 100644 --- 
a/src/infrastructure/repositories/sql/models/chunk_index.py +++ b/src/infrastructure/repositories/sql/models/chunk_index.py @@ -7,6 +7,7 @@ from sqlalchemy import ( JSON, UUID, + Boolean, Column, DateTime, ForeignKey, @@ -18,7 +19,7 @@ ) from sqlalchemy.orm import relationship -from src.infrastructure.repositories.sql.connector import Base +from src.infrastructure.connectors.connector_sql import Base class ChunkIndexModel(Base): @@ -58,6 +59,7 @@ class ChunkIndexModel(Base): onupdate=func.now(), nullable=False, ) + is_active = Column(Boolean, default=True, server_default=text("1"), nullable=False) __table_args__ = ( Index("ix_chunk_index_content_source_id", "content_source_id"), diff --git a/src/infrastructure/repositories/sql/models/content_source.py b/src/infrastructure/repositories/sql/models/content_source.py index 7686f055..1734e422 100644 --- a/src/infrastructure/repositories/sql/models/content_source.py +++ b/src/infrastructure/repositories/sql/models/content_source.py @@ -19,7 +19,7 @@ ) from sqlalchemy.orm import relationship -from src.infrastructure.repositories.sql.connector import Base +from src.infrastructure.connectors.connector_sql import Base class ContentSourceModel(Base): diff --git a/src/infrastructure/repositories/sql/models/diarization_record.py b/src/infrastructure/repositories/sql/models/diarization_record.py index 1b637fae..b31a77a2 100644 --- a/src/infrastructure/repositories/sql/models/diarization_record.py +++ b/src/infrastructure/repositories/sql/models/diarization_record.py @@ -8,7 +8,7 @@ from sqlalchemy import JSON, UUID, Column, DateTime, Float, ForeignKey, String from src.domain.entities.enums.diarization_status_enum import DiarizationStatus -from src.infrastructure.repositories.sql.connector import Base +from src.infrastructure.connectors.connector_sql import Base def _generate_uuid() -> str: diff --git a/src/infrastructure/repositories/sql/models/ingestion_job.py b/src/infrastructure/repositories/sql/models/ingestion_job.py index 
ae7dc01e..986161af 100644 --- a/src/infrastructure/repositories/sql/models/ingestion_job.py +++ b/src/infrastructure/repositories/sql/models/ingestion_job.py @@ -7,7 +7,7 @@ from sqlalchemy import UUID, Column, DateTime, ForeignKey, Index, Integer, Text, func from sqlalchemy.orm import relationship, synonym -from src.infrastructure.repositories.sql.connector import Base +from src.infrastructure.connectors.connector_sql import Base class IngestionJobModel(Base): diff --git a/src/infrastructure/repositories/sql/models/knowledge_subject.py b/src/infrastructure/repositories/sql/models/knowledge_subject.py index 595ddcb9..2b6c83b5 100644 --- a/src/infrastructure/repositories/sql/models/knowledge_subject.py +++ b/src/infrastructure/repositories/sql/models/knowledge_subject.py @@ -7,7 +7,7 @@ from sqlalchemy import UUID, Column, DateTime, Text, func from sqlalchemy.orm import relationship -from src.infrastructure.repositories.sql.connector import Base +from src.infrastructure.connectors.connector_sql import Base class KnowledgeSubjectModel(Base): diff --git a/src/infrastructure/repositories/sql/models/user.py b/src/infrastructure/repositories/sql/models/user.py index fa6c9a0e..bcc8c8d3 100644 --- a/src/infrastructure/repositories/sql/models/user.py +++ b/src/infrastructure/repositories/sql/models/user.py @@ -4,7 +4,7 @@ from sqlalchemy import DateTime, String from sqlalchemy.orm import Mapped, mapped_column -from src.infrastructure.repositories.sql.connector import Base +from src.infrastructure.connectors.connector_sql import Base class User(Base): diff --git a/src/infrastructure/repositories/sql/models/voice_record.py b/src/infrastructure/repositories/sql/models/voice_record.py index 8b0d476b..4ac7377c 100644 --- a/src/infrastructure/repositories/sql/models/voice_record.py +++ b/src/infrastructure/repositories/sql/models/voice_record.py @@ -7,7 +7,7 @@ from sqlalchemy import JSON, Column, DateTime, String -from src.infrastructure.repositories.sql.connector import Base 
+from src.infrastructure.connectors.connector_sql import Base def _generate_uuid() -> str: diff --git a/src/infrastructure/repositories/sql/user_repository.py b/src/infrastructure/repositories/sql/user_repository.py index a8faf9c2..7bb7d5ce 100644 --- a/src/infrastructure/repositories/sql/user_repository.py +++ b/src/infrastructure/repositories/sql/user_repository.py @@ -5,7 +5,7 @@ from src.domain.entities.user import User as UserEntity from src.domain.interfaces.repository.user_repository import IUserRepository -from src.infrastructure.repositories.sql.connector import Connector +from src.infrastructure.connectors.connector_sql import Connector from src.infrastructure.repositories.sql.models.user import User as UserModel from src.infrastructure.repositories.sql.utils.utils import ensure_uuid diff --git a/src/infrastructure/services/chunk_duplicate_service.py b/src/infrastructure/services/chunk_duplicate_service.py new file mode 100644 index 00000000..a50954bc --- /dev/null +++ b/src/infrastructure/services/chunk_duplicate_service.py @@ -0,0 +1,156 @@ +from typing import Any, List, Optional, Set +from uuid import UUID + +from src.config.logger import Logger +from src.domain.entities.chunk_duplicate_entity import ChunkDuplicateEntity +from src.domain.entities.enums.search_mode_enum import SearchMode +from src.infrastructure.repositories.sql.chunk_duplicate_repository import ( + ChunkDuplicateSQLRepository, +) +from src.infrastructure.repositories.sql.chunk_index_repository import ( + ChunkIndexSQLRepository, +) +from src.infrastructure.services.chunk_vector_service import ChunkVectorService + +logger = Logger() + + +class ChunkDuplicateService: + """Service for detecting and managing duplicate chunks using vector search.""" + + def __init__( + self, + repository: ChunkDuplicateSQLRepository, + chunk_repo: ChunkIndexSQLRepository, + vector_service: ChunkVectorService, + ): + self._repo = repository + self._chunk_repo = chunk_repo + self._vector_service = 
vector_service + + def find_and_register_duplicates( + self, chunk_ids: List[UUID], similarity_threshold: float = 0.90 + ) -> int: + """ + Check a list of chunks for duplicates against the entire vector store. + If duplicates are found with similarity >= threshold, register them. + """ + registered_count = 0 + processed_pairs: Set[tuple[str, ...]] = set() + + for cid in chunk_ids: + chunk = self._chunk_repo.get_by_id(cid) + if not chunk or not chunk.content: + continue + + # Search for similar chunks + content_str = str(chunk.content) + similar_chunks = self._vector_service.retrieve( + query=content_str, + top_k=5, + search_mode=SearchMode.SEMANTIC, + re_rank=False, + ) + + duplicates = self._filter_duplicates(cid, similar_chunks, similarity_threshold) + + if duplicates: + registered_count += self._register_cluster( + source_id=cid, + source_content_source_id=str(chunk.content_source_id) if chunk.content_source_id else None, + duplicates=duplicates, + processed_pairs=processed_pairs + ) + + return registered_count + + def _filter_duplicates(self, source_id: UUID, similar_chunks: List[Any], threshold: float) -> List[tuple[UUID, float]]: + """Filter results to find valid duplicates above threshold.""" + duplicates = [] + source_id_str = str(source_id) + for sim_chunk in similar_chunks: + if str(sim_chunk.id) == source_id_str: + continue + + score = getattr(sim_chunk, "score", 0.0) + if score >= threshold: + duplicates.append((sim_chunk.id, float(score))) + return duplicates + + def _register_cluster( + self, + source_id: UUID, + source_content_source_id: Optional[str], + duplicates: List[tuple[UUID, float]], + processed_pairs: Set[tuple[str, ...]] + ) -> int: + """Register a new duplicate group if not already processed.""" + duplicate_ids = [d[0] for d in duplicates] + all_uuids = [source_id] + duplicate_ids + # Sort as strings for consistent cluster identification + cluster_ids_str = sorted([str(cid) for cid in all_uuids]) + cluster_key = tuple(cluster_ids_str) + + 
if cluster_key not in processed_pairs: + # Get exact similarity for the highest match + max_sim = max([float(d[1]) for d in duplicates] + [0.0]) + self._repo.create_duplicate_record( + chunk_ids=all_uuids, + similarity=max_sim, + status="pending", + content_source_id=source_content_source_id + ) + processed_pairs.add(cluster_key) + return 1 + return 0 + + def list_duplicates( + self, + status: Optional[str] = None, + subject_ids: Optional[List[str]] = None, + limit: int = 100, + offset: int = 0 + ) -> tuple[List[ChunkDuplicateEntity], int]: + """List mapped duplicate records.""" + models, total = self._repo.list_duplicates(status=status, subject_ids=subject_ids, limit=limit, offset=offset) + entities = [] + from datetime import datetime + for m in models: + chunk_ids: List[UUID] = [] + if isinstance(m.chunk_ids, list): + for cid in m.chunk_ids: + if isinstance(cid, str): + chunk_ids.append(UUID(cid)) + elif isinstance(cid, UUID): + chunk_ids.append(cid) + + # Ensure datetime types for Mypy + created_at = m.created_at if isinstance(m.created_at, datetime) else datetime.now() + updated_at = m.updated_at if isinstance(m.updated_at, datetime) else datetime.now() + + entities.append(ChunkDuplicateEntity( + id=UUID(str(m.id)), + chunk_ids=chunk_ids, + similarity=float(m.similarity), + content_source_id=str(m.content_source_id) if m.content_source_id else None, + status=str(m.status), + created_at=created_at, + updated_at=updated_at, + )) + return entities, total + + def resolve_duplicate(self, duplicate_id: UUID, status: str) -> bool: + """Mark a duplicate record as resolved (ignored, reviewed, etc).""" + return self._repo.update_status(duplicate_id, status) + + def deactivate_chunk(self, chunk_id: UUID) -> bool: + """ + Deactivate a chunk in both SQL and Vector Store. + """ + # 1. Update SQL status to inactive + success = self._chunk_repo.update_is_active(chunk_id, False) + if success: + # 2. 
Remove from Vector Store to stop it from appearing in RAG + self._vector_service.delete_by_id(chunk_id) + return True + return False diff --git a/src/presentation/api/dependencies.py b/src/presentation/api/dependencies.py index 2c2b9bee..d98d2e65 100644 --- a/src/presentation/api/dependencies.py +++ b/src/presentation/api/dependencies.py @@ -42,11 +42,14 @@ from src.domain.interfaces.repository.retriver_repository import IVectorRepository from src.domain.interfaces.services.i_event_bus import IEventBus from src.domain.interfaces.services.i_task_queue import ITaskQueue +from src.infrastructure.connectors.connector_sql import Session as DBSessionFactory from src.infrastructure.extractors.crawl4ai_extractor import Crawl4AIExtractor +from src.infrastructure.repositories.sql.chunk_duplicate_repository import ( + ChunkDuplicateSQLRepository, +) from src.infrastructure.repositories.sql.chunk_index_repository import ( ChunkIndexSQLRepository, ) -from src.infrastructure.repositories.sql.connector import Session as DBSessionFactory from src.infrastructure.repositories.sql.content_source_repository import ( ContentSourceSQLRepository, ) @@ -61,6 +64,7 @@ ) from src.infrastructure.repositories.sql.user_repository import UserSQLRepository from src.infrastructure.services.auth_service import AuthService +from src.infrastructure.services.chunk_duplicate_service import ChunkDuplicateService from src.infrastructure.services.chunk_index_service import ChunkIndexService from src.infrastructure.services.chunk_vector_service import ChunkVectorService from src.infrastructure.services.content_source_service import ContentSourceService @@ -117,6 +121,10 @@ def get_user_repo() -> UserSQLRepository: return UserSQLRepository() +def get_duplicate_repo() -> ChunkDuplicateSQLRepository: + return ChunkDuplicateSQLRepository() + + # Services def get_model_loader(request: Request) -> ModelLoaderService: return request.app.state.model_loader @@ -295,6 +303,14 @@ def get_chunk_index_service( return 
ChunkIndexService(repo) +def get_duplicate_service( + repo: ChunkDuplicateSQLRepository = Depends(get_duplicate_repo), + chunk_repo: ChunkIndexSQLRepository = Depends(get_chunk_repo), + vector_service: ChunkVectorService = Depends(get_chunk_vector_service), +) -> ChunkDuplicateService: + return ChunkDuplicateService(repo, chunk_repo, vector_service) + + def get_youtube_vector_service( vector_repo: IVectorRepository = Depends(get_vector_repository), ) -> YouTubeVectorService: diff --git a/src/presentation/api/routes/audio_diarization_and_recognition_router.py b/src/presentation/api/routes/audio_diarization_and_recognition_router.py index 52e5eaf5..99b3900f 100644 --- a/src/presentation/api/routes/audio_diarization_and_recognition_router.py +++ b/src/presentation/api/routes/audio_diarization_and_recognition_router.py @@ -1,9 +1,9 @@ import logging import traceback -from typing import Annotated, Any, Optional, cast +from typing import Annotated, Any, List, Optional, cast from uuid import UUID -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Query from sqlalchemy.orm import Session from src.application.dtos.commands.process_audio_command import ProcessAudioCommand @@ -325,7 +325,7 @@ async def retrieve_all_processed_audio_history( use_case: Annotated[RetrieveProcessedAudioHistoryUseCase, Depends(get_retrieve_history_use_case)], limit: int = 10, offset: int = 0, - subject_id: str | None = None, + subject_id: Optional[List[str]] = Query(None), ): try: logger.info( diff --git a/src/presentation/api/routes/duplicate_router.py b/src/presentation/api/routes/duplicate_router.py new file mode 100644 index 00000000..ee482c8b --- /dev/null +++ b/src/presentation/api/routes/duplicate_router.py @@ -0,0 +1,96 @@ +from typing import List, Optional +from uuid import UUID + +from fastapi import APIRouter, Depends, HTTPException, Query + +from src.infrastructure.services.chunk_duplicate_service import ChunkDuplicateService 
+from src.infrastructure.services.chunk_index_service import ChunkIndexService +from src.presentation.api.dependencies import ( + get_chunk_index_service, + get_current_user, + get_duplicate_service, +) +from src.presentation.api.schemas.duplicate_schemas import ( + ChunkDuplicateResponse, + ChunkDuplicateStatusUpdate, + ChunkMinimal, + PaginatedChunkDuplicateResponse, +) + +router = APIRouter(tags=["duplicates"]) + + +@router.get("", response_model=PaginatedChunkDuplicateResponse) +def list_duplicates( + status: Optional[str] = None, + subject_id: Optional[List[str]] = Query(None), + limit: int = 100, + offset: int = 0, + service: ChunkDuplicateService = Depends(get_duplicate_service), + chunk_service: ChunkIndexService = Depends(get_chunk_index_service), + user=Depends(get_current_user), +): + """List all detected chunk duplicate groups.""" + entities, total = service.list_duplicates(status=status, subject_ids=subject_id, limit=limit, offset=offset) + + # Enrich entities with chunk content if needed for UI + results = [] + for entity in entities: + resp = ChunkDuplicateResponse.model_validate(entity) + chunks_info = [] + for cid in entity.chunk_ids: + chunk = chunk_service.get_by_id(cid) + if chunk: + chunks_info.append(ChunkMinimal( + id=chunk.id, + content=chunk.content or "", + source_title=chunk.extra.get("source_title", "Unknown"), + source_id=chunk.content_source_id + )) + resp.chunks = chunks_info + results.append(resp) + + return PaginatedChunkDuplicateResponse(results=results, total=total) + + +@router.patch("/{duplicate_id}/status") +def update_duplicate_status( + duplicate_id: UUID, + cmd: ChunkDuplicateStatusUpdate, + service: ChunkDuplicateService = Depends(get_duplicate_service), + user=Depends(get_current_user), +): + """Update the resolution status of a duplicate group.""" + success = service.resolve_duplicate(duplicate_id, cmd.status) + if not success: + raise HTTPException(status_code=404, detail="Duplicate group not found") + return {"status": 
"success"} + + +@router.post("/chunks/{chunk_id}/deactivate") +def deactivate_chunk( + chunk_id: UUID, + service: ChunkDuplicateService = Depends(get_duplicate_service), + user=Depends(get_current_user), +): + """Deactivate a specific chunk (soft delete from RAG).""" + success = service.deactivate_chunk(chunk_id) + if not success: + raise HTTPException(status_code=404, detail="Chunk not found") + return {"status": "success"} + + +@router.post("/analyze-all") +def analyze_all_chunks( + service: ChunkDuplicateService = Depends(get_duplicate_service), + chunk_service: ChunkIndexService = Depends(get_chunk_index_service), + user=Depends(get_current_user), +): + """Run duplicate detection analysis on all existing chunks (heavy operation).""" + # This should probably be a background task, but for now we'll do it synchronously + # or just list everything and iterate + all_chunks = chunk_service.list_chunks(limit=1000) # Limit for safety + chunk_ids = [c.id for c in all_chunks] + + count = service.find_and_register_duplicates(chunk_ids) + return {"status": "success", "groups_found": count} diff --git a/src/presentation/api/routes/ingest_router.py b/src/presentation/api/routes/ingest_router.py index fbda4c14..1615f2ef 100644 --- a/src/presentation/api/routes/ingest_router.py +++ b/src/presentation/api/routes/ingest_router.py @@ -26,12 +26,14 @@ from src.domain.entities.user import User from src.domain.interfaces.services.i_task_queue import ITaskQueue from src.infrastructure.services.content_source_service import ContentSourceService +from src.infrastructure.services.ingestion_job_service import IngestionJobService from src.presentation.api.dependencies import ( get_cs_service, get_current_user, get_diarization_ingestion_use_case, get_file_ingestion_use_case, get_ingest_youtube_use_case, + get_job_service, get_task_queue_service, get_web_scraping_use_case, ) @@ -63,7 +65,9 @@ def ingest_youtube( request: Annotated[YoutubeIngestRequest, Body()], use_case: 
Annotated[YoutubeIngestionUseCase, Depends(get_ingest_youtube_use_case)], task_queue: Annotated[ITaskQueue, Depends(get_task_queue_service)], -): + job_service: Annotated[IngestionJobService, Depends(get_job_service)], + current_user: Annotated[User, Depends(get_current_user)], +) -> IngestResponse: """ Ingest data from YouTube videos or playlists into the vector store. """ @@ -119,15 +123,31 @@ def ingest_youtube( logger.info("Running ingestion in background via queue", context={"reason": reason}) + # Ensure we have a job ID for background tasks so they are visible in UI + job_id = request.ingestion_job_id + if not job_id: + job_title = request.title or request.video_url or f"YouTube {reason.capitalize()}" + job = job_service.create_job( + source_title=job_title, + external_source=request.video_url or (request.video_urls[0] if request.video_urls else None), + subject_id=request.subject_id, + ingestion_type="YOUTUBE", + status="INITIALIZING", + status_message=f"Starting YouTube {reason} ingestion...", + ) + job_id = str(job.id) + cmd.ingestion_job_id = job_id + task_queue.enqueue( worker, cmd, - task_title=request.title or request.video_url or "YouTube Ingestion", - metadata={"job_id": str(request.ingestion_job_id)} if request.ingestion_job_id else {}, + task_title=request.title or request.video_url or f"YouTube {reason.capitalize()}", + metadata={"job_id": job_id}, ) return IngestResponse( skipped=False, - reason="Ingestion started in background queue.", + reason=f"Ingestion started in background queue (Job: {job_id}).", + job_id=UUID(job_id) if job_id else None, ) try: diff --git a/src/presentation/api/routes/settings_router.py b/src/presentation/api/routes/settings_router.py index 79bcebd6..b139a477 100644 --- a/src/presentation/api/routes/settings_router.py +++ b/src/presentation/api/routes/settings_router.py @@ -6,7 +6,7 @@ from src.config.settings import Settings from src.domain.interfaces.repository.retriver_repository import IVectorRepository -from 
src.infrastructure.repositories.sql.connector import Connector +from src.infrastructure.connectors.connector_sql import Connector from src.presentation.api.dependencies import get_settings, get_vector_repository from src.presentation.api.schemas.settings_schemas import ( AppSettingsSchema, diff --git a/src/presentation/api/schemas/duplicate_schemas.py b/src/presentation/api/schemas/duplicate_schemas.py new file mode 100644 index 00000000..e429803f --- /dev/null +++ b/src/presentation/api/schemas/duplicate_schemas.py @@ -0,0 +1,35 @@ +from datetime import datetime +from typing import List, Optional +from uuid import UUID + +from pydantic import BaseModel, ConfigDict + + +class ChunkMinimal(BaseModel): + id: UUID + content: str + source_title: Optional[str] = None + source_id: Optional[UUID] = None + + model_config = ConfigDict(from_attributes=True) + + +class ChunkDuplicateResponse(BaseModel): + id: UUID + chunk_ids: List[UUID] + chunks: Optional[List[ChunkMinimal]] = None # Enriched chunks for UI + similarity: float + status: str + created_at: datetime + updated_at: datetime + + model_config = ConfigDict(from_attributes=True) + + +class ChunkDuplicateStatusUpdate(BaseModel): + status: str + + +class PaginatedChunkDuplicateResponse(BaseModel): + results: List[ChunkDuplicateResponse] + total: int diff --git a/test_dispatcher.py b/test_dispatcher.py new file mode 100644 index 00000000..feea32ab --- /dev/null +++ b/test_dispatcher.py @@ -0,0 +1,60 @@ +import os +import sys +from unittest.mock import MagicMock + +# Add the project root to sys.path +sys.path.append(os.getcwd()) + +# Mock out dependencies before importing workers +sys.modules['src.presentation.api.dependencies'] = MagicMock() +mock_job_service = MagicMock() + +class MockContext: + def __init__(self, job_svc): + self.job_service = job_svc + +import src.presentation.api.dependencies as deps + +deps.resolve_ingestion_context = MagicMock(return_value=MockContext(mock_job_service)) + +from 
src.application.dtos.commands.ingest_youtube_command import IngestYoutubeCommand +from src.application.dtos.enums.youtube_data_type import YoutubeDataType +from src.application.workers import run_youtube_dispatcher_worker + + +def test_dispatcher(): + # Setup mock app state + mock_app = MagicMock() + mock_app.state.task_queue = MagicMock() + + # Patch _get_app to return our mock + import src.application.workers as workers + workers._get_app = MagicMock(return_value=mock_app) + + # Create command for a small playlist (or the user's one) + # Using a known small playlist to speed up test if possible + # But user's URL is fine since we are testing logic + url = "https://www.youtube.com/watch?v=dlQG02mwTD0&list=PLG47XsLEf0LdvYtX_zU7E_y_C1TgjgU59" + cmd = IngestYoutubeCommand( + video_url=url, + data_type=YoutubeDataType.PLAYLIST, + ingestion_job_id="test-job-uuid", + subject_id="test-subject" + ) + + print(f"Testing dispatcher with URL: {url}") + run_youtube_dispatcher_worker(cmd) + + # Assertions + print("\nVerifying Job Service calls:") + for call in mock_job_service.update_job_status.call_args_list: + print(f" - Status update: {call[1]}") + + print("\nVerifying Task Queue calls:") + enqueue_calls = mock_app.state.task_queue.enqueue.call_args_list + print(f" - Tasks enqueued: {len(enqueue_calls)}") + if len(enqueue_calls) > 0: + print(f" - First task URL: {enqueue_calls[0][0][1].video_url}") + +if __name__ == "__main__": + test_dispatcher() diff --git a/test_playlist.py b/test_playlist.py new file mode 100644 index 00000000..c9f17e06 --- /dev/null +++ b/test_playlist.py @@ -0,0 +1,25 @@ +from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor + + +def test_playlist(): + # Use the playlist provided by the user + playlist_url = "https://www.youtube.com/watch?v=dlQG02mwTD0&list=PLG47XsLEf0LdvYtX_zU7E_y_C1TgjgU59" + + print(f"Testing playlist extraction for: {playlist_url}") + extractor = YoutubeExtractor(language="pt") + + try: + videos = 
extractor.extract_playlist_videos(playlist_url) + print(f"Extracted {len(videos)} videos.") + for i, url in enumerate(videos[:5]): + print(f" {i+1}: {url}") + + if not videos: + print("FAILED: No videos extracted.") + else: + print("SUCCESS: Videos extracted.") + except Exception as e: + print(f"ERROR: {e}") + +if __name__ == "__main__": + test_playlist() diff --git a/tests/conftest.py b/tests/conftest.py index 859a65a6..39214ce2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,7 @@ from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker -import src.infrastructure.repositories.sql.connector as connector +import src.infrastructure.connectors.connector_sql as connector @pytest.fixture(autouse=True) diff --git a/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py b/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py new file mode 100644 index 00000000..cd88f6f6 --- /dev/null +++ b/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py @@ -0,0 +1,90 @@ +import uuid + +import pytest + +from src.infrastructure.repositories.sql.chunk_duplicate_repository import ChunkDuplicateSQLRepository +from src.infrastructure.repositories.sql.models.content_source import ContentSourceModel +from src.infrastructure.repositories.sql.models.knowledge_subject import KnowledgeSubjectModel + + +@pytest.mark.ChunkDuplicateSQLRepository +def test_create_duplicate_record(sqlite_memory): + """Test creating a duplicate record in the repository.""" + repo = ChunkDuplicateSQLRepository() + chunk_ids = [str(uuid.uuid4()), str(uuid.uuid4())] + similarity = 0.95 + status = "pending" + + record = repo.create_duplicate_record(chunk_ids, similarity, status) + + assert record.id is not None + assert record.chunk_ids == chunk_ids + assert record.similarity == pytest.approx(similarity) + assert record.status == status + +@pytest.mark.ChunkDuplicateSQLRepository +def 
test_list_duplicates_filtering(sqlite_memory): + """Test listing duplicates with status and subject filtering.""" + db = sqlite_memory + repo = ChunkDuplicateSQLRepository() + + # Create a subject and content source + subject = KnowledgeSubjectModel(name="Test Subject") + db.add(subject) + db.commit() + db.refresh(subject) + + source = ContentSourceModel( + subject_id=subject.id, + source_type="file", + external_source="test.txt" + ) + db.add(source) + db.commit() + db.refresh(source) + + # Create duplicate records + repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending", content_source_id=source.id) + repo.create_duplicate_record([str(uuid.uuid4())], 0.8, "reviewed", content_source_id=source.id) + + # List all + _, total = repo.list_duplicates() + assert total == 2 + + # Filter by status + pending_items, total = repo.list_duplicates(status="pending") + assert total == 1 + assert pending_items[0].status == "pending" + + # Filter by subject_id + _, total = repo.list_duplicates(subject_ids=[str(subject.id)]) + assert total == 2 + + # Filter with non-existent subject + _, total = repo.list_duplicates(subject_ids=[str(uuid.uuid4())]) + assert total == 0 + +@pytest.mark.ChunkDuplicateSQLRepository +def test_update_status(sqlite_memory): + """Test updating the status of a duplicate record.""" + repo = ChunkDuplicateSQLRepository() + record = repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending") + + success = repo.update_status(record.id, "reviewed") + assert success is True + + updated = repo.get_by_id(record.id) + assert updated is not None + assert updated.status == "reviewed" + +@pytest.mark.ChunkDuplicateSQLRepository +def test_delete_record(sqlite_memory): + """Test deleting a duplicate record.""" + repo = ChunkDuplicateSQLRepository() + record = repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending") + + success = repo.delete_record(record.id) + assert success is True + + deleted = repo.get_by_id(record.id) + assert deleted is None 
diff --git a/tests/infrastructure/services/test_chunk_duplicate_service.py b/tests/infrastructure/services/test_chunk_duplicate_service.py new file mode 100644 index 00000000..0a58a3b6 --- /dev/null +++ b/tests/infrastructure/services/test_chunk_duplicate_service.py @@ -0,0 +1,86 @@ +import uuid +from unittest.mock import MagicMock + +import pytest + +from src.domain.entities.enums.search_mode_enum import SearchMode +from src.infrastructure.services.chunk_duplicate_service import ChunkDuplicateService + + +@pytest.fixture +def mock_repos(): + repo = MagicMock() + chunk_repo = MagicMock() + vector_svc = MagicMock() + return repo, chunk_repo, vector_svc + +def test_find_and_register_duplicates(mock_repos): + """Test finding and registering duplicates.""" + repo, chunk_repo, vector_svc = mock_repos + service = ChunkDuplicateService(repo, chunk_repo, vector_svc) + + chunk_id = uuid.uuid4() + mock_chunk = MagicMock() + mock_chunk.id = chunk_id + mock_chunk.content = "Duplicate test content" + mock_chunk.content_source_id = str(uuid.uuid4()) + chunk_repo.get_by_id.return_value = mock_chunk + + # Mock similar chunks found + sim_chunk = MagicMock() + sim_chunk.id = uuid.uuid4() + sim_chunk.score = 0.95 + vector_svc.retrieve.return_value = [sim_chunk] + + count = service.find_and_register_duplicates([chunk_id], similarity_threshold=0.90) + + assert count == 1 + vector_svc.retrieve.assert_called_once_with( + query=mock_chunk.content, + top_k=5, + search_mode=SearchMode.SEMANTIC, + re_rank=False + ) + repo.create_duplicate_record.assert_called_once() + + # Check arguments of create_duplicate_record + _, kwargs = repo.create_duplicate_record.call_args + assert kwargs['similarity'] == pytest.approx(0.95) + assert str(chunk_id) in kwargs['chunk_ids'] + assert str(sim_chunk.id) in kwargs['chunk_ids'] + +def test_find_and_register_no_duplicates(mock_repos): + """Test when no duplicates are found above threshold.""" + repo, chunk_repo, vector_svc = mock_repos + service = 
ChunkDuplicateService(repo, chunk_repo, vector_svc) + + chunk_id = uuid.uuid4() + mock_chunk = MagicMock() + mock_chunk.id = chunk_id + mock_chunk.content = "Unique content" + chunk_repo.get_by_id.return_value = mock_chunk + + # Sim chunk with low score + sim_chunk = MagicMock() + sim_chunk.id = uuid.uuid4() + sim_chunk.score = 0.5 + vector_svc.retrieve.return_value = [sim_chunk] + + count = service.find_and_register_duplicates([chunk_id], similarity_threshold=0.90) + + assert count == 0 + repo.create_duplicate_record.assert_not_called() + +def test_deactivate_chunk(mock_repos): + """Test deactivating a chunk.""" + repo, chunk_repo, vector_svc = mock_repos + service = ChunkDuplicateService(repo, chunk_repo, vector_svc) + + chunk_id = uuid.uuid4() + chunk_repo.update_is_active.return_value = True + + success = service.deactivate_chunk(chunk_id) + + assert success is True + chunk_repo.update_is_active.assert_called_once_with(chunk_id, False) + vector_svc.delete_by_id.assert_called_once_with(chunk_id) diff --git a/tests/presentation/api/routes/test_duplicate_router.py b/tests/presentation/api/routes/test_duplicate_router.py new file mode 100644 index 00000000..ac2c4bb0 --- /dev/null +++ b/tests/presentation/api/routes/test_duplicate_router.py @@ -0,0 +1,74 @@ +import uuid +from unittest.mock import MagicMock + +import pytest +from fastapi.testclient import TestClient + +from main import app +from src.presentation.api.dependencies import ( + get_chunk_index_service, + get_duplicate_repo, + get_duplicate_service, +) + +client = TestClient(app) + +@pytest.mark.DuplicateRouter +class TestDuplicateRouter: + def test_list_duplicates(self): + mock_repo = MagicMock() + app.dependency_overrides[get_duplicate_repo] = lambda: mock_repo + + mock_repo.list_duplicates.return_value = ([], 0) + + response = client.get("/rest/duplicates") + assert response.status_code == 200 + assert response.json()["total"] == 0 + + app.dependency_overrides.clear() + + def 
test_update_duplicate_status(self): + mock_repo = MagicMock() + app.dependency_overrides[get_duplicate_repo] = lambda: mock_repo + + # Use a service mock instead because the router calls resolved_duplicate on service + # Wait, the router calls service.resolve_duplicate + mock_service = MagicMock() + app.dependency_overrides[get_duplicate_service] = lambda: mock_service + + duplicate_id = str(uuid.uuid4()) + mock_service.resolve_duplicate.return_value = True + + response = client.patch(f"/rest/duplicates/{duplicate_id}/status", json={"status": "reviewed"}) + assert response.status_code == 200 + assert response.json()["status"] == "success" + + app.dependency_overrides.clear() + + def test_deactivate_chunk(self): + mock_service = MagicMock() + app.dependency_overrides[get_duplicate_service] = lambda: mock_service + + chunk_id = str(uuid.uuid4()) + mock_service.deactivate_chunk.return_value = True + + response = client.post(f"/rest/duplicates/chunks/{chunk_id}/deactivate") + assert response.status_code == 200 + assert response.json()["status"] == "success" + + app.dependency_overrides.clear() + + def test_trigger_duplicate_analysis(self): + mock_service = MagicMock() + app.dependency_overrides[get_duplicate_service] = lambda: mock_service + mock_chunk_service = MagicMock() + app.dependency_overrides[get_chunk_index_service] = lambda: mock_chunk_service + + mock_chunk_service.list_chunks.return_value = [] + mock_service.find_and_register_duplicates.return_value = 0 + + response = client.post("/rest/duplicates/analyze-all") + assert response.status_code == 200 + assert response.json()["status"] == "success" + + app.dependency_overrides.clear() diff --git a/tmp/cleanup_db.py b/tmp/cleanup_db.py new file mode 100644 index 00000000..09417232 --- /dev/null +++ b/tmp/cleanup_db.py @@ -0,0 +1,27 @@ +from sqlalchemy import create_engine, text + +from src.config.settings import settings + +engine = create_engine(settings.sql.url) + +with engine.connect() as conn: + try: + 
conn.execute(text("DROP TABLE IF EXISTS chunk_duplicates")) + print("Dropped chunk_duplicates table if it existed.") + except Exception as e: + print(f"Error dropping chunk_duplicates: {e}") + + try: + # Check if is_active exists in chunk_index + res = conn.execute(text("PRAGMA table_info(chunk_index)")) + columns = [row[1] for row in res] + if 'is_active' in columns: + print("is_active already exists in chunk_index. Attempting to drop it (batch mode needed for SQLite).") + # For simplicity in this scratch script, I'll just note it. + # Usually we group these with migrations. + else: + print("is_active does not exist in chunk_index.") + except Exception as e: + print(f"Error checking chunk_index: {e}") + + conn.commit() From f4cf6a68d9cfc903a2ff26f8d36cdb6fe4a6db81 Mon Sep 17 00:00:00 2001 From: ericksonlopes Date: Wed, 8 Apr 2026 12:17:29 -0300 Subject: [PATCH 4/7] style: fix lint and typing errors across project --- .../retrieve_processed_audio_history.py | 2 +- .../sql/chunk_duplicate_repository.py | 21 ++++++++++++++++--- .../services/chunk_duplicate_service.py | 7 ++++++- test_dispatcher.py | 10 ++++----- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/application/use_cases/retrieve_processed_audio_history.py b/src/application/use_cases/retrieve_processed_audio_history.py index 494d8f4a..084f649e 100644 --- a/src/application/use_cases/retrieve_processed_audio_history.py +++ b/src/application/use_cases/retrieve_processed_audio_history.py @@ -10,7 +10,7 @@ class RetrieveProcessedAudioHistoryUseCase: def __init__(self, db: Session): self.repo = DiarizationRepository(db) - def execute(self, limit: int = 10, offset: int = 0, subject_id: str | None = None) -> list[dict]: + def execute(self, limit: int = 10, offset: int = 0, subject_id: str | list[str] | None = None) -> list[dict]: records = self.repo.get_all(limit=limit, offset=offset, subject_id=subject_id) return [ diff --git a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py 
b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py index 911f864b..8caf11ec 100644 --- a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py +++ b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py @@ -14,7 +14,13 @@ class ChunkDuplicateSQLRepository: """Repository for managing duplicate chunk records in SQL.""" - def create_duplicate_record(self, chunk_ids: List[UUID], similarity: float, status: str = "pending", content_source_id: Optional[str] = None) -> ChunkDuplicateModel: + def create_duplicate_record( + self, + chunk_ids: List[UUID], + similarity: float, + status: str = "pending", + content_source_id: Optional[str] = None + ) -> ChunkDuplicateModel: """Create a new duplicate grouping record.""" with Connector() as session: try: @@ -36,7 +42,13 @@ def create_duplicate_record(self, chunk_ids: List[UUID], similarity: float, stat ) raise - def list_duplicates(self, status: Optional[str] = None, subject_ids: Optional[List[str]] = None, limit: int = 100, offset: int = 0) -> tuple[List[ChunkDuplicateModel], int]: + def list_duplicates( + self, + status: Optional[str] = None, + subject_ids: Optional[List[str]] = None, + limit: int = 100, + offset: int = 0 + ) -> tuple[List[ChunkDuplicateModel], int]: """List duplicate records with optional status and context filtering.""" with Connector() as session: query = session.query(ChunkDuplicateModel) @@ -94,5 +106,8 @@ def delete_record(self, duplicate_id: Any) -> bool: return False except Exception as e: session.rollback() - logger.error("Error deleting duplicate record", context={"duplicate_id": str(duplicate_id), "error": str(e)}) + logger.error( + "Error deleting duplicate record", + context={"duplicate_id": str(duplicate_id), "error": str(e)} + ) raise diff --git a/src/infrastructure/services/chunk_duplicate_service.py b/src/infrastructure/services/chunk_duplicate_service.py index a50954bc..4edd7b82 100644 --- a/src/infrastructure/services/chunk_duplicate_service.py +++ 
b/src/infrastructure/services/chunk_duplicate_service.py @@ -64,7 +64,12 @@ def find_and_register_duplicates( return registered_count - def _filter_duplicates(self, source_id: UUID, similar_chunks: List[Any], threshold: float) -> List[tuple[UUID, float]]: + def _filter_duplicates( + self, + source_id: UUID, + similar_chunks: List[Any], + threshold: float + ) -> List[tuple[UUID, float]]: """Filter results to find valid duplicates above threshold.""" duplicates = [] source_id_str = str(source_id) diff --git a/test_dispatcher.py b/test_dispatcher.py index feea32ab..b1fee754 100644 --- a/test_dispatcher.py +++ b/test_dispatcher.py @@ -13,13 +13,13 @@ class MockContext: def __init__(self, job_svc): self.job_service = job_svc -import src.presentation.api.dependencies as deps +import src.presentation.api.dependencies as deps # noqa: E402 deps.resolve_ingestion_context = MagicMock(return_value=MockContext(mock_job_service)) -from src.application.dtos.commands.ingest_youtube_command import IngestYoutubeCommand -from src.application.dtos.enums.youtube_data_type import YoutubeDataType -from src.application.workers import run_youtube_dispatcher_worker +from src.application.dtos.commands.ingest_youtube_command import IngestYoutubeCommand # noqa: E402 +from src.application.dtos.enums.youtube_data_type import YoutubeDataType # noqa: E402 +from src.application.workers import run_youtube_dispatcher_worker # noqa: E402 def test_dispatcher(): @@ -28,7 +28,7 @@ def test_dispatcher(): mock_app.state.task_queue = MagicMock() # Patch _get_app to return our mock - import src.application.workers as workers + import src.application.workers as workers # noqa: E402 workers._get_app = MagicMock(return_value=mock_app) # Create command for a small playlist (or the user's one) From 99ad8d145fac2ac7a2bd9a51c302eec7dfc52d39 Mon Sep 17 00:00:00 2001 From: ericksonlopes Date: Wed, 8 Apr 2026 13:06:46 -0300 Subject: [PATCH 5/7] feat: complete chunk duplication system with worker integration and lint 
fixes --- ...c845_add_chunk_duplicates_table_and_is_.py | 32 ++++--- ...673_add_content_source_id_to_duplicates.py | 24 +++-- src/application/workers.py | 94 +++++++++++-------- .../extractors/youtube_extractor.py | 4 +- .../sql/chunk_duplicate_repository.py | 35 +++---- .../services/chunk_duplicate_service.py | 53 +++++------ src/presentation/__init__.py | 1 + .../api/routes/duplicate_router.py | 22 +++-- src/presentation/api/routes/ingest_router.py | 25 ++++- test_dispatcher.py | 23 ++--- test_playlist.py | 9 +- .../sql/test_chunk_duplicate_repository.py | 35 ++++--- .../services/test_chunk_duplicate_service.py | 38 ++++---- .../api/routes/test_duplicate_router.py | 27 +++--- tmp/cleanup_db.py | 2 +- 15 files changed, 226 insertions(+), 198 deletions(-) create mode 100644 src/presentation/__init__.py diff --git a/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py b/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py index 65229188..ba6f38c1 100644 --- a/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py +++ b/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py @@ -5,6 +5,7 @@ Create Date: 2026-04-08 09:56:58.625813 """ + from typing import Sequence, Union import sqlalchemy as sa @@ -12,8 +13,8 @@ from alembic import op # revision identifiers, used by Alembic. -revision: str = '646a175ac845' -down_revision: Union[str, Sequence[str], None] = 'b2c3d4e5f6a7' +revision: str = "646a175ac845" +down_revision: Union[str, Sequence[str], None] = "b2c3d4e5f6a7" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -21,22 +22,27 @@ def upgrade() -> None: """Upgrade schema.""" # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('chunk_duplicates', - sa.Column('id', sa.UUID(), nullable=False), - sa.Column('chunk_ids', sa.JSON(), nullable=False), - sa.Column('similarity', sa.Float(), nullable=False), - sa.Column('status', sa.Text(), nullable=False), - sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False), - sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False), - sa.PrimaryKeyConstraint('id') + op.create_table( + "chunk_duplicates", + sa.Column("id", sa.UUID(), nullable=False), + sa.Column("chunk_ids", sa.JSON(), nullable=False), + sa.Column("similarity", sa.Float(), nullable=False), + sa.Column("status", sa.Text(), nullable=False), + sa.Column( + "created_at", sa.DateTime(timezone=True), server_default=sa.text("(CURRENT_TIMESTAMP)"), nullable=False + ), + sa.Column( + "updated_at", sa.DateTime(timezone=True), server_default=sa.text("(CURRENT_TIMESTAMP)"), nullable=False + ), + sa.PrimaryKeyConstraint("id"), ) - op.add_column('chunk_index', sa.Column('is_active', sa.Boolean(), server_default=sa.text('1'), nullable=False)) + op.add_column("chunk_index", sa.Column("is_active", sa.Boolean(), server_default=sa.text("1"), nullable=False)) # ### end Alembic commands ### def downgrade() -> None: """Downgrade schema.""" # ### commands auto generated by Alembic - please adjust! 
### - op.drop_column('chunk_index', 'is_active') - op.drop_table('chunk_duplicates') + op.drop_column("chunk_index", "is_active") + op.drop_table("chunk_duplicates") # ### end Alembic commands ### diff --git a/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py b/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py index f0ac6124..163be91b 100644 --- a/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py +++ b/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py @@ -5,6 +5,7 @@ Create Date: 2026-04-08 10:50:39.027257 """ + from typing import Sequence, Union import sqlalchemy as sa @@ -12,21 +13,28 @@ from alembic import op # revision identifiers, used by Alembic. -revision: str = '84524e052673' -down_revision: Union[str, Sequence[str], None] = '646a175ac845' +revision: str = "84524e052673" +down_revision: Union[str, Sequence[str], None] = "646a175ac845" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: """Upgrade schema.""" - with op.batch_alter_table('chunk_duplicates', schema=None) as batch_op: - batch_op.add_column(sa.Column('content_source_id', sa.UUID(), nullable=True)) - batch_op.create_foreign_key('fk_chunk_duplicates_content_source_id_content_sources', 'content_sources', ['content_source_id'], ['id'], initially='IMMEDIATE', deferrable=True) + with op.batch_alter_table("chunk_duplicates", schema=None) as batch_op: + batch_op.add_column(sa.Column("content_source_id", sa.UUID(), nullable=True)) + batch_op.create_foreign_key( + "fk_chunk_duplicates_content_source_id_content_sources", + "content_sources", + ["content_source_id"], + ["id"], + initially="IMMEDIATE", + deferrable=True, + ) def downgrade() -> None: """Downgrade schema.""" - with op.batch_alter_table('chunk_duplicates', schema=None) as batch_op: - batch_op.drop_constraint('fk_chunk_duplicates_content_source_id_content_sources', type_='foreignkey') - 
batch_op.drop_column('content_source_id') + with op.batch_alter_table("chunk_duplicates", schema=None) as batch_op: + batch_op.drop_constraint("fk_chunk_duplicates_content_source_id_content_sources", type_="foreignkey") + batch_op.drop_column("content_source_id") diff --git a/src/application/workers.py b/src/application/workers.py index 7ff71021..67e9ad01 100644 --- a/src/application/workers.py +++ b/src/application/workers.py @@ -1,5 +1,6 @@ import logging from typing import Any +from uuid import UUID from src.application.dtos.commands.ingest_diarization_command import ( IngestDiarizationCommand, @@ -10,6 +11,7 @@ from src.application.dtos.commands.process_audio_command import ProcessAudioCommand from src.application.dtos.commands.train_voice_command import TrainVoiceCommand from src.application.service_registry import registry +from src.domain.entities.enums.ingestion_job_status_enum import IngestionJobStatus from src.infrastructure.loggers.std_logger import ( clear_global_context, set_global_context, @@ -113,6 +115,10 @@ def run_youtube_ingestion_worker(cmd: IngestYoutubeCommand): ) ctx = resolve_ingestion_context(app) + if not ctx: + logger.error("Could not resolve ingestion context for YouTube worker") + return + vector_repo = resolve_vector_repository(app) vector_svc = YouTubeVectorService(vector_repo) @@ -130,12 +136,13 @@ def run_youtube_ingestion_worker(cmd: IngestYoutubeCommand): result = use_case.execute(cmd) - # Enqueue duplicate detection - if result and "vector_ids" in result: + # Enqueue duplicate detection if we have vector IDs + if result and getattr(result, "vector_ids", None): + task_ids = [UUID(str(vid)) for vid in result.vector_ids] task_queue = app.state.task_queue task_queue.enqueue( run_duplicate_detection_worker, - {"chunk_ids": result["vector_ids"]}, + {"chunk_ids": task_ids}, task_title=f"Dup Check YouTube: {cmd.video_url}", ) except Exception as e: @@ -166,15 +173,20 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): 
task_queue = app.state.task_queue context = resolve_ingestion_context(app) + if not context: + logger.error("Could not resolve ingestion context for YouTube dispatcher") + return + job_service = context.job_service job_id = str(cmd.ingestion_job_id) if cmd.ingestion_job_id else None - if job_id: - job_service.update_job_status( - job_id, - status="PROCESSING", + if job_id and job_service: + job_service.update_job( + UUID(job_id), + status=IngestionJobStatus.PROCESSING, status_message=f"Resolving {cmd.data_type} videos...", - progress=5, + current_step=5, + total_steps=100, ) # 1. Resolve the full list of URLs @@ -184,8 +196,10 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): playlist_url = cmd.video_url or (cmd.video_urls[0] if cmd.video_urls else None) if not playlist_url: logger.warning("No URL provided for playlist dispatcher") - if job_id: - job_service.update_job_status(job_id, "FAILED", "Missing playlist URL.") + if job_id and job_service: + job_service.update_job( + UUID(job_id), status=IngestionJobStatus.FAILED, status_message="Missing playlist URL." + ) return extractor = YoutubeExtractor(language=cmd.language) @@ -195,8 +209,10 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): channel_url = cmd.video_url or (cmd.video_urls[0] if cmd.video_urls else None) if not channel_url: logger.warning("No URL provided for channel dispatcher") - if job_id: - job_service.update_job_status(job_id, "FAILED", "Missing channel URL.") + if job_id and job_service: + job_service.update_job( + UUID(job_id), status=IngestionJobStatus.FAILED, status_message="Missing channel URL." + ) return extractor = YoutubeExtractor(language=cmd.language) @@ -207,20 +223,21 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): if not video_list: logger.warning(f"YouTube Dispatcher resolved 0 videos for type {cmd.data_type}.") - if job_id: - job_service.update_job_status( - job_id, - "FAILED", - f"No videos found in {cmd.data_type}. 
Verify if the URL is valid and public.", + if job_id and job_service: + job_service.update_job( + UUID(job_id), + status=IngestionJobStatus.FAILED, + status_message=f"No videos found in {cmd.data_type}. Verify if the URL is valid and public.", ) return - if job_id: - job_service.update_job_status( - job_id, - status="PROCESSING", + if job_id and job_service: + job_service.update_job( + UUID(job_id), + status=IngestionJobStatus.PROCESSING, status_message=f"Dispatched {len(video_list)} videos for ingestion.", - progress=50, + current_step=50, + total_steps=100, ) logger.info(f"YouTube Dispatcher resolved {len(video_list)} videos. Enqueueing individual tasks...") @@ -248,21 +265,22 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand): ) logger.info(f"Successfully dispatched {len(video_list)} YouTube ingestion tasks.") - if job_id: - job_service.update_job_status( - job_id, - status="SUCCESS", + if job_id and job_service: + job_service.update_job( + UUID(job_id), + status=IngestionJobStatus.FINISHED, status_message=f"Dispatched {len(video_list)} videos successfully.", - progress=100, + current_step=100, + total_steps=100, ) except Exception as e: logger.error(f"YouTube Dispatcher Worker Error: {e}", exc_info=True) - if job_id: + if job_id and job_service: try: - job_service.update_job_status(job_id, "FAILED", str(e)) - except Exception: - pass + job_service.update_job(UUID(job_id), status=IngestionJobStatus.FAILED, error_message=str(e)) + except Exception as outer_e: + logger.warning(f"Failed to update job status during error cleanup: {outer_e}") finally: clear_global_context() @@ -321,8 +339,8 @@ def run_diarization_ingestion_worker(cmd: IngestDiarizationCommand): task_queue = app.state.task_queue task_queue.enqueue( run_duplicate_detection_worker, - {"chunk_ids": result["vector_ids"]}, - task_title=f"Dup Check Diarization: {cmd.source}" + {"chunk_ids": [UUID(v) if isinstance(v, str) else v for v in result["vector_ids"]]}, + task_title=f"Dup Check 
Diarization: {cmd.name or str(cmd.diarization_id)}", ) finally: db.close() @@ -387,8 +405,8 @@ async def _run(): task_queue = app.state.task_queue task_queue.enqueue( run_duplicate_detection_worker, - {"chunk_ids": result["vector_ids"]}, - task_title=f"Dup Check Web: {cmd.url}" + {"chunk_ids": [UUID(v) if isinstance(v, str) else v for v in result["vector_ids"]]}, + task_title=f"Dup Check Web: {cmd.url}", ) except Exception as e: logging.getLogger(__name__).error(f"Worker Error: Failed to execute Web Scraping: {e}", exc_info=True) @@ -696,12 +714,12 @@ def run_duplicate_detection_worker(cmd: dict): vector_repo = resolve_vector_repository(app) rerank_svc = resolve_rerank_service(app) vector_svc = get_chunk_vector_service(vector_repo, rerank_svc) - + duplicate_repo = get_duplicate_repo() chunk_repo = get_chunk_repo() - + service = ChunkDuplicateService(duplicate_repo, chunk_repo, vector_svc) - + chunk_ids = cmd.get("chunk_ids", []) if not chunk_ids: return diff --git a/src/infrastructure/extractors/youtube_extractor.py b/src/infrastructure/extractors/youtube_extractor.py index 726d9aa3..c53ee967 100644 --- a/src/infrastructure/extractors/youtube_extractor.py +++ b/src/infrastructure/extractors/youtube_extractor.py @@ -239,9 +239,7 @@ def _validate_mp3_file(path: str) -> None: if len(header) >= 2 and header[0] == 0xFF and (header[1] & 0xE0) == 0xE0: return - raise ValueError( - f"Downloaded file is not a valid MP3 (header={header!r}): {path}" - ) + raise ValueError(f"Downloaded file is not a valid MP3 (header={header!r}): {path}") def extract_playlist_videos(self, playlist_url: str) -> list[str]: """Extracts all video URLs from a YouTube playlist using yt_dlp.""" diff --git a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py index 8caf11ec..e5045125 100644 --- a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py +++ 
b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py @@ -11,15 +11,12 @@ logger = Logger() + class ChunkDuplicateSQLRepository: """Repository for managing duplicate chunk records in SQL.""" def create_duplicate_record( - self, - chunk_ids: List[UUID], - similarity: float, - status: str = "pending", - content_source_id: Optional[str] = None + self, chunk_ids: List[UUID], similarity: float, status: str = "pending", content_source_id: Optional[str] = None ) -> ChunkDuplicateModel: """Create a new duplicate grouping record.""" with Connector() as session: @@ -28,7 +25,7 @@ def create_duplicate_record( chunk_ids=[str(cid) for cid in chunk_ids], similarity=similarity, status=status, - content_source_id=content_source_id + content_source_id=content_source_id, ) session.add(record) session.commit() @@ -36,34 +33,26 @@ def create_duplicate_record( return record except Exception as e: session.rollback() - logger.error( - "Error creating duplicate record", - context={"error": str(e)} - ) + logger.error("Error creating duplicate record", context={"error": str(e)}) raise def list_duplicates( - self, - status: Optional[str] = None, - subject_ids: Optional[List[str]] = None, - limit: int = 100, - offset: int = 0 + self, status: Optional[str] = None, subject_ids: Optional[List[str]] = None, limit: int = 100, offset: int = 0 ) -> tuple[List[ChunkDuplicateModel], int]: """List duplicate records with optional status and context filtering.""" with Connector() as session: query = session.query(ChunkDuplicateModel) - + if subject_ids: # Convert string IDs to UUID objects for safe matching in SQL parsed_ids = [UUID(sid) for sid in subject_ids] query = query.join( - ContentSourceModel, - ContentSourceModel.id == ChunkDuplicateModel.content_source_id + ContentSourceModel, ContentSourceModel.id == ChunkDuplicateModel.content_source_id ).filter(ContentSourceModel.subject_id.in_(parsed_ids)) - + if status: query = query.filter(ChunkDuplicateModel.status == status) - + total = 
query.count() items = query.order_by(desc(ChunkDuplicateModel.created_at)).limit(limit).offset(offset).all() return items, total @@ -88,8 +77,7 @@ def update_status(self, duplicate_id: Any, status: str) -> bool: except Exception as e: session.rollback() logger.error( - "Error updating duplicate status", - context={"duplicate_id": str(duplicate_id), "error": str(e)} + "Error updating duplicate status", context={"duplicate_id": str(duplicate_id), "error": str(e)} ) raise @@ -107,7 +95,6 @@ def delete_record(self, duplicate_id: Any) -> bool: except Exception as e: session.rollback() logger.error( - "Error deleting duplicate record", - context={"duplicate_id": str(duplicate_id), "error": str(e)} + "Error deleting duplicate record", context={"duplicate_id": str(duplicate_id), "error": str(e)} ) raise diff --git a/src/infrastructure/services/chunk_duplicate_service.py b/src/infrastructure/services/chunk_duplicate_service.py index 4edd7b82..d75f6dfc 100644 --- a/src/infrastructure/services/chunk_duplicate_service.py +++ b/src/infrastructure/services/chunk_duplicate_service.py @@ -28,9 +28,7 @@ def __init__( self._chunk_repo = chunk_repo self._vector_service = vector_service - def find_and_register_duplicates( - self, chunk_ids: List[UUID], similarity_threshold: float = 0.90 - ) -> int: + def find_and_register_duplicates(self, chunk_ids: List[UUID], similarity_threshold: float = 0.90) -> int: """ Check a list of chunks for duplicates against the entire vector store. If duplicates are found with similarity >= threshold, register them. 
@@ -56,19 +54,16 @@ def find_and_register_duplicates( if duplicates: registered_count += self._register_cluster( - source_id=cid, + source_id=cid, source_content_source_id=str(chunk.content_source_id) if chunk.content_source_id else None, - duplicates=duplicates, - processed_pairs=processed_pairs + duplicates=duplicates, + processed_pairs=processed_pairs, ) return registered_count def _filter_duplicates( - self, - source_id: UUID, - similar_chunks: List[Any], - threshold: float + self, source_id: UUID, similar_chunks: List[Any], threshold: float ) -> List[tuple[UUID, float]]: """Filter results to find valid duplicates above threshold.""" duplicates = [] @@ -76,7 +71,7 @@ def _filter_duplicates( for sim_chunk in similar_chunks: if str(sim_chunk.id) == source_id_str: continue - + score = getattr(sim_chunk, "score", 0.0) if score >= threshold: duplicates.append((sim_chunk.id, float(score))) @@ -87,7 +82,7 @@ def _register_cluster( source_id: UUID, source_content_source_id: Optional[str], duplicates: List[tuple[UUID, float]], - processed_pairs: Set[tuple[str, ...]] + processed_pairs: Set[tuple[str, ...]], ) -> int: """Register a new duplicate group if not already processed.""" duplicate_ids = [d[0] for d in duplicates] @@ -100,26 +95,20 @@ def _register_cluster( # Get exact similarity for the highest match max_sim = max([float(d[1]) for d in duplicates] + [0.0]) self._repo.create_duplicate_record( - chunk_ids=all_uuids, - similarity=max_sim, - status="pending", - content_source_id=source_content_source_id + chunk_ids=all_uuids, similarity=max_sim, status="pending", content_source_id=source_content_source_id ) processed_pairs.add(cluster_key) return 1 return 0 def list_duplicates( - self, - status: Optional[str] = None, - subject_ids: Optional[List[str]] = None, - limit: int = 100, - offset: int = 0 + self, status: Optional[str] = None, subject_ids: Optional[List[str]] = None, limit: int = 100, offset: int = 0 ) -> tuple[List[ChunkDuplicateEntity], int]: """List mapped 
duplicate records.""" models, total = self._repo.list_duplicates(status=status, subject_ids=subject_ids, limit=limit, offset=offset) entities = [] from datetime import datetime + for m in models: chunk_ids: List[UUID] = [] if isinstance(m.chunk_ids, list): @@ -128,20 +117,22 @@ def list_duplicates( chunk_ids.append(UUID(cid)) elif isinstance(cid, UUID): chunk_ids.append(cid) - + # Ensure datetime types for Mypy created_at = m.created_at if isinstance(m.created_at, datetime) else datetime.now() updated_at = m.updated_at if isinstance(m.updated_at, datetime) else datetime.now() - entities.append(ChunkDuplicateEntity( - id=UUID(str(m.id)), - chunk_ids=chunk_ids, - similarity=float(m.similarity), - content_source_id=str(m.content_source_id) if m.content_source_id else None, - status=str(m.status), - created_at=created_at, - updated_at=updated_at, - )) + entities.append( + ChunkDuplicateEntity( + id=UUID(str(m.id)), + chunk_ids=chunk_ids, + similarity=float(m.similarity), + content_source_id=str(m.content_source_id) if m.content_source_id else None, + status=str(m.status), + created_at=created_at, + updated_at=updated_at, + ) + ) return entities, total def resolve_duplicate(self, duplicate_id: UUID, status: str) -> bool: diff --git a/src/presentation/__init__.py b/src/presentation/__init__.py new file mode 100644 index 00000000..e16c76df --- /dev/null +++ b/src/presentation/__init__.py @@ -0,0 +1 @@ +"" diff --git a/src/presentation/api/routes/duplicate_router.py b/src/presentation/api/routes/duplicate_router.py index ee482c8b..c33dc188 100644 --- a/src/presentation/api/routes/duplicate_router.py +++ b/src/presentation/api/routes/duplicate_router.py @@ -32,7 +32,7 @@ def list_duplicates( ): """List all detected chunk duplicate groups.""" entities, total = service.list_duplicates(status=status, subject_ids=subject_id, limit=limit, offset=offset) - + # Enrich entities with chunk content if needed for UI results = [] for entity in entities: @@ -41,15 +41,17 @@ def 
list_duplicates( for cid in entity.chunk_ids: chunk = chunk_service.get_by_id(cid) if chunk: - chunks_info.append(ChunkMinimal( - id=chunk.id, - content=chunk.content or "", - source_title=chunk.extra.get("source_title", "Unknown"), - source_id=chunk.content_source_id - )) + chunks_info.append( + ChunkMinimal( + id=chunk.id, + content=chunk.content or "", + source_title=chunk.extra.get("source_title", "Unknown"), + source_id=chunk.content_source_id, + ) + ) resp.chunks = chunks_info results.append(resp) - + return PaginatedChunkDuplicateResponse(results=results, total=total) @@ -89,8 +91,8 @@ def analyze_all_chunks( """Run duplicate detection analysis on all existing chunks (heavy operation).""" # This should probably be a background task, but for now we'll do it synchronously # or just list everything and iterate - all_chunks = chunk_service.list_chunks(limit=1000) # Limit for safety + all_chunks = chunk_service.list_chunks(limit=1000) # Limit for safety chunk_ids = [c.id for c in all_chunks] - + count = service.find_and_register_duplicates(chunk_ids) return {"status": "success", "groups_found": count} diff --git a/src/presentation/api/routes/ingest_router.py b/src/presentation/api/routes/ingest_router.py index 1615f2ef..30d0a315 100644 --- a/src/presentation/api/routes/ingest_router.py +++ b/src/presentation/api/routes/ingest_router.py @@ -124,16 +124,23 @@ def ingest_youtube( logger.info("Running ingestion in background via queue", context={"reason": reason}) # Ensure we have a job ID for background tasks so they are visible in UI - job_id = request.ingestion_job_id + job_id = str(request.ingestion_job_id) if request.ingestion_job_id else None if not job_id: + if not job_service: + logger.error("Job service dependency missing") + raise HTTPException(status_code=500, detail="Internal configuration error") + job_title = request.title or request.video_url or f"YouTube {reason.capitalize()}" + from src.domain.entities.enums.ingestion_job_status_enum import 
IngestionJobStatus + + s_uuid = UUID(request.subject_id) if request.subject_id else None job = job_service.create_job( + content_source_id=None, + status=IngestionJobStatus.STARTED, source_title=job_title, external_source=request.video_url or (request.video_urls[0] if request.video_urls else None), - subject_id=request.subject_id, + subject_id=s_uuid, ingestion_type="YOUTUBE", - status="INITIALIZING", - status_message=f"Starting YouTube {reason} ingestion...", ) job_id = str(job.id) cmd.ingestion_job_id = job_id @@ -160,7 +167,15 @@ def ingest_youtube( detail=result.reason or "This content has already been ingested.", ) - return result + return IngestResponse( + skipped=result.skipped, + reason=result.reason, + source_id=result.source_id, + job_id=result.job_id, + created_chunks=result.created_chunks, + vector_ids=result.vector_ids, + video_results=result.video_results, + ) except HTTPException: raise except ValueError as ve: diff --git a/test_dispatcher.py b/test_dispatcher.py index b1fee754..b954fe1c 100644 --- a/test_dispatcher.py +++ b/test_dispatcher.py @@ -6,13 +6,15 @@ sys.path.append(os.getcwd()) # Mock out dependencies before importing workers -sys.modules['src.presentation.api.dependencies'] = MagicMock() +sys.modules["src.presentation.api.dependencies"] = MagicMock() mock_job_service = MagicMock() + class MockContext: def __init__(self, job_svc): self.job_service = job_svc + import src.presentation.api.dependencies as deps # noqa: E402 deps.resolve_ingestion_context = MagicMock(return_value=MockContext(mock_job_service)) @@ -26,35 +28,34 @@ def test_dispatcher(): # Setup mock app state mock_app = MagicMock() mock_app.state.task_queue = MagicMock() - + # Patch _get_app to return our mock import src.application.workers as workers # noqa: E402 + workers._get_app = MagicMock(return_value=mock_app) - + # Create command for a small playlist (or the user's one) # Using a known small playlist to speed up test if possible # But user's URL is fine since we are 
testing logic url = "https://www.youtube.com/watch?v=dlQG02mwTD0&list=PLG47XsLEf0LdvYtX_zU7E_y_C1TgjgU59" cmd = IngestYoutubeCommand( - video_url=url, - data_type=YoutubeDataType.PLAYLIST, - ingestion_job_id="test-job-uuid", - subject_id="test-subject" + video_url=url, data_type=YoutubeDataType.PLAYLIST, ingestion_job_id="test-job-uuid", subject_id="test-subject" ) - + print(f"Testing dispatcher with URL: {url}") run_youtube_dispatcher_worker(cmd) - + # Assertions print("\nVerifying Job Service calls:") - for call in mock_job_service.update_job_status.call_args_list: + for call in mock_job_service.update_job.call_args_list: print(f" - Status update: {call[1]}") - + print("\nVerifying Task Queue calls:") enqueue_calls = mock_app.state.task_queue.enqueue.call_args_list print(f" - Tasks enqueued: {len(enqueue_calls)}") if len(enqueue_calls) > 0: print(f" - First task URL: {enqueue_calls[0][0][1].video_url}") + if __name__ == "__main__": test_dispatcher() diff --git a/test_playlist.py b/test_playlist.py index c9f17e06..c2873fc6 100644 --- a/test_playlist.py +++ b/test_playlist.py @@ -4,16 +4,16 @@ def test_playlist(): # Use the playlist provided by the user playlist_url = "https://www.youtube.com/watch?v=dlQG02mwTD0&list=PLG47XsLEf0LdvYtX_zU7E_y_C1TgjgU59" - + print(f"Testing playlist extraction for: {playlist_url}") extractor = YoutubeExtractor(language="pt") - + try: videos = extractor.extract_playlist_videos(playlist_url) print(f"Extracted {len(videos)} videos.") for i, url in enumerate(videos[:5]): - print(f" {i+1}: {url}") - + print(f" {i + 1}: {url}") + if not videos: print("FAILED: No videos extracted.") else: @@ -21,5 +21,6 @@ def test_playlist(): except Exception as e: print(f"ERROR: {e}") + if __name__ == "__main__": test_playlist() diff --git a/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py b/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py index cd88f6f6..47d4ea4b 100644 --- 
a/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py +++ b/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py @@ -14,77 +14,76 @@ def test_create_duplicate_record(sqlite_memory): chunk_ids = [str(uuid.uuid4()), str(uuid.uuid4())] similarity = 0.95 status = "pending" - + record = repo.create_duplicate_record(chunk_ids, similarity, status) - + assert record.id is not None assert record.chunk_ids == chunk_ids assert record.similarity == pytest.approx(similarity) assert record.status == status + @pytest.mark.ChunkDuplicateSQLRepository def test_list_duplicates_filtering(sqlite_memory): """Test listing duplicates with status and subject filtering.""" db = sqlite_memory repo = ChunkDuplicateSQLRepository() - + # Create a subject and content source subject = KnowledgeSubjectModel(name="Test Subject") db.add(subject) db.commit() db.refresh(subject) - - source = ContentSourceModel( - subject_id=subject.id, - source_type="file", - external_source="test.txt" - ) + + source = ContentSourceModel(subject_id=subject.id, source_type="file", external_source="test.txt") db.add(source) db.commit() db.refresh(source) - + # Create duplicate records repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending", content_source_id=source.id) repo.create_duplicate_record([str(uuid.uuid4())], 0.8, "reviewed", content_source_id=source.id) - + # List all _, total = repo.list_duplicates() assert total == 2 - + # Filter by status pending_items, total = repo.list_duplicates(status="pending") assert total == 1 assert pending_items[0].status == "pending" - + # Filter by subject_id _, total = repo.list_duplicates(subject_ids=[str(subject.id)]) assert total == 2 - + # Filter with non-existent subject _, total = repo.list_duplicates(subject_ids=[str(uuid.uuid4())]) assert total == 0 + @pytest.mark.ChunkDuplicateSQLRepository def test_update_status(sqlite_memory): """Test updating the status of a duplicate record.""" repo = ChunkDuplicateSQLRepository() 
record = repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending") - + success = repo.update_status(record.id, "reviewed") assert success is True - + updated = repo.get_by_id(record.id) assert updated is not None assert updated.status == "reviewed" + @pytest.mark.ChunkDuplicateSQLRepository def test_delete_record(sqlite_memory): """Test deleting a duplicate record.""" repo = ChunkDuplicateSQLRepository() record = repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending") - + success = repo.delete_record(record.id) assert success is True - + deleted = repo.get_by_id(record.id) assert deleted is None diff --git a/tests/infrastructure/services/test_chunk_duplicate_service.py b/tests/infrastructure/services/test_chunk_duplicate_service.py index 0a58a3b6..ca7a4241 100644 --- a/tests/infrastructure/services/test_chunk_duplicate_service.py +++ b/tests/infrastructure/services/test_chunk_duplicate_service.py @@ -14,73 +14,73 @@ def mock_repos(): vector_svc = MagicMock() return repo, chunk_repo, vector_svc + def test_find_and_register_duplicates(mock_repos): """Test finding and registering duplicates.""" repo, chunk_repo, vector_svc = mock_repos service = ChunkDuplicateService(repo, chunk_repo, vector_svc) - + chunk_id = uuid.uuid4() mock_chunk = MagicMock() mock_chunk.id = chunk_id mock_chunk.content = "Duplicate test content" mock_chunk.content_source_id = str(uuid.uuid4()) chunk_repo.get_by_id.return_value = mock_chunk - + # Mock similar chunks found sim_chunk = MagicMock() sim_chunk.id = uuid.uuid4() sim_chunk.score = 0.95 vector_svc.retrieve.return_value = [sim_chunk] - + count = service.find_and_register_duplicates([chunk_id], similarity_threshold=0.90) - + assert count == 1 vector_svc.retrieve.assert_called_once_with( - query=mock_chunk.content, - top_k=5, - search_mode=SearchMode.SEMANTIC, - re_rank=False + query=mock_chunk.content, top_k=5, search_mode=SearchMode.SEMANTIC, re_rank=False ) repo.create_duplicate_record.assert_called_once() - + # Check 
arguments of create_duplicate_record _, kwargs = repo.create_duplicate_record.call_args - assert kwargs['similarity'] == pytest.approx(0.95) - assert str(chunk_id) in kwargs['chunk_ids'] - assert str(sim_chunk.id) in kwargs['chunk_ids'] + assert kwargs["similarity"] == pytest.approx(0.95) + assert chunk_id in kwargs["chunk_ids"] + assert sim_chunk.id in kwargs["chunk_ids"] + def test_find_and_register_no_duplicates(mock_repos): """Test when no duplicates are found above threshold.""" repo, chunk_repo, vector_svc = mock_repos service = ChunkDuplicateService(repo, chunk_repo, vector_svc) - + chunk_id = uuid.uuid4() mock_chunk = MagicMock() mock_chunk.id = chunk_id mock_chunk.content = "Unique content" chunk_repo.get_by_id.return_value = mock_chunk - + # Sim chunk with low score sim_chunk = MagicMock() sim_chunk.id = uuid.uuid4() sim_chunk.score = 0.5 vector_svc.retrieve.return_value = [sim_chunk] - + count = service.find_and_register_duplicates([chunk_id], similarity_threshold=0.90) - + assert count == 0 repo.create_duplicate_record.assert_not_called() + def test_deactivate_chunk(mock_repos): """Test deactivating a chunk.""" repo, chunk_repo, vector_svc = mock_repos service = ChunkDuplicateService(repo, chunk_repo, vector_svc) - + chunk_id = uuid.uuid4() chunk_repo.update_is_active.return_value = True - + success = service.deactivate_chunk(chunk_id) - + assert success is True chunk_repo.update_is_active.assert_called_once_with(chunk_id, False) vector_svc.delete_by_id.assert_called_once_with(chunk_id) diff --git a/tests/presentation/api/routes/test_duplicate_router.py b/tests/presentation/api/routes/test_duplicate_router.py index ac2c4bb0..5bfef896 100644 --- a/tests/presentation/api/routes/test_duplicate_router.py +++ b/tests/presentation/api/routes/test_duplicate_router.py @@ -13,49 +13,50 @@ client = TestClient(app) + @pytest.mark.DuplicateRouter class TestDuplicateRouter: def test_list_duplicates(self): mock_repo = MagicMock() 
app.dependency_overrides[get_duplicate_repo] = lambda: mock_repo - + mock_repo.list_duplicates.return_value = ([], 0) - + response = client.get("/rest/duplicates") assert response.status_code == 200 assert response.json()["total"] == 0 - + app.dependency_overrides.clear() def test_update_duplicate_status(self): mock_repo = MagicMock() app.dependency_overrides[get_duplicate_repo] = lambda: mock_repo - + # Use a service mock instead because the router calls resolved_duplicate on service # Wait, the router calls service.resolve_duplicate mock_service = MagicMock() app.dependency_overrides[get_duplicate_service] = lambda: mock_service - + duplicate_id = str(uuid.uuid4()) mock_service.resolve_duplicate.return_value = True - + response = client.patch(f"/rest/duplicates/{duplicate_id}/status", json={"status": "reviewed"}) assert response.status_code == 200 assert response.json()["status"] == "success" - + app.dependency_overrides.clear() def test_deactivate_chunk(self): mock_service = MagicMock() app.dependency_overrides[get_duplicate_service] = lambda: mock_service - + chunk_id = str(uuid.uuid4()) mock_service.deactivate_chunk.return_value = True - + response = client.post(f"/rest/duplicates/chunks/{chunk_id}/deactivate") assert response.status_code == 200 assert response.json()["status"] == "success" - + app.dependency_overrides.clear() def test_trigger_duplicate_analysis(self): @@ -63,12 +64,12 @@ def test_trigger_duplicate_analysis(self): app.dependency_overrides[get_duplicate_service] = lambda: mock_service mock_chunk_service = MagicMock() app.dependency_overrides[get_chunk_index_service] = lambda: mock_chunk_service - + mock_chunk_service.list_chunks.return_value = [] mock_service.find_and_register_duplicates.return_value = 0 - + response = client.post("/rest/duplicates/analyze-all") assert response.status_code == 200 assert response.json()["status"] == "success" - + app.dependency_overrides.clear() diff --git a/tmp/cleanup_db.py b/tmp/cleanup_db.py index 
09417232..8e47d7d2 100644 --- a/tmp/cleanup_db.py +++ b/tmp/cleanup_db.py @@ -15,7 +15,7 @@ # Check if is_active exists in chunk_index res = conn.execute(text("PRAGMA table_info(chunk_index)")) columns = [row[1] for row in res] - if 'is_active' in columns: + if "is_active" in columns: print("is_active already exists in chunk_index. Attempting to drop it (batch mode needed for SQLite).") # For simplicity in this scratch script, I'll just note it. # Usually we group these with migrations. From 39619047c44f9df66d5992b6c5a0b2fdc61394f0 Mon Sep 17 00:00:00 2001 From: ericksonlopes Date: Wed, 8 Apr 2026 14:30:40 -0300 Subject: [PATCH 6/7] fix: update sql connector imports and fix failing tests --- src/application/workers.py | 10 +++++----- tests/application/test_audio_diarization_workers.py | 4 ++-- tests/application/test_workers.py | 6 +++--- tests/conftest.py | 1 + .../services/test_youtube_audio_downloader.py | 3 ++- tests/presentation/api/routes/test_ingest_router.py | 13 ++++++++++--- 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/application/workers.py b/src/application/workers.py index 67e9ad01..6f15f50c 100644 --- a/src/application/workers.py +++ b/src/application/workers.py @@ -314,7 +314,7 @@ def run_diarization_ingestion_worker(cmd: IngestDiarizationCommand): vector_svc = ChunkVectorService(vector_repo, rerank_service=rerank_svc) # DiarizationRepository needs a DB session - from infrastructure.connectors.connector_sql import Session as DBSession + from src.infrastructure.connectors.connector_sql import Session as DBSession db = DBSession() try: @@ -418,7 +418,7 @@ async def _run(): def _audio_diarization_subprocess(cmd_dict: dict): """Run audio diarization in a separate process to avoid torch/CUDA thread deadlocks.""" - from infrastructure.connectors.connector_sql import ( + from src.infrastructure.connectors.connector_sql import ( Session as DBSessionFactory, ) from src.application.use_cases.process_audio_diarization_pipeline import ( @@ 
-494,7 +494,7 @@ def run_audio_diarization_dispatcher_worker(cmd: ProcessAudioCommand): return try: - from infrastructure.connectors.connector_sql import ( + from src.infrastructure.connectors.connector_sql import ( Session as DBSessionFactory, ) from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor @@ -610,7 +610,7 @@ def run_audio_diarization_worker(cmd: ProcessAudioCommand): if process.exitcode != 0: logger.error("Audio diarization subprocess exited with code %d", process.exitcode) if cmd.diarization_id: - from infrastructure.connectors.connector_sql import ( + from src.infrastructure.connectors.connector_sql import ( Session as DBSessionFactory, ) from src.infrastructure.repositories.sql.diarization_repository import ( @@ -668,7 +668,7 @@ def run_voice_training_worker(cmd: TrainVoiceCommand): return try: - from infrastructure.connectors.connector_sql import Session as DBSession + from src.infrastructure.connectors.connector_sql import Session as DBSession from src.application.use_cases.manage_voice_profiles import ( TrainVoiceProfileFromSpeakerSegmentUseCase, ) diff --git a/tests/application/test_audio_diarization_workers.py b/tests/application/test_audio_diarization_workers.py index 1509237c..9b493d4f 100644 --- a/tests/application/test_audio_diarization_workers.py +++ b/tests/application/test_audio_diarization_workers.py @@ -40,7 +40,7 @@ def test_run_audio_diarization_dispatcher_worker_deduplication(self, mock_app, m patch("src.application.workers.registry.get", return_value=mock_app), patch("src.infrastructure.extractors.youtube_extractor.YoutubeExtractor") as mock_extractor_cls, patch( - "src.infrastructure.repositories.sql.connector.Session", + "src.infrastructure.connectors.connector_sql.Session", return_value=mock_db_session, ), patch("src.infrastructure.repositories.sql.diarization_repository.DiarizationRepository") as mock_repo_cls, @@ -92,7 +92,7 @@ def test_run_audio_diarization_dispatcher_worker_retry_failed(self, mock_app, 
mo patch("src.application.workers.registry.get", return_value=mock_app), patch("src.infrastructure.extractors.youtube_extractor.YoutubeExtractor") as mock_extractor_cls, patch( - "src.infrastructure.repositories.sql.connector.Session", + "src.infrastructure.connectors.connector_sql.Session", return_value=mock_db_session, ), patch("src.infrastructure.repositories.sql.diarization_repository.DiarizationRepository") as mock_repo_cls, diff --git a/tests/application/test_workers.py b/tests/application/test_workers.py index 25b4ce5d..c3aa20dc 100644 --- a/tests/application/test_workers.py +++ b/tests/application/test_workers.py @@ -171,7 +171,7 @@ def test_run_diarization_ingestion_worker_success(self): patch("src.presentation.api.dependencies.resolve_vector_repository"), patch("src.presentation.api.dependencies.resolve_rerank_service"), patch("src.infrastructure.services.chunk_vector_service.ChunkVectorService"), - patch("src.infrastructure.repositories.sql.connector.Session") as mock_session_cls, + patch("src.infrastructure.connectors.connector_sql.Session") as mock_session_cls, patch("src.infrastructure.repositories.sql.diarization_repository.DiarizationRepository"), patch( "src.application.use_cases.diarization_ingestion_use_case.DiarizationIngestionUseCase" @@ -211,7 +211,7 @@ def test_audio_diarization_subprocess_success(self): from src.application.workers import _audio_diarization_subprocess with ( - patch("src.infrastructure.repositories.sql.connector.Session") as mock_session_cls, + patch("src.infrastructure.connectors.connector_sql.Session") as mock_session_cls, patch("src.infrastructure.services.redis_event_bus.RedisEventBus"), patch( "src.application.use_cases.process_audio_diarization_pipeline.ProcessAudioDiarizationPipelineUseCase" @@ -266,7 +266,7 @@ def test_run_audio_diarization_worker_failure(self): with ( patch("multiprocessing.get_context") as mock_get_ctx, - patch("src.infrastructure.repositories.sql.connector.Session") as mock_session_factory, + 
patch("src.infrastructure.connectors.connector_sql.Session") as mock_session_factory, patch("src.infrastructure.repositories.sql.diarization_repository.DiarizationRepository") as mock_repo_cls, patch("src.infrastructure.services.redis_event_bus.RedisEventBus"), ): diff --git a/tests/conftest.py b/tests/conftest.py index 39214ce2..95eae632 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,7 @@ from sqlalchemy.orm import sessionmaker import src.infrastructure.connectors.connector_sql as connector +import src.infrastructure.repositories.sql.models # noqa: F401 @pytest.fixture(autouse=True) diff --git a/tests/infrastructure/services/test_youtube_audio_downloader.py b/tests/infrastructure/services/test_youtube_audio_downloader.py index 899b5a96..771263eb 100644 --- a/tests/infrastructure/services/test_youtube_audio_downloader.py +++ b/tests/infrastructure/services/test_youtube_audio_downloader.py @@ -7,9 +7,10 @@ @pytest.mark.Downloader class TestYoutubeExtractorDownload: + @patch("src.infrastructure.extractors.youtube_extractor.YoutubeExtractor._validate_mp3_file") @patch("src.infrastructure.extractors.youtube_extractor.YoutubeDL") @patch("os.makedirs") - def test_download_success(self, mock_makedirs, mock_ytdl): + def test_download_success(self, mock_makedirs, mock_ytdl, mock_validate): # Mocking yt_dlp to return a fake filename mock_instance = mock_ytdl.return_value.__enter__.return_value mock_instance.extract_info.return_value = {"title": "Test Audio", "ext": "webm"} diff --git a/tests/presentation/api/routes/test_ingest_router.py b/tests/presentation/api/routes/test_ingest_router.py index 0ecd6f71..f7991409 100644 --- a/tests/presentation/api/routes/test_ingest_router.py +++ b/tests/presentation/api/routes/test_ingest_router.py @@ -76,11 +76,18 @@ def test_ingest_youtube_exception(mock_use_case): def test_ingest_youtube_reprocess(): # Use real dependency override for task_queue if needed, # but mock_app_state fixture already sets 
app.state.task_queue - from src.presentation.api.dependencies import get_task_queue_service - + from src.presentation.api.dependencies import get_task_queue_service, get_job_service + mock_queue = MagicMock() app.dependency_overrides[get_task_queue_service] = lambda: mock_queue + mock_job_service = MagicMock() + # Mocking the created job to have a valid UUID id + mock_job = MagicMock() + mock_job.id = "123e4567-e89b-12d3-a456-426614174000" + mock_job_service.create_job.return_value = mock_job + app.dependency_overrides[get_job_service] = lambda: mock_job_service + try: response = client.post( "/rest/ingest/youtube", @@ -89,7 +96,7 @@ def test_ingest_youtube_reprocess(): assert response.status_code == 200 assert response.json()["skipped"] is False - assert response.json()["reason"] == "Ingestion started in background queue." + assert response.json()["reason"] == "Ingestion started in background queue (Job: 123e4567-e89b-12d3-a456-426614174000)." assert mock_queue.enqueue.called finally: app.dependency_overrides.pop(get_task_queue_service, None) From f6a50a97c09742a0b796cb1e281006ca49d96a24 Mon Sep 17 00:00:00 2001 From: ericksonlopes Date: Wed, 8 Apr 2026 14:41:17 -0300 Subject: [PATCH 7/7] style: fix linting issues (import sorting and line length) --- src/application/workers.py | 8 ++++---- tests/presentation/api/routes/test_ingest_router.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/application/workers.py b/src/application/workers.py index 6f15f50c..81c341c6 100644 --- a/src/application/workers.py +++ b/src/application/workers.py @@ -418,12 +418,12 @@ async def _run(): def _audio_diarization_subprocess(cmd_dict: dict): """Run audio diarization in a separate process to avoid torch/CUDA thread deadlocks.""" - from src.infrastructure.connectors.connector_sql import ( - Session as DBSessionFactory, - ) from src.application.use_cases.process_audio_diarization_pipeline import ( ProcessAudioDiarizationPipelineUseCase, ) + from 
src.infrastructure.connectors.connector_sql import ( + Session as DBSessionFactory, + ) from src.infrastructure.repositories.sql.content_source_repository import ( ContentSourceSQLRepository, ) @@ -668,10 +668,10 @@ def run_voice_training_worker(cmd: TrainVoiceCommand): return try: - from src.infrastructure.connectors.connector_sql import Session as DBSession from src.application.use_cases.manage_voice_profiles import ( TrainVoiceProfileFromSpeakerSegmentUseCase, ) + from src.infrastructure.connectors.connector_sql import Session as DBSession from src.presentation.api.dependencies import resolve_ingestion_context ctx = resolve_ingestion_context(app) diff --git a/tests/presentation/api/routes/test_ingest_router.py b/tests/presentation/api/routes/test_ingest_router.py index f7991409..997f554f 100644 --- a/tests/presentation/api/routes/test_ingest_router.py +++ b/tests/presentation/api/routes/test_ingest_router.py @@ -76,7 +76,7 @@ def test_ingest_youtube_exception(mock_use_case): def test_ingest_youtube_reprocess(): # Use real dependency override for task_queue if needed, # but mock_app_state fixture already sets app.state.task_queue - from src.presentation.api.dependencies import get_task_queue_service, get_job_service + from src.presentation.api.dependencies import get_job_service, get_task_queue_service mock_queue = MagicMock() app.dependency_overrides[get_task_queue_service] = lambda: mock_queue @@ -96,7 +96,8 @@ def test_ingest_youtube_reprocess(): assert response.status_code == 200 assert response.json()["skipped"] is False - assert response.json()["reason"] == "Ingestion started in background queue (Job: 123e4567-e89b-12d3-a456-426614174000)." + expected_reason = "Ingestion started in background queue (Job: 123e4567-e89b-12d3-a456-426614174000)." + assert response.json()["reason"] == expected_reason assert mock_queue.enqueue.called finally: app.dependency_overrides.pop(get_task_queue_service, None)