From cfe4bf8fff4901bb91f4c9f01fba145f1a418ad3 Mon Sep 17 00:00:00 2001
From: ericksonlopes <ofc.erickson@gmail.com>
Date: Tue, 7 Apr 2026 17:19:01 -0300
Subject: [PATCH 1/7] feat: harden diarization ingestion and refactor readme

---
 README.md                                     | 141 ++++++-----
 .../diarization_ingestion_use_case.py         | 233 +++++++++---------
 .../services/test_voice_profile_service.py    |  18 +-
 3 files changed, 209 insertions(+), 183 deletions(-)
diff --git a/README.md b/README.md
index 8f53eacf..10196680 100644
--- a/README.md
+++ b/README.md
@@ -2,115 +2,129 @@
 
 # WhatYouSaid
 
-[![codecov](https://codecov.io/github/ericksonlopes/WhatYouSaid/branch/main/graph/badge.svg?token=8CZJARVJUE)](https://codecov.io/github/ericksonlopes/WhatYouSaid)
+## The Vectorized Intelligence & Diarization Hub
+
 
+[![codecov](https://codecov.io/github/ericksonlopes/WhatYouSaid/branch/main/graph/badge.svg?token=8CZJARVJUE)](https://codecov.io/github/ericksonlopes/WhatYouSaid)
 [![Tests](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/tests.yml)
 [![Code Quality](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/code-quality.yml/badge.svg?branch=main)](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/code-quality.yml)
 [![Security](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/security.yml/badge.svg?branch=main)](https://github.com/ericksonlopes/WhatYouSaid/actions/workflows/security.yml)
 
 ![Python](https://img.shields.io/badge/-Python-3776AB?&logo=Python&logoColor=FFFFFF)
 ![React](https://img.shields.io/badge/-React-61DAFB?&logo=React&logoColor=000000)
-![Pytest](https://img.shields.io/badge/-Pytest-0A9EDC?&logo=Pytest&logoColor=FFFFFF)
-![GitHub Actions](https://img.shields.io/badge/-GitHub%20Actions-2088FF?&logo=GitHub%20Actions&logoColor=FFFFFF)
+![FastAPI](https://img.shields.io/badge/-FastAPI-05998B?&logo=FastAPI&logoColor=FFFFFF)
+![Redis](https://img.shields.io/badge/-Redis-DC382D?&logo=Redis&logoColor=FFFFFF)
+![Postgres](https://img.shields.io/badge/-PostgreSQL-4169E1?&logo=PostgreSQL&logoColor=FFFFFF)
 
 </div>
 
-WhatYouSaid is a vectorized data hub designed to explore any topic or knowledge domain. It extracts, processes, and indexes content from YouTube videos, local files, and remote URLs to enable advanced semantic search and Retrieval-Augmented Generation (RAG) workflows.
-
-This repository provides modular extractors, robust splitting utilities, and a scalable background processing pipeline to build searchable knowledge bases efficiently.
+**WhatYouSaid** is a state-of-the-art vectorized data hub designed to explore any knowledge domain. It transforms unstructured audio, video, files, and web content into structured, searchable intelligence using advanced AI techniques, including **Speaker Diarization**, **Voice Recognition**, and **RAG** (Retrieval-Augmented Generation).
 
 ---
 
-## 📚 Documentation
+## ✨ Features
 
-Detailed guides for specific topics:
+### 🎧 Diarization & Voice Intelligence
 
-- 🐳 **[Docker Deployment Guide](docs/docker-deployment.md)**: Learn how to use Docker Profiles to run different combinations of databases (MySQL, Postgres, SQLite) and vector stores (FAISS, Weaviate).
+- **Speaker Segmentation**: Automatically split audio/video files by speaker using WhisperX/Whisper for unmatched accuracy.
+- **Voice Recognition**: Identify and label speakers across your entire knowledge base using trained voice profiles.
+- **Diarization Pipeline**: Interactive dashboard to review, edit, and finalize transcripts and speaker assignments before indexing.
 
----
 
-## 🚀 Features
+### 📥 Multi-Source Ingestion
+
+- **YouTube Ecosystem**: Full support for individual videos, entire playlists, or entire channels.
+- **Document Extractors**: High-fidelity extraction from PDF, DOCX, and TXT files.
+- **Web Intelligence**: Powerful scraping via **Crawl4AI** and **Docling** for websites and remote URLs.
+- **Robust Pipeline**: Step-by-step progress tracking with real-time SSE notifications and full rollback support on failure.
+
+### 🔍 Advanced Semantic Search
 
-- **Multi-source Extraction**: Ingest data from YouTube (transcripts), local files (PDF, DOCX, TXT), **remote URLs** via Docling, and **Websites** via Crawl4AI.
-- **Robust Fallbacks**: Integrated `PlainTextExtractor` ensuring successful ingestion even for formats not supported by specialized extractors.
-- **Async Task Queue**: High-performance background processing powered by **Redis**, ensuring responsive workflows.
-- **Structured Logging & Tracing**: Centralized logging equipped with contextvars and request tracing (Correlation IDs) for end-to-end observability.
-- **Real-time Updates**: Live ingestion status and progress monitoring via a **Redis Event Bus** (SSE-ready).
-- **Advanced Search**: Semantic, keyword (BM25), and **Hybrid Search** with cross-encoder re-ranking for maximum precision.
-- **Pluggable Vector Stores**: Support for **FAISS** (local), **ChromaDB**, **Weaviate** (scalable), and **Qdrant**.
-- **Pluggable SQL Databases**: Support for **SQLite**, **PostgreSQL**, **MySQL**, **MariaDB**, and **MSSQL**.
-- **Modern Dashboard**: A clean React + Tailwind CSS frontend for managing knowledge subjects, content sources, and monitoring background tasks.
+- **Hybrid Search**: Combining Vector (FAISS/Weaviate/Chroma) and Keyword (BM25) search for maximum precision.
+- **Re-Ranking**: Specialized Cross-Encoders ensure the most relevant context is always at the top.
+- **Pluggable Architecture**: Seamlessly switch between SQL databases (SQLite/Postgres/MySQL) and Vector stores.
 
 ---
 
-## 🛠️ Infrastructure & Deployment
+## 🚀 Quick Start
 
-WhatYouSaid is designed to be flexible, from a lightweight local setup to a scalable production-ready environment.
+WhatYouSaid is powered by **Python 3.12** and uses **uv** for high-speed dependency management.
 
-### 1. Storage & Messaging Options
+### 1. Prerequisites
 
-| Component | Lightweight (Local) | Scalable / Production |
-| :--- | :--- | :--- |
-| **Relational Database** | **SQLite** (Default, file-based) | **PostgreSQL**, **MySQL**, **MariaDB**, **MSSQL** |
-| **Vector Store** | **FAISS** (Local, file-based) | **Weaviate** (Container or Cloud), **ChromaDB** |
-| **Task Queue & Bus** | **In-memory** (Limited) | **Redis** (Default in Docker) |
+- [uv](https://github.com/astral-sh/uv) (Recommended) or `pip`
+- [Docker](https://www.docker.com/)
+
+### 2. Environment Setup
 
-### 2. Docker Compose Profiles & Dependencies
+```bash
+# Clone the repository
+git clone https://github.com/ericksonlopes/WhatYouSaid.git
+cd WhatYouSaid
 
-We use **Docker Profiles** to keep the environment lean. Only the services you need are started. The project also natively supports both **CPU** and **GPU** environments via optional Python dependencies.
+# Install dependencies (including dev groups)
+uv sync --group dev
+```
 
-> 📘 **Detailed Guide**: For a step-by-step tutorial on different deployment scenarios, see our [Docker Deployment Guide](docs/docker-deployment.md).
+### 3. Spin Up Infrastructure
 
-#### **Scenario A: Lite (Default)**
-Uses **SQLite**, **FAISS**, and **Redis**.
 ```bash
+# Lite mode: SQLite + FAISS + Redis
 docker-compose up -d
+
+# Scalable mode: PostgreSQL + Weaviate + Redis
+docker-compose --profile base up -d
 ```
 
-#### **Scenario B: Scalable (Base)**
-Starts **PostgreSQL**, **Weaviate**, and **Redis**.
+### 4. Run Application
+
 ```bash
-docker-compose --profile base up -d
-# Note: Set SQL__TYPE=postgres and VECTOR__STORE_TYPE=weaviate in .env
+# Start Backend (FastAPI)
+python main.py
+
+# Start Frontend (React)
+cd frontend
+npm install
+npm run dev
 ```
 
 ---
 
-## 🏗️ Architecture
+## 🐳 Deployment Profiles
 
-The system follows a clean architecture approach, ensuring separation of concerns:
+We use **Docker Profiles** to keep your environment lean. Only the services you need are started.
 
-- **Application Layer**: Contains use cases (e.g., `FileIngestionUseCase`, `SearchUseCase`) and a `ServiceRegistry` for background worker dependency resolution.
-- **Infrastructure Layer**:
-  - `extractors/`: Fetch raw content (Docling, YouTube, PlainText).
-  - `repositories/`: Data persistence (SQLAlchemy for relational, specialized clients for Vector Stores).
-  - `services/`: Core logic (text splitting, embedding, re-ranking, Redis task queue).
-- **Presentation Layer**: FastAPI-based REST API with real-time SSE notifications.
+| Component | Lite Profile (Default) | Scalable Profile (`base`) |
+| :--- | :--- | :--- |
+| **Relational DB** | SQLite (File-based) | PostgreSQL / MySQL / MariaDB |
+| **Vector Store** | FAISS (Local) | Weaviate / ChromaDB / Qdrant |
+| **Task Queue** | Redis | Redis (Production-ready) |
 
----
+> [!TIP]
+> Use the **Scalable** profile if you require high-concurrency access or plan to manage multi-gigabyte vector indexes.
 
-## 🧪 Quality & Testing
+---
 
-We maintain a high standard of code quality and test coverage:
+## 🏗️ Clean Architecture
 
-- **417+ Automated Tests**: Covering unit, integration, and complex edge cases.
-- **93% Code Coverage**: Verified via `pytest-cov`.
-- **Strict Linting**: Powered by `ruff` for code style and `mypy` for static type checking.
-- **Security Scanning**: Integrated `bandit` scans for vulnerability detection.
+The system follows a modular approach ensuring maximum testability and maintainability:
 
-**Run tests locally:**
-```bash
-uv run pytest
-```
+- **Application Layer**: Orchestrates logic via use cases and resolves background worker dependencies through a `ServiceRegistry`.
+- **Infrastructure Layer**:
+  - `extractors/`: Fetch raw content from specialized sources (Docling, YouTube, Crawl4AI).
+  - `repositories/`: Persistence via SQL (SQLAlchemy) and specialized Vector clients.
+  - `services/`: Core providers for embeddings, text splitting, and re-ranking.
+- **Presentation Layer**: FastAPI-based REST API with real-time event broadcasting and a modern React dashboard.
 
 ---
 
-## 🤝 Contributing
+## 🤝 Contributing & Quality
+
+Contributions are what make the open-source community such an amazing place! Please:
 
-Contributions are welcome. Please:
-- Open an issue to discuss major changes.
-- Add tests for any new feature or bug fix.
-- Ensure `ruff check .` and `mypy .` pass before submitting.
+1. Open an **Issue** to discuss proposed changes.
+2. Ensure `uv run ruff check . --fix` and `uv run mypy .` pass.
+3. Run all tests: `uv run pytest`.
 
 ---
 
@@ -119,8 +133,9 @@ Contributions are welcome. Please:
 This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
 
 <div align="center">
-    <p>Made with ❤️ by Erickson Lopes </p>
 
-[![LinkedIn](https://img.shields.io/badge/LinkedIn-Erickson_Lopes-blue)](https://www.linkedin.com/in/ericksonlopes/)
+Hand-crafted with ❤️ by **Erickson Lopes**
+
+[![LinkedIn](https://img.shields.io/badge/LinkedIn-Erickson_Lopes-blue?style=for-the-badge&logo=linkedin)](https://www.linkedin.com/in/ericksonlopes/)
 
 </div>
diff --git a/src/application/use_cases/diarization_ingestion_use_case.py b/src/application/use_cases/diarization_ingestion_use_case.py
index 7fe26127..ad48effd 100644
--- a/src/application/use_cases/diarization_ingestion_use_case.py
+++ b/src/application/use_cases/diarization_ingestion_use_case.py
@@ -59,26 +59,18 @@ def __init__(
         self.event_bus = event_bus
 
     def execute(self, cmd: IngestDiarizationCommand) -> Dict[str, Any]:
-        self.event_bus.publish(
-            "ingestion_status",
-            {
-                "job_id": str(cmd.ingestion_job_id) if cmd.ingestion_job_id else "new",
-                "status": "started",
-                "diarization_id": str(cmd.diarization_id),
-            },
-        )
+        """Orchestrates the ingestion of diarization results with status tracking and rollback."""
+        self._publish_initial_status(cmd)
         logger.info(
-            "Starting Diarization ingestion",
-            context={
-                "diarization_id": str(cmd.diarization_id),
-                "subject_id": str(cmd.subject_id),
-            },
+            "Starting Diarization ingestion pipeline",
+            context={"diarization_id": str(cmd.diarization_id), "subject_id": str(cmd.subject_id)},
         )
 
-        ingestion = None
+        job = None
         source = None
 
         try:
+            # 1. Resolve domain objects and source info
             record = self.diarization_repo.get_by_id(str(cmd.diarization_id))
             if not record:
                 raise ValueError(f"Diarization record not found: {cmd.diarization_id}")
@@ -89,98 +81,124 @@ def execute(self, cmd: IngestDiarizationCommand) -> Dict[str, Any]:
 
             source_type, external_source = self._resolve_source_info(record)
 
-            if cmd.ingestion_job_id:
-                ingestion = self.ingestion_service.get_by_id(cmd.ingestion_job_id)
-
-            if ingestion is None:
-                ingestion = self._create_ingestion_job(external_source, source_type, subject.id)
-
-            self.ingestion_service.update_job(
-                job_id=ingestion.id,
-                status=IngestionJobStatus.PROCESSING,
-                status_message="Formatting transcript from diarization...",
-                current_step=1,
-                total_steps=4,
-            )
-
-            full_text = self._format_transcript(cast(list, record.segments), cast(dict, record.recognition_results))
-            if not full_text:
-                raise ValueError("No segments found in diarization record")
+            # 2. Setup ingestion job tracking
+            job = self._ensure_ingestion_job(cmd, external_source, source_type, subject.id)
 
+            # 3. Format transcript and prepare source
+            full_text = self._prepare_transcript(job, record)
             display_name = cmd.name or cast(str, record.name) or "Transcrição"
             source = self._get_or_create_source(source_type, external_source, subject.id, display_name, cmd, record)
 
-            # Generate chunks and Embeddings
-            self.ingestion_service.update_job(
-                job_id=ingestion.id,
-                status=IngestionJobStatus.PROCESSING,
-                status_message="Generating embeddings...",
-                current_step=2,
-                total_steps=4,
-                content_source_id=source.id,
-            )
-
+            # 4. Generate and index chunks
+            self._report_step(job, 2, "Generating embeddings...", content_id=source.id)
             split_docs = self._generate_split_docs(full_text, display_name, external_source, source_type, cmd, record)
 
-            # Persist Chunks
-            chunks = self._build_chunk_entities(split_docs, source, subject, cmd, ingestion.id)
+            chunks = self._build_chunk_entities(split_docs, source, subject, cmd, job.id)
             self.chunk_service.create_chunks(chunks)
 
-            # Index
-            self.ingestion_service.update_job(
-                job_id=ingestion.id,
-                status=IngestionJobStatus.PROCESSING,
-                status_message="Indexing in vector store...",
-                current_step=3,
-                total_steps=4,
-            )
+            self._report_step(job, 3, "Indexing in vector store...")
             self.vector_service.index_documents(chunks)
 
-            # Finalize
-            self._finalize_ingestion(ingestion, source, chunks, cmd)
-
-            # Update Diarization record status to COMPLETED
-            self.diarization_repo.update_status(
-                diarization_id=str(cmd.diarization_id),
-                status=DiarizationStatus.COMPLETED.value,
-                status_message="Ingestão concluída com sucesso",
-                error_message="",  # Clear any previous error
-            )
-
-            # Notify frontend that diarization is fully done
-            self.event_bus.publish(
-                "ingestion_status",
-                {
-                    "type": "diarization",
-                    "id": str(cmd.diarization_id),
-                    "status": "done",
-                    "message": "Diarização indexada com sucesso",
-                },
-            )
+            # 5. Finalize
+            self._finalize_ingestion(job, source, chunks, cmd)
+            self._complete_diarization_record(cmd)
+            self._publish_final_notification(cmd)
 
             return {
                 "diarization_id": str(cmd.diarization_id),
                 "created_chunks": len(chunks),
                 "source_id": source.id,
-                "job_id": ingestion.id,
+                "job_id": job.id,
             }
 
         except Exception as e:
             logger.error(e, context={"action": "diarization_ingestion_execute"})
-            if ingestion:
-                self.ingestion_service.update_job(
-                    job_id=ingestion.id,
-                    status=IngestionJobStatus.FAILED,
-                    error_message=str(e),
-                )
-            if source:
-                self.cs_service.update_processing_status(
-                    content_source_id=source.id,
-                    status=ContentSourceStatus.FAILED,
-                    error_message=str(e),
-                )
+            self._rollback_on_failure(job, source, e)
             raise
 
+    def _publish_initial_status(self, cmd: IngestDiarizationCommand) -> None:
+        self.event_bus.publish(
+            "ingestion_status",
+            {
+                "job_id": str(cmd.ingestion_job_id) if cmd.ingestion_job_id else "new",
+                "status": "started",
+                "diarization_id": str(cmd.diarization_id),
+            },
+        )
+
+    def _ensure_ingestion_job(
+        self, cmd: IngestDiarizationCommand, external_source: str, source_type: SourceType, subject_id: UUID
+    ) -> Any:
+        job = None
+        if cmd.ingestion_job_id:
+            job = self.ingestion_service.get_by_id(cmd.ingestion_job_id)
+
+        if not job:
+            job = self._create_ingestion_job(external_source, source_type, subject_id)
+        return job
+
+    def _prepare_transcript(self, job: Any, record: Any) -> str:
+        self.ingestion_service.update_job(
+            job_id=job.id,
+            status=IngestionJobStatus.PROCESSING,
+            status_message="Formatting transcript from diarization...",
+            current_step=1,
+            total_steps=4,
+        )
+        full_text = self._format_transcript(cast(list, record.segments), cast(dict, record.recognition_results))
+        if not full_text:
+            raise ValueError("No segments found in diarization record")
+        return full_text
+
+    def _report_step(self, job: Any, step: int, message: str, content_id: Optional[UUID] = None) -> None:
+        self.ingestion_service.update_job(
+            job_id=job.id,
+            status=IngestionJobStatus.PROCESSING,
+            status_message=message,
+            current_step=step,
+            total_steps=4,
+            content_source_id=content_id,
+        )
+
+    def _complete_diarization_record(self, cmd: IngestDiarizationCommand) -> None:
+        self.diarization_repo.update_status(
+            diarization_id=str(cmd.diarization_id),
+            status=DiarizationStatus.COMPLETED.value,
+            status_message="Ingestão concluída com sucesso",
+            error_message="",
+        )
+
+    def _publish_final_notification(self, cmd: IngestDiarizationCommand) -> None:
+        self.event_bus.publish(
+            "ingestion_status",
+            {
+                "type": "diarization",
+                "id": str(cmd.diarization_id),
+                "status": "done",
+                "message": "Diarização indexada com sucesso",
+            },
+        )
+
+    def _rollback_on_failure(self, job: Optional[Any], source: Optional[Any], error: Exception) -> None:
+        """Rolls back changes if ingestion fails."""
+        if job:
+            self.ingestion_service.update_job(
+                job_id=job.id,
+                status=IngestionJobStatus.FAILED,
+                error_message=str(error),
+            )
+            # Cleanup SQL Chunks
+            self.chunk_service.delete_by_job_id(job.id)
+
+        if source:
+            self.cs_service.update_processing_status(
+                content_source_id=source.id,
+                status=ContentSourceStatus.FAILED,
+                error_message=str(error),
+            )
+            # Vector cleanup is handled by job_id filters if possible, or by source_id
+            self.vector_service.delete(filters={"content_source_id": str(source.id)})
+
     def _resolve_source_info(self, record: Any) -> tuple[SourceType, str]:
         source_type_val = cast(str, record.source_type)
         if source_type_val == "upload":
@@ -197,7 +215,6 @@ def _resolve_source_info(self, record: Any) -> tuple[SourceType, str]:
             if original:
                 external_source = original
 
-        # Normalize YouTube IDs to prevent duplicates (Short URLs, Full URLs vs 11-char IDs)
         if source_type == SourceType.YOUTUBE:
             normalized_vid = YoutubeExtractor.get_video_id(external_source)
             if normalized_vid:
@@ -244,7 +261,6 @@ def _get_or_create_source(
             )
         else:
             self.cs_service.update_processing_status(source.id, ContentSourceStatus.PROCESSING)
-            # Update title if it has changed
             if cmd.name and source.title != cmd.name:
                 self.cs_service.update_title(source.id, cmd.name)
 
@@ -332,10 +348,18 @@ def _format_transcript(self, segments: List[Dict[str, Any]], recognition: Option
         if not segments:
             return ""
 
-        mapping = recognition.get("mapping", {}) if recognition else {}
-
+        mapping = (recognition or {}).get("mapping", {})
         merged_lines = []
-        curr_speaker, curr_start, curr_end, curr_texts = None, None, None, []
+
+        curr_speaker: Optional[str] = None
+        curr_start = 0.0
+        curr_end = 0.0
+        curr_texts: List[str] = []
+
+        def flush_block() -> None:
+            if curr_speaker is not None:
+                ts = f"[{self._format_seconds(curr_start)} - {self._format_seconds(curr_end)}]"
+                merged_lines.append(f"{ts} {curr_speaker}: {' '.join(curr_texts)}")
 
         for seg in segments:
             spk_label = seg.get("speaker", "UNKNOWN")
@@ -344,28 +368,15 @@ def _format_transcript(self, segments: List[Dict[str, Any]], recognition: Option
             end = float(seg.get("end", 0))
             text = seg.get("text", "").strip()
 
-            if spk_name == curr_speaker:
-                curr_end = end
-                if text:
-                    curr_texts.append(text)
-            else:
-                if curr_speaker is not None:
-                    start_str = self._format_seconds(cast(float, curr_start))
-                    end_str = self._format_seconds(cast(float, curr_end))
-                    ts = f"[{start_str} - {end_str}]"
-                    merged_lines.append(f"{ts} {curr_speaker}: {' '.join(curr_texts)}")
-
-                curr_speaker, curr_start, curr_end, curr_texts = (
-                    spk_name,
-                    start,
-                    end,
-                    [text] if text else [],
-                )
-
-        if curr_speaker is not None:
-            ts = f"[{self._format_seconds(cast(float, curr_start))} - {self._format_seconds(cast(float, curr_end))}]"
-            merged_lines.append(f"{ts} {curr_speaker}: {' '.join(curr_texts)}")
+            if spk_name != curr_speaker:
+                flush_block()
+                curr_speaker, curr_start, curr_texts = spk_name, start, []
+
+            curr_end = end
+            if text:
+                curr_texts.append(text)
 
+        flush_block()
         return "\n".join(merged_lines)
 
     def _build_chunk_entities(
diff --git a/tests/infrastructure/services/test_voice_profile_service.py b/tests/infrastructure/services/test_voice_profile_service.py
index 4fb1c05c..87ff2b13 100644
--- a/tests/infrastructure/services/test_voice_profile_service.py
+++ b/tests/infrastructure/services/test_voice_profile_service.py
@@ -44,12 +44,12 @@ def test_add_voice_local_file(self, sqlite_memory):
                     voice_id, _ = db_service.add("Test User", "local.wav")
 
                     assert voice_id is not None
-                    
+
                     # Verify status was updated to ready
                     record = sqlite_memory.get(VoiceRecord, voice_id)
                     assert record.status == "ready"
                     assert record.status_message is None
-                    
+
                     voices = db_service.voices
                     assert "Test User" in voices
 
@@ -159,13 +159,13 @@ def test_add_voice_s3_download_failure(self, sqlite_memory):
         # After failure, it should NOT have created a successful record
         # but let's check if it created a fixed "failed" record if we implement it that way.
         # Currently, if it fails at download, it doesn't even create the record yet in the DB.
-        # Wait, the record is created AFTER the S3 check block. 
+        # Wait, the record is created AFTER the S3 check block.
         # So no record should exist in DB yet.
         assert sqlite_memory.query(VoiceRecord).count() == 0
 
     def test_add_voice_embedding_extraction_failure(self, sqlite_memory):
         db_service = VoiceDB(sqlite_memory, hf_token="fake")
-        
+
         # Fail during embedding extraction
         with patch.object(db_service, "_extract_embedding", side_effect=Exception("Model Error")):
             with pytest.raises(Exception, match="Model Error"):
@@ -183,10 +183,10 @@ def test_list_audio_files(self, sqlite_memory):
         sqlite_memory.commit()
 
         self.mock_storage.list_files.return_value = [{"key": "f1.wav"}, {"key": "f2.wav"}]
-        
+
         db_service = VoiceDB(sqlite_memory, hf_token="fake")
         files = db_service.list_audio_files("v1")
-        
+
         assert len(files) == 2
         self.mock_storage.list_files.assert_called_once_with(prefix="voices/v1/", extension=".wav")
 
@@ -202,15 +202,15 @@ def test_list_voices_and_len(self, sqlite_memory):
         sqlite_memory.commit()
 
         db_service = VoiceDB(sqlite_memory, hf_token="fake")
-        
+
         # list_voices should only show ready ones
         voice_list = db_service.list_voices()
         assert "Ready" in voice_list
         assert "Processing" not in voice_list
-        
+
         # len should only show ready ones
         assert len(db_service) == 1
-        
+
         # .voices property should only show ready ones
         assert "Ready" in db_service.voices
         assert "Processing" not in db_service.voices

From fd708b1ee6122373cb32a6a4baa6447efb489904 Mon Sep 17 00:00:00 2001
From: ericksonlopes <ofc.erickson@gmail.com>
Date: Tue, 7 Apr 2026 20:33:01 -0300
Subject: [PATCH 2/7] fix(frontend): implement shared SubjectIcon and fix
 missing translations in search view

---
 frontend/src/components/AddSubjectModal.tsx   |  46 +--
 frontend/src/components/ContextSelector.tsx   |  10 +-
 .../src/components/KnowledgeAdminView.tsx     |  58 +--
 .../src/components/LocalContextSelector.tsx   |   3 +-
 frontend/src/components/SearchView.tsx        | 350 +++++++++++-------
 frontend/src/components/SubjectIcon.tsx       |  56 +++
 frontend/src/locales/en.json                  |   4 +
 frontend/src/locales/pt-BR.json               |   4 +
 8 files changed, 293 insertions(+), 238 deletions(-)
 create mode 100644 frontend/src/components/SubjectIcon.tsx

diff --git a/frontend/src/components/AddSubjectModal.tsx b/frontend/src/components/AddSubjectModal.tsx
index 303d0225..d3f8625c 100644
--- a/frontend/src/components/AddSubjectModal.tsx
+++ b/frontend/src/components/AddSubjectModal.tsx
@@ -1,55 +1,15 @@
 import React, { useState, type SyntheticEvent } from 'react';
 import { useTranslation } from 'react-i18next';
-import {
-  X, Brain, Briefcase, ChefHat, Cpu, Landmark, Lightbulb, Activity, Hash,
-  Database, Book, Globe, Zap, Shield, Search, Code, MessageSquare, Layout,
-  Layers, HardDrive, Cloud, Lock, User, Users, Target, Award, GraduationCap,
-  Music, Video, Image, FileText, Mail, Terminal, Bug
-} from 'lucide-react';
+import { X } from 'lucide-react';
 import { useAppContext } from '../store/AppContext';
 import { motion, AnimatePresence } from 'motion/react';
+import { SubjectIcon, ICONS_LIST as ICONS } from './SubjectIcon';
 
 interface AddSubjectModalProps {
   readonly isOpen: boolean;
   readonly onClose: () => void;
 }
 
-const ICONS = [
-  { name: 'Brain', icon: Brain },
-  { name: 'Briefcase', icon: Briefcase },
-  { name: 'ChefHat', icon: ChefHat },
-  { name: 'Cpu', icon: Cpu },
-  { name: 'Landmark', icon: Landmark },
-  { name: 'Lightbulb', icon: Lightbulb },
-  { name: 'Activity', icon: Activity },
-  { name: 'Hash', icon: Hash },
-  { name: 'Database', icon: Database },
-  { name: 'Book', icon: Book },
-  { name: 'Globe', icon: Globe },
-  { name: 'Zap', icon: Zap },
-  { name: 'Shield', icon: Shield },
-  { name: 'Search', icon: Search },
-  { name: 'Code', icon: Code },
-  { name: 'MessageSquare', icon: MessageSquare },
-  { name: 'Layout', icon: Layout },
-  { name: 'Layers', icon: Layers },
-  { name: 'HardDrive', icon: HardDrive },
-  { name: 'Cloud', icon: Cloud },
-  { name: 'Lock', icon: Lock },
-  { name: 'User', icon: User },
-  { name: 'Users', icon: Users },
-  { name: 'Target', icon: Target },
-  { name: 'Award', icon: Award },
-  { name: 'GraduationCap', icon: GraduationCap },
-  { name: 'Music', icon: Music },
-  { name: 'Video', icon: Video },
-  { name: 'Image', icon: Image },
-  { name: 'FileText', icon: FileText },
-  { name: 'Mail', icon: Mail },
-  { name: 'Terminal', icon: Terminal },
-  { name: 'Bug', icon: Bug },
-];
-
 export function AddSubjectModal({ isOpen, onClose }: AddSubjectModalProps) {
   const { t } = useTranslation();
   const { addSubject } = useAppContext();
@@ -153,7 +113,7 @@ export function AddSubjectModal({ isOpen, onClose }: AddSubjectModalProps) {
                           : 'bg-black/40 border-white/5 text-zinc-500 hover:bg-zinc-800 hover:text-zinc-200'
                       }`}
                     >
-                      <Icon className={`w-4 h-4 transition-transform duration-200 group-hover:scale-110 ${selectedIcon === iconName ? 'scale-110' : ''}`} />
+                      <SubjectIcon iconName={iconName} className={`w-4 h-4 transition-transform duration-200 group-hover:scale-110 ${selectedIcon === iconName ? 'scale-110' : ''}`} />
                     </button>
                   ))}
                 </div>
diff --git a/frontend/src/components/ContextSelector.tsx b/frontend/src/components/ContextSelector.tsx
index dcb65676..1575d9c4 100644
--- a/frontend/src/components/ContextSelector.tsx
+++ b/frontend/src/components/ContextSelector.tsx
@@ -8,6 +8,7 @@ import {
 } from 'lucide-react';
 import { motion, AnimatePresence } from 'motion/react';
 import { useAppContext } from '../store/AppContext';
+import { SubjectIcon } from './SubjectIcon';
 
 export function ContextSelector() {
   const { t } = useTranslation();
@@ -174,9 +175,12 @@ export function ContextSelector() {
                               <SelectionIcon />
                             </div>
                             <div className="flex flex-col items-start min-w-0">
-                                <span className={`text-[15px] font-bold tracking-tight truncate ${isSelected ? 'text-white' : 'text-zinc-300'}`}>
-                                    {subject.name}
-                                </span>
+                                <div className="flex items-center gap-2">
+                                  <SubjectIcon iconName={subject.icon} className={`w-3.5 h-3.5 ${isSelected ? 'text-emerald-400' : 'text-zinc-500'}`} />
+                                  <span className={`text-[15px] font-bold tracking-tight truncate ${isSelected ? 'text-white' : 'text-zinc-300'}`}>
+                                      {subject.name}
+                                  </span>
+                                </div>
                                 {subject.sourceCount !== undefined && (
                                     <span className="text-[10px] text-zinc-500 font-bold uppercase tracking-widest mt-0.5">
                                         {subject.sourceCount} {t('sidebar.operations.sources')}
diff --git a/frontend/src/components/KnowledgeAdminView.tsx b/frontend/src/components/KnowledgeAdminView.tsx
index 4cb19818..84a7c8fd 100644
--- a/frontend/src/components/KnowledgeAdminView.tsx
+++ b/frontend/src/components/KnowledgeAdminView.tsx
@@ -1,57 +1,12 @@
 import React, { useState } from 'react';
 import { useTranslation } from 'react-i18next';
-import {
-  Settings, Trash2, Edit2, Check, X, Brain, Briefcase,
-  ChefHat, Cpu, Landmark, Lightbulb, Activity, Hash,
-  Plus, AlertTriangle, Database, Book, Globe, Zap, Shield,
-  Search, Code, MessageSquare, Layout, Layers, HardDrive,
-  Cloud, Lock, User, Users, Target, Award, GraduationCap,
-  Music, Video, Image, FileText, Mail, Terminal, Bug
+import { 
+  Settings, Trash2, Edit2, Check, X, Plus, AlertTriangle, Search, Zap
 } from 'lucide-react';
 import { useAppContext } from '../store/AppContext';
 import { motion, AnimatePresence } from 'motion/react';
 import { Subject } from '../types';
-
-const ICONS = [
-  { name: 'Brain', icon: Brain },
-  { name: 'Briefcase', icon: Briefcase },
-  { name: 'ChefHat', icon: ChefHat },
-  { name: 'Cpu', icon: Cpu },
-  { name: 'Landmark', icon: Landmark },
-  { name: 'Lightbulb', icon: Lightbulb },
-  { name: 'Activity', icon: Activity },
-  { name: 'Hash', icon: Hash },
-  { name: 'Database', icon: Database },
-  { name: 'Book', icon: Book },
-  { name: 'Globe', icon: Globe },
-  { name: 'Zap', icon: Zap },
-  { name: 'Shield', icon: Shield },
-  { name: 'Search', icon: Search },
-  { name: 'Code', icon: Code },
-  { name: 'MessageSquare', icon: MessageSquare },
-  { name: 'Layout', icon: Layout },
-  { name: 'Layers', icon: Layers },
-  { name: 'HardDrive', icon: HardDrive },
-  { name: 'Cloud', icon: Cloud },
-  { name: 'Lock', icon: Lock },
-  { name: 'User', icon: User },
-  { name: 'Users', icon: Users },
-  { name: 'Target', icon: Target },
-  { name: 'Award', icon: Award },
-  { name: 'GraduationCap', icon: GraduationCap },
-  { name: 'Music', icon: Music },
-  { name: 'Video', icon: Video },
-  { name: 'Image', icon: Image },
-  { name: 'FileText', icon: FileText },
-  { name: 'Mail', icon: Mail },
-  { name: 'Terminal', icon: Terminal },
-  { name: 'Bug', icon: Bug },
-];
-
-const getSubjectIcon = (iconName?: string) => {
-  const item = ICONS.find(i => i.name === iconName);
-  return item ? item.icon : Hash;
-};
+import { SubjectIcon, ICONS_LIST as ICONS } from './SubjectIcon';
 
 export function KnowledgeAdminView() {
   const { t } = useTranslation();
@@ -158,7 +113,6 @@ export function KnowledgeAdminView() {
           ) : (
             filteredSubjects.map((subject) => {
               const isEditing = editingId === subject.id;
-              const Icon = getSubjectIcon(isEditing ? editForm.icon : subject.icon);
 
               return (
                 <motion.div
@@ -177,7 +131,7 @@ export function KnowledgeAdminView() {
                         <div className="flex flex-col items-center gap-3">
                           <label className="text-[10px] font-black uppercase tracking-widest text-zinc-600 self-start">{t('knowledge_contexts.icon_label')}</label>
                           <div className="grid grid-cols-6 gap-2 bg-black/20 p-3 rounded-xl border border-white/5 max-h-[160px] overflow-y-auto custom-scrollbar">
-                            {ICONS.map(({ name: iconName, icon: ItemIcon }) => (
+                             {ICONS.map(({ name: iconName }) => (
                               <button
                                 key={iconName}
                                 onClick={() => setEditForm({ ...editForm, icon: iconName })}
@@ -186,7 +140,7 @@ export function KnowledgeAdminView() {
                                     : 'text-zinc-500 hover:text-zinc-300 hover:bg-white/5'
                                   }`}
                               >
-                                <ItemIcon className="w-4 h-4" />
+                                <SubjectIcon iconName={iconName} className="w-4 h-4" />
                               </button>
                             ))}
                           </div>
@@ -237,7 +191,7 @@ export function KnowledgeAdminView() {
                     <div className="flex items-center justify-between gap-6">
                       <div className="flex items-center gap-5 flex-1 min-w-0">
                         <div className="w-14 h-14 rounded-2xl bg-zinc-800 border border-white/5 flex items-center justify-center shadow-inner group">
-                          <Icon className="w-7 h-7 text-zinc-400 group-hover:text-emerald-400 transition-colors" />
+                          <SubjectIcon iconName={subject.icon} className="w-7 h-7 text-zinc-400 group-hover:text-emerald-400 transition-colors" />
                         </div>
                         <div className="min-w-0">
                           <h3 className="text-xl font-bold text-white truncate">{subject.name}</h3>
diff --git a/frontend/src/components/LocalContextSelector.tsx b/frontend/src/components/LocalContextSelector.tsx
index ff70c93e..eeaacdd2 100644
--- a/frontend/src/components/LocalContextSelector.tsx
+++ b/frontend/src/components/LocalContextSelector.tsx
@@ -8,6 +8,7 @@ import {
   Search
 } from 'lucide-react';
 import { useAppContext } from '../store/AppContext';
+import { SubjectIcon } from './SubjectIcon';
 
 interface LocalContextSelectorProps {
   mode: 'single' | 'multi';
@@ -171,7 +172,7 @@ export const LocalContextSelector: React.FC<LocalContextSelectorProps> = ({
                       <div className="flex items-center gap-3">
                         <div className={`w-8 h-8 rounded-lg flex items-center justify-center text-lg
                           ${isSelected ? 'bg-primary-500/20' : 'bg-white/5 border border-white/5 group-hover:bg-white/10'}`}>
-                          {subject.icon || '📁'}
+                          <SubjectIcon iconName={subject.icon} className="w-4 h-4" />
                         </div>
                         <div className="text-left">
                           <p className="text-sm font-medium">{subject.name}</p>
diff --git a/frontend/src/components/SearchView.tsx b/frontend/src/components/SearchView.tsx
index e26ad1f0..4f9272d4 100644
--- a/frontend/src/components/SearchView.tsx
+++ b/frontend/src/components/SearchView.tsx
@@ -1,7 +1,7 @@
 import React, { useState, useEffect, useCallback } from 'react';
 import { useTranslation } from 'react-i18next';
 import { 
-  Search, Sparkles, Lock, FileText,
+  Search, Sparkles, FileText,
   SlidersHorizontal, Database, TextSearch, Network, ListOrdered, 
   ChevronDown, X, Copy, Check, Languages, Cpu, Hash, Calendar, 
   Clock, ArrowUpDown, SquarePlay, BookOpen, Globe, Filter, Newspaper, Loader2,
@@ -10,6 +10,7 @@ import {
 import { useAppContext } from '../store/AppContext';
 import { motion, AnimatePresence } from 'motion/react';
 import { api } from '../services/api';
+import { SubjectIcon } from './SubjectIcon';
 
 // --- Types ---
 interface SearchResult {
@@ -30,7 +31,8 @@ interface SearchResult {
 
 const getIcon = (type: string) => {
   switch (type.toLowerCase()) {
-    case 'youtube': return SquarePlay;
+    case 'youtube': 
+    case 'video': return SquarePlay;
     case 'article': return Newspaper;
     case 'pdf': return FileText;
     case 'wikipedia': return BookOpen;
@@ -56,24 +58,6 @@ const getModalModeLabelForScore = (mode: string, score: number) => {
   return `${(score * 100).toFixed(1)}% MATCH`;
 };
 
-const renderContextBadge = (selectedSubjects: any[], t: any) => {
-  if (!selectedSubjects || selectedSubjects.length === 0) {
-    return <span className="text-red-400 font-medium whitespace-nowrap">{t('sidebar.contexts.none')}</span>;
-  }
-  if (selectedSubjects.length <= 2) {
-    return selectedSubjects.map(s => (
-      <span key={s.id} className="px-2 py-0.5 rounded bg-zinc-800 border border-zinc-700 text-zinc-300 text-xs truncate max-w-[180px]" title={s.name}>
-        {s.name}
-      </span>
-    ));
-  }
-  return (
-    <span className="px-2 py-0.5 rounded bg-zinc-800 border border-zinc-700 text-zinc-300 text-xs whitespace-nowrap">
-      {selectedSubjects.length} {t('sidebar.contexts.title')}
-    </span>
-  );
-};
-
 export function SearchView() {
   const { subjects, selectedSubjects, sources } = useAppContext();
   const { t } = useTranslation();
@@ -87,6 +71,36 @@ export function SearchView() {
   const [copied, setCopied] = useState(false);
   const [searchMode, setSearchMode] = useState<'semantic' | 'bm25' | 'hybrid'>('semantic');
   const [useRerank, setUseRerank] = useState(true);
+  // Local context selection (independent from sidebar). Empty array + searchAll=true => search across all contexts.
+  const [searchSubjectIds, setSearchSubjectIds] = useState<string[]>(() => selectedSubjects.map(s => s.id));
+  const [searchAll, setSearchAll] = useState<boolean>(true);
+  const [isContextsOpen, setIsContextsOpen] = useState(false);
+  const [contextsFilter, setContextsFilter] = useState('');
+  const contextsRef = React.useRef<HTMLDivElement>(null);
+
+  const filteredSubjects = React.useMemo(
+    () => subjects.filter(s => s.name.toLowerCase().includes(contextsFilter.toLowerCase())),
+    [subjects, contextsFilter]
+  );
+
+  // Close contexts popover on click outside
+  useEffect(() => {
+    const onClick = (e: MouseEvent) => {
+      if (contextsRef.current && !contextsRef.current.contains(e.target as Node)) {
+        setIsContextsOpen(false);
+      }
+    };
+    document.addEventListener('mousedown', onClick);
+    return () => document.removeEventListener('mousedown', onClick);
+  }, []);
+
+  const contextsLabel = searchAll
+    ? t('sidebar.contexts.title_all')
+    : searchSubjectIds.length === 0
+      ? t('sidebar.contexts.none')
+      : searchSubjectIds.length === 1
+        ? subjects.find(s => s.id === searchSubjectIds[0])?.name || t('common.selected_one')
+        : t('common.selected', { count: searchSubjectIds.length });
 
   const sourceMap = React.useMemo(() => {
     const map = new Map<string, string>();
@@ -98,14 +112,15 @@ export function SearchView() {
   }, [sources]);
 
   const runSearch = useCallback(async (currentQuery: string, currentMode: string, currentUseRerank: boolean) => {
-    if (!currentQuery.trim() || selectedSubjects.length === 0) return;
+    if (!currentQuery.trim()) return;
+    if (!searchAll && searchSubjectIds.length === 0) return;
 
     setIsSearching(true);
     setHasSearched(true);
     setResults([]);
 
     try {
-      const subjectIds = selectedSubjects.length > 0 ? selectedSubjects.map(s => s.id) : undefined;
+      const subjectIds = searchAll ? undefined : searchSubjectIds;
       const data = await api.search(currentQuery, topK, subjectIds, currentMode, currentUseRerank);
 
       const mappedResults: SearchResult[] = data.results.map((res: any) => {
@@ -136,7 +151,7 @@ export function SearchView() {
     } finally {
       setIsSearching(false);
     }
-  }, [selectedSubjects, topK, sourceMap, t]);
+  }, [searchAll, searchSubjectIds, topK, sourceMap, subjects, t]);
 
   // Re-run search automatically when the mode or useRerank changes (only if a search was already performed)
   useEffect(() => {
@@ -158,104 +173,187 @@ export function SearchView() {
       transition={{ duration: 0.4 }}
       className="h-full flex flex-col overflow-hidden"
     >
-      {/* Header & Search Bar */}
-      <div className="p-8 pb-4 max-w-5xl mx-auto w-full flex-shrink-0">
-        <motion.div 
-          initial={{ opacity: 0, y: -10 }}
+      {/* Header */}
+      <div className="px-8 pt-10 pb-6 max-w-4xl mx-auto w-full flex-shrink-0">
+        <motion.div
+          initial={{ opacity: 0, y: -8 }}
           animate={{ opacity: 1, y: 0 }}
-          transition={{ delay: 0.1, duration: 0.4 }}
-          className="mb-8 text-center"
+          transition={{ duration: 0.4 }}
+          className="mb-6"
         >
-          <h2 className="text-3xl font-bold text-white tracking-tight mb-3">{t('search.title')}</h2>
-          <p className="text-zinc-400">
-            {t('search.subtitle')}
-          </p>
+          <h2 className="text-2xl font-semibold text-white tracking-tight">{t('search.title')}</h2>
+          <p className="text-sm text-zinc-500 mt-1">{t('search.subtitle')}</p>
         </motion.div>
 
-        <motion.form 
-          initial={{ opacity: 0, scale: 0.98 }}
-          animate={{ opacity: 1, scale: 1 }}
-          transition={{ delay: 0.2, duration: 0.4 }}
-          onSubmit={handleSearch} 
-          className="relative max-w-4xl mx-auto"
+        {/* Search Input — pill */}
+        <motion.form
+          initial={{ opacity: 0, y: 4 }}
+          animate={{ opacity: 1, y: 0 }}
+          transition={{ delay: 0.1, duration: 0.4 }}
+          onSubmit={handleSearch}
+          className="relative"
         >
-          <div className="relative flex items-center bg-zinc-900/60 backdrop-blur-xl border border-white/10 hover:border-white/20 focus-within:border-emerald-500/50 focus-within:ring-4 focus-within:ring-emerald-500/10 rounded-2xl shadow-2xl transition-all p-2">
-            <Search className="w-6 h-6 text-zinc-400 ml-4" />
+          <div className="group relative flex items-center h-14 bg-zinc-900/60 backdrop-blur-xl border border-white/10 hover:border-white/15 focus-within:border-emerald-500/40 focus-within:ring-4 focus-within:ring-emerald-500/5 rounded-full transition-all pl-5 pr-2">
+            <Search className="w-5 h-5 text-zinc-500 flex-shrink-0" />
             <input
               type="text"
               value={query}
               onChange={(e) => setQuery(e.target.value)}
               placeholder={t('search.placeholder')}
-              className="w-full bg-transparent text-xl text-zinc-100 placeholder:text-zinc-600 px-5 py-4 focus:outline-none font-sans"
+              className="w-full bg-transparent text-base text-zinc-100 placeholder:text-zinc-600 px-4 focus:outline-none"
             />
             <button
               type="submit"
-              disabled={!query.trim() || isSearching || selectedSubjects.length === 0}
-              className="group px-8 py-4 bg-emerald-500 text-black font-bold rounded-xl hover:bg-emerald-400 disabled:opacity-50 disabled:hover:bg-emerald-500 transition-all shadow-[0_0_20px_rgba(16,185,129,0.2)] active:scale-95 flex items-center gap-3 uppercase tracking-wider text-sm"
+              disabled={!query.trim() || isSearching || (!searchAll && searchSubjectIds.length === 0)}
+              className="h-10 px-5 bg-emerald-500 text-black text-sm font-semibold rounded-full hover:bg-emerald-400 disabled:opacity-40 disabled:cursor-not-allowed transition-all active:scale-[0.97] flex items-center gap-2 flex-shrink-0"
             >
               {isSearching ? (
-                <Loader2 className="w-5 h-5 animate-spin" />
+                <Loader2 className="w-4 h-4 animate-spin" />
               ) : (
-                <Search className="w-5 h-5 transition-transform group-hover:scale-110" />
+                <Search className="w-4 h-4" />
               )}
-              {isSearching ? t('common.actions.syncing') : t('common.actions.search')}
+              <span>{isSearching ? t('common.actions.syncing') : t('common.actions.search')}</span>
             </button>
           </div>
         </motion.form>
 
-        {/* Search Controls */}
-        <motion.div 
+        {/* Toolbar */}
+        <motion.div
           initial={{ opacity: 0 }}
           animate={{ opacity: 1 }}
-          transition={{ delay: 0.3, duration: 0.5 }}
-          className="max-w-4xl mx-auto mt-6 flex flex-wrap items-center justify-between gap-4"
+          transition={{ delay: 0.2, duration: 0.4 }}
+          className="mt-4 flex items-center gap-2 min-w-0"
         >
-          <div className="flex items-center gap-2 text-sm min-w-0 flex-1">
-            <Database className="w-4 h-4 text-zinc-500 flex-shrink-0" />
-            <span className="text-zinc-400 whitespace-nowrap flex-shrink-0">{t('search.results.source')}:</span>
-            <div className="flex gap-1.5 flex-nowrap overflow-hidden min-w-0">
-              {renderContextBadge(selectedSubjects, t)}
-            </div>
+          {/* Contexts chip — opens its own popover */}
+          <div ref={contextsRef} className="relative">
+            <button
+              type="button"
+              onClick={() => setIsContextsOpen(v => !v)}
+              className={`h-9 inline-flex items-center gap-2 px-3 rounded-full text-xs font-medium transition-all ring-1 ${
+                searchAll
+                  ? 'bg-emerald-500/10 text-emerald-300 ring-emerald-500/30 hover:bg-emerald-500/15'
+                  : 'bg-white/[0.04] text-zinc-200 ring-white/10 hover:bg-white/[0.07]'
+              }`}
+            >
+              <Database className="w-3.5 h-3.5" />
+              <span className="max-w-[200px] truncate">{contextsLabel}</span>
+              <ChevronDown className={`w-3.5 h-3.5 opacity-60 transition-transform ${isContextsOpen ? 'rotate-180' : ''}`} />
+            </button>
+
+            <AnimatePresence>
+              {isContextsOpen && (
+                <motion.div
+                  initial={{ opacity: 0, y: -6, scale: 0.96 }}
+                  animate={{ opacity: 1, y: 0, scale: 1 }}
+                  exit={{ opacity: 0, y: -6, scale: 0.96 }}
+                  transition={{ duration: 0.12 }}
+                  className="absolute top-full left-0 mt-2 w-72 bg-zinc-900/95 backdrop-blur-xl ring-1 ring-white/10 rounded-2xl shadow-2xl overflow-hidden z-30"
+                >
+                  {/* Search input */}
+                  <div className="p-2 border-b border-white/5">
+                    <div className="relative">
+                      <Search className="w-3.5 h-3.5 text-zinc-500 absolute left-2.5 top-1/2 -translate-y-1/2 pointer-events-none" />
+                      <input
+                        autoFocus
+                        type="text"
+                        value={contextsFilter}
+                        onChange={(e) => setContextsFilter(e.target.value)}
+                        placeholder={t('common.actions.search') + '...'}
+                        className="w-full h-8 pl-8 pr-2 bg-white/[0.04] ring-1 ring-white/10 rounded-lg text-xs text-zinc-200 placeholder:text-zinc-600 focus:outline-none focus:ring-emerald-500/30"
+                      />
+                    </div>
+                  </div>
+                  <div className="p-1.5">
+                    <button
+                      onClick={() => { setSearchAll(true); setIsContextsOpen(false); }}
+                      className={`w-full flex items-center justify-between gap-2 px-3 py-2 rounded-xl text-xs transition-colors ${
+                        searchAll ? 'bg-emerald-500/10 text-emerald-300' : 'text-zinc-200 hover:bg-white/5'
+                      }`}
+                    >
+                      <span className="flex items-center gap-2 font-medium">
+                        <Database className="w-3.5 h-3.5" />
+                        {t('sidebar.contexts.title')} • All
+                      </span>
+                      {searchAll && <Check className="w-3.5 h-3.5" />}
+                    </button>
+                  </div>
+                  <div className="h-px bg-white/5" />
+                  <div className="max-h-64 overflow-y-auto custom-scrollbar p-1.5">
+                    {filteredSubjects.length === 0 ? (
+                      <div className="px-3 py-6 text-center text-xs text-zinc-500">{t('sidebar.contexts.none')}</div>
+                    ) : (
+                      filteredSubjects.map(s => {
+                        const checked = !searchAll && searchSubjectIds.includes(s.id);
+                        return (
+                          <button
+                            key={s.id}
+                            onClick={() => {
+                              setSearchAll(false);
+                              setSearchSubjectIds(ids =>
+                                ids.includes(s.id) ? ids.filter(i => i !== s.id) : [...ids, s.id]
+                              );
+                            }}
+                            className={`w-full flex items-center justify-between gap-2 px-3 py-2 rounded-xl text-xs transition-colors ${
+                              checked ? 'bg-white/[0.06] text-zinc-100' : 'text-zinc-300 hover:bg-white/5'
+                            }`}
+                          >
+                            <span className="flex items-center gap-2 min-w-0">
+                              <SubjectIcon iconName={s.icon} className="w-3.5 h-3.5" />
+                              <span className="truncate">{s.name}</span>
+                            </span>
+                            <span className={`w-4 h-4 rounded-md flex items-center justify-center flex-shrink-0 ${
+                              checked ? 'bg-emerald-500 text-black' : 'ring-1 ring-white/15'
+                            }`}>
+                              {checked && <Check className="w-3 h-3" strokeWidth={3} />}
+                            </span>
+                          </button>
+                        );
+                      })
+                    )}
+                  </div>
+                </motion.div>
+              )}
+            </AnimatePresence>
           </div>
 
-          <div className="flex items-center gap-3 flex-shrink-0">
-            {/* Top K Selector */}
+          {/* spacer pushes options to the right */}
+          <div className="flex-1 min-w-0" />
+
+          {/* Options group */}
+          <div className="flex items-center gap-2 flex-shrink-0">
+            {/* Top K */}
             <div className="relative">
               <button
                 type="button"
                 onClick={() => setIsTopKOpen(!isTopKOpen)}
                 onBlur={() => setTimeout(() => setIsTopKOpen(false), 150)}
-                className="flex items-center gap-2 bg-zinc-900/80 border border-zinc-800 hover:border-zinc-700 rounded-lg p-1.5 px-3 h-[34px] transition-colors"
+                className="h-9 inline-flex items-center gap-1.5 px-3 rounded-full text-xs text-zinc-300 bg-white/[0.04] ring-1 ring-white/10 hover:bg-white/[0.07] transition-all"
+                title={t('search.options.topKn')}
               >
-                <ListOrdered className="w-3.5 h-3.5 text-zinc-500" />
-                <span className="text-xs text-zinc-400 font-medium">{t('search.options.topKn')}:</span>
-                <span className="text-xs text-zinc-200 font-medium w-4 text-left">{topK}</span>
-                <ChevronDown className={`w-3.5 h-3.5 text-zinc-500 transition-transform duration-200 ${isTopKOpen ? 'rotate-180' : ''}`} />
+                <ListOrdered className="w-3.5 h-3.5 text-zinc-500 flex-shrink-0" />
+                <span className="font-medium tabular-nums">{topK}</span>
+                <ChevronDown className={`w-3 h-3 text-zinc-500 flex-shrink-0 transition-transform ${isTopKOpen ? 'rotate-180' : ''}`} />
               </button>
-              
               <AnimatePresence>
                 {isTopKOpen && (
-                  <motion.div 
-                    initial={{ opacity: 0, y: -10, scale: 0.95 }}
+                  <motion.div
+                    initial={{ opacity: 0, y: -6, scale: 0.96 }}
                     animate={{ opacity: 1, y: 0, scale: 1 }}
-                    exit={{ opacity: 0, y: -10, scale: 0.95 }}
-                    transition={{ duration: 0.1 }}
-                    className="absolute top-full right-0 mt-1 w-24 bg-zinc-900 border border-zinc-800 rounded-lg shadow-xl overflow-hidden z-20"
+                    exit={{ opacity: 0, y: -6, scale: 0.96 }}
+                    transition={{ duration: 0.12 }}
+                    className="absolute top-full right-0 mt-2 w-28 bg-zinc-900/95 backdrop-blur-xl ring-1 ring-white/10 rounded-xl shadow-2xl overflow-hidden z-20 p-1"
                   >
                     {[3, 5, 10, 20, 50].map((val) => (
                       <button
                         key={val}
-                        onClick={() => {
-                          setTopK(val);
-                          setIsTopKOpen(false);
-                        }}
-                        className={`w-full text-left px-3 py-2 text-xs transition-colors ${
-                          topK === val 
-                            ? 'bg-emerald-500/10 text-emerald-400 font-medium' 
-                            : 'text-zinc-300 hover:bg-zinc-800/80 hover:text-white'
+                        onClick={() => { setTopK(val); setIsTopKOpen(false); }}
+                        className={`w-full text-left px-3 py-1.5 text-xs rounded-lg transition-colors ${
+                          topK === val
+                            ? 'bg-emerald-500/10 text-emerald-300'
+                            : 'text-zinc-300 hover:bg-white/5'
                         }`}
                       >
-                        {val}
+                        {val} results
                       </button>
                     ))}
                   </motion.div>
@@ -263,81 +361,55 @@ export function SearchView() {
               </AnimatePresence>
             </div>
 
-            {/* Re-rank Toggle */}
+            {/* Re-rank — icon toggle */}
             <button
               type="button"
               onClick={() => setUseRerank(!useRerank)}
-              className={`flex items-center gap-2 px-3 py-1.5 rounded-lg text-xs font-medium transition-all border flex-shrink-0 ${
-                useRerank 
-                  ? 'bg-emerald-500/10 text-emerald-400 border border-emerald-500/30 shadow-[0_0_10px_rgba(16,185,129,0.1)]' 
-                  : 'bg-zinc-900/80 text-zinc-500 border-zinc-800 hover:border-zinc-700 hover:text-zinc-300'
-              }`}
               title={t('search.options.rerank')}
-            >
-              <ArrowUpDown className="w-3.5 h-3.5" />
-              {t('common.actions.filter')} {useRerank ? 'ON' : 'OFF'}
-            </button>
-
-            {/* Search Mode Toggle */}
-            <div className="flex flex-nowrap items-center gap-1 bg-zinc-900/80 border border-zinc-800 rounded-lg p-1 flex-shrink-0">
-            {/* Semantic */}
-            <button
-              type="button"
-              onClick={() => setSearchMode('semantic')}
-              className={`flex items-center gap-2 px-3 py-1.5 rounded-md text-xs font-medium transition-all whitespace-nowrap ${
-                searchMode === 'semantic'
-                  ? 'bg-zinc-800 text-emerald-400 shadow-sm border border-zinc-700/50'
-                  : 'text-zinc-500 hover:text-zinc-300'
-              }`}
-            >
-              <Sparkles className="w-3.5 h-3.5" />
-              {t('search.modes.semantic')}
-            </button>
-
-            {/* Keyword (BM25) */}
-            <button
-              type="button"
-              onClick={() => setSearchMode('bm25')}
-              className={`flex items-center gap-2 px-3 py-1.5 rounded-md text-xs font-medium transition-all whitespace-nowrap ${
-                searchMode === 'bm25'
-                  ? 'bg-zinc-800 text-sky-400 shadow-sm border border-zinc-700/50'
-                  : 'text-zinc-500 hover:text-zinc-300'
-              }`}
-            >
-              <TextSearch className="w-3.5 h-3.5" />
-              {t('search.modes.bm25')}
-            </button>
-
-            {/* Hybrid */}
-            <button
-              type="button"
-              onClick={() => setSearchMode('hybrid')}
-              className={`flex items-center gap-2 px-3 py-1.5 rounded-md text-xs font-medium transition-all whitespace-nowrap ${
-                searchMode === 'hybrid'
-                  ? 'bg-zinc-800 text-violet-400 shadow-sm border border-zinc-700/50'
-                  : 'text-zinc-500 hover:text-zinc-300'
+              aria-pressed={useRerank}
+              className={`h-9 w-9 inline-flex items-center justify-center rounded-full transition-all ring-1 ${
+                useRerank
+                  ? 'bg-emerald-500/10 text-emerald-300 ring-emerald-500/30'
+                  : 'bg-white/[0.03] text-zinc-500 ring-white/10 hover:bg-white/[0.06] hover:text-zinc-300'
               }`}
             >
-              <SlidersHorizontal className="w-3.5 h-3.5" />
-              {t('search.modes.hybrid')}
+              <ArrowUpDown className="w-4 h-4" />
             </button>
 
-            {/* MMR (Disabled — needs dedicated implementation) */}
-            <div className="relative group">
-              <button 
+            {/* Mode — icon-only segmented control */}
+            <div className="h-9 inline-flex items-center p-1 rounded-full bg-white/[0.04] ring-1 ring-white/10">
+              {[
+                { key: 'semantic', icon: Sparkles, label: t('search.modes.semantic'), color: 'text-emerald-300' },
+                { key: 'bm25', icon: TextSearch, label: t('search.modes.bm25'), color: 'text-sky-300' },
+                { key: 'hybrid', icon: SlidersHorizontal, label: t('search.modes.hybrid'), color: 'text-violet-300' },
+              ].map(({ key, icon: Icon, label, color }) => {
+                const active = searchMode === key;
+                return (
+                  <button
+                    key={key}
+                    type="button"
+                    onClick={() => setSearchMode(key as any)}
+                    title={label}
+                    className={`h-7 inline-flex items-center gap-1.5 px-2.5 rounded-full text-xs font-medium transition-all whitespace-nowrap ${
+                      active ? `bg-zinc-800 ${color} shadow-sm` : 'text-zinc-500 hover:text-zinc-300'
+                    }`}
+                  >
+                    <Icon className="w-3.5 h-3.5 flex-shrink-0" />
+                    {label}
+                  </button>
+                );
+              })}
+              <button
+                type="button"
                 disabled
-                className="flex items-center gap-2 px-3 py-1.5 rounded-md text-zinc-500 text-xs font-medium cursor-not-allowed opacity-70 whitespace-nowrap"
+                title={`${t('ingestion.coming_soon.title')} (MMR)`}
+                className="h-7 inline-flex items-center gap-1.5 px-2.5 rounded-full text-xs font-medium text-zinc-600 cursor-not-allowed whitespace-nowrap"
               >
                 <Network className="w-3.5 h-3.5" />
                 MMR
-                <Lock className="w-3 h-3 ml-1" />
               </button>
-              <div className="absolute bottom-full left-1/2 -translate-x-1/2 mb-2 w-48 p-2 bg-zinc-800 border border-zinc-700 rounded-lg text-[10px] text-zinc-300 text-center opacity-0 group-hover:opacity-100 transition-opacity pointer-events-none shadow-xl z-10">
-                {t('ingestion.coming_soon.title')} (MMR)
-              </div>
             </div>
           </div>
-          </div>
         </motion.div>
       </div>
       {/* Results Area */}
diff --git a/frontend/src/components/SubjectIcon.tsx b/frontend/src/components/SubjectIcon.tsx
new file mode 100644
index 00000000..7e9f76ab
--- /dev/null
+++ b/frontend/src/components/SubjectIcon.tsx
@@ -0,0 +1,56 @@
+import React from 'react';
+import {
+  Brain, Briefcase, ChefHat, Cpu, Landmark, Lightbulb, Activity, Hash,
+  Database, Book, Globe, Zap, Shield, Search, Code, MessageSquare, Layout,
+  Layers, HardDrive, Cloud, Lock, User, Users, Target, Award, GraduationCap,
+  Music, Video, Image, FileText, Mail, Terminal, Bug
+} from 'lucide-react';
+
+export const ICONS_LIST = [
+  { name: 'Brain', icon: Brain },
+  { name: 'Briefcase', icon: Briefcase },
+  { name: 'ChefHat', icon: ChefHat },
+  { name: 'Cpu', icon: Cpu },
+  { name: 'Landmark', icon: Landmark },
+  { name: 'Lightbulb', icon: Lightbulb },
+  { name: 'Activity', icon: Activity },
+  { name: 'Hash', icon: Hash },
+  { name: 'Database', icon: Database },
+  { name: 'Book', icon: Book },
+  { name: 'Globe', icon: Globe },
+  { name: 'Zap', icon: Zap },
+  { name: 'Shield', icon: Shield },
+  { name: 'Search', icon: Search },
+  { name: 'Code', icon: Code },
+  { name: 'MessageSquare', icon: MessageSquare },
+  { name: 'Layout', icon: Layout },
+  { name: 'Layers', icon: Layers },
+  { name: 'HardDrive', icon: HardDrive },
+  { name: 'Cloud', icon: Cloud },
+  { name: 'Lock', icon: Lock },
+  { name: 'User', icon: User },
+  { name: 'Users', icon: Users },
+  { name: 'Target', icon: Target },
+  { name: 'Award', icon: Award },
+  { name: 'GraduationCap', icon: GraduationCap },
+  { name: 'Music', icon: Music },
+  { name: 'Video', icon: Video },
+  { name: 'Image', icon: Image },
+  { name: 'FileText', icon: FileText },
+  { name: 'Mail', icon: Mail },
+  { name: 'Terminal', icon: Terminal },
+  { name: 'Bug', icon: Bug },
+];
+
+interface SubjectIconProps {
+  readonly iconName?: string;
+  readonly className?: string;
+  readonly size?: number;
+}
+
+export function SubjectIcon({ iconName, className, size }: SubjectIconProps) {
+  const item = ICONS_LIST.find(i => i.name === iconName);
+  const IconComponent = item ? item.icon : Hash;
+  
+  return <IconComponent className={className} size={size} />;
+}
diff --git a/frontend/src/locales/en.json b/frontend/src/locales/en.json
index 78787084..b9545ced 100644
--- a/frontend/src/locales/en.json
+++ b/frontend/src/locales/en.json
@@ -22,6 +22,9 @@
       "selectAll": "Select all",
       "deselectAll": "Deselect all"
     },
+    "selected_one": "1 selected",
+    "selected_other": "{{count}} selected",
+    "all": "All",
     "errors": {
       "general": "An unexpected error occurred. Please try again."
     },
@@ -58,6 +61,7 @@
     },
     "contexts": {
       "title": "Knowledge Contexts",
+      "title_all": "Knowledge Contexts • All",
       "placeholder": "Filter contexts...",
       "none": "No contexts found.",
       "only": "Only",
diff --git a/frontend/src/locales/pt-BR.json b/frontend/src/locales/pt-BR.json
index 98317080..d95bf03e 100644
--- a/frontend/src/locales/pt-BR.json
+++ b/frontend/src/locales/pt-BR.json
@@ -22,6 +22,9 @@
       "selectAll": "Selecionar todos",
       "deselectAll": "Desmarcar todos"
     },
+    "selected_one": "1 selecionado",
+    "selected_other": "{{count}} selecionados",
+    "all": "Todos",
     "errors": {
       "general": "Ocorreu um erro inesperado. Tente novamente."
     },
@@ -58,6 +61,7 @@
     },
     "contexts": {
       "title": "Bases de Conhecimento",
+      "title_all": "Bases de Conhecimento • Todas",
       "placeholder": "Filtrar contextos...",
       "none": "Nenhum contexto encontrado.",
       "only": "Apenas",

From f9de053d331b7bff211e8e48b2001fcc5e9850bf Mon Sep 17 00:00:00 2001
From: ericksonlopes <ofc.erickson@gmail.com>
Date: Wed, 8 Apr 2026 12:04:03 -0300
Subject: [PATCH 3/7] feat: implementation of chunk duplication tests and
 sidebar UX improvements

---
 alembic/env.py                                |   6 +-
 ...c845_add_chunk_duplicates_table_and_is_.py |  42 +++
 ...673_add_content_source_id_to_duplicates.py |  32 ++
 docs/issues/issue-duplication-tests-ux.md     |  14 +
 frontend/src/App.tsx                          | 106 ++-----
 frontend/src/components/DiarizationView.tsx   |  84 ++---
 frontend/src/components/DuplicatesView.tsx    | 289 ++++++++++++++++++
 frontend/src/components/Sidebar.tsx           | 143 ++++++---
 frontend/src/components/SidebarContext.tsx    | 180 +++++++++++
 frontend/src/locales/en.json                  |  26 ++
 frontend/src/locales/pt-BR.json               |  34 ++-
 frontend/src/services/api.ts                  |  56 +++-
 frontend/src/store/AppContext.tsx             |   8 +-
 frontend/src/types.ts                         |  17 +-
 main.py                                       |   9 +
 scripts/clear_sql_db.py                       |   2 +-
 scripts/dump_database.py                      |   2 +-
 scripts/migrate_vector_db.py                  |   2 +-
 src/application/workers.py                    | 147 ++++++++-
 src/domain/entities/chunk_duplicate_entity.py |  17 ++
 src/infrastructure/connectors/__init__.py     |   0
 .../connector_sql.py}                         |   1 -
 .../extractors/youtube_extractor.py           |  35 ++-
 .../sql/chunk_duplicate_repository.py         |  98 ++++++
 .../sql/chunk_index_repository.py             |  21 +-
 .../sql/content_source_repository.py          |   2 +-
 .../sql/diarization_repository.py             |  14 +-
 .../sql/ingestion_job_repository.py           |   2 +-
 .../sql/knowledge_subject_repository.py       |   2 +-
 .../sql/models/chunk_duplicate.py             |  39 +++
 .../repositories/sql/models/chunk_index.py    |   4 +-
 .../repositories/sql/models/content_source.py |   2 +-
 .../sql/models/diarization_record.py          |   2 +-
 .../repositories/sql/models/ingestion_job.py  |   2 +-
 .../sql/models/knowledge_subject.py           |   2 +-
 .../repositories/sql/models/user.py           |   2 +-
 .../repositories/sql/models/voice_record.py   |   2 +-
 .../repositories/sql/user_repository.py       |   2 +-
 .../services/chunk_duplicate_service.py       | 156 ++++++++++
 src/presentation/api/dependencies.py          |  18 +-
 ...udio_diarization_and_recognition_router.py |   6 +-
 .../api/routes/duplicate_router.py            |  96 ++++++
 src/presentation/api/routes/ingest_router.py  |  28 +-
 .../api/routes/settings_router.py             |   2 +-
 .../api/schemas/duplicate_schemas.py          |  35 +++
 test_dispatcher.py                            |  60 ++++
 test_playlist.py                              |  25 ++
 tests/conftest.py                             |   2 +-
 .../sql/test_chunk_duplicate_repository.py    |  90 ++++++
 .../services/test_chunk_duplicate_service.py  |  86 ++++++
 .../api/routes/test_duplicate_router.py       |  74 +++++
 tmp/cleanup_db.py                             |  27 ++
 52 files changed, 1902 insertions(+), 251 deletions(-)
 create mode 100644 alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py
 create mode 100644 alembic/versions/84524e052673_add_content_source_id_to_duplicates.py
 create mode 100644 docs/issues/issue-duplication-tests-ux.md
 create mode 100644 frontend/src/components/DuplicatesView.tsx
 create mode 100644 frontend/src/components/SidebarContext.tsx
 create mode 100644 src/domain/entities/chunk_duplicate_entity.py
 create mode 100644 src/infrastructure/connectors/__init__.py
 rename src/infrastructure/{repositories/sql/connector.py => connectors/connector_sql.py} (95%)
 create mode 100644 src/infrastructure/repositories/sql/chunk_duplicate_repository.py
 create mode 100644 src/infrastructure/repositories/sql/models/chunk_duplicate.py
 create mode 100644 src/infrastructure/services/chunk_duplicate_service.py
 create mode 100644 src/presentation/api/routes/duplicate_router.py
 create mode 100644 src/presentation/api/schemas/duplicate_schemas.py
 create mode 100644 test_dispatcher.py
 create mode 100644 test_playlist.py
 create mode 100644 tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py
 create mode 100644 tests/infrastructure/services/test_chunk_duplicate_service.py
 create mode 100644 tests/presentation/api/routes/test_duplicate_router.py
 create mode 100644 tmp/cleanup_db.py

diff --git a/alembic/env.py b/alembic/env.py
index 4f8e48f2..286fa357 100644
--- a/alembic/env.py
+++ b/alembic/env.py
@@ -10,7 +10,7 @@
 
 from alembic import context
 from src.config.settings import settings
-from src.infrastructure.repositories.sql.connector import Base
+from src.infrastructure.connectors.connector_sql import Base
 
 _package_name = "src.infrastructure.repositories.sql.models"
 
@@ -58,7 +58,7 @@ def include_object(obj, name, type_, reflected, compare_to):
 @writer.rewrites(ops.CreateTableOp)
 @writer.rewrites(ops.CreateIndexOp)
 def add_if_not_exists(context, revision, op):
-    if not context.as_batch:
+    if not getattr(context, "as_batch", False):
         op.if_not_exists = True
     return op
 
@@ -66,7 +66,7 @@ def add_if_not_exists(context, revision, op):
 @writer.rewrites(ops.DropTableOp)
 @writer.rewrites(ops.DropIndexOp)
 def add_if_exists(context, revision, op):
-    if not context.as_batch:
+    if not getattr(context, "as_batch", False):
         op.if_exists = True
     return op
 
diff --git a/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py b/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py
new file mode 100644
index 00000000..65229188
--- /dev/null
+++ b/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py
@@ -0,0 +1,42 @@
+"""Add chunk duplicates table and is_active flag
+
+Revision ID: 646a175ac845
+Revises: b2c3d4e5f6a7
+Create Date: 2026-04-08 09:56:58.625813
+
+"""
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = '646a175ac845'
+down_revision: Union[str, Sequence[str], None] = 'b2c3d4e5f6a7'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('chunk_duplicates',
+        sa.Column('id', sa.UUID(), nullable=False),
+        sa.Column('chunk_ids', sa.JSON(), nullable=False),
+        sa.Column('similarity', sa.Float(), nullable=False),
+        sa.Column('status', sa.Text(), nullable=False),
+        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+        sa.PrimaryKeyConstraint('id')
+    )
+    op.add_column('chunk_index', sa.Column('is_active', sa.Boolean(), server_default=sa.text('1'), nullable=False))
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('chunk_index', 'is_active')
+    op.drop_table('chunk_duplicates')
+    # ### end Alembic commands ###
diff --git a/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py b/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py
new file mode 100644
index 00000000..f0ac6124
--- /dev/null
+++ b/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py
@@ -0,0 +1,32 @@
+"""add_content_source_id_to_duplicates
+
+Revision ID: 84524e052673
+Revises: 646a175ac845
+Create Date: 2026-04-08 10:50:39.027257
+
+"""
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = '84524e052673'
+down_revision: Union[str, Sequence[str], None] = '646a175ac845'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    with op.batch_alter_table('chunk_duplicates', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('content_source_id', sa.UUID(), nullable=True))
+        batch_op.create_foreign_key('fk_chunk_duplicates_content_source_id_content_sources', 'content_sources', ['content_source_id'], ['id'], initially='IMMEDIATE', deferrable=True)
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    with op.batch_alter_table('chunk_duplicates', schema=None) as batch_op:
+        batch_op.drop_constraint('fk_chunk_duplicates_content_source_id_content_sources', type_='foreignkey')
+        batch_op.drop_column('content_source_id')
diff --git a/docs/issues/issue-duplication-tests-ux.md b/docs/issues/issue-duplication-tests-ux.md
new file mode 100644
index 00000000..5a712623
--- /dev/null
+++ b/docs/issues/issue-duplication-tests-ux.md
@@ -0,0 +1,14 @@
+## Description
+Implemented a comprehensive test suite for the chunk duplication feature, covering repository, service, and API layers. Additionally, improved the sidebar UX by enabling simple-toggle multi-selection, fixing indicator icon bugs, and adding a search-by-name field for subjects.
+
+## Tasks
+- [x] Create SQL repository tests for chunk duplicates `tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py`
+- [x] Create service tests for duplicate detection logic `tests/infrastructure/services/test_chunk_duplicate_service.py`
+- [x] Create API router tests for duplicate endpoints `tests/presentation/api/routes/test_duplicate_router.py`
+- [x] Update `SidebarContext.tsx` to enable simple toggle selection for multiple bases.
+- [x] Fix Check icon bug in multi-selection in `SidebarContext.tsx`.
+- [x] Add search filter field in `SidebarContext.tsx`.
+- [x] Fix `tests/conftest.py` import path for infrastructure.
+
+## Additional Context
+The sidebar changes eliminate the need for Ctrl+Click, making the multi-knowledge selection more discoverable. The search field ensures usability as the number of knowledge bases grows.
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index 140e0da9..7343c312 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -34,6 +34,8 @@ import {ContentSource} from './types';
 import {ChatView} from './components/ChatView';
 import {KnowledgeAdminView} from './components/KnowledgeAdminView';
 import {QueueMonitorView} from './components/QueueMonitorView';
+import {DuplicatesView} from './components/DuplicatesView';
+import {SidebarContext} from './components/SidebarContext';
 
 
 function ActivityMonitorView() {
@@ -307,10 +309,10 @@ function ContentSourcesView() {
   const filteredSources = React.useMemo(() => {
     let result = sources;
 
-    // Filter by subject context (Single select in this view)
+    // Filter by subject context (Multi select)
     if (selectedSubjects.length > 0) {
-      const selectedId = selectedSubjects[0].id;
-      result = result.filter(src => src.subjectId === selectedId);
+      const selectedIds = selectedSubjects.map(s => s.id);
+      result = result.filter(src => src.subjectId && selectedIds.includes(src.subjectId));
     }
     
     if (typeFilter !== 'all') {
@@ -423,9 +425,7 @@ function ContentSourcesView() {
 
               <button
                 onClick={() => setIsAddModalOpen(true)}
-                disabled={selectedSubjects.length === 0}
-                className={`flex items-center gap-2 px-4 py-2 rounded-xl font-black uppercase text-[10px] tracking-widest transition-all ${selectedSubjects.length === 0 ? 'bg-zinc-800 text-zinc-500 cursor-not-allowed opacity-50' : 'bg-emerald-500 text-black hover:bg-emerald-400 shadow-[0_0_15px_rgba(16,185,129,0.2)]'}`}
-                title={selectedSubjects.length === 0 ? t('common.hints.select_subject') : ''}
+                className="flex items-center gap-2 px-4 py-2 rounded-xl font-black uppercase text-[10px] tracking-widest transition-all bg-emerald-500 text-black hover:bg-emerald-400 shadow-[0_0_15px_rgba(16,185,129,0.2)]"
               >
                 <Plus className="w-4 h-4 stroke-[3px]" />
                 {t('sources.add_btn')}
@@ -456,71 +456,11 @@ function ContentSourcesView() {
               onSearchSubmit={handleSearchSubmit}
               typeFilter={typeFilter}
               onTypeFilterChange={handleTypeChange}
-              onPageSizeChange={setPageSize}
               emptyMessage="Nenhuma fonte encontrada nesta base ou com esses filtros."
             />
           )}
         </motion.div>
       </div>
-
-      {/* 🚀 FIXED RIGHT SIDEBAR (Experience unified with Diarization) */}
-      <div className="w-80 border-l border-white/5 bg-black/20 backdrop-blur-xl flex flex-col h-full shrink-0 relative z-20">
-        <div className="p-6 border-b border-white/5 flex items-center justify-between">
-          <div className="flex items-center gap-2">
-            <Database className="w-4 h-4 text-emerald-400" />
-            <h3 className="text-xs font-black text-white uppercase tracking-widest">{t('ecosystem.title')}</h3>
-          </div>
-        </div>
-        
-        <div className="p-6 border-b border-white/5 bg-emerald-500/5">
-          <p className="text-[10px] text-emerald-400/70 font-black uppercase tracking-widest leading-relaxed">
-            {t('ecosystem.description')}
-          </p>
-        </div>
-
-        <div className="flex-1 overflow-y-auto p-4 space-y-2 custom-scrollbar">
-          {subjects.map((ctx) => {
-            const isSelected = selectedSubjects.some(s => s.id === ctx.id);
-            return (
-              <button
-                key={ctx.id}
-                onClick={() => selectSubject(ctx)}
-                className={`w-full flex items-center gap-3 p-3 rounded-2xl transition-all border ${isSelected ? 'bg-emerald-500/10 border-emerald-500/30 text-white shadow-[0_0_20px_rgba(16,185,129,0.1)]' : 'bg-transparent border-transparent text-zinc-500 hover:bg-white/5 hover:text-zinc-300'}`}
-              >
-                <div className={`w-8 h-8 rounded-xl flex items-center justify-center transition-all ${isSelected ? 'bg-emerald-500 text-black shadow-lg shadow-emerald-500/20' : 'bg-zinc-900 text-zinc-600'}`}>
-                  <Database className="w-4 h-4" />
-                </div>
-                <div className="text-left flex-1 min-w-0">
-                  <div className={`text-xs font-bold truncate ${isSelected ? 'text-white' : 'text-zinc-400'}`}>
-                    {ctx.name}
-                  </div>
-                  <div className="text-[9px] font-black uppercase tracking-widest opacity-50 mt-0.5">
-                    {ctx.sourceCount || 0} {t('ecosystem.sources_count')}
-                  </div>
-                </div>
-                {isSelected && (
-                  <div className="w-1.5 h-1.5 rounded-full bg-emerald-500 shadow-[0_0_8px_rgba(16,185,129,0.5)]" />
-                )}
-              </button>
-            );
-          })}
-          {subjects.length === 0 && (
-             <div className="py-20 text-center opacity-20">
-                <Database className="w-8 h-8 mx-auto mb-3 text-zinc-500" />
-                <span className="text-[9px] font-black uppercase tracking-widest text-zinc-500">{t('ecosystem.no_base')}</span>
-             </div>
-          )}
-        </div>
-
-        <div className="p-6 border-t border-white/5 mt-auto">
-           <div className="flex items-center gap-3">
-              <div className="w-2 h-2 rounded-full bg-emerald-500 animate-pulse" />
-              <span className="text-[10px] font-black text-zinc-500 uppercase tracking-widest leading-none mt-0.5">
-                {selectedSubjects.length > 0 ? selectedSubjects[0].name : t('ecosystem.no_active_base')}
-              </span>
-           </div>
-        </div>
-      </div>
     </div>
   );
 }
@@ -589,19 +529,27 @@ function MainContent() {
       </header>
 
       {/* View Router */}
-      <main className="flex-1 overflow-hidden relative">
-        <ErrorBoundary>
-          {currentView === 'activity' && <ActivityMonitorView />}
-          {currentView === 'queue' && <QueueMonitorView />}
-          {currentView === 'sources' && <ContentSourcesView />}
-
-          {currentView === 'chat' && <ChatView />}
-          {currentView === 'search' && <SearchView />}
-          {currentView === 'database' && <ChunksViewer />}
-          {currentView === 'knowledge_contexts' && <KnowledgeAdminView />}
-          {currentView === 'diarization' && <DiarizationView/>}
-          {currentView === 'voice_profiles' && <VoiceProfilesView />}
-        </ErrorBoundary>
+      <main className="flex-1 overflow-hidden relative flex">
+        <div className="flex-1 h-full min-w-0 overflow-y-auto">
+          <ErrorBoundary>
+            {currentView === 'activity' && <ActivityMonitorView />}
+            {currentView === 'queue' && <QueueMonitorView />}
+            {currentView === 'sources' && <ContentSourcesView />}
+
+            {currentView === 'chat' && <ChatView />}
+            {currentView === 'search' && <SearchView />}
+            {currentView === 'database' && <ChunksViewer />}
+            {currentView === 'knowledge_contexts' && <KnowledgeAdminView />}
+            {currentView === 'diarization' && <DiarizationView/>}
+            {currentView === 'voice_profiles' && <VoiceProfilesView />}
+            {currentView === 'duplicates' && <DuplicatesView />}
+          </ErrorBoundary>
+        </div>
+        
+        {/* Global Ecosystem Sidebar for Data operations */}
+        {['sources', 'duplicates', 'diarization'].includes(currentView) && (
+          <SidebarContext />
+        )}
       </main>
 
       <AddContentModal 
diff --git a/frontend/src/components/DiarizationView.tsx b/frontend/src/components/DiarizationView.tsx
index 55a274b3..6d52c819 100644
--- a/frontend/src/components/DiarizationView.tsx
+++ b/frontend/src/components/DiarizationView.tsx
@@ -2,8 +2,7 @@ import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'
 import { useTranslation } from 'react-i18next';
 import { AnimatePresence, motion } from 'motion/react';
 import { 
-    ChevronLeft, 
-    Database 
+    ChevronLeft
 } from 'lucide-react';
 import { useAppContext } from '../store/AppContext';
 import { api } from '../services/api';
@@ -28,8 +27,6 @@ export function DiarizationView() {
     const { t } = useTranslation();
     const { 
         selectedSubjects, 
-        setSelectedSubjects,
-        subjects,
         addToast, 
         refreshJobs, 
         refreshSources,
@@ -66,8 +63,8 @@ export function DiarizationView() {
     const loadJobs = useCallback(async (silent = false) => {
         if (!silent) setIsLoadingJobs(true);
         try {
-            const subjectId = selectedSubjects.length > 0 ? selectedSubjects[0].id : undefined;
-            const data = await api.fetchDiarizations(50, 0, subjectId);
+            const subject_ids = selectedSubjects.map(s => s.id);
+            const data = await api.fetchDiarizations(50, 0, subject_ids);
             const mappedJobs = data.map(mapBackendJob);
             setJobs(mappedJobs);
             return mappedJobs;
@@ -209,10 +206,13 @@ export function DiarizationView() {
             if (url) {
                 const audio = new Audio(url);
                 audioRef.current = audio;
-                audio.onended = () => {
+                
+                const onAudioEnded = () => {
                     setSpeakers(prev => prev.map(s => ({...s, isPlaying: false})));
                     audioRef.current = null;
                 };
+                
+                audio.onended = onAudioEnded;
                 await audio.play();
             }
         } catch (err) {
@@ -442,65 +442,19 @@ export function DiarizationView() {
                 </AnimatePresence>
             </div>
 
-            {/* 🔵 RIGHT SIDEBAR */}
+            {/* 🔵 RIGHT SIDEBAR (Metadata only, Ecosystem is global) */}
             <AnimatePresence mode="wait">
-                <motion.div 
-                    key={viewMode === 'list' ? 'ecosystem' : 'metadata'}
-                    initial={{ opacity: 0, x: 20, width: 0 }}
-                    animate={{ opacity: 1, x: 0, width: 320 }}
-                    exit={{ opacity: 0, x: 20, width: 0 }}
-                    className="border-l border-white/5 bg-black/20 backdrop-blur-xl flex flex-col shrink-0 overflow-hidden"
-                >
-                    {viewMode === 'list' ? (
-                        <div className="w-[320px] flex flex-col h-full">
-                            <div className="p-6 border-b border-white/5 flex items-center justify-between">
-                                <div className="flex items-center gap-2">
-                                    <Database className="w-4 h-4 text-emerald-400" />
-                                    <h3 className="text-xs font-black text-white uppercase tracking-widest">{t('ecosystem.title')}</h3>
-                                </div>
-                            </div>
-                            
-                            <div className="p-6 border-b border-white/5 bg-emerald-500/5">
-                                <p className="text-[10px] text-emerald-400/70 font-black uppercase tracking-widest leading-relaxed">
-                                    {t('ecosystem.description')}
-                                </p>
-                            </div>
-
-                            <div className="flex-1 overflow-y-auto p-4 space-y-2 custom-scrollbar">
-                                {subjects.map((ctx) => {
-                                    const isSelected = selectedSubjects.some(s => s.id === ctx.id);
-                                    return (
-                                        <button
-                                            key={ctx.id}
-                                            onClick={() => {
-                                                const sub = subjects.find(s => s.id === ctx.id);
-                                                if (sub) setSelectedSubjects([sub]);
-                                            }}
-                                            className={`w-full flex items-center gap-3 p-3 rounded-2xl transition-all border ${isSelected ? 'bg-emerald-500/10 border-emerald-500/30 text-white shadow-[0_0_20px_rgba(16,185,129,0.1)]' : 'bg-transparent border-transparent text-zinc-500 hover:bg-white/5 hover:text-zinc-300'}`}
-                                        >
-                                            <div className={`w-8 h-8 rounded-xl flex items-center justify-center transition-all ${isSelected ? 'bg-emerald-500 text-black shadow-lg shadow-emerald-500/20' : 'bg-zinc-900 text-zinc-600'}`}>
-                                                <Database className="w-4 h-4" />
-                                            </div>
-                                            <div className="text-left flex-1 min-w-0">
-                                                <div className={`text-xs font-bold truncate ${isSelected ? 'text-white' : 'text-zinc-400'}`}>
-                                                    {ctx.name}
-                                                </div>
-                                                <div className="text-[9px] font-black uppercase tracking-widest opacity-50 mt-0.5">
-                                                    {ctx.sourceCount || 0} {t('ecosystem.sources_count')}
-                                                </div>
-                                            </div>
-                                            {isSelected && (
-                                                <div className="w-1.5 h-1.5 rounded-full bg-emerald-500 shadow-[0_0_8px_rgba(16,185,129,0.5)]" />
-                                            )}
-                                        </button>
-                                    );
-                                })}
-                            </div>
-                        </div>
-                    ) : (
-                        activeJob && <DiarizationMetadataPanel job={activeJob} onReprocess={handleReprocessJob} />
-                    )}
-                </motion.div>
+                {viewMode === 'detail' && activeJob && (
+                    <motion.div 
+                        key="metadata"
+                        initial={{ opacity: 0, x: 20, width: 0 }}
+                        animate={{ opacity: 1, x: 0, width: 320 }}
+                        exit={{ opacity: 0, x: 20, width: 0 }}
+                        className="border-l border-white/5 bg-black/20 backdrop-blur-xl flex flex-col shrink-0 overflow-hidden"
+                    >
+                        <DiarizationMetadataPanel job={activeJob} onReprocess={handleReprocessJob} />
+                    </motion.div>
+                )}
             </AnimatePresence>
 
             {/* MODALS */}
diff --git a/frontend/src/components/DuplicatesView.tsx b/frontend/src/components/DuplicatesView.tsx
new file mode 100644
index 00000000..c168e4bf
--- /dev/null
+++ b/frontend/src/components/DuplicatesView.tsx
@@ -0,0 +1,289 @@
+import React, { useEffect, useState, useCallback } from 'react';
+import { useTranslation } from 'react-i18next';
+import { 
+  Copy, 
+  Trash2, 
+  CheckCircle, 
+  AlertTriangle, 
+  RefreshCw,
+  ExternalLink,
+  Loader2,
+  Clock,
+  ChevronLeft,
+  ChevronRight
+} from 'lucide-react';
+import { motion } from 'framer-motion';
+import { useAppContext } from '../store/AppContext';
+import { api } from '../services/api';
+import { ChunkDuplicate } from '../types';
+
+export function DuplicatesView() {
+  const { t } = useTranslation();
+  const { addToast, selectedSubjects } = useAppContext();
+  const [duplicates, setDuplicates] = useState<ChunkDuplicate[]>([]);
+  const [isLoading, setIsLoading] = useState(true);
+  const [isSyncing, setIsSyncing] = useState(false);
+  const [filterStatus, setFilterStatus] = useState('pending');
+  const [page, setPage] = useState(1);
+  const [total, setTotal] = useState(0);
+  const pageSize = 10;
+
+  const fetchDuplicates = useCallback(async (isRefresh = false) => {
+    if (isRefresh) setIsSyncing(true);
+    else setIsLoading(true);
+    
+    try {
+      const subject_ids = selectedSubjects.map(s => s.id);
+      const offset = (page - 1) * pageSize;
+      const { results, total: totalCount } = await api.fetchDuplicates(filterStatus, subject_ids, pageSize, offset);
+      setDuplicates(results);
+      setTotal(totalCount);
+    } catch (err) {
+      console.error('Failed to fetch duplicates:', err);
+      addToast(t('common.errors.generic'), 'error');
+    } finally {
+      setIsLoading(false);
+      setIsSyncing(false);
+    }
+  }, [filterStatus, selectedSubjects, addToast, t, page, pageSize]);
+
+  useEffect(() => {
+    fetchDuplicates();
+  }, [fetchDuplicates]);
+
+  // Reset page when filter or subjects change
+  useEffect(() => {
+    setPage(1);
+  }, [filterStatus, selectedSubjects]);
+
+  const handleUpdateStatus = async (id: string, status: string) => {
+    try {
+      await api.updateDuplicateStatus(id, status);
+      addToast(t('common.actions.success'), 'success');
+      fetchDuplicates();
+    } catch (err) {
+      console.error('Failed to update status:', err);
+      addToast(t('common.errors.generic'), 'error');
+    }
+  };
+
+  const handleDeactivate = async (chunkId: string) => {
+    try {
+      await api.deactivateChunk(chunkId);
+      addToast(t('common.actions.success'), 'success');
+      fetchDuplicates();
+    } catch (err) {
+      console.error('Failed to deactivate chunk:', err);
+      addToast(t('common.errors.generic'), 'error');
+    }
+  };
+
+  const totalPages = Math.ceil(total / pageSize);
+
+  const containerVariants = {
+    hidden: { opacity: 0 },
+    show: {
+      opacity: 1,
+      transition: {
+        staggerChildren: 0.1
+      }
+    }
+  };
+
+  const itemVariants = {
+    hidden: { opacity: 0, y: 20 },
+    show: { opacity: 1, y: 0 }
+  };
+
+  return (
+    <div className="p-8 pt-10 max-w-7xl mx-auto h-full flex flex-col overflow-hidden">
+      <div className="mb-10 space-y-6">
+        <div className="flex flex-col md:flex-row md:items-center justify-between gap-6">
+          <div className="flex items-center gap-4">
+            <div className="p-3.5 rounded-2xl bg-amber-500/10 border border-amber-500/20 shadow-[0_0_20px_rgba(245,158,11,0.1)]">
+              <Copy className="w-7 h-7 text-amber-400" />
+            </div>
+            <div>
+              <h2 className="text-3xl font-black text-white tracking-tight leading-none">{t('operations.duplicatesTitle')}</h2>
+              <p className="text-zinc-500 text-sm mt-2 font-medium">{t('operations.duplicatesDesc')}</p>
+            </div>
+          </div>
+
+          <div className="flex items-center gap-2">
+             <div className="flex p-1 bg-zinc-900 border border-white/5 rounded-xl">
+                {['pending', 'reviewed', 'ignored'].map((status) => (
+                  <button
+                    key={status}
+                    onClick={() => setFilterStatus(status)}
+                    className={`px-4 py-1.5 rounded-lg text-xs font-black uppercase tracking-widest transition-all ${filterStatus === status ? 'bg-zinc-800 text-white shadow-lg' : 'text-zinc-500 hover:text-zinc-300'}`}
+                  >
+                    {t(`operations.${status}`)}
+                  </button>
+                ))}
+             </div>
+
+             <button 
+                onClick={() => fetchDuplicates(true)}
+                disabled={isSyncing}
+                className="group p-2.5 text-zinc-400 bg-zinc-900 border border-white/5 rounded-xl hover:bg-zinc-800 hover:text-white transition-all disabled:opacity-50"
+              >
+                <RefreshCw className={`w-5 h-5 ${isSyncing ? 'animate-spin text-amber-500' : 'group-hover:rotate-180 transition-transform duration-500'}`} />
+              </button>
+          </div>
+        </div>
+      </div>
+
+      <div className="flex-1 overflow-y-auto custom-scrollbar pr-1 pb-10">
+        {isLoading ? (
+          <div className="flex flex-col items-center justify-center py-20 gap-4">
+            <Loader2 className="w-10 h-10 text-amber-500 animate-spin opacity-20" />
+            <span className="text-[10px] font-black uppercase tracking-widest text-zinc-600">Analisando redundâncias...</span>
+          </div>
+        ) : duplicates.length === 0 ? (
+          <motion.div 
+            initial={{ opacity: 0 }} 
+            animate={{ opacity: 1 }}
+            className="flex flex-col items-center justify-center py-32 text-center bg-zinc-900/20 border border-dashed border-white/5 rounded-3xl"
+          >
+            <div className="w-20 h-20 rounded-full bg-zinc-900 border border-white/5 flex items-center justify-center mb-6 shadow-2xl">
+              <CheckCircle className="w-10 h-10 text-emerald-500/20" />
+            </div>
+            <h3 className="text-zinc-200 font-bold text-xl mb-2">{t('operations.noDuplicates')}</h3>
+            <p className="text-zinc-500 text-sm max-w-sm mx-auto leading-relaxed">Sua base de conhecimento parece estar limpa e sem redundâncias óbvias.</p>
+          </motion.div>
+        ) : (
+          <motion.div 
+            variants={containerVariants}
+            initial="hidden"
+            animate="show"
+            className="grid grid-cols-1 gap-6"
+          >
+            {duplicates.map((dup) => (
+              <motion.div 
+                key={dup.id} 
+                variants={itemVariants}
+                className="group relative bg-[#121212]/40 backdrop-blur-md border border-white/5 rounded-2xl overflow-hidden hover:border-white/10 transition-all"
+              >
+                <div className="absolute top-0 left-0 w-1 h-full bg-amber-500/50 group-hover:bg-amber-500 transition-colors" />
+                
+                <div className="p-6">
+                  {/* Header do Card */}
+                  <div className="flex items-center justify-between mb-6">
+                    <div className="flex items-center gap-4">
+                      <div className="flex items-center gap-2 px-3 py-1 bg-amber-500/10 border border-amber-500/20 rounded-full">
+                        <AlertTriangle className="w-3.5 h-3.5 text-amber-500" />
+                        <span className="text-[10px] font-black uppercase tracking-widest text-amber-500">Conflito de Similaridade</span>
+                      </div>
+                      <div className="flex items-center gap-2 px-3 py-1 bg-white/5 border border-white/10 rounded-full">
+                        <span className="text-[10px] font-black uppercase tracking-widest text-zinc-400">Score:</span>
+                        <span className="text-[10px] font-black text-white">{(dup.similarity * 100).toFixed(1)}%</span>
+                      </div>
+                    </div>
+
+                    <div className="flex items-center gap-2">
+                       <button 
+                         onClick={() => handleUpdateStatus(dup.id, 'ignored')}
+                         className="px-4 py-1.5 rounded-lg text-[10px] font-black uppercase tracking-widest text-zinc-500 hover:text-white hover:bg-white/5 transition-all"
+                       >
+                         {t('operations.ignore')}
+                       </button>
+                       <button 
+                         onClick={() => handleUpdateStatus(dup.id, 'reviewed')}
+                         className="px-4 py-1.5 rounded-lg text-[10px] font-black uppercase tracking-widest bg-emerald-500 text-black hover:bg-emerald-400 transition-all shadow-lg shadow-emerald-500/10"
+                       >
+                         Marcar como Revisado
+                       </button>
+                    </div>
+                  </div>
+
+                  {/* Comparação Lado a Lado */}
+                  <div className="grid grid-cols-1 md:grid-cols-2 gap-6">
+                     {dup.chunks?.map((chunk, idx) => (
+                       <div key={chunk.id} className="flex flex-col h-full bg-black/40 rounded-xl border border-white/5 overflow-hidden">
+                          <div className="px-4 py-3 bg-white/5 border-b border-white/5 flex items-center justify-between">
+                             <div className="flex items-center gap-2">
+                                <span className="w-5 h-5 flex items-center justify-center rounded-md bg-zinc-800 text-[10px] font-black text-zinc-400">#{(idx + 1).toString().padStart(2, '0')}</span>
+                                <span className="text-[10px] font-black uppercase tracking-widest text-zinc-300 truncate max-w-[150px]">{chunk.source_title}</span>
+                             </div>
+                             <div className="flex items-center gap-2">
+                                <button
+                                  onClick={() => handleDeactivate(chunk.id)}
+                                  className="p-1.5 rounded-md hover:bg-rose-500/20 text-zinc-500 hover:text-rose-400 transition-all"
+                                  title={t('operations.deactivate')}
+                                >
+                                  <Trash2 className="w-3.5 h-3.5" />
+                                </button>
+                             </div>
+                          </div>
+                          <div className="p-4 flex-1">
+                             <p className="text-zinc-400 text-[13px] leading-relaxed line-clamp-6 italic font-serif">
+                                "{chunk.content}"
+                             </p>
+                          </div>
+                          <div className="px-4 py-2 bg-white/[0.02] mt-auto flex items-center justify-between">
+                             <span className="text-[9px] font-black uppercase tracking-widest text-zinc-600">ID: {chunk.id.slice(0, 8)}...</span>
+                             <button className="text-[9px] font-black uppercase tracking-widest text-primary-500 flex items-center gap-1 hover:underline">
+                                Ver Fonte <ExternalLink className="w-2.5 h-2.5" />
+                             </button>
+                          </div>
+                       </div>
+                     ))}
+                  </div>
+                </div>
+              </motion.div>
+            ))}
+          </motion.div>
+        )}
+      </div>
+
+      <div className="mt-auto pt-6 border-t border-white/5 flex items-center justify-between">
+         <div className="flex items-center gap-8">
+            <div className="flex items-center gap-2">
+               <Clock className="w-3.5 h-3.5 text-zinc-600" />
+               <span className="text-[10px] font-black uppercase tracking-widest text-zinc-500">Total: {total} encontrados</span>
+            </div>
+
+            {totalPages > 1 && (
+              <div className="flex items-center gap-1 p-1 bg-zinc-900 rounded-xl border border-white/5">
+                <button
+                  onClick={() => setPage(p => Math.max(1, p - 1))}
+                  disabled={page === 1}
+                  className="p-1.5 rounded-lg hover:bg-zinc-800 text-zinc-500 hover:text-white disabled:opacity-20 transition-all"
+                >
+                  <ChevronLeft className="w-4 h-4" />
+                </button>
+                <div className="text-[10px] font-black px-3 text-zinc-400 tabular-nums">
+                  {page} <span className="mx-1 text-zinc-700">/</span> {totalPages}
+                </div>
+                <button
+                  onClick={() => setPage(p => Math.min(totalPages, p + 1))}
+                  disabled={page === totalPages}
+                  className="p-1.5 rounded-lg hover:bg-zinc-800 text-zinc-500 hover:text-white disabled:opacity-20 transition-all"
+                >
+                  <ChevronRight className="w-4 h-4" />
+                </button>
+              </div>
+            )}
+         </div>
+         
+         <button 
+           className="px-6 py-2 rounded-xl bg-zinc-900 border border-white/10 text-xs font-black uppercase tracking-widest text-zinc-300 hover:bg-zinc-800 hover:border-white/20 transition-all"
+           onClick={async () => {
+              setIsSyncing(true);
+              try {
+                await api.analyzeAllDuplicates();
+                addToast('Análise global iniciada na fila de tarefas', 'info');
+              } catch (err) {
+                console.error('Analysis error:', err);
+                addToast('Erro ao iniciar análise', 'error');
+              } finally {
+                setIsSyncing(false);
+              }
+           }}
+          >
+           Forçar Re-análise Global
+         </button>
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/components/Sidebar.tsx b/frontend/src/components/Sidebar.tsx
index ba2c6561..0f0ddf49 100644
--- a/frontend/src/components/Sidebar.tsx
+++ b/frontend/src/components/Sidebar.tsx
@@ -1,7 +1,7 @@
-import React, {useState} from 'react';
-import {useAppContext} from '../store/AppContext';
-import {useAuth} from '../store/AuthContext';
-import {useTranslation} from 'react-i18next';
+import React, { useState } from 'react';
+import { useAppContext } from '../store/AppContext';
+import { useAuth } from '../store/AuthContext';
+import { useTranslation } from 'react-i18next';
 import {
   Activity as ActivityIcon,
   Database,
@@ -11,16 +11,25 @@ import {
   Search,
   Settings,
   User,
-  Layers
+  Layers,
+  ChevronDown,
+  Copy
 } from 'lucide-react';
 
-import {SettingsModal} from './SettingsModal';
+import { SettingsModal } from './SettingsModal';
 
 export function Sidebar() {
   const { currentView, setCurrentView } = useAppContext();
   const { user, logout, isAuthEnabled } = useAuth();
   const { t } = useTranslation();
   const [isSettingsModalOpen, setIsSettingsModalOpen] = useState(false);
+  const [expandedGroups, setExpandedGroups] = useState<Record<string, boolean>>({
+    data: true
+  });
+
+  const toggleGroup = (id: string) => {
+    setExpandedGroups(prev => ({ ...prev, [id]: !prev[id] }));
+  };
 
   const navGroups = [
     {
@@ -41,9 +50,12 @@ export function Sidebar() {
     },
     {
       id: 'data',
-      label: t('sidebar.groups.data'),
+      label: t('sidebar.operations.contentSources'),
+      icon: Database,
+      isExpandable: true,
       items: [
         { id: 'sources', label: t('sidebar.operations.sources'), icon: Database },
+        { id: 'duplicates', label: t('sidebar.operations.duplicates'), icon: Copy },
         { id: 'diarization', label: t('sidebar.operations.diarization'), icon: Mic },
       ]
     },
@@ -55,17 +67,16 @@ export function Sidebar() {
         { id: 'queue', label: t('sidebar.operations.queue', 'Task Queue (Redis)'), icon: Layers },
       ]
     }
-
-  ] as const;
+  ];
 
   const getItemClass = (isActive: boolean, isDisabled: boolean) => {
-    if (isActive) return 'bg-emerald-500/10 text-emerald-400 font-bold border border-emerald-500/20';
+    if (isActive) return 'bg-emerald-500/10 text-emerald-400 font-bold border border-emerald-500/20 shadow-[0_4px_12px_rgba(16,185,129,0.05)]';
     if (isDisabled) return 'opacity-30 cursor-not-allowed grayscale';
     return 'text-zinc-400 hover:bg-white/5 hover:text-zinc-200';
   };
 
   return (
-    <div className="w-64 border-r border-border-subtle bg-[#121212] flex flex-col h-screen">
+    <div className="w-64 border-r border-border-subtle bg-[#121212] flex flex-col h-screen z-50">
       {/* Brand */}
       <div className="p-5 border-b border-border-subtle bg-black/20">
         <div className="flex items-center gap-3">
@@ -81,39 +92,85 @@ export function Sidebar() {
 
       {/* Navigation Groups - Scrollable Area */}
       <div className="flex-1 overflow-y-auto custom-scrollbar p-4 pt-2 space-y-6">
-        {navGroups.map((group, index) => (
-          <div key={group.id} className="space-y-2">
-            <h3 className="text-[10px] font-black uppercase tracking-widest text-zinc-600 px-3">
-              {group.label}
-            </h3>
-            <div className="space-y-1">
-              {group.items.map((item) => {
-                const Icon = item.icon;
-                const isSourcesGroup = item.id === 'sources';
-                const isActive = currentView === item.id || (isSourcesGroup && currentView === 'database');
-                const isDisabled = 'disabled' in item ? item.disabled : false;
+        {navGroups.map((group, index) => {
+          const isExpandable = 'isExpandable' in group ? (group as any).isExpandable : false;
+          const isExpanded = expandedGroups[group.id] ?? false;
+          const GroupIcon = 'icon' in group ? (group as any).icon : null;
+
+          if (isExpandable) {
+            return (
+              <div key={group.id} className="space-y-1">
+                <button
+                  onClick={() => toggleGroup(group.id)}
+                  className={`w-full flex items-center justify-between px-3 py-2 rounded-lg text-[10px] font-black uppercase tracking-widest transition-all ${isExpanded ? 'text-white' : 'text-zinc-500 hover:text-zinc-300'}`}
+                >
+                  <div className="flex items-center gap-3">
+                    {GroupIcon && <GroupIcon className="w-3.5 h-3.5" />}
+                    {group.label}
+                  </div>
+                  <ChevronDown className={`w-3.5 h-3.5 transition-transform duration-300 ${isExpanded ? '' : '-rotate-90'}`} />
+                </button>
+                
+                {isExpanded && (
+                  <div className="ml-4 pl-2 border-l border-white/5 space-y-1 mt-1">
+                    {group.items.map((item) => {
+                      const Icon = item.icon;
+                      const isActive = currentView === item.id || (item.id === 'sources' && currentView === 'database');
+                      const isDisabled = 'disabled' in item ? item.disabled : false;
+
+                      return (
+                        <button
+                          key={item.id}
+                          disabled={isDisabled}
+                          onClick={() => setCurrentView(item.id)}
+                          className={`w-full flex items-center gap-3 px-3 py-2 rounded-lg text-sm transition-all group ${getItemClass(isActive, isDisabled)}`}
+                        >
+                          <Icon className={`w-3.5 h-3.5 ${isActive ? 'text-emerald-400' : 'text-zinc-600 group-hover:text-zinc-400'}`} />
+                          <span className="truncate">{item.label}</span>
+                          {isActive && (
+                            <div className="absolute left-0 top-1/2 -translate-y-1/2 w-1 h-3 bg-emerald-500 rounded-r-full shadow-[0_0_8px_rgba(16,185,129,0.5)]" />
+                          )}
+                        </button>
+                      );
+                    })}
+                  </div>
+                )}
+                {index < navGroups.length - 1 && <div className="h-px bg-white/5 mt-6 mx-2" />}
+              </div>
+            );
+          }
 
-                return (
-                  <button
-                    key={item.id}
-                    disabled={isDisabled}
-                    onClick={() => setCurrentView(item.id)}
-                    className={`w-full flex items-center gap-3 px-3 py-2 rounded-lg text-sm transition-all relative group ${getItemClass(isActive, isDisabled)}`}
-                  >
-                    <Icon className={`w-4 h-4 transition-transform duration-200 group-hover:scale-110 ${isActive ? 'scale-110 text-emerald-400' : 'text-zinc-500 group-hover:text-zinc-300'}`} />
-                    <span className="truncate">
-                      {isSourcesGroup && currentView === 'database' ? t('sidebar.operations.chunks') : item.label}
-                    </span>
-                    {isActive && (
-                      <div className="absolute left-0 top-1/2 -translate-y-1/2 w-1 h-4 bg-emerald-500 rounded-r-full shadow-[0_0_8px_rgba(16,185,129,0.5)]" />
-                    )}
-                  </button>
-                );
-              })}
+          return (
+            <div key={group.id} className="space-y-2">
+              <h3 className="text-[10px] font-black uppercase tracking-widest text-zinc-600 px-3">
+                {group.label}
+              </h3>
+              <div className="space-y-1">
+                {group.items.map((item) => {
+                  const Icon = item.icon;
+                  const isActive = currentView === item.id;
+                  const isDisabled = 'disabled' in item ? item.disabled : false;
+
+                  return (
+                    <button
+                      key={item.id}
+                      disabled={isDisabled}
+                      onClick={() => setCurrentView(item.id)}
+                      className={`w-full flex items-center gap-3 px-3 py-2 rounded-lg text-sm transition-all relative group ${getItemClass(isActive, isDisabled)}`}
+                    >
+                      <Icon className={`w-4 h-4 transition-transform duration-200 group-hover:scale-110 ${isActive ? 'scale-110 text-emerald-400' : 'text-zinc-500 group-hover:text-zinc-300'}`} />
+                      <span className="truncate">{item.label}</span>
+                      {isActive && (
+                        <div className="absolute left-0 top-1/2 -translate-y-1/2 w-1 h-4 bg-emerald-500 rounded-r-full shadow-[0_0_8px_rgba(16,185,129,0.5)]" />
+                      )}
+                    </button>
+                  );
+                })}
+              </div>
+              {index < navGroups.length - 1 && <div className="h-px bg-white/5 mt-6 mx-2" />}
             </div>
-            {index < navGroups.length - 1 && <div className="h-px bg-white/5 mt-6 mx-2" />}
-          </div>
-        ))}
+          );
+        })}
       </div>
 
       {/* User Profile / Settings */}
@@ -159,8 +216,6 @@ export function Sidebar() {
         </div>
       </div>
 
-
-
       <SettingsModal
         isOpen={isSettingsModalOpen}
         onClose={() => setIsSettingsModalOpen(false)}
diff --git a/frontend/src/components/SidebarContext.tsx b/frontend/src/components/SidebarContext.tsx
new file mode 100644
index 00000000..e802afa8
--- /dev/null
+++ b/frontend/src/components/SidebarContext.tsx
@@ -0,0 +1,180 @@
+import React, { useCallback, useState, useMemo } from 'react';
+import { useTranslation } from 'react-i18next';
+import { Database, Plus, Eraser, Check, MousePointer2, Search } from 'lucide-react';
+import { motion, AnimatePresence } from 'motion/react';
+import { useAppContext } from '../store/AppContext';
+import { Subject } from '../types';
+
+export function SidebarContext() {
+  const { 
+    subjects, 
+    selectedSubjects, 
+    setSelectedSubjects, 
+    setIsAddSubjectModalOpen 
+  } = useAppContext();
+  const { t } = useTranslation();
+  const [searchQuery, setSearchQuery] = useState('');
+
+  const filteredSubjects = useMemo(() => {
+    return subjects.filter(s => 
+      s.name.toLowerCase().includes(searchQuery.toLowerCase())
+    );
+  }, [subjects, searchQuery]);
+
+  const handleSelectAll = useCallback(() => {
+    setSelectedSubjects([]); // Empty means "All"
+  }, [setSelectedSubjects]);
+
+  const handleToggleSubject = useCallback((subject: Subject) => {
+    // Simplified UX: Always toggle selection on click
+    const isSelected = selectedSubjects.some(s => s.id === subject.id);
+    
+    if (isSelected) {
+      // If it's the last one being unselected, revert to "All" (empty list)
+      if (selectedSubjects.length === 5) {
+        // Wait, empty list means "All" in this app.
+      }
+      setSelectedSubjects(selectedSubjects.filter(s => s.id !== subject.id));
+    } else {
+      setSelectedSubjects([...selectedSubjects, subject]);
+    }
+  }, [selectedSubjects, setSelectedSubjects]);
+
+  const isAllSelected = selectedSubjects.length === 0;
+
+  return (
+    <div className="w-80 border-l border-white/5 bg-black/20 backdrop-blur-xl flex flex-col h-full shrink-0 relative z-20">
+      <div className="p-6 border-b border-white/5 flex items-center justify-between">
+        <div className="flex items-center gap-2">
+          <Database className="w-4 h-4 text-emerald-400" />
+          <h3 className="text-xs font-black text-white uppercase tracking-widest">{t('sidebarContext.title')}</h3>
+        </div>
+        <div className="flex items-center gap-2">
+           <AnimatePresence>
+            {!isAllSelected && (
+              <motion.button
+                initial={{ opacity: 0, scale: 0.9, x: 10 }}
+                animate={{ opacity: 1, scale: 1, x: 0 }}
+                exit={{ opacity: 0, scale: 0.9, x: 10 }}
+                onClick={handleSelectAll}
+                className="flex items-center justify-center w-8 h-8 rounded-lg bg-white/5 border border-white/10 text-zinc-400 hover:text-white hover:bg-white/10 transition-all"
+                title={t('sidebarContext.clear_filter')}
+              >
+                <Eraser className="w-3.5 h-3.5" />
+              </motion.button>
+            )}
+           </AnimatePresence>
+          <button
+            onClick={() => setIsAddSubjectModalOpen(true)}
+            className="w-8 h-8 rounded-xl bg-zinc-900 border border-white/5 flex items-center justify-center text-zinc-500 hover:text-white hover:bg-emerald-500/20 hover:border-emerald-500/30 transition-all shadow-sm shadow-black"
+            title={t('sidebarContext.new_base')}
+          >
+            <Plus className="w-4 h-4" />
+          </button>
+        </div>
+      </div>
+      
+      <div className="p-6 border-b border-white/5 bg-emerald-500/5 group relative overflow-hidden">
+        <div className="absolute inset-0 bg-gradient-to-r from-emerald-500/0 via-emerald-500/5 to-emerald-500/0 translate-x-[-100%] group-hover:translate-x-[100%] transition-transform duration-1000" />
+        <p className="text-[10px] text-emerald-400/70 font-black uppercase tracking-widest leading-relaxed relative z-10 flex items-center gap-2">
+           <MousePointer2 className="w-3 h-3 opacity-50" />
+           {t('sidebarContext.description')}
+        </p>
+      </div>
+
+      <div className="px-4 pt-4 pb-2">
+        <div className="relative group/search">
+          <Search className="w-3.5 h-3.5 absolute left-3 top-1/2 -translate-y-1/2 text-zinc-500 group-focus-within/search:text-emerald-500 transition-colors" />
+          <input
+            type="text"
+            value={searchQuery}
+            onChange={(e) => setSearchQuery(e.target.value)}
+            placeholder={t('common.actions.search') + '...'}
+            className="w-full pl-9 pr-3 py-2 bg-white/[0.03] border border-white/5 rounded-xl text-xs text-zinc-300 placeholder:text-zinc-600 focus:outline-none focus:border-emerald-500/30 transition-all font-medium"
+          />
+        </div>
+      </div>
+
+      <div className="flex-1 overflow-y-auto p-4 space-y-2 custom-scrollbar">
+        {/* ALL / TODOS Button */}
+        <button
+          onClick={handleSelectAll}
+          className={`w-full group flex items-center gap-3 p-3 rounded-2xl transition-all border relative overflow-hidden ${isAllSelected ? 'bg-emerald-500/10 border-emerald-500/30 text-white shadow-[0_0_20px_rgba(16,185,129,0.1)]' : 'bg-transparent border-transparent text-zinc-500 hover:bg-white/5 hover:text-zinc-300'}`}
+        >
+          <div className={`w-8 h-8 rounded-xl flex items-center justify-center transition-all ${isAllSelected ? 'bg-emerald-500 text-black shadow-lg shadow-emerald-500/20' : 'bg-zinc-900 text-zinc-600'}`}>
+            <Database className="w-4 h-4" />
+          </div>
+          <div className="text-left flex-1 min-w-0">
+            <div className={`text-xs font-bold truncate ${isAllSelected ? 'text-white' : 'text-zinc-400 group-hover:text-zinc-200'}`}>
+              {t('sidebarContext.all_bases')}
+            </div>
+            <div className="text-[9px] font-black uppercase tracking-widest opacity-40">
+               {isAllSelected ? t('common.active') : t('common.select')}
+            </div>
+          </div>
+          {isAllSelected && (
+            <motion.div 
+               className="w-5 h-5 rounded-lg bg-emerald-500 flex items-center justify-center shadow-[0_0_8px_rgba(16,185,129,0.5)]"
+            >
+               <Check className="w-3 h-3 text-black stroke-[3px]" />
+            </motion.div>
+          )}
+        </button>
+
+        <div className="h-px bg-white/5 my-4 mx-2" />
+
+        <AnimatePresence mode="popLayout">
+          {filteredSubjects.map((ctx) => {
+            const isSelected = selectedSubjects.some(s => s.id === ctx.id);
+            return (
+              <motion.button
+                layout
+                initial={{ opacity: 0, y: 10 }}
+                animate={{ opacity: 1, y: 0 }}
+                exit={{ opacity: 0, scale: 0.95 }}
+                key={ctx.id}
+                onClick={() => handleToggleSubject(ctx)}
+                className={`w-full group flex items-center gap-3 p-3 rounded-2xl transition-all border ${isSelected ? 'bg-emerald-500/10 border-emerald-500/30 text-white shadow-[0_0_20px_rgba(16,185,129,0.1)]' : 'bg-transparent border-transparent text-zinc-500 hover:bg-white/5 hover:text-zinc-300'}`}
+              >
+                <div className={`w-8 h-8 rounded-xl flex items-center justify-center transition-all ${isSelected ? 'bg-emerald-500 text-black shadow-lg shadow-emerald-500/20' : 'bg-zinc-900 text-zinc-600 group-hover:bg-zinc-800'}`}>
+                  <Database className="w-4 h-4" />
+                </div>
+                <div className="text-left flex-1 min-w-0">
+                  <div className={`text-xs font-bold truncate ${isSelected ? 'text-white' : 'text-zinc-400 group-hover:text-zinc-200'}`}>
+                    {ctx.name}
+                  </div>
+                  <div className="text-[9px] font-black uppercase tracking-widest opacity-50 mt-0.5">
+                    {ctx.sourceCount || 0} {t('sidebarContext.sources_count')}
+                  </div>
+                </div>
+                {isSelected && (
+                  <motion.div 
+                    className="w-5 h-5 rounded-lg bg-emerald-500 flex items-center justify-center shadow-[0_0_8px_rgba(16,185,129,0.5)]"
+                  >
+                     <Check className="w-3 h-3 text-black stroke-[3px]" />
+                  </motion.div>
+                )}
+              </motion.button>
+            );
+          })}
+        </AnimatePresence>
+
+        {subjects.length === 0 && (
+           <div className="py-20 text-center opacity-20">
+              <Database className="w-8 h-8 mx-auto mb-3 text-zinc-500" />
+              <span className="text-[9px] font-black uppercase tracking-widest text-zinc-500">{t('sidebarContext.no_base')}</span>
+           </div>
+        )}
+      </div>
+
+      <div className="p-6 border-t border-white/5 mt-auto bg-black/40">
+         <div className="flex items-center gap-3">
+            <div className={`w-2 h-2 rounded-full ${isAllSelected ? 'bg-emerald-500 animate-pulse' : 'bg-emerald-400 shadow-[0_0_8px_rgba(16,185,129,0.5)]'}`} />
+            <span className="text-[10px] font-black text-zinc-500 uppercase tracking-widest leading-none mt-0.5 truncate flex-1">
+              {isAllSelected ? t('sidebarContext.all_bases') : selectedSubjects.map(s => s.name).join(', ')}
+            </span>
+         </div>
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/locales/en.json b/frontend/src/locales/en.json
index b9545ced..40cbe729 100644
--- a/frontend/src/locales/en.json
+++ b/frontend/src/locales/en.json
@@ -25,6 +25,8 @@
     "selected_one": "1 selected",
     "selected_other": "{{count}} selected",
     "all": "All",
+    "active": "Active",
+    "select": "Select",
     "errors": {
       "general": "An unexpected error occurred. Please try again."
     },
@@ -81,6 +83,8 @@
       "videosSelected": "Video(s) selected",
       "voices": "Voice Profiles",
       "sources": "Content Sources",
+      "contentSources": "Content Sources",
+      "duplicates": "Duplicates",
       "chunks": "Sources > Chunks",
       "activity": "Activity Monitor",
       "queue": "Task Queue (Redis)",
@@ -101,6 +105,14 @@
       "settings": "Settings"
     }
   },
+  "sidebarContext": {
+    "title": "Knowledge Bases",
+    "description": "Filter content sources by one or more selected contexts.",
+    "all_bases": "All Knowledge Bases",
+    "clear_filter": "Clear Selection",
+    "sources_count": "sources",
+    "no_base": "No knowledge base found"
+  },
   "search": {
     "title": "Semantic Search",
     "subtitle": "Explore your knowledge base using semantic, keyword, or hybrid search.",
@@ -244,6 +256,20 @@
     "phase": "Phase",
     "duration": "duration"
   },
+  "operations": {
+    "duplicatesTitle": "Duplicate Management",
+    "duplicatesDesc": "We identified segments with high similarity (>90%) that might be redundant or repeated cuts.",
+    "duplicates": "Duplicates",
+    "similarity": "Similarity",
+    "ignore": "Ignore",
+    "deactivate": "Deactivate",
+    "resolved": "Resolved",
+    "pending": "Pending",
+    "reviewed": "Reviewed",
+    "ignored": "Ignored",
+    "noDuplicates": "No duplicates found at the moment.",
+    "contentSources": "Content Sources"
+  },
   "settings": {
     "title": "Settings",
     "subtitle": "Manage your local environment configurations",
diff --git a/frontend/src/locales/pt-BR.json b/frontend/src/locales/pt-BR.json
index d95bf03e..65581eac 100644
--- a/frontend/src/locales/pt-BR.json
+++ b/frontend/src/locales/pt-BR.json
@@ -25,6 +25,8 @@
     "selected_one": "1 selecionado",
     "selected_other": "{{count}} selecionados",
     "all": "Todos",
+    "active": "Ativo",
+    "select": "Selecionar",
     "errors": {
       "general": "Ocorreu um erro inesperado. Tente novamente."
     },
@@ -75,12 +77,13 @@
       "search": "Busca Semântica",
       "diarization": "Reconhecimento de Fala",
       "voices": "Perfis de Voz",
-      "sources": "Fontes de Conteúdo",
-      "chunks": "Fontes > Chunks",
+      "contentSources": "Fontes de Conteúdo",
+      "duplicates": "Duplicidades",
+      "sources": "Fontes",
+      "knowledgeSubject": "Assunto de Conhecimento",
       "activity": "Monitor de Atividade",
       "queue": "Fila de Tarefas (Redis)",
       "knowledge_contexts": "Bases de Conhecimento",
-
       "database": "Detalhes da Fonte",
       "voice_profiles": "Perfis de Voz"
     },
@@ -97,6 +100,14 @@
       "settings": "Configurações"
     }
   },
+  "sidebarContext": {
+    "title": "Bases de Conhecimento",
+    "description": "Filtre as fontes de conteúdo por um ou mais contextos selecionados.",
+    "all_bases": "Todas as Bases",
+    "clear_filter": "Limpar Filtros",
+    "sources_count": "fontes",
+    "no_base": "Nenhuma base de conhecimento encontrada"
+  },
   "search": {
     "title": "Busca Semântica",
     "subtitle": "Explore sua base de conhecimento usando busca semântica, palavra-chave ou híbrida.",
@@ -238,7 +249,22 @@
     "pipeline_optimized": "Pipeline otimizada",
     "reprocessed_message": "Substituído por uma nova versão",
     "phase": "Phase",
-    "duration": "duração"
+    "duration": "duração",
+    "diarization": "Diarização"
+  },
+  "operations": {
+    "duplicatesTitle": "Gerenciamento de Duplicidades",
+    "duplicatesDesc": "Identificamos trechos com alta similaridade (>90%) que podem ser redundantes ou cortes repetidos.",
+    "duplicates": "Duplicidades",
+    "similarity": "Similaridade",
+    "ignore": "Ignorar",
+    "deactivate": "Inativar",
+    "resolved": "Resolvido",
+    "pending": "Pendente",
+    "reviewed": "Revisado",
+    "ignored": "Ignorado",
+    "noDuplicates": "Nenhuma duplicidade encontrada no momento.",
+    "contentSources": "Fontes de Conteúdo"
   },
   "settings": {
     "title": "Configurações",
diff --git a/frontend/src/services/api.ts b/frontend/src/services/api.ts
index 4eedcafc..ec7c23ce 100644
--- a/frontend/src/services/api.ts
+++ b/frontend/src/services/api.ts
@@ -1,4 +1,4 @@
-import { Subject, IngestionTask, ContentSource, Chunk, PaginatedResponse, RawQueueTask } from '../types';
+import { Subject, IngestionTask, ContentSource, Chunk, PaginatedResponse, RawQueueTask, ChunkDuplicate } from '../types';
 
 
 const API_BASE_URL = '/rest';
@@ -353,11 +353,18 @@ export const api = {
   },
 
   // Diarization Methods
-  async fetchDiarizations(limit = 20, offset = 0, subjectId?: string): Promise<any[]> {
+  async fetchDiarizations(limit = 10, offset = 0, subject_ids?: string | string[]): Promise<any[]> {
     const url = new URL(`${API_BASE_URL}/audio`, globalThis.location.origin);
     url.searchParams.append('limit', limit.toString());
     url.searchParams.append('offset', offset.toString());
-    if (subjectId) url.searchParams.append('subject_id', subjectId);
+    
+    if (subject_ids) {
+      if (Array.isArray(subject_ids)) {
+        subject_ids.forEach(id => url.searchParams.append('subject_id', id));
+      } else {
+        url.searchParams.append('subject_id', subject_ids);
+      }
+    }
 
     const response = await fetch(url.toString(), {
       headers: getHeaders()
@@ -567,6 +574,49 @@ export const api = {
       headers: getHeaders()
     });
     await handleResponseError(response, 'Failed to remove task from queue');
+  },
+
+  // Duplicate Management
+  async fetchDuplicates(status?: string, subject_ids?: string[], limit = 100, offset = 0): Promise<{ results: ChunkDuplicate[]; total: number }> {
+    const url = new URL(`${API_BASE_URL}/duplicates`, globalThis.location.origin);
+    if (status) url.searchParams.append('status', status);
+    if (subject_ids && subject_ids.length > 0) {
+      subject_ids.forEach(id => url.searchParams.append('subject_id', id));
+    }
+    url.searchParams.append('limit', limit.toString());
+    url.searchParams.append('offset', offset.toString());
+
+    const response = await fetch(url.toString(), {
+      headers: getHeaders()
+    });
+    await handleResponseError(response, 'Failed to fetch duplicates');
+    return response.json();
+  },
+
+  async updateDuplicateStatus(id: string, status: string): Promise<void> {
+    const response = await fetch(`${API_BASE_URL}/duplicates/${id}/status`, {
+      method: 'PATCH',
+      headers: getHeaders(),
+      body: JSON.stringify({ status })
+    });
+    await handleResponseError(response, 'Failed to update duplicate status');
+  },
+
+  async deactivateChunk(id: string): Promise<void> {
+    const response = await fetch(`${API_BASE_URL}/duplicates/chunks/${id}/deactivate`, {
+      method: 'POST',
+      headers: getHeaders()
+    });
+    await handleResponseError(response, 'Failed to deactivate chunk');
+  },
+
+  async analyzeAllDuplicates(): Promise<any> {
+    const response = await fetch(`${API_BASE_URL}/duplicates/analyze-all`, {
+      method: 'POST',
+      headers: getHeaders()
+    });
+    await handleResponseError(response, 'Global analysis failed');
+    return response.json();
   }
 };
 
diff --git a/frontend/src/store/AppContext.tsx b/frontend/src/store/AppContext.tsx
index ed000a31..22712ecc 100644
--- a/frontend/src/store/AppContext.tsx
+++ b/frontend/src/store/AppContext.tsx
@@ -79,7 +79,7 @@ export function AppProvider({ children }: { readonly children: ReactNode }) {
   const [isJobsLoaded, setIsJobsLoaded] = useState(false);
   const [currentView, setCurrentView] = useState<ViewState>(() => {
     const saved = localStorage.getItem('currentView') as ViewState;
-    const validViews: ViewState[] = ['chat', 'search', 'sources', 'activity', 'database', 'knowledge_contexts', 'diarization', 'voice_profiles'];
+    const validViews: ViewState[] = ['chat', 'search', 'sources', 'activity', 'database', 'knowledge_contexts', 'diarization', 'voice_profiles', 'duplicates'];
     const initial = validViews.includes(saved) ? saved : 'search';
     return initial;
   });
@@ -153,9 +153,7 @@ export function AppProvider({ children }: { readonly children: ReactNode }) {
         }
       }
 
-      if (data.length > 0 && selectedSubjects.length === 0) {
-        setSelectedSubjects([data[0]]);
-      }
+      // No auto-selecting anymore, empty selection means "All"
     } catch (err) {
       console.error('Error fetching subjects:', err);
     }
@@ -398,10 +396,8 @@ export function AppProvider({ children }: { readonly children: ReactNode }) {
 
   // Persist selectedSubjects
   useEffect(() => {
-    if (selectedSubjects.length > 0) {
       const ids = selectedSubjects.map(s => s.id);
       localStorage.setItem('selectedSubjectIds', JSON.stringify(ids));
-    }
   }, [selectedSubjects]);
 
   const toggleSubjectSelection = useCallback((subject: Subject) => {
diff --git a/frontend/src/types.ts b/frontend/src/types.ts
index 53384d3e..88958413 100644
--- a/frontend/src/types.ts
+++ b/frontend/src/types.ts
@@ -55,7 +55,22 @@ export interface RawQueueTask {
   enqueued_at: number;
 }
 
-export type ViewState = 'chat' | 'search' | 'sources' | 'activity' | 'database' | 'knowledge_contexts' | 'diarization' | 'voice_profiles' | 'queue';
+export type ViewState = 'chat' | 'search' | 'sources' | 'activity' | 'database' | 'knowledge_contexts' | 'diarization' | 'voice_profiles' | 'queue' | 'duplicates';
+
+export type ChunkDuplicate = {
+  id: string;
+  chunk_ids: string[];
+  chunks?: {
+    id: string;
+    content: string;
+    source_title?: string;
+    source_id?: string;
+  }[];
+  similarity: number;
+  status: string;
+  created_at: string;
+  updated_at: string;
+};
 
 
 export type ToastType = 'success' | 'info' | 'error';
diff --git a/main.py b/main.py
index a212feed..d5206c26 100644
--- a/main.py
+++ b/main.py
@@ -25,6 +25,7 @@
 from src.presentation.api.routes import (  # noqa: E402
     auth_router,
     chunk_router,
+    duplicate_router,
     ingest_router,
     job_router,
     notification_router,
@@ -89,6 +90,7 @@ async def lifespan(app: FastAPI):
             run_audio_diarization_dispatcher_worker,
             run_audio_diarization_worker,
             run_diarization_ingestion_worker,
+            run_duplicate_detection_worker,
             run_file_ingestion_worker,
             run_web_ingestion_worker,
             run_youtube_dispatcher_worker,
@@ -106,6 +108,7 @@ async def lifespan(app: FastAPI):
             "run_audio_diarization_dispatcher_worker",
             run_audio_diarization_dispatcher_worker,
         )
+        register_task("run_duplicate_detection_worker", run_duplicate_detection_worker)
 
         logger.info("Initializing RedisTaskQueueService...")
         app.state.task_queue = RedisTaskQueueService(num_workers=1)
@@ -190,6 +193,12 @@ async def lifespan(app: FastAPI):
     tags=["Chunks"],
     dependencies=secured_deps,
 )
+app.include_router(
+    duplicate_router.router,
+    prefix="/rest/duplicates",
+    tags=["Duplicates"],
+    dependencies=secured_deps,
+)
 app.include_router(
     notification_router.router,
     prefix="/rest/notifications",
diff --git a/scripts/clear_sql_db.py b/scripts/clear_sql_db.py
index a5231263..39cc29c4 100644
--- a/scripts/clear_sql_db.py
+++ b/scripts/clear_sql_db.py
@@ -4,7 +4,7 @@
 # Add project root to sys.path
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 
-from src.infrastructure.repositories.sql.connector import engine, Base
+from infrastructure.connectors.connector_sql import engine, Base
 from src.infrastructure.repositories.sql import models  # noqa: F401
 from src.config.logger import Logger
 
diff --git a/scripts/dump_database.py b/scripts/dump_database.py
index af0513a9..529ce528 100644
--- a/scripts/dump_database.py
+++ b/scripts/dump_database.py
@@ -11,7 +11,7 @@
 
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
 
-from src.infrastructure.repositories.sql.connector import engine
+from infrastructure.connectors.connector_sql import engine
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
diff --git a/scripts/migrate_vector_db.py b/scripts/migrate_vector_db.py
index 00e75a35..8f58993e 100644
--- a/scripts/migrate_vector_db.py
+++ b/scripts/migrate_vector_db.py
@@ -31,7 +31,7 @@
 from sqlalchemy.orm import Session
 from sqlalchemy import or_, and_
 from clear_vector_db import clear_vector_db
-from src.infrastructure.repositories.sql.connector import Session as DBSessionFactory
+from src.infrastructure.connectors.connector_sql import Session as DBSessionFactory
 from src.infrastructure.repositories.sql.models.chunk_index import ChunkIndexModel
 from src.infrastructure.services.model_loader_service import ModelLoaderService
 from src.infrastructure.repositories.vector.models.chunk_model import ChunkModel
diff --git a/src/application/workers.py b/src/application/workers.py
index a9a7be28..7ff71021 100644
--- a/src/application/workers.py
+++ b/src/application/workers.py
@@ -72,7 +72,16 @@ def run_file_ingestion_worker(cmd: IngestFileCommand):
             event_bus=ctx.event_bus,
         )
 
-        use_case.execute(cmd)
+        result = use_case.execute(cmd)
+
+        # Enqueue duplicate detection
+        if result and "vector_ids" in result:
+            task_queue = app.state.task_queue
+            task_queue.enqueue(
+                run_duplicate_detection_worker,
+                {"chunk_ids": result["vector_ids"]},
+                task_title=f"Dup Check: {cmd.file_name}",
+            )
     except Exception as e:
         logger.error(f"Worker Error: Failed to execute file ingestion: {e}", exc_info=True)
     finally:
@@ -119,7 +128,16 @@ def run_youtube_ingestion_worker(cmd: IngestYoutubeCommand):
             event_bus=ctx.event_bus,
         )
 
-        use_case.execute(cmd)
+        result = use_case.execute(cmd)
+
+        # Enqueue duplicate detection
+        if result and "vector_ids" in result:
+            task_queue = app.state.task_queue
+            task_queue.enqueue(
+                run_duplicate_detection_worker,
+                {"chunk_ids": result["vector_ids"]},
+                task_title=f"Dup Check YouTube: {cmd.video_url}",
+            )
     except Exception as e:
         logger.error(f"Worker Error: Failed to execute YouTube ingestion: {e}", exc_info=True)
     finally:
@@ -144,8 +162,20 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
     try:
         from src.application.dtos.enums.youtube_data_type import YoutubeDataType
         from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor
+        from src.presentation.api.dependencies import resolve_ingestion_context
 
         task_queue = app.state.task_queue
+        context = resolve_ingestion_context(app)
+        job_service = context.job_service
+        job_id = str(cmd.ingestion_job_id) if cmd.ingestion_job_id else None
+
+        if job_id:
+            job_service.update_job_status(
+                job_id,
+                status="PROCESSING",
+                status_message=f"Resolving {cmd.data_type} videos...",
+                progress=5,
+            )
 
         # 1. Resolve the full list of URLs
         video_list = []
@@ -154,6 +184,8 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
             playlist_url = cmd.video_url or (cmd.video_urls[0] if cmd.video_urls else None)
             if not playlist_url:
                 logger.warning("No URL provided for playlist dispatcher")
+                if job_id:
+                    job_service.update_job_status(job_id, "FAILED", "Missing playlist URL.")
                 return
 
             extractor = YoutubeExtractor(language=cmd.language)
@@ -163,6 +195,8 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
             channel_url = cmd.video_url or (cmd.video_urls[0] if cmd.video_urls else None)
             if not channel_url:
                 logger.warning("No URL provided for channel dispatcher")
+                if job_id:
+                    job_service.update_job_status(job_id, "FAILED", "Missing channel URL.")
                 return
 
             extractor = YoutubeExtractor(language=cmd.language)
@@ -173,15 +207,27 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
 
         if not video_list:
             logger.warning(f"YouTube Dispatcher resolved 0 videos for type {cmd.data_type}.")
+            if job_id:
+                job_service.update_job_status(
+                    job_id,
+                    "FAILED",
+                    f"No videos found in {cmd.data_type}. Verify if the URL is valid and public.",
+                )
             return
 
+        if job_id:
+            job_service.update_job_status(
+                job_id,
+                status="PROCESSING",
+                status_message=f"Dispatched {len(video_list)} videos for ingestion.",
+                progress=50,
+            )
+
         logger.info(f"YouTube Dispatcher resolved {len(video_list)} videos. Enqueueing individual tasks...")
 
         # 2. Enqueue each video as a separate task
         for url in video_list:
             # Create a clone of the command for a single video
-            # IMPORTANT: We don't reuse the same ingestion_job_id from the dispatcher
-            # so that each video can either create its own tracking or we let the use case handle it.
             single_cmd = IngestYoutubeCommand(
                 video_url=url,
                 subject_id=cmd.subject_id,
@@ -202,9 +248,21 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
             )
 
         logger.info(f"Successfully dispatched {len(video_list)} YouTube ingestion tasks.")
+        if job_id:
+            job_service.update_job_status(
+                job_id,
+                status="SUCCESS",
+                status_message=f"Dispatched {len(video_list)} videos successfully.",
+                progress=100,
+            )
 
     except Exception as e:
         logger.error(f"YouTube Dispatcher Worker Error: {e}", exc_info=True)
+        if job_id:
+            try:
+                job_service.update_job_status(job_id, "FAILED", str(e))
+            except Exception:
+                pass
     finally:
         clear_global_context()
 
@@ -238,7 +296,7 @@ def run_diarization_ingestion_worker(cmd: IngestDiarizationCommand):
         vector_svc = ChunkVectorService(vector_repo, rerank_service=rerank_svc)
 
         # DiarizationRepository needs a DB session
-        from src.infrastructure.repositories.sql.connector import Session as DBSession
+        from infrastructure.connectors.connector_sql import Session as DBSession
 
         db = DBSession()
         try:
@@ -256,7 +314,16 @@ def run_diarization_ingestion_worker(cmd: IngestDiarizationCommand):
                 event_bus=ctx.event_bus,
             )
 
-            use_case.execute(cmd)
+            result = use_case.execute(cmd)
+
+            # Enqueue duplicate detection
+            if result and "vector_ids" in result:
+                task_queue = app.state.task_queue
+                task_queue.enqueue(
+                    run_duplicate_detection_worker,
+                    {"chunk_ids": result["vector_ids"]},
+                    task_title=f"Dup Check Diarization: {cmd.source}"
+                )
         finally:
             db.close()
     except Exception as e:
@@ -313,7 +380,16 @@ async def _run():
                 extractor=extractor,
             )
 
-            await use_case.execute(cmd)
+            result = await use_case.execute(cmd)
+
+            # Enqueue duplicate detection
+            if result and "vector_ids" in result:
+                task_queue = app.state.task_queue
+                task_queue.enqueue(
+                    run_duplicate_detection_worker,
+                    {"chunk_ids": result["vector_ids"]},
+                    task_title=f"Dup Check Web: {cmd.url}"
+                )
         except Exception as e:
             logging.getLogger(__name__).error(f"Worker Error: Failed to execute Web Scraping: {e}", exc_info=True)
         finally:
@@ -324,12 +400,12 @@ async def _run():
 
 def _audio_diarization_subprocess(cmd_dict: dict):
     """Run audio diarization in a separate process to avoid torch/CUDA thread deadlocks."""
+    from infrastructure.connectors.connector_sql import (
+        Session as DBSessionFactory,
+    )
     from src.application.use_cases.process_audio_diarization_pipeline import (
         ProcessAudioDiarizationPipelineUseCase,
     )
-    from src.infrastructure.repositories.sql.connector import (
-        Session as DBSessionFactory,
-    )
     from src.infrastructure.repositories.sql.content_source_repository import (
         ContentSourceSQLRepository,
     )
@@ -400,10 +476,10 @@ def run_audio_diarization_dispatcher_worker(cmd: ProcessAudioCommand):
         return
 
     try:
-        from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor
-        from src.infrastructure.repositories.sql.connector import (
+        from infrastructure.connectors.connector_sql import (
             Session as DBSessionFactory,
         )
+        from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor
         from src.infrastructure.repositories.sql.diarization_repository import (
             DiarizationRepository,
         )
@@ -516,7 +592,7 @@ def run_audio_diarization_worker(cmd: ProcessAudioCommand):
         if process.exitcode != 0:
             logger.error("Audio diarization subprocess exited with code %d", process.exitcode)
             if cmd.diarization_id:
-                from src.infrastructure.repositories.sql.connector import (
+                from infrastructure.connectors.connector_sql import (
                     Session as DBSessionFactory,
                 )
                 from src.infrastructure.repositories.sql.diarization_repository import (
@@ -574,10 +650,10 @@ def run_voice_training_worker(cmd: TrainVoiceCommand):
         return
 
     try:
+        from infrastructure.connectors.connector_sql import Session as DBSession
         from src.application.use_cases.manage_voice_profiles import (
             TrainVoiceProfileFromSpeakerSegmentUseCase,
         )
-        from src.infrastructure.repositories.sql.connector import Session as DBSession
         from src.presentation.api.dependencies import resolve_ingestion_context
 
         ctx = resolve_ingestion_context(app)
@@ -595,3 +671,46 @@ def run_voice_training_worker(cmd: TrainVoiceCommand):
         logger.error(f"Worker Error: Failed to execute voice training: {e}", exc_info=True)
     finally:
         clear_global_context()
+
+
+def run_duplicate_detection_worker(cmd: dict):
+    """Background worker for detecting duplicate chunks."""
+    set_global_context({"correlation_id": "worker-duplicate-detection"})
+
+    app = _get_app()
+    if not app:
+        clear_global_context()
+        return
+
+    try:
+        from src.infrastructure.services.chunk_duplicate_service import ChunkDuplicateService
+        from src.presentation.api.dependencies import (
+            get_chunk_repo,
+            get_chunk_vector_service,
+            get_duplicate_repo,
+            resolve_rerank_service,
+            resolve_vector_repository,
+        )
+
+        # Manual resolution since we don't have a Request object
+        vector_repo = resolve_vector_repository(app)
+        rerank_svc = resolve_rerank_service(app)
+        vector_svc = get_chunk_vector_service(vector_repo, rerank_svc)
+        
+        duplicate_repo = get_duplicate_repo()
+        chunk_repo = get_chunk_repo()
+        
+        service = ChunkDuplicateService(duplicate_repo, chunk_repo, vector_svc)
+        
+        chunk_ids = cmd.get("chunk_ids", [])
+        if not chunk_ids:
+            return
+
+        logger.info(f"Running duplicate detection for {len(chunk_ids)} chunks")
+        count = service.find_and_register_duplicates(chunk_ids, similarity_threshold=0.90)
+        logger.info(f"Duplicate detection finished. Found {count} potential duplicate groups.")
+
+    except Exception as e:
+        logger.error(f"Worker Error: Failed to execute duplicate detection: {e}", exc_info=True)
+    finally:
+        clear_global_context()
diff --git a/src/domain/entities/chunk_duplicate_entity.py b/src/domain/entities/chunk_duplicate_entity.py
new file mode 100644
index 00000000..73e7fcf6
--- /dev/null
+++ b/src/domain/entities/chunk_duplicate_entity.py
@@ -0,0 +1,17 @@
+from datetime import datetime
+from typing import List, Optional
+from uuid import UUID, uuid4
+
+from pydantic import BaseModel, Field
+
+
+class ChunkDuplicateEntity(BaseModel):
+    """Domain entity representing a group of duplicate chunks."""
+
+    id: UUID = Field(default_factory=uuid4)
+    chunk_ids: List[UUID] = Field(default_factory=list)
+    similarity: float
+    content_source_id: Optional[str] = None
+    status: str = "pending"  # e.g., "pending", "reviewed", "ignored"
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    updated_at: datetime = Field(default_factory=datetime.utcnow)
diff --git a/src/infrastructure/connectors/__init__.py b/src/infrastructure/connectors/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/infrastructure/repositories/sql/connector.py b/src/infrastructure/connectors/connector_sql.py
similarity index 95%
rename from src/infrastructure/repositories/sql/connector.py
rename to src/infrastructure/connectors/connector_sql.py
index 439fdddf..7e665e5e 100644
--- a/src/infrastructure/repositories/sql/connector.py
+++ b/src/infrastructure/connectors/connector_sql.py
@@ -3,7 +3,6 @@
 
 from src.config.settings import settings
 
-# Engine setup based on dialect
 connect_args = {}
 if settings.sql.url.startswith("sqlite"):
     connect_args["check_same_thread"] = False
diff --git a/src/infrastructure/extractors/youtube_extractor.py b/src/infrastructure/extractors/youtube_extractor.py
index 87e68e86..726d9aa3 100644
--- a/src/infrastructure/extractors/youtube_extractor.py
+++ b/src/infrastructure/extractors/youtube_extractor.py
@@ -202,7 +202,9 @@ def _download():
             with YoutubeDL(ydl_opts) as ydl:
                 info = ydl.extract_info(url, download=True)
                 base_name = ydl.prepare_filename(info)
-                return str(Path(base_name).with_suffix(".mp3"))
+                mp3_path = str(Path(base_name).with_suffix(".mp3"))
+                self._validate_mp3_file(mp3_path)
+                return mp3_path
 
         try:
             return self._run_with_retry(_download)
@@ -210,6 +212,37 @@ def _download():
             logger.error(f"Download failed after ALL retries: {e}", context={"url": url})
             return None
 
+    @staticmethod
+    def _validate_mp3_file(path: str) -> None:
+        """Validate that the downloaded file is a real MP3.
+
+        Raises ValueError if the file is missing, empty, or doesn't start with
+        a valid MP3 signature (ID3 tag or MPEG audio frame sync). This catches
+        cases where yt-dlp/ffmpeg silently produced a corrupt or HTML-error
+        artifact with an .mp3 extension.
+        """
+        p = Path(path)
+        if not p.exists():
+            raise ValueError(f"Downloaded MP3 not found: {path}")
+
+        size = p.stat().st_size
+        if size < 1024:
+            raise ValueError(f"Downloaded MP3 is too small ({size} bytes): {path}")
+
+        with open(p, "rb") as f:
+            header = f.read(4)
+
+        # ID3v2 tag
+        if header[:3] == b"ID3":
+            return
+        # MPEG audio frame sync: 11 bits set (0xFF 0xEx/0xFx)
+        if len(header) >= 2 and header[0] == 0xFF and (header[1] & 0xE0) == 0xE0:
+            return
+
+        raise ValueError(
+            f"Downloaded file is not a valid MP3 (header={header!r}): {path}"
+        )
+
     def extract_playlist_videos(self, playlist_url: str) -> list[str]:
         """Extracts all video URLs from a YouTube playlist using yt_dlp."""
 
diff --git a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py
new file mode 100644
index 00000000..911f864b
--- /dev/null
+++ b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py
@@ -0,0 +1,98 @@
+from typing import Any, List, Optional
+from uuid import UUID
+
+from sqlalchemy import desc
+
+from src.config.logger import Logger
+from src.infrastructure.connectors.connector_sql import Connector
+from src.infrastructure.repositories.sql.models.chunk_duplicate import ChunkDuplicateModel
+from src.infrastructure.repositories.sql.models.content_source import ContentSourceModel
+from src.infrastructure.repositories.sql.utils.utils import ensure_uuid
+
+logger = Logger()
+
+class ChunkDuplicateSQLRepository:
+    """Repository for managing duplicate chunk records in SQL."""
+
+    def create_duplicate_record(self, chunk_ids: List[UUID], similarity: float, status: str = "pending", content_source_id: Optional[str] = None) -> ChunkDuplicateModel:
+        """Create a new duplicate grouping record."""
+        with Connector() as session:
+            try:
+                record = ChunkDuplicateModel(
+                    chunk_ids=[str(cid) for cid in chunk_ids],
+                    similarity=similarity,
+                    status=status,
+                    content_source_id=content_source_id
+                )
+                session.add(record)
+                session.commit()
+                session.refresh(record)
+                return record
+            except Exception as e:
+                session.rollback()
+                logger.error(
+                    "Error creating duplicate record",
+                    context={"error": str(e)}
+                )
+                raise
+
+    def list_duplicates(self, status: Optional[str] = None, subject_ids: Optional[List[str]] = None, limit: int = 100, offset: int = 0) -> tuple[List[ChunkDuplicateModel], int]:
+        """List duplicate records with optional status and context filtering."""
+        with Connector() as session:
+            query = session.query(ChunkDuplicateModel)
+            
+            if subject_ids:
+                # Convert string IDs to UUID objects for safe matching in SQL
+                parsed_ids = [UUID(sid) for sid in subject_ids]
+                query = query.join(
+                    ContentSourceModel,
+                    ContentSourceModel.id == ChunkDuplicateModel.content_source_id
+                ).filter(ContentSourceModel.subject_id.in_(parsed_ids))
+            
+            if status:
+                query = query.filter(ChunkDuplicateModel.status == status)
+            
+            total = query.count()
+            items = query.order_by(desc(ChunkDuplicateModel.created_at)).limit(limit).offset(offset).all()
+            return items, total
+
+    def get_by_id(self, duplicate_id: Any) -> Optional[ChunkDuplicateModel]:
+        """Fetch a duplicate record by its UUID."""
+        duplicate_id = ensure_uuid(duplicate_id)
+        with Connector() as session:
+            return session.query(ChunkDuplicateModel).filter_by(id=duplicate_id).first()
+
+    def update_status(self, duplicate_id: Any, status: str) -> bool:
+        """Update the status of a duplicate record."""
+        duplicate_id = ensure_uuid(duplicate_id)
+        with Connector() as session:
+            try:
+                record = session.query(ChunkDuplicateModel).filter_by(id=duplicate_id).first()
+                if record:
+                    record.status = status
+                    session.commit()
+                    return True
+                return False
+            except Exception as e:
+                session.rollback()
+                logger.error(
+                    "Error updating duplicate status",
+                    context={"duplicate_id": str(duplicate_id), "error": str(e)}
+                )
+                raise
+
+    def delete_record(self, duplicate_id: Any) -> bool:
+        """Delete a duplicate record."""
+        duplicate_id = ensure_uuid(duplicate_id)
+        with Connector() as session:
+            try:
+                record = session.query(ChunkDuplicateModel).filter_by(id=duplicate_id).first()
+                if record:
+                    session.delete(record)
+                    session.commit()
+                    return True
+                return False
+            except Exception as e:
+                session.rollback()
+                logger.error("Error deleting duplicate record", context={"duplicate_id": str(duplicate_id), "error": str(e)})
+                raise
diff --git a/src/infrastructure/repositories/sql/chunk_index_repository.py b/src/infrastructure/repositories/sql/chunk_index_repository.py
index ea4f4e2e..b58f49b3 100644
--- a/src/infrastructure/repositories/sql/chunk_index_repository.py
+++ b/src/infrastructure/repositories/sql/chunk_index_repository.py
@@ -5,7 +5,7 @@
 from sqlalchemy.orm import joinedload
 
 from src.config.logger import Logger
-from src.infrastructure.repositories.sql.connector import Connector
+from src.infrastructure.connectors.connector_sql import Connector
 from src.infrastructure.repositories.sql.models.chunk_index import ChunkIndexModel
 from src.infrastructure.repositories.sql.models.content_source import ContentSourceModel
 from src.infrastructure.repositories.sql.utils.utils import ensure_uuid
@@ -260,3 +260,22 @@ def get_by_id(self, chunk_id: Any) -> Optional[ChunkIndexModel]:
                     context={"chunk_id": str(chunk_id), "error": str(e)},
                 )
                 return None
+
+    def update_is_active(self, chunk_id: Any, is_active: bool) -> bool:
+        """Update the is_active flag of a chunk."""
+        chunk_id = ensure_uuid(chunk_id)
+        with Connector() as session:
+            try:
+                chunk = session.query(ChunkIndexModel).filter_by(id=chunk_id).first()
+                if chunk:
+                    chunk.is_active = is_active
+                    session.commit()
+                    return True
+                return False
+            except Exception as e:
+                session.rollback()
+                logger.error(
+                    "Error updating chunk is_active",
+                    context={"chunk_id": str(chunk_id), "error": str(e)},
+                )
+                raise
diff --git a/src/infrastructure/repositories/sql/content_source_repository.py b/src/infrastructure/repositories/sql/content_source_repository.py
index 6db68951..3714c056 100644
--- a/src/infrastructure/repositories/sql/content_source_repository.py
+++ b/src/infrastructure/repositories/sql/content_source_repository.py
@@ -3,7 +3,7 @@
 from uuid import UUID
 
 from src.config.logger import Logger
-from src.infrastructure.repositories.sql.connector import Connector
+from src.infrastructure.connectors.connector_sql import Connector
 from src.infrastructure.repositories.sql.models.content_source import ContentSourceModel
 from src.infrastructure.repositories.sql.utils.utils import ensure_uuid
 
diff --git a/src/infrastructure/repositories/sql/diarization_repository.py b/src/infrastructure/repositories/sql/diarization_repository.py
index 26baf739..8a066ff4 100644
--- a/src/infrastructure/repositories/sql/diarization_repository.py
+++ b/src/infrastructure/repositories/sql/diarization_repository.py
@@ -107,7 +107,7 @@ def get_by_external_source(
             DiarizationRecord.external_source == external_source,
         )
         if subject_id:
-            parsed_id = UUID(subject_id) if isinstance(subject_id, str) else subject_id
+            parsed_id = UUID(str(subject_id)) if isinstance(subject_id, str) else subject_id
             query = query.filter(DiarizationRecord.subject_id == parsed_id)
         else:
             query = query.filter(DiarizationRecord.subject_id.is_(None))
@@ -119,16 +119,20 @@ def get_all(
         self,
         limit: int = 10,
         offset: int = 0,
-        subject_id: str | object | None = None,
+        subject_id: str | List[str] | None = None,
     ) -> List[DiarizationRecord]:
 
         query = self.db.query(DiarizationRecord)
         if subject_id:
-            parsed_id = UUID(subject_id) if isinstance(subject_id, str) else subject_id
-            query = query.filter(DiarizationRecord.subject_id == parsed_id)
+            if isinstance(subject_id, list):
+                parsed_ids = [UUID(sid) if isinstance(sid, str) else sid for sid in subject_id]
+                query = query.filter(DiarizationRecord.subject_id.in_(parsed_ids))
+            else:
+                parsed_id = UUID(subject_id) if isinstance(subject_id, str) else subject_id
+                query = query.filter(DiarizationRecord.subject_id == parsed_id)
 
         result = query.order_by(DiarizationRecord.created_at.desc()).offset(offset).limit(limit).all()
-        return cast(List[DiarizationRecord], cast(object, result))
+        return cast(List[DiarizationRecord], result)
 
     def get_by_id(self, diarization_id: str) -> Optional[DiarizationRecord]:
         result = self.db.query(DiarizationRecord).filter(DiarizationRecord.id == diarization_id).first()
diff --git a/src/infrastructure/repositories/sql/ingestion_job_repository.py b/src/infrastructure/repositories/sql/ingestion_job_repository.py
index 4c216103..0c9c239b 100644
--- a/src/infrastructure/repositories/sql/ingestion_job_repository.py
+++ b/src/infrastructure/repositories/sql/ingestion_job_repository.py
@@ -5,7 +5,7 @@
 from sqlalchemy.orm import joinedload
 
 from src.config.logger import Logger
-from src.infrastructure.repositories.sql.connector import Connector
+from src.infrastructure.connectors.connector_sql import Connector
 from src.infrastructure.repositories.sql.models.ingestion_job import IngestionJobModel
 from src.infrastructure.repositories.sql.utils.utils import ensure_uuid
 
diff --git a/src/infrastructure/repositories/sql/knowledge_subject_repository.py b/src/infrastructure/repositories/sql/knowledge_subject_repository.py
index 5d38a616..668c444f 100644
--- a/src/infrastructure/repositories/sql/knowledge_subject_repository.py
+++ b/src/infrastructure/repositories/sql/knowledge_subject_repository.py
@@ -4,7 +4,7 @@
 from sqlalchemy.orm import selectinload
 
 from src.config.logger import Logger
-from src.infrastructure.repositories.sql.connector import Connector
+from src.infrastructure.connectors.connector_sql import Connector
 from src.infrastructure.repositories.sql.models.knowledge_subject import (
     KnowledgeSubjectModel,
 )
diff --git a/src/infrastructure/repositories/sql/models/chunk_duplicate.py b/src/infrastructure/repositories/sql/models/chunk_duplicate.py
new file mode 100644
index 00000000..6b64a66c
--- /dev/null
+++ b/src/infrastructure/repositories/sql/models/chunk_duplicate.py
@@ -0,0 +1,39 @@
+"""
+ORM models for chunk_duplicate table.
+"""
+
+import uuid
+
+from sqlalchemy import (
+    JSON,
+    UUID,
+    Column,
+    DateTime,
+    Float,
+    ForeignKey,
+    Text,
+    func,
+)
+
+from src.infrastructure.connectors.connector_sql import Base
+
+
+class ChunkDuplicateModel(Base):
+    __tablename__ = "chunk_duplicates"
+
+    id = Column(UUID, primary_key=True, default=uuid.uuid4)
+    chunk_ids = Column(JSON, nullable=False)
+    similarity = Column(Float, nullable=False)
+    content_source_id = Column(
+        UUID,
+        ForeignKey("content_sources.id", deferrable=True, initially="IMMEDIATE"),
+        nullable=True,
+    )
+    status = Column(Text, default="pending", nullable=False)
+    created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
+    updated_at = Column(
+        DateTime(timezone=True),
+        server_default=func.now(),
+        onupdate=func.now(),
+        nullable=False,
+    )
diff --git a/src/infrastructure/repositories/sql/models/chunk_index.py b/src/infrastructure/repositories/sql/models/chunk_index.py
index 6ad883f1..6047f336 100644
--- a/src/infrastructure/repositories/sql/models/chunk_index.py
+++ b/src/infrastructure/repositories/sql/models/chunk_index.py
@@ -7,6 +7,7 @@
 from sqlalchemy import (
     JSON,
     UUID,
+    Boolean,
     Column,
     DateTime,
     ForeignKey,
@@ -18,7 +19,7 @@
 )
 from sqlalchemy.orm import relationship
 
-from src.infrastructure.repositories.sql.connector import Base
+from src.infrastructure.connectors.connector_sql import Base
 
 
 class ChunkIndexModel(Base):
@@ -58,6 +59,7 @@ class ChunkIndexModel(Base):
         onupdate=func.now(),
         nullable=False,
     )
+    is_active = Column(Boolean, default=True, server_default=text("1"), nullable=False)
 
     __table_args__ = (
         Index("ix_chunk_index_content_source_id", "content_source_id"),
diff --git a/src/infrastructure/repositories/sql/models/content_source.py b/src/infrastructure/repositories/sql/models/content_source.py
index 7686f055..1734e422 100644
--- a/src/infrastructure/repositories/sql/models/content_source.py
+++ b/src/infrastructure/repositories/sql/models/content_source.py
@@ -19,7 +19,7 @@
 )
 from sqlalchemy.orm import relationship
 
-from src.infrastructure.repositories.sql.connector import Base
+from src.infrastructure.connectors.connector_sql import Base
 
 
 class ContentSourceModel(Base):
diff --git a/src/infrastructure/repositories/sql/models/diarization_record.py b/src/infrastructure/repositories/sql/models/diarization_record.py
index 1b637fae..b31a77a2 100644
--- a/src/infrastructure/repositories/sql/models/diarization_record.py
+++ b/src/infrastructure/repositories/sql/models/diarization_record.py
@@ -8,7 +8,7 @@
 from sqlalchemy import JSON, UUID, Column, DateTime, Float, ForeignKey, String
 
 from src.domain.entities.enums.diarization_status_enum import DiarizationStatus
-from src.infrastructure.repositories.sql.connector import Base
+from src.infrastructure.connectors.connector_sql import Base
 
 
 def _generate_uuid() -> str:
diff --git a/src/infrastructure/repositories/sql/models/ingestion_job.py b/src/infrastructure/repositories/sql/models/ingestion_job.py
index ae7dc01e..986161af 100644
--- a/src/infrastructure/repositories/sql/models/ingestion_job.py
+++ b/src/infrastructure/repositories/sql/models/ingestion_job.py
@@ -7,7 +7,7 @@
 from sqlalchemy import UUID, Column, DateTime, ForeignKey, Index, Integer, Text, func
 from sqlalchemy.orm import relationship, synonym
 
-from src.infrastructure.repositories.sql.connector import Base
+from src.infrastructure.connectors.connector_sql import Base
 
 
 class IngestionJobModel(Base):
diff --git a/src/infrastructure/repositories/sql/models/knowledge_subject.py b/src/infrastructure/repositories/sql/models/knowledge_subject.py
index 595ddcb9..2b6c83b5 100644
--- a/src/infrastructure/repositories/sql/models/knowledge_subject.py
+++ b/src/infrastructure/repositories/sql/models/knowledge_subject.py
@@ -7,7 +7,7 @@
 from sqlalchemy import UUID, Column, DateTime, Text, func
 from sqlalchemy.orm import relationship
 
-from src.infrastructure.repositories.sql.connector import Base
+from src.infrastructure.connectors.connector_sql import Base
 
 
 class KnowledgeSubjectModel(Base):
diff --git a/src/infrastructure/repositories/sql/models/user.py b/src/infrastructure/repositories/sql/models/user.py
index fa6c9a0e..bcc8c8d3 100644
--- a/src/infrastructure/repositories/sql/models/user.py
+++ b/src/infrastructure/repositories/sql/models/user.py
@@ -4,7 +4,7 @@
 from sqlalchemy import DateTime, String
 from sqlalchemy.orm import Mapped, mapped_column
 
-from src.infrastructure.repositories.sql.connector import Base
+from src.infrastructure.connectors.connector_sql import Base
 
 
 class User(Base):
diff --git a/src/infrastructure/repositories/sql/models/voice_record.py b/src/infrastructure/repositories/sql/models/voice_record.py
index 8b0d476b..4ac7377c 100644
--- a/src/infrastructure/repositories/sql/models/voice_record.py
+++ b/src/infrastructure/repositories/sql/models/voice_record.py
@@ -7,7 +7,7 @@
 
 from sqlalchemy import JSON, Column, DateTime, String
 
-from src.infrastructure.repositories.sql.connector import Base
+from src.infrastructure.connectors.connector_sql import Base
 
 
 def _generate_uuid() -> str:
diff --git a/src/infrastructure/repositories/sql/user_repository.py b/src/infrastructure/repositories/sql/user_repository.py
index a8faf9c2..7bb7d5ce 100644
--- a/src/infrastructure/repositories/sql/user_repository.py
+++ b/src/infrastructure/repositories/sql/user_repository.py
@@ -5,7 +5,7 @@
 
 from src.domain.entities.user import User as UserEntity
 from src.domain.interfaces.repository.user_repository import IUserRepository
-from src.infrastructure.repositories.sql.connector import Connector
+from src.infrastructure.connectors.connector_sql import Connector
 from src.infrastructure.repositories.sql.models.user import User as UserModel
 from src.infrastructure.repositories.sql.utils.utils import ensure_uuid
 
diff --git a/src/infrastructure/services/chunk_duplicate_service.py b/src/infrastructure/services/chunk_duplicate_service.py
new file mode 100644
index 00000000..a50954bc
--- /dev/null
+++ b/src/infrastructure/services/chunk_duplicate_service.py
@@ -0,0 +1,156 @@
+from typing import Any, List, Optional, Set
+from uuid import UUID
+
+from src.config.logger import Logger
+from src.domain.entities.chunk_duplicate_entity import ChunkDuplicateEntity
+from src.domain.entities.enums.search_mode_enum import SearchMode
+from src.infrastructure.repositories.sql.chunk_duplicate_repository import (
+    ChunkDuplicateSQLRepository,
+)
+from src.infrastructure.repositories.sql.chunk_index_repository import (
+    ChunkIndexSQLRepository,
+)
+from src.infrastructure.services.chunk_vector_service import ChunkVectorService
+
+logger = Logger()
+
+
+class ChunkDuplicateService:
+    """Service for detecting and managing duplicate chunks using vector search."""
+
+    def __init__(
+        self,
+        repository: ChunkDuplicateSQLRepository,
+        chunk_repo: ChunkIndexSQLRepository,
+        vector_service: ChunkVectorService,
+    ):
+        self._repo = repository
+        self._chunk_repo = chunk_repo
+        self._vector_service = vector_service
+
+    def find_and_register_duplicates(
+        self, chunk_ids: List[UUID], similarity_threshold: float = 0.90
+    ) -> int:
+        """
+        Check a list of chunks for duplicates against the entire vector store.
+        If duplicates are found with similarity >= threshold, register them.
+        """
+        registered_count = 0
+        processed_pairs: Set[tuple[str, ...]] = set()
+
+        for cid in chunk_ids:
+            chunk = self._chunk_repo.get_by_id(cid)
+            if not chunk or not chunk.content:
+                continue
+
+            # Search for similar chunks
+            content_str = str(chunk.content)
+            similar_chunks = self._vector_service.retrieve(
+                query=content_str,
+                top_k=5,
+                search_mode=SearchMode.SEMANTIC,
+                re_rank=False,
+            )
+
+            duplicates = self._filter_duplicates(cid, similar_chunks, similarity_threshold)
+
+            if duplicates:
+                registered_count += self._register_cluster(
+                    source_id=cid, 
+                    source_content_source_id=str(chunk.content_source_id) if chunk.content_source_id else None,
+                    duplicates=duplicates, 
+                    processed_pairs=processed_pairs
+                )
+
+        return registered_count
+
+    def _filter_duplicates(self, source_id: UUID, similar_chunks: List[Any], threshold: float) -> List[tuple[UUID, float]]:
+        """Filter results to find valid duplicates above threshold."""
+        duplicates = []
+        source_id_str = str(source_id)
+        for sim_chunk in similar_chunks:
+            if str(sim_chunk.id) == source_id_str:
+                continue
+            
+            score = getattr(sim_chunk, "score", 0.0)
+            if score >= threshold:
+                duplicates.append((sim_chunk.id, float(score)))
+        return duplicates
+
+    def _register_cluster(
+        self,
+        source_id: UUID,
+        source_content_source_id: Optional[str],
+        duplicates: List[tuple[UUID, float]],
+        processed_pairs: Set[tuple[str, ...]]
+    ) -> int:
+        """Register a new duplicate group if not already processed."""
+        duplicate_ids = [d[0] for d in duplicates]
+        all_uuids = [source_id] + duplicate_ids
+        # Sort as strings for consistent cluster identification
+        cluster_ids_str = sorted([str(cid) for cid in all_uuids])
+        cluster_key = tuple(cluster_ids_str)
+
+        if cluster_key not in processed_pairs:
+            # Get exact similarity for the highest match
+            max_sim = max([float(d[1]) for d in duplicates] + [0.0])
+            self._repo.create_duplicate_record(
+                chunk_ids=all_uuids,
+                similarity=max_sim,
+                status="pending",
+                content_source_id=source_content_source_id
+            )
+            processed_pairs.add(cluster_key)
+            return 1
+        return 0
+
+    def list_duplicates(
+        self,
+        status: Optional[str] = None,
+        subject_ids: Optional[List[str]] = None,
+        limit: int = 100,
+        offset: int = 0
+    ) -> tuple[List[ChunkDuplicateEntity], int]:
+        """List mapped duplicate records."""
+        models, total = self._repo.list_duplicates(status=status, subject_ids=subject_ids, limit=limit, offset=offset)
+        entities = []
+        from datetime import datetime
+        for m in models:
+            chunk_ids: List[UUID] = []
+            if isinstance(m.chunk_ids, list):
+                for cid in m.chunk_ids:
+                    if isinstance(cid, str):
+                        chunk_ids.append(UUID(cid))
+                    elif isinstance(cid, UUID):
+                        chunk_ids.append(cid)
+            
+            # Ensure datetime types for Mypy
+            created_at = m.created_at if isinstance(m.created_at, datetime) else datetime.now()
+            updated_at = m.updated_at if isinstance(m.updated_at, datetime) else datetime.now()
+
+            entities.append(ChunkDuplicateEntity(
+                id=UUID(str(m.id)),
+                chunk_ids=chunk_ids,
+                similarity=float(m.similarity),
+                content_source_id=str(m.content_source_id) if m.content_source_id else None,
+                status=str(m.status),
+                created_at=created_at,
+                updated_at=updated_at,
+            ))
+        return entities, total
+
+    def resolve_duplicate(self, duplicate_id: UUID, status: str) -> bool:
+        """Mark a duplicate record as resolved (ignored, reviewed, etc)."""
+        return self._repo.update_status(duplicate_id, status)
+
+    def deactivate_chunk(self, chunk_id: UUID) -> bool:
+        """
+        Deactivate a chunk in both SQL and Vector Store.
+        """
+        # 1. Update SQL status to inactive
+        success = self._chunk_repo.update_is_active(chunk_id, False)
+        if success:
+            # 2. Remove from Vector Store to stop it from appearing in RAG
+            self._vector_service.delete_by_id(chunk_id)
+            return True
+        return False
diff --git a/src/presentation/api/dependencies.py b/src/presentation/api/dependencies.py
index 2c2b9bee..d98d2e65 100644
--- a/src/presentation/api/dependencies.py
+++ b/src/presentation/api/dependencies.py
@@ -42,11 +42,14 @@
 from src.domain.interfaces.repository.retriver_repository import IVectorRepository
 from src.domain.interfaces.services.i_event_bus import IEventBus
 from src.domain.interfaces.services.i_task_queue import ITaskQueue
+from src.infrastructure.connectors.connector_sql import Session as DBSessionFactory
 from src.infrastructure.extractors.crawl4ai_extractor import Crawl4AIExtractor
+from src.infrastructure.repositories.sql.chunk_duplicate_repository import (
+    ChunkDuplicateSQLRepository,
+)
 from src.infrastructure.repositories.sql.chunk_index_repository import (
     ChunkIndexSQLRepository,
 )
-from src.infrastructure.repositories.sql.connector import Session as DBSessionFactory
 from src.infrastructure.repositories.sql.content_source_repository import (
     ContentSourceSQLRepository,
 )
@@ -61,6 +64,7 @@
 )
 from src.infrastructure.repositories.sql.user_repository import UserSQLRepository
 from src.infrastructure.services.auth_service import AuthService
+from src.infrastructure.services.chunk_duplicate_service import ChunkDuplicateService
 from src.infrastructure.services.chunk_index_service import ChunkIndexService
 from src.infrastructure.services.chunk_vector_service import ChunkVectorService
 from src.infrastructure.services.content_source_service import ContentSourceService
@@ -117,6 +121,10 @@ def get_user_repo() -> UserSQLRepository:
     return UserSQLRepository()
 
 
+def get_duplicate_repo() -> ChunkDuplicateSQLRepository:
+    return ChunkDuplicateSQLRepository()
+
+
 # Services
 def get_model_loader(request: Request) -> ModelLoaderService:
     return request.app.state.model_loader
@@ -295,6 +303,14 @@ def get_chunk_index_service(
     return ChunkIndexService(repo)
 
 
+def get_duplicate_service(
+    repo: ChunkDuplicateSQLRepository = Depends(get_duplicate_repo),
+    chunk_repo: ChunkIndexSQLRepository = Depends(get_chunk_repo),
+    vector_service: ChunkVectorService = Depends(get_chunk_vector_service),
+) -> ChunkDuplicateService:
+    return ChunkDuplicateService(repo, chunk_repo, vector_service)
+
+
 def get_youtube_vector_service(
     vector_repo: IVectorRepository = Depends(get_vector_repository),
 ) -> YouTubeVectorService:
diff --git a/src/presentation/api/routes/audio_diarization_and_recognition_router.py b/src/presentation/api/routes/audio_diarization_and_recognition_router.py
index 52e5eaf5..99b3900f 100644
--- a/src/presentation/api/routes/audio_diarization_and_recognition_router.py
+++ b/src/presentation/api/routes/audio_diarization_and_recognition_router.py
@@ -1,9 +1,9 @@
 import logging
 import traceback
-from typing import Annotated, Any, Optional, cast
+from typing import Annotated, Any, List, Optional, cast
 from uuid import UUID
 
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends, HTTPException, Query
 from sqlalchemy.orm import Session
 
 from src.application.dtos.commands.process_audio_command import ProcessAudioCommand
@@ -325,7 +325,7 @@ async def retrieve_all_processed_audio_history(
     use_case: Annotated[RetrieveProcessedAudioHistoryUseCase, Depends(get_retrieve_history_use_case)],
     limit: int = 10,
     offset: int = 0,
-    subject_id: str | None = None,
+    subject_id: Optional[List[str]] = Query(None),
 ):
     try:
         logger.info(
diff --git a/src/presentation/api/routes/duplicate_router.py b/src/presentation/api/routes/duplicate_router.py
new file mode 100644
index 00000000..ee482c8b
--- /dev/null
+++ b/src/presentation/api/routes/duplicate_router.py
@@ -0,0 +1,96 @@
+from typing import List, Optional
+from uuid import UUID
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+
+from src.infrastructure.services.chunk_duplicate_service import ChunkDuplicateService
+from src.infrastructure.services.chunk_index_service import ChunkIndexService
+from src.presentation.api.dependencies import (
+    get_chunk_index_service,
+    get_current_user,
+    get_duplicate_service,
+)
+from src.presentation.api.schemas.duplicate_schemas import (
+    ChunkDuplicateResponse,
+    ChunkDuplicateStatusUpdate,
+    ChunkMinimal,
+    PaginatedChunkDuplicateResponse,
+)
+
+router = APIRouter(tags=["duplicates"])
+
+
+@router.get("", response_model=PaginatedChunkDuplicateResponse)
+def list_duplicates(
+    status: Optional[str] = None,
+    subject_id: Optional[List[str]] = Query(None),
+    limit: int = 100,
+    offset: int = 0,
+    service: ChunkDuplicateService = Depends(get_duplicate_service),
+    chunk_service: ChunkIndexService = Depends(get_chunk_index_service),
+    user=Depends(get_current_user),
+):
+    """List all detected chunk duplicate groups."""
+    entities, total = service.list_duplicates(status=status, subject_ids=subject_id, limit=limit, offset=offset)
+    
+    # Enrich entities with chunk content if needed for UI
+    results = []
+    for entity in entities:
+        resp = ChunkDuplicateResponse.model_validate(entity)
+        chunks_info = []
+        for cid in entity.chunk_ids:
+            chunk = chunk_service.get_by_id(cid)
+            if chunk:
+                chunks_info.append(ChunkMinimal(
+                    id=chunk.id,
+                    content=chunk.content or "",
+                    source_title=chunk.extra.get("source_title", "Unknown"),
+                    source_id=chunk.content_source_id
+                ))
+        resp.chunks = chunks_info
+        results.append(resp)
+        
+    return PaginatedChunkDuplicateResponse(results=results, total=total)
+
+
+@router.patch("/{duplicate_id}/status")
+def update_duplicate_status(
+    duplicate_id: UUID,
+    cmd: ChunkDuplicateStatusUpdate,
+    service: ChunkDuplicateService = Depends(get_duplicate_service),
+    user=Depends(get_current_user),
+):
+    """Update the resolution status of a duplicate group."""
+    success = service.resolve_duplicate(duplicate_id, cmd.status)
+    if not success:
+        raise HTTPException(status_code=404, detail="Duplicate group not found")
+    return {"status": "success"}
+
+
+@router.post("/chunks/{chunk_id}/deactivate")
+def deactivate_chunk(
+    chunk_id: UUID,
+    service: ChunkDuplicateService = Depends(get_duplicate_service),
+    user=Depends(get_current_user),
+):
+    """Deactivate a specific chunk (soft delete from RAG)."""
+    success = service.deactivate_chunk(chunk_id)
+    if not success:
+        raise HTTPException(status_code=404, detail="Chunk not found")
+    return {"status": "success"}
+
+
+@router.post("/analyze-all")
+def analyze_all_chunks(
+    service: ChunkDuplicateService = Depends(get_duplicate_service),
+    chunk_service: ChunkIndexService = Depends(get_chunk_index_service),
+    user=Depends(get_current_user),
+):
+    """Run duplicate detection analysis on all existing chunks (heavy operation)."""
+    # This should probably be a background task, but for now we'll do it synchronously
+    # or just list everything and iterate
+    all_chunks = chunk_service.list_chunks(limit=1000) # Limit for safety
+    chunk_ids = [c.id for c in all_chunks]
+    
+    count = service.find_and_register_duplicates(chunk_ids)
+    return {"status": "success", "groups_found": count}
diff --git a/src/presentation/api/routes/ingest_router.py b/src/presentation/api/routes/ingest_router.py
index fbda4c14..1615f2ef 100644
--- a/src/presentation/api/routes/ingest_router.py
+++ b/src/presentation/api/routes/ingest_router.py
@@ -26,12 +26,14 @@
 from src.domain.entities.user import User
 from src.domain.interfaces.services.i_task_queue import ITaskQueue
 from src.infrastructure.services.content_source_service import ContentSourceService
+from src.infrastructure.services.ingestion_job_service import IngestionJobService
 from src.presentation.api.dependencies import (
     get_cs_service,
     get_current_user,
     get_diarization_ingestion_use_case,
     get_file_ingestion_use_case,
     get_ingest_youtube_use_case,
+    get_job_service,
     get_task_queue_service,
     get_web_scraping_use_case,
 )
@@ -63,7 +65,9 @@ def ingest_youtube(
     request: Annotated[YoutubeIngestRequest, Body()],
     use_case: Annotated[YoutubeIngestionUseCase, Depends(get_ingest_youtube_use_case)],
     task_queue: Annotated[ITaskQueue, Depends(get_task_queue_service)],
-):
+    job_service: Annotated[IngestionJobService, Depends(get_job_service)],
+    current_user: Annotated[User, Depends(get_current_user)],
+) -> IngestResponse:
     """
     Ingest data from YouTube videos or playlists into the vector store.
     """
@@ -119,15 +123,31 @@ def ingest_youtube(
 
         logger.info("Running ingestion in background via queue", context={"reason": reason})
 
+        # Ensure we have a job ID for background tasks so they are visible in UI
+        job_id = request.ingestion_job_id
+        if not job_id:
+            job_title = request.title or request.video_url or f"YouTube {reason.capitalize()}"
+            job = job_service.create_job(
+                source_title=job_title,
+                external_source=request.video_url or (request.video_urls[0] if request.video_urls else None),
+                subject_id=request.subject_id,
+                ingestion_type="YOUTUBE",
+                status="INITIALIZING",
+                status_message=f"Starting YouTube {reason} ingestion...",
+            )
+            job_id = str(job.id)
+            cmd.ingestion_job_id = job_id
+
         task_queue.enqueue(
             worker,
             cmd,
-            task_title=request.title or request.video_url or "YouTube Ingestion",
-            metadata={"job_id": str(request.ingestion_job_id)} if request.ingestion_job_id else {},
+            task_title=request.title or request.video_url or f"YouTube {reason.capitalize()}",
+            metadata={"job_id": job_id},
         )
         return IngestResponse(
             skipped=False,
-            reason="Ingestion started in background queue.",
+            reason=f"Ingestion started in background queue (Job: {job_id}).",
+            job_id=UUID(job_id) if job_id else None,
         )
 
     try:
diff --git a/src/presentation/api/routes/settings_router.py b/src/presentation/api/routes/settings_router.py
index 79bcebd6..b139a477 100644
--- a/src/presentation/api/routes/settings_router.py
+++ b/src/presentation/api/routes/settings_router.py
@@ -6,7 +6,7 @@
 
 from src.config.settings import Settings
 from src.domain.interfaces.repository.retriver_repository import IVectorRepository
-from src.infrastructure.repositories.sql.connector import Connector
+from src.infrastructure.connectors.connector_sql import Connector
 from src.presentation.api.dependencies import get_settings, get_vector_repository
 from src.presentation.api.schemas.settings_schemas import (
     AppSettingsSchema,
diff --git a/src/presentation/api/schemas/duplicate_schemas.py b/src/presentation/api/schemas/duplicate_schemas.py
new file mode 100644
index 00000000..e429803f
--- /dev/null
+++ b/src/presentation/api/schemas/duplicate_schemas.py
@@ -0,0 +1,35 @@
+from datetime import datetime
+from typing import List, Optional
+from uuid import UUID
+
+from pydantic import BaseModel, ConfigDict
+
+
+class ChunkMinimal(BaseModel):
+    id: UUID
+    content: str
+    source_title: Optional[str] = None
+    source_id: Optional[UUID] = None
+
+    model_config = ConfigDict(from_attributes=True)
+
+
+class ChunkDuplicateResponse(BaseModel):
+    id: UUID
+    chunk_ids: List[UUID]
+    chunks: Optional[List[ChunkMinimal]] = None  # Enriched chunks for UI
+    similarity: float
+    status: str
+    created_at: datetime
+    updated_at: datetime
+
+    model_config = ConfigDict(from_attributes=True)
+
+
+class ChunkDuplicateStatusUpdate(BaseModel):
+    status: str
+
+
+class PaginatedChunkDuplicateResponse(BaseModel):
+    results: List[ChunkDuplicateResponse]
+    total: int
diff --git a/test_dispatcher.py b/test_dispatcher.py
new file mode 100644
index 00000000..feea32ab
--- /dev/null
+++ b/test_dispatcher.py
@@ -0,0 +1,60 @@
+import os
+import sys
+from unittest.mock import MagicMock
+
+# Add the project root to sys.path
+sys.path.append(os.getcwd())
+
+# Mock out dependencies before importing workers
+sys.modules['src.presentation.api.dependencies'] = MagicMock()
+mock_job_service = MagicMock()
+
+class MockContext:
+    def __init__(self, job_svc):
+        self.job_service = job_svc
+
+import src.presentation.api.dependencies as deps
+
+deps.resolve_ingestion_context = MagicMock(return_value=MockContext(mock_job_service))
+
+from src.application.dtos.commands.ingest_youtube_command import IngestYoutubeCommand
+from src.application.dtos.enums.youtube_data_type import YoutubeDataType
+from src.application.workers import run_youtube_dispatcher_worker
+
+
+def test_dispatcher():
+    # Setup mock app state
+    mock_app = MagicMock()
+    mock_app.state.task_queue = MagicMock()
+    
+    # Patch _get_app to return our mock
+    import src.application.workers as workers
+    workers._get_app = MagicMock(return_value=mock_app)
+    
+    # Create command for a small playlist (or the user's one)
+    # Using a known small playlist to speed up test if possible
+    # But user's URL is fine since we are testing logic
+    url = "https://www.youtube.com/watch?v=dlQG02mwTD0&list=PLG47XsLEf0LdvYtX_zU7E_y_C1TgjgU59"
+    cmd = IngestYoutubeCommand(
+        video_url=url,
+        data_type=YoutubeDataType.PLAYLIST,
+        ingestion_job_id="test-job-uuid",
+        subject_id="test-subject"
+    )
+    
+    print(f"Testing dispatcher with URL: {url}")
+    run_youtube_dispatcher_worker(cmd)
+    
+    # Assertions
+    print("\nVerifying Job Service calls:")
+    for call in mock_job_service.update_job_status.call_args_list:
+        print(f"  - Status update: {call[1]}")
+        
+    print("\nVerifying Task Queue calls:")
+    enqueue_calls = mock_app.state.task_queue.enqueue.call_args_list
+    print(f"  - Tasks enqueued: {len(enqueue_calls)}")
+    if len(enqueue_calls) > 0:
+        print(f"  - First task URL: {enqueue_calls[0][0][1].video_url}")
+
+if __name__ == "__main__":
+    test_dispatcher()
diff --git a/test_playlist.py b/test_playlist.py
new file mode 100644
index 00000000..c9f17e06
--- /dev/null
+++ b/test_playlist.py
@@ -0,0 +1,25 @@
+from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor
+
+
+def test_playlist():
+    # Use the playlist provided by the user
+    playlist_url = "https://www.youtube.com/watch?v=dlQG02mwTD0&list=PLG47XsLEf0LdvYtX_zU7E_y_C1TgjgU59"
+    
+    print(f"Testing playlist extraction for: {playlist_url}")
+    extractor = YoutubeExtractor(language="pt")
+    
+    try:
+        videos = extractor.extract_playlist_videos(playlist_url)
+        print(f"Extracted {len(videos)} videos.")
+        for i, url in enumerate(videos[:5]):
+            print(f"  {i+1}: {url}")
+            
+        if not videos:
+            print("FAILED: No videos extracted.")
+        else:
+            print("SUCCESS: Videos extracted.")
+    except Exception as e:
+        print(f"ERROR: {e}")
+
+if __name__ == "__main__":
+    test_playlist()
diff --git a/tests/conftest.py b/tests/conftest.py
index 859a65a6..39214ce2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -8,7 +8,7 @@
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 
-import src.infrastructure.repositories.sql.connector as connector
+import src.infrastructure.connectors.connector_sql as connector
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py b/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py
new file mode 100644
index 00000000..cd88f6f6
--- /dev/null
+++ b/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py
@@ -0,0 +1,90 @@
+import uuid
+
+import pytest
+
+from src.infrastructure.repositories.sql.chunk_duplicate_repository import ChunkDuplicateSQLRepository
+from src.infrastructure.repositories.sql.models.content_source import ContentSourceModel
+from src.infrastructure.repositories.sql.models.knowledge_subject import KnowledgeSubjectModel
+
+
+@pytest.mark.ChunkDuplicateSQLRepository
+def test_create_duplicate_record(sqlite_memory):
+    """Test creating a duplicate record in the repository."""
+    repo = ChunkDuplicateSQLRepository()
+    chunk_ids = [str(uuid.uuid4()), str(uuid.uuid4())]
+    similarity = 0.95
+    status = "pending"
+    
+    record = repo.create_duplicate_record(chunk_ids, similarity, status)
+    
+    assert record.id is not None
+    assert record.chunk_ids == chunk_ids
+    assert record.similarity == pytest.approx(similarity)
+    assert record.status == status
+
+@pytest.mark.ChunkDuplicateSQLRepository
+def test_list_duplicates_filtering(sqlite_memory):
+    """Test listing duplicates with status and subject filtering."""
+    db = sqlite_memory
+    repo = ChunkDuplicateSQLRepository()
+    
+    # Create a subject and content source
+    subject = KnowledgeSubjectModel(name="Test Subject")
+    db.add(subject)
+    db.commit()
+    db.refresh(subject)
+    
+    source = ContentSourceModel(
+        subject_id=subject.id,
+        source_type="file",
+        external_source="test.txt"
+    )
+    db.add(source)
+    db.commit()
+    db.refresh(source)
+    
+    # Create duplicate records
+    repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending", content_source_id=source.id)
+    repo.create_duplicate_record([str(uuid.uuid4())], 0.8, "reviewed", content_source_id=source.id)
+    
+    # List all
+    _, total = repo.list_duplicates()
+    assert total == 2
+    
+    # Filter by status
+    pending_items, total = repo.list_duplicates(status="pending")
+    assert total == 1
+    assert pending_items[0].status == "pending"
+    
+    # Filter by subject_id
+    _, total = repo.list_duplicates(subject_ids=[str(subject.id)])
+    assert total == 2
+    
+    # Filter with non-existent subject
+    _, total = repo.list_duplicates(subject_ids=[str(uuid.uuid4())])
+    assert total == 0
+
+@pytest.mark.ChunkDuplicateSQLRepository
+def test_update_status(sqlite_memory):
+    """Test updating the status of a duplicate record."""
+    repo = ChunkDuplicateSQLRepository()
+    record = repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending")
+    
+    success = repo.update_status(record.id, "reviewed")
+    assert success is True
+    
+    updated = repo.get_by_id(record.id)
+    assert updated is not None
+    assert updated.status == "reviewed"
+
+@pytest.mark.ChunkDuplicateSQLRepository
+def test_delete_record(sqlite_memory):
+    """Test deleting a duplicate record."""
+    repo = ChunkDuplicateSQLRepository()
+    record = repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending")
+    
+    success = repo.delete_record(record.id)
+    assert success is True
+    
+    deleted = repo.get_by_id(record.id)
+    assert deleted is None
diff --git a/tests/infrastructure/services/test_chunk_duplicate_service.py b/tests/infrastructure/services/test_chunk_duplicate_service.py
new file mode 100644
index 00000000..0a58a3b6
--- /dev/null
+++ b/tests/infrastructure/services/test_chunk_duplicate_service.py
@@ -0,0 +1,86 @@
+import uuid
+from unittest.mock import MagicMock
+
+import pytest
+
+from src.domain.entities.enums.search_mode_enum import SearchMode
+from src.infrastructure.services.chunk_duplicate_service import ChunkDuplicateService
+
+
+@pytest.fixture
+def mock_repos():
+    repo = MagicMock()
+    chunk_repo = MagicMock()
+    vector_svc = MagicMock()
+    return repo, chunk_repo, vector_svc
+
+def test_find_and_register_duplicates(mock_repos):
+    """Test finding and registering duplicates."""
+    repo, chunk_repo, vector_svc = mock_repos
+    service = ChunkDuplicateService(repo, chunk_repo, vector_svc)
+    
+    chunk_id = uuid.uuid4()
+    mock_chunk = MagicMock()
+    mock_chunk.id = chunk_id
+    mock_chunk.content = "Duplicate test content"
+    mock_chunk.content_source_id = str(uuid.uuid4())
+    chunk_repo.get_by_id.return_value = mock_chunk
+    
+    # Mock similar chunks found
+    sim_chunk = MagicMock()
+    sim_chunk.id = uuid.uuid4()
+    sim_chunk.score = 0.95
+    vector_svc.retrieve.return_value = [sim_chunk]
+    
+    count = service.find_and_register_duplicates([chunk_id], similarity_threshold=0.90)
+    
+    assert count == 1
+    vector_svc.retrieve.assert_called_once_with(
+        query=mock_chunk.content,
+        top_k=5,
+        search_mode=SearchMode.SEMANTIC,
+        re_rank=False
+    )
+    repo.create_duplicate_record.assert_called_once()
+    
+    # Check arguments of create_duplicate_record
+    _, kwargs = repo.create_duplicate_record.call_args
+    assert kwargs['similarity'] == pytest.approx(0.95)
+    assert str(chunk_id) in kwargs['chunk_ids']
+    assert str(sim_chunk.id) in kwargs['chunk_ids']
+
+def test_find_and_register_no_duplicates(mock_repos):
+    """Test when no duplicates are found above threshold."""
+    repo, chunk_repo, vector_svc = mock_repos
+    service = ChunkDuplicateService(repo, chunk_repo, vector_svc)
+    
+    chunk_id = uuid.uuid4()
+    mock_chunk = MagicMock()
+    mock_chunk.id = chunk_id
+    mock_chunk.content = "Unique content"
+    chunk_repo.get_by_id.return_value = mock_chunk
+    
+    # Sim chunk with low score
+    sim_chunk = MagicMock()
+    sim_chunk.id = uuid.uuid4()
+    sim_chunk.score = 0.5
+    vector_svc.retrieve.return_value = [sim_chunk]
+    
+    count = service.find_and_register_duplicates([chunk_id], similarity_threshold=0.90)
+    
+    assert count == 0
+    repo.create_duplicate_record.assert_not_called()
+
+def test_deactivate_chunk(mock_repos):
+    """Test deactivating a chunk."""
+    repo, chunk_repo, vector_svc = mock_repos
+    service = ChunkDuplicateService(repo, chunk_repo, vector_svc)
+    
+    chunk_id = uuid.uuid4()
+    chunk_repo.update_is_active.return_value = True
+    
+    success = service.deactivate_chunk(chunk_id)
+    
+    assert success is True
+    chunk_repo.update_is_active.assert_called_once_with(chunk_id, False)
+    vector_svc.delete_by_id.assert_called_once_with(chunk_id)
diff --git a/tests/presentation/api/routes/test_duplicate_router.py b/tests/presentation/api/routes/test_duplicate_router.py
new file mode 100644
index 00000000..ac2c4bb0
--- /dev/null
+++ b/tests/presentation/api/routes/test_duplicate_router.py
@@ -0,0 +1,74 @@
+import uuid
+from unittest.mock import MagicMock
+
+import pytest
+from fastapi.testclient import TestClient
+
+from main import app
+from src.presentation.api.dependencies import (
+    get_chunk_index_service,
+    get_duplicate_repo,
+    get_duplicate_service,
+)
+
+client = TestClient(app)
+
+@pytest.mark.DuplicateRouter
+class TestDuplicateRouter:
+    def test_list_duplicates(self):
+        mock_repo = MagicMock()
+        app.dependency_overrides[get_duplicate_repo] = lambda: mock_repo
+        
+        mock_repo.list_duplicates.return_value = ([], 0)
+        
+        response = client.get("/rest/duplicates")
+        assert response.status_code == 200
+        assert response.json()["total"] == 0
+        
+        app.dependency_overrides.clear()
+
+    def test_update_duplicate_status(self):
+        mock_repo = MagicMock()
+        app.dependency_overrides[get_duplicate_repo] = lambda: mock_repo
+        
+        # Use a service mock instead because the router calls resolved_duplicate on service
+        # Wait, the router calls service.resolve_duplicate
+        mock_service = MagicMock()
+        app.dependency_overrides[get_duplicate_service] = lambda: mock_service
+        
+        duplicate_id = str(uuid.uuid4())
+        mock_service.resolve_duplicate.return_value = True
+        
+        response = client.patch(f"/rest/duplicates/{duplicate_id}/status", json={"status": "reviewed"})
+        assert response.status_code == 200
+        assert response.json()["status"] == "success"
+        
+        app.dependency_overrides.clear()
+
+    def test_deactivate_chunk(self):
+        mock_service = MagicMock()
+        app.dependency_overrides[get_duplicate_service] = lambda: mock_service
+        
+        chunk_id = str(uuid.uuid4())
+        mock_service.deactivate_chunk.return_value = True
+        
+        response = client.post(f"/rest/duplicates/chunks/{chunk_id}/deactivate")
+        assert response.status_code == 200
+        assert response.json()["status"] == "success"
+        
+        app.dependency_overrides.clear()
+
+    def test_trigger_duplicate_analysis(self):
+        mock_service = MagicMock()
+        app.dependency_overrides[get_duplicate_service] = lambda: mock_service
+        mock_chunk_service = MagicMock()
+        app.dependency_overrides[get_chunk_index_service] = lambda: mock_chunk_service
+        
+        mock_chunk_service.list_chunks.return_value = []
+        mock_service.find_and_register_duplicates.return_value = 0
+        
+        response = client.post("/rest/duplicates/analyze-all")
+        assert response.status_code == 200
+        assert response.json()["status"] == "success"
+        
+        app.dependency_overrides.clear()
diff --git a/tmp/cleanup_db.py b/tmp/cleanup_db.py
new file mode 100644
index 00000000..09417232
--- /dev/null
+++ b/tmp/cleanup_db.py
@@ -0,0 +1,27 @@
+from sqlalchemy import create_engine, text
+
+from src.config.settings import settings
+
+engine = create_engine(settings.sql.url)
+
+with engine.connect() as conn:
+    try:
+        conn.execute(text("DROP TABLE IF EXISTS chunk_duplicates"))
+        print("Dropped chunk_duplicates table if it existed.")
+    except Exception as e:
+        print(f"Error dropping chunk_duplicates: {e}")
+
+    try:
+        # Check if is_active exists in chunk_index
+        res = conn.execute(text("PRAGMA table_info(chunk_index)"))
+        columns = [row[1] for row in res]
+        if 'is_active' in columns:
+            print("is_active already exists in chunk_index. Attempting to drop it (batch mode needed for SQLite).")
+            # For simplicity in this scratch script, I'll just note it.
+            # Usually we group these with migrations.
+        else:
+            print("is_active does not exist in chunk_index.")
+    except Exception as e:
+        print(f"Error checking chunk_index: {e}")
+
+    conn.commit()

From f4cf6a68d9cfc903a2ff26f8d36cdb6fe4a6db81 Mon Sep 17 00:00:00 2001
From: ericksonlopes <ofc.erickson@gmail.com>
Date: Wed, 8 Apr 2026 12:17:29 -0300
Subject: [PATCH 4/7] style: fix lint and typing errors across project

---
 .../retrieve_processed_audio_history.py       |  2 +-
 .../sql/chunk_duplicate_repository.py         | 21 ++++++++++++++++---
 .../services/chunk_duplicate_service.py       |  7 ++++++-
 test_dispatcher.py                            | 10 ++++-----
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/src/application/use_cases/retrieve_processed_audio_history.py b/src/application/use_cases/retrieve_processed_audio_history.py
index 494d8f4a..084f649e 100644
--- a/src/application/use_cases/retrieve_processed_audio_history.py
+++ b/src/application/use_cases/retrieve_processed_audio_history.py
@@ -10,7 +10,7 @@ class RetrieveProcessedAudioHistoryUseCase:
     def __init__(self, db: Session):
         self.repo = DiarizationRepository(db)
 
-    def execute(self, limit: int = 10, offset: int = 0, subject_id: str | None = None) -> list[dict]:
+    def execute(self, limit: int = 10, offset: int = 0, subject_id: str | list[str] | None = None) -> list[dict]:
         records = self.repo.get_all(limit=limit, offset=offset, subject_id=subject_id)
 
         return [
diff --git a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py
index 911f864b..8caf11ec 100644
--- a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py
+++ b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py
@@ -14,7 +14,13 @@
 class ChunkDuplicateSQLRepository:
     """Repository for managing duplicate chunk records in SQL."""
 
-    def create_duplicate_record(self, chunk_ids: List[UUID], similarity: float, status: str = "pending", content_source_id: Optional[str] = None) -> ChunkDuplicateModel:
+    def create_duplicate_record(
+        self,
+        chunk_ids: List[UUID],
+        similarity: float,
+        status: str = "pending",
+        content_source_id: Optional[str] = None
+    ) -> ChunkDuplicateModel:
         """Create a new duplicate grouping record."""
         with Connector() as session:
             try:
@@ -36,7 +42,13 @@ def create_duplicate_record(self, chunk_ids: List[UUID], similarity: float, stat
                 )
                 raise
 
-    def list_duplicates(self, status: Optional[str] = None, subject_ids: Optional[List[str]] = None, limit: int = 100, offset: int = 0) -> tuple[List[ChunkDuplicateModel], int]:
+    def list_duplicates(
+        self,
+        status: Optional[str] = None,
+        subject_ids: Optional[List[str]] = None,
+        limit: int = 100,
+        offset: int = 0
+    ) -> tuple[List[ChunkDuplicateModel], int]:
         """List duplicate records with optional status and context filtering."""
         with Connector() as session:
             query = session.query(ChunkDuplicateModel)
@@ -94,5 +106,8 @@ def delete_record(self, duplicate_id: Any) -> bool:
                 return False
             except Exception as e:
                 session.rollback()
-                logger.error("Error deleting duplicate record", context={"duplicate_id": str(duplicate_id), "error": str(e)})
+                logger.error(
+                    "Error deleting duplicate record",
+                    context={"duplicate_id": str(duplicate_id), "error": str(e)}
+                )
                 raise
diff --git a/src/infrastructure/services/chunk_duplicate_service.py b/src/infrastructure/services/chunk_duplicate_service.py
index a50954bc..4edd7b82 100644
--- a/src/infrastructure/services/chunk_duplicate_service.py
+++ b/src/infrastructure/services/chunk_duplicate_service.py
@@ -64,7 +64,12 @@ def find_and_register_duplicates(
 
         return registered_count
 
-    def _filter_duplicates(self, source_id: UUID, similar_chunks: List[Any], threshold: float) -> List[tuple[UUID, float]]:
+    def _filter_duplicates(
+        self,
+        source_id: UUID,
+        similar_chunks: List[Any],
+        threshold: float
+    ) -> List[tuple[UUID, float]]:
         """Filter results to find valid duplicates above threshold."""
         duplicates = []
         source_id_str = str(source_id)
diff --git a/test_dispatcher.py b/test_dispatcher.py
index feea32ab..b1fee754 100644
--- a/test_dispatcher.py
+++ b/test_dispatcher.py
@@ -13,13 +13,13 @@ class MockContext:
     def __init__(self, job_svc):
         self.job_service = job_svc
 
-import src.presentation.api.dependencies as deps
+import src.presentation.api.dependencies as deps  # noqa: E402
 
 deps.resolve_ingestion_context = MagicMock(return_value=MockContext(mock_job_service))
 
-from src.application.dtos.commands.ingest_youtube_command import IngestYoutubeCommand
-from src.application.dtos.enums.youtube_data_type import YoutubeDataType
-from src.application.workers import run_youtube_dispatcher_worker
+from src.application.dtos.commands.ingest_youtube_command import IngestYoutubeCommand  # noqa: E402
+from src.application.dtos.enums.youtube_data_type import YoutubeDataType  # noqa: E402
+from src.application.workers import run_youtube_dispatcher_worker  # noqa: E402
 
 
 def test_dispatcher():
@@ -28,7 +28,7 @@ def test_dispatcher():
     mock_app.state.task_queue = MagicMock()
     
     # Patch _get_app to return our mock
-    import src.application.workers as workers
+    import src.application.workers as workers  # noqa: E402
     workers._get_app = MagicMock(return_value=mock_app)
     
     # Create command for a small playlist (or the user's one)

From 99ad8d145fac2ac7a2bd9a51c302eec7dfc52d39 Mon Sep 17 00:00:00 2001
From: ericksonlopes <ofc.erickson@gmail.com>
Date: Wed, 8 Apr 2026 13:06:46 -0300
Subject: [PATCH 5/7] feat: complete chunk duplication system with worker
 integration and lint fixes

---
 ...c845_add_chunk_duplicates_table_and_is_.py | 32 ++++---
 ...673_add_content_source_id_to_duplicates.py | 24 +++--
 src/application/workers.py                    | 94 +++++++++++--------
 .../extractors/youtube_extractor.py           |  4 +-
 .../sql/chunk_duplicate_repository.py         | 35 +++----
 .../services/chunk_duplicate_service.py       | 53 +++++------
 src/presentation/__init__.py                  |  1 +
 .../api/routes/duplicate_router.py            | 22 +++--
 src/presentation/api/routes/ingest_router.py  | 25 ++++-
 test_dispatcher.py                            | 23 ++---
 test_playlist.py                              |  9 +-
 .../sql/test_chunk_duplicate_repository.py    | 35 ++++---
 .../services/test_chunk_duplicate_service.py  | 38 ++++----
 .../api/routes/test_duplicate_router.py       | 27 +++---
 tmp/cleanup_db.py                             |  2 +-
 15 files changed, 226 insertions(+), 198 deletions(-)
 create mode 100644 src/presentation/__init__.py

diff --git a/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py b/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py
index 65229188..ba6f38c1 100644
--- a/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py
+++ b/alembic/versions/646a175ac845_add_chunk_duplicates_table_and_is_.py
@@ -5,6 +5,7 @@
 Create Date: 2026-04-08 09:56:58.625813
 
 """
+
 from typing import Sequence, Union
 
 import sqlalchemy as sa
@@ -12,8 +13,8 @@
 from alembic import op
 
 # revision identifiers, used by Alembic.
-revision: str = '646a175ac845'
-down_revision: Union[str, Sequence[str], None] = 'b2c3d4e5f6a7'
+revision: str = "646a175ac845"
+down_revision: Union[str, Sequence[str], None] = "b2c3d4e5f6a7"
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
 
@@ -21,22 +22,27 @@
 def upgrade() -> None:
     """Upgrade schema."""
     # ### commands auto generated by Alembic - please adjust! ###
-    op.create_table('chunk_duplicates',
-        sa.Column('id', sa.UUID(), nullable=False),
-        sa.Column('chunk_ids', sa.JSON(), nullable=False),
-        sa.Column('similarity', sa.Float(), nullable=False),
-        sa.Column('status', sa.Text(), nullable=False),
-        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
-        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
-        sa.PrimaryKeyConstraint('id')
+    op.create_table(
+        "chunk_duplicates",
+        sa.Column("id", sa.UUID(), nullable=False),
+        sa.Column("chunk_ids", sa.JSON(), nullable=False),
+        sa.Column("similarity", sa.Float(), nullable=False),
+        sa.Column("status", sa.Text(), nullable=False),
+        sa.Column(
+            "created_at", sa.DateTime(timezone=True), server_default=sa.text("(CURRENT_TIMESTAMP)"), nullable=False
+        ),
+        sa.Column(
+            "updated_at", sa.DateTime(timezone=True), server_default=sa.text("(CURRENT_TIMESTAMP)"), nullable=False
+        ),
+        sa.PrimaryKeyConstraint("id"),
     )
-    op.add_column('chunk_index', sa.Column('is_active', sa.Boolean(), server_default=sa.text('1'), nullable=False))
+    op.add_column("chunk_index", sa.Column("is_active", sa.Boolean(), server_default=sa.text("1"), nullable=False))
     # ### end Alembic commands ###
 
 
 def downgrade() -> None:
     """Downgrade schema."""
     # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_column('chunk_index', 'is_active')
-    op.drop_table('chunk_duplicates')
+    op.drop_column("chunk_index", "is_active")
+    op.drop_table("chunk_duplicates")
     # ### end Alembic commands ###
diff --git a/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py b/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py
index f0ac6124..163be91b 100644
--- a/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py
+++ b/alembic/versions/84524e052673_add_content_source_id_to_duplicates.py
@@ -5,6 +5,7 @@
 Create Date: 2026-04-08 10:50:39.027257
 
 """
+
 from typing import Sequence, Union
 
 import sqlalchemy as sa
@@ -12,21 +13,28 @@
 from alembic import op
 
 # revision identifiers, used by Alembic.
-revision: str = '84524e052673'
-down_revision: Union[str, Sequence[str], None] = '646a175ac845'
+revision: str = "84524e052673"
+down_revision: Union[str, Sequence[str], None] = "646a175ac845"
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
 
 
 def upgrade() -> None:
     """Upgrade schema."""
-    with op.batch_alter_table('chunk_duplicates', schema=None) as batch_op:
-        batch_op.add_column(sa.Column('content_source_id', sa.UUID(), nullable=True))
-        batch_op.create_foreign_key('fk_chunk_duplicates_content_source_id_content_sources', 'content_sources', ['content_source_id'], ['id'], initially='IMMEDIATE', deferrable=True)
+    with op.batch_alter_table("chunk_duplicates", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("content_source_id", sa.UUID(), nullable=True))
+        batch_op.create_foreign_key(
+            "fk_chunk_duplicates_content_source_id_content_sources",
+            "content_sources",
+            ["content_source_id"],
+            ["id"],
+            initially="IMMEDIATE",
+            deferrable=True,
+        )
 
 
 def downgrade() -> None:
     """Downgrade schema."""
-    with op.batch_alter_table('chunk_duplicates', schema=None) as batch_op:
-        batch_op.drop_constraint('fk_chunk_duplicates_content_source_id_content_sources', type_='foreignkey')
-        batch_op.drop_column('content_source_id')
+    with op.batch_alter_table("chunk_duplicates", schema=None) as batch_op:
+        batch_op.drop_constraint("fk_chunk_duplicates_content_source_id_content_sources", type_="foreignkey")
+        batch_op.drop_column("content_source_id")
diff --git a/src/application/workers.py b/src/application/workers.py
index 7ff71021..67e9ad01 100644
--- a/src/application/workers.py
+++ b/src/application/workers.py
@@ -1,5 +1,6 @@
 import logging
 from typing import Any
+from uuid import UUID
 
 from src.application.dtos.commands.ingest_diarization_command import (
     IngestDiarizationCommand,
@@ -10,6 +11,7 @@
 from src.application.dtos.commands.process_audio_command import ProcessAudioCommand
 from src.application.dtos.commands.train_voice_command import TrainVoiceCommand
 from src.application.service_registry import registry
+from src.domain.entities.enums.ingestion_job_status_enum import IngestionJobStatus
 from src.infrastructure.loggers.std_logger import (
     clear_global_context,
     set_global_context,
@@ -113,6 +115,10 @@ def run_youtube_ingestion_worker(cmd: IngestYoutubeCommand):
         )
 
         ctx = resolve_ingestion_context(app)
+        if not ctx:
+            logger.error("Could not resolve ingestion context for YouTube worker")
+            return
+
         vector_repo = resolve_vector_repository(app)
         vector_svc = YouTubeVectorService(vector_repo)
 
@@ -130,12 +136,13 @@ def run_youtube_ingestion_worker(cmd: IngestYoutubeCommand):
 
         result = use_case.execute(cmd)
 
-        # Enqueue duplicate detection
-        if result and "vector_ids" in result:
+        # Enqueue duplicate detection if we have vector IDs
+        if result and getattr(result, "vector_ids", None):
+            task_ids = [UUID(str(vid)) for vid in result.vector_ids]
             task_queue = app.state.task_queue
             task_queue.enqueue(
                 run_duplicate_detection_worker,
-                {"chunk_ids": result["vector_ids"]},
+                {"chunk_ids": task_ids},
                 task_title=f"Dup Check YouTube: {cmd.video_url}",
             )
     except Exception as e:
@@ -166,15 +173,20 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
 
         task_queue = app.state.task_queue
         context = resolve_ingestion_context(app)
+        if not context:
+            logger.error("Could not resolve ingestion context for YouTube dispatcher")
+            return
+
         job_service = context.job_service
         job_id = str(cmd.ingestion_job_id) if cmd.ingestion_job_id else None
 
-        if job_id:
-            job_service.update_job_status(
-                job_id,
-                status="PROCESSING",
+        if job_id and job_service:
+            job_service.update_job(
+                UUID(job_id),
+                status=IngestionJobStatus.PROCESSING,
                 status_message=f"Resolving {cmd.data_type} videos...",
-                progress=5,
+                current_step=5,
+                total_steps=100,
             )
 
         # 1. Resolve the full list of URLs
@@ -184,8 +196,10 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
             playlist_url = cmd.video_url or (cmd.video_urls[0] if cmd.video_urls else None)
             if not playlist_url:
                 logger.warning("No URL provided for playlist dispatcher")
-                if job_id:
-                    job_service.update_job_status(job_id, "FAILED", "Missing playlist URL.")
+                if job_id and job_service:
+                    job_service.update_job(
+                        UUID(job_id), status=IngestionJobStatus.FAILED, status_message="Missing playlist URL."
+                    )
                 return
 
             extractor = YoutubeExtractor(language=cmd.language)
@@ -195,8 +209,10 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
             channel_url = cmd.video_url or (cmd.video_urls[0] if cmd.video_urls else None)
             if not channel_url:
                 logger.warning("No URL provided for channel dispatcher")
-                if job_id:
-                    job_service.update_job_status(job_id, "FAILED", "Missing channel URL.")
+                if job_id and job_service:
+                    job_service.update_job(
+                        UUID(job_id), status=IngestionJobStatus.FAILED, status_message="Missing channel URL."
+                    )
                 return
 
             extractor = YoutubeExtractor(language=cmd.language)
@@ -207,20 +223,21 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
 
         if not video_list:
             logger.warning(f"YouTube Dispatcher resolved 0 videos for type {cmd.data_type}.")
-            if job_id:
-                job_service.update_job_status(
-                    job_id,
-                    "FAILED",
-                    f"No videos found in {cmd.data_type}. Verify if the URL is valid and public.",
+            if job_id and job_service:
+                job_service.update_job(
+                    UUID(job_id),
+                    status=IngestionJobStatus.FAILED,
+                    status_message=f"No videos found in {cmd.data_type}. Verify if the URL is valid and public.",
                 )
             return
 
-        if job_id:
-            job_service.update_job_status(
-                job_id,
-                status="PROCESSING",
+        if job_id and job_service:
+            job_service.update_job(
+                UUID(job_id),
+                status=IngestionJobStatus.PROCESSING,
                 status_message=f"Dispatched {len(video_list)} videos for ingestion.",
-                progress=50,
+                current_step=50,
+                total_steps=100,
             )
 
         logger.info(f"YouTube Dispatcher resolved {len(video_list)} videos. Enqueueing individual tasks...")
@@ -248,21 +265,22 @@ def run_youtube_dispatcher_worker(cmd: IngestYoutubeCommand):
             )
 
         logger.info(f"Successfully dispatched {len(video_list)} YouTube ingestion tasks.")
-        if job_id:
-            job_service.update_job_status(
-                job_id,
-                status="SUCCESS",
+        if job_id and job_service:
+            job_service.update_job(
+                UUID(job_id),
+                status=IngestionJobStatus.FINISHED,
                 status_message=f"Dispatched {len(video_list)} videos successfully.",
-                progress=100,
+                current_step=100,
+                total_steps=100,
             )
 
     except Exception as e:
         logger.error(f"YouTube Dispatcher Worker Error: {e}", exc_info=True)
-        if job_id:
+        if job_id and job_service:
             try:
-                job_service.update_job_status(job_id, "FAILED", str(e))
-            except Exception:
-                pass
+                job_service.update_job(UUID(job_id), status=IngestionJobStatus.FAILED, error_message=str(e))
+            except Exception as outer_e:
+                logger.warning(f"Failed to update job status during error cleanup: {outer_e}")
     finally:
         clear_global_context()
 
@@ -321,8 +339,8 @@ def run_diarization_ingestion_worker(cmd: IngestDiarizationCommand):
                 task_queue = app.state.task_queue
                 task_queue.enqueue(
                     run_duplicate_detection_worker,
-                    {"chunk_ids": result["vector_ids"]},
-                    task_title=f"Dup Check Diarization: {cmd.source}"
+                    {"chunk_ids": [UUID(v) if isinstance(v, str) else v for v in result["vector_ids"]]},
+                    task_title=f"Dup Check Diarization: {cmd.name or str(cmd.diarization_id)}",
                 )
         finally:
             db.close()
@@ -387,8 +405,8 @@ async def _run():
                 task_queue = app.state.task_queue
                 task_queue.enqueue(
                     run_duplicate_detection_worker,
-                    {"chunk_ids": result["vector_ids"]},
-                    task_title=f"Dup Check Web: {cmd.url}"
+                    {"chunk_ids": [UUID(v) if isinstance(v, str) else v for v in result["vector_ids"]]},
+                    task_title=f"Dup Check Web: {cmd.url}",
                 )
         except Exception as e:
             logging.getLogger(__name__).error(f"Worker Error: Failed to execute Web Scraping: {e}", exc_info=True)
@@ -696,12 +714,12 @@ def run_duplicate_detection_worker(cmd: dict):
         vector_repo = resolve_vector_repository(app)
         rerank_svc = resolve_rerank_service(app)
         vector_svc = get_chunk_vector_service(vector_repo, rerank_svc)
-        
+
         duplicate_repo = get_duplicate_repo()
         chunk_repo = get_chunk_repo()
-        
+
         service = ChunkDuplicateService(duplicate_repo, chunk_repo, vector_svc)
-        
+
         chunk_ids = cmd.get("chunk_ids", [])
         if not chunk_ids:
             return
diff --git a/src/infrastructure/extractors/youtube_extractor.py b/src/infrastructure/extractors/youtube_extractor.py
index 726d9aa3..c53ee967 100644
--- a/src/infrastructure/extractors/youtube_extractor.py
+++ b/src/infrastructure/extractors/youtube_extractor.py
@@ -239,9 +239,7 @@ def _validate_mp3_file(path: str) -> None:
         if len(header) >= 2 and header[0] == 0xFF and (header[1] & 0xE0) == 0xE0:
             return
 
-        raise ValueError(
-            f"Downloaded file is not a valid MP3 (header={header!r}): {path}"
-        )
+        raise ValueError(f"Downloaded file is not a valid MP3 (header={header!r}): {path}")
 
     def extract_playlist_videos(self, playlist_url: str) -> list[str]:
         """Extracts all video URLs from a YouTube playlist using yt_dlp."""
diff --git a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py
index 8caf11ec..e5045125 100644
--- a/src/infrastructure/repositories/sql/chunk_duplicate_repository.py
+++ b/src/infrastructure/repositories/sql/chunk_duplicate_repository.py
@@ -11,15 +11,12 @@
 
 logger = Logger()
 
+
 class ChunkDuplicateSQLRepository:
     """Repository for managing duplicate chunk records in SQL."""
 
     def create_duplicate_record(
-        self,
-        chunk_ids: List[UUID],
-        similarity: float,
-        status: str = "pending",
-        content_source_id: Optional[str] = None
+        self, chunk_ids: List[UUID], similarity: float, status: str = "pending", content_source_id: Optional[str] = None
     ) -> ChunkDuplicateModel:
         """Create a new duplicate grouping record."""
         with Connector() as session:
@@ -28,7 +25,7 @@ def create_duplicate_record(
                     chunk_ids=[str(cid) for cid in chunk_ids],
                     similarity=similarity,
                     status=status,
-                    content_source_id=content_source_id
+                    content_source_id=content_source_id,
                 )
                 session.add(record)
                 session.commit()
@@ -36,34 +33,26 @@ def create_duplicate_record(
                 return record
             except Exception as e:
                 session.rollback()
-                logger.error(
-                    "Error creating duplicate record",
-                    context={"error": str(e)}
-                )
+                logger.error("Error creating duplicate record", context={"error": str(e)})
                 raise
 
     def list_duplicates(
-        self,
-        status: Optional[str] = None,
-        subject_ids: Optional[List[str]] = None,
-        limit: int = 100,
-        offset: int = 0
+        self, status: Optional[str] = None, subject_ids: Optional[List[str]] = None, limit: int = 100, offset: int = 0
     ) -> tuple[List[ChunkDuplicateModel], int]:
         """List duplicate records with optional status and context filtering."""
         with Connector() as session:
             query = session.query(ChunkDuplicateModel)
-            
+
             if subject_ids:
                 # Convert string IDs to UUID objects for safe matching in SQL
                 parsed_ids = [UUID(sid) for sid in subject_ids]
                 query = query.join(
-                    ContentSourceModel,
-                    ContentSourceModel.id == ChunkDuplicateModel.content_source_id
+                    ContentSourceModel, ContentSourceModel.id == ChunkDuplicateModel.content_source_id
                 ).filter(ContentSourceModel.subject_id.in_(parsed_ids))
-            
+
             if status:
                 query = query.filter(ChunkDuplicateModel.status == status)
-            
+
             total = query.count()
             items = query.order_by(desc(ChunkDuplicateModel.created_at)).limit(limit).offset(offset).all()
             return items, total
@@ -88,8 +77,7 @@ def update_status(self, duplicate_id: Any, status: str) -> bool:
             except Exception as e:
                 session.rollback()
                 logger.error(
-                    "Error updating duplicate status",
-                    context={"duplicate_id": str(duplicate_id), "error": str(e)}
+                    "Error updating duplicate status", context={"duplicate_id": str(duplicate_id), "error": str(e)}
                 )
                 raise
 
@@ -107,7 +95,6 @@ def delete_record(self, duplicate_id: Any) -> bool:
             except Exception as e:
                 session.rollback()
                 logger.error(
-                    "Error deleting duplicate record",
-                    context={"duplicate_id": str(duplicate_id), "error": str(e)}
+                    "Error deleting duplicate record", context={"duplicate_id": str(duplicate_id), "error": str(e)}
                 )
                 raise
diff --git a/src/infrastructure/services/chunk_duplicate_service.py b/src/infrastructure/services/chunk_duplicate_service.py
index 4edd7b82..d75f6dfc 100644
--- a/src/infrastructure/services/chunk_duplicate_service.py
+++ b/src/infrastructure/services/chunk_duplicate_service.py
@@ -28,9 +28,7 @@ def __init__(
         self._chunk_repo = chunk_repo
         self._vector_service = vector_service
 
-    def find_and_register_duplicates(
-        self, chunk_ids: List[UUID], similarity_threshold: float = 0.90
-    ) -> int:
+    def find_and_register_duplicates(self, chunk_ids: List[UUID], similarity_threshold: float = 0.90) -> int:
         """
         Check a list of chunks for duplicates against the entire vector store.
         If duplicates are found with similarity >= threshold, register them.
@@ -56,19 +54,16 @@ def find_and_register_duplicates(
 
             if duplicates:
                 registered_count += self._register_cluster(
-                    source_id=cid, 
+                    source_id=cid,
                     source_content_source_id=str(chunk.content_source_id) if chunk.content_source_id else None,
-                    duplicates=duplicates, 
-                    processed_pairs=processed_pairs
+                    duplicates=duplicates,
+                    processed_pairs=processed_pairs,
                 )
 
         return registered_count
 
     def _filter_duplicates(
-        self,
-        source_id: UUID,
-        similar_chunks: List[Any],
-        threshold: float
+        self, source_id: UUID, similar_chunks: List[Any], threshold: float
     ) -> List[tuple[UUID, float]]:
         """Filter results to find valid duplicates above threshold."""
         duplicates = []
@@ -76,7 +71,7 @@ def _filter_duplicates(
         for sim_chunk in similar_chunks:
             if str(sim_chunk.id) == source_id_str:
                 continue
-            
+
             score = getattr(sim_chunk, "score", 0.0)
             if score >= threshold:
                 duplicates.append((sim_chunk.id, float(score)))
@@ -87,7 +82,7 @@ def _register_cluster(
         source_id: UUID,
         source_content_source_id: Optional[str],
         duplicates: List[tuple[UUID, float]],
-        processed_pairs: Set[tuple[str, ...]]
+        processed_pairs: Set[tuple[str, ...]],
     ) -> int:
         """Register a new duplicate group if not already processed."""
         duplicate_ids = [d[0] for d in duplicates]
@@ -100,26 +95,20 @@ def _register_cluster(
             # Get exact similarity for the highest match
             max_sim = max([float(d[1]) for d in duplicates] + [0.0])
             self._repo.create_duplicate_record(
-                chunk_ids=all_uuids,
-                similarity=max_sim,
-                status="pending",
-                content_source_id=source_content_source_id
+                chunk_ids=all_uuids, similarity=max_sim, status="pending", content_source_id=source_content_source_id
             )
             processed_pairs.add(cluster_key)
             return 1
         return 0
 
     def list_duplicates(
-        self,
-        status: Optional[str] = None,
-        subject_ids: Optional[List[str]] = None,
-        limit: int = 100,
-        offset: int = 0
+        self, status: Optional[str] = None, subject_ids: Optional[List[str]] = None, limit: int = 100, offset: int = 0
     ) -> tuple[List[ChunkDuplicateEntity], int]:
         """List mapped duplicate records."""
         models, total = self._repo.list_duplicates(status=status, subject_ids=subject_ids, limit=limit, offset=offset)
         entities = []
         from datetime import datetime
+
         for m in models:
             chunk_ids: List[UUID] = []
             if isinstance(m.chunk_ids, list):
@@ -128,20 +117,22 @@ def list_duplicates(
                         chunk_ids.append(UUID(cid))
                     elif isinstance(cid, UUID):
                         chunk_ids.append(cid)
-            
+
             # Ensure datetime types for Mypy
             created_at = m.created_at if isinstance(m.created_at, datetime) else datetime.now()
             updated_at = m.updated_at if isinstance(m.updated_at, datetime) else datetime.now()
 
-            entities.append(ChunkDuplicateEntity(
-                id=UUID(str(m.id)),
-                chunk_ids=chunk_ids,
-                similarity=float(m.similarity),
-                content_source_id=str(m.content_source_id) if m.content_source_id else None,
-                status=str(m.status),
-                created_at=created_at,
-                updated_at=updated_at,
-            ))
+            entities.append(
+                ChunkDuplicateEntity(
+                    id=UUID(str(m.id)),
+                    chunk_ids=chunk_ids,
+                    similarity=float(m.similarity),
+                    content_source_id=str(m.content_source_id) if m.content_source_id else None,
+                    status=str(m.status),
+                    created_at=created_at,
+                    updated_at=updated_at,
+                )
+            )
         return entities, total
 
     def resolve_duplicate(self, duplicate_id: UUID, status: str) -> bool:
diff --git a/src/presentation/__init__.py b/src/presentation/__init__.py
new file mode 100644
index 00000000..e16c76df
--- /dev/null
+++ b/src/presentation/__init__.py
@@ -0,0 +1 @@
+""
diff --git a/src/presentation/api/routes/duplicate_router.py b/src/presentation/api/routes/duplicate_router.py
index ee482c8b..c33dc188 100644
--- a/src/presentation/api/routes/duplicate_router.py
+++ b/src/presentation/api/routes/duplicate_router.py
@@ -32,7 +32,7 @@ def list_duplicates(
 ):
     """List all detected chunk duplicate groups."""
     entities, total = service.list_duplicates(status=status, subject_ids=subject_id, limit=limit, offset=offset)
-    
+
     # Enrich entities with chunk content if needed for UI
     results = []
     for entity in entities:
@@ -41,15 +41,17 @@ def list_duplicates(
         for cid in entity.chunk_ids:
             chunk = chunk_service.get_by_id(cid)
             if chunk:
-                chunks_info.append(ChunkMinimal(
-                    id=chunk.id,
-                    content=chunk.content or "",
-                    source_title=chunk.extra.get("source_title", "Unknown"),
-                    source_id=chunk.content_source_id
-                ))
+                chunks_info.append(
+                    ChunkMinimal(
+                        id=chunk.id,
+                        content=chunk.content or "",
+                        source_title=chunk.extra.get("source_title", "Unknown"),
+                        source_id=chunk.content_source_id,
+                    )
+                )
         resp.chunks = chunks_info
         results.append(resp)
-        
+
     return PaginatedChunkDuplicateResponse(results=results, total=total)
 
 
@@ -89,8 +91,8 @@ def analyze_all_chunks(
     """Run duplicate detection analysis on all existing chunks (heavy operation)."""
     # This should probably be a background task, but for now we'll do it synchronously
     # or just list everything and iterate
-    all_chunks = chunk_service.list_chunks(limit=1000) # Limit for safety
+    all_chunks = chunk_service.list_chunks(limit=1000)  # Limit for safety
     chunk_ids = [c.id for c in all_chunks]
-    
+
     count = service.find_and_register_duplicates(chunk_ids)
     return {"status": "success", "groups_found": count}
diff --git a/src/presentation/api/routes/ingest_router.py b/src/presentation/api/routes/ingest_router.py
index 1615f2ef..30d0a315 100644
--- a/src/presentation/api/routes/ingest_router.py
+++ b/src/presentation/api/routes/ingest_router.py
@@ -124,16 +124,23 @@ def ingest_youtube(
         logger.info("Running ingestion in background via queue", context={"reason": reason})
 
         # Ensure we have a job ID for background tasks so they are visible in UI
-        job_id = request.ingestion_job_id
+        job_id = str(request.ingestion_job_id) if request.ingestion_job_id else None
         if not job_id:
+            if not job_service:
+                logger.error("Job service dependency missing")
+                raise HTTPException(status_code=500, detail="Internal configuration error")
+
             job_title = request.title or request.video_url or f"YouTube {reason.capitalize()}"
+            from src.domain.entities.enums.ingestion_job_status_enum import IngestionJobStatus
+
+            s_uuid = UUID(request.subject_id) if request.subject_id else None
             job = job_service.create_job(
+                content_source_id=None,
+                status=IngestionJobStatus.STARTED,
                 source_title=job_title,
                 external_source=request.video_url or (request.video_urls[0] if request.video_urls else None),
-                subject_id=request.subject_id,
+                subject_id=s_uuid,
                 ingestion_type="YOUTUBE",
-                status="INITIALIZING",
-                status_message=f"Starting YouTube {reason} ingestion...",
             )
             job_id = str(job.id)
             cmd.ingestion_job_id = job_id
@@ -160,7 +167,15 @@ def ingest_youtube(
                 detail=result.reason or "This content has already been ingested.",
             )
 
-        return result
+        return IngestResponse(
+            skipped=result.skipped,
+            reason=result.reason,
+            source_id=result.source_id,
+            job_id=result.job_id,
+            created_chunks=result.created_chunks,
+            vector_ids=result.vector_ids,
+            video_results=result.video_results,
+        )
     except HTTPException:
         raise
     except ValueError as ve:
diff --git a/test_dispatcher.py b/test_dispatcher.py
index b1fee754..b954fe1c 100644
--- a/test_dispatcher.py
+++ b/test_dispatcher.py
@@ -6,13 +6,15 @@
 sys.path.append(os.getcwd())
 
 # Mock out dependencies before importing workers
-sys.modules['src.presentation.api.dependencies'] = MagicMock()
+sys.modules["src.presentation.api.dependencies"] = MagicMock()
 mock_job_service = MagicMock()
 
+
 class MockContext:
     def __init__(self, job_svc):
         self.job_service = job_svc
 
+
 import src.presentation.api.dependencies as deps  # noqa: E402
 
 deps.resolve_ingestion_context = MagicMock(return_value=MockContext(mock_job_service))
@@ -26,35 +28,34 @@ def test_dispatcher():
     # Setup mock app state
     mock_app = MagicMock()
     mock_app.state.task_queue = MagicMock()
-    
+
     # Patch _get_app to return our mock
     import src.application.workers as workers  # noqa: E402
+
     workers._get_app = MagicMock(return_value=mock_app)
-    
+
     # Create command for a small playlist (or the user's one)
     # Using a known small playlist to speed up test if possible
     # But user's URL is fine since we are testing logic
     url = "https://www.youtube.com/watch?v=dlQG02mwTD0&list=PLG47XsLEf0LdvYtX_zU7E_y_C1TgjgU59"
     cmd = IngestYoutubeCommand(
-        video_url=url,
-        data_type=YoutubeDataType.PLAYLIST,
-        ingestion_job_id="test-job-uuid",
-        subject_id="test-subject"
+        video_url=url, data_type=YoutubeDataType.PLAYLIST, ingestion_job_id="test-job-uuid", subject_id="test-subject"
     )
-    
+
     print(f"Testing dispatcher with URL: {url}")
     run_youtube_dispatcher_worker(cmd)
-    
+
     # Assertions
     print("\nVerifying Job Service calls:")
-    for call in mock_job_service.update_job_status.call_args_list:
+    for call in mock_job_service.update_job.call_args_list:
         print(f"  - Status update: {call[1]}")
-        
+
     print("\nVerifying Task Queue calls:")
     enqueue_calls = mock_app.state.task_queue.enqueue.call_args_list
     print(f"  - Tasks enqueued: {len(enqueue_calls)}")
     if len(enqueue_calls) > 0:
         print(f"  - First task URL: {enqueue_calls[0][0][1].video_url}")
 
+
 if __name__ == "__main__":
     test_dispatcher()
diff --git a/test_playlist.py b/test_playlist.py
index c9f17e06..c2873fc6 100644
--- a/test_playlist.py
+++ b/test_playlist.py
@@ -4,16 +4,16 @@
 def test_playlist():
     # Use the playlist provided by the user
     playlist_url = "https://www.youtube.com/watch?v=dlQG02mwTD0&list=PLG47XsLEf0LdvYtX_zU7E_y_C1TgjgU59"
-    
+
     print(f"Testing playlist extraction for: {playlist_url}")
     extractor = YoutubeExtractor(language="pt")
-    
+
     try:
         videos = extractor.extract_playlist_videos(playlist_url)
         print(f"Extracted {len(videos)} videos.")
         for i, url in enumerate(videos[:5]):
-            print(f"  {i+1}: {url}")
-            
+            print(f"  {i + 1}: {url}")
+
         if not videos:
             print("FAILED: No videos extracted.")
         else:
@@ -21,5 +21,6 @@ def test_playlist():
     except Exception as e:
         print(f"ERROR: {e}")
 
+
 if __name__ == "__main__":
     test_playlist()
diff --git a/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py b/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py
index cd88f6f6..47d4ea4b 100644
--- a/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py
+++ b/tests/infrastructure/repositories/sql/test_chunk_duplicate_repository.py
@@ -14,77 +14,76 @@ def test_create_duplicate_record(sqlite_memory):
     chunk_ids = [str(uuid.uuid4()), str(uuid.uuid4())]
     similarity = 0.95
     status = "pending"
-    
+
     record = repo.create_duplicate_record(chunk_ids, similarity, status)
-    
+
     assert record.id is not None
     assert record.chunk_ids == chunk_ids
     assert record.similarity == pytest.approx(similarity)
     assert record.status == status
 
+
 @pytest.mark.ChunkDuplicateSQLRepository
 def test_list_duplicates_filtering(sqlite_memory):
     """Test listing duplicates with status and subject filtering."""
     db = sqlite_memory
     repo = ChunkDuplicateSQLRepository()
-    
+
     # Create a subject and content source
     subject = KnowledgeSubjectModel(name="Test Subject")
     db.add(subject)
     db.commit()
     db.refresh(subject)
-    
-    source = ContentSourceModel(
-        subject_id=subject.id,
-        source_type="file",
-        external_source="test.txt"
-    )
+
+    source = ContentSourceModel(subject_id=subject.id, source_type="file", external_source="test.txt")
     db.add(source)
     db.commit()
     db.refresh(source)
-    
+
     # Create duplicate records
     repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending", content_source_id=source.id)
     repo.create_duplicate_record([str(uuid.uuid4())], 0.8, "reviewed", content_source_id=source.id)
-    
+
     # List all
     _, total = repo.list_duplicates()
     assert total == 2
-    
+
     # Filter by status
     pending_items, total = repo.list_duplicates(status="pending")
     assert total == 1
     assert pending_items[0].status == "pending"
-    
+
     # Filter by subject_id
     _, total = repo.list_duplicates(subject_ids=[str(subject.id)])
     assert total == 2
-    
+
     # Filter with non-existent subject
     _, total = repo.list_duplicates(subject_ids=[str(uuid.uuid4())])
     assert total == 0
 
+
 @pytest.mark.ChunkDuplicateSQLRepository
 def test_update_status(sqlite_memory):
     """Test updating the status of a duplicate record."""
     repo = ChunkDuplicateSQLRepository()
     record = repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending")
-    
+
     success = repo.update_status(record.id, "reviewed")
     assert success is True
-    
+
     updated = repo.get_by_id(record.id)
     assert updated is not None
     assert updated.status == "reviewed"
 
+
 @pytest.mark.ChunkDuplicateSQLRepository
 def test_delete_record(sqlite_memory):
     """Test deleting a duplicate record."""
     repo = ChunkDuplicateSQLRepository()
     record = repo.create_duplicate_record([str(uuid.uuid4())], 0.9, "pending")
-    
+
     success = repo.delete_record(record.id)
     assert success is True
-    
+
     deleted = repo.get_by_id(record.id)
     assert deleted is None
diff --git a/tests/infrastructure/services/test_chunk_duplicate_service.py b/tests/infrastructure/services/test_chunk_duplicate_service.py
index 0a58a3b6..ca7a4241 100644
--- a/tests/infrastructure/services/test_chunk_duplicate_service.py
+++ b/tests/infrastructure/services/test_chunk_duplicate_service.py
@@ -14,73 +14,73 @@ def mock_repos():
     vector_svc = MagicMock()
     return repo, chunk_repo, vector_svc
 
+
 def test_find_and_register_duplicates(mock_repos):
     """Test finding and registering duplicates."""
     repo, chunk_repo, vector_svc = mock_repos
     service = ChunkDuplicateService(repo, chunk_repo, vector_svc)
-    
+
     chunk_id = uuid.uuid4()
     mock_chunk = MagicMock()
     mock_chunk.id = chunk_id
     mock_chunk.content = "Duplicate test content"
     mock_chunk.content_source_id = str(uuid.uuid4())
     chunk_repo.get_by_id.return_value = mock_chunk
-    
+
     # Mock similar chunks found
     sim_chunk = MagicMock()
     sim_chunk.id = uuid.uuid4()
     sim_chunk.score = 0.95
     vector_svc.retrieve.return_value = [sim_chunk]
-    
+
     count = service.find_and_register_duplicates([chunk_id], similarity_threshold=0.90)
-    
+
     assert count == 1
     vector_svc.retrieve.assert_called_once_with(
-        query=mock_chunk.content,
-        top_k=5,
-        search_mode=SearchMode.SEMANTIC,
-        re_rank=False
+        query=mock_chunk.content, top_k=5, search_mode=SearchMode.SEMANTIC, re_rank=False
     )
     repo.create_duplicate_record.assert_called_once()
-    
+
     # Check arguments of create_duplicate_record
     _, kwargs = repo.create_duplicate_record.call_args
-    assert kwargs['similarity'] == pytest.approx(0.95)
-    assert str(chunk_id) in kwargs['chunk_ids']
-    assert str(sim_chunk.id) in kwargs['chunk_ids']
+    assert kwargs["similarity"] == pytest.approx(0.95)
+    assert chunk_id in kwargs["chunk_ids"]
+    assert sim_chunk.id in kwargs["chunk_ids"]
+
 
 def test_find_and_register_no_duplicates(mock_repos):
     """Test when no duplicates are found above threshold."""
     repo, chunk_repo, vector_svc = mock_repos
     service = ChunkDuplicateService(repo, chunk_repo, vector_svc)
-    
+
     chunk_id = uuid.uuid4()
     mock_chunk = MagicMock()
     mock_chunk.id = chunk_id
     mock_chunk.content = "Unique content"
     chunk_repo.get_by_id.return_value = mock_chunk
-    
+
     # Sim chunk with low score
     sim_chunk = MagicMock()
     sim_chunk.id = uuid.uuid4()
     sim_chunk.score = 0.5
     vector_svc.retrieve.return_value = [sim_chunk]
-    
+
     count = service.find_and_register_duplicates([chunk_id], similarity_threshold=0.90)
-    
+
     assert count == 0
     repo.create_duplicate_record.assert_not_called()
 
+
 def test_deactivate_chunk(mock_repos):
     """Test deactivating a chunk."""
     repo, chunk_repo, vector_svc = mock_repos
     service = ChunkDuplicateService(repo, chunk_repo, vector_svc)
-    
+
     chunk_id = uuid.uuid4()
     chunk_repo.update_is_active.return_value = True
-    
+
     success = service.deactivate_chunk(chunk_id)
-    
+
     assert success is True
     chunk_repo.update_is_active.assert_called_once_with(chunk_id, False)
     vector_svc.delete_by_id.assert_called_once_with(chunk_id)
diff --git a/tests/presentation/api/routes/test_duplicate_router.py b/tests/presentation/api/routes/test_duplicate_router.py
index ac2c4bb0..5bfef896 100644
--- a/tests/presentation/api/routes/test_duplicate_router.py
+++ b/tests/presentation/api/routes/test_duplicate_router.py
@@ -13,49 +13,50 @@
 
 client = TestClient(app)
 
+
 @pytest.mark.DuplicateRouter
 class TestDuplicateRouter:
     def test_list_duplicates(self):
         mock_repo = MagicMock()
         app.dependency_overrides[get_duplicate_repo] = lambda: mock_repo
-        
+
         mock_repo.list_duplicates.return_value = ([], 0)
-        
+
         response = client.get("/rest/duplicates")
         assert response.status_code == 200
         assert response.json()["total"] == 0
-        
+
         app.dependency_overrides.clear()
 
     def test_update_duplicate_status(self):
         mock_repo = MagicMock()
         app.dependency_overrides[get_duplicate_repo] = lambda: mock_repo
-        
+
         # Use a service mock instead because the router calls resolved_duplicate on service
         # Wait, the router calls service.resolve_duplicate
         mock_service = MagicMock()
         app.dependency_overrides[get_duplicate_service] = lambda: mock_service
-        
+
         duplicate_id = str(uuid.uuid4())
         mock_service.resolve_duplicate.return_value = True
-        
+
         response = client.patch(f"/rest/duplicates/{duplicate_id}/status", json={"status": "reviewed"})
         assert response.status_code == 200
         assert response.json()["status"] == "success"
-        
+
         app.dependency_overrides.clear()
 
     def test_deactivate_chunk(self):
         mock_service = MagicMock()
         app.dependency_overrides[get_duplicate_service] = lambda: mock_service
-        
+
         chunk_id = str(uuid.uuid4())
         mock_service.deactivate_chunk.return_value = True
-        
+
         response = client.post(f"/rest/duplicates/chunks/{chunk_id}/deactivate")
         assert response.status_code == 200
         assert response.json()["status"] == "success"
-        
+
         app.dependency_overrides.clear()
 
     def test_trigger_duplicate_analysis(self):
@@ -63,12 +64,12 @@ def test_trigger_duplicate_analysis(self):
         app.dependency_overrides[get_duplicate_service] = lambda: mock_service
         mock_chunk_service = MagicMock()
         app.dependency_overrides[get_chunk_index_service] = lambda: mock_chunk_service
-        
+
         mock_chunk_service.list_chunks.return_value = []
         mock_service.find_and_register_duplicates.return_value = 0
-        
+
         response = client.post("/rest/duplicates/analyze-all")
         assert response.status_code == 200
         assert response.json()["status"] == "success"
-        
+
         app.dependency_overrides.clear()
diff --git a/tmp/cleanup_db.py b/tmp/cleanup_db.py
index 09417232..8e47d7d2 100644
--- a/tmp/cleanup_db.py
+++ b/tmp/cleanup_db.py
@@ -15,7 +15,7 @@
         # Check if is_active exists in chunk_index
         res = conn.execute(text("PRAGMA table_info(chunk_index)"))
         columns = [row[1] for row in res]
-        if 'is_active' in columns:
+        if "is_active" in columns:
             print("is_active already exists in chunk_index. Attempting to drop it (batch mode needed for SQLite).")
             # For simplicity in this scratch script, I'll just note it.
             # Usually we group these with migrations.

From 39619047c44f9df66d5992b6c5a0b2fdc61394f0 Mon Sep 17 00:00:00 2001
From: ericksonlopes <ofc.erickson@gmail.com>
Date: Wed, 8 Apr 2026 14:30:40 -0300
Subject: [PATCH 6/7] fix: update sql connector imports and fix failing tests

---
 src/application/workers.py                          | 10 +++++-----
 tests/application/test_audio_diarization_workers.py |  4 ++--
 tests/application/test_workers.py                   |  6 +++---
 tests/conftest.py                                   |  1 +
 .../services/test_youtube_audio_downloader.py       |  3 ++-
 tests/presentation/api/routes/test_ingest_router.py | 13 ++++++++++---
 6 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/application/workers.py b/src/application/workers.py
index 67e9ad01..6f15f50c 100644
--- a/src/application/workers.py
+++ b/src/application/workers.py
@@ -314,7 +314,7 @@ def run_diarization_ingestion_worker(cmd: IngestDiarizationCommand):
         vector_svc = ChunkVectorService(vector_repo, rerank_service=rerank_svc)
 
         # DiarizationRepository needs a DB session
-        from infrastructure.connectors.connector_sql import Session as DBSession
+        from src.infrastructure.connectors.connector_sql import Session as DBSession
 
         db = DBSession()
         try:
@@ -418,7 +418,7 @@ async def _run():
 
 def _audio_diarization_subprocess(cmd_dict: dict):
     """Run audio diarization in a separate process to avoid torch/CUDA thread deadlocks."""
-    from infrastructure.connectors.connector_sql import (
+    from src.infrastructure.connectors.connector_sql import (
         Session as DBSessionFactory,
     )
     from src.application.use_cases.process_audio_diarization_pipeline import (
@@ -494,7 +494,7 @@ def run_audio_diarization_dispatcher_worker(cmd: ProcessAudioCommand):
         return
 
     try:
-        from infrastructure.connectors.connector_sql import (
+        from src.infrastructure.connectors.connector_sql import (
             Session as DBSessionFactory,
         )
         from src.infrastructure.extractors.youtube_extractor import YoutubeExtractor
@@ -610,7 +610,7 @@ def run_audio_diarization_worker(cmd: ProcessAudioCommand):
         if process.exitcode != 0:
             logger.error("Audio diarization subprocess exited with code %d", process.exitcode)
             if cmd.diarization_id:
-                from infrastructure.connectors.connector_sql import (
+                from src.infrastructure.connectors.connector_sql import (
                     Session as DBSessionFactory,
                 )
                 from src.infrastructure.repositories.sql.diarization_repository import (
@@ -668,7 +668,7 @@ def run_voice_training_worker(cmd: TrainVoiceCommand):
         return
 
     try:
-        from infrastructure.connectors.connector_sql import Session as DBSession
+        from src.infrastructure.connectors.connector_sql import Session as DBSession
         from src.application.use_cases.manage_voice_profiles import (
             TrainVoiceProfileFromSpeakerSegmentUseCase,
         )
diff --git a/tests/application/test_audio_diarization_workers.py b/tests/application/test_audio_diarization_workers.py
index 1509237c..9b493d4f 100644
--- a/tests/application/test_audio_diarization_workers.py
+++ b/tests/application/test_audio_diarization_workers.py
@@ -40,7 +40,7 @@ def test_run_audio_diarization_dispatcher_worker_deduplication(self, mock_app, m
             patch("src.application.workers.registry.get", return_value=mock_app),
             patch("src.infrastructure.extractors.youtube_extractor.YoutubeExtractor") as mock_extractor_cls,
             patch(
-                "src.infrastructure.repositories.sql.connector.Session",
+                "src.infrastructure.connectors.connector_sql.Session",
                 return_value=mock_db_session,
             ),
             patch("src.infrastructure.repositories.sql.diarization_repository.DiarizationRepository") as mock_repo_cls,
@@ -92,7 +92,7 @@ def test_run_audio_diarization_dispatcher_worker_retry_failed(self, mock_app, mo
             patch("src.application.workers.registry.get", return_value=mock_app),
             patch("src.infrastructure.extractors.youtube_extractor.YoutubeExtractor") as mock_extractor_cls,
             patch(
-                "src.infrastructure.repositories.sql.connector.Session",
+                "src.infrastructure.connectors.connector_sql.Session",
                 return_value=mock_db_session,
             ),
             patch("src.infrastructure.repositories.sql.diarization_repository.DiarizationRepository") as mock_repo_cls,
diff --git a/tests/application/test_workers.py b/tests/application/test_workers.py
index 25b4ce5d..c3aa20dc 100644
--- a/tests/application/test_workers.py
+++ b/tests/application/test_workers.py
@@ -171,7 +171,7 @@ def test_run_diarization_ingestion_worker_success(self):
             patch("src.presentation.api.dependencies.resolve_vector_repository"),
             patch("src.presentation.api.dependencies.resolve_rerank_service"),
             patch("src.infrastructure.services.chunk_vector_service.ChunkVectorService"),
-            patch("src.infrastructure.repositories.sql.connector.Session") as mock_session_cls,
+            patch("src.infrastructure.connectors.connector_sql.Session") as mock_session_cls,
             patch("src.infrastructure.repositories.sql.diarization_repository.DiarizationRepository"),
             patch(
                 "src.application.use_cases.diarization_ingestion_use_case.DiarizationIngestionUseCase"
@@ -211,7 +211,7 @@ def test_audio_diarization_subprocess_success(self):
         from src.application.workers import _audio_diarization_subprocess
 
         with (
-            patch("src.infrastructure.repositories.sql.connector.Session") as mock_session_cls,
+            patch("src.infrastructure.connectors.connector_sql.Session") as mock_session_cls,
             patch("src.infrastructure.services.redis_event_bus.RedisEventBus"),
             patch(
                 "src.application.use_cases.process_audio_diarization_pipeline.ProcessAudioDiarizationPipelineUseCase"
@@ -266,7 +266,7 @@ def test_run_audio_diarization_worker_failure(self):
 
         with (
             patch("multiprocessing.get_context") as mock_get_ctx,
-            patch("src.infrastructure.repositories.sql.connector.Session") as mock_session_factory,
+            patch("src.infrastructure.connectors.connector_sql.Session") as mock_session_factory,
             patch("src.infrastructure.repositories.sql.diarization_repository.DiarizationRepository") as mock_repo_cls,
             patch("src.infrastructure.services.redis_event_bus.RedisEventBus"),
         ):
diff --git a/tests/conftest.py b/tests/conftest.py
index 39214ce2..95eae632 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,6 +9,7 @@
 from sqlalchemy.orm import sessionmaker
 
 import src.infrastructure.connectors.connector_sql as connector
+import src.infrastructure.repositories.sql.models  # noqa: F401
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/infrastructure/services/test_youtube_audio_downloader.py b/tests/infrastructure/services/test_youtube_audio_downloader.py
index 899b5a96..771263eb 100644
--- a/tests/infrastructure/services/test_youtube_audio_downloader.py
+++ b/tests/infrastructure/services/test_youtube_audio_downloader.py
@@ -7,9 +7,10 @@
 
 @pytest.mark.Downloader
 class TestYoutubeExtractorDownload:
+    @patch("src.infrastructure.extractors.youtube_extractor.YoutubeExtractor._validate_mp3_file")
     @patch("src.infrastructure.extractors.youtube_extractor.YoutubeDL")
     @patch("os.makedirs")
-    def test_download_success(self, mock_makedirs, mock_ytdl):
+    def test_download_success(self, mock_makedirs, mock_ytdl, mock_validate):
         # Mocking yt_dlp to return a fake filename
         mock_instance = mock_ytdl.return_value.__enter__.return_value
         mock_instance.extract_info.return_value = {"title": "Test Audio", "ext": "webm"}
diff --git a/tests/presentation/api/routes/test_ingest_router.py b/tests/presentation/api/routes/test_ingest_router.py
index 0ecd6f71..f7991409 100644
--- a/tests/presentation/api/routes/test_ingest_router.py
+++ b/tests/presentation/api/routes/test_ingest_router.py
@@ -76,11 +76,18 @@ def test_ingest_youtube_exception(mock_use_case):
 def test_ingest_youtube_reprocess():
     # Use real dependency override for task_queue if needed,
     # but mock_app_state fixture already sets app.state.task_queue
-    from src.presentation.api.dependencies import get_task_queue_service
-
+    from src.presentation.api.dependencies import get_task_queue_service, get_job_service
+    
     mock_queue = MagicMock()
     app.dependency_overrides[get_task_queue_service] = lambda: mock_queue
 
+    mock_job_service = MagicMock()
+    # Mocking the created job to have a valid UUID id
+    mock_job = MagicMock()
+    mock_job.id = "123e4567-e89b-12d3-a456-426614174000"
+    mock_job_service.create_job.return_value = mock_job
+    app.dependency_overrides[get_job_service] = lambda: mock_job_service
+
     try:
         response = client.post(
             "/rest/ingest/youtube",
@@ -89,7 +96,7 @@ def test_ingest_youtube_reprocess():
 
         assert response.status_code == 200
         assert response.json()["skipped"] is False
-        assert response.json()["reason"] == "Ingestion started in background queue."
+        assert response.json()["reason"] == "Ingestion started in background queue (Job: 123e4567-e89b-12d3-a456-426614174000)."
         assert mock_queue.enqueue.called
     finally:
         app.dependency_overrides.pop(get_task_queue_service, None)

From f6a50a97c09742a0b796cb1e281006ca49d96a24 Mon Sep 17 00:00:00 2001
From: ericksonlopes <ofc.erickson@gmail.com>
Date: Wed, 8 Apr 2026 14:41:17 -0300
Subject: [PATCH 7/7] style: fix linting issues (import sorting and line
 length)

---
 src/application/workers.py                          | 8 ++++----
 tests/presentation/api/routes/test_ingest_router.py | 5 +++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/application/workers.py b/src/application/workers.py
index 6f15f50c..81c341c6 100644
--- a/src/application/workers.py
+++ b/src/application/workers.py
@@ -418,12 +418,12 @@ async def _run():
 
 def _audio_diarization_subprocess(cmd_dict: dict):
     """Run audio diarization in a separate process to avoid torch/CUDA thread deadlocks."""
-    from src.infrastructure.connectors.connector_sql import (
-        Session as DBSessionFactory,
-    )
     from src.application.use_cases.process_audio_diarization_pipeline import (
         ProcessAudioDiarizationPipelineUseCase,
     )
+    from src.infrastructure.connectors.connector_sql import (
+        Session as DBSessionFactory,
+    )
     from src.infrastructure.repositories.sql.content_source_repository import (
         ContentSourceSQLRepository,
     )
@@ -668,10 +668,10 @@ def run_voice_training_worker(cmd: TrainVoiceCommand):
         return
 
     try:
-        from src.infrastructure.connectors.connector_sql import Session as DBSession
         from src.application.use_cases.manage_voice_profiles import (
             TrainVoiceProfileFromSpeakerSegmentUseCase,
         )
+        from src.infrastructure.connectors.connector_sql import Session as DBSession
         from src.presentation.api.dependencies import resolve_ingestion_context
 
         ctx = resolve_ingestion_context(app)
diff --git a/tests/presentation/api/routes/test_ingest_router.py b/tests/presentation/api/routes/test_ingest_router.py
index f7991409..997f554f 100644
--- a/tests/presentation/api/routes/test_ingest_router.py
+++ b/tests/presentation/api/routes/test_ingest_router.py
@@ -76,7 +76,7 @@ def test_ingest_youtube_exception(mock_use_case):
 def test_ingest_youtube_reprocess():
     # Use real dependency override for task_queue if needed,
     # but mock_app_state fixture already sets app.state.task_queue
-    from src.presentation.api.dependencies import get_task_queue_service, get_job_service
+    from src.presentation.api.dependencies import get_job_service, get_task_queue_service
     
     mock_queue = MagicMock()
     app.dependency_overrides[get_task_queue_service] = lambda: mock_queue
@@ -96,7 +96,8 @@ def test_ingest_youtube_reprocess():
 
         assert response.status_code == 200
         assert response.json()["skipped"] is False
-        assert response.json()["reason"] == "Ingestion started in background queue (Job: 123e4567-e89b-12d3-a456-426614174000)."
+        expected_reason = "Ingestion started in background queue (Job: 123e4567-e89b-12d3-a456-426614174000)."
+        assert response.json()["reason"] == expected_reason
         assert mock_queue.enqueue.called
     finally:
         app.dependency_overrides.pop(get_task_queue_service, None)