Skip to content

Commit 49a0c20

Browse files
committed
⚡️(summarize) use semchunk for better doc chunking
This reduces the code complexity while allowing better "cuts" and providing overlap for free. Also, do not wait for sub-batches to complete; use global concurrency instead.
1 parent ec9af9a commit 49a0c20

File tree

7 files changed

+166
-99
lines changed

7 files changed

+166
-99
lines changed

src/backend/chat/agents/summarize.py

Lines changed: 91 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
"""Build the summarization agent."""
22

3+
import asyncio
34
import dataclasses
45
import logging
5-
import asyncio
66

77
from django.conf import settings
88
from django.core.files.storage import default_storage
99

10+
import semchunk
1011
from asgiref.sync import sync_to_async
1112
from pydantic_ai import RunContext
1213
from pydantic_ai.messages import ToolReturn
1314

1415
from .base import BaseAgent
15-
from ..tools.document_search_rag import add_document_rag_search_tool
1616

1717
logger = logging.getLogger(__name__)
1818

@@ -37,28 +37,49 @@ def read_document_content(doc):
3737
return doc.file_name, f.read().decode("utf-8")
3838

3939

40-
async def hand_off_to_summarization_agent(
40+
async def summarize_chunk(idx, chunk, total_chunks, summarization_agent, ctx):
41+
"""Summarize a single chunk of text."""
42+
sum_prompt = (
43+
"You are an agent specializing in text summarization. "
44+
"Generate a clear and concise summary of the following passage "
45+
f"(part {idx}/{total_chunks}):\n'''\n{chunk}\n'''\n\n"
46+
)
47+
48+
logger.debug(
49+
"[summarize] CHUNK %s/%s prompt=> %s", idx, total_chunks, sum_prompt[0:100] + "..."
50+
)
51+
52+
resp = await summarization_agent.run(sum_prompt, usage=ctx.usage)
53+
54+
logger.debug("[summarize] CHUNK %s/%s response<= %s", idx, total_chunks, resp.output or "")
55+
return resp.output or ""
56+
57+
58+
async def hand_off_to_summarization_agent( # pylint: disable=too-many-locals
4159
ctx: RunContext, *, instructions: str | None = None
4260
) -> ToolReturn:
4361
"""
44-
Summarize the documents for the user, only when asked for.
62+
Generate a complete, ready-to-use summary of the documents in context
63+
(do not request the documents to the user).
64+
Return this summary directly to the user WITHOUT any modification,
65+
or additional summarization.
66+
The summary is already optimized and MUST be presented as-is in the final response
67+
or translated preserving the information.
68+
4569
Instructions are optional but should reflect the user's request.
46-
Examples :
47-
"Résume ce doc en 2 paragraphes" -> instructions = "résumé en 2 paragraphes"
48-
"Résume ce doc en anglais" -> instructions = "In English"
49-
"Résume ce doc" -> instructions = "" (default)
70+
71+
Examples:
72+
"Summarize this doc in 2 paragraphs" -> instructions = "summary in 2 paragraphs"
73+
"Summarize this doc in English" -> instructions = "In English"
74+
"Summarize this doc" -> instructions = "" (default)
75+
5076
Args:
5177
instructions (str | None): The instructions the user gave to use for the summarization
5278
"""
53-
summarization_agent = SummarizationAgent()
54-
55-
prompt = (
56-
"Do not mention the user request in your answer.\n"
57-
"User request:\n"
58-
"{user_prompt}\n\n"
59-
"Document contents:\n"
60-
"{documents_prompt}\n"
79+
instructions_hint = (
80+
instructions.strip() if instructions else "The summary should contain 2 or 3 parts."
6181
)
82+
summarization_agent = SummarizationAgent()
6283

6384
# Collect documents content
6485
text_attachment = await sync_to_async(list)(
@@ -69,70 +90,70 @@ async def hand_off_to_summarization_agent(
6990

7091
documents = [await read_document_content(doc) for doc in text_attachment]
7192

72-
# Instructions: rely on tool argument only; model should extract them upstream
73-
if instructions is not None:
74-
instructions_hint: str = instructions.strip()
75-
else:
76-
instructions_hint = ""
77-
78-
# Helpers
79-
def chunk_text(text: str, size: int = 10000) -> list[str]:
80-
if size <= 0:
81-
return [text]
82-
return [text[i : i + size] for i in range(0, len(text), size)]
83-
84-
# 2) Chunk documents and summarize each chunk
85-
full_text = "\n\n".join(doc[1] for doc in documents)
86-
chunks = chunk_text(full_text, size=10000)
93+
# Chunk documents and summarize each chunk
94+
chunk_size = settings.SUMMARIZATION_CHUNK_SIZE
95+
chunker = semchunk.chunkerify(
96+
tokenizer_or_token_counter=lambda text: len(text.split()),
97+
chunk_size=chunk_size,
98+
)
99+
documents_chunks = chunker(
100+
[doc[1] for doc in documents],
101+
overlap=settings.SUMMARIZATION_OVERLAP_SIZE,
102+
)
103+
87104
logger.info(
88105
"[summarize] chunking: %s parts (size~%s), instructions='%s'",
89-
len(chunks),
90-
10000,
91-
instructions_hint or "",
106+
sum(len(chunks) for chunks in documents_chunks),
107+
chunk_size,
108+
instructions_hint,
92109
)
93110

94-
async def summarize_chunk(idx, chunk, total_chunks, summarization_agent, ctx):
95-
sum_prompt = (
96-
"Tu es un agent spécialisé en synthèses de textes. "
97-
"Génère un résumé clair et concis du passage suivant (partie {idx}/{total}) :\n"
98-
"'''\n{context}\n'''\n\n"
99-
).format(context=chunk, idx=idx, total=total_chunks)
100-
logger.info("[summarize] CHUNK %s/%s prompt=> %s", idx, total_chunks, sum_prompt[0:100]+'...')
101-
resp = await summarization_agent.run(sum_prompt, usage=ctx.usage)
102-
logger.info("[summarize] CHUNK %s/%s response<= %s", idx, total_chunks, resp.output or "")
103-
return resp.output or ""
104-
105-
# Parallelize the chunk summarization in batches of 5 using asyncio.gather
106-
chunk_summaries: list[str] = []
107-
batch_size = 5
108-
for start_idx in range(0, len(chunks), batch_size):
109-
end_idx = start_idx + batch_size
110-
batch_chunks = chunks[start_idx:end_idx]
111+
# Parallelize the chunk summarization with a semaphore to limit concurrent tasks
112+
# because it can be very resource intensive on the LLM backend
113+
semaphore = asyncio.Semaphore(settings.SUMMARIZATION_CONCURRENT_REQUESTS)
114+
115+
async def summarize_chunk_with_semaphore(idx, chunk, total_chunks):
116+
"""Summarize a chunk with semaphore-controlled concurrency."""
117+
async with semaphore:
118+
return await summarize_chunk(idx, chunk, total_chunks, summarization_agent, ctx)
119+
120+
doc_chunk_summaries = []
121+
for doc_chunks in documents_chunks:
111122
summarization_tasks = [
112-
summarize_chunk(idx, chunk, len(chunks), summarization_agent, ctx)
113-
for idx, chunk in enumerate(batch_chunks, start=start_idx + 1)
123+
summarize_chunk_with_semaphore(idx, chunk, len(doc_chunks))
124+
for idx, chunk in enumerate(doc_chunks, start=1)
114125
]
115-
batch_results = await asyncio.gather(*summarization_tasks)
116-
chunk_summaries.extend(batch_results)
117-
118-
if not instructions_hint:
119-
instructions_hint = "Le résumé doit être en Français, contenir 2 ou 3 parties."
126+
chunk_summaries = await asyncio.gather(*summarization_tasks)
127+
doc_chunk_summaries.append(chunk_summaries)
128+
129+
context = "\n\n".join(
130+
doc_name + "\n\n" + "\n\n".join(summaries)
131+
for doc_name, summaries in zip(
132+
(doc[0] for doc in documents),
133+
doc_chunk_summaries,
134+
strict=True,
135+
)
136+
)
120137

121-
# 3) Merge chunk summaries into a single concise summary
138+
# Merge chunk summaries into a single concise summary
122139
merged_prompt = (
123-
"Produit une synthèse cohérente à partir des résumés ci-dessous.\n\n"
124-
"'''\n{context}\n'''\n\n"
125-
"Contraintes :\n"
126-
"- Résumer sans répéter.\n"
127-
"- Harmoniser le style et la terminologie.\n"
128-
"- Le résumé final doit être bien structuré et formaté en markdown. \n"
129-
"- Respecter les consignes : {instructions}\n"
130-
"Réponds directement avec le résumé final."
131-
).format(context="\n\n".join(chunk_summaries), instructions=instructions_hint or "")
132-
logger.info("[summarize] MERGE prompt=> %s", merged_prompt)
140+
"Produce a coherent synthesis from the summaries below.\n\n"
141+
f"'''\n{context}\n'''\n\n"
142+
"Constraints:\n"
143+
"- Summarize without repetition.\n"
144+
"- Harmonize style and terminology.\n"
145+
"- The final summary must be well-structured and formatted in markdown.\n"
146+
f"- Follow the instructions: {instructions_hint}\n"
147+
"Respond directly with the final summary."
148+
)
149+
150+
logger.debug("[summarize] MERGE prompt=> %s", merged_prompt)
151+
133152
merged_resp = await summarization_agent.run(merged_prompt, usage=ctx.usage)
153+
134154
final_summary = (merged_resp.output or "").strip()
135-
logger.info("[summarize] MERGE response<= %s", final_summary)
155+
156+
logger.debug("[summarize] MERGE response<= %s", final_summary)
136157

137158
return ToolReturn(
138159
return_value=final_summary,

src/backend/chat/clients/pydantic_ai.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -483,11 +483,14 @@ def force_web_search_prompt() -> str:
483483
@self.conversation_agent.system_prompt
484484
def summarization_system_prompt() -> str:
485485
return (
486-
"When the user asks to summarize attached document(s), you MUST call the"
487-
" summarize tool. Pass user's instructions if provided, otherwise pass an"
488-
" empty instructions string once the user confirms (e.g. says 'ok'). Do NOT"
489-
" call web search or document_search_rag to produce summaries; rely only on"
490-
" the attached documents stored in context."
486+
"When you receive a result from the summarization tool, you MUST return it "
487+
"directly to the user without any modification, paraphrasing, or additional "
488+
"summarization."
489+
"The tool already produces optimized summaries that should be presented "
490+
"verbatim."
491+
"You may translate the summary if required, but you MUST preserve all the "
492+
"information from the original summary."
493+
"You may add a follow-up question after the summary if needed."
491494
)
492495

493496
# Inform the model (system-level) that documents are attached and available

src/backend/chat/tests/views/chat/conversations/test_conversation_with_document_upload.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -374,11 +374,10 @@ async def agent_model(messages: list[ModelMessage], _info: AgentInfo):
374374
"timestamp": timezone_now,
375375
},
376376
{
377-
"content": "If the user wants specific information from a "
378-
"document, invoke web_search_albert_rag with an "
379-
"appropriate query string.Do not ask the user for the "
380-
"document; rely on the tool to locate and return "
381-
"relevant passages.",
377+
"content": "Use document_search_rag ONLY to retrieve specific "
378+
"passages from attached documents. Do NOT use it to "
379+
"summarize; for summaries, call the summarize tool "
380+
"instead.",
382381
"dynamic_ref": None,
383382
"part_kind": "system-prompt",
384383
"timestamp": timezone_now,
@@ -397,6 +396,15 @@ async def agent_model(messages: list[ModelMessage], _info: AgentInfo):
397396
"part_kind": "system-prompt",
398397
"timestamp": timezone_now,
399398
},
399+
{
400+
"content": "[Internal context] User documents are attached to this "
401+
"conversation. Do not request re-upload of documents; "
402+
"consider them already available via the internal "
403+
"store.",
404+
"dynamic_ref": None,
405+
"part_kind": "system-prompt",
406+
"timestamp": timezone_now,
407+
},
400408
{
401409
"content": ["What does the document say?"],
402410
"part_kind": "user-prompt",
@@ -627,7 +635,7 @@ async def agent_model(messages: list[ModelMessage], _info: AgentInfo):
627635
'document discusses various topics."}\n'
628636
'0:"The document discusses various topics."\n'
629637
'f:{"messageId":"<mocked_uuid>"}\n'
630-
'd:{"finishReason":"stop","usage":{"promptTokens":201,"completionTokens":13}}\n'
638+
'd:{"finishReason":"stop","usage":{"promptTokens":317,"completionTokens":19}}\n'
631639
)
632640

633641
# Check that the conversation was updated
@@ -709,11 +717,10 @@ async def agent_model(messages: list[ModelMessage], _info: AgentInfo):
709717
"timestamp": timezone_now,
710718
},
711719
{
712-
"content": "If the user wants specific information from a "
713-
"document, invoke web_search_albert_rag with an "
714-
"appropriate query string.Do not ask the user for the "
715-
"document; rely on the tool to locate and return "
716-
"relevant passages.",
720+
"content": "Use document_search_rag ONLY to retrieve specific "
721+
"passages from attached documents. Do NOT use it to "
722+
"summarize; for summaries, call the summarize tool "
723+
"instead.",
717724
"dynamic_ref": None,
718725
"part_kind": "system-prompt",
719726
"timestamp": timezone_now,
@@ -732,6 +739,15 @@ async def agent_model(messages: list[ModelMessage], _info: AgentInfo):
732739
"part_kind": "system-prompt",
733740
"timestamp": timezone_now,
734741
},
742+
{
743+
"content": "[Internal context] User documents are attached to this "
744+
"conversation. Do not request re-upload of documents; "
745+
"consider them already available via the internal "
746+
"store.",
747+
"dynamic_ref": None,
748+
"part_kind": "system-prompt",
749+
"timestamp": timezone_now,
750+
},
735751
{
736752
"content": ["Make a summary of this document."],
737753
"part_kind": "user-prompt",

src/backend/chat/tests/views/chat/conversations/test_conversation_with_document_url.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -881,10 +881,9 @@ async def agent_model(messages: list[ModelMessage], _info: AgentInfo):
881881
SystemPromptPart(content="Answer in english.", timestamp=timezone.now()),
882882
SystemPromptPart(
883883
content=(
884-
"If the user wants specific information from a document, "
885-
"invoke web_search_albert_rag with an appropriate query string."
886-
"Do not ask the user for the document; rely on the tool to locate "
887-
"and return relevant passages."
884+
"Use document_search_rag ONLY to retrieve specific passages from "
885+
"attached documents. Do NOT use it to summarize; for summaries, "
886+
"call the summarize tool instead."
888887
),
889888
timestamp=timezone.now(),
890889
),
@@ -901,6 +900,14 @@ async def agent_model(messages: list[ModelMessage], _info: AgentInfo):
901900
),
902901
timestamp=timezone.now(),
903902
),
903+
SystemPromptPart(
904+
content=(
905+
"[Internal context] User documents are attached to this conversation. "
906+
"Do not request re-upload of documents; consider them already "
907+
"available via the internal store."
908+
),
909+
timestamp=timezone.now(),
910+
),
904911
UserPromptPart(
905912
content=[
906913
"What is in this document?",
@@ -1002,11 +1009,10 @@ async def agent_model(messages: list[ModelMessage], _info: AgentInfo):
10021009
"timestamp": timestamp,
10031010
},
10041011
{
1005-
"content": "If the user wants specific information from a "
1006-
"document, invoke web_search_albert_rag with an "
1007-
"appropriate query string.Do not ask the user for the "
1008-
"document; rely on the tool to locate and return "
1009-
"relevant passages.",
1012+
"content": "Use document_search_rag ONLY to retrieve specific "
1013+
"passages from attached documents. Do NOT use it to "
1014+
"summarize; for summaries, call the summarize tool "
1015+
"instead.",
10101016
"dynamic_ref": None,
10111017
"part_kind": "system-prompt",
10121018
"timestamp": timestamp,
@@ -1025,6 +1031,15 @@ async def agent_model(messages: list[ModelMessage], _info: AgentInfo):
10251031
"part_kind": "system-prompt",
10261032
"timestamp": timestamp,
10271033
},
1034+
{
1035+
"content": "[Internal context] User documents are attached to "
1036+
"this conversation. Do not request re-upload of "
1037+
"documents; consider them already available via the "
1038+
"internal store.",
1039+
"dynamic_ref": None,
1040+
"part_kind": "system-prompt",
1041+
"timestamp": timestamp,
1042+
},
10281043
{
10291044
"content": [
10301045
"What is in this document?",

src/backend/chat/tools/document_search_rag.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,6 @@ def document_search_rag(ctx: RunContext, query: str) -> ToolReturn:
2222
ctx (RunContext): The run context containing the conversation.
2323
query (str): The query to search the documents for.
2424
"""
25-
# Defensive: ctx.deps or ctx.deps.conversation may be unavailable in some flows (start of conversation)
26-
if not getattr(ctx, "deps", None) or not getattr(ctx.deps, "conversation", None):
27-
return ToolReturn(return_value=[], content="", metadata={"sources": set()})
28-
2925
document_store_backend = import_string(settings.RAG_DOCUMENT_SEARCH_BACKEND)
3026

3127
document_store = document_store_backend(ctx.deps.conversation.collection_id)

src/backend/conversations/settings.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,21 @@ class Base(BraveSettings, Configuration):
786786
environ_name="SUMMARIZATION_SYSTEM_PROMPT",
787787
environ_prefix=None,
788788
)
789+
SUMMARIZATION_CHUNK_SIZE = values.PositiveIntegerValue(
790+
default=20_000, # Approx 20k words per chunk
791+
environ_name="SUMMARIZATION_CHUNK_SIZE",
792+
environ_prefix=None,
793+
)
794+
SUMMARIZATION_OVERLAP_SIZE = values.FloatValue(
795+
default=0.05, # 5% overlap
796+
environ_name="SUMMARIZATION_OVERLAP_SIZE",
797+
environ_prefix=None,
798+
)
799+
SUMMARIZATION_CONCURRENT_REQUESTS = values.PositiveIntegerValue(
800+
default=5,
801+
environ_name="SUMMARIZATION_CONCURRENT_REQUESTS",
802+
environ_prefix=None,
803+
)
789804

790805
# Tavily API
791806
TAVILY_API_KEY = values.Value(

0 commit comments

Comments
 (0)