From eef9ad524974e41fa008f62d567f3ee2518900ca Mon Sep 17 00:00:00 2001 From: Tim Waugh Date: Mon, 24 Nov 2025 10:56:52 +0000 Subject: [PATCH 1/2] fix(rag): normalize page-level chunk targets to add_section action Page-level chunks (__PAGE__) are synthetic entries in the RAG index used for semantic search by page name/title/frontmatter. They don't correspond to real blocks in the file structure, so they can't be used as integration targets for actions like add_under or replace. When the LLM selects a __PAGE__ chunk as a target, it means "this knowledge belongs on this page" without specifying a particular block. The correct interpretation is add_section (new top-level section). Changes: - Detect targets ending with "::__PAGE__" after LLM ID translation - Normalize action to "add_section" regardless of LLM suggestion - Clear target_block_id (add_section has no specific target) - Add debug logging for normalization events This eliminates the "Target block not found: page::__PAGE__" errors and enables the LLM to suggest integration into pages that have no blocks yet (only page-level metadata exists in the RAG index). 
Impact: - Before: Integration fails with "target block not found" error - After: Page-level chunks correctly interpreted as add_section - Enables: Adding knowledge to empty but relevant pages Tests: - Added test_plan_integration_for_block_normalizes_page_level_chunks - Added test_plan_integration_for_block_preserves_regular_block_targets - All existing llm_wrappers tests pass Assisted-by: Claude Code --- src/logsqueak/services/llm_wrappers.py | 13 ++++ tests/unit/test_llm_wrappers.py | 103 +++++++++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/src/logsqueak/services/llm_wrappers.py b/src/logsqueak/services/llm_wrappers.py index 8bbb4b1..2315132 100644 --- a/src/logsqueak/services/llm_wrappers.py +++ b/src/logsqueak/services/llm_wrappers.py @@ -651,6 +651,19 @@ async def plan_integration_for_block( ) continue # Skip this chunk + # Normalize page-level chunks (__PAGE__) to add_section + # Page-level chunks exist only in RAG index for semantic search, + # not as real blocks in the file structure + if target_hybrid_id and target_hybrid_id.endswith("::__PAGE__"): + logger.debug( + "llm_page_level_chunk_normalized", + original_action=chunk.action, + target_page=chunk.target_page, + target_id=target_hybrid_id + ) + chunk.action = "add_section" + target_hybrid_id = None # No target_block_id for add_section + # Set translated target_block_id chunk.target_block_id = target_hybrid_id diff --git a/tests/unit/test_llm_wrappers.py b/tests/unit/test_llm_wrappers.py index 3cd0dca..04461ce 100644 --- a/tests/unit/test_llm_wrappers.py +++ b/tests/unit/test_llm_wrappers.py @@ -560,3 +560,106 @@ async def mock_stream(*args, **kwargs): # Assert - should handle gracefully (no crashes) assert len(results) == 0 + + +@pytest.mark.asyncio +async def test_plan_integration_for_block_normalizes_page_level_chunks(): + """Test plan_integration_for_block() normalizes __PAGE__ targets to add_section.""" + # Arrange + mock_client = Mock(spec=LLMClient) + + edited_content = 
EditedContent( + block_id="abc123", + original_content="[[diffused]] supports [[ACS]]", + hierarchical_context="- [[diffused]] supports [[ACS]]", + current_content="[[diffused]] can utilize [[ACS]]" + ) + + # Include page-level chunk (ends with ::__PAGE__) + candidate_chunks = [ + ("diffused", "diffused::__PAGE__", "Page: diffused\nTitle: diffused\ntags:: system, kubernetes"), + ("diffused", "block1", "- Deployment architecture\n - Uses Kubernetes") + ] + + page_contents = { + "diffused": LogseqOutline.parse( + "tags:: system, kubernetes\n\n" + "- Deployment architecture\n" + " id:: block1\n" + " - Uses Kubernetes" + ) + } + + # Mock LLM suggesting add_under action for page-level chunk + # NOTE: "diffused::__PAGE__" will be mapped to short ID (e.g., "1") + async def mock_stream(*args, **kwargs): + # LLM suggests adding under page-level chunk + yield IntegrationDecisionChunk( + target_page="diffused", + action="add_under", # LLM can suggest any action + target_block_id="1", # Short ID for "diffused::__PAGE__" + target_block_title="diffused", + confidence=0.95, + reasoning="The knowledge is about diffused system" + ) + + mock_client.stream_ndjson = mock_stream + + # Act + results = [] + async for chunk in plan_integration_for_block(mock_client, edited_content, candidate_chunks, page_contents): + results.append(chunk) + + # Assert - __PAGE__ target should be normalized to add_section + assert len(results) == 1 + assert results[0].knowledge_block_id == "abc123" + assert results[0].target_page == "diffused" + assert results[0].action == "add_section" # Normalized from add_under + assert results[0].target_block_id is None # Cleared (add_section has no target) + assert results[0].confidence == 0.95 + + +@pytest.mark.asyncio +async def test_plan_integration_for_block_preserves_regular_block_targets(): + """Test plan_integration_for_block() preserves regular block targets (not __PAGE__).""" + # Arrange + mock_client = Mock(spec=LLMClient) + + edited_content = EditedContent( 
+ block_id="abc123", + original_content="Kubernetes scaling info", + hierarchical_context="- Kubernetes scaling info", + current_content="Kubernetes enables horizontal scaling" + ) + + # Regular block chunk (NOT a page-level chunk) + candidate_chunks = [ + ("diffused", "block1", "- Deployment architecture\n - Uses Kubernetes") + ] + + page_contents = { + "diffused": LogseqOutline.parse("- Deployment architecture\n id:: block1") + } + + # Mock LLM suggesting add_under action for regular block + async def mock_stream(*args, **kwargs): + yield IntegrationDecisionChunk( + target_page="diffused", + action="add_under", + target_block_id="1", # Short ID for "block1" + target_block_title="Deployment architecture", + confidence=0.90, + reasoning="Related to deployment" + ) + + mock_client.stream_ndjson = mock_stream + + # Act + results = [] + async for chunk in plan_integration_for_block(mock_client, edited_content, candidate_chunks, page_contents): + results.append(chunk) + + # Assert - regular block should NOT be normalized + assert len(results) == 1 + assert results[0].action == "add_under" # Preserved + assert results[0].target_block_id == "block1" # Translated but not cleared From da0af4773914583ae728173573862fcc4842377a Mon Sep 17 00:00:00 2001 From: Tim Waugh Date: Mon, 24 Nov 2025 11:02:46 +0000 Subject: [PATCH 2/2] perf(llm): strip redundant frontmatter from page-level chunks in prompts Page-level chunks (__PAGE__) store frontmatter in their context for semantic search quality during embedding. However, when formatting these chunks for LLM prompts, the frontmatter was duplicated: - Once in the section (parsed from page outline) - Again in the content (from stored RAG chunk context) This wasted ~50-200 tokens per page depending on property count. Solution: Reuse existing _clean_context_for_llm() function from page_indexer.py to strip frontmatter from page-level chunks during prompt formatting. This elegant approach reuses tested code instead of duplicating logic. 
Changes: - Import _clean_context_for_llm() in llm_helpers.py - Detect page-level chunks (::__PAGE__) in format_chunks_for_llm() - Apply _clean_context_for_llm() to page-level chunks only - Regular blocks unchanged (already cleaned during indexing) - Frontmatter remains in section for LLM context Impact: - Before: "tags:: foo, bar" appears twice in prompt (properties + block) - After: "tags:: foo, bar" appears once (properties only) - Token savings: ~50-200 per page with properties - No impact on semantic search quality (frontmatter still embedded) Tests: - Added test_llm_helpers.py with 3 integration tests - Tests cover page-level chunk stripping, regular block preservation - All existing llm_wrappers tests pass (no regressions) Assisted-by: Claude Code --- src/logsqueak/services/llm_helpers.py | 12 ++- tests/unit/test_llm_helpers.py | 126 ++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_llm_helpers.py diff --git a/src/logsqueak/services/llm_helpers.py b/src/logsqueak/services/llm_helpers.py index ae82c88..adf62ce 100644 --- a/src/logsqueak/services/llm_helpers.py +++ b/src/logsqueak/services/llm_helpers.py @@ -2,6 +2,7 @@ from collections import defaultdict from logseq_outline.parser import LogseqOutline +from logsqueak.services.page_indexer import _clean_context_for_llm def format_chunks_for_llm( @@ -93,7 +94,16 @@ def format_chunks_for_llm( for block_id, context in chunks_by_page[page_name]: short_id = id_mapper.to_short(block_id) xml_parts.append(f'') - xml_parts.append(context) + + # Strip redundant frontmatter from page-level chunks + # Page-level chunks have format: "Page: X\nTitle: Y\n" + # Frontmatter is already shown in , so strip it + if block_id.endswith("::__PAGE__") and outline and outline.frontmatter: + cleaned_context = _clean_context_for_llm(context, outline.frontmatter) + xml_parts.append(cleaned_context) + else: + xml_parts.append(context) + xml_parts.append("") xml_parts.append("") 
diff --git a/tests/unit/test_llm_helpers.py b/tests/unit/test_llm_helpers.py new file mode 100644 index 0000000..44decc6 --- /dev/null +++ b/tests/unit/test_llm_helpers.py @@ -0,0 +1,126 @@ +"""Unit tests for LLM helper functions.""" + +import pytest +from logsqueak.services.llm_helpers import format_chunks_for_llm +from logsqueak.utils.llm_id_mapper import LLMIDMapper +from logseq_outline.parser import LogseqOutline + + +def test_format_chunks_for_llm_strips_page_level_frontmatter(): + """Test format_chunks_for_llm() strips frontmatter from page-level chunks.""" + # Arrange + id_mapper = LLMIDMapper() + page_name = "diffused" + page_chunk_id = f"{page_name}::__PAGE__" + regular_block_id = "block123" + + # Add IDs to mapper + id_mapper.add(page_chunk_id) + id_mapper.add(regular_block_id) + + # Page-level chunk with frontmatter in context + page_chunk_context = "Page: diffused\nTitle: Diffused System\ntags:: system, kubernetes" + # Regular block chunk (already cleaned during indexing) + regular_block_context = "- Deployment architecture\n - Uses Kubernetes" + + chunks = [ + (page_name, page_chunk_id, page_chunk_context), + (page_name, regular_block_id, regular_block_context), + ] + + page_contents = { + page_name: LogseqOutline.parse( + "tags:: system, kubernetes\n\n" + "- Deployment architecture\n" + " id:: block123\n" + " - Uses Kubernetes" + ) + } + + # Act + xml = format_chunks_for_llm(chunks, page_contents, id_mapper) + + # Assert - frontmatter should appear only in , not in page-level block + assert "" in xml + assert "tags:: system, kubernetes" in xml.split("")[1].split("")[0] + + # Page-level block should have "Page:" and "Title:" but NOT frontmatter + page_block_section = xml.split(f'')[1].split("")[0] + assert "Page: diffused" in page_block_section + assert "Title: Diffused System" in page_block_section + assert "tags::" not in page_block_section # Frontmatter stripped from block + + # Regular block should be unchanged + regular_block_section = 
xml.split(f'')[1].split("")[0] + assert "Deployment architecture" in regular_block_section + + +def test_format_chunks_for_llm_handles_regular_blocks_normally(): + """Test format_chunks_for_llm() doesn't strip frontmatter from regular blocks.""" + # Arrange + id_mapper = LLMIDMapper() + page_name = "Python" + block_id = "block456" + + id_mapper.add(block_id) + + # Regular block (not a page-level chunk) + # These are already cleaned during indexing, but test that we don't break them + chunks = [ + (page_name, block_id, "- Type hints are essential\n - Improve code quality"), + ] + + page_contents = { + page_name: LogseqOutline.parse( + "type:: language\n\n" + "- Type hints are essential\n" + " id:: block456\n" + " - Improve code quality" + ) + } + + # Act + xml = format_chunks_for_llm(chunks, page_contents, id_mapper) + + # Assert - regular block content is preserved as-is + assert "Type hints are essential" in xml + assert "Improve code quality" in xml + + +def test_format_chunks_for_llm_handles_multiple_pages(): + """Test format_chunks_for_llm() handles multiple pages with page-level chunks.""" + # Arrange + id_mapper = LLMIDMapper() + + page1_id = "Page1::__PAGE__" + page2_id = "Page2::__PAGE__" + + id_mapper.add(page1_id) + id_mapper.add(page2_id) + + chunks = [ + ("Page1", page1_id, "Page: Page1\ntags:: web"), + ("Page2", page2_id, "Page: Page2\ntype:: language"), + ] + + page_contents = { + "Page1": LogseqOutline.parse("tags:: web\n\n"), + "Page2": LogseqOutline.parse("type:: language\n\n"), + } + + # Act + xml = format_chunks_for_llm(chunks, page_contents, id_mapper) + + # Assert - each page should have properties section and cleaned blocks + assert xml.count("") == 2 + + # Page1 block should not have frontmatter + page1_section = xml.split('')[1].split("")[0] + page1_block = page1_section.split('')[1].split("")[0] + assert "Page: Page1" in page1_block + assert "tags::" not in page1_block # Stripped from block + + # But properties should have frontmatter + 
page1_properties = page1_section.split("")[1].split("")[0] + assert "tags:: web" in page1_properties