twaugh · twaugh · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
diff --git a/src/logsqueak/services/llm_helpers.py b/src/logsqueak/services/llm_helpers.py
@@ -2,6 +2,7 @@
 
 from collections import defaultdict
 from logseq_outline.parser import LogseqOutline
+from logsqueak.services.page_indexer import _clean_context_for_llm
 
 
 def format_chunks_for_llm(
@@ -93,7 +94,16 @@ def format_chunks_for_llm(
         for block_id, context in chunks_by_page[page_name]:
             short_id = id_mapper.to_short(block_id)
             xml_parts.append(f'<block id="{xml_escape(short_id)}">')
-            xml_parts.append(context)
+
+            # Strip redundant frontmatter from page-level chunks
+            # Page-level chunks have format: "Page: X\nTitle: Y\n<frontmatter>"
+            # Frontmatter is already shown in <properties>, so strip it
+            if block_id.endswith("::__PAGE__") and outline and outline.frontmatter:
+                cleaned_context = _clean_context_for_llm(context, outline.frontmatter)
+                xml_parts.append(cleaned_context)
+            else:
+                xml_parts.append(context)
+
             xml_parts.append("</block>")
 
         xml_parts.append("</page>")

diff --git a/src/logsqueak/services/llm_wrappers.py b/src/logsqueak/services/llm_wrappers.py
@@ -651,6 +651,19 @@ async def plan_integration_for_block(
                 )
                 continue  # Skip this chunk
 
+        # Normalize page-level chunks (__PAGE__) to add_section
+        # Page-level chunks exist only in RAG index for semantic search,
+        # not as real blocks in the file structure
+        if target_hybrid_id and target_hybrid_id.endswith("::__PAGE__"):
+            logger.debug(
+                "llm_page_level_chunk_normalized",
+                original_action=chunk.action,
+                target_page=chunk.target_page,
+                target_id=target_hybrid_id
+            )
+            chunk.action = "add_section"
+            target_hybrid_id = None  # No target_block_id for add_section
+
         # Set translated target_block_id
         chunk.target_block_id = target_hybrid_id
 

diff --git a/tests/unit/test_llm_helpers.py b/tests/unit/test_llm_helpers.py
@@ -0,0 +1,126 @@
+"""Unit tests for LLM helper functions."""
+
+import pytest
+from logsqueak.services.llm_helpers import format_chunks_for_llm
+from logsqueak.utils.llm_id_mapper import LLMIDMapper
+from logseq_outline.parser import LogseqOutline
+
+
+def test_format_chunks_for_llm_strips_page_level_frontmatter():
+    """Test format_chunks_for_llm() strips frontmatter from page-level chunks."""
+    # Arrange
+    id_mapper = LLMIDMapper()
+    page_name = "diffused"
+    page_chunk_id = f"{page_name}::__PAGE__"
+    regular_block_id = "block123"
+
+    # Register IDs; the splits below assume add() order yields short IDs "1" then "2"
+    id_mapper.add(page_chunk_id)
+    id_mapper.add(regular_block_id)
+
+    # Page-level chunk whose context repeats the page frontmatter ("tags:: ...")
+    page_chunk_context = "Page: diffused\nTitle: Diffused System\ntags:: system, kubernetes"
+    # Regular block chunk (already cleaned during indexing)
+    regular_block_context = "- Deployment architecture\n  - Uses Kubernetes"
+
+    chunks = [
+        (page_name, page_chunk_id, page_chunk_context),
+        (page_name, regular_block_id, regular_block_context),
+    ]
+
+    page_contents = {
+        page_name: LogseqOutline.parse(
+            "tags:: system, kubernetes\n\n"
+            "- Deployment architecture\n"
+            "  id:: block123\n"
+            "  - Uses Kubernetes"
+        )
+    }
+
+    # Act
+    xml = format_chunks_for_llm(chunks, page_contents, id_mapper)
+
+    # Assert - frontmatter should appear only in <properties>, not in page-level block
+    assert "<properties>" in xml
+    assert "tags:: system, kubernetes" in xml.split("<properties>")[1].split("</properties>")[0]
+
+    # Page-level block should have "Page:" and "Title:" but NOT frontmatter
+    page_block_section = xml.split('<block id="1">')[1].split("</block>")[0]
+    assert "Page: diffused" in page_block_section
+    assert "Title: Diffused System" in page_block_section
+    assert "tags::" not in page_block_section  # Frontmatter stripped from block
+
+    # Regular block should still carry its own content
+    regular_block_section = xml.split('<block id="2">')[1].split("</block>")[0]
+    assert "Deployment architecture" in regular_block_section
+
+
+def test_format_chunks_for_llm_handles_regular_blocks_normally():
+    """Test format_chunks_for_llm() keeps regular (non-page-level) block content in its output."""
+    # Arrange
+    id_mapper = LLMIDMapper()
+    page_name = "Python"
+    block_id = "block456"
+
+    id_mapper.add(block_id)
+
+    # Regular block: its ID does not end with "::__PAGE__", so it is not a page-level chunk
+    # These are already cleaned during indexing, but test that we don't break them
+    chunks = [
+        (page_name, block_id, "- Type hints are essential\n  - Improve code quality"),
+    ]
+
+    page_contents = {
+        page_name: LogseqOutline.parse(
+            "type:: language\n\n"
+            "- Type hints are essential\n"
+            "  id:: block456\n"
+            "  - Improve code quality"
+        )
+    }
+
+    # Act
+    xml = format_chunks_for_llm(chunks, page_contents, id_mapper)
+
+    # Assert - both lines of the regular block's text are present in the output
+    assert "Type hints are essential" in xml
+    assert "Improve code quality" in xml
+
+
+def test_format_chunks_for_llm_handles_multiple_pages():
+    """Test format_chunks_for_llm() handles multiple pages with page-level chunks."""
+    # Arrange
+    id_mapper = LLMIDMapper()
+
+    page1_id = "Page1::__PAGE__"
+    page2_id = "Page2::__PAGE__"
+
+    id_mapper.add(page1_id)
+    id_mapper.add(page2_id)
+
+    chunks = [
+        ("Page1", page1_id, "Page: Page1\ntags:: web"),
+        ("Page2", page2_id, "Page: Page2\ntype:: language"),
+    ]
+
+    page_contents = {
+        "Page1": LogseqOutline.parse("tags:: web\n\n"),
+        "Page2": LogseqOutline.parse("type:: language\n\n"),
+    }
+
+    # Act
+    xml = format_chunks_for_llm(chunks, page_contents, id_mapper)
+
+    # Assert - one <page> element and one <properties> section per page
+    assert xml.count("<page name=") == 2
+    assert xml.count("<properties>") == 2
+
+    # Page1 block should not have frontmatter (NOTE(review): Page2's block is never checked)
+    page1_section = xml.split('<page name="Page1">')[1].split("</page>")[0]
+    page1_block = page1_section.split('<block id="1">')[1].split("</block>")[0]
+    assert "Page: Page1" in page1_block
+    assert "tags::" not in page1_block  # Stripped from block
+
+    # But properties should have frontmatter
+    page1_properties = page1_section.split("<properties>")[1].split("</properties>")[0]
+    assert "tags:: web" in page1_properties
diff --git a/tests/unit/test_llm_wrappers.py b/tests/unit/test_llm_wrappers.py
@@ -560,3 +560,106 @@ async def mock_stream(*args, **kwargs):
 
     # Assert - should handle gracefully (no crashes)
     assert len(results) == 0
+
+
+@pytest.mark.asyncio
+async def test_plan_integration_for_block_normalizes_page_level_chunks():
+    """Test plan_integration_for_block() turns a __PAGE__ target into add_section with no target block."""
+    # Arrange
+    mock_client = Mock(spec=LLMClient)
+
+    edited_content = EditedContent(
+        block_id="abc123",
+        original_content="[[diffused]] supports [[ACS]]",
+        hierarchical_context="- [[diffused]] supports [[ACS]]",
+        current_content="[[diffused]] can utilize [[ACS]]"
+    )
+
+    # Include page-level chunk (ID ends with ::__PAGE__) alongside a regular block
+    candidate_chunks = [
+        ("diffused", "diffused::__PAGE__", "Page: diffused\nTitle: diffused\ntags:: system, kubernetes"),
+        ("diffused", "block1", "- Deployment architecture\n  - Uses Kubernetes")
+    ]
+
+    page_contents = {
+        "diffused": LogseqOutline.parse(
+            "tags:: system, kubernetes\n\n"
+            "- Deployment architecture\n"
+            "  id:: block1\n"
+            "  - Uses Kubernetes"
+        )
+    }
+
+    # Mock LLM suggesting add_under action for page-level chunk
+    # NOTE: "diffused::__PAGE__" will be mapped to short ID (e.g., "1")
+    async def mock_stream(*args, **kwargs):
+        # Single decision that targets the page-level chunk by its short ID
+        yield IntegrationDecisionChunk(
+            target_page="diffused",
+            action="add_under",  # LLM can suggest any action
+            target_block_id="1",  # Short ID for "diffused::__PAGE__"
+            target_block_title="diffused",
+            confidence=0.95,
+            reasoning="The knowledge is about diffused system"
+        )
+
+    mock_client.stream_ndjson = mock_stream
+
+    # Act
+    results = []
+    async for chunk in plan_integration_for_block(mock_client, edited_content, candidate_chunks, page_contents):
+        results.append(chunk)
+
+    # Assert - action rewritten and target cleared; the other fields pass through
+    assert len(results) == 1
+    assert results[0].knowledge_block_id == "abc123"
+    assert results[0].target_page == "diffused"
+    assert results[0].action == "add_section"  # Normalized from add_under
+    assert results[0].target_block_id is None  # Cleared (add_section has no target)
+    assert results[0].confidence == 0.95
+
+
+@pytest.mark.asyncio
+async def test_plan_integration_for_block_preserves_regular_block_targets():
+    """Test plan_integration_for_block() keeps the action and target for non-__PAGE__ block targets."""
+    # Arrange
+    mock_client = Mock(spec=LLMClient)
+
+    edited_content = EditedContent(
+        block_id="abc123",
+        original_content="Kubernetes scaling info",
+        hierarchical_context="- Kubernetes scaling info",
+        current_content="Kubernetes enables horizontal scaling"
+    )
+
+    # Regular block chunk (ID does NOT end with ::__PAGE__)
+    candidate_chunks = [
+        ("diffused", "block1", "- Deployment architecture\n  - Uses Kubernetes")
+    ]
+
+    page_contents = {
+        "diffused": LogseqOutline.parse("- Deployment architecture\n  id:: block1")
+    }
+
+    # Mock LLM suggesting add_under action for regular block
+    async def mock_stream(*args, **kwargs):
+        yield IntegrationDecisionChunk(
+            target_page="diffused",
+            action="add_under",
+            target_block_id="1",  # Short ID for "block1"
+            target_block_title="Deployment architecture",
+            confidence=0.90,
+            reasoning="Related to deployment"
+        )
+
+    mock_client.stream_ndjson = mock_stream
+
+    # Act
+    results = []
+    async for chunk in plan_integration_for_block(mock_client, edited_content, candidate_chunks, page_contents):
+        results.append(chunk)
+
+    # Assert - action untouched; short ID "1" comes back as the original block ID
+    assert len(results) == 1
+    assert results[0].action == "add_under"  # Preserved
+    assert results[0].target_block_id == "block1"  # Translated but not cleared