From eef9ad524974e41fa008f62d567f3ee2518900ca Mon Sep 17 00:00:00 2001 From: Tim Waugh Date: Mon, 24 Nov 2025 10:56:52 +0000 Subject: [PATCH 1/2] fix(rag): normalize page-level chunk targets to add_section action Page-level chunks (__PAGE__) are synthetic entries in the RAG index used for semantic search by page name/title/frontmatter. They don't correspond to real blocks in the file structure, so they can't be used as integration targets for actions like add_under or replace. When the LLM selects a __PAGE__ chunk as a target, it means "this knowledge belongs on this page" without specifying a particular block. The correct interpretation is add_section (new top-level section). Changes: - Detect targets ending with "::__PAGE__" after LLM ID translation - Normalize action to "add_section" regardless of LLM suggestion - Clear target_block_id (add_section has no specific target) - Add debug logging for normalization events This eliminates the "Target block not found: page::__PAGE__" errors and enables the LLM to suggest integration into pages that have no blocks yet (only page-level metadata exists in the RAG index). 
Impact: - Before: Integration fails with "target block not found" error - After: Page-level chunks correctly interpreted as add_section - Enables: Adding knowledge to empty but relevant pages Tests: - Added test_plan_integration_for_block_normalizes_page_level_chunks - Added test_plan_integration_for_block_preserves_regular_block_targets - All existing llm_wrappers tests pass Assisted-by: Claude Code --- src/logsqueak/services/llm_wrappers.py | 13 ++++ tests/unit/test_llm_wrappers.py | 103 +++++++++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/src/logsqueak/services/llm_wrappers.py b/src/logsqueak/services/llm_wrappers.py index 8bbb4b1..2315132 100644 --- a/src/logsqueak/services/llm_wrappers.py +++ b/src/logsqueak/services/llm_wrappers.py @@ -651,6 +651,19 @@ async def plan_integration_for_block( ) continue # Skip this chunk + # Normalize page-level chunks (__PAGE__) to add_section + # Page-level chunks exist only in RAG index for semantic search, + # not as real blocks in the file structure + if target_hybrid_id and target_hybrid_id.endswith("::__PAGE__"): + logger.debug( + "llm_page_level_chunk_normalized", + original_action=chunk.action, + target_page=chunk.target_page, + target_id=target_hybrid_id + ) + chunk.action = "add_section" + target_hybrid_id = None # No target_block_id for add_section + # Set translated target_block_id chunk.target_block_id = target_hybrid_id diff --git a/tests/unit/test_llm_wrappers.py b/tests/unit/test_llm_wrappers.py index 3cd0dca..04461ce 100644 --- a/tests/unit/test_llm_wrappers.py +++ b/tests/unit/test_llm_wrappers.py @@ -560,3 +560,106 @@ async def mock_stream(*args, **kwargs): # Assert - should handle gracefully (no crashes) assert len(results) == 0 + + +@pytest.mark.asyncio +async def test_plan_integration_for_block_normalizes_page_level_chunks(): + """Test plan_integration_for_block() normalizes __PAGE__ targets to add_section.""" + # Arrange + mock_client = Mock(spec=LLMClient) + + edited_content = 
EditedContent( + block_id="abc123", + original_content="[[diffused]] supports [[ACS]]", + hierarchical_context="- [[diffused]] supports [[ACS]]", + current_content="[[diffused]] can utilize [[ACS]]" + ) + + # Include page-level chunk (ends with ::__PAGE__) + candidate_chunks = [ + ("diffused", "diffused::__PAGE__", "Page: diffused\nTitle: diffused\ntags:: system, kubernetes"), + ("diffused", "block1", "- Deployment architecture\n - Uses Kubernetes") + ] + + page_contents = { + "diffused": LogseqOutline.parse( + "tags:: system, kubernetes\n\n" + "- Deployment architecture\n" + " id:: block1\n" + " - Uses Kubernetes" + ) + } + + # Mock LLM suggesting add_under action for page-level chunk + # NOTE: "diffused::__PAGE__" will be mapped to short ID (e.g., "1") + async def mock_stream(*args, **kwargs): + # LLM suggests adding under page-level chunk + yield IntegrationDecisionChunk( + target_page="diffused", + action="add_under", # LLM can suggest any action + target_block_id="1", # Short ID for "diffused::__PAGE__" + target_block_title="diffused", + confidence=0.95, + reasoning="The knowledge is about diffused system" + ) + + mock_client.stream_ndjson = mock_stream + + # Act + results = [] + async for chunk in plan_integration_for_block(mock_client, edited_content, candidate_chunks, page_contents): + results.append(chunk) + + # Assert - __PAGE__ target should be normalized to add_section + assert len(results) == 1 + assert results[0].knowledge_block_id == "abc123" + assert results[0].target_page == "diffused" + assert results[0].action == "add_section" # Normalized from add_under + assert results[0].target_block_id is None # Cleared (add_section has no target) + assert results[0].confidence == 0.95 + + +@pytest.mark.asyncio +async def test_plan_integration_for_block_preserves_regular_block_targets(): + """Test plan_integration_for_block() preserves regular block targets (not __PAGE__).""" + # Arrange + mock_client = Mock(spec=LLMClient) + + edited_content = EditedContent( 
+ block_id="abc123", + original_content="Kubernetes scaling info", + hierarchical_context="- Kubernetes scaling info", + current_content="Kubernetes enables horizontal scaling" + ) + + # Regular block chunk (NOT a page-level chunk) + candidate_chunks = [ + ("diffused", "block1", "- Deployment architecture\n - Uses Kubernetes") + ] + + page_contents = { + "diffused": LogseqOutline.parse("- Deployment architecture\n id:: block1") + } + + # Mock LLM suggesting add_under action for regular block + async def mock_stream(*args, **kwargs): + yield IntegrationDecisionChunk( + target_page="diffused", + action="add_under", + target_block_id="1", # Short ID for "block1" + target_block_title="Deployment architecture", + confidence=0.90, + reasoning="Related to deployment" + ) + + mock_client.stream_ndjson = mock_stream + + # Act + results = [] + async for chunk in plan_integration_for_block(mock_client, edited_content, candidate_chunks, page_contents): + results.append(chunk) + + # Assert - regular block should NOT be normalized + assert len(results) == 1 + assert results[0].action == "add_under" # Preserved + assert results[0].target_block_id == "block1" # Translated but not cleared From da0af4773914583ae728173573862fcc4842377a Mon Sep 17 00:00:00 2001 From: Tim Waugh Date: Mon, 24 Nov 2025 11:02:46 +0000 Subject: [PATCH 2/2] perf(llm): strip redundant frontmatter from page-level chunks in prompts Page-level chunks (__PAGE__) store frontmatter in their context for semantic search quality during embedding. However, when formatting these chunks for LLM prompts, the frontmatter was duplicated: - Once in the section (parsed from page outline) - Again in the content (from stored RAG chunk context) This wasted ~50-200 tokens per page depending on property count. Solution: Reuse existing _clean_context_for_llm() function from page_indexer.py to strip frontmatter from page-level chunks during prompt formatting. This elegant approach reuses tested code instead of duplicating logic. 
Changes: - Import _clean_context_for_llm() in llm_helpers.py - Detect page-level chunks (::__PAGE__) in format_chunks_for_llm() - Apply _clean_context_for_llm() to page-level chunks only - Regular blocks unchanged (already cleaned during indexing) - Frontmatter remains in section for LLM context Impact: - Before: "tags:: foo, bar" appears twice in prompt (properties + block) - After: "tags:: foo, bar" appears once (properties only) - Token savings: ~50-200 per page with properties - No impact on semantic search quality (frontmatter still embedded) Tests: - Added test_llm_helpers.py with 3 integration tests - Tests cover page-level chunk stripping, regular block preservation - All existing llm_wrappers tests pass (no regressions) Assisted-by: Claude Code --- src/logsqueak/services/llm_helpers.py | 12 ++- tests/unit/test_llm_helpers.py | 126 ++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_llm_helpers.py diff --git a/src/logsqueak/services/llm_helpers.py b/src/logsqueak/services/llm_helpers.py index ae82c88..adf62ce 100644 --- a/src/logsqueak/services/llm_helpers.py +++ b/src/logsqueak/services/llm_helpers.py @@ -2,6 +2,7 @@ from collections import defaultdict from logseq_outline.parser import LogseqOutline +from logsqueak.services.page_indexer import _clean_context_for_llm def format_chunks_for_llm( @@ -93,7 +94,16 @@ def format_chunks_for_llm( for block_id, context in chunks_by_page[page_name]: short_id = id_mapper.to_short(block_id) xml_parts.append(f'') - xml_parts.append(context) + + # Strip redundant frontmatter from page-level chunks + # Page-level chunks have format: "Page: X\nTitle: Y\n" + # Frontmatter is already shown in , so strip it + if block_id.endswith("::__PAGE__") and outline and outline.frontmatter: + cleaned_context = _clean_context_for_llm(context, outline.frontmatter) + xml_parts.append(cleaned_context) + else: + xml_parts.append(context) + xml_parts.append("") xml_parts.append("") 
diff --git a/tests/unit/test_llm_helpers.py b/tests/unit/test_llm_helpers.py new file mode 100644 index 0000000..44decc6 --- /dev/null +++ b/tests/unit/test_llm_helpers.py @@ -0,0 +1,126 @@ +"""Unit tests for LLM helper functions.""" + +import pytest +from logsqueak.services.llm_helpers import format_chunks_for_llm +from logsqueak.utils.llm_id_mapper import LLMIDMapper +from logseq_outline.parser import LogseqOutline + + +def test_format_chunks_for_llm_strips_page_level_frontmatter(): + """Test format_chunks_for_llm() strips frontmatter from page-level chunks.""" + # Arrange + id_mapper = LLMIDMapper() + page_name = "diffused" + page_chunk_id = f"{page_name}::__PAGE__" + regular_block_id = "block123" + + # Add IDs to mapper + id_mapper.add(page_chunk_id) + id_mapper.add(regular_block_id) + + # Page-level chunk with frontmatter in context + page_chunk_context = "Page: diffused\nTitle: Diffused System\ntags:: system, kubernetes" + # Regular block chunk (already cleaned during indexing) + regular_block_context = "- Deployment architecture\n - Uses Kubernetes" + + chunks = [ + (page_name, page_chunk_id, page_chunk_context), + (page_name, regular_block_id, regular_block_context), + ] + + page_contents = { + page_name: LogseqOutline.parse( + "tags:: system, kubernetes\n\n" + "- Deployment architecture\n" + " id:: block123\n" + " - Uses Kubernetes" + ) + } + + # Act + xml = format_chunks_for_llm(chunks, page_contents, id_mapper) + + # Assert - frontmatter should appear only in , not in page-level block + assert "" in xml + assert "tags:: system, kubernetes" in xml.split("")[1].split("")[0] + + # Page-level block should have "Page:" and "Title:" but NOT frontmatter + page_block_section = xml.split(f'')[1].split("")[0] + assert "Page: diffused" in page_block_section + assert "Title: Diffused System" in page_block_section + assert "tags::" not in page_block_section # Frontmatter stripped from block + + # Regular block should be unchanged + regular_block_section = 
xml.split(f'')[1].split("")[0] + assert "Deployment architecture" in regular_block_section + + +def test_format_chunks_for_llm_handles_regular_blocks_normally(): + """Test format_chunks_for_llm() doesn't strip frontmatter from regular blocks.""" + # Arrange + id_mapper = LLMIDMapper() + page_name = "Python" + block_id = "block456" + + id_mapper.add(block_id) + + # Regular block (not a page-level chunk) + # These are already cleaned during indexing, but test that we don't break them + chunks = [ + (page_name, block_id, "- Type hints are essential\n - Improve code quality"), + ] + + page_contents = { + page_name: LogseqOutline.parse( + "type:: language\n\n" + "- Type hints are essential\n" + " id:: block456\n" + " - Improve code quality" + ) + } + + # Act + xml = format_chunks_for_llm(chunks, page_contents, id_mapper) + + # Assert - regular block content is preserved as-is + assert "Type hints are essential" in xml + assert "Improve code quality" in xml + + +def test_format_chunks_for_llm_handles_multiple_pages(): + """Test format_chunks_for_llm() handles multiple pages with page-level chunks.""" + # Arrange + id_mapper = LLMIDMapper() + + page1_id = "Page1::__PAGE__" + page2_id = "Page2::__PAGE__" + + id_mapper.add(page1_id) + id_mapper.add(page2_id) + + chunks = [ + ("Page1", page1_id, "Page: Page1\ntags:: web"), + ("Page2", page2_id, "Page: Page2\ntype:: language"), + ] + + page_contents = { + "Page1": LogseqOutline.parse("tags:: web\n\n"), + "Page2": LogseqOutline.parse("type:: language\n\n"), + } + + # Act + xml = format_chunks_for_llm(chunks, page_contents, id_mapper) + + # Assert - each page should have properties section and cleaned blocks + assert xml.count("") == 2 + + # Page1 block should not have frontmatter + page1_section = xml.split('')[1].split("")[0] + page1_block = page1_section.split('')[1].split("")[0] + assert "Page: Page1" in page1_block + assert "tags::" not in page1_block # Stripped from block + + # But properties should have frontmatter + 
page1_properties = page1_section.split("")[1].split("")[0] + assert "tags:: web" in page1_properties