diff --git a/src/logsqueak/services/llm_helpers.py b/src/logsqueak/services/llm_helpers.py index ae82c88..adf62ce 100644 --- a/src/logsqueak/services/llm_helpers.py +++ b/src/logsqueak/services/llm_helpers.py @@ -2,6 +2,7 @@ from collections import defaultdict from logseq_outline.parser import LogseqOutline +from logsqueak.services.page_indexer import _clean_context_for_llm def format_chunks_for_llm( @@ -93,7 +94,16 @@ def format_chunks_for_llm( for block_id, context in chunks_by_page[page_name]: short_id = id_mapper.to_short(block_id) xml_parts.append(f'') - xml_parts.append(context) + + # Strip redundant frontmatter from page-level chunks + # Page-level chunks have format: "Page: X\nTitle: Y\n" + # Frontmatter is already shown in , so strip it + if block_id.endswith("::__PAGE__") and outline and outline.frontmatter: + cleaned_context = _clean_context_for_llm(context, outline.frontmatter) + xml_parts.append(cleaned_context) + else: + xml_parts.append(context) + xml_parts.append("") xml_parts.append("") diff --git a/src/logsqueak/services/llm_wrappers.py b/src/logsqueak/services/llm_wrappers.py index 8bbb4b1..2315132 100644 --- a/src/logsqueak/services/llm_wrappers.py +++ b/src/logsqueak/services/llm_wrappers.py @@ -651,6 +651,19 @@ async def plan_integration_for_block( ) continue # Skip this chunk + # Normalize page-level chunks (__PAGE__) to add_section + # Page-level chunks exist only in RAG index for semantic search, + # not as real blocks in the file structure + if target_hybrid_id and target_hybrid_id.endswith("::__PAGE__"): + logger.debug( + "llm_page_level_chunk_normalized", + original_action=chunk.action, + target_page=chunk.target_page, + target_id=target_hybrid_id + ) + chunk.action = "add_section" + target_hybrid_id = None # No target_block_id for add_section + # Set translated target_block_id chunk.target_block_id = target_hybrid_id diff --git a/tests/unit/test_llm_helpers.py b/tests/unit/test_llm_helpers.py new file mode 100644 index 0000000..44decc6 
# File: tests/unit/test_llm_helpers.py (added by this patch as a new file)
"""Unit tests for LLM helper functions."""

import re

import pytest
from logsqueak.services.llm_helpers import format_chunks_for_llm
from logsqueak.utils.llm_id_mapper import LLMIDMapper
from logseq_outline.parser import LogseqOutline

# NOTE(review): in the copy of this patch that was reviewed, every XML tag
# literal used by the assertions was an empty string (e.g. ``xml.split("")``),
# which makes ``str.split`` raise ``ValueError: empty separator`` and makes
# ``"" in xml`` vacuously true. The helpers below locate elements by name
# instead. The element names ``properties``, ``block`` and ``page`` are
# inferred from the surrounding comments and variable names -- confirm them
# against the output of format_chunks_for_llm().
# NOTE(review): runs of whitespace inside the string literals may also have
# been collapsed (e.g. the indentation after ``\n``) -- verify against the
# original commit.


def _element_bodies(xml, tag):
    """Return the inner text of every ``<tag ...>...</tag>`` element in *xml*.

    Attributes on the opening tag are ignored, so the helper does not depend
    on how IDs or page names are rendered. Elements of the same name are
    assumed not to nest.
    """
    name = re.escape(tag)
    # ``\b`` keeps ``<page`` from matching a longer name such as ``<pages``.
    pattern = rf"<{name}\b[^>]*>(.*?)</{name}>"
    return re.findall(pattern, xml, flags=re.DOTALL)


def _body_containing(xml, tag, marker):
    """Return the body of the single ``<tag>`` element that contains *marker*."""
    matches = [body for body in _element_bodies(xml, tag) if marker in body]
    assert len(matches) == 1, (
        f"expected exactly one <{tag}> containing {marker!r}, "
        f"found {len(matches)}"
    )
    return matches[0]


def test_format_chunks_for_llm_strips_page_level_frontmatter():
    """Test format_chunks_for_llm() strips frontmatter from page-level chunks."""
    # Arrange
    id_mapper = LLMIDMapper()
    page_name = "diffused"
    page_chunk_id = f"{page_name}::__PAGE__"
    regular_block_id = "block123"

    # Add IDs to mapper
    id_mapper.add(page_chunk_id)
    id_mapper.add(regular_block_id)

    # Page-level chunk with frontmatter in context
    page_chunk_context = "Page: diffused\nTitle: Diffused System\ntags:: system, kubernetes"
    # Regular block chunk (already cleaned during indexing)
    regular_block_context = "- Deployment architecture\n - Uses Kubernetes"

    chunks = [
        (page_name, page_chunk_id, page_chunk_context),
        (page_name, regular_block_id, regular_block_context),
    ]

    page_contents = {
        page_name: LogseqOutline.parse(
            "tags:: system, kubernetes\n\n"
            "- Deployment architecture\n"
            " id:: block123\n"
            " - Uses Kubernetes"
        )
    }

    # Act
    xml = format_chunks_for_llm(chunks, page_contents, id_mapper)

    # Assert - frontmatter should appear only in the properties element,
    # not in the page-level block
    properties_sections = _element_bodies(xml, "properties")
    assert properties_sections
    assert "tags:: system, kubernetes" in properties_sections[0]

    # Page-level block should have "Page:" and "Title:" but NOT frontmatter
    page_block_section = _body_containing(xml, "block", "Page: diffused")
    assert "Page: diffused" in page_block_section
    assert "Title: Diffused System" in page_block_section
    assert "tags::" not in page_block_section  # Frontmatter stripped from block

    # Regular block should be unchanged
    regular_block_section = _body_containing(xml, "block", "Deployment architecture")
    assert "Deployment architecture" in regular_block_section


def test_format_chunks_for_llm_handles_regular_blocks_normally():
    """Test format_chunks_for_llm() doesn't strip frontmatter from regular blocks."""
    # Arrange
    id_mapper = LLMIDMapper()
    page_name = "Python"
    block_id = "block456"

    id_mapper.add(block_id)

    # Regular block (not a page-level chunk)
    # These are already cleaned during indexing, but test that we don't break them
    chunks = [
        (page_name, block_id, "- Type hints are essential\n - Improve code quality"),
    ]

    page_contents = {
        page_name: LogseqOutline.parse(
            "type:: language\n\n"
            "- Type hints are essential\n"
            " id:: block456\n"
            " - Improve code quality"
        )
    }

    # Act
    xml = format_chunks_for_llm(chunks, page_contents, id_mapper)

    # Assert - regular block content is preserved as-is
    assert "Type hints are essential" in xml
    assert "Improve code quality" in xml


def test_format_chunks_for_llm_handles_multiple_pages():
    """Test format_chunks_for_llm() handles multiple pages with page-level chunks."""
    # Arrange
    id_mapper = LLMIDMapper()

    page1_id = "Page1::__PAGE__"
    page2_id = "Page2::__PAGE__"

    id_mapper.add(page1_id)
    id_mapper.add(page2_id)

    chunks = [
        ("Page1", page1_id, "Page: Page1\ntags:: web"),
        ("Page2", page2_id, "Page: Page2\ntype:: language"),
    ]

    page_contents = {
        "Page1": LogseqOutline.parse("tags:: web\n\n"),
        "Page2": LogseqOutline.parse("type:: language\n\n"),
    }

    # Act
    xml = format_chunks_for_llm(chunks, page_contents, id_mapper)

    # Assert - each page should have properties section and cleaned blocks
    assert len(_element_bodies(xml, "properties")) == 2

    # Page1 block should not have frontmatter
    page1_section = _body_containing(xml, "page", "Page: Page1")
    page1_block = _body_containing(page1_section, "block", "Page: Page1")
    assert "Page: Page1" in page1_block
    assert "tags::" not in page1_block  # Stripped from block

    # But properties should have frontmatter
    page1_properties_sections = _element_bodies(page1_section, "properties")
    assert page1_properties_sections
    assert "tags:: web" in page1_properties_sections[0]
# File: tests/unit/test_llm_wrappers.py
# Tests appended by this patch (hunk @@ -560,3 +560,106 @@), directly after the
# existing test whose final statement is `assert len(results) == 0`.


@pytest.mark.asyncio
async def test_plan_integration_for_block_normalizes_page_level_chunks():
    """A decision aimed at a ``::__PAGE__`` chunk is rewritten to add_section."""

    async def fake_stream(*args, **kwargs):
        # The LLM may propose any action; here it proposes add_under against
        # short ID "1", which the original test expects to stand for
        # "diffused::__PAGE__".
        yield IntegrationDecisionChunk(
            target_page="diffused",
            action="add_under",
            target_block_id="1",
            target_block_title="diffused",
            confidence=0.95,
            reasoning="The knowledge is about diffused system",
        )

    # Arrange
    llm = Mock(spec=LLMClient)
    llm.stream_ndjson = fake_stream

    knowledge = EditedContent(
        block_id="abc123",
        original_content="[[diffused]] supports [[ACS]]",
        hierarchical_context="- [[diffused]] supports [[ACS]]",
        current_content="[[diffused]] can utilize [[ACS]]",
    )

    # First candidate is the page-level chunk (ID ends with ::__PAGE__).
    page_level_chunk = (
        "diffused",
        "diffused::__PAGE__",
        "Page: diffused\nTitle: diffused\ntags:: system, kubernetes",
    )
    regular_chunk = (
        "diffused",
        "block1",
        "- Deployment architecture\n - Uses Kubernetes",
    )
    candidates = [page_level_chunk, regular_chunk]

    outline = LogseqOutline.parse(
        "tags:: system, kubernetes\n\n"
        "- Deployment architecture\n"
        " id:: block1\n"
        " - Uses Kubernetes"
    )
    outlines = {"diffused": outline}

    # Act
    decisions = [
        decision
        async for decision in plan_integration_for_block(
            llm, knowledge, candidates, outlines
        )
    ]

    # Assert - the __PAGE__ target is normalized to add_section
    assert len(decisions) == 1
    decision = decisions[0]
    assert decision.knowledge_block_id == "abc123"
    assert decision.target_page == "diffused"
    assert decision.action == "add_section"  # was add_under
    assert decision.target_block_id is None  # add_section carries no target
    assert decision.confidence == 0.95


@pytest.mark.asyncio
async def test_plan_integration_for_block_preserves_regular_block_targets():
    """A decision aimed at an ordinary block keeps its action and target."""

    async def fake_stream(*args, **kwargs):
        # add_under against short ID "1", expected to stand for "block1".
        yield IntegrationDecisionChunk(
            target_page="diffused",
            action="add_under",
            target_block_id="1",
            target_block_title="Deployment architecture",
            confidence=0.90,
            reasoning="Related to deployment",
        )

    # Arrange
    llm = Mock(spec=LLMClient)
    llm.stream_ndjson = fake_stream

    knowledge = EditedContent(
        block_id="abc123",
        original_content="Kubernetes scaling info",
        hierarchical_context="- Kubernetes scaling info",
        current_content="Kubernetes enables horizontal scaling",
    )

    # Only a regular block chunk, no page-level chunk.
    candidates = [
        ("diffused", "block1", "- Deployment architecture\n - Uses Kubernetes"),
    ]
    outlines = {
        "diffused": LogseqOutline.parse("- Deployment architecture\n id:: block1"),
    }

    # Act
    decisions = [
        decision
        async for decision in plan_integration_for_block(
            llm, knowledge, candidates, outlines
        )
    ]

    # Assert - nothing is normalized for a regular block
    assert len(decisions) == 1
    decision = decisions[0]
    assert decision.action == "add_under"  # action kept
    assert decision.target_block_id == "block1"  # short ID translated, not cleared