Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/logsqueak/services/llm_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from collections import defaultdict
from logseq_outline.parser import LogseqOutline
from logsqueak.services.page_indexer import _clean_context_for_llm


def format_chunks_for_llm(
Expand Down Expand Up @@ -93,7 +94,16 @@ def format_chunks_for_llm(
for block_id, context in chunks_by_page[page_name]:
short_id = id_mapper.to_short(block_id)
xml_parts.append(f'<block id="{xml_escape(short_id)}">')
xml_parts.append(context)

# Strip redundant frontmatter from page-level chunks
# Page-level chunks have format: "Page: X\nTitle: Y\n<frontmatter>"
# Frontmatter is already shown in <properties>, so strip it
if block_id.endswith("::__PAGE__") and outline and outline.frontmatter:
cleaned_context = _clean_context_for_llm(context, outline.frontmatter)
xml_parts.append(cleaned_context)
else:
xml_parts.append(context)

xml_parts.append("</block>")

xml_parts.append("</page>")
Expand Down
13 changes: 13 additions & 0 deletions src/logsqueak/services/llm_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,19 @@ async def plan_integration_for_block(
)
continue # Skip this chunk

# Normalize page-level chunks (__PAGE__) to add_section
# Page-level chunks exist only in RAG index for semantic search,
# not as real blocks in the file structure
if target_hybrid_id and target_hybrid_id.endswith("::__PAGE__"):
logger.debug(
"llm_page_level_chunk_normalized",
original_action=chunk.action,
target_page=chunk.target_page,
target_id=target_hybrid_id
)
chunk.action = "add_section"
target_hybrid_id = None # No target_block_id for add_section

# Set translated target_block_id
chunk.target_block_id = target_hybrid_id

Expand Down
126 changes: 126 additions & 0 deletions tests/unit/test_llm_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""Unit tests for LLM helper functions."""

import pytest
from logsqueak.services.llm_helpers import format_chunks_for_llm
from logsqueak.utils.llm_id_mapper import LLMIDMapper
from logseq_outline.parser import LogseqOutline


def test_format_chunks_for_llm_strips_page_level_frontmatter():
    """Test format_chunks_for_llm() strips frontmatter from page-level chunks.

    A page-level chunk (ID ending in ``::__PAGE__``) carries the page's
    frontmatter in its context. The same frontmatter is rendered in the
    ``<properties>`` element, so it must not be repeated inside the
    page-level ``<block>``. A regular block on the same page must be
    passed through untouched.
    """
    # Arrange
    id_mapper = LLMIDMapper()
    page_name = "diffused"
    page_chunk_id = f"{page_name}::__PAGE__"
    regular_block_id = "block123"

    # Add IDs to mapper
    id_mapper.add(page_chunk_id)
    id_mapper.add(regular_block_id)

    # Page-level chunk with frontmatter in context
    page_chunk_context = "Page: diffused\nTitle: Diffused System\ntags:: system, kubernetes"
    # Regular block chunk (already cleaned during indexing)
    regular_block_context = "- Deployment architecture\n - Uses Kubernetes"

    chunks = [
        (page_name, page_chunk_id, page_chunk_context),
        (page_name, regular_block_id, regular_block_context),
    ]

    page_contents = {
        page_name: LogseqOutline.parse(
            "tags:: system, kubernetes\n\n"
            "- Deployment architecture\n"
            " id:: block123\n"
            " - Uses Kubernetes"
        )
    }

    # Act
    xml = format_chunks_for_llm(chunks, page_contents, id_mapper)

    # Assert - frontmatter should appear only in <properties>, not in page-level block
    assert "<properties>" in xml
    properties_section = xml.split("<properties>")[1].split("</properties>")[0]
    assert "tags:: system, kubernetes" in properties_section

    # Ask the mapper for the short IDs instead of hard-coding "1"/"2", so the
    # test does not depend on the mapper's numbering scheme.
    page_block_open = f'<block id="{id_mapper.to_short(page_chunk_id)}">'
    regular_block_open = f'<block id="{id_mapper.to_short(regular_block_id)}">'

    # Fail with a readable message rather than an IndexError from split()[1]
    assert page_block_open in xml
    assert regular_block_open in xml

    # Page-level block should have "Page:" and "Title:" but NOT frontmatter
    page_block_section = xml.split(page_block_open)[1].split("</block>")[0]
    assert "Page: diffused" in page_block_section
    assert "Title: Diffused System" in page_block_section
    assert "tags::" not in page_block_section  # Frontmatter stripped from block

    # Regular block should be unchanged: both the parent line and its child
    regular_block_section = xml.split(regular_block_open)[1].split("</block>")[0]
    assert "Deployment architecture" in regular_block_section
    assert "Uses Kubernetes" in regular_block_section


def test_format_chunks_for_llm_handles_regular_blocks_normally():
    """Regular (non page-level) chunks keep their content in the output.

    The page has frontmatter, but the only chunk is an ordinary block, so
    both the block's own line and its nested child line must still be
    present in the generated XML.
    """
    # Arrange: one ordinary block registered with the mapper
    mapper = LLMIDMapper()
    mapper.add("block456")

    # The chunk context was already cleaned at indexing time; formatting
    # must not damage it.
    candidate_chunks = [
        ("Python", "block456", "- Type hints are essential\n - Improve code quality"),
    ]

    python_outline = LogseqOutline.parse(
        "type:: language\n\n"
        "- Type hints are essential\n"
        " id:: block456\n"
        " - Improve code quality"
    )

    # Act
    rendered = format_chunks_for_llm(candidate_chunks, {"Python": python_outline}, mapper)

    # Assert: parent and child text both survive formatting
    for expected_text in ("Type hints are essential", "Improve code quality"):
        assert expected_text in rendered


def test_format_chunks_for_llm_handles_multiple_pages():
    """Test format_chunks_for_llm() handles multiple pages with page-level chunks.

    Each page must get its own ``<page>`` and ``<properties>`` elements, and
    each page-level block must have its frontmatter stripped. Both pages are
    verified, not just the first.
    """
    # Arrange
    id_mapper = LLMIDMapper()

    page1_id = "Page1::__PAGE__"
    page2_id = "Page2::__PAGE__"

    id_mapper.add(page1_id)
    id_mapper.add(page2_id)

    chunks = [
        ("Page1", page1_id, "Page: Page1\ntags:: web"),
        ("Page2", page2_id, "Page: Page2\ntype:: language"),
    ]

    page_contents = {
        "Page1": LogseqOutline.parse("tags:: web\n\n"),
        "Page2": LogseqOutline.parse("type:: language\n\n"),
    }

    # Act
    xml = format_chunks_for_llm(chunks, page_contents, id_mapper)

    # Assert - each page should have properties section and cleaned blocks
    assert xml.count("<page name=") == 2
    assert xml.count("<properties>") == 2

    # (page name, page-level chunk ID, frontmatter line, property key prefix)
    expectations = [
        ("Page1", page1_id, "tags:: web", "tags::"),
        ("Page2", page2_id, "type:: language", "type::"),
    ]

    for page_name, chunk_id, frontmatter_line, property_prefix in expectations:
        page_open = f'<page name="{page_name}">'
        assert page_open in xml
        page_section = xml.split(page_open)[1].split("</page>")[0]

        # Resolve the short ID through the mapper instead of hard-coding it
        block_open = f'<block id="{id_mapper.to_short(chunk_id)}">'
        assert block_open in page_section

        # Page-level block keeps its "Page:" header but not the frontmatter
        page_block = page_section.split(block_open)[1].split("</block>")[0]
        assert f"Page: {page_name}" in page_block
        assert property_prefix not in page_block  # Stripped from block

        # But properties should have frontmatter
        page_properties = page_section.split("<properties>")[1].split("</properties>")[0]
        assert frontmatter_line in page_properties
103 changes: 103 additions & 0 deletions tests/unit/test_llm_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,3 +560,106 @@ async def mock_stream(*args, **kwargs):

# Assert - should handle gracefully (no crashes)
assert len(results) == 0


@pytest.mark.asyncio
async def test_plan_integration_for_block_normalizes_page_level_chunks():
    """A decision targeting a ``::__PAGE__`` chunk is rewritten to add_section.

    The fake LLM stream answers with ``add_under`` aimed at the short ID of
    the page-level chunk. The planner is expected to turn that into
    ``add_section`` and clear the target block ID, leaving the other fields
    intact.
    """
    # Arrange: the knowledge block being integrated
    knowledge_block = EditedContent(
        block_id="abc123",
        original_content="[[diffused]] supports [[ACS]]",
        hierarchical_context="- [[diffused]] supports [[ACS]]",
        current_content="[[diffused]] can utilize [[ACS]]"
    )

    # Candidates: the page-level chunk comes first, then an ordinary block
    candidates = [
        ("diffused", "diffused::__PAGE__", "Page: diffused\nTitle: diffused\ntags:: system, kubernetes"),
        ("diffused", "block1", "- Deployment architecture\n - Uses Kubernetes")
    ]

    diffused_outline = LogseqOutline.parse(
        "tags:: system, kubernetes\n\n"
        "- Deployment architecture\n"
        " id:: block1\n"
        " - Uses Kubernetes"
    )

    async def fake_stream(*args, **kwargs):
        # The LLM may pick any action; here it points add_under at short ID
        # "1", which stands for "diffused::__PAGE__".
        yield IntegrationDecisionChunk(
            target_page="diffused",
            action="add_under",
            target_block_id="1",
            target_block_title="diffused",
            confidence=0.95,
            reasoning="The knowledge is about diffused system"
        )

    llm_client = Mock(spec=LLMClient)
    llm_client.stream_ndjson = fake_stream

    # Act: drain the planner's async stream
    decisions = [
        decision
        async for decision in plan_integration_for_block(
            llm_client, knowledge_block, candidates, {"diffused": diffused_outline}
        )
    ]

    # Assert: exactly one decision, normalized away from the __PAGE__ target
    assert len(decisions) == 1
    normalized = decisions[0]
    assert normalized.knowledge_block_id == "abc123"
    assert normalized.target_page == "diffused"
    assert normalized.action == "add_section"  # was add_under
    assert normalized.target_block_id is None  # add_section carries no target
    assert normalized.confidence == 0.95


@pytest.mark.asyncio
async def test_plan_integration_for_block_preserves_regular_block_targets():
    """A decision targeting an ordinary block keeps its action and target.

    Only the short-ID to real-ID translation should happen: the action
    stays ``add_under`` and the target becomes the original block ID.
    """
    # Arrange: the knowledge block being integrated
    knowledge_block = EditedContent(
        block_id="abc123",
        original_content="Kubernetes scaling info",
        hierarchical_context="- Kubernetes scaling info",
        current_content="Kubernetes enables horizontal scaling"
    )

    # Single candidate that is a regular block, not a page-level chunk
    candidates = [
        ("diffused", "block1", "- Deployment architecture\n - Uses Kubernetes")
    ]

    diffused_outline = LogseqOutline.parse("- Deployment architecture\n id:: block1")

    async def fake_stream(*args, **kwargs):
        # The LLM targets short ID "1", which stands for "block1"
        yield IntegrationDecisionChunk(
            target_page="diffused",
            action="add_under",
            target_block_id="1",
            target_block_title="Deployment architecture",
            confidence=0.90,
            reasoning="Related to deployment"
        )

    llm_client = Mock(spec=LLMClient)
    llm_client.stream_ndjson = fake_stream

    # Act: drain the planner's async stream
    decisions = [
        decision
        async for decision in plan_integration_for_block(
            llm_client, knowledge_block, candidates, {"diffused": diffused_outline}
        )
    ]

    # Assert: nothing is normalized for a regular block target
    assert len(decisions) == 1
    preserved = decisions[0]
    assert preserved.action == "add_under"  # unchanged
    assert preserved.target_block_id == "block1"  # translated, not cleared