From e9e89e333253c239faf49eabdd087a6945627761 Mon Sep 17 00:00:00 2001
From: Chris Bartholomew <chris.bartholomew@vectorize.io>
Date: Mon, 22 Jun 2026 11:21:31 -0400
Subject: [PATCH 1/4] feat(tokens): propagate cached + thoughts tokens through
 return contexts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Gemini 2.5+ family (and any future provider that combines prompt caching
with reasoning tokens) reports four distinct token counts on every response:

  - prompt_token_count        (total input)
  - candidates_token_count    (visible output)
  - cached_content_token_count (subset of input served from prompt cache)
  - thoughts_token_count      (reasoning tokens, billed at output rate)

The provider already records the last two on the Prometheus
``hindsight.llm.tokens.{cached_input,thoughts}`` counters, but the values
stop at the metrics layer — the return contexts (TokenUsage,
LLMToolCallResult, TokenUsageSummary, RetainResult) expose only the
top-level input/output split, apart from TokenUsage, which already
carries cached_tokens. As a result:

  * a downstream metering extension can't attribute prompt-cache hit-rate
    per operation (only globally via Prometheus aggregates), and
  * reasoning-token spend never shows up in ``output_tokens`` because the
    provider reports it separately from candidates_token_count. A workload
    that "looks cheap" by visible output can be silently expensive if the
    model is doing long reasoning chains.

This change threads the two fields through end-to-end:

  - ``TokenUsage`` gains ``thoughts_tokens`` (cached_tokens already
    existed); ``__add__`` sums it so multi-iteration agentic-loop
    aggregation works.
  - ``LLMToolCallResult`` gains ``cached_tokens`` + ``thoughts_tokens``.
  - ``TokenUsageSummary`` (returned by ``run_reflect_agent``) gains
    both fields and ``run_reflect_agent`` accumulates them at every
    call site (main tool loop + ``final_rewrite`` call +
    structured-output extraction + 4 forced-completion branches).
  - ``_generate_structured_output`` now returns a 5-tuple
    ``(output, in, out, cached, thoughts)``; the 6 unpack sites in the
    reflect agent are updated together.
  - ``RetainResult`` gains optional ``llm_cached_input_tokens`` and
    ``llm_thoughts_tokens`` fields; ``memory_engine`` populates them
    from the aggregated ``TokenUsage``. Defaults stay ``None`` for
    engines that don't surface the data so existing metering extensions
    are unaffected.
  - The Gemini provider — which was already reading the four token
    counts from the SDK response — now returns ``thoughts_tokens`` on
    both the ``call`` and ``call_with_tools`` paths, and the existing
    ``cached_input_tokens`` value reaches ``LLMToolCallResult``.

Backward compatibility: every new field defaults to 0 (or None for the
RetainResult dataclass), so any caller built before this change keeps
working. Provider impls that don't surface these counts simply propagate
zeros — the structured Prometheus counters were already optional in
``record_llm_call``.

Adds focused tests (``test_token_usage_cached_thoughts.py``, 6 cases)
pinning the propagation through every return type and the aggregation
behavior. Existing reflect-agent + Gemini provider tests (87 cases) pass
unchanged.

This is a pure plumbing change — no metrics are renamed, no behavior is
gated, no flags are added.
---
 .../hindsight_api/engine/memory_engine.py     |   2 +
 .../engine/providers/gemini_llm.py            |   3 +
 .../hindsight_api/engine/reflect/agent.py     |  59 ++++++--
 .../hindsight_api/engine/reflect/models.py    |  19 ++-
 .../hindsight_api/engine/response_models.py   |  30 +++-
 .../extensions/operation_validator.py         |  10 ++
 .../tests/test_token_usage_cached_thoughts.py | 135 ++++++++++++++++++
 7 files changed, 240 insertions(+), 18 deletions(-)
 create mode 100644 hindsight-api-slim/tests/test_token_usage_cached_thoughts.py

diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py
index 6212877f3..96427771c 100644
--- a/hindsight-api-slim/hindsight_api/engine/memory_engine.py
+++ b/hindsight-api-slim/hindsight_api/engine/memory_engine.py
@@ -3442,6 +3442,8 @@ async def retain_batch_async(
                 llm_input_tokens=total_usage.input_tokens,
                 llm_output_tokens=total_usage.output_tokens,
                 llm_total_tokens=total_usage.total_tokens,
+                llm_cached_input_tokens=getattr(total_usage, "cached_tokens", 0) or 0,
+                llm_thoughts_tokens=getattr(total_usage, "thoughts_tokens", 0) or 0,
                 processed_content_tokens=total_processed_content_tokens,
             )
             try:
diff --git a/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py b/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py
index 9c2c5cd3a..9d94fa741 100644
--- a/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py
+++ b/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py
@@ -424,6 +424,7 @@ def _build_generation_config(use_cache: bool) -> "genai_types.GenerateContentCon
                         output_tokens=output_tokens,
                         total_tokens=input_tokens + output_tokens,
                         cached_tokens=cached_tokens,
+                        thoughts_tokens=thoughts_tokens,
                     )
                     return result, token_usage
                 return result
@@ -762,6 +763,8 @@ def _build_tools_config(use_cache: bool) -> "genai_types.GenerateContentConfig":
                     finish_reason=finish_reason,
                     input_tokens=input_tokens,
                     output_tokens=output_tokens,
+                    cached_tokens=cached_input_tokens,
+                    thoughts_tokens=thoughts_tokens,
                 )
 
             except genai_errors.APIError as e:
diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
index 04ea2427b..b8fca3f2b 100644
--- a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
+++ b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
@@ -141,7 +141,7 @@ async def _generate_structured_output(
     response_schema: dict,
     llm_config: "LLMProvider",
     reflect_id: str,
-) -> tuple[dict[str, Any] | None, int, int]:
+) -> tuple[dict[str, Any] | None, int, int, int, int]:
     """Generate structured output from an answer using the provided JSON schema.
 
     Args:
@@ -151,7 +151,7 @@ async def _generate_structured_output(
         reflect_id: Reflect ID for logging
 
     Returns:
-        Tuple of (structured_output, input_tokens, output_tokens).
+        Tuple of (structured_output, input_tokens, output_tokens, cached_tokens, thoughts_tokens).
         structured_output is None if generation fails.
     """
     try:
@@ -186,7 +186,7 @@ def _json_schema_type_to_python(field_schema: dict) -> type:
 
         if not fields:
             logger.warning(f"[REFLECT {reflect_id}] No fields found in response_schema, skipping structured output")
-            return None, 0, 0
+            return None, 0, 0, 0, 0
 
         DynamicModel = create_model("StructuredResponse", **fields)
 
@@ -259,7 +259,13 @@ def _json_schema_type_to_python(field_schema: dict) -> type:
                 logger.warning(f"[REFLECT {reflect_id}] Required field '{field_name}' is empty in structured output")
 
         logger.info(f"[REFLECT {reflect_id}] Generated structured output with {len(structured_output)} fields")
-        return structured_output, usage.input_tokens, usage.output_tokens
+        return (
+            structured_output,
+            usage.input_tokens,
+            usage.output_tokens,
+            getattr(usage, "cached_tokens", 0) or 0,
+            getattr(usage, "thoughts_tokens", 0) or 0,
+        )
 
     except Exception as e:
         logger.warning(f"[REFLECT {reflect_id}] Failed to generate structured output: {e}")
@@ -435,9 +441,14 @@ async def run_reflect_agent(
     llm_trace: list[dict[str, Any]] = []
     context_history: list[dict[str, Any]] = []  # For final prompt fallback
 
-    # Token usage tracking - accumulate across all LLM calls
+    # Token usage tracking - accumulate across all LLM calls.
+    # cached_tokens and thoughts_tokens are surfaced for cost attribution
+    # and prompt-cache tuning. cached_tokens is a subset of input_tokens;
+    # thoughts_tokens sits beside output_tokens and is NOT in total_tokens.
     total_input_tokens = 0
     total_output_tokens = 0
+    total_cached_tokens = 0
+    total_thoughts_tokens = 0
 
     # Track available IDs for validation (prevents hallucinated citations)
     available_memory_ids: set[str] = set()
@@ -460,6 +471,8 @@ def _get_usage() -> TokenUsageSummary:
             input_tokens=total_input_tokens,
             output_tokens=total_output_tokens,
             total_tokens=total_input_tokens + total_output_tokens,
+            cached_tokens=total_cached_tokens,
+            thoughts_tokens=total_thoughts_tokens,
         )
 
     def _log_completion(answer: str, iterations: int, forced: bool = False):
@@ -526,6 +539,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             llm_duration = int((time.time() - llm_start) * 1000)
             total_input_tokens += usage.input_tokens
             total_output_tokens += usage.output_tokens
+            total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": "final",
@@ -539,11 +554,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out = await _generate_structured_output(
+                structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
                     answer, response_schema, llm_config, reflect_id
                 )
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
+                total_cached_tokens += struct_cached
+                total_thoughts_tokens += struct_thoughts
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -588,6 +605,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             llm_duration = int((time.time() - llm_start) * 1000)
             total_input_tokens += usage.input_tokens
             total_output_tokens += usage.output_tokens
+            total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": "final",
@@ -600,11 +619,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
 
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out = await _generate_structured_output(
+                structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
                     answer, response_schema, llm_config, reflect_id
                 )
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
+                total_cached_tokens += struct_cached
+                total_thoughts_tokens += struct_thoughts
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -661,6 +682,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             consecutive_errors = 0
             total_input_tokens += result.input_tokens
             total_output_tokens += result.output_tokens
+            total_cached_tokens += getattr(result, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(result, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": f"agent_{iteration + 1}",
@@ -709,6 +732,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             llm_duration = int((time.time() - llm_start) * 1000)
             total_input_tokens += usage.input_tokens
             total_output_tokens += usage.output_tokens
+            total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": "final",
@@ -722,11 +747,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out = await _generate_structured_output(
+                structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
                     answer, response_schema, llm_config, reflect_id
                 )
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
+                total_cached_tokens += struct_cached
+                total_thoughts_tokens += struct_thoughts
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -783,6 +810,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
                     )
                     total_input_tokens += rewrite_usage.input_tokens
                     total_output_tokens += rewrite_usage.output_tokens
+                    total_cached_tokens += getattr(rewrite_usage, "cached_tokens", 0) or 0
+                    total_thoughts_tokens += getattr(rewrite_usage, "thoughts_tokens", 0) or 0
                     llm_trace.append(
                         {
                             "scope": "final_rewrite",
@@ -796,11 +825,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
                 # Generate structured output if schema provided
                 structured_output = None
                 if response_schema and answer:
-                    structured_output, struct_in, struct_out = await _generate_structured_output(
+                    structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
                         answer, response_schema, llm_config, reflect_id
                     )
                     total_input_tokens += struct_in
                     total_output_tokens += struct_out
+                    total_cached_tokens += struct_cached
+                    total_thoughts_tokens += struct_thoughts
 
                 _log_completion(answer, iteration + 1)
                 return ReflectAgentResult(
@@ -835,6 +866,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             llm_duration = int((time.time() - llm_start) * 1000)
             total_input_tokens += usage.input_tokens
             total_output_tokens += usage.output_tokens
+            total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0
+            total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0
             llm_trace.append(
                 {
                     "scope": "final",
@@ -848,11 +881,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out = await _generate_structured_output(
+                structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
                     answer, response_schema, llm_config, reflect_id
                 )
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
+                total_cached_tokens += struct_cached
+                total_thoughts_tokens += struct_thoughts
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -1147,7 +1182,7 @@ async def _process_done_tool(
     structured_output = None
     final_usage = usage
     if response_schema and llm_config and answer:
-        structured_output, struct_in, struct_out = await _generate_structured_output(
+        structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
             answer, response_schema, llm_config, reflect_id
         )
         # Add structured output tokens to usage
@@ -1155,6 +1190,8 @@ async def _process_done_tool(
             input_tokens=usage.input_tokens + struct_in,
             output_tokens=usage.output_tokens + struct_out,
             total_tokens=usage.total_tokens + struct_in + struct_out,
+            cached_tokens=usage.cached_tokens + struct_cached,
+            thoughts_tokens=usage.thoughts_tokens + struct_thoughts,
         )
 
     log_completion(answer, iterations)
diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/models.py b/hindsight-api-slim/hindsight_api/engine/reflect/models.py
index 26c3f150a..d0e44f831 100644
--- a/hindsight-api-slim/hindsight_api/engine/reflect/models.py
+++ b/hindsight-api-slim/hindsight_api/engine/reflect/models.py
@@ -78,9 +78,22 @@ class DirectiveInfo(BaseModel):
 class TokenUsageSummary(BaseModel):
     """Total token usage across all LLM calls."""
 
-    input_tokens: int = Field(default=0, description="Total input tokens used")
-    output_tokens: int = Field(default=0, description="Total output tokens used")
-    total_tokens: int = Field(default=0, description="Total tokens (input + output)")
+    input_tokens: int = Field(default=0, description="Total input tokens used (includes any cached prefix tokens)")
+    output_tokens: int = Field(
+        default=0, description="Total visible output tokens used (excludes reasoning/thoughts)"
+    )
+    total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)")
+    cached_tokens: int = Field(
+        default=0,
+        description="Cached/cache-read prompt tokens summed across calls. Subset of input_tokens.",
+    )
+    thoughts_tokens: int = Field(
+        default=0,
+        description=(
+            "Reasoning/thinking tokens summed across calls. Billed at the output rate by some providers "
+            "but not part of visible output."
+        ),
+    )
 
 
 class ReflectAgentResult(BaseModel):
diff --git a/hindsight-api-slim/hindsight_api/engine/response_models.py b/hindsight-api-slim/hindsight_api/engine/response_models.py
index ffa92c8d5..16c92d99e 100644
--- a/hindsight-api-slim/hindsight_api/engine/response_models.py
+++ b/hindsight-api-slim/hindsight_api/engine/response_models.py
@@ -31,8 +31,20 @@ class LLMToolCallResult(BaseModel):
     content: str | None = Field(default=None, description="Text content if any")
     tool_calls: list[LLMToolCall] = Field(default_factory=list, description="Tool calls requested by the LLM")
     finish_reason: str | None = Field(default=None, description="Reason the LLM stopped: 'stop', 'tool_calls', etc.")
-    input_tokens: int = Field(default=0, description="Input tokens used in this call")
-    output_tokens: int = Field(default=0, description="Output tokens used in this call")
+    input_tokens: int = Field(
+        default=0,
+        description="Input tokens used in this call (includes any cached prefix tokens reported by the provider)",
+    )
+    output_tokens: int = Field(
+        default=0, description="Visible output tokens used in this call (excludes reasoning/thoughts)"
+    )
+    cached_tokens: int = Field(
+        default=0, description="Cached prefix tokens, when reported by the provider. Subset of input_tokens."
+    )
+    thoughts_tokens: int = Field(
+        default=0,
+        description="Reasoning/thinking tokens. Billed at the output rate by some providers but not part of visible output.",
+    )
 
 
 class ToolCallTrace(BaseModel):
@@ -91,9 +103,18 @@ class TokenUsage(BaseModel):
     )
 
     input_tokens: int = Field(default=0, description="Number of input/prompt tokens consumed")
-    output_tokens: int = Field(default=0, description="Number of output/completion tokens generated")
-    total_tokens: int = Field(default=0, description="Total tokens (input + output)")
+    output_tokens: int = Field(
+        default=0, description="Number of visible output/completion tokens generated (excludes reasoning/thoughts)"
+    )
+    total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)")
     cached_tokens: int = Field(default=0, description="Cached/cache-read prompt tokens, when reported by the provider")
+    thoughts_tokens: int = Field(
+        default=0,
+        description=(
+            "Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers "
+            "(e.g. Gemini 2.5+ family) but not surfaced in the visible response."
+        ),
+    )
 
     def __add__(self, other: "TokenUsage") -> "TokenUsage":
         """Allow aggregating token usage from multiple calls."""
@@ -102,6 +123,7 @@ def __add__(self, other: "TokenUsage") -> "TokenUsage":
             output_tokens=self.output_tokens + other.output_tokens,
             total_tokens=self.total_tokens + other.total_tokens,
             cached_tokens=self.cached_tokens + other.cached_tokens,
+            thoughts_tokens=self.thoughts_tokens + other.thoughts_tokens,
         )
 
 
diff --git a/hindsight-api-slim/hindsight_api/extensions/operation_validator.py b/hindsight-api-slim/hindsight_api/extensions/operation_validator.py
index f28491759..6c6394add 100644
--- a/hindsight-api-slim/hindsight_api/extensions/operation_validator.py
+++ b/hindsight-api-slim/hindsight_api/extensions/operation_validator.py
@@ -208,6 +208,16 @@ class RetainResult:
     llm_input_tokens: int | None = None
     llm_output_tokens: int | None = None
     llm_total_tokens: int | None = None
+    # Diagnostic token splits surfaced for cost attribution and prompt-cache
+    # tuning. ``llm_cached_input_tokens`` is the subset of llm_input_tokens
+    # served from the provider's prompt cache (e.g. Gemini's
+    # cached_content_token_count). ``llm_thoughts_tokens`` is reasoning tokens
+    # that are billed at the output rate by some providers (Gemini 2.5+) but
+    # are not part of the visible response. Both default to None when the
+    # engine/provider didn't report them; downstream metering extensions
+    # should treat None as 0.
+    llm_cached_input_tokens: int | None = None
+    llm_thoughts_tokens: int | None = None
     # Content tokens the retain pipeline actually processed, after
     # chunk-level content-hash deduplication. Semantics:
     #   None — no dedup signal available (e.g. a first-time retain or a
diff --git a/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py b/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py
new file mode 100644
index 000000000..2b0ad3ac9
--- /dev/null
+++ b/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py
@@ -0,0 +1,135 @@
+"""Tests for cached / thoughts token propagation through TokenUsage,
+LLMToolCallResult, TokenUsageSummary, and RetainResult.
+
+The Gemini 2.5+ family (and any future provider with prompt caching +
+reasoning tokens) reports four distinct token counts on every response:
+prompt, candidates (visible output), cached_content, and thoughts. The
+last two are billed separately by the provider but were previously not
+threaded through to downstream return contexts, so application-layer
+metering had no way to attribute prompt-cache hit rate or reasoning cost
+per operation.
+
+These tests pin the propagation: when a provider populates cached or
+thoughts on the way out, every accumulator and aggregate type carries
+the value through unchanged.
+"""
+
+from __future__ import annotations
+
+from hindsight_api.engine.reflect.models import TokenUsageSummary
+from hindsight_api.engine.response_models import LLMToolCallResult, TokenUsage
+from hindsight_api.extensions.operation_validator import RetainResult
+
+
+def test_token_usage_carries_cached_and_thoughts():
+    """TokenUsage defaults both new fields to 0 and accepts non-zero values."""
+    u = TokenUsage(input_tokens=1500, output_tokens=500, total_tokens=2000)
+    assert u.cached_tokens == 0
+    assert u.thoughts_tokens == 0
+
+    u = TokenUsage(
+        input_tokens=1500,
+        output_tokens=500,
+        total_tokens=2000,
+        cached_tokens=200,
+        thoughts_tokens=80,
+    )
+    assert u.cached_tokens == 200
+    assert u.thoughts_tokens == 80
+
+
+def test_token_usage_aggregates_thoughts_tokens():
+    """TokenUsage.__add__ sums thoughts_tokens alongside the existing fields.
+
+    Multi-iteration agentic loops accumulate per-call usage via ``+``. If
+    ``__add__`` omits thoughts_tokens, the sum falls back to the field
+    default of 0 and the reasoning spend of every sub-call is lost.
+    """
+    a = TokenUsage(input_tokens=10, output_tokens=5, total_tokens=15, cached_tokens=2, thoughts_tokens=7)
+    b = TokenUsage(input_tokens=20, output_tokens=8, total_tokens=28, cached_tokens=3, thoughts_tokens=11)
+    c = a + b
+    assert c.input_tokens == 30
+    assert c.output_tokens == 13
+    assert c.total_tokens == 43
+    assert c.cached_tokens == 5
+    assert c.thoughts_tokens == 18
+
+
+def test_llm_tool_call_result_carries_cached_and_thoughts():
+    """call_with_tools returns LLMToolCallResult — both new fields default to 0
+    and accept non-zero values from the provider."""
+    r = LLMToolCallResult(content="ok", input_tokens=1234, output_tokens=56)
+    assert r.cached_tokens == 0
+    assert r.thoughts_tokens == 0
+
+    r = LLMToolCallResult(
+        content="ok",
+        input_tokens=1234,
+        output_tokens=56,
+        cached_tokens=200,
+        thoughts_tokens=78,
+    )
+    assert r.cached_tokens == 200
+    assert r.thoughts_tokens == 78
+
+
+def test_token_usage_summary_carries_cached_and_thoughts():
+    """TokenUsageSummary is what reflect agent returns to its caller — needs
+    to propagate the aggregate so per-op cost attribution works."""
+    s = TokenUsageSummary(
+        input_tokens=10000,
+        output_tokens=200,
+        total_tokens=10200,
+        cached_tokens=3000,
+        thoughts_tokens=150,
+    )
+    assert s.cached_tokens == 3000
+    assert s.thoughts_tokens == 150
+
+
+def test_token_usage_summary_defaults_cached_and_thoughts_to_zero():
+    """Defaults preserve backward compatibility for callers built before the
+    fields existed."""
+    s = TokenUsageSummary(input_tokens=100, output_tokens=50, total_tokens=150)
+    assert s.cached_tokens == 0
+    assert s.thoughts_tokens == 0
+
+
+def test_retain_result_carries_cached_input_and_thoughts():
+    """RetainResult is the contract between the engine and any metering
+    extension. The two new fields are optional (None) so older extensions
+    that don't read them are unaffected; engines that DO populate them get
+    end-to-end attribution into the metering hook."""
+
+    class _Ctx:
+        pass
+
+    r = RetainResult(
+        bank_id="b",
+        contents=[],
+        request_context=_Ctx(),
+        document_id=None,
+        fact_type_override=None,
+        unit_ids=[],
+        llm_input_tokens=1000,
+        llm_output_tokens=50,
+        llm_total_tokens=1050,
+        llm_cached_input_tokens=300,
+        llm_thoughts_tokens=25,
+    )
+    assert r.llm_cached_input_tokens == 300
+    assert r.llm_thoughts_tokens == 25
+
+    # Defaults stay None for engines that don't surface the data, so
+    # downstream extensions can use ``or 0`` without breaking on a
+    # core-only build.
+    r2 = RetainResult(
+        bank_id="b",
+        contents=[],
+        request_context=_Ctx(),
+        document_id=None,
+        fact_type_override=None,
+        unit_ids=[],
+    )
+    assert r2.llm_cached_input_tokens is None
+    assert r2.llm_thoughts_tokens is None

From bf49f794895e41ccdd2c0a8bba77e681be4ebd06 Mon Sep 17 00:00:00 2001
From: Chris Bartholomew <chris.bartholomew@vectorize.io>
Date: Mon, 22 Jun 2026 12:00:28 -0400
Subject: [PATCH 2/4] chore: regenerate clients + openapi spec for
 thoughts_tokens field

Picks up the new TokenUsage.thoughts_tokens field added in the parent
commit. Generated by:

  ./scripts/generate-openapi.sh
  ./scripts/generate-clients.sh

Plus ``ruff format`` over the two reflect/ source files to match the
project's enforced formatting style.

No hand edits in any generated file.
---
 .../hindsight_api/engine/reflect/agent.py     | 50 +++++++++++++------
 .../hindsight_api/engine/reflect/models.py    |  4 +-
 hindsight-clients/go/api/openapi.yaml         | 12 ++++-
 hindsight-clients/go/model_token_usage.go     | 45 ++++++++++++++++-
 .../models/token_usage.py                     | 10 ++--
 .../typescript/generated/types.gen.ts         | 10 +++-
 hindsight-docs/static/openapi.json            | 10 +++-
 7 files changed, 111 insertions(+), 30 deletions(-)

diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
index b8fca3f2b..1f03fdd10 100644
--- a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
+++ b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
@@ -554,9 +554,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
-                    answer, response_schema, llm_config, reflect_id
-                )
+                (
+                    structured_output,
+                    struct_in,
+                    struct_out,
+                    struct_cached,
+                    struct_thoughts,
+                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
                 total_cached_tokens += struct_cached
@@ -619,9 +623,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
 
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
-                    answer, response_schema, llm_config, reflect_id
-                )
+                (
+                    structured_output,
+                    struct_in,
+                    struct_out,
+                    struct_cached,
+                    struct_thoughts,
+                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
                 total_cached_tokens += struct_cached
@@ -747,9 +755,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
-                    answer, response_schema, llm_config, reflect_id
-                )
+                (
+                    structured_output,
+                    struct_in,
+                    struct_out,
+                    struct_cached,
+                    struct_thoughts,
+                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
                 total_cached_tokens += struct_cached
@@ -825,9 +837,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
                 # Generate structured output if schema provided
                 structured_output = None
                 if response_schema and answer:
-                    structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
-                        answer, response_schema, llm_config, reflect_id
-                    )
+                    (
+                        structured_output,
+                        struct_in,
+                        struct_out,
+                        struct_cached,
+                        struct_thoughts,
+                    ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                     total_input_tokens += struct_in
                     total_output_tokens += struct_out
                     total_cached_tokens += struct_cached
@@ -881,9 +897,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
-                    answer, response_schema, llm_config, reflect_id
-                )
+                (
+                    structured_output,
+                    struct_in,
+                    struct_out,
+                    struct_cached,
+                    struct_thoughts,
+                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
                 total_input_tokens += struct_in
                 total_output_tokens += struct_out
                 total_cached_tokens += struct_cached
diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/models.py b/hindsight-api-slim/hindsight_api/engine/reflect/models.py
index d0e44f831..580442fea 100644
--- a/hindsight-api-slim/hindsight_api/engine/reflect/models.py
+++ b/hindsight-api-slim/hindsight_api/engine/reflect/models.py
@@ -79,9 +79,7 @@ class TokenUsageSummary(BaseModel):
     """Total token usage across all LLM calls."""
 
     input_tokens: int = Field(default=0, description="Total input tokens used (includes any cached prefix tokens)")
-    output_tokens: int = Field(
-        default=0, description="Total visible output tokens used (excludes reasoning/thoughts)"
-    )
+    output_tokens: int = Field(default=0, description="Total visible output tokens used (excludes reasoning/thoughts)")
     total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)")
     cached_tokens: int = Field(
         default=0,
diff --git a/hindsight-clients/go/api/openapi.yaml b/hindsight-clients/go/api/openapi.yaml
index 2e8b9d5f2..df793aa40 100644
--- a/hindsight-clients/go/api/openapi.yaml
+++ b/hindsight-clients/go/api/openapi.yaml
@@ -7843,12 +7843,13 @@ components:
           type: integer
         output_tokens:
           default: 0
-          description: Number of output/completion tokens generated
+          description: Number of visible output/completion tokens generated (excludes
+            reasoning/thoughts)
           title: Output Tokens
           type: integer
         total_tokens:
           default: 0
-          description: Total tokens (input + output)
+          description: "Total tokens (input + output, excludes thoughts)"
           title: Total Tokens
           type: integer
         cached_tokens:
@@ -7856,6 +7857,13 @@ components:
           description: "Cached/cache-read prompt tokens, when reported by the provider"
           title: Cached Tokens
           type: integer
+        thoughts_tokens:
+          default: 0
+          description: Reasoning/thinking tokens generated by the model. Billed at
+            the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced
+            in the visible response.
+          title: Thoughts Tokens
+          type: integer
       title: TokenUsage
     ToolCallsIncludeOptions:
       description: Options for including tool calls in reflect results.
diff --git a/hindsight-clients/go/model_token_usage.go b/hindsight-clients/go/model_token_usage.go
index b00f446fb..8030d10f7 100644
--- a/hindsight-clients/go/model_token_usage.go
+++ b/hindsight-clients/go/model_token_usage.go
@@ -21,12 +21,14 @@ var _ MappedNullable = &TokenUsage{}
 type TokenUsage struct {
 	// Number of input/prompt tokens consumed
 	InputTokens *int32 `json:"input_tokens,omitempty"`
-	// Number of output/completion tokens generated
+	// Number of visible output/completion tokens generated (excludes reasoning/thoughts)
 	OutputTokens *int32 `json:"output_tokens,omitempty"`
-	// Total tokens (input + output)
+	// Total tokens (input + output, excludes thoughts)
 	TotalTokens *int32 `json:"total_tokens,omitempty"`
 	// Cached/cache-read prompt tokens, when reported by the provider
 	CachedTokens *int32 `json:"cached_tokens,omitempty"`
+	// Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response.
+	ThoughtsTokens *int32 `json:"thoughts_tokens,omitempty"`
 }
 
 // NewTokenUsage instantiates a new TokenUsage object
@@ -43,6 +45,8 @@ func NewTokenUsage() *TokenUsage {
 	this.TotalTokens = &totalTokens
 	var cachedTokens int32 = 0
 	this.CachedTokens = &cachedTokens
+	var thoughtsTokens int32 = 0
+	this.ThoughtsTokens = &thoughtsTokens
 	return &this
 }
 
@@ -59,6 +63,8 @@ func NewTokenUsageWithDefaults() *TokenUsage {
 	this.TotalTokens = &totalTokens
 	var cachedTokens int32 = 0
 	this.CachedTokens = &cachedTokens
+	var thoughtsTokens int32 = 0
+	this.ThoughtsTokens = &thoughtsTokens
 	return &this
 }
 
@@ -190,6 +196,38 @@ func (o *TokenUsage) SetCachedTokens(v int32) {
 	o.CachedTokens = &v
 }
 
+// GetThoughtsTokens returns the ThoughtsTokens field value if set, zero value otherwise.
+func (o *TokenUsage) GetThoughtsTokens() int32 {
+	if o == nil || IsNil(o.ThoughtsTokens) {
+		var ret int32
+		return ret
+	}
+	return *o.ThoughtsTokens
+}
+
+// GetThoughtsTokensOk returns a tuple with the ThoughtsTokens field value if set, nil otherwise
+// and a boolean to check if the value has been set.
+func (o *TokenUsage) GetThoughtsTokensOk() (*int32, bool) {
+	if o == nil || IsNil(o.ThoughtsTokens) {
+		return nil, false
+	}
+	return o.ThoughtsTokens, true
+}
+
+// HasThoughtsTokens returns a boolean if a field has been set.
+func (o *TokenUsage) HasThoughtsTokens() bool {
+	if o != nil && !IsNil(o.ThoughtsTokens) {
+		return true
+	}
+
+	return false
+}
+
+// SetThoughtsTokens gets a reference to the given int32 and assigns it to the ThoughtsTokens field.
+func (o *TokenUsage) SetThoughtsTokens(v int32) {
+	o.ThoughtsTokens = &v
+}
+
 func (o TokenUsage) MarshalJSON() ([]byte, error) {
 	toSerialize,err := o.ToMap()
 	if err != nil {
@@ -212,6 +250,9 @@ func (o TokenUsage) ToMap() (map[string]interface{}, error) {
 	if !IsNil(o.CachedTokens) {
 		toSerialize["cached_tokens"] = o.CachedTokens
 	}
+	if !IsNil(o.ThoughtsTokens) {
+		toSerialize["thoughts_tokens"] = o.ThoughtsTokens
+	}
 	return toSerialize, nil
 }
 
diff --git a/hindsight-clients/python/hindsight_client_api/models/token_usage.py b/hindsight-clients/python/hindsight_client_api/models/token_usage.py
index 38a900b2b..d2e3c5656 100644
--- a/hindsight-clients/python/hindsight_client_api/models/token_usage.py
+++ b/hindsight-clients/python/hindsight_client_api/models/token_usage.py
@@ -27,10 +27,11 @@ class TokenUsage(BaseModel):
     Token usage metrics for LLM calls.  Tracks input/output tokens for a single request to enable per-request cost tracking and monitoring.
     """ # noqa: E501
     input_tokens: Optional[StrictInt] = Field(default=0, description="Number of input/prompt tokens consumed")
-    output_tokens: Optional[StrictInt] = Field(default=0, description="Number of output/completion tokens generated")
-    total_tokens: Optional[StrictInt] = Field(default=0, description="Total tokens (input + output)")
+    output_tokens: Optional[StrictInt] = Field(default=0, description="Number of visible output/completion tokens generated (excludes reasoning/thoughts)")
+    total_tokens: Optional[StrictInt] = Field(default=0, description="Total tokens (input + output, excludes thoughts)")
     cached_tokens: Optional[StrictInt] = Field(default=0, description="Cached/cache-read prompt tokens, when reported by the provider")
-    __properties: ClassVar[List[str]] = ["input_tokens", "output_tokens", "total_tokens", "cached_tokens"]
+    thoughts_tokens: Optional[StrictInt] = Field(default=0, description="Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response.")
+    __properties: ClassVar[List[str]] = ["input_tokens", "output_tokens", "total_tokens", "cached_tokens", "thoughts_tokens"]
 
     model_config = ConfigDict(
         populate_by_name=True,
@@ -86,7 +87,8 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]:
             "input_tokens": obj.get("input_tokens") if obj.get("input_tokens") is not None else 0,
             "output_tokens": obj.get("output_tokens") if obj.get("output_tokens") is not None else 0,
             "total_tokens": obj.get("total_tokens") if obj.get("total_tokens") is not None else 0,
-            "cached_tokens": obj.get("cached_tokens") if obj.get("cached_tokens") is not None else 0
+            "cached_tokens": obj.get("cached_tokens") if obj.get("cached_tokens") is not None else 0,
+            "thoughts_tokens": obj.get("thoughts_tokens") if obj.get("thoughts_tokens") is not None else 0
         })
         return _obj
 
diff --git a/hindsight-clients/typescript/generated/types.gen.ts b/hindsight-clients/typescript/generated/types.gen.ts
index 4f20fb290..5e7734cdf 100644
--- a/hindsight-clients/typescript/generated/types.gen.ts
+++ b/hindsight-clients/typescript/generated/types.gen.ts
@@ -3625,13 +3625,13 @@ export type TokenUsage = {
   /**
    * Output Tokens
    *
-   * Number of output/completion tokens generated
+   * Number of visible output/completion tokens generated (excludes reasoning/thoughts)
    */
   output_tokens?: number;
   /**
    * Total Tokens
    *
-   * Total tokens (input + output)
+   * Total tokens (input + output, excludes thoughts)
    */
   total_tokens?: number;
   /**
@@ -3640,6 +3640,12 @@ export type TokenUsage = {
    * Cached/cache-read prompt tokens, when reported by the provider
    */
   cached_tokens?: number;
+  /**
+   * Thoughts Tokens
+   *
+   * Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response.
+   */
+  thoughts_tokens?: number;
 };
 
 /**
diff --git a/hindsight-docs/static/openapi.json b/hindsight-docs/static/openapi.json
index 4b06c5c53..467e7e94e 100644
--- a/hindsight-docs/static/openapi.json
+++ b/hindsight-docs/static/openapi.json
@@ -12246,13 +12246,13 @@
           "output_tokens": {
             "type": "integer",
             "title": "Output Tokens",
-            "description": "Number of output/completion tokens generated",
+            "description": "Number of visible output/completion tokens generated (excludes reasoning/thoughts)",
             "default": 0
           },
           "total_tokens": {
             "type": "integer",
             "title": "Total Tokens",
-            "description": "Total tokens (input + output)",
+            "description": "Total tokens (input + output, excludes thoughts)",
             "default": 0
           },
           "cached_tokens": {
@@ -12260,6 +12260,12 @@
             "title": "Cached Tokens",
             "description": "Cached/cache-read prompt tokens, when reported by the provider",
             "default": 0
+          },
+          "thoughts_tokens": {
+            "type": "integer",
+            "title": "Thoughts Tokens",
+            "description": "Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response.",
+            "default": 0
           }
         },
         "type": "object",

From e240a7330d0fe32440f7a9f8888b5f18da0655d7 Mon Sep 17 00:00:00 2001
From: Chris Bartholomew <chris.bartholomew@vectorize.io>
Date: Mon, 22 Jun 2026 12:08:41 -0400
Subject: [PATCH 3/4] chore: regenerate
 skills/hindsight-docs/references/openapi.json

---
 skills/hindsight-docs/references/openapi.json | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/skills/hindsight-docs/references/openapi.json b/skills/hindsight-docs/references/openapi.json
index 4b06c5c53..467e7e94e 100644
--- a/skills/hindsight-docs/references/openapi.json
+++ b/skills/hindsight-docs/references/openapi.json
@@ -12246,13 +12246,13 @@
           "output_tokens": {
             "type": "integer",
             "title": "Output Tokens",
-            "description": "Number of output/completion tokens generated",
+            "description": "Number of visible output/completion tokens generated (excludes reasoning/thoughts)",
             "default": 0
           },
           "total_tokens": {
             "type": "integer",
             "title": "Total Tokens",
-            "description": "Total tokens (input + output)",
+            "description": "Total tokens (input + output, excludes thoughts)",
             "default": 0
           },
           "cached_tokens": {
@@ -12260,6 +12260,12 @@
             "title": "Cached Tokens",
             "description": "Cached/cache-read prompt tokens, when reported by the provider",
             "default": 0
+          },
+          "thoughts_tokens": {
+            "type": "integer",
+            "title": "Thoughts Tokens",
+            "description": "Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response.",
+            "default": 0
           }
         },
         "type": "object",

From 3a7c47cad62440a82bad03587e4fb93637c5807c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Boschi?= <boschi1997@gmail.com>
Date: Tue, 23 Jun 2026 13:05:17 +0200
Subject: [PATCH 4/4] fix(reflect): return StructuredOutputResult instead of
 widened tuple

_generate_structured_output's return contract had drifted: the success
and no-fields branches returned a 5-tuple while the except branch still
returned a 3-tuple. All six call sites unpack five values, so any
structured-output failure would crash reflect with a ValueError instead
of degrading gracefully.

Replace the multi-item tuple return with a typed StructuredOutputResult
(per project rule: no multi-item tuple returns), making the arity
mismatch impossible and the failure path safe. Add a regression test.
---
 .../hindsight_api/engine/reflect/agent.py     | 124 +++++++-----------
 .../hindsight_api/engine/reflect/models.py    |  12 ++
 .../tests/test_token_usage_cached_thoughts.py |  29 +++-
 3 files changed, 89 insertions(+), 76 deletions(-)

diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
index 1f03fdd10..8a0ecda5c 100644
--- a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
+++ b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
@@ -15,7 +15,7 @@
 from typing import TYPE_CHECKING, Any, Awaitable, Callable
 
 from ...config import get_config
-from .models import DirectiveInfo, LLMCall, ReflectAgentResult, TokenUsageSummary, ToolCall
+from .models import DirectiveInfo, LLMCall, ReflectAgentResult, StructuredOutputResult, TokenUsageSummary, ToolCall
 from .prompts import (
     _extract_directive_rules,
     build_final_prompt,
@@ -141,7 +141,7 @@ async def _generate_structured_output(
     response_schema: dict,
     llm_config: "LLMProvider",
     reflect_id: str,
-) -> tuple[dict[str, Any] | None, int, int, int, int]:
+) -> StructuredOutputResult:
     """Generate structured output from an answer using the provided JSON schema.
 
     Args:
@@ -151,8 +151,8 @@ async def _generate_structured_output(
         reflect_id: Reflect ID for logging
 
     Returns:
-        Tuple of (structured_output, input_tokens, output_tokens, cached_tokens, thoughts_tokens).
-        structured_output is None if generation fails.
+        A StructuredOutputResult carrying the structured output (None if
+        generation fails) and the call's token usage.
     """
     try:
         from typing import Any as TypingAny
@@ -186,7 +186,7 @@ def _json_schema_type_to_python(field_schema: dict) -> type:
 
         if not fields:
             logger.warning(f"[REFLECT {reflect_id}] No fields found in response_schema, skipping structured output")
-            return None, 0, 0, 0, 0
+            return StructuredOutputResult()
 
         DynamicModel = create_model("StructuredResponse", **fields)
 
@@ -259,17 +259,17 @@ def _json_schema_type_to_python(field_schema: dict) -> type:
                 logger.warning(f"[REFLECT {reflect_id}] Required field '{field_name}' is empty in structured output")
 
         logger.info(f"[REFLECT {reflect_id}] Generated structured output with {len(structured_output)} fields")
-        return (
-            structured_output,
-            usage.input_tokens,
-            usage.output_tokens,
-            getattr(usage, "cached_tokens", 0) or 0,
-            getattr(usage, "thoughts_tokens", 0) or 0,
+        return StructuredOutputResult(
+            structured_output=structured_output,
+            input_tokens=usage.input_tokens,
+            output_tokens=usage.output_tokens,
+            cached_tokens=usage.cached_tokens,
+            thoughts_tokens=usage.thoughts_tokens,
         )
 
     except Exception as e:
         logger.warning(f"[REFLECT {reflect_id}] Failed to generate structured output: {e}")
-        return None, 0, 0
+        return StructuredOutputResult()
 
 
 def _count_messages_tokens(messages: list[dict[str, Any]]) -> int:
@@ -554,17 +554,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                (
-                    structured_output,
-                    struct_in,
-                    struct_out,
-                    struct_cached,
-                    struct_thoughts,
-                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
-                total_input_tokens += struct_in
-                total_output_tokens += struct_out
-                total_cached_tokens += struct_cached
-                total_thoughts_tokens += struct_thoughts
+                struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
+                structured_output = struct.structured_output
+                total_input_tokens += struct.input_tokens
+                total_output_tokens += struct.output_tokens
+                total_cached_tokens += struct.cached_tokens
+                total_thoughts_tokens += struct.thoughts_tokens
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -623,17 +618,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
 
             structured_output = None
             if response_schema and answer:
-                (
-                    structured_output,
-                    struct_in,
-                    struct_out,
-                    struct_cached,
-                    struct_thoughts,
-                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
-                total_input_tokens += struct_in
-                total_output_tokens += struct_out
-                total_cached_tokens += struct_cached
-                total_thoughts_tokens += struct_thoughts
+                struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
+                structured_output = struct.structured_output
+                total_input_tokens += struct.input_tokens
+                total_output_tokens += struct.output_tokens
+                total_cached_tokens += struct.cached_tokens
+                total_thoughts_tokens += struct.thoughts_tokens
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -755,17 +745,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                (
-                    structured_output,
-                    struct_in,
-                    struct_out,
-                    struct_cached,
-                    struct_thoughts,
-                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
-                total_input_tokens += struct_in
-                total_output_tokens += struct_out
-                total_cached_tokens += struct_cached
-                total_thoughts_tokens += struct_thoughts
+                struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
+                structured_output = struct.structured_output
+                total_input_tokens += struct.input_tokens
+                total_output_tokens += struct.output_tokens
+                total_cached_tokens += struct.cached_tokens
+                total_thoughts_tokens += struct.thoughts_tokens
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -837,17 +822,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
                 # Generate structured output if schema provided
                 structured_output = None
                 if response_schema and answer:
-                    (
-                        structured_output,
-                        struct_in,
-                        struct_out,
-                        struct_cached,
-                        struct_thoughts,
-                    ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
-                    total_input_tokens += struct_in
-                    total_output_tokens += struct_out
-                    total_cached_tokens += struct_cached
-                    total_thoughts_tokens += struct_thoughts
+                    struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
+                    structured_output = struct.structured_output
+                    total_input_tokens += struct.input_tokens
+                    total_output_tokens += struct.output_tokens
+                    total_cached_tokens += struct.cached_tokens
+                    total_thoughts_tokens += struct.thoughts_tokens
 
                 _log_completion(answer, iteration + 1)
                 return ReflectAgentResult(
@@ -897,17 +877,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
             # Generate structured output if schema provided
             structured_output = None
             if response_schema and answer:
-                (
-                    structured_output,
-                    struct_in,
-                    struct_out,
-                    struct_cached,
-                    struct_thoughts,
-                ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
-                total_input_tokens += struct_in
-                total_output_tokens += struct_out
-                total_cached_tokens += struct_cached
-                total_thoughts_tokens += struct_thoughts
+                struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
+                structured_output = struct.structured_output
+                total_input_tokens += struct.input_tokens
+                total_output_tokens += struct.output_tokens
+                total_cached_tokens += struct.cached_tokens
+                total_thoughts_tokens += struct.thoughts_tokens
 
             _log_completion(answer, iteration + 1, forced=True)
             return ReflectAgentResult(
@@ -1202,16 +1177,15 @@ async def _process_done_tool(
     structured_output = None
     final_usage = usage
     if response_schema and llm_config and answer:
-        structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output(
-            answer, response_schema, llm_config, reflect_id
-        )
+        struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id)
+        structured_output = struct.structured_output
         # Add structured output tokens to usage
         final_usage = TokenUsageSummary(
-            input_tokens=usage.input_tokens + struct_in,
-            output_tokens=usage.output_tokens + struct_out,
-            total_tokens=usage.total_tokens + struct_in + struct_out,
-            cached_tokens=usage.cached_tokens + struct_cached,
-            thoughts_tokens=usage.thoughts_tokens + struct_thoughts,
+            input_tokens=usage.input_tokens + struct.input_tokens,
+            output_tokens=usage.output_tokens + struct.output_tokens,
+            total_tokens=usage.total_tokens + struct.input_tokens + struct.output_tokens,
+            cached_tokens=usage.cached_tokens + struct.cached_tokens,
+            thoughts_tokens=usage.thoughts_tokens + struct.thoughts_tokens,
         )
 
     log_completion(answer, iterations)
diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/models.py b/hindsight-api-slim/hindsight_api/engine/reflect/models.py
index 580442fea..cc54af4e8 100644
--- a/hindsight-api-slim/hindsight_api/engine/reflect/models.py
+++ b/hindsight-api-slim/hindsight_api/engine/reflect/models.py
@@ -94,6 +94,18 @@ class TokenUsageSummary(BaseModel):
     )
 
 
+class StructuredOutputResult(BaseModel):
+    """Result of structured-output generation, including token usage for the call."""
+
+    structured_output: dict[str, Any] | None = Field(
+        default=None, description="Generated structured output, or None if generation failed"
+    )
+    input_tokens: int = Field(default=0, description="Input tokens used")
+    output_tokens: int = Field(default=0, description="Visible output tokens used")
+    cached_tokens: int = Field(default=0, description="Cached prefix tokens. Subset of input_tokens.")
+    thoughts_tokens: int = Field(default=0, description="Reasoning/thinking tokens, when reported by the provider")
+
+
 class ReflectAgentResult(BaseModel):
     """Result from the reflect agent."""
 
diff --git a/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py b/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py
index 2b0ad3ac9..ec8d7cd83 100644
--- a/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py
+++ b/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py
@@ -16,7 +16,10 @@
 
 from __future__ import annotations
 
-from hindsight_api.engine.reflect.models import TokenUsageSummary
+import pytest
+
+from hindsight_api.engine.reflect.agent import _generate_structured_output
+from hindsight_api.engine.reflect.models import StructuredOutputResult, TokenUsageSummary
 from hindsight_api.engine.response_models import LLMToolCallResult, TokenUsage
 from hindsight_api.extensions.operation_validator import RetainResult
 
@@ -133,3 +136,27 @@ class _Ctx:
     )
     assert r2.llm_cached_input_tokens is None
     assert r2.llm_thoughts_tokens is None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "response_schema",
+    [{}, {"type": "object", "properties": {"summary": {"type": "string"}}}],
+    ids=["no_fields", "llm_failure"],
+)
+async def test_generate_structured_output_returns_result_model(response_schema):
+    """_generate_structured_output returns a StructuredOutputResult, not a tuple.
+
+    Regression guard: the tuple-based contract had drifted (the failure branch
+    returned 3 values while callers unpacked 5), crashing reflect with a
+    ValueError on any structured-output failure. The empty schema takes the
+    no-LLM-call branch; the populated one is expected to fail on llm_config=None
+    and exercise the except branch, which must degrade to an empty result.
+    """
+    result = await _generate_structured_output(
+        answer="anything", response_schema=response_schema, llm_config=None, reflect_id="test"
+    )
+    assert isinstance(result, StructuredOutputResult)
+    assert result.structured_output is None
+    assert (result.input_tokens, result.output_tokens) == (0, 0)
+    assert (result.cached_tokens, result.thoughts_tokens) == (0, 0)