From e9e89e333253c239faf49eabdd087a6945627761 Mon Sep 17 00:00:00 2001 From: Chris Bartholomew Date: Mon, 22 Jun 2026 11:21:31 -0400 Subject: [PATCH 1/4] feat(tokens): propagate cached + thoughts tokens through return contexts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Gemini 2.5+ family (and any future provider that combines prompt caching with reasoning tokens) reports four distinct token counts on every response: - prompt_token_count (total input) - candidates_token_count (visible output) - cached_content_token_count (subset of input served from prompt cache) - thoughts_token_count (reasoning tokens, billed at output rate) The provider already records the last two on the Prometheus ``hindsight.llm.tokens.{cached_input,thoughts}`` counters, but the values stop at the metrics layer — every return context (TokenUsage, LLMToolCallResult, TokenUsageSummary, RetainResult) only exposes the top-level input/output split. As a result: * a downstream metering extension can't attribute prompt-cache hit-rate per operation (only globally via Prometheus aggregates), and * reasoning-token spend is invisible to ``output_tokens`` because the provider keeps it out of candidates_token_count. A workload that "looks cheap" by visible output can be silently expensive if the model is doing long reasoning chains. This change threads the two fields through end-to-end: - ``TokenUsage`` gains ``thoughts_tokens`` (cached_tokens already existed); ``__add__`` sums it so multi-iteration agentic-loop aggregation works. - ``LLMToolCallResult`` gains ``cached_tokens`` + ``thoughts_tokens``. - ``TokenUsageSummary`` (returned by ``run_reflect_agent``) gains both fields and ``run_reflect_agent`` accumulates them at every call site (main tool loop + structured-output extraction + 4 edge-case completion branches). - ``_generate_structured_output`` now returns a 5-tuple ``(output, in, out, cached, thoughts)``; the 6 unpack sites in the reflect agent are updated together. - ``RetainResult`` gains optional ``llm_cached_input_tokens`` and ``llm_thoughts_tokens`` fields; ``memory_engine`` populates them from the aggregated ``TokenUsage``. Defaults stay ``None`` for engines that don't surface the data so existing metering extensions are unaffected. - The Gemini provider — which was already reading the four token counts from the SDK response — now returns ``thoughts_tokens`` on both the ``call`` and ``call_with_tools`` paths, and the existing ``cached_input_tokens`` value reaches ``LLMToolCallResult``. Backward compatibility: every new field defaults to 0 (or None for the RetainResult dataclass), so any caller built before this change keeps working. Provider impls that don't surface these counts simply propagate zeros — the structured Prometheus counters were already optional in ``record_llm_call``. Adds focused tests (``test_token_usage_cached_thoughts.py``, 6 cases) pinning the propagation through every return type and the aggregation behavior. Existing reflect-agent + Gemini provider tests (87 cases) pass unchanged. This is a pure plumbing change — no metrics are renamed, no behavior is gated, no flags are added. --- .../hindsight_api/engine/memory_engine.py | 2 + .../engine/providers/gemini_llm.py | 3 + .../hindsight_api/engine/reflect/agent.py | 59 ++++++-- .../hindsight_api/engine/reflect/models.py | 19 ++- .../hindsight_api/engine/response_models.py | 30 +++- .../extensions/operation_validator.py | 10 ++ .../tests/test_token_usage_cached_thoughts.py | 135 ++++++++++++++++++ 7 files changed, 240 insertions(+), 18 deletions(-) create mode 100644 hindsight-api-slim/tests/test_token_usage_cached_thoughts.py diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py index 6212877f3..96427771c 100644 --- a/hindsight-api-slim/hindsight_api/engine/memory_engine.py +++ b/hindsight-api-slim/hindsight_api/engine/memory_engine.py @@ -3442,6 +3442,8 @@ async def retain_batch_async( llm_input_tokens=total_usage.input_tokens, llm_output_tokens=total_usage.output_tokens, llm_total_tokens=total_usage.total_tokens, + llm_cached_input_tokens=getattr(total_usage, "cached_tokens", 0) or 0, + llm_thoughts_tokens=getattr(total_usage, "thoughts_tokens", 0) or 0, processed_content_tokens=total_processed_content_tokens, ) try: diff --git a/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py b/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py index 9c2c5cd3a..9d94fa741 100644 --- a/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py +++ b/hindsight-api-slim/hindsight_api/engine/providers/gemini_llm.py @@ -424,6 +424,7 @@ def _build_generation_config(use_cache: bool) -> "genai_types.GenerateContentCon output_tokens=output_tokens, total_tokens=input_tokens + output_tokens, cached_tokens=cached_tokens, + thoughts_tokens=thoughts_tokens, ) return result, token_usage return result @@ -762,6 +763,8 @@ def _build_tools_config(use_cache: bool) -> "genai_types.GenerateContentConfig": finish_reason=finish_reason, input_tokens=input_tokens, output_tokens=output_tokens, + cached_tokens=cached_input_tokens, + thoughts_tokens=thoughts_tokens, ) except genai_errors.APIError as e: diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py index 04ea2427b..b8fca3f2b 100644 --- a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py +++ b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py @@ -141,7 +141,7 @@ async def _generate_structured_output( response_schema: dict, llm_config: "LLMProvider", reflect_id: str, -) -> tuple[dict[str, Any] | None, int, int]: +) -> tuple[dict[str, Any] | None, int, int, int, int]: """Generate structured output from an answer using the provided JSON schema. Args: @@ -151,7 +151,7 @@ async def _generate_structured_output( reflect_id: Reflect ID for logging Returns: - Tuple of (structured_output, input_tokens, output_tokens). + Tuple of (structured_output, input_tokens, output_tokens, cached_tokens, thoughts_tokens). structured_output is None if generation fails. """ try: @@ -186,7 +186,7 @@ def _json_schema_type_to_python(field_schema: dict) -> type: if not fields: logger.warning(f"[REFLECT {reflect_id}] No fields found in response_schema, skipping structured output") - return None, 0, 0 + return None, 0, 0, 0, 0 DynamicModel = create_model("StructuredResponse", **fields) @@ -259,7 +259,13 @@ def _json_schema_type_to_python(field_schema: dict) -> type: logger.warning(f"[REFLECT {reflect_id}] Required field '{field_name}' is empty in structured output") logger.info(f"[REFLECT {reflect_id}] Generated structured output with {len(structured_output)} fields") - return structured_output, usage.input_tokens, usage.output_tokens + return ( + structured_output, + usage.input_tokens, + usage.output_tokens, + getattr(usage, "cached_tokens", 0) or 0, + getattr(usage, "thoughts_tokens", 0) or 0, + ) except Exception as e: logger.warning(f"[REFLECT {reflect_id}] Failed to generate structured output: {e}") @@ -435,9 +441,14 @@ async def run_reflect_agent( llm_trace: list[dict[str, Any]] = [] context_history: list[dict[str, Any]] = [] # For final prompt fallback - # Token usage tracking - accumulate across all LLM calls + # Token usage tracking - accumulate across all LLM calls. + # cached_tokens and thoughts_tokens are surfaced for cost attribution + # and prompt-cache tuning. Both are subsets of (or parallel to) the + # input/output counts and are NOT double-counted in total_tokens. total_input_tokens = 0 total_output_tokens = 0 + total_cached_tokens = 0 + total_thoughts_tokens = 0 # Track available IDs for validation (prevents hallucinated citations) available_memory_ids: set[str] = set() @@ -460,6 +471,8 @@ def _get_usage() -> TokenUsageSummary: input_tokens=total_input_tokens, output_tokens=total_output_tokens, total_tokens=total_input_tokens + total_output_tokens, + cached_tokens=total_cached_tokens, + thoughts_tokens=total_thoughts_tokens, ) def _log_completion(answer: str, iterations: int, forced: bool = False): @@ -526,6 +539,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): llm_duration = int((time.time() - llm_start) * 1000) total_input_tokens += usage.input_tokens total_output_tokens += usage.output_tokens + total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0 + total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0 llm_trace.append( { "scope": "final", @@ -539,11 +554,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out = await _generate_structured_output( + structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( answer, response_schema, llm_config, reflect_id ) total_input_tokens += struct_in total_output_tokens += struct_out + total_cached_tokens += struct_cached + total_thoughts_tokens += struct_thoughts _log_completion(answer, iteration + 1, forced=True) return ReflectAgentResult( @@ -588,6 +605,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): llm_duration = int((time.time() - llm_start) * 1000) total_input_tokens += usage.input_tokens total_output_tokens += usage.output_tokens + total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0 + total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0 llm_trace.append( { "scope": "final", @@ -600,11 +619,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out = await _generate_structured_output( + structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( answer, response_schema, llm_config, reflect_id ) total_input_tokens += struct_in total_output_tokens += struct_out + total_cached_tokens += struct_cached + total_thoughts_tokens += struct_thoughts _log_completion(answer, iteration + 1, forced=True) return ReflectAgentResult( @@ -661,6 +682,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): consecutive_errors = 0 total_input_tokens += result.input_tokens total_output_tokens += result.output_tokens + total_cached_tokens += getattr(result, "cached_tokens", 0) or 0 + total_thoughts_tokens += getattr(result, "thoughts_tokens", 0) or 0 llm_trace.append( { "scope": f"agent_{iteration + 1}", @@ -709,6 +732,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): llm_duration = int((time.time() - llm_start) * 1000) total_input_tokens += usage.input_tokens total_output_tokens += usage.output_tokens + total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0 + total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0 llm_trace.append( { "scope": "final", @@ -722,11 +747,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out = await _generate_structured_output( + structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( answer, response_schema, llm_config, reflect_id ) total_input_tokens += struct_in total_output_tokens += struct_out + total_cached_tokens += struct_cached + total_thoughts_tokens += struct_thoughts _log_completion(answer, iteration + 1, forced=True) return ReflectAgentResult( @@ -783,6 +810,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): ) total_input_tokens += rewrite_usage.input_tokens total_output_tokens += rewrite_usage.output_tokens + total_cached_tokens += getattr(rewrite_usage, "cached_tokens", 0) or 0 + total_thoughts_tokens += getattr(rewrite_usage, "thoughts_tokens", 0) or 0 llm_trace.append( { "scope": "final_rewrite", @@ -796,11 +825,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out = await _generate_structured_output( + structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( answer, response_schema, llm_config, reflect_id ) total_input_tokens += struct_in total_output_tokens += struct_out + total_cached_tokens += struct_cached + total_thoughts_tokens += struct_thoughts _log_completion(answer, iteration + 1) return ReflectAgentResult( @@ -835,6 +866,8 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): llm_duration = int((time.time() - llm_start) * 1000) total_input_tokens += usage.input_tokens total_output_tokens += usage.output_tokens + total_cached_tokens += getattr(usage, "cached_tokens", 0) or 0 + total_thoughts_tokens += getattr(usage, "thoughts_tokens", 0) or 0 llm_trace.append( { "scope": "final", @@ -848,11 +881,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out = await _generate_structured_output( + structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( answer, response_schema, llm_config, reflect_id ) total_input_tokens += struct_in total_output_tokens += struct_out + total_cached_tokens += struct_cached + total_thoughts_tokens += struct_thoughts _log_completion(answer, iteration + 1, forced=True) return ReflectAgentResult( @@ -1147,7 +1182,7 @@ async def _process_done_tool( structured_output = None final_usage = usage if response_schema and llm_config and answer: - structured_output, struct_in, struct_out = await _generate_structured_output( + structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( answer, response_schema, llm_config, reflect_id ) # Add structured output tokens to usage @@ -1155,6 +1190,8 @@ async def _process_done_tool( input_tokens=usage.input_tokens + struct_in, output_tokens=usage.output_tokens + struct_out, total_tokens=usage.total_tokens + struct_in + struct_out, + cached_tokens=usage.cached_tokens + struct_cached, + thoughts_tokens=usage.thoughts_tokens + struct_thoughts, ) log_completion(answer, iterations) diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/models.py b/hindsight-api-slim/hindsight_api/engine/reflect/models.py index 26c3f150a..d0e44f831 100644 --- a/hindsight-api-slim/hindsight_api/engine/reflect/models.py +++ b/hindsight-api-slim/hindsight_api/engine/reflect/models.py @@ -78,9 +78,22 @@ class DirectiveInfo(BaseModel): class TokenUsageSummary(BaseModel): """Total token usage across all LLM calls.""" - input_tokens: int = Field(default=0, description="Total input tokens used") - output_tokens: int = Field(default=0, description="Total output tokens used") - total_tokens: int = Field(default=0, description="Total tokens (input + output)") + input_tokens: int = Field(default=0, description="Total input tokens used (includes any cached prefix tokens)") + output_tokens: int = Field( + default=0, description="Total visible output tokens used (excludes reasoning/thoughts)" + ) + total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)") + cached_tokens: int = Field( + default=0, + description="Cached/cache-read prompt tokens summed across calls. Subset of input_tokens.", + ) + thoughts_tokens: int = Field( + default=0, + description=( + "Reasoning/thinking tokens summed across calls. Billed at the output rate by some providers " + "but not part of visible output." + ), + ) class ReflectAgentResult(BaseModel): diff --git a/hindsight-api-slim/hindsight_api/engine/response_models.py b/hindsight-api-slim/hindsight_api/engine/response_models.py index ffa92c8d5..16c92d99e 100644 --- a/hindsight-api-slim/hindsight_api/engine/response_models.py +++ b/hindsight-api-slim/hindsight_api/engine/response_models.py @@ -31,8 +31,20 @@ class LLMToolCallResult(BaseModel): content: str | None = Field(default=None, description="Text content if any") tool_calls: list[LLMToolCall] = Field(default_factory=list, description="Tool calls requested by the LLM") finish_reason: str | None = Field(default=None, description="Reason the LLM stopped: 'stop', 'tool_calls', etc.") - input_tokens: int = Field(default=0, description="Input tokens used in this call") - output_tokens: int = Field(default=0, description="Output tokens used in this call") + input_tokens: int = Field( + default=0, + description="Input tokens used in this call (includes any cached prefix tokens reported by the provider)", + ) + output_tokens: int = Field( + default=0, description="Visible output tokens used in this call (excludes reasoning/thoughts)" + ) + cached_tokens: int = Field( + default=0, description="Cached prefix tokens, when reported by the provider. Subset of input_tokens." + ) + thoughts_tokens: int = Field( + default=0, + description="Reasoning/thinking tokens. Billed at the output rate by some providers but not part of visible output.", + ) class ToolCallTrace(BaseModel): @@ -91,9 +103,18 @@ class TokenUsage(BaseModel): ) input_tokens: int = Field(default=0, description="Number of input/prompt tokens consumed") - output_tokens: int = Field(default=0, description="Number of output/completion tokens generated") - total_tokens: int = Field(default=0, description="Total tokens (input + output)") + output_tokens: int = Field( + default=0, description="Number of visible output/completion tokens generated (excludes reasoning/thoughts)" + ) + total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)") cached_tokens: int = Field(default=0, description="Cached/cache-read prompt tokens, when reported by the provider") + thoughts_tokens: int = Field( + default=0, + description=( + "Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers " + "(e.g. Gemini 2.5+ family) but not surfaced in the visible response." + ), + ) def __add__(self, other: "TokenUsage") -> "TokenUsage": """Allow aggregating token usage from multiple calls.""" @@ -102,6 +123,7 @@ def __add__(self, other: "TokenUsage") -> "TokenUsage": output_tokens=self.output_tokens + other.output_tokens, total_tokens=self.total_tokens + other.total_tokens, cached_tokens=self.cached_tokens + other.cached_tokens, + thoughts_tokens=self.thoughts_tokens + other.thoughts_tokens, ) diff --git a/hindsight-api-slim/hindsight_api/extensions/operation_validator.py b/hindsight-api-slim/hindsight_api/extensions/operation_validator.py index f28491759..6c6394add 100644 --- a/hindsight-api-slim/hindsight_api/extensions/operation_validator.py +++ b/hindsight-api-slim/hindsight_api/extensions/operation_validator.py @@ -208,6 +208,16 @@ class RetainResult: llm_input_tokens: int | None = None llm_output_tokens: int | None = None llm_total_tokens: int | None = None + # Diagnostic token splits surfaced for cost attribution and prompt-cache + # tuning. ``llm_cached_input_tokens`` is the subset of llm_input_tokens + # served from the provider's prompt cache (e.g. Gemini's + # cached_content_token_count). ``llm_thoughts_tokens`` is reasoning tokens + # that are billed at the output rate by some providers (Gemini 2.5+) but + # are not part of the visible response. Both default to None when the + # engine/provider didn't report them; downstream metering extensions + # should treat None as 0. + llm_cached_input_tokens: int | None = None + llm_thoughts_tokens: int | None = None # Content tokens the retain pipeline actually processed, after # chunk-level content-hash deduplication. Semantics: # None — no dedup signal available (e.g. a first-time retain or a diff --git a/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py b/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py new file mode 100644 index 000000000..2b0ad3ac9 --- /dev/null +++ b/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py @@ -0,0 +1,135 @@ +"""Tests for cached / thoughts token propagation through TokenUsage, +LLMToolCallResult, TokenUsageSummary, and RetainResult. + +The Gemini 2.5+ family (and any future provider with prompt caching + +reasoning tokens) reports four distinct token counts on every response: +prompt, candidates (visible output), cached_content, and thoughts. The +last two are billed separately by the provider but were previously not +threaded through to downstream return contexts, so application-layer +metering had no way to attribute prompt-cache hit rate or reasoning cost +per operation. + +These tests pin the propagation: when a provider populates cached or +thoughts on the way out, every accumulator and aggregate type carries +the value through unchanged. +""" + +from __future__ import annotations + +from hindsight_api.engine.reflect.models import TokenUsageSummary +from hindsight_api.engine.response_models import LLMToolCallResult, TokenUsage +from hindsight_api.extensions.operation_validator import RetainResult + + +def test_token_usage_carries_cached_and_thoughts(): + """TokenUsage defaults both new fields to 0 and accepts non-zero values.""" + u = TokenUsage(input_tokens=1500, output_tokens=500, total_tokens=2000) + assert u.cached_tokens == 0 + assert u.thoughts_tokens == 0 + + u = TokenUsage( + input_tokens=1500, + output_tokens=500, + total_tokens=2000, + cached_tokens=200, + thoughts_tokens=80, + ) + assert u.cached_tokens == 200 + assert u.thoughts_tokens == 80 + + +def test_token_usage_aggregates_thoughts_tokens(): + """TokenUsage.__add__ sums thoughts_tokens alongside the existing fields. + + Multi-iteration agentic loops accumulate per-call usage via ``+``. If + thoughts_tokens isn't summed, the per-op total undercounts reasoning + spend by a factor of N (the number of LLM sub-calls). + """ + a = TokenUsage(input_tokens=10, output_tokens=5, total_tokens=15, cached_tokens=2, thoughts_tokens=7) + b = TokenUsage(input_tokens=20, output_tokens=8, total_tokens=28, cached_tokens=3, thoughts_tokens=11) + c = a + b + assert c.input_tokens == 30 + assert c.output_tokens == 13 + assert c.total_tokens == 43 + assert c.cached_tokens == 5 + assert c.thoughts_tokens == 18 + + +def test_llm_tool_call_result_carries_cached_and_thoughts(): + """call_with_tools returns LLMToolCallResult — both new fields default to 0 + and accept non-zero values from the provider.""" + r = LLMToolCallResult(content="ok", input_tokens=1234, output_tokens=56) + assert r.cached_tokens == 0 + assert r.thoughts_tokens == 0 + + r = LLMToolCallResult( + content="ok", + input_tokens=1234, + output_tokens=56, + cached_tokens=200, + thoughts_tokens=78, + ) + assert r.cached_tokens == 200 + assert r.thoughts_tokens == 78 + + +def test_token_usage_summary_carries_cached_and_thoughts(): + """TokenUsageSummary is what reflect agent returns to its caller — needs + to propagate the aggregate so per-op cost attribution works.""" + s = TokenUsageSummary( + input_tokens=10000, + output_tokens=200, + total_tokens=10200, + cached_tokens=3000, + thoughts_tokens=150, + ) + assert s.cached_tokens == 3000 + assert s.thoughts_tokens == 150 + + +def test_token_usage_summary_defaults_cached_and_thoughts_to_zero(): + """Defaults preserve backward compatibility for callers built before the + fields existed.""" + s = TokenUsageSummary(input_tokens=100, output_tokens=50, total_tokens=150) + assert s.cached_tokens == 0 + assert s.thoughts_tokens == 0 + + +def test_retain_result_carries_cached_input_and_thoughts(): + """RetainResult is the contract between the engine and any metering + extension. The two new fields are optional (None) so older extensions + that don't read them are unaffected; engines that DO populate them get + end-to-end attribution into the metering hook.""" + + class _Ctx: + pass + + r = RetainResult( + bank_id="b", + contents=[], + request_context=_Ctx(), + document_id=None, + fact_type_override=None, + unit_ids=[], + llm_input_tokens=1000, + llm_output_tokens=50, + llm_total_tokens=1050, + llm_cached_input_tokens=300, + llm_thoughts_tokens=25, + ) + assert r.llm_cached_input_tokens == 300 + assert r.llm_thoughts_tokens == 25 + + # Defaults stay None for engines that don't surface the data, so + # downstream extensions can use ``or 0`` without breaking on a + # core-only build. + r2 = RetainResult( + bank_id="b", + contents=[], + request_context=_Ctx(), + document_id=None, + fact_type_override=None, + unit_ids=[], + ) + assert r2.llm_cached_input_tokens is None + assert r2.llm_thoughts_tokens is None From bf49f794895e41ccdd2c0a8bba77e681be4ebd06 Mon Sep 17 00:00:00 2001 From: Chris Bartholomew Date: Mon, 22 Jun 2026 12:00:28 -0400 Subject: [PATCH 2/4] chore: regenerate clients + openapi spec for thoughts_tokens field Picks up the new TokenUsage.thoughts_tokens field added in the parent commit. Generated by: ./scripts/generate-openapi.sh ./scripts/generate-clients.sh Plus ``ruff format`` over the two reflect/ source files to match the project's enforced formatting style. No hand edits in any generated file. --- .../hindsight_api/engine/reflect/agent.py | 50 +++++++++++++------ .../hindsight_api/engine/reflect/models.py | 4 +- hindsight-clients/go/api/openapi.yaml | 12 ++++- hindsight-clients/go/model_token_usage.go | 45 ++++++++++++++++- .../models/token_usage.py | 10 ++-- .../typescript/generated/types.gen.ts | 10 +++- hindsight-docs/static/openapi.json | 10 +++- 7 files changed, 111 insertions(+), 30 deletions(-) diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py index b8fca3f2b..1f03fdd10 100644 --- a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py +++ b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py @@ -554,9 +554,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( - answer, response_schema, llm_config, reflect_id - ) + ( + structured_output, + struct_in, + struct_out, + struct_cached, + struct_thoughts, + ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) total_input_tokens += struct_in total_output_tokens += struct_out total_cached_tokens += struct_cached @@ -619,9 +623,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( - answer, response_schema, llm_config, reflect_id - ) + ( + structured_output, + struct_in, + struct_out, + struct_cached, + struct_thoughts, + ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) total_input_tokens += struct_in total_output_tokens += struct_out total_cached_tokens += struct_cached @@ -747,9 +755,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( - answer, response_schema, llm_config, reflect_id - ) + ( + structured_output, + struct_in, + struct_out, + struct_cached, + struct_thoughts, + ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) total_input_tokens += struct_in total_output_tokens += struct_out total_cached_tokens += struct_cached @@ -825,9 +837,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( - answer, response_schema, llm_config, reflect_id - ) + ( + structured_output, + struct_in, + struct_out, + struct_cached, + struct_thoughts, + ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) total_input_tokens += struct_in total_output_tokens += struct_out total_cached_tokens += struct_cached @@ -881,9 +897,13 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( - answer, response_schema, llm_config, reflect_id - ) + ( + structured_output, + struct_in, + struct_out, + struct_cached, + struct_thoughts, + ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) total_input_tokens += struct_in total_output_tokens += struct_out total_cached_tokens += struct_cached diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/models.py b/hindsight-api-slim/hindsight_api/engine/reflect/models.py index d0e44f831..580442fea 100644 --- a/hindsight-api-slim/hindsight_api/engine/reflect/models.py +++ b/hindsight-api-slim/hindsight_api/engine/reflect/models.py @@ -79,9 +79,7 @@ class TokenUsageSummary(BaseModel): """Total token usage across all LLM calls.""" input_tokens: int = Field(default=0, description="Total input tokens used (includes any cached prefix tokens)") - output_tokens: int = Field( - default=0, description="Total visible output tokens used (excludes reasoning/thoughts)" - ) + output_tokens: int = Field(default=0, description="Total visible output tokens used (excludes reasoning/thoughts)") total_tokens: int = Field(default=0, description="Total tokens (input + output, excludes thoughts)") cached_tokens: int = Field( default=0, diff --git a/hindsight-clients/go/api/openapi.yaml b/hindsight-clients/go/api/openapi.yaml index 2e8b9d5f2..df793aa40 100644 --- a/hindsight-clients/go/api/openapi.yaml +++ b/hindsight-clients/go/api/openapi.yaml @@ -7843,12 +7843,13 @@ components: type: integer output_tokens: default: 0 - description: Number of output/completion tokens generated + description: Number of visible output/completion tokens generated (excludes + reasoning/thoughts) title: Output Tokens type: integer total_tokens: default: 0 - description: Total tokens (input + output) + description: "Total tokens (input + output, excludes thoughts)" title: Total Tokens type: integer cached_tokens: @@ -7856,6 +7857,13 @@ components: description: "Cached/cache-read prompt tokens, when reported by the provider" title: Cached Tokens type: integer + thoughts_tokens: + default: 0 + description: Reasoning/thinking tokens generated by the model. Billed at + the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced + in the visible response. + title: Thoughts Tokens + type: integer title: TokenUsage ToolCallsIncludeOptions: description: Options for including tool calls in reflect results. diff --git a/hindsight-clients/go/model_token_usage.go b/hindsight-clients/go/model_token_usage.go index b00f446fb..8030d10f7 100644 --- a/hindsight-clients/go/model_token_usage.go +++ b/hindsight-clients/go/model_token_usage.go @@ -21,12 +21,14 @@ var _ MappedNullable = &TokenUsage{} type TokenUsage struct { // Number of input/prompt tokens consumed InputTokens *int32 `json:"input_tokens,omitempty"` - // Number of output/completion tokens generated + // Number of visible output/completion tokens generated (excludes reasoning/thoughts) OutputTokens *int32 `json:"output_tokens,omitempty"` - // Total tokens (input + output) + // Total tokens (input + output, excludes thoughts) TotalTokens *int32 `json:"total_tokens,omitempty"` // Cached/cache-read prompt tokens, when reported by the provider CachedTokens *int32 `json:"cached_tokens,omitempty"` + // Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response. + ThoughtsTokens *int32 `json:"thoughts_tokens,omitempty"` } // NewTokenUsage instantiates a new TokenUsage object @@ -43,6 +45,8 @@ func NewTokenUsage() *TokenUsage { this.TotalTokens = &totalTokens var cachedTokens int32 = 0 this.CachedTokens = &cachedTokens + var thoughtsTokens int32 = 0 + this.ThoughtsTokens = &thoughtsTokens return &this } @@ -59,6 +63,8 @@ func NewTokenUsageWithDefaults() *TokenUsage { this.TotalTokens = &totalTokens var cachedTokens int32 = 0 this.CachedTokens = &cachedTokens + var thoughtsTokens int32 = 0 + this.ThoughtsTokens = &thoughtsTokens return &this } @@ -190,6 +196,38 @@ func (o *TokenUsage) SetCachedTokens(v int32) { o.CachedTokens = &v } +// GetThoughtsTokens returns the ThoughtsTokens field value if set, zero value otherwise. +func (o *TokenUsage) GetThoughtsTokens() int32 { + if o == nil || IsNil(o.ThoughtsTokens) { + var ret int32 + return ret + } + return *o.ThoughtsTokens +} + +// GetThoughtsTokensOk returns a tuple with the ThoughtsTokens field value if set, nil otherwise +// and a boolean to check if the value has been set. +func (o *TokenUsage) GetThoughtsTokensOk() (*int32, bool) { + if o == nil || IsNil(o.ThoughtsTokens) { + return nil, false + } + return o.ThoughtsTokens, true +} + +// HasThoughtsTokens returns a boolean if a field has been set. +func (o *TokenUsage) HasThoughtsTokens() bool { + if o != nil && !IsNil(o.ThoughtsTokens) { + return true + } + + return false +} + +// SetThoughtsTokens gets a reference to the given int32 and assigns it to the ThoughtsTokens field. +func (o *TokenUsage) SetThoughtsTokens(v int32) { + o.ThoughtsTokens = &v +} + func (o TokenUsage) MarshalJSON() ([]byte, error) { toSerialize,err := o.ToMap() if err != nil { @@ -212,6 +250,9 @@ func (o TokenUsage) ToMap() (map[string]interface{}, error) { if !IsNil(o.CachedTokens) { toSerialize["cached_tokens"] = o.CachedTokens } + if !IsNil(o.ThoughtsTokens) { + toSerialize["thoughts_tokens"] = o.ThoughtsTokens + } return toSerialize, nil } diff --git a/hindsight-clients/python/hindsight_client_api/models/token_usage.py b/hindsight-clients/python/hindsight_client_api/models/token_usage.py index 38a900b2b..d2e3c5656 100644 --- a/hindsight-clients/python/hindsight_client_api/models/token_usage.py +++ b/hindsight-clients/python/hindsight_client_api/models/token_usage.py @@ -27,10 +27,11 @@ class TokenUsage(BaseModel): Token usage metrics for LLM calls. Tracks input/output tokens for a single request to enable per-request cost tracking and monitoring. """ # noqa: E501 input_tokens: Optional[StrictInt] = Field(default=0, description="Number of input/prompt tokens consumed") - output_tokens: Optional[StrictInt] = Field(default=0, description="Number of output/completion tokens generated") - total_tokens: Optional[StrictInt] = Field(default=0, description="Total tokens (input + output)") + output_tokens: Optional[StrictInt] = Field(default=0, description="Number of visible output/completion tokens generated (excludes reasoning/thoughts)") + total_tokens: Optional[StrictInt] = Field(default=0, description="Total tokens (input + output, excludes thoughts)") cached_tokens: Optional[StrictInt] = Field(default=0, description="Cached/cache-read prompt tokens, when reported by the provider") - __properties: ClassVar[List[str]] = ["input_tokens", "output_tokens", "total_tokens", "cached_tokens"] + thoughts_tokens: Optional[StrictInt] = Field(default=0, description="Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response.") + __properties: ClassVar[List[str]] = ["input_tokens", "output_tokens", "total_tokens", "cached_tokens", "thoughts_tokens"] model_config = ConfigDict( populate_by_name=True, @@ -86,7 +87,8 @@ def from_dict(cls, obj: Optional[Dict[str, Any]]) -> Optional[Self]: "input_tokens": obj.get("input_tokens") if obj.get("input_tokens") is not None else 0, "output_tokens": obj.get("output_tokens") if obj.get("output_tokens") is not None else 0, "total_tokens": obj.get("total_tokens") if obj.get("total_tokens") is not None else 0, - "cached_tokens": obj.get("cached_tokens") if obj.get("cached_tokens") is not None else 0 + "cached_tokens": obj.get("cached_tokens") if obj.get("cached_tokens") is not None else 0, + "thoughts_tokens": obj.get("thoughts_tokens") if obj.get("thoughts_tokens") is not None else 0 }) return _obj diff --git a/hindsight-clients/typescript/generated/types.gen.ts b/hindsight-clients/typescript/generated/types.gen.ts index 4f20fb290..5e7734cdf 100644 --- a/hindsight-clients/typescript/generated/types.gen.ts +++ b/hindsight-clients/typescript/generated/types.gen.ts @@ -3625,13 +3625,13 @@ export type TokenUsage = { /** * Output Tokens * - * Number of output/completion tokens generated + * Number of visible output/completion tokens generated (excludes reasoning/thoughts) */ output_tokens?: number; /** * Total Tokens * - * Total tokens (input + output) + * Total tokens (input + output, excludes thoughts) */ total_tokens?: number; /** @@ -3640,6 +3640,12 @@ export type TokenUsage = { * Cached/cache-read prompt tokens, when reported by the provider */ cached_tokens?: number; + /** + * Thoughts Tokens + * + * Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response. + */ + thoughts_tokens?: number; }; /** diff --git a/hindsight-docs/static/openapi.json b/hindsight-docs/static/openapi.json index 4b06c5c53..467e7e94e 100644 --- a/hindsight-docs/static/openapi.json +++ b/hindsight-docs/static/openapi.json @@ -12246,13 +12246,13 @@ "output_tokens": { "type": "integer", "title": "Output Tokens", - "description": "Number of output/completion tokens generated", + "description": "Number of visible output/completion tokens generated (excludes reasoning/thoughts)", "default": 0 }, "total_tokens": { "type": "integer", "title": "Total Tokens", - "description": "Total tokens (input + output)", + "description": "Total tokens (input + output, excludes thoughts)", "default": 0 }, "cached_tokens": { @@ -12260,6 +12260,12 @@ "title": "Cached Tokens", "description": "Cached/cache-read prompt tokens, when reported by the provider", "default": 0 + }, + "thoughts_tokens": { + "type": "integer", + "title": "Thoughts Tokens", + "description": "Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response.", + "default": 0 } }, "type": "object", From e240a7330d0fe32440f7a9f8888b5f18da0655d7 Mon Sep 17 00:00:00 2001 From: Chris Bartholomew Date: Mon, 22 Jun 2026 12:08:41 -0400 Subject: [PATCH 3/4] chore: regenerate skills/hindsight-docs/references/openapi.json --- skills/hindsight-docs/references/openapi.json | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/skills/hindsight-docs/references/openapi.json b/skills/hindsight-docs/references/openapi.json index 4b06c5c53..467e7e94e 100644 --- a/skills/hindsight-docs/references/openapi.json +++ b/skills/hindsight-docs/references/openapi.json @@ -12246,13 +12246,13 @@ "output_tokens": { "type": "integer", "title": "Output Tokens", - "description": "Number of output/completion tokens generated", + "description": "Number of visible output/completion tokens generated (excludes reasoning/thoughts)", "default": 0 }, "total_tokens": { "type": "integer", "title": "Total Tokens", - "description": "Total tokens (input + output)", + "description": "Total tokens (input + output, excludes thoughts)", "default": 0 }, "cached_tokens": { @@ -12260,6 +12260,12 @@ "title": "Cached Tokens", "description": "Cached/cache-read prompt tokens, when reported by the provider", "default": 0 + }, + "thoughts_tokens": { + "type": "integer", + "title": "Thoughts Tokens", + "description": "Reasoning/thinking tokens generated by the model. Billed at the output rate by some providers (e.g. Gemini 2.5+ family) but not surfaced in the visible response.", + "default": 0 } }, "type": "object", From 3a7c47cad62440a82bad03587e4fb93637c5807c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Boschi?= Date: Tue, 23 Jun 2026 13:05:17 +0200 Subject: [PATCH 4/4] fix(reflect): return StructuredOutputResult instead of widened tuple _generate_structured_output's return contract had drifted: the success and no-fields branches returned a 5-tuple while the except branch still returned a 3-tuple. All six call sites unpack five values, so any structured-output failure would crash reflect with a ValueError instead of degrading gracefully. Replace the multi-item tuple return with a typed StructuredOutputResult (per project rule: no multi-item tuple returns), making the arity mismatch impossible and the failure path safe. Add a regression test. --- .../hindsight_api/engine/reflect/agent.py | 124 +++++++----------- .../hindsight_api/engine/reflect/models.py | 12 ++ .../tests/test_token_usage_cached_thoughts.py | 29 +++- 3 files changed, 89 insertions(+), 76 deletions(-) diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py index 1f03fdd10..8a0ecda5c 100644 --- a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py +++ b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py @@ -15,7 +15,7 @@ from typing import TYPE_CHECKING, Any, Awaitable, Callable from ...config import get_config -from .models import DirectiveInfo, LLMCall, ReflectAgentResult, TokenUsageSummary, ToolCall +from .models import DirectiveInfo, LLMCall, ReflectAgentResult, StructuredOutputResult, TokenUsageSummary, ToolCall from .prompts import ( _extract_directive_rules, build_final_prompt, @@ -141,7 +141,7 @@ async def _generate_structured_output( response_schema: dict, llm_config: "LLMProvider", reflect_id: str, -) -> tuple[dict[str, Any] | None, int, int, int, int]: +) -> StructuredOutputResult: """Generate structured output from an answer using the provided JSON schema. Args: @@ -151,8 +151,8 @@ async def _generate_structured_output( reflect_id: Reflect ID for logging Returns: - Tuple of (structured_output, input_tokens, output_tokens, cached_tokens, thoughts_tokens). - structured_output is None if generation fails. + A StructuredOutputResult carrying the structured output (None if + generation fails) and the call's token usage. """ try: from typing import Any as TypingAny @@ -186,7 +186,7 @@ def _json_schema_type_to_python(field_schema: dict) -> type: if not fields: logger.warning(f"[REFLECT {reflect_id}] No fields found in response_schema, skipping structured output") - return None, 0, 0, 0, 0 + return StructuredOutputResult() DynamicModel = create_model("StructuredResponse", **fields) @@ -259,17 +259,17 @@ def _json_schema_type_to_python(field_schema: dict) -> type: logger.warning(f"[REFLECT {reflect_id}] Required field '{field_name}' is empty in structured output") logger.info(f"[REFLECT {reflect_id}] Generated structured output with {len(structured_output)} fields") - return ( - structured_output, - usage.input_tokens, - usage.output_tokens, - getattr(usage, "cached_tokens", 0) or 0, - getattr(usage, "thoughts_tokens", 0) or 0, + return StructuredOutputResult( + structured_output=structured_output, + input_tokens=usage.input_tokens, + output_tokens=usage.output_tokens, + cached_tokens=usage.cached_tokens, + thoughts_tokens=usage.thoughts_tokens, ) except Exception as e: logger.warning(f"[REFLECT {reflect_id}] Failed to generate structured output: {e}") - return None, 0, 0 + return StructuredOutputResult() def _count_messages_tokens(messages: list[dict[str, Any]]) -> int: @@ -554,17 +554,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - ( - structured_output, - struct_in, - struct_out, - struct_cached, - struct_thoughts, - ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) - total_input_tokens += struct_in - total_output_tokens += struct_out - total_cached_tokens += struct_cached - total_thoughts_tokens += struct_thoughts + struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) + structured_output = struct.structured_output + total_input_tokens += struct.input_tokens + total_output_tokens += struct.output_tokens + total_cached_tokens += struct.cached_tokens + total_thoughts_tokens += struct.thoughts_tokens _log_completion(answer, iteration + 1, forced=True) return ReflectAgentResult( @@ -623,17 +618,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): structured_output = None if response_schema and answer: - ( - structured_output, - struct_in, - struct_out, - struct_cached, - struct_thoughts, - ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) - total_input_tokens += struct_in - total_output_tokens += struct_out - total_cached_tokens += struct_cached - total_thoughts_tokens += struct_thoughts + struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) + structured_output = struct.structured_output + total_input_tokens += struct.input_tokens + total_output_tokens += struct.output_tokens + total_cached_tokens += struct.cached_tokens + total_thoughts_tokens += struct.thoughts_tokens _log_completion(answer, iteration + 1, forced=True) return ReflectAgentResult( @@ -755,17 +745,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - ( - structured_output, - struct_in, - struct_out, - struct_cached, - struct_thoughts, - ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) - total_input_tokens += struct_in - total_output_tokens += struct_out - total_cached_tokens += struct_cached - total_thoughts_tokens += struct_thoughts + struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) + structured_output = struct.structured_output + total_input_tokens += struct.input_tokens + total_output_tokens += struct.output_tokens + total_cached_tokens += struct.cached_tokens + total_thoughts_tokens += struct.thoughts_tokens _log_completion(answer, iteration + 1, forced=True) return ReflectAgentResult( @@ -837,17 +822,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - ( - structured_output, - struct_in, - struct_out, - struct_cached, - struct_thoughts, - ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) - total_input_tokens += struct_in - total_output_tokens += struct_out - total_cached_tokens += struct_cached - total_thoughts_tokens += struct_thoughts + struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) + structured_output = struct.structured_output + total_input_tokens += struct.input_tokens + total_output_tokens += struct.output_tokens + total_cached_tokens += struct.cached_tokens + total_thoughts_tokens += struct.thoughts_tokens _log_completion(answer, iteration + 1) return ReflectAgentResult( @@ -897,17 +877,12 @@ def _log_completion(answer: str, iterations: int, forced: bool = False): # Generate structured output if schema provided structured_output = None if response_schema and answer: - ( - structured_output, - struct_in, - struct_out, - struct_cached, - struct_thoughts, - ) = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) - total_input_tokens += struct_in - total_output_tokens += struct_out - total_cached_tokens += struct_cached - total_thoughts_tokens += struct_thoughts + struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) + structured_output = struct.structured_output + total_input_tokens += struct.input_tokens + total_output_tokens += struct.output_tokens + total_cached_tokens += struct.cached_tokens + total_thoughts_tokens += struct.thoughts_tokens _log_completion(answer, iteration + 1, forced=True) return ReflectAgentResult( @@ -1202,16 +1177,15 @@ async def _process_done_tool( structured_output = None final_usage = usage if response_schema and llm_config and answer: - structured_output, struct_in, struct_out, struct_cached, struct_thoughts = await _generate_structured_output( - answer, response_schema, llm_config, reflect_id - ) + struct = await _generate_structured_output(answer, response_schema, llm_config, reflect_id) + structured_output = struct.structured_output # Add structured output tokens to usage final_usage = TokenUsageSummary( - input_tokens=usage.input_tokens + struct_in, - output_tokens=usage.output_tokens + struct_out, - total_tokens=usage.total_tokens + struct_in + struct_out, - cached_tokens=usage.cached_tokens + struct_cached, - thoughts_tokens=usage.thoughts_tokens + struct_thoughts, + input_tokens=usage.input_tokens + struct.input_tokens, + output_tokens=usage.output_tokens + struct.output_tokens, + total_tokens=usage.total_tokens + struct.input_tokens + struct.output_tokens, + cached_tokens=usage.cached_tokens + struct.cached_tokens, + thoughts_tokens=usage.thoughts_tokens + struct.thoughts_tokens, ) log_completion(answer, iterations) diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/models.py b/hindsight-api-slim/hindsight_api/engine/reflect/models.py index 580442fea..cc54af4e8 100644 --- a/hindsight-api-slim/hindsight_api/engine/reflect/models.py +++ b/hindsight-api-slim/hindsight_api/engine/reflect/models.py @@ -94,6 +94,18 @@ class TokenUsageSummary(BaseModel): ) +class StructuredOutputResult(BaseModel): + """Result of structured-output generation, including token usage for the call.""" + + structured_output: dict[str, Any] | None = Field( + default=None, description="Generated structured output, or None if generation failed" + ) + input_tokens: int = Field(default=0, description="Input tokens used") + output_tokens: int = Field(default=0, description="Visible output tokens used") + cached_tokens: int = Field(default=0, description="Cached prefix tokens. Subset of input_tokens.") + thoughts_tokens: int = Field(default=0, description="Reasoning/thinking tokens, when reported by the provider") + + class ReflectAgentResult(BaseModel): """Result from the reflect agent.""" diff --git a/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py b/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py index 2b0ad3ac9..ec8d7cd83 100644 --- a/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py +++ b/hindsight-api-slim/tests/test_token_usage_cached_thoughts.py @@ -16,7 +16,10 @@ from __future__ import annotations -from hindsight_api.engine.reflect.models import TokenUsageSummary +import pytest + +from hindsight_api.engine.reflect.agent import _generate_structured_output +from hindsight_api.engine.reflect.models import StructuredOutputResult, TokenUsageSummary from hindsight_api.engine.response_models import LLMToolCallResult, TokenUsage from hindsight_api.extensions.operation_validator import RetainResult @@ -133,3 +136,27 @@ class _Ctx: ) assert r2.llm_cached_input_tokens is None assert r2.llm_thoughts_tokens is None + + +@pytest.mark.asyncio +async def test_generate_structured_output_returns_dataclass_on_no_fields(): + """_generate_structured_output returns a StructuredOutputResult, not a tuple. + + Regression guard: the function and all six call sites must agree on a single + return type. A previous tuple-based contract drifted out of sync (the failure + branch returned 3 values while callers unpacked 5), which would crash reflect + with a ValueError on any structured-output failure. An empty schema exercises + the no-LLM-call branch deterministically. + """ + result = await _generate_structured_output( + answer="anything", + response_schema={}, + llm_config=None, + reflect_id="test", + ) + assert isinstance(result, StructuredOutputResult) + assert result.structured_output is None + assert result.input_tokens == 0 + assert result.output_tokens == 0 + assert result.cached_tokens == 0 + assert result.thoughts_tokens == 0