From 6692eebe11c64f0d299810407d4f3e7f03521976 Mon Sep 17 00:00:00 2001 From: vitalii-dynamiq Date: Wed, 6 May 2026 22:55:04 +0400 Subject: [PATCH 1/2] fix: litellm-compat for dynamiq integration tests (v0.4.8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three drop-in gaps prevented dynamiq's test fixtures from passing against arcllm even though direct API calls worked. Exception positional args: litellm's exception classes take (message, llm_provider, model, ...) positionally. arcllm previously made these keyword-only. Tests construct errors as `RateLimitError(msg, "bedrock", "amazon.titan")` which raised "takes 2 positional arguments but 4 were given". - ArcLLMError: provider/model/status_code now positional after message; llm_provider stays keyword-only as the litellm-name alias - RateLimitError: accepts (message, provider, model) positionally - ProviderAPIError: detects litellm shape (status_code, message, ...) by type — first int positional becomes status_code - BadRequestError (renamed from InvalidRequestError to match the canonical litellm/OpenAI name; InvalidRequestError stays as alias): accepts (message, model, provider) per litellm AND (message, provider, model) per arcllm. Disambiguates by checking SUPPORTED_PROVIDERS — common provider names always resolve correctly. Streaming chunk serialisation: Choice.model_dump() omitted .delta. dynamiq's streaming callback reads chunk["choices"][0]["delta"]["content"] from the serialized dict, so it saw KeyError on every streamed event. token_counter overhead: Counts now follow OpenAI's per-message formula (3 + per-key + 1 for name + 3 priming) so totals match litellm's. Previous sum-of-fields undercount made dynamiq's history-summarisation logic preserve more context than the model could actually accept. ModelResponse defaults: - choices defaults to [Choice()] so fixtures that do ModelResponse()["choices"][0]["message"]["content"] = ... 
work - stream: bool = False added so ModelResponse(stream=True) is accepted - Choice.delta added so streaming fixtures can set delta on the same Choice class litellm uses for both modes Result: dynamiq main suite goes from 281 integration failures → 0 (1066 integration + 1149 unit, all passing). arcllm's own test suite unchanged (8 pre-existing Ollama integration failures only). --- arcllm/__init__.py | 2 +- arcllm/exceptions.py | 75 +++++++++++++++++++++++++++++++++++--------- arcllm/tokens.py | 55 +++++++++++++++++++++++++++----- arcllm/types.py | 19 ++++++++++- pyproject.toml | 2 +- 5 files changed, 127 insertions(+), 26 deletions(-) diff --git a/arcllm/__init__.py b/arcllm/__init__.py index 34a79c3..da23918 100644 --- a/arcllm/__init__.py +++ b/arcllm/__init__.py @@ -69,7 +69,7 @@ from __future__ import annotations -__version__ = "0.4.6" +__version__ = "0.4.8" __all__ = [ "APIConnectionError", "APIError", diff --git a/arcllm/exceptions.py b/arcllm/exceptions.py index 5325d26..0b86c70 100644 --- a/arcllm/exceptions.py +++ b/arcllm/exceptions.py @@ -37,16 +37,16 @@ class ArcLLMError(Exception): def __init__( self, message: str, - *, provider: str | None = None, - # Litellm-compat alias: callers migrating from litellm pass - # ``llm_provider`` (its kwarg name). If both are given, ``provider`` - # wins so explicit arcllm code keeps its semantics. - llm_provider: str | None = None, model: str | None = None, status_code: int | None = None, request_id: str | None = None, raw_response: Any | None = None, + *, + # Litellm-compat alias: callers migrating from litellm pass + # ``llm_provider`` (its kwarg name). If both are given, ``provider`` + # wins so explicit arcllm code keeps its semantics. 
+ llm_provider: str | None = None, ) -> None: super().__init__(message) self.message = message @@ -103,11 +103,17 @@ class RateLimitError(ArcLLMError): def __init__( self, message: str, - *, + provider: str | None = None, + model: str | None = None, + *args: Any, retry_after: float | None = None, **kwargs: Any, ) -> None: - super().__init__(message, **kwargs) + if provider is not None: + kwargs.setdefault("provider", provider) + if model is not None: + kwargs.setdefault("model", model) + super().__init__(message, *args, **kwargs) self.retry_after = retry_after @@ -151,17 +157,31 @@ class ProviderAPIError(ArcLLMError): This is used for provider-specific errors that don't map to other more specific exception types. + + Litellm-compat: callers may construct this as + ``ProviderAPIError(status_code, message, provider, model)`` (litellm's + ``APIError`` signature). Detection is by type — if the first arg is an + ``int``, it's the status code and the remaining positionals shift. """ def __init__( self, - message: str, - *, + *args: Any, error_type: str | None = None, error_code: str | None = None, **kwargs: Any, ) -> None: - super().__init__(message, **kwargs) + # Litellm `APIError(status_code, message, llm_provider, model)` + # vs arcllm `ProviderAPIError(message, provider, model, status_code)`. + if args and isinstance(args[0], int): + status_code = args[0] + message = args[1] if len(args) > 1 else "" + provider = args[2] if len(args) > 2 else None + model = args[3] if len(args) > 3 else None + kwargs.setdefault("status_code", status_code) + super().__init__(message, provider, model, **kwargs) + else: + super().__init__(*args, **kwargs) self.error_type = error_type self.error_code = error_code @@ -239,7 +259,7 @@ def __init__( self.filter_reason = filter_reason -class InvalidRequestError(ArcLLMError): +class BadRequestError(ArcLLMError): """ Raised when the request is malformed or invalid. 
@@ -247,16 +267,41 @@ class InvalidRequestError(ArcLLMError): - Missing required parameters - Invalid parameter values - Malformed message format + + Litellm-compat: callers may construct this as + ``BadRequestError(message, model, llm_provider)`` (litellm signature + has ``model`` second). The base ``ArcLLMError`` has ``provider`` second. + We accept both shapes — if the second positional looks like a provider + name (registered in our provider list), treat it as ``provider``; + otherwise treat it as ``model``. """ def __init__( self, message: str, - *, + arg2: str | None = None, + arg3: str | None = None, + *args: Any, param: str | None = None, **kwargs: Any, ) -> None: - super().__init__(message, **kwargs) + # Disambiguate (provider, model) vs litellm's (model, llm_provider). + # Heuristic: if arg2 is a known provider name and arg3 isn't, use + # arcllm's order. If arg3 is a known provider and arg2 isn't, use + # litellm's (model, llm_provider) order. Falls back to arcllm's order. + if arg2 is not None and arg3 is not None: + from arcllm.providers.base import SUPPORTED_PROVIDERS + + if arg2 not in SUPPORTED_PROVIDERS and arg3 in SUPPORTED_PROVIDERS: + # Litellm shape: (message, model, llm_provider) + kwargs.setdefault("provider", arg3) + kwargs.setdefault("model", arg2) + else: + kwargs.setdefault("provider", arg2) + kwargs.setdefault("model", arg3) + elif arg2 is not None: + kwargs.setdefault("provider", arg2) + super().__init__(message, *args, **kwargs) self.param = param @@ -325,7 +370,7 @@ def map_status_code_to_exception( if status_code == 404: return UnsupportedModelError(message, status_code=status_code, **kwargs) if status_code == 400: - return InvalidRequestError(message, status_code=status_code, **kwargs) + return BadRequestError(message, status_code=status_code, **kwargs) if status_code == 408: return TimeoutError(message, status_code=status_code, **kwargs) if status_code == 503: @@ -345,4 +390,4 @@ def map_status_code_to_exception( # 
``ProviderAPIError`` (the broader provider-error base) and # ``InvalidRequestError`` (400-class semantics) respectively. APIError = ProviderAPIError -BadRequestError = InvalidRequestError +InvalidRequestError = BadRequestError diff --git a/arcllm/tokens.py b/arcllm/tokens.py index bf9ce54..46f2493 100644 --- a/arcllm/tokens.py +++ b/arcllm/tokens.py @@ -159,6 +159,14 @@ def token_counter( models when ``arcllm-sdk[tokenize]`` is installed, otherwise a chars/4 heuristic with a one-time warning. + For ``messages`` lists, follows OpenAI's published per-message + overhead formula (3 tokens per message + 3 priming tokens for the + final assistant turn) so counts are comparable to litellm and to + OpenAI's own ``tiktoken`` cookbook examples. Without the overhead, + arcllm would systematically undercount and downstream callers + (notably dynamiq's history-summarisation logic) would preserve + more messages than the model's context window can actually hold. + Raises ``ValueError`` if both ``messages`` and ``text`` are missing. """ if messages is None and text is None: @@ -166,11 +174,42 @@ def token_counter( if messages is not None and text is not None: raise ValueError("token_counter accepts `messages` or `text`, not both") - payload = text if text is not None else _flatten_messages(messages or []) - - count = _count_text_with_tiktoken(payload, model) - if count is not None: - return count - - _warn_heuristic_once(model) - return _heuristic_count(payload) + if text is not None: + count = _count_text_with_tiktoken(text, model) + if count is not None: + return count + _warn_heuristic_once(model) + return _heuristic_count(text) + + # Messages path — count each field separately and add per-message + # overhead so the total matches OpenAI's chat-completion accounting + # (and litellm's, which uses the same formula). 
+ msgs = messages or [] + per_message = 3 + per_name = 1 + total = 0 + for msg in msgs: + total += per_message + for key, value in msg.items(): + if value is None: + continue + if isinstance(value, str): + field_count = _count_text_with_tiktoken(value, model) + if field_count is None: + field_count = _heuristic_count(value) + total += field_count + else: + # Non-string fields (content arrays for vision, tool_calls + # JSON, etc.) — flatten to text and count. + flattened = _flatten_messages([{key: value}]) + field_count = _count_text_with_tiktoken(flattened, model) + if field_count is None: + field_count = _heuristic_count(flattened) + total += field_count + if key == "name": + total += per_name + # Priming tokens for the assistant's reply. + total += 3 + if _count_text_with_tiktoken("", model) is None: + _warn_heuristic_once(model) + return total diff --git a/arcllm/types.py b/arcllm/types.py index e0d8e9e..74f4100 100644 --- a/arcllm/types.py +++ b/arcllm/types.py @@ -274,12 +274,22 @@ def model_dump(self) -> dict[str, Any]: class Choice(_DictLike, msgspec.Struct): - """A single choice in a completion response.""" + """A single choice in a completion response. + + Litellm uses one class (``Choices``) for both non-streaming and + streaming responses — same object exposes ``.message`` for chat + completions and ``.delta`` for stream chunks. arcllm normally + separates these into :class:`Choice` (chat) and :class:`ChunkChoice` + (stream), but ``delta`` is exposed here too so litellm-style code + that uses ``ModelResponse`` for both modes keeps working. + """ index: int = 0 message: Message = msgspec.field(default_factory=Message) finish_reason: str | None = None logprobs: dict[str, Any] | None = None + # Litellm-compat: streaming code paths set ``.delta`` on a Choice. 
+ delta: ChunkDelta | None = None def model_dump(self) -> dict[str, Any]: """Return dict representation for serialization.""" @@ -291,6 +301,8 @@ def model_dump(self) -> dict[str, Any]: result["finish_reason"] = self.finish_reason if self.logprobs is not None: result["logprobs"] = self.logprobs + if self.delta is not None: + result["delta"] = self.delta.model_dump() return result @@ -320,6 +332,11 @@ class ModelResponse(_DictLike, msgspec.Struct): choices: list[Choice] = msgspec.field(default_factory=lambda: [Choice()]) usage: Usage | None = None system_fingerprint: str | None = None + # Litellm-compat marker: callers construct ``ModelResponse(stream=True)`` + # to indicate the response represents a stream chunk. arcllm normally + # uses :class:`StreamChunk` for that, but accepting the kwarg keeps + # litellm fixtures working unchanged. + stream: bool = False # Extra fields for debugging/compatibility model_extra: dict[str, Any] = {} diff --git a/pyproject.toml b/pyproject.toml index 33a5f60..5ad8112 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "arcllm-sdk" -version = "0.4.6" +version = "0.4.8" description = "The arc connecting you to every LLM. Minimal dependencies, maximum performance." readme = "README.md" license = "Apache-2.0" From 4dab9eb704b8ab7958144454974fa8f6571fc83f Mon Sep 17 00:00:00 2001 From: vitalii-dynamiq Date: Thu, 7 May 2026 12:22:58 +0400 Subject: [PATCH 2/2] feat: unify reasoning_content + thinking_blocks across providers (v0.4.9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reasoning-capable models (DeepSeek-R1, GLM-4.5+, Anthropic Claude with extended thinking, Gemini 2.5 with includeThoughts, Groq DeepSeek/Qwen, Cerebras Qwen-thinking, Together / Fireworks DeepSeek-R1, OpenAI o-series via chat/completions) all expose chain-of-thought, but each family uses a different field name. 
Previously arcllm dropped this entirely on the floor — callers could see the final answer but not the thinking. This wires up a unified surface: - Message.reasoning_content: str — flat-string CoT, populated by every reasoning provider - Message.thinking_blocks: list[ThinkingBlock] — Anthropic's structured form (thinking | redacted_thinking, with signatures preserved for tool-use round-trips) - ChunkDelta.reasoning_content / .thinking / .signature — streaming deltas Provider mapping: - OpenAIAdapter (and DeepSeek, GLM, Groq, Cerebras, Together, Fireworks, Nebius, OVHcloud, Moonshot, OpenRouter, Perplexity — all subclasses): reads message.reasoning_content or message.reasoning from the response; same for delta.reasoning_content / .reasoning in stream events. - AnthropicAdapter: extracts content[].type=="thinking" and "redacted_thinking" blocks; populates both thinking_blocks (with signature) and a concatenated reasoning_content. Streaming handles thinking_delta / signature_delta with one block per signature. - GeminiAdapter: routes parts[].thought=true text into reasoning_content (non-thought parts stay in content). Same split for streaming. stream_chunk_builder accumulates reasoning across chunks and rebuilds Anthropic's per-block grouping (signature_delta closes a block). Verified live end-to-end through arcllm.completion: Z.AI GLM-4.5-air content="5" reasoning_len=730 DeepSeek-R1 content="5" reasoning_len=67 Claude Sonnet 4.5 content="5" reasoning_len=101 thinking_blocks=1 (sig) Gemini 2.5 Flash content="5" reasoning_len=406 Streaming verified for all four — Anthropic's thinking_delta + signature_delta correctly group into a single ThinkingBlock with the signature attached. 18 new unit tests cover wire-format parsing for every provider plus stream_chunk_builder. arcllm own suite: 792 passed (was 782). dynamiq integration suite unaffected: 1149 unit + 1066 integration, all passing. 
--- arcllm/__init__.py | 4 +- arcllm/core.py | 51 +++++ arcllm/providers/anthropic_adapter.py | 82 ++++++- arcllm/providers/gemini_adapter.py | 20 +- arcllm/providers/openai_adapter.py | 13 ++ arcllm/types.py | 63 ++++++ pyproject.toml | 2 +- tests/test_reasoning.py | 298 ++++++++++++++++++++++++++ 8 files changed, 524 insertions(+), 9 deletions(-) diff --git a/arcllm/__init__.py b/arcllm/__init__.py index da23918..7596cb7 100644 --- a/arcllm/__init__.py +++ b/arcllm/__init__.py @@ -69,7 +69,7 @@ from __future__ import annotations -__version__ = "0.4.8" +__version__ = "0.4.9" __all__ = [ "APIConnectionError", "APIError", @@ -102,6 +102,7 @@ "ServiceUnavailableError", "StreamChunk", "StreamingResponse", + "ThinkingBlock", "Timeout", "TimeoutError", "ToolCall", @@ -224,6 +225,7 @@ RerankResult, StreamChunk, StreamingResponse, + ThinkingBlock, ToolCall, Usage, ) diff --git a/arcllm/core.py b/arcllm/core.py index a8f1ce8..dc02538 100644 --- a/arcllm/core.py +++ b/arcllm/core.py @@ -30,6 +30,7 @@ ModelResponse, StreamChunk, StreamingResponse, + ThinkingBlock, ToolCall, Usage, ) @@ -548,6 +549,10 @@ def stream_chunk_builder( # Use specialized structure for better performance choice_roles: dict[int, str | None] = {} choice_content: dict[int, list[str]] = {} + choice_reasoning: dict[int, list[str]] = {} + # Anthropic-style: per-choice ordered list of (thinking_text, signature) + # blocks rebuilt from the stream so callers can replay them as input. 
+ choice_thinking_blocks: dict[int, list[list[str]]] = {} choice_tool_calls: dict[ int, dict[int, list[Any]] ] = {} # idx -> tc_idx -> [id, type, name_parts, arg_parts] @@ -571,6 +576,8 @@ def stream_chunk_builder( if idx not in choice_content: choice_roles[idx] = None choice_content[idx] = [] + choice_reasoning[idx] = [] + choice_thinking_blocks[idx] = [] choice_tool_calls[idx] = {} choice_finish[idx] = None choice_logprobs[idx] = None @@ -584,6 +591,26 @@ def stream_chunk_builder( if delta_content: choice_content[idx].append(delta_content) + # Reasoning (DeepSeek/GLM/o-series style — flat string deltas). + delta_reasoning = delta.reasoning_content + if delta_reasoning: + choice_reasoning[idx].append(delta_reasoning) + + # Anthropic-style thinking deltas — group by current open block. + # A new block starts whenever a thinking delta arrives after a + # signature delta (or first thinking delta of the stream). + delta_thinking = delta.thinking + delta_signature = delta.signature + if delta_thinking is not None or delta_signature is not None: + blocks = choice_thinking_blocks[idx] + if not blocks or (blocks and blocks[-1][1]): + # Last block is closed (has signature) — start a new one. 
+ blocks.append(["", ""]) + if delta_thinking: + blocks[-1][0] += delta_thinking + if delta_signature: + blocks[-1][1] = delta_signature + choice_finish_reason = choice.finish_reason if choice_finish_reason: choice_finish[idx] = choice_finish_reason @@ -645,10 +672,34 @@ def stream_chunk_builder( content_parts = choice_content[idx] content = "".join(content_parts) if content_parts else None + reasoning_parts = choice_reasoning[idx] + reasoning_content = "".join(reasoning_parts) if reasoning_parts else None + + thinking_blocks_assembled: list[ThinkingBlock] | None = None + if choice_thinking_blocks[idx]: + thinking_blocks_assembled = [ + ThinkingBlock( + type="thinking", + thinking=text, + signature=sig or None, + ) + for text, sig in choice_thinking_blocks[idx] + if text or sig + ] or None + # Fallback to populate the flat surface when only thinking blocks + # arrived (Anthropic) — concatenate their text so callers reading + # ``reasoning_content`` see the same string regardless of provider. 
+ if reasoning_content is None and thinking_blocks_assembled is not None: + reasoning_content = ( + "".join(b.thinking or "" for b in thinking_blocks_assembled) or None + ) + message = Message( role=choice_roles[idx] or "assistant", content=content, tool_calls=tool_calls or None, + reasoning_content=reasoning_content, + thinking_blocks=thinking_blocks_assembled, ) choices.append( diff --git a/arcllm/providers/anthropic_adapter.py b/arcllm/providers/anthropic_adapter.py index 9db63e8..3a20591 100644 --- a/arcllm/providers/anthropic_adapter.py +++ b/arcllm/providers/anthropic_adapter.py @@ -97,6 +97,7 @@ Message, ModelResponse, StreamChunk, + ThinkingBlock, ToolCall, Usage, ) @@ -450,9 +451,11 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon now = int(time.time()) content_blocks = resp.get("content", []) - # Extract text content and tool uses + # Extract text content, tool uses, and thinking blocks text_parts: list[str] = [] tool_calls: list[ToolCall] = [] + thinking_blocks: list[ThinkingBlock] = [] + thinking_text_parts: list[str] = [] # Citations are sourced from two places in Anthropic responses: # - ``web_search_tool_result`` blocks: aggregate result list with # ``url`` / ``title`` / ``snippet`` per source. @@ -485,11 +488,29 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon end_index=ann_dict.get("end_index") or ann_dict.get("end_char_index"), ) - # Second pass: tool uses + web_search_tool_result fallback (only fills - # URLs that the text-block annotations didn't already cover). + # Second pass: tool uses, thinking blocks, and web_search_tool_result + # fallback (only fills URLs the text-block annotations didn't cover). 
for block in content_blocks: kind = block.get("type") - if kind == "tool_use": + if kind == "thinking": + thinking_text = block.get("thinking", "") + thinking_blocks.append( + ThinkingBlock( + type="thinking", + thinking=thinking_text, + signature=block.get("signature"), + ) + ) + if thinking_text: + thinking_text_parts.append(thinking_text) + elif kind == "redacted_thinking": + thinking_blocks.append( + ThinkingBlock( + type="redacted_thinking", + data=block.get("data"), + ) + ) + elif kind == "tool_use": tool_calls.append( ToolCall( id=block.get("id", ""), @@ -518,12 +539,15 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon # Join text parts efficiently text_content = "".join(text_parts) if text_parts else None citations = list(citation_index.values()) if citation_index else None + reasoning_content = "".join(thinking_text_parts) if thinking_text_parts else None message = Message( role=resp.get("role", "assistant"), content=text_content, tool_calls=tool_calls or None, citations=citations, + reasoning_content=reasoning_content, + thinking_blocks=thinking_blocks or None, ) # Map Anthropic stop reasons to OpenAI format @@ -617,6 +641,21 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None: ) ], ) + if block.get("type") == "thinking": + # Anthropic emits an empty thinking block first, then a + # series of thinking_delta events with the text, then a + # signature_delta with the cryptographic signature. 
+ return StreamChunk( + id="", + model=model, + choices=[ + ChunkChoice( + index=0, + delta=ChunkDelta(thinking=block.get("thinking", "")), + finish_reason=None, + ) + ], + ) if block.get("type") == "tool_use": # Start of tool use return StreamChunk( @@ -645,7 +684,8 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None: elif event_type == "content_block_delta": delta = event.get("delta", {}) - if delta.get("type") == "text_delta": + delta_type = delta.get("type") + if delta_type == "text_delta": return StreamChunk( id="", model=model, @@ -657,6 +697,38 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None: ) ], ) + if delta_type == "thinking_delta": + # Surface as both ``thinking`` (matches Anthropic wire shape + # for round-trip) and ``reasoning_content`` (so callers using + # the unified surface can stream thinking text without a + # provider-specific code path). + thinking_text = delta.get("thinking", "") + return StreamChunk( + id="", + model=model, + choices=[ + ChunkChoice( + index=0, + delta=ChunkDelta( + thinking=thinking_text, + reasoning_content=thinking_text, + ), + finish_reason=None, + ) + ], + ) + if delta_type == "signature_delta": + return StreamChunk( + id="", + model=model, + choices=[ + ChunkChoice( + index=0, + delta=ChunkDelta(signature=delta.get("signature", "")), + finish_reason=None, + ) + ], + ) if delta.get("type") == "input_json_delta": # Tool argument delta return StreamChunk( diff --git a/arcllm/providers/gemini_adapter.py b/arcllm/providers/gemini_adapter.py index 111a007..785118e 100644 --- a/arcllm/providers/gemini_adapter.py +++ b/arcllm/providers/gemini_adapter.py @@ -420,11 +420,19 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon # Use list + join for efficient string building text_parts: list[str] = [] + thought_parts: list[str] = [] tool_calls: list[ToolCall] = [] for part in parts: if "text" in part: - text_parts.append(part["text"]) + # Gemini 
marks chain-of-thought parts with ``thought: true`` + # when the request set ``thinkingConfig.includeThoughts``. + # We split those out into ``reasoning_content`` so callers + # don't have to filter them out of the answer text. + if part.get("thought"): + thought_parts.append(part["text"]) + else: + text_parts.append(part["text"]) elif "functionCall" in part: fc = part["functionCall"] tool_calls.append( @@ -439,12 +447,14 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon ) text_content = "".join(text_parts) if text_parts else None + reasoning_content = "".join(thought_parts) if thought_parts else None citations = _extract_grounding_citations(candidate) message = Message( role="assistant", content=text_content, tool_calls=tool_calls or None, citations=citations, + reasoning_content=reasoning_content, ) # Map finish reason @@ -507,11 +517,15 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None: # Use list + join for efficient string building text_parts: list[str] = [] + thought_parts: list[str] = [] tool_call_deltas: list[dict[str, Any]] = [] for part in parts: if "text" in part: - text_parts.append(part["text"]) + if part.get("thought"): + thought_parts.append(part["text"]) + else: + text_parts.append(part["text"]) elif "functionCall" in part: fc = part["functionCall"] tool_call_deltas.append( @@ -527,9 +541,11 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None: ) text_content = "".join(text_parts) if text_parts else None + reasoning_content = "".join(thought_parts) if thought_parts else None delta = ChunkDelta( content=text_content, tool_calls=tool_call_deltas or None, + reasoning_content=reasoning_content, ) finish_reason = None diff --git a/arcllm/providers/openai_adapter.py b/arcllm/providers/openai_adapter.py index 823899c..3e75e18 100644 --- a/arcllm/providers/openai_adapter.py +++ b/arcllm/providers/openai_adapter.py @@ -231,12 +231,23 @@ def _build_model_response(self, resp: 
dict[str, Any], model: str) -> ModelRespon arguments=fc.get("arguments", ""), ) + # ``reasoning_content`` is the de-facto field name used by + # DeepSeek-R1, GLM-4.5+, Groq's DeepSeek/Qwen-thinking models, + # Cerebras, Together, Fireworks, and any OpenAI-compat host + # serving a reasoning model. ``reasoning`` is the alias + # OpenAI ships on the chat-completions endpoint for o-series + # responses; we accept either and normalise to one field. + reasoning_content = message_data.get("reasoning_content") or message_data.get( + "reasoning" + ) + message = Message( role=message_data.get("role", "assistant"), content=message_data.get("content"), tool_calls=tool_calls, function_call=function_call, refusal=message_data.get("refusal"), + reasoning_content=reasoning_content, ) choices.append( @@ -303,6 +314,8 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None: content=delta_data.get("content"), tool_calls=tool_calls, function_call=delta_data.get("function_call"), + reasoning_content=delta_data.get("reasoning_content") + or delta_data.get("reasoning"), ) choices.append( diff --git a/arcllm/types.py b/arcllm/types.py index 74f4100..1ebe061 100644 --- a/arcllm/types.py +++ b/arcllm/types.py @@ -186,6 +186,44 @@ def model_dump(self) -> dict[str, Any]: return result +# ============================================================================= +# Reasoning / Thinking Types +# ============================================================================= + + +class ThinkingBlock(_DictLike, msgspec.Struct): + """A chain-of-thought block from a reasoning-capable model. + + Anthropic's extended-thinking feature returns thinking as structured + content blocks (``type: "thinking"`` with a ``signature`` for replay, + or ``type: "redacted_thinking"`` with opaque ``data``). 
Preserving the + block shape — instead of flattening to a string — lets callers send + the thinking back as part of the conversation history without losing + the signature, which is required for tool-use round trips. + + For providers that emit thinking as a flat string (DeepSeek-R1, GLM, + Groq DeepSeek/Qwen-thinking, Gemini with ``includeThoughts=true``) + we populate :attr:`Message.reasoning_content` directly instead. + """ + + type: Literal["thinking", "redacted_thinking"] = "thinking" + thinking: str | None = None + signature: str | None = None + # Anthropic-specific: opaque payload for ``redacted_thinking`` blocks. + data: str | None = None + + def model_dump(self) -> dict[str, Any]: + """Return dict representation for serialization.""" + result: dict[str, Any] = {"type": self.type} + if self.thinking is not None: + result["thinking"] = self.thinking + if self.signature is not None: + result["signature"] = self.signature + if self.data is not None: + result["data"] = self.data + return result + + # ============================================================================= # Message Types # ============================================================================= @@ -204,6 +242,14 @@ class Message(_DictLike, msgspec.Struct): # responses; an empty list means "the provider was asked to ground but # returned no sources" (rare). citations: list[Citation] | None = None + # Chain-of-thought / extended-thinking output from reasoning models. + # ``reasoning_content`` is the unified flat-string surface (populated + # by DeepSeek-R1, GLM-4.5+, Gemini 2.5+ with includeThoughts, OpenAI + # o-series via chat/completions when supported, etc.). For Anthropic + # extended thinking we also populate ``thinking_blocks`` so callers + # can send the structured form back with signatures intact. 
+ reasoning_content: str | None = None + thinking_blocks: list[ThinkingBlock] | None = None def model_dump(self) -> dict[str, Any]: """Return dict representation for serialization.""" @@ -218,6 +264,10 @@ def model_dump(self) -> dict[str, Any]: result["refusal"] = self.refusal if self.citations is not None: result["citations"] = [c.model_dump() for c in self.citations] + if self.reasoning_content is not None: + result["reasoning_content"] = self.reasoning_content + if self.thinking_blocks is not None: + result["thinking_blocks"] = [b.model_dump() for b in self.thinking_blocks] return result @@ -372,6 +422,13 @@ class ChunkDelta(_DictLike, msgspec.Struct): # for grounded providers — Perplexity, Gemini grounding, Anthropic # web-search). None on intermediate chunks. citations: list[Citation] | None = None + # Reasoning deltas. ``reasoning_content`` is the flat-string surface + # (DeepSeek-R1, GLM, Groq, etc.). ``thinking`` carries the per-chunk + # text of an Anthropic ``thinking_delta`` event; the matching + # ``signature`` lands in the trailing ``signature_delta``. 
+ reasoning_content: str | None = None + thinking: str | None = None + signature: str | None = None def model_dump(self) -> dict[str, Any]: """Return dict representation for serialization.""" @@ -386,6 +443,12 @@ def model_dump(self) -> dict[str, Any]: result["function_call"] = self.function_call if self.citations is not None: result["citations"] = [c.model_dump() for c in self.citations] + if self.reasoning_content is not None: + result["reasoning_content"] = self.reasoning_content + if self.thinking is not None: + result["thinking"] = self.thinking + if self.signature is not None: + result["signature"] = self.signature return result diff --git a/pyproject.toml b/pyproject.toml index 5ad8112..d5ade71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "arcllm-sdk" -version = "0.4.8" +version = "0.4.9" description = "The arc connecting you to every LLM. Minimal dependencies, maximum performance." readme = "README.md" license = "Apache-2.0" diff --git a/tests/test_reasoning.py b/tests/test_reasoning.py index 23aed12..d3c6196 100644 --- a/tests/test_reasoning.py +++ b/tests/test_reasoning.py @@ -173,3 +173,301 @@ def test_no_thinking_args_leaves_config_clean(self, gemini_adapter: GeminiAdapte # exist, it must not carry thinkingConfig. if "generationConfig" in body: assert "thinkingConfig" not in body["generationConfig"] + + +# --------------------------------------------------------------------------- +# Response-side: reasoning_content / thinking_blocks parsing +# --------------------------------------------------------------------------- +# +# Reasoning models expose chain-of-thought differently per family. arcllm +# normalises everything into ``Message.reasoning_content`` (flat str). For +# Anthropic we additionally keep ``Message.thinking_blocks`` so callers can +# replay the structured form (with signatures) on the next turn. 
+
+
+class TestReasoningResponseExtraction:
+    """``parse_response`` populates ``reasoning_content`` and ``thinking_blocks``."""
+
+    def test_openai_reasoning_field_is_extracted(self, openai_adapter: OpenAIAdapter) -> None:
+        """OpenAI o-series chat/completions can return ``reasoning`` on message."""
+        body = orjson.dumps(
+            {
+                "id": "x",
+                "model": "o3-mini",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": "42",
+                            "reasoning": "Counting Hitchhiker references...",
+                        },
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+        )
+        resp = openai_adapter.parse_response(body, model="o3-mini")
+        msg = resp.choices[0].message
+        assert msg.content == "42"
+        assert msg.reasoning_content == "Counting Hitchhiker references..."
+
+    def test_deepseek_style_reasoning_content_is_extracted(
+        self, openai_adapter: OpenAIAdapter
+    ) -> None:
+        """DeepSeek-R1 / GLM / Groq DeepSeek / Together / Fireworks DeepSeek-R1
+        all use the ``reasoning_content`` field. Test through the OpenAI base
+        since every OpenAI-compat host inherits this parser."""
+        body = orjson.dumps(
+            {
+                "id": "x",
+                "model": "deepseek-reasoner",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": "ok",
+                            "reasoning_content": "Let me think... yes, ok.",
+                        },
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+        )
+        resp = openai_adapter.parse_response(body, model="deepseek-reasoner")
+        msg = resp.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "Let me think... yes, ok."
+
+    def test_anthropic_thinking_blocks_preserved_with_signature(
+        self, anthropic_adapter: AnthropicAdapter
+    ) -> None:
+        """Anthropic extended-thinking returns structured blocks. The signature
+        must round-trip — replaying without it breaks tool-use."""
+        body = orjson.dumps(
+            {
+                "id": "msg_x",
+                "type": "message",
+                "role": "assistant",
+                "model": "claude-sonnet-4-7",
+                "stop_reason": "end_turn",
+                "content": [
+                    {
+                        "type": "thinking",
+                        "thinking": "User wants ok. Reply ok.",
+                        "signature": "sig_abc123",
+                    },
+                    {"type": "text", "text": "ok"},
+                ],
+                "usage": {"input_tokens": 5, "output_tokens": 12},
+            }
+        )
+        resp = anthropic_adapter.parse_response(body, model="claude-sonnet-4-7")
+        msg = resp.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "User wants ok. Reply ok."
+        assert msg.thinking_blocks is not None
+        assert len(msg.thinking_blocks) == 1
+        block = msg.thinking_blocks[0]
+        assert block.type == "thinking"
+        assert block.thinking == "User wants ok. Reply ok."
+        assert block.signature == "sig_abc123"
+
+    def test_anthropic_redacted_thinking_block_preserves_opaque_data(
+        self, anthropic_adapter: AnthropicAdapter
+    ) -> None:
+        """``redacted_thinking`` blocks have no readable text — only an opaque
+        payload that must round-trip back unchanged. They surface on
+        ``thinking_blocks`` but contribute nothing to ``reasoning_content``."""
+        body = orjson.dumps(
+            {
+                "id": "msg_x",
+                "role": "assistant",
+                "model": "claude-sonnet-4-7",
+                "stop_reason": "end_turn",
+                "content": [
+                    {"type": "redacted_thinking", "data": "OPAQUE_BLOB"},
+                    {"type": "text", "text": "ok"},
+                ],
+                "usage": {"input_tokens": 5, "output_tokens": 1},
+            }
+        )
+        resp = anthropic_adapter.parse_response(body, model="claude-sonnet-4-7")
+        msg = resp.choices[0].message
+        assert msg.reasoning_content is None
+        assert msg.thinking_blocks is not None
+        assert msg.thinking_blocks[0].type == "redacted_thinking"
+        assert msg.thinking_blocks[0].data == "OPAQUE_BLOB"
+
+    def test_gemini_thought_parts_route_to_reasoning_content(
+        self, gemini_adapter: GeminiAdapter
+    ) -> None:
+        """Gemini 2.5+ marks chain-of-thought parts with ``thought: true``.
+
+        Without this split, the thought text would land in ``content`` and
+        the caller would have to filter it out manually."""
+        body = orjson.dumps(
+            {
+                "candidates": [
+                    {
+                        "content": {
+                            "parts": [
+                                {"text": "User wants ok.", "thought": True},
+                                {"text": "ok"},
+                            ]
+                        },
+                        "finishReason": "STOP",
+                    }
+                ],
+                "usageMetadata": {"promptTokenCount": 5, "candidatesTokenCount": 1},
+            }
+        )
+        resp = gemini_adapter.parse_response(body, model="gemini-2.5-pro")
+        msg = resp.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "User wants ok."
+
+    def test_non_reasoning_response_leaves_fields_none(self, openai_adapter: OpenAIAdapter) -> None:
+        """Regular chat responses (no reasoning fields) must not invent them."""
+        body = orjson.dumps(
+            {
+                "id": "x",
+                "model": "gpt-4o-mini",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {"role": "assistant", "content": "ok"},
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+        )
+        resp = openai_adapter.parse_response(body, model="gpt-4o-mini")
+        msg = resp.choices[0].message
+        assert msg.reasoning_content is None
+        assert msg.thinking_blocks is None
+
+
+class TestReasoningStreamAccumulation:
+    """``stream_chunk_builder`` accumulates reasoning across chunks."""
+
+    def test_flat_reasoning_content_accumulates(self) -> None:
+        """DeepSeek/GLM/Groq style: ``reasoning_content`` deltas concatenate in chunk order."""
+        from arcllm.core import stream_chunk_builder
+        from arcllm.types import ChunkChoice, ChunkDelta, StreamChunk
+
+        chunks = [
+            StreamChunk(
+                id="x",
+                model="deepseek-reasoner",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(role="assistant"))],
+            ),
+            StreamChunk(
+                id="x",
+                model="deepseek-reasoner",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(reasoning_content="Let me "))],
+            ),
+            StreamChunk(
+                id="x",
+                model="deepseek-reasoner",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(reasoning_content="think."))],
+            ),
+            StreamChunk(
+                id="x",
+                model="deepseek-reasoner",
+                choices=[
+                    ChunkChoice(index=0, delta=ChunkDelta(content="ok"), finish_reason="stop")
+                ],
+            ),
+        ]
+        final = stream_chunk_builder(chunks)
+        msg = final.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "Let me think."
+        assert msg.thinking_blocks is None
+
+    def test_anthropic_thinking_deltas_grouped_by_signature(self) -> None:
+        """Anthropic streaming: thinking_delta → thinking_delta → signature_delta
+        is one block. A later thinking_delta would open a new block (not exercised here)."""
+        from arcllm.core import stream_chunk_builder
+        from arcllm.types import ChunkChoice, ChunkDelta, StreamChunk
+
+        chunks = [
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(role="assistant"))],
+            ),
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[
+                    ChunkChoice(
+                        index=0,
+                        delta=ChunkDelta(thinking="User wants ", reasoning_content="User wants "),
+                    )
+                ],
+            ),
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[
+                    ChunkChoice(
+                        index=0,
+                        delta=ChunkDelta(thinking="ok.", reasoning_content="ok."),
+                    )
+                ],
+            ),
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(signature="sig_abc"))],
+            ),
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[
+                    ChunkChoice(index=0, delta=ChunkDelta(content="ok"), finish_reason="stop")
+                ],
+            ),
+        ]
+        final = stream_chunk_builder(chunks)
+        msg = final.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "User wants ok."
+        assert msg.thinking_blocks is not None
+        assert len(msg.thinking_blocks) == 1
+        assert msg.thinking_blocks[0].thinking == "User wants ok."
+        assert msg.thinking_blocks[0].signature == "sig_abc"
+
+
+class TestReasoningSerialization:
+    """``Message.model_dump`` emits reasoning fields when set and omits them when absent."""
+
+    def test_dump_includes_reasoning_when_set(self) -> None:
+        from arcllm.types import Message, ThinkingBlock
+
+        msg = Message(
+            role="assistant",
+            content="ok",
+            reasoning_content="thinking text",
+            thinking_blocks=[
+                ThinkingBlock(type="thinking", thinking="thinking text", signature="s")
+            ],
+        )
+        dumped = msg.model_dump()
+        assert dumped["reasoning_content"] == "thinking text"
+        assert dumped["thinking_blocks"] == [
+            {"type": "thinking", "thinking": "thinking text", "signature": "s"}
+        ]
+
+    def test_dump_omits_reasoning_when_absent(self) -> None:
+        """Don't emit empty reasoning fields — keeps the serialised shape lean
+        and matches OpenAI/litellm behaviour for non-reasoning responses."""
+        from arcllm.types import Message
+
+        msg = Message(role="assistant", content="ok")
+        dumped = msg.model_dump()
+        assert "reasoning_content" not in dumped
+        assert "thinking_blocks" not in dumped