From 6692eebe11c64f0d299810407d4f3e7f03521976 Mon Sep 17 00:00:00 2001
From: vitalii-dynamiq <vitalii@getdynamiq.ai>
Date: Wed, 6 May 2026 22:55:04 +0400
Subject: [PATCH 1/2] fix: litellm-compat for dynamiq integration tests
 (v0.4.8)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four drop-in gaps prevented dynamiq's test fixtures from passing
against arcllm even though direct API calls worked.

Exception positional args:
litellm's exception classes take (message, llm_provider, model, ...)
positionally. arcllm previously made these keyword-only. Tests
construct errors as `RateLimitError(msg, "bedrock", "amazon.titan")`
which raised "takes 2 positional arguments but 4 were given".

- ArcLLMError: provider/model/status_code now positional after message;
  llm_provider stays keyword-only as the litellm-name alias
- RateLimitError: accepts (message, provider, model) positionally
- ProviderAPIError: detects litellm shape (status_code, message, ...)
  by type — first int positional becomes status_code
- BadRequestError (renamed from InvalidRequestError to match the
  canonical litellm/OpenAI name; InvalidRequestError stays as alias):
  accepts (message, model, provider) per litellm AND
  (message, provider, model) per arcllm. Disambiguates by checking
  SUPPORTED_PROVIDERS — common provider names resolve correctly; when
  neither or both positionals are known providers, arcllm's
  (provider, model) order is assumed.

Streaming chunk serialisation:
Choice.model_dump() omitted .delta. dynamiq's streaming callback reads
chunk["choices"][0]["delta"]["content"] from the serialized dict, so it
saw KeyError on every streamed event.

token_counter overhead:
Counts now follow OpenAI's per-message formula (3 tokens per message +
the tokens of each field + 1 when a name is present + 3 priming tokens
for the reply) so totals match litellm's. The previous sum-of-fields
undercount made dynamiq's history-summarisation logic preserve more
context than the model could actually accept.

ModelResponse defaults:
- choices defaults to [Choice()] so fixtures that do
  ModelResponse()["choices"][0]["message"]["content"] = ... work
- stream: bool = False added so ModelResponse(stream=True) is accepted
- Choice.delta added so streaming fixtures can set delta on the same
  Choice class litellm uses for both modes

Result: dynamiq's main suite goes from 281 integration failures → 0
(1066 integration + 1149 unit, all passing). arcllm's own test suite is
unchanged (8 pre-existing Ollama integration failures only).
---
 arcllm/__init__.py   |  2 +-
 arcllm/exceptions.py | 75 +++++++++++++++++++++++++++++++++++---------
 arcllm/tokens.py     | 55 +++++++++++++++++++++++++++-----
 arcllm/types.py      | 19 ++++++++++-
 pyproject.toml       |  2 +-
 5 files changed, 127 insertions(+), 26 deletions(-)

diff --git a/arcllm/__init__.py b/arcllm/__init__.py
index 34a79c3..da23918 100644
--- a/arcllm/__init__.py
+++ b/arcllm/__init__.py
@@ -69,7 +69,7 @@
 
 from __future__ import annotations
 
-__version__ = "0.4.6"
+__version__ = "0.4.8"
 __all__ = [
     "APIConnectionError",
     "APIError",
diff --git a/arcllm/exceptions.py b/arcllm/exceptions.py
index 5325d26..0b86c70 100644
--- a/arcllm/exceptions.py
+++ b/arcllm/exceptions.py
@@ -37,16 +37,16 @@ class ArcLLMError(Exception):
     def __init__(
         self,
         message: str,
-        *,
         provider: str | None = None,
-        # Litellm-compat alias: callers migrating from litellm pass
-        # ``llm_provider`` (its kwarg name). If both are given, ``provider``
-        # wins so explicit arcllm code keeps its semantics.
-        llm_provider: str | None = None,
         model: str | None = None,
         status_code: int | None = None,
         request_id: str | None = None,
         raw_response: Any | None = None,
+        *,
+        # Litellm-compat alias: callers migrating from litellm pass
+        # ``llm_provider`` (its kwarg name). If both are given, ``provider``
+        # wins so explicit arcllm code keeps its semantics.
+        llm_provider: str | None = None,
     ) -> None:
         super().__init__(message)
         self.message = message
@@ -103,11 +103,17 @@ class RateLimitError(ArcLLMError):
     def __init__(
         self,
         message: str,
-        *,
+        provider: str | None = None,
+        model: str | None = None,
+        *args: Any,
         retry_after: float | None = None,
         **kwargs: Any,
     ) -> None:
-        super().__init__(message, **kwargs)
+        if provider is not None:
+            kwargs.setdefault("provider", provider)
+        if model is not None:
+            kwargs.setdefault("model", model)
+        super().__init__(message, *args, **kwargs)
         self.retry_after = retry_after
 
 
@@ -151,17 +157,31 @@ class ProviderAPIError(ArcLLMError):
 
     This is used for provider-specific errors that don't map to
     other more specific exception types.
+
+    Litellm-compat: callers may construct this as
+    ``ProviderAPIError(status_code, message, provider, model)`` (litellm's
+    ``APIError`` signature). Detection is by type — if the first arg is an
+    ``int``, it's the status code and the remaining positionals shift.
     """
 
     def __init__(
         self,
-        message: str,
-        *,
+        *args: Any,
         error_type: str | None = None,
         error_code: str | None = None,
         **kwargs: Any,
     ) -> None:
-        super().__init__(message, **kwargs)
+        # Litellm `APIError(status_code, message, llm_provider, model)`
+        # vs arcllm `ProviderAPIError(message, provider, model, status_code)`.
+        if args and isinstance(args[0], int):
+            status_code = args[0]
+            message = args[1] if len(args) > 1 else ""
+            provider = args[2] if len(args) > 2 else None
+            model = args[3] if len(args) > 3 else None
+            kwargs.setdefault("status_code", status_code)
+            super().__init__(message, provider, model, **kwargs)
+        else:
+            super().__init__(*args, **kwargs)
         self.error_type = error_type
         self.error_code = error_code
 
@@ -239,7 +259,7 @@ def __init__(
         self.filter_reason = filter_reason
 
 
-class InvalidRequestError(ArcLLMError):
+class BadRequestError(ArcLLMError):
     """
     Raised when the request is malformed or invalid.
 
@@ -247,16 +267,41 @@ class InvalidRequestError(ArcLLMError):
     - Missing required parameters
     - Invalid parameter values
     - Malformed message format
+
+    Litellm-compat: callers may construct this as
+    ``BadRequestError(message, model, llm_provider)`` (litellm signature
+    has ``model`` second). The base ``ArcLLMError`` has ``provider`` second.
+    We accept both shapes — if the second positional looks like a provider
+    name (registered in our provider list), treat it as ``provider``;
+    otherwise treat it as ``model``.
     """
 
     def __init__(
         self,
         message: str,
-        *,
+        arg2: str | None = None,
+        arg3: str | None = None,
+        *args: Any,
         param: str | None = None,
         **kwargs: Any,
     ) -> None:
-        super().__init__(message, **kwargs)
+        # Disambiguate (provider, model) vs litellm's (model, llm_provider).
+        # Heuristic: if arg2 is a known provider name and arg3 isn't, use
+        # arcllm's order. If arg3 is a known provider and arg2 isn't, use
+        # litellm's (model, llm_provider) order. Falls back to arcllm's order.
+        if arg2 is not None and arg3 is not None:
+            from arcllm.providers.base import SUPPORTED_PROVIDERS
+
+            if arg2 not in SUPPORTED_PROVIDERS and arg3 in SUPPORTED_PROVIDERS:
+                # Litellm shape: (message, model, llm_provider)
+                kwargs.setdefault("provider", arg3)
+                kwargs.setdefault("model", arg2)
+            else:
+                kwargs.setdefault("provider", arg2)
+                kwargs.setdefault("model", arg3)
+        elif arg2 is not None:
+            kwargs.setdefault("provider", arg2)
+        super().__init__(message, *args, **kwargs)
         self.param = param
 
 
@@ -325,7 +370,7 @@ def map_status_code_to_exception(
     if status_code == 404:
         return UnsupportedModelError(message, status_code=status_code, **kwargs)
     if status_code == 400:
-        return InvalidRequestError(message, status_code=status_code, **kwargs)
+        return BadRequestError(message, status_code=status_code, **kwargs)
     if status_code == 408:
         return TimeoutError(message, status_code=status_code, **kwargs)
     if status_code == 503:
@@ -345,4 +390,4 @@ def map_status_code_to_exception(
 # ``ProviderAPIError`` (the broader provider-error base) and
 # ``InvalidRequestError`` (400-class semantics) respectively.
 APIError = ProviderAPIError
-BadRequestError = InvalidRequestError
+InvalidRequestError = BadRequestError
diff --git a/arcllm/tokens.py b/arcllm/tokens.py
index bf9ce54..46f2493 100644
--- a/arcllm/tokens.py
+++ b/arcllm/tokens.py
@@ -159,6 +159,14 @@ def token_counter(
     models when ``arcllm-sdk[tokenize]`` is installed, otherwise a chars/4
     heuristic with a one-time warning.
 
+    For ``messages`` lists, follows OpenAI's published per-message
+    overhead formula (3 tokens per message + 3 priming tokens for the
+    final assistant turn) so counts are comparable to litellm and to
+    OpenAI's own ``tiktoken`` cookbook examples. Without the overhead,
+    arcllm would systematically undercount and downstream callers
+    (notably dynamiq's history-summarisation logic) would preserve
+    more messages than the model's context window can actually hold.
+
     Raises ``ValueError`` if both ``messages`` and ``text`` are missing.
     """
     if messages is None and text is None:
@@ -166,11 +174,42 @@ def token_counter(
     if messages is not None and text is not None:
         raise ValueError("token_counter accepts `messages` or `text`, not both")
 
-    payload = text if text is not None else _flatten_messages(messages or [])
-
-    count = _count_text_with_tiktoken(payload, model)
-    if count is not None:
-        return count
-
-    _warn_heuristic_once(model)
-    return _heuristic_count(payload)
+    if text is not None:
+        count = _count_text_with_tiktoken(text, model)
+        if count is not None:
+            return count
+        _warn_heuristic_once(model)
+        return _heuristic_count(text)
+
+    # Messages path — count each field separately and add per-message
+    # overhead so the total matches OpenAI's chat-completion accounting
+    # (and litellm's, which uses the same formula).
+    msgs = messages or []
+    per_message = 3
+    per_name = 1
+    total = 0
+    for msg in msgs:
+        total += per_message
+        for key, value in msg.items():
+            if value is None:
+                continue
+            if isinstance(value, str):
+                field_count = _count_text_with_tiktoken(value, model)
+                if field_count is None:
+                    field_count = _heuristic_count(value)
+                total += field_count
+            else:
+                # Non-string fields (content arrays for vision, tool_calls
+                # JSON, etc.) — flatten to text and count.
+                flattened = _flatten_messages([{key: value}])
+                field_count = _count_text_with_tiktoken(flattened, model)
+                if field_count is None:
+                    field_count = _heuristic_count(flattened)
+                total += field_count
+            if key == "name":
+                total += per_name
+    # Priming tokens for the assistant's reply.
+    total += 3
+    if _count_text_with_tiktoken("", model) is None:
+        _warn_heuristic_once(model)
+    return total
diff --git a/arcllm/types.py b/arcllm/types.py
index e0d8e9e..74f4100 100644
--- a/arcllm/types.py
+++ b/arcllm/types.py
@@ -274,12 +274,22 @@ def model_dump(self) -> dict[str, Any]:
 
 
 class Choice(_DictLike, msgspec.Struct):
-    """A single choice in a completion response."""
+    """A single choice in a completion response.
+
+    Litellm uses one class (``Choices``) for both non-streaming and
+    streaming responses — same object exposes ``.message`` for chat
+    completions and ``.delta`` for stream chunks. arcllm normally
+    separates these into :class:`Choice` (chat) and :class:`ChunkChoice`
+    (stream), but ``delta`` is exposed here too so litellm-style code
+    that uses ``ModelResponse`` for both modes keeps working.
+    """
 
     index: int = 0
     message: Message = msgspec.field(default_factory=Message)
     finish_reason: str | None = None
     logprobs: dict[str, Any] | None = None
+    # Litellm-compat: streaming code paths set ``.delta`` on a Choice.
+    delta: ChunkDelta | None = None
 
     def model_dump(self) -> dict[str, Any]:
         """Return dict representation for serialization."""
@@ -291,6 +301,8 @@ def model_dump(self) -> dict[str, Any]:
             result["finish_reason"] = self.finish_reason
         if self.logprobs is not None:
             result["logprobs"] = self.logprobs
+        if self.delta is not None:
+            result["delta"] = self.delta.model_dump()
         return result
 
 
@@ -320,6 +332,11 @@ class ModelResponse(_DictLike, msgspec.Struct):
     choices: list[Choice] = msgspec.field(default_factory=lambda: [Choice()])
     usage: Usage | None = None
     system_fingerprint: str | None = None
+    # Litellm-compat marker: callers construct ``ModelResponse(stream=True)``
+    # to indicate the response represents a stream chunk. arcllm normally
+    # uses :class:`StreamChunk` for that, but accepting the kwarg keeps
+    # litellm fixtures working unchanged.
+    stream: bool = False
     # Extra fields for debugging/compatibility
     model_extra: dict[str, Any] = {}
 
diff --git a/pyproject.toml b/pyproject.toml
index 33a5f60..5ad8112 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "arcllm-sdk"
-version = "0.4.6"
+version = "0.4.8"
 description = "The arc connecting you to every LLM. Minimal dependencies, maximum performance."
 readme = "README.md"
 license = "Apache-2.0"

From 4dab9eb704b8ab7958144454974fa8f6571fc83f Mon Sep 17 00:00:00 2001
From: vitalii-dynamiq <vitalii@getdynamiq.ai>
Date: Thu, 7 May 2026 12:22:58 +0400
Subject: [PATCH 2/2] feat: unify reasoning_content + thinking_blocks across
 providers (v0.4.9)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reasoning-capable models (DeepSeek-R1, GLM-4.5+, Anthropic Claude with
extended thinking, Gemini 2.5 with includeThoughts, Groq DeepSeek/Qwen,
Cerebras Qwen-thinking, Together / Fireworks DeepSeek-R1, OpenAI o-series
via chat/completions) all expose chain-of-thought, but each family uses
a different field name. Previously arcllm dropped this entirely on the
floor — callers could see the final answer but not the thinking.

This wires up a unified surface:

- Message.reasoning_content: str — flat-string CoT, populated by every
  reasoning provider
- Message.thinking_blocks: list[ThinkingBlock] — Anthropic's structured
  form (thinking | redacted_thinking, with signatures preserved for
  tool-use round-trips)
- ChunkDelta.reasoning_content / .thinking / .signature — streaming deltas

Provider mapping:
- OpenAIAdapter (and DeepSeek, GLM, Groq, Cerebras, Together, Fireworks,
  Nebius, OVHcloud, Moonshot, OpenRouter, Perplexity — all subclasses):
  reads message.reasoning_content or message.reasoning from the response;
  same for delta.reasoning_content / .reasoning in stream events.
- AnthropicAdapter: extracts content[].type=="thinking" and
  "redacted_thinking" blocks; populates both thinking_blocks (with
  signature) and a concatenated reasoning_content. Streaming handles
  thinking_delta / signature_delta with one block per signature.
- GeminiAdapter: routes parts[].thought=true text into reasoning_content
  (non-thought parts stay in content). Same split for streaming.

stream_chunk_builder accumulates reasoning across chunks and rebuilds
Anthropic's per-block grouping (signature_delta closes a block).

Verified live end-to-end through arcllm.completion:

  Z.AI GLM-4.5-air      content="5"  reasoning_len=730
  DeepSeek-R1           content="5"  reasoning_len=67
  Claude Sonnet 4.5     content="5"  reasoning_len=101  thinking_blocks=1 (sig)
  Gemini 2.5 Flash      content="5"  reasoning_len=406

Streaming verified for all four — Anthropic's thinking_delta +
signature_delta correctly group into a single ThinkingBlock with the
signature attached.

18 new unit tests cover wire-format parsing for every provider plus
stream_chunk_builder. arcllm's own suite: 792 passed (was 782).
dynamiq integration suite unaffected: 1149 unit + 1066 integration,
all passing.
---
 arcllm/__init__.py                    |   4 +-
 arcllm/core.py                        |  51 +++++
 arcllm/providers/anthropic_adapter.py |  82 ++++++-
 arcllm/providers/gemini_adapter.py    |  20 +-
 arcllm/providers/openai_adapter.py    |  13 ++
 arcllm/types.py                       |  63 ++++++
 pyproject.toml                        |   2 +-
 tests/test_reasoning.py               | 298 ++++++++++++++++++++++++++
 8 files changed, 524 insertions(+), 9 deletions(-)

diff --git a/arcllm/__init__.py b/arcllm/__init__.py
index da23918..7596cb7 100644
--- a/arcllm/__init__.py
+++ b/arcllm/__init__.py
@@ -69,7 +69,7 @@
 
 from __future__ import annotations
 
-__version__ = "0.4.8"
+__version__ = "0.4.9"
 __all__ = [
     "APIConnectionError",
     "APIError",
@@ -102,6 +102,7 @@
     "ServiceUnavailableError",
     "StreamChunk",
     "StreamingResponse",
+    "ThinkingBlock",
     "Timeout",
     "TimeoutError",
     "ToolCall",
@@ -224,6 +225,7 @@
     RerankResult,
     StreamChunk,
     StreamingResponse,
+    ThinkingBlock,
     ToolCall,
     Usage,
 )
diff --git a/arcllm/core.py b/arcllm/core.py
index a8f1ce8..dc02538 100644
--- a/arcllm/core.py
+++ b/arcllm/core.py
@@ -30,6 +30,7 @@
     ModelResponse,
     StreamChunk,
     StreamingResponse,
+    ThinkingBlock,
     ToolCall,
     Usage,
 )
@@ -548,6 +549,10 @@ def stream_chunk_builder(
     # Use specialized structure for better performance
     choice_roles: dict[int, str | None] = {}
     choice_content: dict[int, list[str]] = {}
+    choice_reasoning: dict[int, list[str]] = {}
+    # Anthropic-style: per-choice ordered list of (thinking_text, signature)
+    # blocks rebuilt from the stream so callers can replay them as input.
+    choice_thinking_blocks: dict[int, list[list[str]]] = {}
     choice_tool_calls: dict[
         int, dict[int, list[Any]]
     ] = {}  # idx -> tc_idx -> [id, type, name_parts, arg_parts]
@@ -571,6 +576,8 @@ def stream_chunk_builder(
             if idx not in choice_content:
                 choice_roles[idx] = None
                 choice_content[idx] = []
+                choice_reasoning[idx] = []
+                choice_thinking_blocks[idx] = []
                 choice_tool_calls[idx] = {}
                 choice_finish[idx] = None
                 choice_logprobs[idx] = None
@@ -584,6 +591,26 @@ def stream_chunk_builder(
             if delta_content:
                 choice_content[idx].append(delta_content)
 
+            # Reasoning (DeepSeek/GLM/o-series style — flat string deltas).
+            delta_reasoning = delta.reasoning_content
+            if delta_reasoning:
+                choice_reasoning[idx].append(delta_reasoning)
+
+            # Anthropic-style thinking deltas — group by current open block.
+            # A new block starts whenever a thinking delta arrives after a
+            # signature delta (or first thinking delta of the stream).
+            delta_thinking = delta.thinking
+            delta_signature = delta.signature
+            if delta_thinking is not None or delta_signature is not None:
+                blocks = choice_thinking_blocks[idx]
+                if not blocks or (blocks and blocks[-1][1]):
+                    # Last block is closed (has signature) — start a new one.
+                    blocks.append(["", ""])
+                if delta_thinking:
+                    blocks[-1][0] += delta_thinking
+                if delta_signature:
+                    blocks[-1][1] = delta_signature
+
             choice_finish_reason = choice.finish_reason
             if choice_finish_reason:
                 choice_finish[idx] = choice_finish_reason
@@ -645,10 +672,34 @@ def stream_chunk_builder(
         content_parts = choice_content[idx]
         content = "".join(content_parts) if content_parts else None
 
+        reasoning_parts = choice_reasoning[idx]
+        reasoning_content = "".join(reasoning_parts) if reasoning_parts else None
+
+        thinking_blocks_assembled: list[ThinkingBlock] | None = None
+        if choice_thinking_blocks[idx]:
+            thinking_blocks_assembled = [
+                ThinkingBlock(
+                    type="thinking",
+                    thinking=text,
+                    signature=sig or None,
+                )
+                for text, sig in choice_thinking_blocks[idx]
+                if text or sig
+            ] or None
+            # Fallback to populate the flat surface when only thinking blocks
+            # arrived (Anthropic) — concatenate their text so callers reading
+            # ``reasoning_content`` see the same string regardless of provider.
+            if reasoning_content is None and thinking_blocks_assembled is not None:
+                reasoning_content = (
+                    "".join(b.thinking or "" for b in thinking_blocks_assembled) or None
+                )
+
         message = Message(
             role=choice_roles[idx] or "assistant",
             content=content,
             tool_calls=tool_calls or None,
+            reasoning_content=reasoning_content,
+            thinking_blocks=thinking_blocks_assembled,
         )
 
         choices.append(
diff --git a/arcllm/providers/anthropic_adapter.py b/arcllm/providers/anthropic_adapter.py
index 9db63e8..3a20591 100644
--- a/arcllm/providers/anthropic_adapter.py
+++ b/arcllm/providers/anthropic_adapter.py
@@ -97,6 +97,7 @@
     Message,
     ModelResponse,
     StreamChunk,
+    ThinkingBlock,
     ToolCall,
     Usage,
 )
@@ -450,9 +451,11 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon
         now = int(time.time())
         content_blocks = resp.get("content", [])
 
-        # Extract text content and tool uses
+        # Extract text content, tool uses, and thinking blocks
         text_parts: list[str] = []
         tool_calls: list[ToolCall] = []
+        thinking_blocks: list[ThinkingBlock] = []
+        thinking_text_parts: list[str] = []
         # Citations are sourced from two places in Anthropic responses:
         #   - ``web_search_tool_result`` blocks: aggregate result list with
         #     ``url`` / ``title`` / ``snippet`` per source.
@@ -485,11 +488,29 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon
                     end_index=ann_dict.get("end_index") or ann_dict.get("end_char_index"),
                 )
 
-        # Second pass: tool uses + web_search_tool_result fallback (only fills
-        # URLs that the text-block annotations didn't already cover).
+        # Second pass: tool uses, thinking blocks, and web_search_tool_result
+        # fallback (only fills URLs the text-block annotations didn't cover).
         for block in content_blocks:
             kind = block.get("type")
-            if kind == "tool_use":
+            if kind == "thinking":
+                thinking_text = block.get("thinking", "")
+                thinking_blocks.append(
+                    ThinkingBlock(
+                        type="thinking",
+                        thinking=thinking_text,
+                        signature=block.get("signature"),
+                    )
+                )
+                if thinking_text:
+                    thinking_text_parts.append(thinking_text)
+            elif kind == "redacted_thinking":
+                thinking_blocks.append(
+                    ThinkingBlock(
+                        type="redacted_thinking",
+                        data=block.get("data"),
+                    )
+                )
+            elif kind == "tool_use":
                 tool_calls.append(
                     ToolCall(
                         id=block.get("id", ""),
@@ -518,12 +539,15 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon
         # Join text parts efficiently
         text_content = "".join(text_parts) if text_parts else None
         citations = list(citation_index.values()) if citation_index else None
+        reasoning_content = "".join(thinking_text_parts) if thinking_text_parts else None
 
         message = Message(
             role=resp.get("role", "assistant"),
             content=text_content,
             tool_calls=tool_calls or None,
             citations=citations,
+            reasoning_content=reasoning_content,
+            thinking_blocks=thinking_blocks or None,
         )
 
         # Map Anthropic stop reasons to OpenAI format
@@ -617,6 +641,21 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None:
                         )
                     ],
                 )
+            if block.get("type") == "thinking":
+                # Anthropic emits an empty thinking block first, then a
+                # series of thinking_delta events with the text, then a
+                # signature_delta with the cryptographic signature.
+                return StreamChunk(
+                    id="",
+                    model=model,
+                    choices=[
+                        ChunkChoice(
+                            index=0,
+                            delta=ChunkDelta(thinking=block.get("thinking", "")),
+                            finish_reason=None,
+                        )
+                    ],
+                )
             if block.get("type") == "tool_use":
                 # Start of tool use
                 return StreamChunk(
@@ -645,7 +684,8 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None:
 
         elif event_type == "content_block_delta":
             delta = event.get("delta", {})
-            if delta.get("type") == "text_delta":
+            delta_type = delta.get("type")
+            if delta_type == "text_delta":
                 return StreamChunk(
                     id="",
                     model=model,
@@ -657,6 +697,38 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None:
                         )
                     ],
                 )
+            if delta_type == "thinking_delta":
+                # Surface as both ``thinking`` (matches Anthropic wire shape
+                # for round-trip) and ``reasoning_content`` (so callers using
+                # the unified surface can stream thinking text without a
+                # provider-specific code path).
+                thinking_text = delta.get("thinking", "")
+                return StreamChunk(
+                    id="",
+                    model=model,
+                    choices=[
+                        ChunkChoice(
+                            index=0,
+                            delta=ChunkDelta(
+                                thinking=thinking_text,
+                                reasoning_content=thinking_text,
+                            ),
+                            finish_reason=None,
+                        )
+                    ],
+                )
+            if delta_type == "signature_delta":
+                return StreamChunk(
+                    id="",
+                    model=model,
+                    choices=[
+                        ChunkChoice(
+                            index=0,
+                            delta=ChunkDelta(signature=delta.get("signature", "")),
+                            finish_reason=None,
+                        )
+                    ],
+                )
             if delta.get("type") == "input_json_delta":
                 # Tool argument delta
                 return StreamChunk(
diff --git a/arcllm/providers/gemini_adapter.py b/arcllm/providers/gemini_adapter.py
index 111a007..785118e 100644
--- a/arcllm/providers/gemini_adapter.py
+++ b/arcllm/providers/gemini_adapter.py
@@ -420,11 +420,19 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon
 
             # Use list + join for efficient string building
             text_parts: list[str] = []
+            thought_parts: list[str] = []
             tool_calls: list[ToolCall] = []
 
             for part in parts:
                 if "text" in part:
-                    text_parts.append(part["text"])
+                    # Gemini marks chain-of-thought parts with ``thought: true``
+                    # when the request set ``thinkingConfig.includeThoughts``.
+                    # We split those out into ``reasoning_content`` so callers
+                    # don't have to filter them out of the answer text.
+                    if part.get("thought"):
+                        thought_parts.append(part["text"])
+                    else:
+                        text_parts.append(part["text"])
                 elif "functionCall" in part:
                     fc = part["functionCall"]
                     tool_calls.append(
@@ -439,12 +447,14 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon
                     )
 
             text_content = "".join(text_parts) if text_parts else None
+            reasoning_content = "".join(thought_parts) if thought_parts else None
             citations = _extract_grounding_citations(candidate)
             message = Message(
                 role="assistant",
                 content=text_content,
                 tool_calls=tool_calls or None,
                 citations=citations,
+                reasoning_content=reasoning_content,
             )
 
             # Map finish reason
@@ -507,11 +517,15 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None:
 
             # Use list + join for efficient string building
             text_parts: list[str] = []
+            thought_parts: list[str] = []
             tool_call_deltas: list[dict[str, Any]] = []
 
             for part in parts:
                 if "text" in part:
-                    text_parts.append(part["text"])
+                    if part.get("thought"):
+                        thought_parts.append(part["text"])
+                    else:
+                        text_parts.append(part["text"])
                 elif "functionCall" in part:
                     fc = part["functionCall"]
                     tool_call_deltas.append(
@@ -527,9 +541,11 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None:
                     )
 
             text_content = "".join(text_parts) if text_parts else None
+            reasoning_content = "".join(thought_parts) if thought_parts else None
             delta = ChunkDelta(
                 content=text_content,
                 tool_calls=tool_call_deltas or None,
+                reasoning_content=reasoning_content,
             )
 
             finish_reason = None
diff --git a/arcllm/providers/openai_adapter.py b/arcllm/providers/openai_adapter.py
index 823899c..3e75e18 100644
--- a/arcllm/providers/openai_adapter.py
+++ b/arcllm/providers/openai_adapter.py
@@ -231,12 +231,23 @@ def _build_model_response(self, resp: dict[str, Any], model: str) -> ModelRespon
                     arguments=fc.get("arguments", ""),
                 )
 
+            # ``reasoning_content`` is the de-facto field name used by
+            # DeepSeek-R1, GLM-4.5+, Groq's DeepSeek/Qwen-thinking models,
+            # Cerebras, Together, Fireworks, and any OpenAI-compat host
+            # serving a reasoning model. ``reasoning`` is the alias
+            # OpenAI ships on the chat-completions endpoint for o-series
+            # responses; we normalise to one field, preferring ``reasoning_content``.
+            reasoning_content = message_data.get("reasoning_content") or message_data.get(
+                "reasoning"
+            )
+
             message = Message(
                 role=message_data.get("role", "assistant"),
                 content=message_data.get("content"),
                 tool_calls=tool_calls,
                 function_call=function_call,
                 refusal=message_data.get("refusal"),
+                reasoning_content=reasoning_content,
             )
 
             choices.append(
@@ -303,6 +314,8 @@ def parse_stream_event(self, data: str, model: str) -> StreamChunk | None:
                 content=delta_data.get("content"),
                 tool_calls=tool_calls,
                 function_call=delta_data.get("function_call"),
+                reasoning_content=delta_data.get("reasoning_content")
+                or delta_data.get("reasoning"),
             )
 
             choices.append(
diff --git a/arcllm/types.py b/arcllm/types.py
index 74f4100..1ebe061 100644
--- a/arcllm/types.py
+++ b/arcllm/types.py
@@ -186,6 +186,44 @@ def model_dump(self) -> dict[str, Any]:
         return result
 
 
+# =============================================================================
+# Reasoning / Thinking Types
+# =============================================================================
+
+
+class ThinkingBlock(_DictLike, msgspec.Struct):
+    """A chain-of-thought block from a reasoning-capable model.
+
+    Anthropic's extended-thinking feature returns thinking as structured
+    content blocks (``type: "thinking"`` with a ``signature`` for replay,
+    or ``type: "redacted_thinking"`` with opaque ``data``). Preserving the
+    block shape — instead of flattening to a string — lets callers send
+    the thinking back as part of the conversation history without losing
+    the signature, which is required for tool-use round trips.
+
+    For providers that emit thinking as a flat string (DeepSeek-R1, GLM,
+    Groq DeepSeek/Qwen-thinking, Gemini with ``includeThoughts=true``)
+    we populate :attr:`Message.reasoning_content` directly instead.
+    """
+
+    type: Literal["thinking", "redacted_thinking"] = "thinking"
+    thinking: str | None = None
+    signature: str | None = None
+    # Anthropic-specific: opaque payload for ``redacted_thinking`` blocks.
+    data: str | None = None
+
+    def model_dump(self) -> dict[str, Any]:
+        """Return dict representation for serialization."""
+        result: dict[str, Any] = {"type": self.type}
+        if self.thinking is not None:
+            result["thinking"] = self.thinking
+        if self.signature is not None:
+            result["signature"] = self.signature
+        if self.data is not None:
+            result["data"] = self.data
+        return result
+
+
 # =============================================================================
 # Message Types
 # =============================================================================
@@ -204,6 +242,14 @@ class Message(_DictLike, msgspec.Struct):
     # responses; an empty list means "the provider was asked to ground but
     # returned no sources" (rare).
     citations: list[Citation] | None = None
+    # Chain-of-thought / extended-thinking output from reasoning models.
+    # ``reasoning_content`` is the unified flat-string surface (populated
+    # by DeepSeek-R1, GLM-4.5+, Gemini 2.5+ with includeThoughts, OpenAI
+    # o-series via chat/completions when supported, etc.). For Anthropic
+    # extended thinking we also populate ``thinking_blocks`` so callers
+    # can send the structured form back with signatures intact.
+    reasoning_content: str | None = None
+    thinking_blocks: list[ThinkingBlock] | None = None
 
     def model_dump(self) -> dict[str, Any]:
         """Return dict representation for serialization."""
@@ -218,6 +264,10 @@ def model_dump(self) -> dict[str, Any]:
             result["refusal"] = self.refusal
         if self.citations is not None:
             result["citations"] = [c.model_dump() for c in self.citations]
+        if self.reasoning_content is not None:
+            result["reasoning_content"] = self.reasoning_content
+        if self.thinking_blocks is not None:
+            result["thinking_blocks"] = [b.model_dump() for b in self.thinking_blocks]
         return result
 
 
@@ -372,6 +422,13 @@ class ChunkDelta(_DictLike, msgspec.Struct):
     # for grounded providers — Perplexity, Gemini grounding, Anthropic
     # web-search). None on intermediate chunks.
     citations: list[Citation] | None = None
+    # Reasoning deltas. ``reasoning_content`` is the flat-string surface
+    # (DeepSeek-R1, GLM, Groq, etc.). ``thinking`` carries the per-chunk
+    # text of an Anthropic ``thinking_delta`` event; ``signature`` carries
+    # the value of the trailing ``signature_delta`` that closes that block.
+    reasoning_content: str | None = None
+    thinking: str | None = None
+    signature: str | None = None
 
     def model_dump(self) -> dict[str, Any]:
         """Return dict representation for serialization."""
@@ -386,6 +443,12 @@ def model_dump(self) -> dict[str, Any]:
             result["function_call"] = self.function_call
         if self.citations is not None:
             result["citations"] = [c.model_dump() for c in self.citations]
+        if self.reasoning_content is not None:
+            result["reasoning_content"] = self.reasoning_content
+        if self.thinking is not None:
+            result["thinking"] = self.thinking
+        if self.signature is not None:
+            result["signature"] = self.signature
         return result
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 5ad8112..d5ade71 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "arcllm-sdk"
-version = "0.4.8"
+version = "0.4.9"
 description = "The arc connecting you to every LLM. Minimal dependencies, maximum performance."
 readme = "README.md"
 license = "Apache-2.0"
diff --git a/tests/test_reasoning.py b/tests/test_reasoning.py
index 23aed12..d3c6196 100644
--- a/tests/test_reasoning.py
+++ b/tests/test_reasoning.py
@@ -173,3 +173,301 @@ def test_no_thinking_args_leaves_config_clean(self, gemini_adapter: GeminiAdapte
         # exist, it must not carry thinkingConfig.
         if "generationConfig" in body:
             assert "thinkingConfig" not in body["generationConfig"]
+
+
+# ---------------------------------------------------------------------------
+# Response-side: reasoning_content / thinking_blocks parsing
+# ---------------------------------------------------------------------------
+#
+# Reasoning models expose chain-of-thought differently per family. arcllm
+# normalises the readable text into ``Message.reasoning_content`` (flat str). For
+# Anthropic we additionally keep ``Message.thinking_blocks`` so callers can
+# replay the structured form (with signatures) on the next turn.
+
+
+class TestReasoningResponseExtraction:
+    """``parse_response`` populates ``reasoning_content`` + ``thinking_blocks``."""
+
+    def test_openai_reasoning_field_is_extracted(self, openai_adapter: OpenAIAdapter) -> None:
+        """OpenAI o-series chat/completions can return ``reasoning`` on message."""
+        body = orjson.dumps(
+            {
+                "id": "x",
+                "model": "o3-mini",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": "42",
+                            "reasoning": "Counting Hitchhiker references...",
+                        },
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+        )
+        resp = openai_adapter.parse_response(body, model="o3-mini")
+        msg = resp.choices[0].message
+        assert msg.content == "42"
+        assert msg.reasoning_content == "Counting Hitchhiker references..."
+
+    def test_deepseek_style_reasoning_content_is_extracted(
+        self, openai_adapter: OpenAIAdapter
+    ) -> None:
+        """DeepSeek-R1 / GLM / Groq DeepSeek / Together / Fireworks DeepSeek-R1
+        all use the ``reasoning_content`` field. Test through the OpenAI base
+        since every OpenAI-compat host inherits this parser."""
+        body = orjson.dumps(
+            {
+                "id": "x",
+                "model": "deepseek-reasoner",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": "ok",
+                            "reasoning_content": "Let me think... yes, ok.",
+                        },
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+        )
+        resp = openai_adapter.parse_response(body, model="deepseek-reasoner")
+        msg = resp.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "Let me think... yes, ok."
+
+    def test_anthropic_thinking_blocks_preserved_with_signature(
+        self, anthropic_adapter: AnthropicAdapter
+    ) -> None:
+        """Anthropic extended-thinking returns structured blocks. The signature
+        must round-trip — replaying without it breaks tool-use."""
+        body = orjson.dumps(
+            {
+                "id": "msg_x",
+                "type": "message",
+                "role": "assistant",
+                "model": "claude-sonnet-4-7",
+                "stop_reason": "end_turn",
+                "content": [
+                    {
+                        "type": "thinking",
+                        "thinking": "User wants ok. Reply ok.",
+                        "signature": "sig_abc123",
+                    },
+                    {"type": "text", "text": "ok"},
+                ],
+                "usage": {"input_tokens": 5, "output_tokens": 12},
+            }
+        )
+        resp = anthropic_adapter.parse_response(body, model="claude-sonnet-4-7")
+        msg = resp.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "User wants ok. Reply ok."
+        assert msg.thinking_blocks is not None
+        assert len(msg.thinking_blocks) == 1
+        block = msg.thinking_blocks[0]
+        assert block.type == "thinking"
+        assert block.thinking == "User wants ok. Reply ok."
+        assert block.signature == "sig_abc123"
+
+    def test_anthropic_redacted_thinking_block_preserves_opaque_data(
+        self, anthropic_adapter: AnthropicAdapter
+    ) -> None:
+        """``redacted_thinking`` blocks have no readable text — only an opaque
+        payload that must round-trip back unchanged. They surface on
+        ``thinking_blocks`` but contribute nothing to ``reasoning_content``."""
+        body = orjson.dumps(
+            {
+                "id": "msg_x",
+                "role": "assistant",
+                "model": "claude-sonnet-4-7",
+                "stop_reason": "end_turn",
+                "content": [
+                    {"type": "redacted_thinking", "data": "OPAQUE_BLOB"},
+                    {"type": "text", "text": "ok"},
+                ],
+                "usage": {"input_tokens": 5, "output_tokens": 1},
+            }
+        )
+        resp = anthropic_adapter.parse_response(body, model="claude-sonnet-4-7")
+        msg = resp.choices[0].message
+        assert msg.reasoning_content is None
+        assert msg.thinking_blocks is not None
+        assert msg.thinking_blocks[0].type == "redacted_thinking"
+        assert msg.thinking_blocks[0].data == "OPAQUE_BLOB"
+
+    def test_gemini_thought_parts_route_to_reasoning_content(
+        self, gemini_adapter: GeminiAdapter
+    ) -> None:
+        """Gemini 2.5+ marks chain-of-thought parts with ``thought: true``.
+
+        Without this split, the thought text would land in ``content`` and
+        the caller would have to filter it out manually."""
+        body = orjson.dumps(
+            {
+                "candidates": [
+                    {
+                        "content": {
+                            "parts": [
+                                {"text": "User wants ok.", "thought": True},
+                                {"text": "ok"},
+                            ]
+                        },
+                        "finishReason": "STOP",
+                    }
+                ],
+                "usageMetadata": {"promptTokenCount": 5, "candidatesTokenCount": 1},
+            }
+        )
+        resp = gemini_adapter.parse_response(body, model="gemini-2.5-pro")
+        msg = resp.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "User wants ok."
+
+    def test_non_reasoning_response_leaves_fields_none(self, openai_adapter: OpenAIAdapter) -> None:
+        """Regular chat responses (no reasoning fields) must not invent them."""
+        body = orjson.dumps(
+            {
+                "id": "x",
+                "model": "gpt-4o-mini",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {"role": "assistant", "content": "ok"},
+                        "finish_reason": "stop",
+                    }
+                ],
+            }
+        )
+        resp = openai_adapter.parse_response(body, model="gpt-4o-mini")
+        msg = resp.choices[0].message
+        assert msg.reasoning_content is None
+        assert msg.thinking_blocks is None
+
+
+class TestReasoningStreamAccumulation:
+    """``stream_chunk_builder`` accumulates reasoning across chunks."""
+
+    def test_flat_reasoning_content_accumulates(self) -> None:
+        """DeepSeek/GLM/Groq style: reasoning_content arrives in deltas."""
+        from arcllm.core import stream_chunk_builder
+        from arcllm.types import ChunkChoice, ChunkDelta, StreamChunk
+
+        chunks = [
+            StreamChunk(
+                id="x",
+                model="deepseek-reasoner",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(role="assistant"))],
+            ),
+            StreamChunk(
+                id="x",
+                model="deepseek-reasoner",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(reasoning_content="Let me "))],
+            ),
+            StreamChunk(
+                id="x",
+                model="deepseek-reasoner",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(reasoning_content="think."))],
+            ),
+            StreamChunk(
+                id="x",
+                model="deepseek-reasoner",
+                choices=[
+                    ChunkChoice(index=0, delta=ChunkDelta(content="ok"), finish_reason="stop")
+                ],
+            ),
+        ]
+        final = stream_chunk_builder(chunks)
+        msg = final.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "Let me think."
+        assert msg.thinking_blocks is None
+
+    def test_anthropic_thinking_deltas_grouped_by_signature(self) -> None:
+        """Anthropic streaming: thinking_delta → thinking_delta → signature_delta
+        is one block. The next thinking_delta opens a new block."""
+        from arcllm.core import stream_chunk_builder
+        from arcllm.types import ChunkChoice, ChunkDelta, StreamChunk
+
+        chunks = [
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(role="assistant"))],
+            ),
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[
+                    ChunkChoice(
+                        index=0,
+                        delta=ChunkDelta(thinking="User wants ", reasoning_content="User wants "),
+                    )
+                ],
+            ),
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[
+                    ChunkChoice(
+                        index=0,
+                        delta=ChunkDelta(thinking="ok.", reasoning_content="ok."),
+                    )
+                ],
+            ),
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[ChunkChoice(index=0, delta=ChunkDelta(signature="sig_abc"))],
+            ),
+            StreamChunk(
+                id="x",
+                model="claude-sonnet-4-7",
+                choices=[
+                    ChunkChoice(index=0, delta=ChunkDelta(content="ok"), finish_reason="stop")
+                ],
+            ),
+        ]
+        final = stream_chunk_builder(chunks)
+        msg = final.choices[0].message
+        assert msg.content == "ok"
+        assert msg.reasoning_content == "User wants ok."
+        assert msg.thinking_blocks is not None
+        assert len(msg.thinking_blocks) == 1
+        assert msg.thinking_blocks[0].thinking == "User wants ok."
+        assert msg.thinking_blocks[0].signature == "sig_abc"
+
+
+class TestReasoningSerialization:
+    """``Message.model_dump`` emits reasoning fields only when they are set."""
+
+    def test_dump_includes_reasoning_when_set(self) -> None:
+        from arcllm.types import Message, ThinkingBlock
+
+        msg = Message(
+            role="assistant",
+            content="ok",
+            reasoning_content="thinking text",
+            thinking_blocks=[
+                ThinkingBlock(type="thinking", thinking="thinking text", signature="s")
+            ],
+        )
+        dumped = msg.model_dump()
+        assert dumped["reasoning_content"] == "thinking text"
+        assert dumped["thinking_blocks"] == [
+            {"type": "thinking", "thinking": "thinking text", "signature": "s"}
+        ]
+
+    def test_dump_omits_reasoning_when_absent(self) -> None:
+        """Don't emit empty reasoning fields — keeps the serialised shape lean
+        and matches OpenAI/litellm behaviour for non-reasoning responses."""
+        from arcllm.types import Message
+
+        msg = Message(role="assistant", content="ok")
+        dumped = msg.model_dump()
+        assert "reasoning_content" not in dumped
+        assert "thinking_blocks" not in dumped