diff --git a/.cursor/rules/project-overview.mdc b/.cursor/rules/project-overview.mdc index 1b724ed..418f5c2 100644 --- a/.cursor/rules/project-overview.mdc +++ b/.cursor/rules/project-overview.mdc @@ -30,6 +30,7 @@ SAFE_COMPLETE is an intentional governance choice, **not an error**. | `moralstack/runtime/` | Orchestrator, deliberation loop; simulator includes semantic layer (harm_type, semantic_expected_harm) that influences deliberation; simulator module with config_loader for env (MORALSTACK_SIMULATOR_*); hindsight module with config_loader for env (MORALSTACK_HINDSIGHT_*); perspective module with config_loader for env (MORALSTACK_PERSPECTIVES_*); critic module with config_loader for env (MORALSTACK_CRITIC_*) | | `moralstack/runtime/decision/` | safe_complete_policy (single source of truth for action bounds, final_action) | | `moralstack/runtime/trace/` | DecisionTrace, append_decision_trace for audit | +| `moralstack/models/policy.py` | `OpenAIPolicy` (generate / rewrite / refuse); env `OPENAI_*`; optional `MORALSTACK_POLICY_REWRITE_MODEL` for deliberative `rewrite()` (defaults to primary model when unset) | | `moralstack/models/risk/` | Risk estimation (schema, estimator, calibration, parse_result, config_loader for env) | | `moralstack/models/decision_explanation.py` | DecisionExplanation dataclass for explainability | | `moralstack/models/reason_codes.py` | ReasonCode enum and policy-to-reason mapping | diff --git a/.env.minimal b/.env.minimal index 062381b..7adf8a8 100644 --- a/.env.minimal +++ b/.env.minimal @@ -12,6 +12,8 @@ # ----------------------------------------------------------------------------- OPENAI_API_KEY= OPENAI_MODEL=gpt-4o +# Lighter model for policy rewrites (omit variable entirely to use OPENAI_MODEL) +MORALSTACK_POLICY_REWRITE_MODEL=gpt-4.1-nano OPENAI_TEMPERATURE=0.1 OPENAI_TOP_P=0.8 @@ -52,7 +54,7 @@ MORALSTACK_UI_PASSWORD=admin # ----------------------------------------------------------------------------- # See 
docs/modules/risk_estimator.md for full documentation of each variable. # Model for the semantic judge (if set, overrides OPENAI_MODEL for risk only) -MORALSTACK_RISK_MODEL=gpt-4o +MORALSTACK_RISK_MODEL=gpt-4o-mini # if MORALSTACK_RISK_PARALLEL_ESTIMATORS = true then the following models are used for parallel estimation MORALSTACK_RISK_INTENT_MODEL=gpt-4o MORALSTACK_RISK_SIGNALS_MODEL=gpt-4o-mini @@ -111,7 +113,7 @@ MORALSTACK_CRITIC_INCLUDE_EXAMPLES=false # ----------------------------------------------------------------------------- # See docs/modules/simulator.md for full documentation of each variable. # Model for consequence simulator (if set, overrides OPENAI_MODEL for simulator only; used in run and benchmark) -MORALSTACK_SIMULATOR_MODEL=gpt-4o-mini +MORALSTACK_SIMULATOR_MODEL=gpt-4.1-nano MORALSTACK_SIMULATOR_MAX_RETRIES=3 MORALSTACK_SIMULATOR_MAX_TOKENS=384 MORALSTACK_SIMULATOR_TEMPERATURE=0.1 @@ -125,7 +127,7 @@ MORALSTACK_SIMULATOR_ENABLE_CACHING=true # ----------------------------------------------------------------------------- # See docs/modules/hindsight.md for full documentation of each variable. # Model for hindsight evaluator (if set, overrides OPENAI_MODEL for hindsight only; used in run and benchmark) -MORALSTACK_HINDSIGHT_MODEL=gpt-4o +MORALSTACK_HINDSIGHT_MODEL=gpt-4o-mini MORALSTACK_HINDSIGHT_MAX_RETRIES=3 MORALSTACK_HINDSIGHT_MAX_TOKENS=768 MORALSTACK_HINDSIGHT_TEMPERATURE=0.2 @@ -164,6 +166,11 @@ MORALSTACK_ORCHESTRATOR_ENABLE_HINDSIGHT_GATING=true MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD=0.4 MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD=100 MORALSTACK_ORCHESTRATOR_BORDERLINE_REFUSE_UPPER=0.95 +# When true and PARALLEL_MODULE_CALLS=true, critic runs in parallel with sim+persp instead of gating them sequentially. +# Hard violations are still honoured (sim/persp results discarded). Saves ~3.5s per cycle. 
+MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES=true +# When true, risk estimation and draft generation run in parallel (speculative overlap). Saves ~4s on deliberative path. +MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION=true # ----------------------------------------------------------------------------- # Tracing & Debug diff --git a/.env.template b/.env.template index e5a5dde..96d99e0 100644 --- a/.env.template +++ b/.env.template @@ -12,6 +12,11 @@ # ----------------------------------------------------------------------------- OPENAI_API_KEY= OPENAI_MODEL=gpt-4o +# Model used for policy rewrite operations (cycle 2+). +# Uses a lighter model by default to reduce latency on revisions +# triggered by soft violations. Set to same value as OPENAI_MODEL +# to disable the downgrade. +MORALSTACK_POLICY_REWRITE_MODEL=gpt-4.1-nano # OPENAI_BASE_URL=https://your-proxy.example.com/v1 # OPENAI_TIMEOUT_MS=60000 # OPENAI_MAX_RETRIES=3 @@ -55,7 +60,7 @@ MORALSTACK_UI_PASSWORD= # ----------------------------------------------------------------------------- # See docs/modules/risk_estimator.md for full documentation of each variable. # Model for the semantic judge (if set, overrides OPENAI_MODEL for risk only) -# MORALSTACK_RISK_MODEL=gpt-4o +# MORALSTACK_RISK_MODEL=gpt-4o-mini # if MORALSTACK_RISK_PARALLEL_ESTIMATORS = true then the following models are used for parallel estimation # MORALSTACK_RISK_INTENT_MODEL=gpt-4o # MORALSTACK_RISK_SIGNALS_MODEL=gpt-4o-mini @@ -115,7 +120,7 @@ MORALSTACK_UI_PASSWORD= # ----------------------------------------------------------------------------- # See docs/modules/simulator.md for full documentation of each variable. 
# Model for consequence simulator (if set, overrides OPENAI_MODEL for simulator only; used in run and benchmark) -# MORALSTACK_SIMULATOR_MODEL=gpt-4o-mini +# MORALSTACK_SIMULATOR_MODEL=gpt-4.1-nano # MORALSTACK_SIMULATOR_MAX_RETRIES=3 # MORALSTACK_SIMULATOR_MAX_TOKENS=384 # MORALSTACK_SIMULATOR_TEMPERATURE=0.1 @@ -129,7 +134,7 @@ MORALSTACK_UI_PASSWORD= # ----------------------------------------------------------------------------- # See docs/modules/hindsight.md for full documentation of each variable. # Model for hindsight evaluator (if set, overrides OPENAI_MODEL for hindsight only; used in run and benchmark) -# MORALSTACK_HINDSIGHT_MODEL=gpt-4o +# MORALSTACK_HINDSIGHT_MODEL=gpt-4o-mini # MORALSTACK_HINDSIGHT_MAX_RETRIES=3 # MORALSTACK_HINDSIGHT_MAX_TOKENS=768 # MORALSTACK_HINDSIGHT_TEMPERATURE=0.2 @@ -169,6 +174,11 @@ MORALSTACK_UI_PASSWORD= # MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD=0.4 # MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD=100 # MORALSTACK_ORCHESTRATOR_BORDERLINE_REFUSE_UPPER=0.95 +# When true and PARALLEL_MODULE_CALLS=true, critic runs in parallel with sim+persp instead of gating them sequentially. +# Hard violations are still honoured (sim/persp results discarded). Saves ~3.5s per cycle. +# MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES=true +# When true, risk estimation and draft generation run in parallel (speculative overlap). Saves ~4s on deliberative path. 
+# MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION=true # ----------------------------------------------------------------------------- # Tracing & Debug diff --git a/INSTALL.md b/INSTALL.md index 674f9d1..73f0f59 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -80,6 +80,7 @@ See [docs/modules/openai_params.md](docs/modules/openai_params.md) for details a |--------------------------------|---------------------------|----------------------------------------------------------------| | OPENAI_API_KEY | - | OpenAI API key (required) | | OPENAI_MODEL | gpt-4o | OpenAI model (see [Model compatibility](#model-compatibility)) | +| MORALSTACK_POLICY_REWRITE_MODEL | - (same as OPENAI_MODEL) | Policy `rewrite()` at cycle 2+; `.env.template` uses `gpt-4.1-nano`; set any lighter model to reduce latency (see [policy.md](docs/modules/policy.md)) | | OPENAI_BASE_URL | - | Base URL (proxy/enterprise) | | OPENAI_TIMEOUT_MS | 60000 | Timeout in milliseconds | | OPENAI_MAX_RETRIES | 3 | Retries on 429/503 | @@ -129,10 +130,12 @@ both CLI run and benchmark, hindsight configuration and model are read only from CLI override — env is the single source of configuration.** **Orchestrator**: Optional overrides (e.g. `MORALSTACK_ORCHESTRATOR_MAX_DELIBERATION_CYCLES`, -`MORALSTACK_ORCHESTRATOR_TIMEOUT_MS`, `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`, …) are listed in `.env.template` -and fully documented in [docs/modules/orchestrator.md](docs/modules/orchestrator.md#environment-variables). Leave them -commented to use built-in defaults. 
**In both CLI run and benchmark, orchestrator configuration is read only from the -environment (`.env`); there is no CLI override — env is the single source of configuration.** +`MORALSTACK_ORCHESTRATOR_TIMEOUT_MS`, `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`, +`MORALSTACK_ORCHESTRATOR_PARALLEL_MODULE_CALLS`, `MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES`, +`MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION`, …) are listed in `.env.template` and fully documented in +[docs/modules/orchestrator.md](docs/modules/orchestrator.md#environment-variables). Leave them commented to use built-in +defaults. **In both CLI run and benchmark, orchestrator configuration is read only from the environment (`.env`); there +is no CLI override — env is the single source of configuration.** **Benchmark**: Optional overrides (e.g. `MORALSTACK_BENCHMARK_OUTPUTS`, `MORALSTACK_BENCHMARK_BASELINE_MODEL`, `MORALSTACK_BENCHMARK_JUDGE_MODEL`) are listed in `.env.template` and fully documented diff --git a/README.md b/README.md index 16457c8..abdcfe4 100644 --- a/README.md +++ b/README.md @@ -87,8 +87,8 @@ Evaluated on 84 questions spanning adversarial prompts, dual-use domains, regula | | Baseline | MoralStack | Tie | |---|---|---|---| -| **Wins** | 4 | **54** | 26 | -| **Avg Safety Score** | 7.92/10 | **9.27/10** | — | +| **Wins** | 1 | **53** | 30 | +| **Avg Safety Score** | 7.73/10 | **9.39/10** | — | ### Decision Accuracy @@ -109,9 +109,9 @@ REFUSE 0 0 22 | | Baseline | MoralStack | |---|---|---| -| **Avg Latency** | ~10s | ~70s | +| **Avg Latency** | ~5s | ~60s | -Deliberative paths add latency by design (see [Limitations](#limitations--trade-offs)). +Deliberative paths add latency by design. Latency-reducing optimizations include speculative decoding, parallel risk estimation, lighter models for simulator and policy rewrite (see [Limitations](#limitations--trade-offs) and [Configuration](#configuration)). 
## Quickstart @@ -180,6 +180,7 @@ Environment is loaded via `moralstack/utils/env_loader.py`. Key variables: - `OPENAI_MODEL` (default `gpt-4o`) +- `MORALSTACK_POLICY_REWRITE_MODEL` (optional; model for deliberative `rewrite()` at cycle 2+; if unset, same as `OPENAI_MODEL`. `.env.template` sets `gpt-4.1-nano` for lower rewrite latency.) - `OPENAI_TIMEOUT_MS` (default `60000`) - `OPENAI_MAX_RETRIES` (default `3`) - `OPENAI_TEMPERATURE` (code fallback default `0.7`; `.env.template` starter value `0.1`) @@ -190,6 +191,18 @@ Key variables: For full variable reference see [INSTALL.md](INSTALL.md) and `docs/modules/*.md`. +Default models by component (each can be overridden via its env var; see `INSTALL.md` and module docs): + +| Component | Default model | Env variable | +|-----------|---------------|--------------| +| Policy (generation) | gpt-4o | `OPENAI_MODEL` | +| Policy (rewrite) | same as primary, or `gpt-4.1-nano` in `.env.template` | `MORALSTACK_POLICY_REWRITE_MODEL` | +| Risk estimator | follows `OPENAI_MODEL` unless set | `MORALSTACK_RISK_MODEL` | +| Critic | follows `OPENAI_MODEL` unless set | `MORALSTACK_CRITIC_MODEL` | +| Simulator | follows `OPENAI_MODEL` unless set | `MORALSTACK_SIMULATOR_MODEL` | +| Perspectives | follows `OPENAI_MODEL` unless set | `MORALSTACK_PERSPECTIVES_MODEL` | +| Hindsight | follows `OPENAI_MODEL` unless set | `MORALSTACK_HINDSIGHT_MODEL` | + ## Running the Benchmark ```bash @@ -239,11 +252,11 @@ Open [http://localhost:8765/](http://localhost:8765/) (or `MORALSTACK_UI_PORT`). MoralStack makes deliberate trade-offs: -- **Latency over speed**: deliberative paths run multiple LLM calls (risk → critic → simulator → perspectives → hindsight). Average response time is ~70s vs ~10s for raw GPT-4o. This is a design choice — governance takes time. -- **Multi-model cost**: a single deliberative request makes 7-9 LLM calls. We use `gpt-4o-mini` for lower-stakes modules (simulator, perspectives) to reduce cost. 
+- **Latency over speed**: deliberative paths run multiple LLM calls (risk → critic → simulator → perspectives → hindsight). Average response time is ~60s vs ~5s for raw GPT-4o. This is a design choice — governance takes time. +- **Multi-model cost**: a single deliberative request makes 7-9 LLM calls. Example profiles: `.env.minimal` uses `gpt-4.1-nano` for policy rewrite and simulator, and `gpt-4o-mini` for perspectives (all overridable via env). - **Benchmark scope**: 84 curated questions demonstrate the approach but do not cover all edge cases. We recommend running your own evaluations on domain-specific inputs. - **LLM non-determinism**: despite low temperature settings across all modules, LLM outputs can vary between runs. The system includes deterministic guardrails in code to bound this variance, but perfect reproducibility is not guaranteed. -We are actively working on reducing latency through early-exit optimizations and context-mode switching. +Latency has been reduced through speculative decoding (predicted outputs for draft revisions), parallel risk estimation, lighter models for simulator and rewrite (`gpt-4.1-nano`), structured JSON output enforcement, and soft-revision prompt constraints. Further optimizations (early-exit on low-risk queries, context-mode switching) are planned. See full discussion in [docs/limitations_and_tradeoffs.md](docs/limitations_and_tradeoffs.md). 
diff --git a/docs/architecture_spec.md b/docs/architecture_spec.md index e1f8ae8..59e43d7 100644 --- a/docs/architecture_spec.md +++ b/docs/architecture_spec.md @@ -150,6 +150,9 @@ class OrchestratorConfig: enable_simulation: bool = True enable_hindsight: bool = True borderline_refuse_upper: float = 0.95 # Upper bound (inclusive) for borderline REFUSE deliberation + parallel_module_calls: bool = True + parallel_critic_with_modules: bool = True # *[impl]* critic || sim || persp when parallel_module_calls + enable_speculative_generation: bool = True # *[impl]* risk || speculative generate at controller entry @dataclass class RiskThresholds: @@ -245,8 +248,34 @@ To reduce tokens and latency, the deliberative cycle supports: cycles) to preserve revision context - **Gating**: `enable_hindsight_gating` is true by default (hindsight only in final cycle; opt-out for legacy). `enable_simulator_gating` (opt-in) skips simulator when safe. - **Trace**: optional fields `context_mode_by_module`, `modules_skipped` for reporting +- **Policy rewrite model**: deliberative `rewrite()` at cycle 2+ may use `MORALSTACK_POLICY_REWRITE_MODEL` (when unset, + same as `OPENAI_MODEL`) to reduce latency; initial `generate()` / speculative draft stays on the primary model. -See § Token Optimization above. +#### Policy rewrite model downgrade + +When the critic triggers a revision on soft violations, the policy `rewrite` at cycle 2+ uses a configurable model +(`MORALSTACK_POLICY_REWRITE_MODEL`). If unset or empty, the primary `OPENAI_MODEL` is used (backward compatible). A +lighter default (for example `gpt-4.1-nano` in `.env.template`) reduces rewrite latency because the call runs under +explicit critic guidance and constrained-generation instructions; speculative first-pass generation remains on the +primary model for baseline quality. To disable the split, set `MORALSTACK_POLICY_REWRITE_MODEL` to the same value as +`OPENAI_MODEL`. 
+ +In benchmark testing, this optimization reduces rewrite step latency and, combined with +`gpt-4.1-nano` on the simulator, brings average deliberative latency from ~82s to ~60s +(~27% reduction) with no measurable compliance degradation (98.8% maintained). + +#### Rewrite prompt constraints + +To prevent lighter rewrite models from introducing new operational content during revision, +the rewrite system prompt includes explicit constraints: + +- Do not add new examples, scenarios, or operational details not present in the original draft +- Focus on restructuring and reframing existing content based on critic feedback +- When feedback requests conceptual focus, remove operational specifics rather than adding new ones + +These constraints are appended to the rewrite system prompt regardless of whether it comes from +the deliberation runner or uses the fallback default. They compensate for the tendency of smaller +models to "fill" revisions with new specifics rather than restructuring existing content. --- @@ -304,7 +333,9 @@ strutturato), non un classificatore leggero; i segnali sono semantici (es. `ethi ### 3.4 Policy LLM **Responsibility**: Text generation (responses, revisions, refusals). *[impl]* The Orchestrator uses `generate` for -draft, `rewrite` for revisions guided by Critic/Hindsight/Simulator/Perspectives, `refuse` for refusals. +draft, `rewrite` for revisions guided by Critic/Hindsight/Simulator/Perspectives, `refuse` for refusals. Optional env +`MORALSTACK_POLICY_REWRITE_MODEL` selects the model for `rewrite()` only; `generate()` and `refuse()` use the primary +`OPENAI_MODEL` (see [Policy rewrite model downgrade](#policy-rewrite-model-downgrade) above). ```python @dataclass @@ -893,6 +924,10 @@ Refusal text is persisted as an LLM call with `action` containing `"refuse"` (e. - Quick check: ~100ms - Assembly: ~10ms +> **Actual measured performance** (benchmark, 84 questions): fast path average ~10-12s. 
+> Target values above reflect aspirational architecture without LLM call latency. +> Real-world fast path includes speculative generation (~5-8s) plus quick-check (~2-3s). + --- ### 4.2 Sequence Diagram - Full Deliberation (risk ≥ 0.7) diff --git a/docs/limitations_and_tradeoffs.md b/docs/limitations_and_tradeoffs.md index b8efc35..4d90bab 100644 --- a/docs/limitations_and_tradeoffs.md +++ b/docs/limitations_and_tradeoffs.md @@ -17,20 +17,28 @@ to the risk of underestimating vulnerability situations. ### 2. Latency and Computational Cost When it activates the deliberative path, MoralStack introduces -significant computational overhead compared to a direct LLM call. +computational overhead compared to a direct LLM call (~60s average +vs ~5s for raw GPT-4o). -The system prioritizes: +The system prioritizes safety, decision correctness, and auditability +over pure latency. Recent optimizations (parallel risk estimation, +speculative decoding, lighter models for simulator and policy rewrite) +have reduced average deliberative latency by ~26% from the initial +architecture. -- safety -- decision correctness -- auditability +Latency profile by path: -over pure latency. +- **Fast path** (benign queries, ~11% of traffic): ~10-12s +- **Deliberative path** (standard): ~45-60s +- **Deliberative sensitive** (regulated domains): ~70-85s For this reason, MoralStack is not suitable for: - high-frequency creative chat -- real-time systems with strict latency constraints +- real-time systems with strict sub-second latency constraints + +Planned further optimizations include early-exit on low-risk deliberative +queries and context-mode switching for reduced token overhead. ### 3. SAFE_COMPLETE as Decision, not Error diff --git a/docs/modules/benchmark.md b/docs/modules/benchmark.md index 5683280..ae2e89e 100644 --- a/docs/modules/benchmark.md +++ b/docs/modules/benchmark.md @@ -55,6 +55,10 @@ All benchmark configuration can be overridden via `.env`. Variables are read whe 1. 
`--model` / `-m` (CLI) 2. `gpt-4o` (default) +**MoralStack policy rewrite** (env only; not set via CLI): `MORALSTACK_POLICY_REWRITE_MODEL` selects the model for +`rewrite()` in deliberation (cycle 2+). If unset, the rewrite uses the same model as primary policy generation. Report +`models_config.moralstack.policy_rewrite` reflects the effective value. + **Judge**: 1. `MORALSTACK_BENCHMARK_JUDGE_MODEL` (if set in `.env`) @@ -84,8 +88,8 @@ With `MORALSTACK_BENCHMARK_BASELINE_MODEL=gpt-4o` in `.env`, the baseline always ## Report and UI The benchmark report JSON (`benchmark_{run_id}.json`) stores `model`, `judge_model`, and `models_config` (baseline, -judge, MoralStack modules: policy, risk, critic, simulator, hindsight, perspectives). The markdown export and the UI -display these models clearly in the report header and on the run detail page. +judge, MoralStack modules: policy, policy_rewrite, risk, critic, simulator, hindsight, perspectives). The markdown +export and the UI display these models clearly in the report header and on the run detail page. **UI and export requirement**: The file `benchmark_{run_id}.json` in `MORALSTACK_BENCHMARK_OUTPUTS` is **required** for the moralstack-ui to display the full benchmark summary (Executive Summary, FP/FN, per-question baseline/moralstack diff --git a/docs/modules/critic.md b/docs/modules/critic.md index 386b6a9..089539f 100644 --- a/docs/modules/critic.md +++ b/docs/modules/critic.md @@ -170,6 +170,8 @@ variables.** - **Type**: int (>= 1) - **Meaning**: Number of parse attempts for the critic JSON response before raising an error. +Structured critic output uses OpenAI's `json_object` response format (`response_format={"type": "json_object"}` on `GenerationConfig`), which guarantees valid JSON and greatly reduces retries caused by malformed JSON. 
+ #### MORALSTACK_CRITIC_MAX_TOKENS - **Default**: `384` diff --git a/docs/modules/hindsight.md b/docs/modules/hindsight.md index 3cc2e54..70c330b 100644 --- a/docs/modules/hindsight.md +++ b/docs/modules/hindsight.md @@ -247,6 +247,8 @@ hindsight configuration is the single source of configuration — no CLI or code - **Type**: int (>= 1) - **Description**: Number of parse attempts for the hindsight JSON response before raising an error. +Hindsight evaluation uses OpenAI's `json_object` response format (`response_format={"type": "json_object"}` on `GenerationConfig`), which guarantees valid JSON and greatly reduces retries caused by malformed JSON. + #### MORALSTACK_HINDSIGHT_MAX_TOKENS - **Default**: `768` diff --git a/docs/modules/openai_params.md b/docs/modules/openai_params.md index 149d0a7..56fba08 100644 --- a/docs/modules/openai_params.md +++ b/docs/modules/openai_params.md @@ -84,6 +84,41 @@ response = client.chat.completions.create( --- +## Predicted Output Support + +Some models support the `prediction` parameter, which enables speculative decoding for faster generation when the +expected output is largely similar to a known text (e.g. a draft revision in `rewrite()`). + +### `MODELS_SUPPORTING_PREDICTED_OUTPUT` + +Tuple of model name prefixes that support the `prediction` parameter: + +```python +MODELS_SUPPORTING_PREDICTED_OUTPUT = ( + "gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", +) +``` + +### `supports_predicted_output(model: str) -> bool` + +Returns `True` if the model supports speculative decoding via predicted outputs. + +```python +from moralstack.utils.openai_params import supports_predicted_output + +supports_predicted_output("gpt-4o") # True +supports_predicted_output("gpt-4.1") # True +supports_predicted_output("o3-mini") # False (uses max_completion_tokens) +supports_predicted_output("gpt-5.2") # False +``` + +**Constraints:** Predicted outputs are incompatible with `max_completion_tokens`, `logprobs`, and `n > 1`. 
The +`rewrite()` method in `OpenAIPolicy` uses this automatically — no caller changes needed. + +**Reference:** [OpenAI Predicted Outputs](https://platform.openai.com/docs/guides/predicted-outputs) + +--- + ## Updating the Model List When OpenAI releases new models that require `max_completion_tokens`, update the tuple in @@ -93,6 +128,12 @@ When OpenAI releases new models that require `max_completion_tokens`, update the MODELS_REQUIRING_MAX_COMPLETION_TOKENS = ("o1", "o3", "o4", "gpt-5", "gpt-6") # add new prefix ``` +When OpenAI adds predicted output support to new models, update: + +```python +MODELS_SUPPORTING_PREDICTED_OUTPUT = ("gpt-4o", "gpt-4o-mini", "gpt-4.1", ...) # add new prefix +``` + **Rules:** - Use the **shortest unique prefix** that identifies the model family (e.g. `gpt-5` matches `gpt-5.2`, `gpt-5.1`, @@ -109,7 +150,8 @@ deprecated and not compatible with o-series models. All modules that call the OpenAI Chat Completions API use this utility: -- **Policy LLM** (`moralstack/models/policy.py`) — `_complete()` +- **Policy LLM** (`moralstack/models/policy.py`) — `_complete()` uses `completion_tokens_param` and + `supports_predicted_output` (the latter for `rewrite()` speculative decoding) - **Benchmark** (`scripts/benchmark_moralstack.py`) — `OpenAIClient.generate()`, `_generate_with_model()` - **Constitution Retriever** (`moralstack/constitution/retriever.py`) — direct `client.chat.completions.create` calls (used by store) - **Runtime modules** (critic, perspective, hindsight, simulator, risk estimator) — via policy diff --git a/docs/modules/orchestrator.md b/docs/modules/orchestrator.md index d576e30..137cfb4 100644 --- a/docs/modules/orchestrator.md +++ b/docs/modules/orchestrator.md @@ -119,6 +119,26 @@ config = OrchestratorConfig( | `min_hindsight_score` | 0.8 | Minimum hindsight score for convergence | | `borderline_refuse_upper` | 0.95 | Upper bound for borderline REFUSE deliberation | | `early_exit_perspectives_threshold` | 0.85 | Weighted 
approval threshold for early exit (critic PROCEED path) | +| `parallel_critic_with_modules` | `True` | When `True` and `parallel_module_calls` is `True`, the critic runs in parallel with the simulator and perspectives instead of as a sequential gate. See [Latency-oriented parameters](#latency-oriented-parameters). | +| `enable_speculative_generation` | `True` | When `True`, risk estimation and speculative draft generation run in parallel before routing. The draft is reused on benign, fast, and deliberative routes when applicable. See [Latency-oriented parameters](#latency-oriented-parameters). | + +### Latency-oriented parameters + +These flags reduce wall-clock latency **without changing routing policy** (`decide_action`, `get_route`, overlay floors, or convergence invariants). They do not change how `final_action` is computed from risk and module outputs. + +**`parallel_critic_with_modules` (default `True`)** + +- **Requires** `parallel_module_calls=True`. When both are `True`, each deliberation cycle runs **critic**, **simulator**, and **perspectives** concurrently (three parallel LLM calls per cycle when those modules are enabled). +- **When `False`**: the runner uses a two-stage layout: critic runs first; only if there is no hard violation do simulator and perspectives run in parallel. This avoids paying for simulator/perspective calls when the critic would reject the draft, but adds sequential critic latency before sim/persp start. +- **Hard violations**: If the critic reports a hard violation, simulator and perspective results from that cycle are **discarded** and do not affect merged state. Convergence and refusal logic see the same critic outcome as in the gated layout; you may pay extra token cost in the rare hard-violation case. +- **Set to `false`** if you prioritize minimizing LLM spend on hard-violation paths over latency. 
+ +**`enable_speculative_generation` (default `True`)** + +- **When `True`**: `OrchestrationController` starts **risk estimation** and a **speculative policy `generate`** (same base system prompt as normal first-pass generation) in parallel. After risk returns, routing proceeds as usual; the speculative draft is **not** used for policy routing decisions. +- **Reuse**: On benign fast path, fast path, and deliberative path, the draft is reused when it is still valid (skipping a duplicate first `generate` where implemented). On **REFUSE**, the speculative call is unused (wasted latency/token trade-off). **`SAFE_COMPLETE`** path does not reuse this draft (different system instructions). +- **Constrained generation** (`CLEARLY_HARMFUL` deliberation): the speculative draft is **not** applied as cycle-1 output; the constrained system prompt is used instead. +- **Note**: Speculative generation uses language resolution **before** the risk estimator’s `detected_language` is available (fallback path). Routing and safety decisions are unchanged; draft wording may differ slightly from a strictly sequential generate-after-risk for the same request. ### Borderline REFUSE Upper Bound @@ -178,7 +198,7 @@ For low-risk requests, the system uses an optimized path: Request → Risk Estimation (< 0.3) → Direct Generation → Response ``` -**Typical latency**: 300-500ms +**Typical latency**: ~10-12s (benchmark, 84 questions) ### Deliberative Path (risk ≥ 0.3) @@ -191,7 +211,9 @@ For significant-risk requests: 5. **Hindsight Evaluation**: Retrospective evaluation 6. **Convergence Check**: Verify termination criteria -**Typical latency**: 2-30 seconds (depends on number of cycles) +**Typical latency**: Deliberative path averages ~45-60s for standard queries, +~70-85s for sensitive-domain queries (1 cycle ~35s, 2 cycles ~65s). +Fast path averages ~10-12s. --- @@ -427,11 +449,30 @@ There is no dedicated model for the orchestrator (it is not an LLM module). 
- **Default**: `true` - **Type**: bool -- **Description**: When true, critic/simulator/perspectives run in parallel within each cycle. Their LLM calls are - persisted (with run/request/cycle context) and appear in moralstack-ui; `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`, +- **Description**: When true, the deliberation runner uses parallel execution for module calls within each cycle (see + also `MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES`). LLM calls are persisted (with run/request/cycle context) + and appear in moralstack-ui; `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`, `MORALSTACK_ORCHESTRATOR_ENABLE_SIMULATION`, and `MORALSTACK_ORCHESTRATOR_ENABLE_HINDSIGHT` determine which modules run and thus which calls are recorded and visible in the UI. +#### MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES + +- **Default**: `true` +- **Type**: bool +- **Description**: When `true` and `MORALSTACK_ORCHESTRATOR_PARALLEL_MODULE_CALLS` is `true`, the critic runs in parallel + with the simulator and perspectives (full parallel evaluation). When `false`, the critic runs first as a gate; simulator + and perspectives run in parallel only after the critic reports no hard violation. See + [Latency-oriented parameters](#latency-oriented-parameters). + +#### MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION + +- **Default**: `true` +- **Type**: bool +- **Description**: When `true`, risk estimation and speculative first-pass draft generation run in parallel at the start + of `process()`. The draft may be reused on benign, fast, and deliberative routes; it is not used for routing. When + `false`, risk estimation runs alone, then generation follows the previous sequential pattern. See + [Latency-oriented parameters](#latency-oriented-parameters). 
+ ### Risk thresholds (path routing) #### MORALSTACK_ORCHESTRATOR_RISK_LOW_THRESHOLD diff --git a/docs/modules/perspectives.md b/docs/modules/perspectives.md index 85d99d0..9c448c8 100644 --- a/docs/modules/perspectives.md +++ b/docs/modules/perspectives.md @@ -157,6 +157,8 @@ overrides these variables.** - **Type**: int (≥ 1) - **Meaning**: Number of parse attempts per perspective JSON response before marking that perspective as failed. +Perspective evaluation uses OpenAI's `json_object` response format (`response_format={"type": "json_object"}` on `GenerationConfig`), which guarantees valid JSON and greatly reduces retries caused by malformed JSON. + #### MORALSTACK_PERSPECTIVES_MAX_TOKENS - **Default**: `512` diff --git a/docs/modules/policy.md b/docs/modules/policy.md index 357baba..6671746 100644 --- a/docs/modules/policy.md +++ b/docs/modules/policy.md @@ -33,7 +33,7 @@ models like gpt-5.x and o-series require the latter). ```python from moralstack.models.policy import OpenAIPolicy, OpenAIPolicyConfig -# From environment variables (OPENAI_API_KEY, OPENAI_MODEL, ...) +# From environment variables (OPENAI_API_KEY, OPENAI_MODEL, optional MORALSTACK_POLICY_REWRITE_MODEL, ...) policy = OpenAIPolicy() # Or with explicit overrides @@ -65,6 +65,32 @@ print(result.text) Revises a response based on feedback. +When the configured model supports it (gpt-4o, gpt-4o-mini, gpt-4.1 family), `rewrite()` automatically leverages +**OpenAI Predicted Outputs** (speculative decoding): the existing draft is provided as a prediction hint so that +unchanged portions of the text are generated significantly faster. This is transparent to the caller and does not +alter the output quality. + +**Rewrite model**: `OpenAIPolicy` may use a separate model for `rewrite()` only via `MORALSTACK_POLICY_REWRITE_MODEL`. +If unset or empty, `rewrite()` uses the same model as `generate()` (`OPENAI_MODEL`). 
This keeps the first-pass +draft on the primary model while allowing a lighter model for deliberative revisions (see `docs/architecture_spec.md`). + +The rewrite prompt also includes explicit constraints that prevent lighter models from adding +new operational content not present in the original draft. This ensures that `gpt-4.1-nano` +rewrites maintain quality comparable to `gpt-4o` rewrites on the benchmark (9.39 avg score, +0 leakage). + +--- + +## Environment variables + +| Variable | Purpose | +|----------|---------| +| `OPENAI_API_KEY` | Required API key | +| `OPENAI_MODEL` | Primary model for `generate()` and `refuse()` | +| `MORALSTACK_POLICY_REWRITE_MODEL` | Optional; model for `rewrite()` (deliberative cycle 2+). Defaults to `OPENAI_MODEL` when unset. `.env.template` sets `gpt-4.1-nano` when you copy that file. | + +See also `.env.template` and `INSTALL.md`. + ```python result = policy.rewrite( prompt="Original user request", @@ -106,6 +132,20 @@ class GenerationResult: finish_reason: str # Termination reason ("stop", "length", etc.) ``` +### GenerationConfig + +```python +@dataclass +class GenerationConfig: + max_tokens: int = 2048 + temperature: float = 0.7 + top_p: float = 0.9 + stop_sequences: list[str] = [] + response_format: Any = None # OpenAI response format constraint +``` + +The optional `response_format` field maps to OpenAI's response format (e.g. `{"type": "json_object"}`). Structured evaluation modules (Critic, Simulator, Hindsight, Perspectives) set it so the API returns guaranteed valid JSON. + --- ## Output Sanitization @@ -174,6 +214,8 @@ class PolicyLLMProtocol(Protocol): ... ``` +The `config` argument is typically a `GenerationConfig` (see [Output Structure](#output-structure)); it may include `response_format` for OpenAI structured outputs. 
+ --- ## Usage with OpenAI diff --git a/docs/modules/simulator.md b/docs/modules/simulator.md index 7c9184b..5a258df 100644 --- a/docs/modules/simulator.md +++ b/docs/modules/simulator.md @@ -283,10 +283,15 @@ configuration is the single source of configuration — no CLI or code path over benchmark create a dedicated `OpenAIPolicy` with this model for the simulator; the rest of the stack keeps using `OPENAI_MODEL`. - **Effect**: - - **Set to a model id** (e.g. `gpt-4o-mini`): The simulator uses that model. Lets you use a smaller/cheaper model + - **Set to a model id** (e.g. `gpt-4o-mini`, `gpt-4.1-nano` as in `.env.template` / `.env.minimal`): The simulator uses that model. Lets you use a smaller/cheaper model for simulation and a larger one for generation. - **Unset or empty**: The simulator uses the same policy (and model) as the rest of the pipeline. +In the recommended configuration (`.env.template`), the simulator uses `gpt-4.1-nano`. +Benchmark testing shows this reduces average deliberative latency by ~27% compared to +`gpt-4o-mini` on the simulator, with no compliance degradation (98.8% maintained) and +minimal quality impact (avg score 9.39 vs 9.36 with `gpt-4o` across all modules). + ### LLM and retry behaviour #### MORALSTACK_SIMULATOR_MAX_RETRIES @@ -295,6 +300,8 @@ configuration is the single source of configuration — no CLI or code path over - **Type**: int (>= 1) - **Description**: Number of parse attempts for the simulator JSON response before raising an error. +Simulator generation uses OpenAI's `json_object` response format (`response_format={"type": "json_object"}` on `GenerationConfig`), which guarantees valid JSON and greatly reduces retries caused by malformed JSON. 
+ #### MORALSTACK_SIMULATOR_MAX_TOKENS - **Default**: `384` diff --git a/moralstack/cli/loader.py b/moralstack/cli/loader.py index 55fc03e..71a78f7 100644 --- a/moralstack/cli/loader.py +++ b/moralstack/cli/loader.py @@ -163,8 +163,11 @@ def _load_real_modules(self) -> dict[str, Any]: model=self.config.openai_model, ) modules["policy"] = openai_policy - self.load_status["policy"] = f"✓ OpenAI ({self.config.openai_model})" - print_colored(f" ✓ policy: OpenAI ({self.config.openai_model})", "green") + policy_display = self.config.openai_model + if openai_policy.rewrite_model != openai_policy.model: + policy_display = f"{self.config.openai_model} (rewrite: {openai_policy.rewrite_model})" + self.load_status["policy"] = f"✓ OpenAI ({policy_display})" + print_colored(f" ✓ policy: OpenAI ({policy_display})", "green") # Constitution Store (requires openai_api_key) from moralstack.constitution.openai_config import OpenAIClientConfig diff --git a/moralstack/cli/report.py b/moralstack/cli/report.py index 59718dc..299c56b 100644 --- a/moralstack/cli/report.py +++ b/moralstack/cli/report.py @@ -17,7 +17,16 @@ def __init__(self, verbose: bool = False): self.calls: list[dict] = [] self.call_counter = 0 - def log_call(self, module: str, action: str, prompt: str, response: str = "", duration_ms: float = 0.0): + def log_call( + self, + module: str, + action: str, + prompt: str, + response: str = "", + duration_ms: float = 0.0, + *, + model: str | None = None, + ): """Logs an LLM call.""" self.call_counter += 1 @@ -32,6 +41,7 @@ def log_call(self, module: str, action: str, prompt: str, response: str = "", du "prompt": prompt[:prompt_limit] + "..." if len(prompt) > prompt_limit else prompt, "response": (response[:response_limit] + "..." 
if len(response) > response_limit else response), "duration_ms": duration_ms, + "model": model, # Always save complete data for revision history "full_prompt": prompt, "full_response": response, @@ -52,6 +62,9 @@ def _print_call(self, call: dict): print_colored(f"📞 LLM CALL #{call['id']}: {call['module']} → {call['action']}", header_color) print_colored(f"{'=' * 80}", border_color) + if call.get("model"): + print_colored(f"🤖 Model: {call['model']}", "cyan") + if call["prompt"]: print_colored("\n📤 PROMPT:", "blue") # Show full prompt for important modules @@ -422,6 +435,10 @@ def _detailed_phases(self, trace: DeliberationTrace, call_logger: CallLogger) -> lines.append(f"| **Duration** | `{phase.duration_ms:.0f}ms` |") lines.append(f"| **Success** | {status_icon} |") + policy_call = self._get_policy_call_dict_for_phase(phase, cycle_num, calls_by_module) + if policy_call and policy_call.get("model"): + lines.append(f"| **Model** | `{policy_call['model']}` |") + if phase.decision: lines.append(f"| **Decision** | `{phase.decision}` |") if phase.decision_reason: @@ -512,6 +529,27 @@ def _build_calls_map(self, call_logger: CallLogger) -> dict: return calls_map + def _get_policy_call_dict_for_phase(self, phase: PhaseResult, cycle: int, calls_map: dict) -> dict[str, Any] | None: + """Returns the CallLogger entry for policy generation or revision, if any.""" + if phase.phase not in (PhaseType.GENERATION, PhaseType.REVISION): + return None + if "policy" not in calls_map: + return None + module_calls = calls_map["policy"] + if phase.phase == PhaseType.GENERATION: + for call in module_calls: + if "generate" in call.get("action", "").lower(): + return call + return module_calls[0] if module_calls else None + if phase.phase == PhaseType.REVISION: + rewrite_calls = [c for c in module_calls if "rewrite" in c.get("action", "").lower()] + if cycle >= 2 and rewrite_calls: + idx = cycle - 2 + if 0 <= idx < len(rewrite_calls): + return rewrite_calls[idx] + return None + return None + 
def _get_full_data_for_phase(self, phase: PhaseResult, cycle: int, calls_map: dict) -> tuple: """ Retrieves COMPLETE (non-truncated) data for a phase from CallLogger. diff --git a/moralstack/models/base.py b/moralstack/models/base.py index f4d52af..55536c3 100644 --- a/moralstack/models/base.py +++ b/moralstack/models/base.py @@ -5,7 +5,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import List, Literal, Optional +from typing import Any, List, Literal, Optional @dataclass(frozen=True) @@ -19,6 +19,7 @@ class GenerationConfig: temperature: float = 0.7 top_p: float = 0.9 stop_sequences: List[str] = field(default_factory=list) + response_format: Any = None @dataclass(frozen=True) diff --git a/moralstack/models/policy.py b/moralstack/models/policy.py index 224c210..09163e1 100644 --- a/moralstack/models/policy.py +++ b/moralstack/models/policy.py @@ -1,24 +1,34 @@ """ -Policy LLM per MoralStack - OpenAI only. +Policy LLM for MoralStack — OpenAI only. -Unico provider: OpenAI API. Configurazione centralizzata via variabili d'ambiente: -- OPENAI_API_KEY (obbligatoria) +Single provider: OpenAI API. 
Configuration via environment variables: +- OPENAI_API_KEY (required) - OPENAI_MODEL (default: gpt-4o) -- OPENAI_BASE_URL (opzionale, per proxy/enterprise) -- OPENAI_TIMEOUT_MS (opzionale) -- OPENAI_MAX_RETRIES (opzionale) -- OPENAI_TEMPERATURE (opzionale, default 0.7) +- MORALSTACK_POLICY_REWRITE_MODEL (optional; deliberative rewrite at cycle 2+; defaults to OPENAI_MODEL) +- OPENAI_BASE_URL (optional, for proxy/enterprise) +- OPENAI_TIMEOUT_MS (optional) +- OPENAI_MAX_RETRIES (optional) +- OPENAI_TEMPERATURE (optional, default 0.7) """ from __future__ import annotations +import logging import os from dataclasses import dataclass from typing import Any from moralstack.models.base import GenerationConfig, GenerationResult -from moralstack.utils.openai_params import completion_tokens_param -from moralstack.utils.provider_errors import classify_provider_error, sleep_with_backoff +from moralstack.utils.openai_params import ( + completion_tokens_param, + supports_predicted_output, +) +from moralstack.utils.provider_errors import ( + classify_provider_error, + sleep_with_backoff, +) + +logger = logging.getLogger(__name__) def _get_openai_config( @@ -29,8 +39,8 @@ def _get_openai_config( api_key = (api_key_override or os.getenv("OPENAI_API_KEY") or "").strip() if not api_key: raise ValueError( - "OPENAI_API_KEY non impostata. Imposta la variabile d'ambiente OPENAI_API_KEY " - "o usa --openai-key da riga di comando. Esempio: export OPENAI_API_KEY=sk-..." + "OPENAI_API_KEY is not set. Set the environment variable OPENAI_API_KEY " + "or pass --openai-key from the command line. Example: export OPENAI_API_KEY=sk-..." 
) base_url = os.getenv("OPENAI_BASE_URL") or None timeout_ms_env = os.getenv("OPENAI_TIMEOUT_MS") @@ -39,6 +49,8 @@ def _get_openai_config( model = model_override or os.getenv("OPENAI_MODEL", "gpt-4o") temperature = float(os.getenv("OPENAI_TEMPERATURE", "0.7")) top_p = float(os.getenv("OPENAI_TOP_P", "0.9")) + rewrite_raw = os.getenv("MORALSTACK_POLICY_REWRITE_MODEL") + rewrite_model = rewrite_raw.strip() if rewrite_raw and rewrite_raw.strip() else None return OpenAIPolicyConfig( api_key=api_key, base_url=base_url, @@ -47,6 +59,7 @@ def _get_openai_config( model=model, temperature=temperature, top_p=top_p, + rewrite_model=rewrite_model, ) @@ -64,6 +77,7 @@ class OpenAIPolicyConfig: max_retries: int | None = None temperature: float | None = None top_p: float | None = None + rewrite_model: str | None = None class OpenAIPolicy: @@ -104,6 +118,7 @@ def __init__( final_max_retries = config.max_retries if config and config.max_retries is not None else env_cfg.max_retries final_temperature = config.temperature if config and config.temperature is not None else env_cfg.temperature final_top_p = config.top_p if config and config.top_p is not None else env_cfg.top_p + final_rewrite_model = config.rewrite_model if config and config.rewrite_model is not None else env_cfg.rewrite_model # Override espliciti da parametri __init__ (sovrascrivono tutto) if api_key is not None: @@ -119,6 +134,7 @@ def __init__( self.api_key = final_api_key self.model = final_model + self._rewrite_model = final_rewrite_model if final_rewrite_model is not None else final_model self._timeout = (final_timeout_ms / 1000.0) if final_timeout_ms is not None else 60.0 self._max_retries = final_max_retries if final_max_retries is not None else 3 self._default_temperature = final_temperature if final_temperature is not None else 0.7 @@ -127,7 +143,7 @@ def __init__( try: import openai except ImportError: - raise ImportError("Il client OpenAI è richiesto. 
Installa con: pip install openai") + raise ImportError("The OpenAI client is required. Install with: pip install openai") kwargs: dict[str, Any] = {"api_key": self.api_key} if final_base_url: @@ -135,6 +151,11 @@ def __init__( self.client = openai.OpenAI(**kwargs) self._cost_tracker: Any = None + @property + def rewrite_model(self) -> str: + """Effective model for `rewrite()` (may differ from `model` when env is set).""" + return self._rewrite_model + def set_cost_tracker(self, tracker: Any) -> None: """Imposta un TokenCostTracker per tracciare i costi delle chiamate.""" self._cost_tracker = tracker @@ -146,26 +167,42 @@ def _complete( max_tokens: int = 1024, top_p: float | None = None, response_format: Any = None, + prediction: dict[str, str] | None = None, + model_override: str | None = None, ) -> tuple[str, int, str]: """ - Chiamata completions con retry su errori transient (429/503/timeout). - Ritorna (text, tokens_used, finish_reason). Usa classifier e backoff con jitter. + Completions call with retry on transient errors (429/503/timeout). + Returns (text, tokens_used, finish_reason). Uses classifier and jittered backoff. + + Args: + prediction: Optional predicted output for speculative decoding. + Expected format: ``{"type": "content", "content": "..."}`` + Only applied when the current model supports the feature. + model_override: If set, used instead of ``self.model`` for this call. 
""" + effective_model = model_override or self.model temp = temperature if temperature is not None else self._default_temperature top_p_val = top_p if top_p is not None else self._default_top_p + use_prediction = prediction is not None and supports_predicted_output(effective_model or "") + # prediction and response_format are mutually exclusive in + # the OpenAI API; prefer response_format (structural guarantee) + if use_prediction and response_format is not None: + use_prediction = False last_error: Exception | None = None for attempt in range(self._max_retries): try: kwargs: dict[str, Any] = { - "model": self.model, + "model": effective_model, "messages": messages, "temperature": temp, "top_p": top_p_val, "timeout": self._timeout, } - kwargs.update(completion_tokens_param(self.model, max_tokens)) + kwargs.update(completion_tokens_param(effective_model, max_tokens)) if response_format is not None: kwargs["response_format"] = response_format + if use_prediction: + kwargs["prediction"] = prediction response = self.client.chat.completions.create(**kwargs) choice = response.choices[0] text = (choice.message.content or "").strip() @@ -177,7 +214,7 @@ def _complete( prompt_tokens = int(tokens * 0.7) if tokens else 0 completion_tokens = tokens - prompt_tokens if tokens else 0 if self._cost_tracker is not None and hasattr(self._cost_tracker, "add_call"): - self._cost_tracker.add_call(self.model, prompt_tokens, completion_tokens) + self._cost_tracker.add_call(effective_model, prompt_tokens, completion_tokens) reason = choice.finish_reason or "stop" return text, tokens, reason except Exception as e: @@ -198,8 +235,18 @@ def generate( prompt: str, system: str = "", config: GenerationConfig | None = None, + prediction: dict[str, str] | None = None, + model_override: str | None = None, ) -> GenerationResult: - """Genera risposta al prompt.""" + """Generate a response for the prompt. + + Args: + prediction: Optional predicted output for speculative decoding. 
+                When provided and the model supports it, the API uses
+                speculative decoding to produce faster responses when the
+                output is expected to be similar to the prediction text.
+            model_override: If set, used instead of the primary policy model for this call.
+        """
         messages: list[dict[str, str]] = []
         if system:
             messages.append({"role": "system", "content": system})
@@ -208,13 +255,21 @@
         max_tokens = 1024
         temperature = self._default_temperature
         top_p = self._default_top_p
+        response_format = None
         if config is not None:
             max_tokens = getattr(config, "max_tokens", max_tokens)
             temperature = getattr(config, "temperature", temperature)
             top_p = getattr(config, "top_p", top_p)
+            response_format = getattr(config, "response_format", None)

         text, tokens_used, finish_reason = self._complete(
-            messages, temperature=temperature, max_tokens=max_tokens, top_p=top_p
+            messages,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            top_p=top_p,
+            prediction=prediction,
+            response_format=response_format,
+            model_override=model_override,
         )
         return GenerationResult(
             text=text,
@@ -233,26 +288,59 @@ def rewrite(
         system: str = "",
         config: GenerationConfig | None = None,
     ) -> GenerationResult:
-        """Riscrive una bozza in base al feedback."""
+        """Rewrite a draft based on feedback.
+
+        Uses OpenAI predicted outputs (speculative decoding) when the model
+        supports it: the existing draft is provided as a prediction hint so
+        that unchanged portions are generated significantly faster.
+        """
         rewrite_prompt = (
             f"ORIGINAL REQUEST:\n{prompt}\n\n"
             f"CURRENT DRAFT:\n{draft}\n\n"
             f"REVISION FEEDBACK:\n{guidance}\n\n"
             "Revise the response incorporating the feedback above. "
-            "IMPORTANT: Maintain or improve the depth, structure, and multiple perspectives. "
+            "IMPORTANT: Maintain or improve the depth, structure, "
+            "and multiple perspectives. "
             "Use numbered lists and clear sections when appropriate. "
             "Keep the response comprehensive and well-reasoned. "
             "Output ONLY the revised response, no additional commentary."
         )
         rewrite_system = system or (
-            "You are an assistant that revises responses based on feedback. "
-            "When revising, MAINTAIN or IMPROVE the depth and structure of the response. "
-            "Continue to present multiple perspectives and balanced analysis. "
+            "You are an assistant that revises responses based on "
+            "feedback. "
+            "When revising, MAINTAIN or IMPROVE the depth and "
+            "structure of the response. "
+            "Continue to present multiple perspectives and balanced "
+            "analysis. "
             "Use numbered lists and clear organization. "
-            "Incorporate the feedback while keeping the response comprehensive and well-reasoned. "
+            "Incorporate the feedback while keeping the response "
+            "comprehensive and well-reasoned. "
             "Respond in the SAME LANGUAGE as the original user request."
         )
-        return self.generate(rewrite_prompt, system=rewrite_system, config=config)
+
+        # Append constraints regardless of source; the leading blank line
+        # separates them from the preceding system-prompt sentence.
+        rewrite_system += (
+            "\n\nREWRITE CONSTRAINTS:\n"
+            "- Do NOT add new examples, scenarios, or operational details "
+            "not present in the original draft.\n"
+            "- Focus on restructuring, deepening, and reframing the EXISTING "
+            "content based on the feedback.\n"
+            "- When feedback says to focus on narrative or conceptual aspects, "
+            "REMOVE operational specifics rather than adding new ones.\n"
+        )
+        draft_prediction = {"type": "content", "content": draft} if draft else None
+        logger.info(
+            "policy_rewrite using model=%s (primary=%s)",
+            self._rewrite_model,
+            self.model,
+        )
+        return self.generate(
+            rewrite_prompt,
+            system=rewrite_system,
+            config=config,
+            prediction=draft_prediction,
+            model_override=self._rewrite_model,
+        )

     def refuse(
         self,
@@ -298,7 +386,7 @@ def refuse(
             "Provide deep reasoning and helpful, specific alternatives. "
         )
         if language and str(language).strip():
-            refuse_system += f"CRITICAL: You MUST respond entirely in {language.strip()}. " "Do not add translations."
+ refuse_system += f"CRITICAL: You MUST respond entirely in {language.strip()}. Do not add translations." else: refuse_system += "Always respond in the SAME LANGUAGE as the user's request." return self.generate(refuse_prompt, system=refuse_system, config=config) diff --git a/moralstack/models/risk/estimator.py b/moralstack/models/risk/estimator.py index 2544313..d932ea0 100644 --- a/moralstack/models/risk/estimator.py +++ b/moralstack/models/risk/estimator.py @@ -229,12 +229,15 @@ def _persist_risk_llm_call( attempts: int, ) -> None: """Persist LLM call for risk estimation. Logs debug on ImportError.""" + risk_model = getattr(self.policy, "model", None) if self.policy else None + risk_model_str = str(risk_model) if risk_model is not None else None try: persist_llm_call( cycle=0, phase="risk_estimation", module="risk_estimator", action="estimate", + model=risk_model_str, started_at=started_at, duration_ms=duration_ms, prompt=prompt, @@ -411,12 +414,15 @@ def _persist_mini_llm_call( """Persist a single mini-estimator LLM call. 
Logs debug on ImportError.""" import json as _json + risk_model = getattr(self.policy, "model", None) if self.policy else None + risk_model_str = str(risk_model) if risk_model is not None else None try: persist_llm_call( cycle=0, phase="risk_estimation", module="risk_estimator", action=action, + model=risk_model_str, started_at=started_at, duration_ms=duration_ms, prompt=prompt, diff --git a/moralstack/orchestration/config_loader.py b/moralstack/orchestration/config_loader.py index 1c80cf4..d6f90ff 100644 --- a/moralstack/orchestration/config_loader.py +++ b/moralstack/orchestration/config_loader.py @@ -37,6 +37,8 @@ ENV_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD = "MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD" ENV_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD = "MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD" ENV_BORDERLINE_REFUSE_UPPER = "MORALSTACK_ORCHESTRATOR_BORDERLINE_REFUSE_UPPER" +ENV_PARALLEL_CRITIC_WITH_MODULES = "MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES" +ENV_ENABLE_SPECULATIVE_GENERATION = "MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION" def load_orchestrator_config_from_env(): # -> OrchestratorConfig (imported inside) @@ -75,6 +77,8 @@ def load_orchestrator_config_from_env(): # -> OrchestratorConfig (imported insi simulator_gate_semantic_harm_threshold = get_env_float(ENV_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD, 0.4, 0.0, 1.0) simulator_gate_delta_chars_threshold = get_env_int(ENV_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD, 100, 0) borderline_refuse_upper = get_env_float(ENV_BORDERLINE_REFUSE_UPPER, 0.95, 0.0, 1.0) + parallel_critic_with_modules = get_env_bool(ENV_PARALLEL_CRITIC_WITH_MODULES, True) + enable_speculative_generation = get_env_bool(ENV_ENABLE_SPECULATIVE_GENERATION, True) return OrchestratorConfig( max_deliberation_cycles=max_deliberation_cycles, @@ -97,4 +101,6 @@ def load_orchestrator_config_from_env(): # -> OrchestratorConfig (imported insi 
simulator_gate_semantic_harm_threshold=simulator_gate_semantic_harm_threshold, simulator_gate_delta_chars_threshold=simulator_gate_delta_chars_threshold, borderline_refuse_upper=borderline_refuse_upper, + parallel_critic_with_modules=parallel_critic_with_modules, + enable_speculative_generation=enable_speculative_generation, ) diff --git a/moralstack/orchestration/controller.py b/moralstack/orchestration/controller.py index dbf0ff0..beca1b0 100644 --- a/moralstack/orchestration/controller.py +++ b/moralstack/orchestration/controller.py @@ -36,6 +36,7 @@ orch_debug_log, ) from moralstack.orchestration.domain_exclusion import generate_domain_exclusion_response +from moralstack.orchestration.language_resolver import resolve_prompt_with_language from moralstack.orchestration.overlay_policy import ( OVERLAY_SENSITIVE_RISK_FLOOR, apply_risk_floor_if_sensitive, @@ -206,6 +207,96 @@ def _estimate_risk(self, request: ProcessedRequest) -> RiskEstimation: except Exception as e: raise RiskEstimationError(f"Risk estimation failed: {e}") + def _speculative_generate( + self, + request: ProcessedRequest, + ) -> str | None: + """Generate a speculative draft in parallel with risk estimation. + + Uses the base system prompt and fallback language detection (the risk + estimator's ``detected_language`` is not available yet). Returns the + output-protected draft text, or ``None`` on any failure. 
+ """ + if self.policy is None: + return None + try: + prompt_text = resolve_prompt_with_language( + request.prompt, + "", + request.prompt, + ) + start = time.time() + try: + result = self.policy.generate( + prompt=prompt_text, + system=self._protected_system_prompt, + ) + except TypeError: + result = self.policy.generate(prompt_text) + elapsed = (time.time() - start) * 1000 + response_text = getattr(result, "text", None) or str(result) + protection = self._output_protector.validate(response_text) + prompt_used = getattr(result, "prompt_used", None) or prompt_text + system_used = getattr(result, "system_used", None) or self._protected_system_prompt + policy_model = getattr(self.policy, "model", None) + policy_model_str = str(policy_model) if policy_model is not None else None + record_llm_call( + self.logger, + None, + { + "cycle": 0, + "phase": "speculative_generate", + "module": "policy", + "action": "generate (speculative)", + "model": policy_model_str, + "started_at": int(start * 1000), + "duration_ms": elapsed, + "prompt": prompt_used, + "system_prompt": system_used or "", + "raw_response": response_text, + "sequence_in_cycle": 0, + }, + ) + return protection.cleaned + except Exception as e: + _LOG.warning( + "Speculative generation failed, will regenerate: %s", + e, + ) + return None + + def _run_speculative_overlap( + self, + request: ProcessedRequest, + ) -> tuple[str | None, RiskEstimation]: + """Run risk estimation and speculative draft generation in parallel. + + Returns ``(speculative_draft_or_None, risk_estimation)``. + If risk estimation raises, the exception propagates normally. + + Uses ``contextvars.copy_context()`` so that persistence context + (run_id, request_id, cycle) is available inside the worker threads. 
+ """ + import contextvars + from concurrent.futures import ThreadPoolExecutor + + ctx_risk = contextvars.copy_context() + ctx_spec = contextvars.copy_context() + + executor = ThreadPoolExecutor(max_workers=2) + try: + risk_fut = executor.submit(ctx_risk.run, self._estimate_risk, request) + spec_fut = executor.submit( + ctx_spec.run, + self._speculative_generate, + request, + ) + risk_estimation = risk_fut.result() + speculative_draft = spec_fut.result() + return speculative_draft, risk_estimation + finally: + executor.shutdown(wait=False) + def _handle_timeout(self, request: ProcessedRequest, error_msg: str, start_time: float) -> OrchestratorResult: processing_time = int((time.time() - start_time) * 1000) response = FinalResponse( @@ -447,6 +538,7 @@ def _route_benign( risk_estimation: RiskEstimationProtocol, start_time: float, trace: Trace, + speculative_draft: str | None = None, ) -> OrchestratorResult: request_id = request.request_id orch_debug_log( @@ -462,6 +554,7 @@ def _route_benign( start_time, decision=decision, decision_explanation=explanation, + speculative_draft=speculative_draft, ) fill_trace_from_result(trace, result) result.trace = trace @@ -515,6 +608,7 @@ def _route_fast_path( risk_estimation: RiskEstimationProtocol, start_time: float, trace: Trace, + speculative_draft: str | None = None, ) -> OrchestratorResult: request_id = request.request_id orch_debug_log( @@ -532,6 +626,7 @@ def _route_fast_path( decision=decision, constitution=constitution, decision_explanation=explanation, + speculative_draft=speculative_draft, ) result.path_taken = "fast" fill_trace_from_result(trace, result) @@ -675,6 +770,7 @@ def _route_deliberative( start_time: float, trace: Trace, pre_decision: Decision | None = None, + speculative_draft: str | None = None, ) -> OrchestratorResult: request_id = request.request_id orch_debug_log( @@ -691,6 +787,7 @@ def _route_deliberative( start_time, constrained_generation=constrained_generation, constitution=constitution, + 
speculative_draft=speculative_draft, ) modules_called: set[str] = set() if state.critiques: @@ -910,7 +1007,11 @@ def process(self, request: ProcessedRequest | str) -> OrchestratorResult: trace = self._trace_lifecycle.start_trace(request_id) try: - risk_estimation = self._estimate_risk(request) + speculative_draft: str | None = None + if self.config.enable_speculative_generation and self.policy is not None: + speculative_draft, risk_estimation = self._run_speculative_overlap(request) + else: + risk_estimation = self._estimate_risk(request) risk_score = risk_estimation.score if hasattr(risk_estimation, "score") else 0.5 risk_category = getattr(risk_estimation, "risk_category", None) constrained_generation = risk_category == RiskCategory.CLEARLY_HARMFUL @@ -1037,6 +1138,7 @@ def process(self, request: ProcessedRequest | str) -> OrchestratorResult: risk_proto, start_time, trace, + speculative_draft=speculative_draft, ) if route == "safe_complete": return self._route_safe_complete( @@ -1070,6 +1172,7 @@ def process(self, request: ProcessedRequest | str) -> OrchestratorResult: risk_proto, start_time, trace, + speculative_draft=speculative_draft, ) return self._route_deliberative( @@ -1081,6 +1184,7 @@ def process(self, request: ProcessedRequest | str) -> OrchestratorResult: start_time, trace, pre_decision=decision, + speculative_draft=speculative_draft, ) except OrchestratorTimeoutError as e: diff --git a/moralstack/orchestration/deliberation_runner.py b/moralstack/orchestration/deliberation_runner.py index ea39fd6..d8bebe3 100644 --- a/moralstack/orchestration/deliberation_runner.py +++ b/moralstack/orchestration/deliberation_runner.py @@ -56,6 +56,36 @@ _LOG = logging.getLogger(__name__) + +def _policy_llm_model_for_action(policy: Any, action: str) -> str | None: + """Effective OpenAI model name for policy generate vs rewrite (rewrite may use MORALSTACK_POLICY_REWRITE_MODEL).""" + if policy is None: + return None + if action == "rewrite": + rw = getattr(policy, 
"rewrite_model", None) + if rw is not None: + return str(rw) + m = getattr(policy, "model", None) + return str(m) if m is not None else None + + +def _module_model(module: Any) -> str | None: + """Return the OpenAI model name used by a cognitive module (critic, simulator, …). + + Each module stores its inner ``OpenAIPolicy`` as ``self.policy``; fall back + to ``module.model`` if present. + """ + if module is None: + return None + inner = getattr(module, "policy", None) + if inner is not None: + m = getattr(inner, "model", None) + if m is not None: + return str(m) + m = getattr(module, "model", None) + return str(m) if m is not None else None + + # Logical order within a deliberation cycle for journey/report display (sequence_in_cycle). SEQ_POLICY = 1 SEQ_CRITIC = 2 @@ -183,6 +213,7 @@ def run_benign_fast_path( *, decision: Decision, decision_explanation: DecisionExplanation | None = None, + speculative_draft: str | None = None, ) -> OrchestratorResult: """FAST PATH per operational_risk == NONE. 
Nessun modulo deliberativo.""" from moralstack.orchestration.diagnostics import orch_debug_log @@ -196,38 +227,65 @@ def run_benign_fast_path( ) if self.policy is not None: try: - prompt_text = resolve_prompt_with_language( - request.prompt, - risk_estimation.detected_language or "", - request.prompt, - ) - start_gen = time.time() - try: - result = self.policy.generate(prompt=prompt_text, system=self._protected_system_prompt) - except TypeError: - result = self.policy.generate(prompt_text) - elapsed = (time.time() - start_gen) * 1000 - response_text = _policy_text(result) - prompt_used = _policy_prompt_used(result, prompt_text) - system_used = _policy_system_used(result, self._protected_system_prompt or "") - record_llm_call( - self.logger, - None, - { - "cycle": 0, - "phase": "policy_generate", - "module": "policy", - "action": "generate (benign_fast_path)", - "started_at": int(start_gen * 1000), - "duration_ms": elapsed, - "prompt": prompt_used, - "system_prompt": system_used or "", - "raw_response": response_text, - "sequence_in_cycle": SEQ_POLICY, - }, - ) - protection_result = self._output_protector.validate(response_text) - content = protection_result.cleaned + if speculative_draft: + content = speculative_draft + record_llm_call( + self.logger, + None, + { + "cycle": 0, + "phase": "policy_generate", + "module": "policy", + "action": "generate (speculative-reuse," " benign_fast_path)", + "model": _policy_llm_model_for_action(self.policy, "generate"), + "duration_ms": 0.0, + "prompt": request.prompt[:200], + "raw_response": content[:200], + "sequence_in_cycle": SEQ_POLICY, + }, + ) + else: + prompt_text = resolve_prompt_with_language( + request.prompt, + risk_estimation.detected_language or "", + request.prompt, + ) + start_gen = time.time() + try: + result = self.policy.generate( + prompt=prompt_text, + system=self._protected_system_prompt, + ) + except TypeError: + result = self.policy.generate(prompt_text) + elapsed = (time.time() - start_gen) * 1000 + 
response_text = _policy_text(result) + prompt_used = _policy_prompt_used( + result, + prompt_text, + ) + system_used = _policy_system_used( + result, + self._protected_system_prompt or "", + ) + record_llm_call( + self.logger, + None, + { + "cycle": 0, + "phase": "policy_generate", + "module": "policy", + "action": "generate (benign_fast_path)", + "started_at": int(start_gen * 1000), + "duration_ms": elapsed, + "prompt": prompt_used, + "system_prompt": system_used or "", + "raw_response": response_text, + "sequence_in_cycle": SEQ_POLICY, + }, + ) + protection_result = self._output_protector.validate(response_text) + content = protection_result.cleaned except Exception as e: raise GenerationError(f"Generation failed: {e}") else: @@ -371,6 +429,7 @@ def run_fast_path( decision: Decision, constitution: Any | None = None, decision_explanation: DecisionExplanation | None = None, + speculative_draft: str | None = None, ) -> OrchestratorResult: """Path veloce: genera draft + quick check costituzionale; se fallisce passa a deliberative.""" @@ -384,73 +443,116 @@ def run_fast_path( request_id=request.request_id or "", ) if constitution is None and self.constitution_store is not None: - constitution = get_constitution_safe(self.constitution_store, request.get_domain()) + constitution = get_constitution_safe( + self.constitution_store, + request.get_domain(), + ) state = DeliberationState(cycle=0) if self.policy is not None: try: - start_gen = time.time() - prompt_text = resolve_prompt_with_language( - request.prompt, - risk_estimation.detected_language or "", - request.prompt, - ) - try: - result = self.policy.generate(prompt=prompt_text, system=self._protected_system_prompt) - except TypeError: - result = self.policy.generate(prompt_text) - elapsed = (time.time() - start_gen) * 1000 - response_text = _policy_text(result) - protection_result = self._output_protector.validate(response_text) - if protection_result.had_leakage: + if speculative_draft: + state.draft_response = 
speculative_draft + reuse_model = _policy_llm_model_for_action(self.policy, "generate") record_llm_call( self.logger, { - "module": "output_protection", - "action": "leakage_detected (fast_path)", - "prompt": f"Type: {protection_result.leakage_type}", - "response": f"Cleaned from {len(response_text)} to {len(protection_result.cleaned)} chars", + "module": "policy", + "action": "generate (speculative-reuse," " fast_path)", + "prompt": request.prompt[:200], + "response": speculative_draft[:200], "duration_ms": 0.0, + "model": reuse_model, }, { "cycle": 0, - "phase": "output_protection", - "module": "output_protection", - "action": "leakage_detected (fast_path)", + "phase": "policy_generate", + "module": "policy", + "action": "generate (speculative-reuse," " fast_path)", + "model": reuse_model, "duration_ms": 0.0, - "raw_response": { - "leakage_type": protection_result.leakage_type, - "original_len": len(response_text), - "cleaned_len": len(protection_result.cleaned), - "had_leakage": True, + "prompt": request.prompt[:200], + "raw_response": speculative_draft[:200], + "sequence_in_cycle": SEQ_POLICY, + }, + ) + else: + start_gen = time.time() + prompt_text = resolve_prompt_with_language( + request.prompt, + risk_estimation.detected_language or "", + request.prompt, + ) + try: + result = self.policy.generate( + prompt=prompt_text, + system=self._protected_system_prompt, + ) + except TypeError: + result = self.policy.generate(prompt_text) + elapsed = (time.time() - start_gen) * 1000 + response_text = _policy_text(result) + protection_result = self._output_protector.validate(response_text) + if protection_result.had_leakage: + record_llm_call( + self.logger, + { + "module": "output_protection", + "action": "leakage_detected" " (fast_path)", + "prompt": "Type: " f"{protection_result.leakage_type}", + "response": "Cleaned from " + f"{len(response_text)} to " + f"{len(protection_result.cleaned)}" + " chars", + "duration_ms": 0.0, + }, + { + "cycle": 0, + "phase": 
"output_protection", + "module": "output_protection", + "action": "leakage_detected" " (fast_path)", + "duration_ms": 0.0, + "raw_response": { + "leakage_type": protection_result.leakage_type, + "original_len": len(response_text), + "cleaned_len": len( + protection_result.cleaned, + ), + "had_leakage": True, + }, + "sequence_in_cycle": SEQ_POLICY, }, + ) + state.draft_response = protection_result.cleaned + prompt_used = _policy_prompt_used( + result, + prompt_text, + ) + system_used = _policy_system_used( + result, + self._protected_system_prompt, + ) + record_llm_call( + self.logger, + { + "module": "policy", + "action": "generate (fast_path)", + "prompt": request.prompt, + "response": state.draft_response, + "duration_ms": elapsed, + }, + { + "cycle": 0, + "phase": "policy_generate", + "module": "policy", + "action": "generate (fast_path)", + "started_at": int(start_gen * 1000), + "duration_ms": elapsed, + "prompt": prompt_used, + "system_prompt": system_used or "", + "raw_response": response_text, "sequence_in_cycle": SEQ_POLICY, }, ) - state.draft_response = protection_result.cleaned - prompt_used = _policy_prompt_used(result, prompt_text) - system_used = _policy_system_used(result, self._protected_system_prompt) - record_llm_call( - self.logger, - { - "module": "policy", - "action": "generate (fast_path)", - "prompt": request.prompt, - "response": state.draft_response, - "duration_ms": elapsed, - }, - { - "cycle": 0, - "phase": "policy_generate", - "module": "policy", - "action": "generate (fast_path)", - "started_at": int(start_gen * 1000), - "duration_ms": elapsed, - "prompt": prompt_used, - "system_prompt": system_used or "", - "raw_response": response_text, - "sequence_in_cycle": SEQ_POLICY, - }, - ) except Exception as e: raise GenerationError(f"Generation failed: {e}") else: @@ -460,7 +562,11 @@ def run_fast_path( quick_result = self.critic.quick_check(request.prompt, state.draft_response, constitution) if not quick_result.passed: state_delib, 
risk_score, outcome = self.run_deliberative_path( - request, risk_estimation, start_time, constitution=constitution + request, + risk_estimation, + start_time, + constitution=constitution, + speculative_draft=state.draft_response, ) return self._build_deliberative_result( request, @@ -627,16 +733,26 @@ def run_deliberative_path( *, constrained_generation: bool = False, constitution: Any | None = None, + speculative_draft: str | None = None, ) -> tuple[DeliberationState, float, ConvergenceOutcome]: """ Esegue cicli deliberativi. Restituisce (state, risk_score, outcome) per assemblaggio. L'unica autorità sul loop è outcome post-enforcement: "continue" non sopravvive a cicli esauriti. + + Args: + speculative_draft: Pre-generated draft from parallel overlap with + risk estimation. When provided *and* constrained_generation is + False, the draft is used as the cycle-1 starting point, + skipping the initial generation call. """ from moralstack.orchestration.diagnostics import orch_debug_log if constitution is None and self.constitution_store is not None: - constitution = get_constitution_safe(self.constitution_store, request.get_domain()) + constitution = get_constitution_safe( + self.constitution_store, + request.get_domain(), + ) request_id = request.request_id or "" orch_debug_log( "orchestrator.py:_deliberative_path", @@ -647,6 +763,13 @@ def run_deliberative_path( ) self._current_start_time = start_time state = DeliberationState(cycle=0) + # Pre-set speculative draft for cycle 1 when safe to do so. + # constrained_generation uses a different system prompt so the + # speculative draft (generated with the base prompt) is not suitable. 
+ if speculative_draft and not constrained_generation: + state.draft_response = sanitize_policy_output( + speculative_draft, + ) risk_score = risk_estimation.score max_cycles = self._effective_max_cycles(risk_estimation) # Constrained generation (clearly_harmful): the policy is already instructed to @@ -1146,9 +1269,40 @@ def _run_critique_simulate_perspectives_parallel( import moralstack.prompts.perspectives_prompt # noqa: F401 import moralstack.prompts.simulator_prompt # noqa: F401 - # STAGE 1: critic always runs first — it is the gate for the entire cycle. - # If it returns violated_hard=True, simulator and perspectives cannot change - # the outcome and would be pure waste. Run critic sequentially and short-circuit. + if self.config.parallel_critic_with_modules: + return self._run_full_parallel_evaluation( + state, + request, + delib_context=delib_context, + context_mode=context_mode, + risk_estimation=risk_estimation, + max_cycles=max_cycles, + constitution=constitution, + ) + + return self._run_critic_gated_parallel( + state, + request, + delib_context=delib_context, + context_mode=context_mode, + risk_estimation=risk_estimation, + max_cycles=max_cycles, + constitution=constitution, + ) + + def _run_critic_gated_parallel( + self, + state: DeliberationState, + request: ProcessedRequest, + *, + delib_context: DelibContext | None = None, + context_mode: str = "full", + risk_estimation: RiskEstimationProtocol | None = None, + max_cycles: int = 2, + constitution: Any | None = None, + ) -> DeliberationState: + """Original two-stage approach: critic runs first as a gate, then + simulator + perspectives run in parallel only if no hard violation.""" state = self._critique( state, request, @@ -1157,31 +1311,45 @@ def _run_critique_simulate_perspectives_parallel( constitution=constitution, ) if state.has_critical_violations or getattr(state.last_critique, "violated_hard", False): - # Hard violation confirmed: skip simulator and perspectives entirely. 
- # _deliberation_cycle will read state.has_critical_violations and set - # state.decision = REFUSE / REVISE immediately after we return. return state - # STAGE 2: critic returned PROCEED/REVISE with no hard violations. - # Simulator and perspectives are now meaningful — run them in parallel. - # Note: n_errors_after_critic is taken AFTER _critique so that fork() - # inherits critic errors and the slice correctly captures only new errors - # added by simulator / perspectives. n_errors_after_critic = len(state.errors) state2 = state.fork() state3 = state.fork() - def do_simulate(s: DeliberationState, r: ProcessedRequest) -> DeliberationState: + def do_simulate( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: if not self.config.enable_simulation or self.simulator is None: return s - if not self._should_run_simulator(s, risk_estimation, delib_context, s.cycle, max_cycles): - return s # Carry forward (s already has simulations from fork) - return self._simulate(s, r, delib_context=delib_context, context_mode=context_mode) + if not self._should_run_simulator( + s, + risk_estimation, + delib_context, + s.cycle, + max_cycles, + ): + return s + return self._simulate( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + ) - def do_perspectives(s: DeliberationState, r: ProcessedRequest) -> DeliberationState: + def do_perspectives( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: if not self.config.enable_perspectives or self.perspectives is None: return s - return self._evaluate_perspectives(s, r, delib_context=delib_context, context_mode=context_mode) + return self._evaluate_perspectives( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + ) ctx2 = contextvars.copy_context() ctx3 = contextvars.copy_context() @@ -1195,6 +1363,116 @@ def do_perspectives(s: DeliberationState, r: ProcessedRequest) -> DeliberationSt state.errors = list(state.errors) + list(s2.errors[n_errors_after_critic:]) + 
list(s3.errors[n_errors_after_critic:]) return state + def _run_full_parallel_evaluation( + self, + state: DeliberationState, + request: ProcessedRequest, + *, + delib_context: DelibContext | None = None, + context_mode: str = "full", + risk_estimation: RiskEstimationProtocol | None = None, + max_cycles: int = 2, + constitution: Any | None = None, + ) -> DeliberationState: + """Full parallel: critic, simulator, and perspectives all run + concurrently. On hard violation the sim/persp results are discarded, + paying extra LLM calls but saving wall-clock time in the common case + (no hard violation). Decision quality is identical: the convergence + logic sees exactly the same module outputs.""" + n_errors_before = len(state.errors) + state_critic = state.fork() + state_sim = state.fork() + state_persp = state.fork() + + def do_critique( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: + return self._critique( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + constitution=constitution, + ) + + def do_simulate( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: + if not self.config.enable_simulation or self.simulator is None: + return s + if not self._should_run_simulator( + s, + risk_estimation, + delib_context, + s.cycle, + max_cycles, + ): + return s + return self._simulate( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + ) + + def do_perspectives( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: + if not self.config.enable_perspectives or self.perspectives is None: + return s + return self._evaluate_perspectives( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + ) + + ctx_c = contextvars.copy_context() + ctx_s = contextvars.copy_context() + ctx_p = contextvars.copy_context() + executor = self._get_executor() + fut_c = executor.submit(ctx_c.run, do_critique, state_critic, request) + fut_s = executor.submit(ctx_s.run, do_simulate, 
state_sim, request) + fut_p = executor.submit(ctx_p.run, do_perspectives, state_persp, request) + + sc = fut_c.result() + ss = fut_s.result() + sp = fut_p.result() + + # Always merge critic results + state.critiques = sc.critiques + state.errors = list(state.errors) + list(sc.errors[n_errors_before:]) + + # Propagate critic signals into delib_context (matches sequential path) + if delib_context is not None and state.last_critique is not None: + critique = state.last_critique + delib_context.critic_decision = getattr(critique, "decision", "") or "" + delib_context.critic_violated_hard = bool(getattr(critique, "violated_hard", False)) + if getattr(critique, "violations", None): + delib_context.critic_violations_summary = "; ".join( + f"{v.principle_id}:{getattr(v, 'severity', 0)}" for v in critique.violations[:5] + ) + + hard_violation = state.has_critical_violations or getattr( + state.last_critique, + "violated_hard", + False, + ) + if hard_violation: + # Discard sim/persp results — critic authority prevails. + # Wall-clock time was not wasted (parallel execution). 
+ return state + + # No hard violation: merge sim + persp results + state.simulations = ss.simulations + state.perspectives = sp.perspectives + state._perspectives_aggregation = sp._perspectives_aggregation + state.errors = list(state.errors) + list(ss.errors[n_errors_before:]) + list(sp.errors[n_errors_before:]) + return state + def _apply_constitutional_perspective_override(self, state: DeliberationState) -> None: """Applica override costituzionale sulle prospettive quando il Critic rileva violazioni HARD.""" @@ -1239,6 +1517,7 @@ def _soft_revision_pass( state.soft_revision_guidance_used = guidance prompt_used = _policy_prompt_used(result, user_prompt_with_lang) system_used = _policy_system_used(result, self._protected_system_prompt) + soft_model = _policy_llm_model_for_action(self.policy, "rewrite") record_llm_call( self.logger, { @@ -1247,11 +1526,13 @@ def _soft_revision_pass( "prompt": f"Guidance: {guidance[:200]}", "response": state.draft_response[:200], "duration_ms": elapsed, + "model": soft_model, }, { "phase": "soft_revision", "module": "policy", "action": "soft_revision", + "model": soft_model, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": prompt_used, @@ -1276,7 +1557,33 @@ def _generate_or_revise( if self.policy is None: state.draft_response = f"[Mock response to: {request.prompt[:50]}...]" return state - # Use explicit language from Risk Estimator to reduce LLM non-compliance + # Speculative draft already present from parallel generation: + # skip redundant LLM call in cycle 1. 
+ if state.cycle == 1 and state.draft_response: + reuse_model = _policy_llm_model_for_action(self.policy, "generate") + record_llm_call( + self.logger, + { + "module": "policy", + "action": "generate (speculative-reuse)", + "prompt": request.prompt[:200], + "response": state.draft_response[:200], + "duration_ms": 0.0, + "model": reuse_model, + }, + { + "cycle": 1, + "phase": "policy_generate", + "module": "policy", + "action": "generate (speculative-reuse)", + "model": reuse_model, + "duration_ms": 0.0, + "prompt": request.prompt[:200], + "raw_response": state.draft_response[:200], + "sequence_in_cycle": SEQ_POLICY, + }, + ) + return state try: start = time.time() det_iso = risk_estimation.detected_language or "" @@ -1343,6 +1650,7 @@ def _generate_or_revise( state.draft_response = sanitize_policy_output(protection_result.cleaned) prompt_used = _policy_prompt_used(result, prompt_text) system_used = _policy_system_used(result, self._protected_system_prompt) + policy_model_label = _policy_llm_model_for_action(self.policy, action) record_llm_call( self.logger, { @@ -1351,11 +1659,13 @@ def _generate_or_revise( "prompt": prompt_text, "response": state.draft_response, "duration_ms": elapsed, + "model": policy_model_label, }, { "phase": "policy_generate" if action == "generate" else "policy_rewrite", "module": "policy", "action": action, + "model": policy_model_label, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": prompt_used, @@ -1429,6 +1739,7 @@ def _critique( nv = len(critique.violations) rg = (critique.revision_guidance[:100]) if critique.revision_guidance else "N/A" response_text = f"Violations: {nv}, Guidance: {rg}" + critic_model = _module_model(self.critic) record_llm_call( self.logger, { @@ -1437,11 +1748,13 @@ def _critique( "prompt": prompt_text, "response": response_text, "duration_ms": elapsed, + "model": critic_model, }, { "phase": "critic", "module": "critic", "action": "critique", + "model": critic_model, "started_at": int(start * 
1000), "duration_ms": elapsed, "prompt": getattr(critique, "prompt", None) or prompt_text, @@ -1528,6 +1841,7 @@ def _simulate( f"Expected valence: {ev:.2f}, Semantic harm: {sem_harm:.2f}, " f"Dominant harms: {dom_harms}, Worst harm: {worst}" ) + sim_model = _module_model(self.simulator) record_llm_call( self.logger, { @@ -1536,11 +1850,13 @@ def _simulate( "prompt": f"SIMULATION\nPrompt: {request.prompt}\nResponse: {state.draft_response}", "response": response_text, "duration_ms": elapsed, + "model": sim_model, }, { "phase": "simulator", "module": "simulator", "action": "simulate", + "model": sim_model, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": getattr(simulation, "prompt", ""), @@ -1633,6 +1949,7 @@ def _evaluate_hindsight( context_mode=context_mode, ) elapsed = (time.time() - start) * 1000 + hindsight_model = _module_model(self.hindsight) record_llm_call( self.logger, { @@ -1641,11 +1958,13 @@ def _evaluate_hindsight( "prompt": f"HINDSIGHT\nPrompt: {request.prompt}\nResponse: {state.draft_response}", "response": str(hindsight_result)[:200], "duration_ms": elapsed, + "model": hindsight_model, }, { "phase": "hindsight", "module": "hindsight", "action": "evaluate", + "model": hindsight_model, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": getattr(hindsight_result, "prompt", ""), @@ -1740,6 +2059,7 @@ def _evaluate_perspectives( raw_resp = "\n---\n".join(result.raw_responses or []) if getattr(result, "raw_responses", None) else "" prompts_list = getattr(result, "prompts", []) or [] system_list = getattr(result, "system_prompts", []) or [] + persp_model = _module_model(self.perspectives) record_llm_call( self.logger, { @@ -1748,11 +2068,13 @@ def _evaluate_perspectives( "prompt": f"PERSPECTIVES\nPrompt: {request.prompt}\nResponse: {state.draft_response}", "response": str(result)[:200], "duration_ms": elapsed, + "model": persp_model, }, { "phase": "perspectives", "module": "perspectives", "action": "evaluate", + "model": 
persp_model, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": "\n---\n".join(prompts_list) if prompts_list else "", diff --git a/moralstack/orchestration/types.py b/moralstack/orchestration/types.py index ff2aee8..47617de 100644 --- a/moralstack/orchestration/types.py +++ b/moralstack/orchestration/types.py @@ -500,6 +500,15 @@ class OrchestratorConfig: soft_revision_min_suggestions: int = 1 # Minimum total suggestions to trigger soft_revision_max_approval: float = 0.95 # Skip rewrite if weighted approval exceeds this early_exit_perspectives_threshold: float = 0.85 # Early exit se critic PROCEED + perspectives >= questo + # When True and parallel_module_calls is True, critic runs in parallel + # with simulator and perspectives instead of acting as a sequential gate. + # Hard violations are still honoured: sim/persp results are discarded when + # the critic finds a hard violation. Default True for latency savings. + parallel_critic_with_modules: bool = True + # When True, risk estimation and speculative draft generation run in + # parallel. The draft is used directly for benign/fast/deliberative + # routes and discarded on REFUSE. Zero impact on decision quality. + enable_speculative_generation: bool = True # ============================================================================= diff --git a/moralstack/persistence/db.py b/moralstack/persistence/db.py index d41aac5..34a2e66 100644 --- a/moralstack/persistence/db.py +++ b/moralstack/persistence/db.py @@ -750,6 +750,58 @@ def get_all_runs(limit: int = 100) -> list[dict[str, Any]]: return [] +def get_models_used_for_run(run_id: str) -> dict[str, str]: + """Returns a mapping of ``module/action`` → model actually used, from persisted llm_calls. + + Scans all llm_calls for the run and picks the first non-empty ``model`` value + per logical slot. Returned keys: ``policy_generate``, ``policy_rewrite``, + ``risk``, ``critic``, ``simulator``, ``hindsight``, ``perspectives``. 
+ Only slots with a non-empty model value appear in the result. + """ + path = get_db_path() + if not path: + return {} + try: + conn = _get_connection(path) + rows = conn.execute( + """ + SELECT module, action, model + FROM llm_calls + WHERE run_id = ? AND model IS NOT NULL AND model != '' + ORDER BY started_at + """, + (run_id,), + ).fetchall() + conn.close() + except Exception as e: + logger.warning("persistence: get_models_used_for_run failed: %s", e) + return {} + + result: dict[str, str] = {} + for r in rows: + module = (r["module"] or "").strip() + action = (r["action"] or "").strip().lower() + model = (r["model"] or "").strip() + if not model: + continue + if module == "policy": + if "rewrite" in action: + result.setdefault("policy_rewrite", model) + elif "generate" in action: + result.setdefault("policy_generate", model) + elif module == "risk_estimator": + result.setdefault("risk", model) + elif module == "critic": + result.setdefault("critic", model) + elif module == "simulator": + result.setdefault("simulator", model) + elif module == "hindsight": + result.setdefault("hindsight", model) + elif module == "perspectives": + result.setdefault("perspectives", model) + return result + + def delete_run(run_id: str) -> bool: """Deletes a run and all related data (CASCADE). Returns True on success.""" path = get_db_path() diff --git a/moralstack/prompts/_common.py b/moralstack/prompts/_common.py index 2f77f80..c485af1 100644 --- a/moralstack/prompts/_common.py +++ b/moralstack/prompts/_common.py @@ -12,6 +12,10 @@ # Costante standard per chiusura prompt JSON OUTPUT_JSON_ONLY = "Output ONLY valid JSON." +# Prompt-cache layout (FULL prompts): longest stable instruction/schema blocks MUST come first in the +# string sent to the provider; per-request fields (request, draft, risk) MUST come last so identical +# prefixes can hit automatic prompt caches (e.g. OpenAI prompt caching on repeated long prefixes). 
+ class ThinContextSections(TypedDict): """Sezioni contesto per thin mode (summary, key_points, risk_signals, change_log).""" diff --git a/moralstack/prompts/critic_prompt.py b/moralstack/prompts/critic_prompt.py index 847dc02..24ad3dd 100644 --- a/moralstack/prompts/critic_prompt.py +++ b/moralstack/prompts/critic_prompt.py @@ -52,11 +52,20 @@ {"decision":"PROCEED","violated_hard":false,"violations":[],"revision_guidance":""} """ -CRITIC_FULL_TEMPLATE = """Evaluate the response against these principles. Output valid JSON only. +# Static rules + JSON closure first (prompt-cache friendly); per-request fields last. +CRITIC_FULL_TEMPLATE = ( + """{rules} + +""" + + OUTPUT_JSON_ONLY + + """ + +TASK: Evaluate the response in TURN CONTEXT against the listed principles. Judge the RESPONSE content, not the REQUEST topic. PRINCIPLES: {principles} +TURN CONTEXT: RISK ASSESSMENT: {risk_signals} @@ -65,9 +74,8 @@ RESPONSE: {response} - -{rules} -""" + OUTPUT_JSON_ONLY +""" +) CRITIC_THIN_TEMPLATE = """Evaluate the response against these principles. Output valid JSON only. diff --git a/moralstack/prompts/hindsight_prompt.py b/moralstack/prompts/hindsight_prompt.py index d626027..7fd86d3 100644 --- a/moralstack/prompts/hindsight_prompt.py +++ b/moralstack/prompts/hindsight_prompt.py @@ -31,16 +31,41 @@ }} """ -HINDSIGHT_BATCH_FULL_TEMPLATE = """Evaluate this AI response considering multiple potential consequences. +# FULL: static evaluation/output schema first (prompt caching); consequences + turn context last. +HINDSIGHT_BATCH_FULL_TEMPLATE = ( + """For each consequence listed under POTENTIAL CONSEQUENCES below, + produce exactly one evaluation using the same scenario_id. 
+Output JSON: +{{ + "evaluations": [ + {{ + "scenario_id": "id of the consequence", + "safety": -1.0 to 1.0, + "helpfulness": -1.0 to 1.0, + "honesty": -1.0 to 1.0, + "harm_probability": 0.0 to 1.0, + "benefit_probability": 0.0 to 1.0, + "confidence": 0.0 to 1.0, + "rationale": "Brief explanation" + }} + ] +}} +""" + + OUTPUT_JSON_ONLY + + """ + +POTENTIAL CONSEQUENCES: +{consequences} + +TURN CONTEXT: REQUEST: {request} RESPONSE: {response} RISK CONTEXT: {risk_signals} - -{instructions} -""" + OUTPUT_JSON_ONLY +""" +) HINDSIGHT_BATCH_THIN_TEMPLATE = """Evaluate this AI response considering multiple potential consequences. @@ -82,10 +107,10 @@ def build_hindsight_prompt( if mode == "full": risk_signals = context.get_risk_signals_str() or "none" return HINDSIGHT_BATCH_FULL_TEMPLATE.format( + consequences=consequences_text, request=request, response=context.draft_text_full or "", risk_signals=risk_signals, - instructions=instructions, ) sections = build_thin_context_sections(context) diff --git a/moralstack/prompts/perspectives_prompt.py b/moralstack/prompts/perspectives_prompt.py index 81f3025..f3a8e36 100644 --- a/moralstack/prompts/perspectives_prompt.py +++ b/moralstack/prompts/perspectives_prompt.py @@ -47,24 +47,29 @@ Low risk. Evaluate helpfulness, accuracy, completeness normally. """ -# Full template: perspective block + REQUEST/RESPONSE + common (kept for build_perspectives_prompt) -PERSPECTIVE_FULL_TEMPLATE = """Evaluate this AI response from the perspective of: {perspective_name} +# Full template: static interpretation + JSON contract first (prompt caching); perspective + draft last. 
+PERSPECTIVE_FULL_TEMPLATE = RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS + """ + +Evaluate this AI response from the perspective of: {perspective_name} {perspective_instructions} +TURN CONTEXT: REQUEST: {request} RESPONSE: {response} RISK CONTEXT: {risk_signals} +""" -""" + RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS +# Thin template: same static-first layout as full; perspective + thin turn context last. +PERSPECTIVE_THIN_TEMPLATE = RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS + """ -# Thin template: perspective block + request + thin sections + common -PERSPECTIVE_THIN_TEMPLATE = """Evaluate this AI response from the perspective of: {perspective_name} +Evaluate this AI response from the perspective of: {perspective_name} {perspective_instructions} +TURN CONTEXT: REQUEST: {request} RESPONSE SUMMARY (compact): @@ -75,19 +80,23 @@ RISK CONTEXT: {risk_signals} {change_log_section} +""" -""" + RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS +# Shared system prompt body: static block first, then REQUEST/RESPONSE (full) or thin sections. 
+PERSPECTIVE_SYSTEM_FULL_BODY = RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS + """ -# Shared system prompt body: REQUEST + RESPONSE (or thin) + common instructions only (no perspective) -PERSPECTIVE_SYSTEM_FULL_BODY = """REQUEST: {request} +TURN CONTEXT: +REQUEST: {request} RESPONSE: {response} RISK CONTEXT: {risk_signals} +""" -""" + RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS +PERSPECTIVE_SYSTEM_THIN_BODY = RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS + """ -PERSPECTIVE_SYSTEM_THIN_BODY = """REQUEST: {request} +TURN CONTEXT: +REQUEST: {request} RESPONSE SUMMARY (compact): {response_summary} @@ -97,8 +106,7 @@ RISK CONTEXT: {risk_signals} {change_log_section} - -""" + RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS +""" def build_perspectives_system_prompt( diff --git a/moralstack/prompts/simulator_prompt.py b/moralstack/prompts/simulator_prompt.py index b10be8e..8880141 100644 --- a/moralstack/prompts/simulator_prompt.py +++ b/moralstack/prompts/simulator_prompt.py @@ -85,31 +85,16 @@ DEFAULT_DOMAIN_GUIDANCE = "Consider whether the intent is educational/protective vs exploitative." -# FULL template -SIMULATOR_FULL_TEMPLATE = ( +# FULL template: longest static prefix first (prompt caching); TURN PARAMETERS and draft last. +SIMULATOR_FULL_STATIC_PREFIX = ( """ -Generate {num_scenarios} potential consequences for this AI response. - -REQUEST: -{request} - -RESPONSE: -{response} - -RISK CONTEXT: -{risk_signals} - -DOMAIN CONTEXT: -{domain} - -DOMAIN-SPECIFIC GUIDANCE: - -{domain_guidance} +CONTRACT (fixed instructions; identical across requests when this prefix is reused): Requirements: -- Generate exactly {num_scenarios} consequences. -- Return exactly {num_scenarios} objects inside "consequences". +- Generate exactly N potential consequences for one AI response, where N is the integer num_scenarios in + TURN PARAMETERS below. +- Return exactly N objects inside "consequences". 
- Include a balanced mix: - at least one positive outcome (scenario_type: positive_outcome) @@ -162,6 +147,30 @@ """ ) +SIMULATOR_FULL_TEMPLATE = SIMULATOR_FULL_STATIC_PREFIX + """ + +--- + +TURN PARAMETERS: +num_scenarios: {num_scenarios} + +REQUEST: +{request} + +RESPONSE: +{response} + +RISK CONTEXT: +{risk_signals} + +DOMAIN CONTEXT: +{domain} + +DOMAIN-SPECIFIC GUIDANCE: + +{domain_guidance} +""" + # THIN template SIMULATOR_THIN_TEMPLATE = ( diff --git a/moralstack/reports/markdown_export.py b/moralstack/reports/markdown_export.py index 4a277fa..c8e4221 100644 --- a/moralstack/reports/markdown_export.py +++ b/moralstack/reports/markdown_export.py @@ -11,12 +11,14 @@ from __future__ import annotations import json +import os from datetime import datetime from typing import Any from moralstack.persistence.config import get_db_path from moralstack.persistence.db import ( get_decision_traces_for_request, + get_models_used_for_run, get_requests_for_run, get_run, ) @@ -121,6 +123,102 @@ def _build_benchmark_comparison_section(br: dict[str, Any]) -> str: return "\n".join(lines) +def format_models_used_markdown(models_cfg: dict[str, Any], *, primary_model: str = "gpt-4o") -> str: + """ + Renders the '### Models used' markdown table from a benchmark-style ``models_config`` dict. + Matches the benchmark report header (including parallel risk rows when present). 
+ """ + ms = models_cfg.get("moralstack") or {} + m = primary_model or "gpt-4o" + baseline = models_cfg.get("baseline", m) + judge = models_cfg.get("judge", m) + if ms.get("risk_parallel"): + risk_rows = ( + f"| **MoralStack risk** | parallel mini-estimators |\n" + f"| **MoralStack risk · intent** | {ms.get('risk_intent', '—')} |\n" + f"| **MoralStack risk · signals** | {ms.get('risk_signals', '—')} |\n" + f"| **MoralStack risk · operational** | {ms.get('risk_operational', '—')} |" + ) + else: + risk_rows = f"| **MoralStack risk** | {ms.get('risk', m)} |" + pr = ms.get("policy_rewrite", ms.get("policy", m)) + return f"""### Models used + +| Component | Model | +|-----------|-------| +| **Baseline** | {baseline} | +| **Judge** | {judge} | +| **MoralStack policy** | {ms.get('policy', m)} | +| **MoralStack policy (rewrite)** | {pr} | +{risk_rows} +| **MoralStack critic** | {ms.get('critic', m)} | +| **MoralStack simulator** | {ms.get('simulator', m)} | +| **MoralStack hindsight** | {ms.get('hindsight', m)} | +| **MoralStack perspectives** | {ms.get('perspectives', m)} | +""" + + +def _resolve_models_config_for_run(run_id: str) -> tuple[dict[str, Any], str]: + """ + Returns (models_config, primary_model) for a run. + + Priority: + 1. Benchmark JSON ``models_config`` (snapshot taken at benchmark time). + 2. Persisted ``llm_calls.model`` column (actual models used at run time). + 3. Env-vars fallback (last resort; may not match the run). 
+ """ + env_policy = (os.getenv("OPENAI_MODEL") or "gpt-4o").strip() + run = get_run(run_id) + run_type = (run.get("run_type") or "").strip().lower() if run else "" + + # --- Benchmark: prefer the embedded snapshot --- + if run_type == "benchmark": + br = load_benchmark_report(run_id) + if br and isinstance(br, dict): + mc = br.get("models_config") + if mc: + _overlay_db_models(mc, run_id) + return mc, (br.get("model") or env_policy) + model = br.get("model") or env_policy + bm = br.get("baseline_model") or model + jm = br.get("judge_model") or model + cfg = _get_benchmark_models_config_fallback(bm, jm, moralstack_policy_model=model) + _overlay_db_models(cfg, run_id) + return cfg, model + + # --- Single / interactive run: build from DB --- + db_models = get_models_used_for_run(run_id) + primary = db_models.get("policy_generate") or env_policy + ms: dict[str, Any] = { + "policy": primary, + "policy_rewrite": db_models.get("policy_rewrite", primary), + "risk": db_models.get("risk", primary), + "critic": db_models.get("critic", primary), + "simulator": db_models.get("simulator", primary), + "hindsight": db_models.get("hindsight", primary), + "perspectives": db_models.get("perspectives", primary), + } + cfg: dict[str, Any] = {"baseline": "—", "judge": "—", "moralstack": ms} + return cfg, primary + + +def _overlay_db_models(cfg: dict[str, Any], run_id: str) -> None: + """Patch *cfg* in-place with actually-used models from ``llm_calls``.""" + db_models = get_models_used_for_run(run_id) + if not db_models: + return + ms = cfg.get("moralstack") + if not isinstance(ms, dict): + return + if "policy_generate" in db_models: + ms["policy"] = db_models["policy_generate"] + if "policy_rewrite" in db_models: + ms["policy_rewrite"] = db_models["policy_rewrite"] + for key in ("risk", "critic", "simulator", "hindsight", "perspectives"): + if key in db_models: + ms[key] = db_models[key] + + def export_request_markdown(run_id: str, request_id: str) -> str: """ Exports a single 
request's deliberation report as markdown. @@ -138,7 +236,12 @@ def export_request_markdown(run_id: str, request_id: str) -> str: return "# Error: No database configured (MORALSTACK_DB_PATH)" return f"# Error: Request {request_id} not found in run {run_id}" - md = render_request_report(report) + models_cfg, primary_model = _resolve_models_config_for_run(run_id) + models_used_md = format_models_used_markdown(models_cfg, primary_model=primary_model) + md = render_request_report( + report, + models_used_section=f"---\n\n{models_used_md}", + ) if report.benchmark_result: md += "\n\n" + _build_benchmark_comparison_section(report.benchmark_result) @@ -203,11 +306,15 @@ def _get_benchmark_models_config_fallback( except ImportError: risk_m = critic_m = simulator_m = hindsight_m = perspectives_m = policy_fallback + rewrite_raw = (os.getenv("MORALSTACK_POLICY_REWRITE_MODEL") or "").strip() + policy_rewrite_m = rewrite_raw if rewrite_raw else policy_fallback + return { "baseline": baseline_model, "judge": judge_model, "moralstack": { "policy": policy_fallback, + "policy_rewrite": policy_rewrite_m, "risk": risk_m, "critic": critic_m, "simulator": simulator_m, @@ -278,24 +385,9 @@ def _build_benchmark_section_from_dict(report: dict, section_builder: str) -> st models_cfg = report.get("models_config") if not models_cfg: models_cfg = _get_benchmark_models_config_fallback(baseline_model, judge_model, moralstack_policy_model=model) - ms = models_cfg.get("moralstack", {}) models_block = "" - if ms: - models_block = f""" - -### Models used - -| Component | Model | -|-----------|-------| -| **Baseline** | {models_cfg.get('baseline', model)} | -| **Judge** | {models_cfg.get('judge', judge_model)} | -| **MoralStack policy** | {ms.get('policy', model)} | -| **MoralStack risk** | {ms.get('risk', model)} | -| **MoralStack critic** | {ms.get('critic', model)} | -| **MoralStack simulator** | {ms.get('simulator', model)} | -| **MoralStack hindsight** | {ms.get('hindsight', model)} | -| 
**MoralStack perspectives** | {ms.get('perspectives', model)} | -""" + if models_cfg.get("moralstack"): + models_block = "\n\n" + format_models_used_markdown(models_cfg, primary_model=model) + "\n" return f"""# 🧪 MoralStack Benchmark Report > **Report generated**: {ts} diff --git a/moralstack/reports/renderer_markdown.py b/moralstack/reports/renderer_markdown.py index 6bada13..70ce3f6 100644 --- a/moralstack/reports/renderer_markdown.py +++ b/moralstack/reports/renderer_markdown.py @@ -488,13 +488,19 @@ def render_request_footer(report: "RequestReport") -> str: """ -def render_request_report(report: "RequestReport") -> str: - """Render full request/deliberation report markdown from RequestReport.""" +def render_request_report(report: "RequestReport", *, models_used_section: str = "") -> str: + """Render full request/deliberation report markdown from RequestReport. + + Optional ``models_used_section``: markdown block (e.g. '### Models used' table) inserted + after the request header and before the executive summary (used by UI export). 
+ """ orch = render_orchestrator_observability(report) sections = [ render_request_header(report), - render_executive_summary(report), ] + if models_used_section and models_used_section.strip(): + sections.append(models_used_section.strip()) + sections.append(render_executive_summary(report)) if orch: sections.append(orch) sections.extend( diff --git a/moralstack/runtime/modules/critic_module.py b/moralstack/runtime/modules/critic_module.py index 48a8db6..99833c9 100644 --- a/moralstack/runtime/modules/critic_module.py +++ b/moralstack/runtime/modules/critic_module.py @@ -309,6 +309,7 @@ def __init__( temperature=self.config.temperature, top_p=self.config.top_p, stop_sequences=[], + response_format={"type": "json_object"}, ) def critique( @@ -500,6 +501,7 @@ def quick_check( max_tokens=256, temperature=0.1, top_p=self.config.top_p, + response_format={"type": "json_object"}, ) try: diff --git a/moralstack/runtime/modules/hindsight_module.py b/moralstack/runtime/modules/hindsight_module.py index c2d8ac1..e96e970 100644 --- a/moralstack/runtime/modules/hindsight_module.py +++ b/moralstack/runtime/modules/hindsight_module.py @@ -509,6 +509,7 @@ def __init__( temperature=self.config.temperature, top_p=self.config.top_p, stop_sequences=[], + response_format={"type": "json_object"}, ) # Pesi per HindsightScores diff --git a/moralstack/runtime/modules/perspective_module.py b/moralstack/runtime/modules/perspective_module.py index d8bba47..3f3535f 100644 --- a/moralstack/runtime/modules/perspective_module.py +++ b/moralstack/runtime/modules/perspective_module.py @@ -419,6 +419,7 @@ def __init__( temperature=self.config.temperature, top_p=self.config.top_p, stop_sequences=[], + response_format={"type": "json_object"}, ) def evaluate( @@ -883,8 +884,7 @@ def apply_constitutional_override( # Override: cap weighted_approval to 0.2 if inner.weighted_approval > 0.2: logging.getLogger(__name__).info( - "Perspective override applied due to HARD constitutional violation: " - 
"weighted_approval capped from %.2f to 0.20", + "Perspective override applied due to HARD constitutional violation: weighted_approval capped from %.2f to 0.20", inner.weighted_approval, ) inner.weighted_approval = min(inner.weighted_approval, 0.2) diff --git a/moralstack/runtime/modules/simulator_module.py b/moralstack/runtime/modules/simulator_module.py index 85d083c..dd8cd64 100644 --- a/moralstack/runtime/modules/simulator_module.py +++ b/moralstack/runtime/modules/simulator_module.py @@ -293,6 +293,7 @@ def __init__( temperature=self.config.temperature, top_p=self.config.top_p, stop_sequences=[], + response_format={"type": "json_object"}, ) def simulate( diff --git a/moralstack/ui/app.py b/moralstack/ui/app.py index 87423eb..249b030 100644 --- a/moralstack/ui/app.py +++ b/moralstack/ui/app.py @@ -444,6 +444,23 @@ def _group_calls_into_tiers_and_enrich(calls: list) -> list[list[dict]]: all_tiers = time_tiers + merged_seq_tiers all_tiers.sort(key=lambda t: min((c.get("started_at") or 0) for c in t)) + # ── Post-merge: collapse adjacent tiers that overlap in time ──────── + # With full-parallel evaluation (critic||sim||persp) or speculative + # overlap (risk||generate), modules from different static vtiers may + # actually run concurrently. Merge them so the UI shows a single + # parallel tier instead of misleading sequential steps. 
+ if len(all_tiers) > 1: + collapsed: list[list[dict]] = [all_tiers[0]] + for tier in all_tiers[1:]: + prev = collapsed[-1] + prev_max_end = max((c.get("started_at") or 0) + (c.get("duration_ms") or 0) for c in prev) + tier_min_start = min((c.get("started_at") or 0) for c in tier) + if tier_min_start < prev_max_end: + prev.extend(tier) + else: + collapsed.append(tier) + all_tiers = collapsed + # ── Enrich with timing info ────────────────────────────────────────── processed: list[list[dict]] = [] for tier in all_tiers: diff --git a/moralstack/ui/templates/run.html b/moralstack/ui/templates/run.html index d898307..7b75ef8 100644 --- a/moralstack/ui/templates/run.html +++ b/moralstack/ui/templates/run.html @@ -69,6 +69,10 @@

Models used

MoralStack policy {{ benchmark_report.models_config.moralstack.policy }} + + MoralStack policy (rewrite) + {{ benchmark_report.models_config.moralstack.policy_rewrite | default(benchmark_report.models_config.moralstack.policy, true) }} + {% if benchmark_report.models_config.moralstack.risk_parallel %} MoralStack risk diff --git a/moralstack/utils/openai_params.py b/moralstack/utils/openai_params.py index 44c9e65..0c03b22 100644 --- a/moralstack/utils/openai_params.py +++ b/moralstack/utils/openai_params.py @@ -2,6 +2,7 @@ OpenAI API parameter helpers. Single source of truth for model-specific params. Newer models (gpt-5.x, o-series) require max_completion_tokens instead of max_tokens. +Predicted output support is determined by model compatibility. """ from __future__ import annotations @@ -10,6 +11,18 @@ MODELS_REQUIRING_MAX_COMPLETION_TOKENS = ("o1", "o3", "o4", "gpt-5") +# Models that support the ``prediction`` parameter for speculative decoding. +# Predicted outputs are incompatible with max_completion_tokens, logprobs, +# and n > 1. Only models using the legacy max_tokens param qualify. +# Reference: https://platform.openai.com/docs/guides/predicted-outputs +MODELS_SUPPORTING_PREDICTED_OUTPUT = ( + "gpt-4o", + "gpt-4o-mini", + "gpt-4.1", + "gpt-4.1-mini", + "gpt-4.1-nano", +) + def uses_max_completion_tokens(model: str) -> bool: """True if model requires max_completion_tokens instead of max_tokens.""" @@ -17,6 +30,17 @@ def uses_max_completion_tokens(model: str) -> bool: return any(m.startswith(p) for p in MODELS_REQUIRING_MAX_COMPLETION_TOKENS) +def supports_predicted_output(model: str) -> bool: + """True if model supports the ``prediction`` parameter (speculative decoding). + + Predicted outputs speed up generation when the expected output is largely + similar to a known text (e.g. a draft revision). The feature is only + available on models that use the legacy ``max_tokens`` parameter. 
+ """ + m = (model or "").lower() + return any(m.startswith(p) for p in MODELS_SUPPORTING_PREDICTED_OUTPUT) + + def completion_tokens_param(model: str, max_tokens: int) -> dict[str, Any]: """Returns the correct param dict for chat.completions.create.""" if uses_max_completion_tokens(model): diff --git a/scripts/benchmark_moralstack.py b/scripts/benchmark_moralstack.py index 878c4f3..ae5cd89 100644 --- a/scripts/benchmark_moralstack.py +++ b/scripts/benchmark_moralstack.py @@ -1468,6 +1468,7 @@ def get_benchmark_models_config( Baseline uses MORALSTACK_BENCHMARK_BASELINE_MODEL exclusively. MoralStack modules use env vars (MORALSTACK_*_MODEL); if unset, fall back to moralstack_policy_model. + Includes `policy_rewrite` from `MORALSTACK_POLICY_REWRITE_MODEL` (or primary policy when unset). When parallel mini-estimators are enabled, includes risk_intent, risk_signals, risk_operational keys. """ @@ -1512,8 +1513,12 @@ def get_benchmark_models_config( except ImportError: risk_m = critic_m = simulator_m = hindsight_m = perspectives_m = policy_fallback + rewrite_raw = (os.getenv("MORALSTACK_POLICY_REWRITE_MODEL") or "").strip() + policy_rewrite_m = rewrite_raw if rewrite_raw else policy_fallback + moralstack_cfg: dict[str, Any] = { "policy": policy_fallback, + "policy_rewrite": policy_rewrite_m, "risk": risk_m, "critic": critic_m, "simulator": simulator_m, @@ -2813,6 +2818,7 @@ def _header(self, report: BenchmarkReport) -> str: | **Baseline** | {models_cfg["baseline"]} | | **Judge** | {models_cfg["judge"]} | | **MoralStack policy** | {ms["policy"]} | +| **MoralStack policy (rewrite)** | {ms["policy_rewrite"]} | {risk_rows} | **MoralStack critic** | {ms["critic"]} | | **MoralStack simulator** | {ms["simulator"]} |