Merged
1 change: 1 addition & 0 deletions .cursor/rules/project-overview.mdc
@@ -30,6 +30,7 @@ SAFE_COMPLETE is an intentional governance choice, **not an error**.
| `moralstack/runtime/` | Orchestrator, deliberation loop; simulator includes semantic layer (harm_type, semantic_expected_harm) that influences deliberation; simulator module with config_loader for env (MORALSTACK_SIMULATOR_*); hindsight module with config_loader for env (MORALSTACK_HINDSIGHT_*); perspective module with config_loader for env (MORALSTACK_PERSPECTIVES_*); critic module with config_loader for env (MORALSTACK_CRITIC_*) |
| `moralstack/runtime/decision/` | safe_complete_policy (single source of truth for action bounds, final_action) |
| `moralstack/runtime/trace/` | DecisionTrace, append_decision_trace for audit |
| `moralstack/models/policy.py` | `OpenAIPolicy` (generate / rewrite / refuse); env `OPENAI_*`; optional `MORALSTACK_POLICY_REWRITE_MODEL` for deliberative `rewrite()` (defaults to primary model when unset) |
| `moralstack/models/risk/` | Risk estimation (schema, estimator, calibration, parse_result, config_loader for env) |
| `moralstack/models/decision_explanation.py` | DecisionExplanation dataclass for explainability |
| `moralstack/models/reason_codes.py` | ReasonCode enum and policy-to-reason mapping |
13 changes: 10 additions & 3 deletions .env.minimal
@@ -12,6 +12,8 @@
# -----------------------------------------------------------------------------
OPENAI_API_KEY=<YOUR_KEY_HERE>
OPENAI_MODEL=gpt-4o
# Lighter model for policy rewrites (omit variable entirely to use OPENAI_MODEL)
MORALSTACK_POLICY_REWRITE_MODEL=gpt-4.1-nano
OPENAI_TEMPERATURE=0.1
OPENAI_TOP_P=0.8

@@ -52,7 +54,7 @@ MORALSTACK_UI_PASSWORD=admin
# -----------------------------------------------------------------------------
# See docs/modules/risk_estimator.md for full documentation of each variable.
# Model for the semantic judge (if set, overrides OPENAI_MODEL for risk only)
MORALSTACK_RISK_MODEL=gpt-4o
MORALSTACK_RISK_MODEL=gpt-4o-mini
# if MORALSTACK_RISK_PARALLEL_ESTIMATORS = true then the following models are used for parallel estimation
MORALSTACK_RISK_INTENT_MODEL=gpt-4o
MORALSTACK_RISK_SIGNALS_MODEL=gpt-4o-mini
@@ -111,7 +113,7 @@ MORALSTACK_CRITIC_INCLUDE_EXAMPLES=false
# -----------------------------------------------------------------------------
# See docs/modules/simulator.md for full documentation of each variable.
# Model for consequence simulator (if set, overrides OPENAI_MODEL for simulator only; used in run and benchmark)
MORALSTACK_SIMULATOR_MODEL=gpt-4o-mini
MORALSTACK_SIMULATOR_MODEL=gpt-4.1-nano
MORALSTACK_SIMULATOR_MAX_RETRIES=3
MORALSTACK_SIMULATOR_MAX_TOKENS=384
MORALSTACK_SIMULATOR_TEMPERATURE=0.1
@@ -125,7 +127,7 @@ MORALSTACK_SIMULATOR_ENABLE_CACHING=true
# -----------------------------------------------------------------------------
# See docs/modules/hindsight.md for full documentation of each variable.
# Model for hindsight evaluator (if set, overrides OPENAI_MODEL for hindsight only; used in run and benchmark)
MORALSTACK_HINDSIGHT_MODEL=gpt-4o
MORALSTACK_HINDSIGHT_MODEL=gpt-4o-mini
MORALSTACK_HINDSIGHT_MAX_RETRIES=3
MORALSTACK_HINDSIGHT_MAX_TOKENS=768
MORALSTACK_HINDSIGHT_TEMPERATURE=0.2
@@ -164,6 +166,11 @@ MORALSTACK_ORCHESTRATOR_ENABLE_HINDSIGHT_GATING=true
MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD=0.4
MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD=100
MORALSTACK_ORCHESTRATOR_BORDERLINE_REFUSE_UPPER=0.95
# When true and PARALLEL_MODULE_CALLS=true, critic runs in parallel with sim+persp instead of gating them sequentially.
# Hard violations are still honoured (sim/persp results discarded). Saves ~3.5s per cycle.
MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES=true
# When true, risk estimation and draft generation run in parallel (speculative overlap). Saves ~4s on deliberative path.
MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION=true

# -----------------------------------------------------------------------------
# Tracing & Debug
16 changes: 13 additions & 3 deletions .env.template
@@ -12,6 +12,11 @@
# -----------------------------------------------------------------------------
OPENAI_API_KEY=
OPENAI_MODEL=gpt-4o
# Model used for policy rewrite operations (cycle 2+).
# Uses a lighter model by default to reduce latency on revisions
# triggered by soft violations. Set to same value as OPENAI_MODEL
# to disable the downgrade.
MORALSTACK_POLICY_REWRITE_MODEL=gpt-4.1-nano
# OPENAI_BASE_URL=https://your-proxy.example.com/v1
# OPENAI_TIMEOUT_MS=60000
# OPENAI_MAX_RETRIES=3
@@ -55,7 +60,7 @@ MORALSTACK_UI_PASSWORD=
# -----------------------------------------------------------------------------
# See docs/modules/risk_estimator.md for full documentation of each variable.
# Model for the semantic judge (if set, overrides OPENAI_MODEL for risk only)
# MORALSTACK_RISK_MODEL=gpt-4o
# MORALSTACK_RISK_MODEL=gpt-4o-mini
# if MORALSTACK_RISK_PARALLEL_ESTIMATORS = true then the following models are used for parallel estimation
# MORALSTACK_RISK_INTENT_MODEL=gpt-4o
# MORALSTACK_RISK_SIGNALS_MODEL=gpt-4o-mini
@@ -115,7 +120,7 @@ MORALSTACK_UI_PASSWORD=
# -----------------------------------------------------------------------------
# See docs/modules/simulator.md for full documentation of each variable.
# Model for consequence simulator (if set, overrides OPENAI_MODEL for simulator only; used in run and benchmark)
# MORALSTACK_SIMULATOR_MODEL=gpt-4o-mini
# MORALSTACK_SIMULATOR_MODEL=gpt-4.1-nano
# MORALSTACK_SIMULATOR_MAX_RETRIES=3
# MORALSTACK_SIMULATOR_MAX_TOKENS=384
# MORALSTACK_SIMULATOR_TEMPERATURE=0.1
@@ -129,7 +134,7 @@ MORALSTACK_UI_PASSWORD=
# -----------------------------------------------------------------------------
# See docs/modules/hindsight.md for full documentation of each variable.
# Model for hindsight evaluator (if set, overrides OPENAI_MODEL for hindsight only; used in run and benchmark)
# MORALSTACK_HINDSIGHT_MODEL=gpt-4o
# MORALSTACK_HINDSIGHT_MODEL=gpt-4o-mini
# MORALSTACK_HINDSIGHT_MAX_RETRIES=3
# MORALSTACK_HINDSIGHT_MAX_TOKENS=768
# MORALSTACK_HINDSIGHT_TEMPERATURE=0.2
@@ -169,6 +174,11 @@ MORALSTACK_UI_PASSWORD=
# MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD=0.4
# MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD=100
# MORALSTACK_ORCHESTRATOR_BORDERLINE_REFUSE_UPPER=0.95
# When true and PARALLEL_MODULE_CALLS=true, critic runs in parallel with sim+persp instead of gating them sequentially.
# Hard violations are still honoured (sim/persp results discarded). Saves ~3.5s per cycle.
# MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES=true
# When true, risk estimation and draft generation run in parallel (speculative overlap). Saves ~4s on deliberative path.
# MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION=true

# -----------------------------------------------------------------------------
# Tracing & Debug
11 changes: 7 additions & 4 deletions INSTALL.md
@@ -80,6 +80,7 @@ See [docs/modules/openai_params.md](docs/modules/openai_params.md) for details a
|--------------------------------|---------------------------|----------------------------------------------------------------|
| OPENAI_API_KEY | - | OpenAI API key (required) |
| OPENAI_MODEL | gpt-4o | OpenAI model (see [Model compatibility](#model-compatibility)) |
| MORALSTACK_POLICY_REWRITE_MODEL | - (same as OPENAI_MODEL) | Policy `rewrite()` at cycle 2+; `.env.template` uses `gpt-4.1-nano`; set any lighter model to reduce latency (see [policy.md](docs/modules/policy.md)) |
| OPENAI_BASE_URL | - | Base URL (proxy/enterprise) |
| OPENAI_TIMEOUT_MS | 60000 | Timeout in milliseconds |
| OPENAI_MAX_RETRIES | 3 | Retries on 429/503 |
@@ -129,10 +130,12 @@ both CLI run and benchmark, hindsight configuration and model are read only from
CLI override — env is the single source of configuration.**

**Orchestrator**: Optional overrides (e.g. `MORALSTACK_ORCHESTRATOR_MAX_DELIBERATION_CYCLES`,
`MORALSTACK_ORCHESTRATOR_TIMEOUT_MS`, `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`, …) are listed in `.env.template`
and fully documented in [docs/modules/orchestrator.md](docs/modules/orchestrator.md#environment-variables). Leave them
commented to use built-in defaults. **In both CLI run and benchmark, orchestrator configuration is read only from the
environment (`.env`); there is no CLI override — env is the single source of configuration.**
`MORALSTACK_ORCHESTRATOR_TIMEOUT_MS`, `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`,
`MORALSTACK_ORCHESTRATOR_PARALLEL_MODULE_CALLS`, `MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES`,
`MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION`, …) are listed in `.env.template` and fully documented in
[docs/modules/orchestrator.md](docs/modules/orchestrator.md#environment-variables). Leave them commented to use built-in
defaults. **In both CLI run and benchmark, orchestrator configuration is read only from the environment (`.env`); there
is no CLI override — env is the single source of configuration.**

**Benchmark**: Optional overrides (e.g. `MORALSTACK_BENCHMARK_OUTPUTS`, `MORALSTACK_BENCHMARK_BASELINE_MODEL`,
`MORALSTACK_BENCHMARK_JUDGE_MODEL`) are listed in `.env.template` and fully documented
27 changes: 20 additions & 7 deletions README.md
@@ -87,8 +87,8 @@ Evaluated on 84 questions spanning adversarial prompts, dual-use domains, regula

| | Baseline | MoralStack | Tie |
|---|---|---|---|
| **Wins** | 4 | **54** | 26 |
| **Avg Safety Score** | 7.92/10 | **9.27/10** | — |
| **Wins** | 1 | **53** | 30 |
| **Avg Safety Score** | 7.73/10 | **9.39/10** | — |

### Decision Accuracy

@@ -109,9 +109,9 @@ REFUSE 0 0 22

| | Baseline | MoralStack |
|---|---|---|
| **Avg Latency** | ~10s | ~70s |
| **Avg Latency** | ~5s | ~60s |

Deliberative paths add latency by design (see [Limitations](#limitations--trade-offs)).
Deliberative paths add latency by design. Latency-reducing optimizations include speculative decoding, parallel risk estimation, and lighter models for the simulator and policy rewrite (see [Limitations](#limitations--trade-offs) and [Configuration](#configuration)).

## Quickstart

@@ -180,6 +180,7 @@ Environment is loaded via `moralstack/utils/env_loader.py`.
Key variables:

- `OPENAI_MODEL` (default `gpt-4o`)
- `MORALSTACK_POLICY_REWRITE_MODEL` (optional; model for deliberative `rewrite()` at cycle 2+; if unset, same as `OPENAI_MODEL`. `.env.template` sets `gpt-4.1-nano` for lower rewrite latency.)
- `OPENAI_TIMEOUT_MS` (default `60000`)
- `OPENAI_MAX_RETRIES` (default `3`)
- `OPENAI_TEMPERATURE` (code fallback default `0.7`; `.env.template` starter value `0.1`)
@@ -190,6 +191,18 @@ Key variables:

For full variable reference see [INSTALL.md](INSTALL.md) and `docs/modules/*.md`.

Default models by component (each can be overridden via its env var; see `INSTALL.md` and module docs):

| Component | Default model | Env variable |
|-----------|---------------|--------------|
| Policy (generation) | gpt-4o | `OPENAI_MODEL` |
| Policy (rewrite) | same as primary, or `gpt-4.1-nano` in `.env.template` | `MORALSTACK_POLICY_REWRITE_MODEL` |
| Risk estimator | follows `OPENAI_MODEL` unless set | `MORALSTACK_RISK_MODEL` |
| Critic | follows `OPENAI_MODEL` unless set | `MORALSTACK_CRITIC_MODEL` |
| Simulator | follows `OPENAI_MODEL` unless set | `MORALSTACK_SIMULATOR_MODEL` |
| Perspectives | follows `OPENAI_MODEL` unless set | `MORALSTACK_PERSPECTIVES_MODEL` |
| Hindsight | follows `OPENAI_MODEL` unless set | `MORALSTACK_HINDSIGHT_MODEL` |
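The fallback logic in the table above can be sketched as follows (a minimal illustration, assuming plain `os.environ` lookups; `model_for` and `COMPONENT_ENV_VARS` are hypothetical names, not actual MoralStack identifiers — policy rewrite has its own fallback and is handled separately):

```python
import os

# Per-component override variables from the table above.
COMPONENT_ENV_VARS = {
    "risk": "MORALSTACK_RISK_MODEL",
    "critic": "MORALSTACK_CRITIC_MODEL",
    "simulator": "MORALSTACK_SIMULATOR_MODEL",
    "perspectives": "MORALSTACK_PERSPECTIVES_MODEL",
    "hindsight": "MORALSTACK_HINDSIGHT_MODEL",
}

def model_for(component: str) -> str:
    """Resolve a component's model: its own env var wins when set and
    non-empty, otherwise it follows OPENAI_MODEL (default gpt-4o)."""
    override = os.environ.get(COMPONENT_ENV_VARS[component], "").strip()
    return override or os.environ.get("OPENAI_MODEL", "gpt-4o")
```

This mirrors the "follows `OPENAI_MODEL` unless set" convention: a profile like `.env.minimal` only needs to name the components it downgrades.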

## Running the Benchmark

```bash
@@ -239,11 +252,11 @@ Open [http://localhost:8765/](http://localhost:8765/) (or `MORALSTACK_UI_PORT`).

MoralStack makes deliberate trade-offs:

- **Latency over speed**: deliberative paths run multiple LLM calls (risk → critic → simulator → perspectives → hindsight). Average response time is ~70s vs ~10s for raw GPT-4o. This is a design choice — governance takes time.
- **Multi-model cost**: a single deliberative request makes 7-9 LLM calls. We use `gpt-4o-mini` for lower-stakes modules (simulator, perspectives) to reduce cost.
- **Latency over speed**: deliberative paths run multiple LLM calls (risk → critic → simulator → perspectives → hindsight). Average response time is ~60s vs ~5s for raw GPT-4o. This is a design choice — governance takes time.
- **Multi-model cost**: a single deliberative request makes 7-9 LLM calls. Example profiles: `.env.minimal` uses `gpt-4.1-nano` for policy rewrite and simulator, and `gpt-4o-mini` for perspectives (all overridable via env).
- **Benchmark scope**: 84 curated questions demonstrate the approach but do not cover all edge cases. We recommend running your own evaluations on domain-specific inputs.
- **LLM non-determinism**: despite low temperature settings across all modules, LLM outputs can vary between runs. The system includes deterministic guardrails in code to bound this variance, but perfect reproducibility is not guaranteed.

We are actively working on reducing latency through early-exit optimizations and context-mode switching.
Latency has been reduced through speculative decoding (predicted outputs for draft revisions), parallel risk estimation, lighter models for the simulator and rewrite (`gpt-4.1-nano`), structured JSON output enforcement, and soft-revision prompt constraints. Further optimizations (early-exit on low-risk queries, context-mode switching) are planned.

See full discussion in [docs/limitations_and_tradeoffs.md](docs/limitations_and_tradeoffs.md).
39 changes: 37 additions & 2 deletions docs/architecture_spec.md
@@ -150,6 +150,9 @@ class OrchestratorConfig:
enable_simulation: bool = True
enable_hindsight: bool = True
borderline_refuse_upper: float = 0.95 # Upper bound (inclusive) for borderline REFUSE deliberation
parallel_module_calls: bool = True
parallel_critic_with_modules: bool = True # *[impl]* critic || sim || persp when parallel_module_calls
enable_speculative_generation: bool = True # *[impl]* risk || speculative generate at controller entry

@dataclass
class RiskThresholds:
@@ -245,8 +248,34 @@ To reduce tokens and latency, the deliberative cycle supports:
cycles) to preserve revision context
- **Gating**: `enable_hindsight_gating` is true by default (hindsight only in final cycle; opt-out for legacy). `enable_simulator_gating` (opt-in) skips simulator when safe.
- **Trace**: optional fields `context_mode_by_module`, `modules_skipped` for reporting
- **Policy rewrite model**: deliberative `rewrite()` at cycle 2+ may use `MORALSTACK_POLICY_REWRITE_MODEL` (when unset,
same as `OPENAI_MODEL`) to reduce latency; initial `generate()` / speculative draft stays on the primary model.

See § Token Optimization above.

#### Policy rewrite model downgrade

When the critic triggers a revision on soft violations, the policy `rewrite` at cycle 2+ uses a configurable model
(`MORALSTACK_POLICY_REWRITE_MODEL`). If unset or empty, the primary `OPENAI_MODEL` is used (backward compatible). A
lighter default (for example `gpt-4.1-nano` in `.env.template`) reduces rewrite latency because the call runs under
explicit critic guidance and constrained-generation instructions; speculative first-pass generation remains on the
primary model for baseline quality. To disable the split, set `MORALSTACK_POLICY_REWRITE_MODEL` to the same value as
`OPENAI_MODEL`.
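As a sketch, the documented fallback amounts to the following (a hypothetical helper for illustration; the real configuration loading lives in the policy module):

```python
import os

def resolve_rewrite_model() -> str:
    """Pick the model for deliberative rewrite() calls at cycle 2+.

    MORALSTACK_POLICY_REWRITE_MODEL wins when set and non-empty;
    otherwise the primary OPENAI_MODEL is used (backward compatible).
    """
    rewrite_model = os.environ.get("MORALSTACK_POLICY_REWRITE_MODEL", "").strip()
    if rewrite_model:
        return rewrite_model
    return os.environ.get("OPENAI_MODEL", "gpt-4o")
```

Note that an empty or whitespace-only value behaves like unset, so commenting the variable out and setting it to `""` are equivalent.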

In benchmark testing, this optimization reduces rewrite step latency and, combined with
`gpt-4.1-nano` on the simulator, brings average deliberative latency from ~82s to ~60s
(~27% reduction) with no measurable compliance degradation (98.8% maintained).

#### Rewrite prompt constraints

To prevent lighter rewrite models from introducing new operational content during revision,
the rewrite system prompt includes explicit constraints:

- Do not add new examples, scenarios, or operational details not present in the original draft
- Focus on restructuring and reframing existing content based on critic feedback
- When feedback requests conceptual focus, remove operational specifics rather than adding new ones

These constraints are appended to the rewrite system prompt regardless of whether it comes from
the deliberation runner or uses the fallback default. They compensate for the tendency of smaller
models to "fill" revisions with new specifics rather than restructuring existing content.
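A minimal sketch of how such constraints might be appended (the constant text mirrors the bullets above; `build_rewrite_system_prompt` is a hypothetical name, not the actual function):

```python
from typing import Optional

# Constraint block appended to every rewrite system prompt.
REWRITE_CONSTRAINTS = (
    "\n\nRevision constraints:\n"
    "- Do not add new examples, scenarios, or operational details "
    "not present in the original draft.\n"
    "- Focus on restructuring and reframing existing content based "
    "on critic feedback.\n"
    "- When feedback requests conceptual focus, remove operational "
    "specifics rather than adding new ones."
)

def build_rewrite_system_prompt(base_prompt: Optional[str]) -> str:
    """Append the constraints to whichever system prompt is in use.

    `base_prompt` may come from the deliberation runner; when it is
    None, a fallback default is used — the constraints apply either way.
    """
    fallback = "Revise the draft response according to the critic feedback."
    return (base_prompt or fallback) + REWRITE_CONSTRAINTS
```

Appending unconditionally keeps the behavior uniform across the runner-supplied prompt and the fallback path.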

---

@@ -304,7 +333,9 @@ structured), not a lightweight classifier; the signals are semantic (e.g. `ethi
### 3.4 Policy LLM

**Responsibility**: Text generation (responses, revisions, refusals). *[impl]* The Orchestrator uses `generate` for
draft, `rewrite` for revisions guided by Critic/Hindsight/Simulator/Perspectives, `refuse` for refusals.
draft, `rewrite` for revisions guided by Critic/Hindsight/Simulator/Perspectives, `refuse` for refusals. Optional env
`MORALSTACK_POLICY_REWRITE_MODEL` selects the model for `rewrite()` only; `generate()` and `refuse()` use the primary
`OPENAI_MODEL` (see [Policy rewrite model downgrade](#policy-rewrite-model-downgrade) above).

```python
@dataclass
@@ -893,6 +924,10 @@ Refusal text is persisted as an LLM call with `action` containing `"refuse"` (e.
- Quick check: ~100ms
- Assembly: ~10ms

> **Actual measured performance** (benchmark, 84 questions): fast path average ~10-12s.
> Target values above reflect aspirational architecture without LLM call latency.
> Real-world fast path includes speculative generation (~5-8s) plus quick-check (~2-3s).

---

### 4.2 Sequence Diagram - Full Deliberation (risk ≥ 0.7)
22 changes: 15 additions & 7 deletions docs/limitations_and_tradeoffs.md
@@ -17,20 +17,28 @@
### 2. Latency and Computational Cost

When it activates the deliberative path, MoralStack introduces
significant computational overhead compared to a direct LLM call.
computational overhead compared to a direct LLM call (~60s average
vs ~5s for raw GPT-4o).

The system prioritizes:
The system prioritizes safety, decision correctness, and auditability
over pure latency. Recent optimizations (parallel risk estimation,
speculative decoding, lighter models for the simulator and policy rewrite)
have reduced average deliberative latency by ~27% from the initial
architecture.

- safety
- decision correctness
- auditability
Latency profile by path:

over pure latency.
- **Fast path** (benign queries, ~11% of traffic): ~10-12s
- **Deliberative path** (standard): ~45-60s
- **Deliberative sensitive** (regulated domains): ~70-85s

For this reason, MoralStack is not suitable for:

- high-frequency creative chat
- real-time systems with strict latency constraints
- real-time systems with strict sub-second latency constraints

Planned further optimizations include early-exit on low-risk deliberative
queries and context-mode switching for reduced token overhead.

### 3. SAFE_COMPLETE as Decision, not Error
