diff --git a/.cursor/rules/project-overview.mdc b/.cursor/rules/project-overview.mdc index 1b724ed..418f5c2 100644 --- a/.cursor/rules/project-overview.mdc +++ b/.cursor/rules/project-overview.mdc @@ -30,6 +30,7 @@ SAFE_COMPLETE is an intentional governance choice, **not an error**. | `moralstack/runtime/` | Orchestrator, deliberation loop; simulator includes semantic layer (harm_type, semantic_expected_harm) that influences deliberation; simulator module with config_loader for env (MORALSTACK_SIMULATOR_*); hindsight module with config_loader for env (MORALSTACK_HINDSIGHT_*); perspective module with config_loader for env (MORALSTACK_PERSPECTIVES_*); critic module with config_loader for env (MORALSTACK_CRITIC_*) | | `moralstack/runtime/decision/` | safe_complete_policy (single source of truth for action bounds, final_action) | | `moralstack/runtime/trace/` | DecisionTrace, append_decision_trace for audit | +| `moralstack/models/policy.py` | `OpenAIPolicy` (generate / rewrite / refuse); env `OPENAI_*`; optional `MORALSTACK_POLICY_REWRITE_MODEL` for deliberative `rewrite()` (defaults to primary model when unset) | | `moralstack/models/risk/` | Risk estimation (schema, estimator, calibration, parse_result, config_loader for env) | | `moralstack/models/decision_explanation.py` | DecisionExplanation dataclass for explainability | | `moralstack/models/reason_codes.py` | ReasonCode enum and policy-to-reason mapping | diff --git a/.env.minimal b/.env.minimal index 062381b..7adf8a8 100644 --- a/.env.minimal +++ b/.env.minimal @@ -12,6 +12,8 @@ # ----------------------------------------------------------------------------- OPENAI_API_KEY= OPENAI_MODEL=gpt-4o +# Lighter model for policy rewrites (omit variable entirely to use OPENAI_MODEL) +MORALSTACK_POLICY_REWRITE_MODEL=gpt-4.1-nano OPENAI_TEMPERATURE=0.1 OPENAI_TOP_P=0.8 @@ -52,7 +54,7 @@ MORALSTACK_UI_PASSWORD=admin # ----------------------------------------------------------------------------- # See 
docs/modules/risk_estimator.md for full documentation of each variable. # Model for the semantic judge (if set, overrides OPENAI_MODEL for risk only) -MORALSTACK_RISK_MODEL=gpt-4o +MORALSTACK_RISK_MODEL=gpt-4o-mini # if MORALSTACK_RISK_PARALLEL_ESTIMATORS = true then the following models are used for parallel estimation MORALSTACK_RISK_INTENT_MODEL=gpt-4o MORALSTACK_RISK_SIGNALS_MODEL=gpt-4o-mini @@ -111,7 +113,7 @@ MORALSTACK_CRITIC_INCLUDE_EXAMPLES=false # ----------------------------------------------------------------------------- # See docs/modules/simulator.md for full documentation of each variable. # Model for consequence simulator (if set, overrides OPENAI_MODEL for simulator only; used in run and benchmark) -MORALSTACK_SIMULATOR_MODEL=gpt-4o-mini +MORALSTACK_SIMULATOR_MODEL=gpt-4.1-nano MORALSTACK_SIMULATOR_MAX_RETRIES=3 MORALSTACK_SIMULATOR_MAX_TOKENS=384 MORALSTACK_SIMULATOR_TEMPERATURE=0.1 @@ -125,7 +127,7 @@ MORALSTACK_SIMULATOR_ENABLE_CACHING=true # ----------------------------------------------------------------------------- # See docs/modules/hindsight.md for full documentation of each variable. # Model for hindsight evaluator (if set, overrides OPENAI_MODEL for hindsight only; used in run and benchmark) -MORALSTACK_HINDSIGHT_MODEL=gpt-4o +MORALSTACK_HINDSIGHT_MODEL=gpt-4o-mini MORALSTACK_HINDSIGHT_MAX_RETRIES=3 MORALSTACK_HINDSIGHT_MAX_TOKENS=768 MORALSTACK_HINDSIGHT_TEMPERATURE=0.2 @@ -164,6 +166,11 @@ MORALSTACK_ORCHESTRATOR_ENABLE_HINDSIGHT_GATING=true MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD=0.4 MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD=100 MORALSTACK_ORCHESTRATOR_BORDERLINE_REFUSE_UPPER=0.95 +# When true and PARALLEL_MODULE_CALLS=true, critic runs in parallel with sim+persp instead of gating them sequentially. +# Hard violations are still honoured (sim/persp results discarded). Saves ~3.5s per cycle. 
+MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES=true +# When true, risk estimation and draft generation run in parallel (speculative overlap). Saves ~4s on deliberative path. +MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION=true # ----------------------------------------------------------------------------- # Tracing & Debug diff --git a/.env.template b/.env.template index e5a5dde..96d99e0 100644 --- a/.env.template +++ b/.env.template @@ -12,6 +12,11 @@ # ----------------------------------------------------------------------------- OPENAI_API_KEY= OPENAI_MODEL=gpt-4o +# Model used for policy rewrite operations (cycle 2+). +# Uses a lighter model by default to reduce latency on revisions +# triggered by soft violations. Set to same value as OPENAI_MODEL +# to disable the downgrade. +MORALSTACK_POLICY_REWRITE_MODEL=gpt-4.1-nano # OPENAI_BASE_URL=https://your-proxy.example.com/v1 # OPENAI_TIMEOUT_MS=60000 # OPENAI_MAX_RETRIES=3 @@ -55,7 +60,7 @@ MORALSTACK_UI_PASSWORD= # ----------------------------------------------------------------------------- # See docs/modules/risk_estimator.md for full documentation of each variable. # Model for the semantic judge (if set, overrides OPENAI_MODEL for risk only) -# MORALSTACK_RISK_MODEL=gpt-4o +# MORALSTACK_RISK_MODEL=gpt-4o-mini # if MORALSTACK_RISK_PARALLEL_ESTIMATORS = true then the following models are used for parallel estimation # MORALSTACK_RISK_INTENT_MODEL=gpt-4o # MORALSTACK_RISK_SIGNALS_MODEL=gpt-4o-mini @@ -115,7 +120,7 @@ MORALSTACK_UI_PASSWORD= # ----------------------------------------------------------------------------- # See docs/modules/simulator.md for full documentation of each variable. 
# Model for consequence simulator (if set, overrides OPENAI_MODEL for simulator only; used in run and benchmark) -# MORALSTACK_SIMULATOR_MODEL=gpt-4o-mini +# MORALSTACK_SIMULATOR_MODEL=gpt-4.1-nano # MORALSTACK_SIMULATOR_MAX_RETRIES=3 # MORALSTACK_SIMULATOR_MAX_TOKENS=384 # MORALSTACK_SIMULATOR_TEMPERATURE=0.1 @@ -129,7 +134,7 @@ MORALSTACK_UI_PASSWORD= # ----------------------------------------------------------------------------- # See docs/modules/hindsight.md for full documentation of each variable. # Model for hindsight evaluator (if set, overrides OPENAI_MODEL for hindsight only; used in run and benchmark) -# MORALSTACK_HINDSIGHT_MODEL=gpt-4o +# MORALSTACK_HINDSIGHT_MODEL=gpt-4o-mini # MORALSTACK_HINDSIGHT_MAX_RETRIES=3 # MORALSTACK_HINDSIGHT_MAX_TOKENS=768 # MORALSTACK_HINDSIGHT_TEMPERATURE=0.2 @@ -169,6 +174,11 @@ MORALSTACK_UI_PASSWORD= # MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD=0.4 # MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD=100 # MORALSTACK_ORCHESTRATOR_BORDERLINE_REFUSE_UPPER=0.95 +# When true and PARALLEL_MODULE_CALLS=true, critic runs in parallel with sim+persp instead of gating them sequentially. +# Hard violations are still honoured (sim/persp results discarded). Saves ~3.5s per cycle. +# MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES=true +# When true, risk estimation and draft generation run in parallel (speculative overlap). Saves ~4s on deliberative path. 
+# MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION=true # ----------------------------------------------------------------------------- # Tracing & Debug diff --git a/INSTALL.md b/INSTALL.md index 674f9d1..73f0f59 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -80,6 +80,7 @@ See [docs/modules/openai_params.md](docs/modules/openai_params.md) for details a |--------------------------------|---------------------------|----------------------------------------------------------------| | OPENAI_API_KEY | - | OpenAI API key (required) | | OPENAI_MODEL | gpt-4o | OpenAI model (see [Model compatibility](#model-compatibility)) | +| MORALSTACK_POLICY_REWRITE_MODEL | - (same as OPENAI_MODEL) | Policy `rewrite()` at cycle 2+; `.env.template` uses `gpt-4.1-nano`; set any lighter model to reduce latency (see [policy.md](docs/modules/policy.md)) | | OPENAI_BASE_URL | - | Base URL (proxy/enterprise) | | OPENAI_TIMEOUT_MS | 60000 | Timeout in milliseconds | | OPENAI_MAX_RETRIES | 3 | Retries on 429/503 | @@ -129,10 +130,12 @@ both CLI run and benchmark, hindsight configuration and model are read only from CLI override — env is the single source of configuration.** **Orchestrator**: Optional overrides (e.g. `MORALSTACK_ORCHESTRATOR_MAX_DELIBERATION_CYCLES`, -`MORALSTACK_ORCHESTRATOR_TIMEOUT_MS`, `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`, …) are listed in `.env.template` -and fully documented in [docs/modules/orchestrator.md](docs/modules/orchestrator.md#environment-variables). Leave them -commented to use built-in defaults. 
**In both CLI run and benchmark, orchestrator configuration is read only from the -environment (`.env`); there is no CLI override — env is the single source of configuration.** +`MORALSTACK_ORCHESTRATOR_TIMEOUT_MS`, `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`, +`MORALSTACK_ORCHESTRATOR_PARALLEL_MODULE_CALLS`, `MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES`, +`MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION`, …) are listed in `.env.template` and fully documented in +[docs/modules/orchestrator.md](docs/modules/orchestrator.md#environment-variables). Leave them commented to use built-in +defaults. **In both CLI run and benchmark, orchestrator configuration is read only from the environment (`.env`); there +is no CLI override — env is the single source of configuration.** **Benchmark**: Optional overrides (e.g. `MORALSTACK_BENCHMARK_OUTPUTS`, `MORALSTACK_BENCHMARK_BASELINE_MODEL`, `MORALSTACK_BENCHMARK_JUDGE_MODEL`) are listed in `.env.template` and fully documented diff --git a/README.md b/README.md index 16457c8..abdcfe4 100644 --- a/README.md +++ b/README.md @@ -87,8 +87,8 @@ Evaluated on 84 questions spanning adversarial prompts, dual-use domains, regula | | Baseline | MoralStack | Tie | |---|---|---|---| -| **Wins** | 4 | **54** | 26 | -| **Avg Safety Score** | 7.92/10 | **9.27/10** | — | +| **Wins** | 1 | **53** | 30 | +| **Avg Safety Score** | 7.73/10 | **9.39/10** | — | ### Decision Accuracy @@ -109,9 +109,9 @@ REFUSE 0 0 22 | | Baseline | MoralStack | |---|---|---| -| **Avg Latency** | ~10s | ~70s | +| **Avg Latency** | ~5s | ~60s | -Deliberative paths add latency by design (see [Limitations](#limitations--trade-offs)). +Deliberative paths add latency by design. Latency-reducing optimizations include speculative decoding, parallel risk estimation, lighter models for simulator and policy rewrite (see [Limitations](#limitations--trade-offs) and [Configuration](#configuration)). 
## Quickstart @@ -180,6 +180,7 @@ Environment is loaded via `moralstack/utils/env_loader.py`. Key variables: - `OPENAI_MODEL` (default `gpt-4o`) +- `MORALSTACK_POLICY_REWRITE_MODEL` (optional; model for deliberative `rewrite()` at cycle 2+; if unset, same as `OPENAI_MODEL`. `.env.template` sets `gpt-4.1-nano` for lower rewrite latency.) - `OPENAI_TIMEOUT_MS` (default `60000`) - `OPENAI_MAX_RETRIES` (default `3`) - `OPENAI_TEMPERATURE` (code fallback default `0.7`; `.env.template` starter value `0.1`) @@ -190,6 +191,18 @@ Key variables: For full variable reference see [INSTALL.md](INSTALL.md) and `docs/modules/*.md`. +Default models by component (each can be overridden via its env var; see `INSTALL.md` and module docs): + +| Component | Default model | Env variable | +|-----------|---------------|--------------| +| Policy (generation) | gpt-4o | `OPENAI_MODEL` | +| Policy (rewrite) | same as primary, or `gpt-4.1-nano` in `.env.template` | `MORALSTACK_POLICY_REWRITE_MODEL` | +| Risk estimator | follows `OPENAI_MODEL` unless set | `MORALSTACK_RISK_MODEL` | +| Critic | follows `OPENAI_MODEL` unless set | `MORALSTACK_CRITIC_MODEL` | +| Simulator | follows `OPENAI_MODEL` unless set | `MORALSTACK_SIMULATOR_MODEL` | +| Perspectives | follows `OPENAI_MODEL` unless set | `MORALSTACK_PERSPECTIVES_MODEL` | +| Hindsight | follows `OPENAI_MODEL` unless set | `MORALSTACK_HINDSIGHT_MODEL` | + ## Running the Benchmark ```bash @@ -239,11 +252,11 @@ Open [http://localhost:8765/](http://localhost:8765/) (or `MORALSTACK_UI_PORT`). MoralStack makes deliberate trade-offs: -- **Latency over speed**: deliberative paths run multiple LLM calls (risk → critic → simulator → perspectives → hindsight). Average response time is ~70s vs ~10s for raw GPT-4o. This is a design choice — governance takes time. -- **Multi-model cost**: a single deliberative request makes 7-9 LLM calls. We use `gpt-4o-mini` for lower-stakes modules (simulator, perspectives) to reduce cost. 
+- **Latency over speed**: deliberative paths run multiple LLM calls (risk → critic → simulator → perspectives → hindsight). Average response time is ~60s vs ~5s for raw GPT-4o. This is a design choice — governance takes time. +- **Multi-model cost**: a single deliberative request makes 7-9 LLM calls. Example profiles: `.env.minimal` uses `gpt-4.1-nano` for policy rewrite and simulator, and `gpt-4o-mini` for perspectives (all overridable via env). - **Benchmark scope**: 84 curated questions demonstrate the approach but do not cover all edge cases. We recommend running your own evaluations on domain-specific inputs. - **LLM non-determinism**: despite low temperature settings across all modules, LLM outputs can vary between runs. The system includes deterministic guardrails in code to bound this variance, but perfect reproducibility is not guaranteed. -We are actively working on reducing latency through early-exit optimizations and context-mode switching. +Latency has been reduced through speculative decoding (predicted outputs for draft revisions), parallel risk estimation, lighter models for simulator and rewrite (`gpt-4.1-nano`), structured JSON output enforcement, and soft-revision prompt constraints. Further optimizations (early-exit on low-risk queries, context-mode switching) are planned. See full discussion in [docs/limitations_and_tradeoffs.md](docs/limitations_and_tradeoffs.md). 
diff --git a/docs/architecture_spec.md b/docs/architecture_spec.md index e1f8ae8..59e43d7 100644 --- a/docs/architecture_spec.md +++ b/docs/architecture_spec.md @@ -150,6 +150,9 @@ class OrchestratorConfig: enable_simulation: bool = True enable_hindsight: bool = True borderline_refuse_upper: float = 0.95 # Upper bound (inclusive) for borderline REFUSE deliberation + parallel_module_calls: bool = True + parallel_critic_with_modules: bool = True # *[impl]* critic || sim || persp when parallel_module_calls + enable_speculative_generation: bool = True # *[impl]* risk || speculative generate at controller entry @dataclass class RiskThresholds: @@ -245,8 +248,34 @@ To reduce tokens and latency, the deliberative cycle supports: cycles) to preserve revision context - **Gating**: `enable_hindsight_gating` is true by default (hindsight only in final cycle; opt-out for legacy). `enable_simulator_gating` (opt-in) skips simulator when safe. - **Trace**: optional fields `context_mode_by_module`, `modules_skipped` for reporting +- **Policy rewrite model**: deliberative `rewrite()` at cycle 2+ may use `MORALSTACK_POLICY_REWRITE_MODEL` (when unset, + same as `OPENAI_MODEL`) to reduce latency; initial `generate()` / speculative draft stays on the primary model. -See § Token Optimization above. +#### Policy rewrite model downgrade + +When the critic triggers a revision on soft violations, the policy `rewrite` at cycle 2+ uses a configurable model +(`MORALSTACK_POLICY_REWRITE_MODEL`). If unset or empty, the primary `OPENAI_MODEL` is used (backward compatible). A +lighter default (for example `gpt-4.1-nano` in `.env.template`) reduces rewrite latency because the call runs under +explicit critic guidance and constrained-generation instructions; speculative first-pass generation remains on the +primary model for baseline quality. To disable the split, set `MORALSTACK_POLICY_REWRITE_MODEL` to the same value as +`OPENAI_MODEL`. 
+ +In benchmark testing, this optimization reduces rewrite step latency and, combined with +`gpt-4.1-nano` on the simulator, brings average deliberative latency from ~82s to ~60s +(~27% reduction) with no measurable compliance degradation (98.8% maintained). + +#### Rewrite prompt constraints + +To prevent lighter rewrite models from introducing new operational content during revision, +the rewrite system prompt includes explicit constraints: + +- Do not add new examples, scenarios, or operational details not present in the original draft +- Focus on restructuring and reframing existing content based on critic feedback +- When feedback requests conceptual focus, remove operational specifics rather than adding new ones + +These constraints are appended to the rewrite system prompt regardless of whether it comes from +the deliberation runner or uses the fallback default. They compensate for the tendency of smaller +models to "fill" revisions with new specifics rather than restructuring existing content. --- @@ -304,7 +333,9 @@ strutturato), non un classificatore leggero; i segnali sono semantici (es. `ethi ### 3.4 Policy LLM **Responsibility**: Text generation (responses, revisions, refusals). *[impl]* The Orchestrator uses `generate` for -draft, `rewrite` for revisions guided by Critic/Hindsight/Simulator/Perspectives, `refuse` for refusals. +draft, `rewrite` for revisions guided by Critic/Hindsight/Simulator/Perspectives, `refuse` for refusals. Optional env +`MORALSTACK_POLICY_REWRITE_MODEL` selects the model for `rewrite()` only; `generate()` and `refuse()` use the primary +`OPENAI_MODEL` (see [Policy rewrite model downgrade](#policy-rewrite-model-downgrade) above). ```python @dataclass @@ -893,6 +924,10 @@ Refusal text is persisted as an LLM call with `action` containing `"refuse"` (e. - Quick check: ~100ms - Assembly: ~10ms +> **Actual measured performance** (benchmark, 84 questions): fast path average ~10-12s. 
+> Target values above reflect aspirational architecture without LLM call latency. +> Real-world fast path includes speculative generation (~5-8s) plus quick-check (~2-3s). + --- ### 4.2 Sequence Diagram - Full Deliberation (risk ≥ 0.7) diff --git a/docs/limitations_and_tradeoffs.md b/docs/limitations_and_tradeoffs.md index b8efc35..4d90bab 100644 --- a/docs/limitations_and_tradeoffs.md +++ b/docs/limitations_and_tradeoffs.md @@ -17,20 +17,28 @@ to the risk of underestimating vulnerability situations. ### 2. Latency and Computational Cost When it activates the deliberative path, MoralStack introduces -significant computational overhead compared to a direct LLM call. +computational overhead compared to a direct LLM call (~60s average +vs ~5s for raw GPT-4o). -The system prioritizes: +The system prioritizes safety, decision correctness, and auditability +over pure latency. Recent optimizations (parallel risk estimation, +speculative decoding, lighter models for simulator and policy rewrite) +have reduced average deliberative latency by ~26% from the initial +architecture. -- safety -- decision correctness -- auditability +Latency profile by path: -over pure latency. +- **Fast path** (benign queries, ~11% of traffic): ~10-12s +- **Deliberative path** (standard): ~45-60s +- **Deliberative sensitive** (regulated domains): ~70-85s For this reason, MoralStack is not suitable for: - high-frequency creative chat -- real-time systems with strict latency constraints +- real-time systems with strict sub-second latency constraints + +Planned further optimizations include early-exit on low-risk deliberative +queries and context-mode switching for reduced token overhead. ### 3. SAFE_COMPLETE as Decision, not Error diff --git a/docs/modules/benchmark.md b/docs/modules/benchmark.md index 5683280..ae2e89e 100644 --- a/docs/modules/benchmark.md +++ b/docs/modules/benchmark.md @@ -55,6 +55,10 @@ All benchmark configuration can be overridden via `.env`. Variables are read whe 1. 
`--model` / `-m` (CLI) 2. `gpt-4o` (default) +**MoralStack policy rewrite** (env only; not set via CLI): `MORALSTACK_POLICY_REWRITE_MODEL` selects the model for +`rewrite()` in deliberation (cycle 2+). If unset, the rewrite uses the same model as primary policy generation. Report +`models_config.moralstack.policy_rewrite` reflects the effective value. + **Judge**: 1. `MORALSTACK_BENCHMARK_JUDGE_MODEL` (if set in `.env`) @@ -84,8 +88,8 @@ With `MORALSTACK_BENCHMARK_BASELINE_MODEL=gpt-4o` in `.env`, the baseline always ## Report and UI The benchmark report JSON (`benchmark_{run_id}.json`) stores `model`, `judge_model`, and `models_config` (baseline, -judge, MoralStack modules: policy, risk, critic, simulator, hindsight, perspectives). The markdown export and the UI -display these models clearly in the report header and on the run detail page. +judge, MoralStack modules: policy, policy_rewrite, risk, critic, simulator, hindsight, perspectives). The markdown +export and the UI display these models clearly in the report header and on the run detail page. **UI and export requirement**: The file `benchmark_{run_id}.json` in `MORALSTACK_BENCHMARK_OUTPUTS` is **required** for the moralstack-ui to display the full benchmark summary (Executive Summary, FP/FN, per-question baseline/moralstack diff --git a/docs/modules/critic.md b/docs/modules/critic.md index 386b6a9..089539f 100644 --- a/docs/modules/critic.md +++ b/docs/modules/critic.md @@ -170,6 +170,8 @@ variables.** - **Type**: int (>= 1) - **Meaning**: Number of parse attempts for the critic JSON response before raising an error. +Structured critic output uses OpenAI's `json_object` response format (`response_format={"type": "json_object"}` on `GenerationConfig`), which guarantees valid JSON and greatly reduces retries caused by malformed JSON. 
+ #### MORALSTACK_CRITIC_MAX_TOKENS - **Default**: `384` diff --git a/docs/modules/hindsight.md b/docs/modules/hindsight.md index 3cc2e54..70c330b 100644 --- a/docs/modules/hindsight.md +++ b/docs/modules/hindsight.md @@ -247,6 +247,8 @@ hindsight configuration is the single source of configuration — no CLI or code - **Type**: int (>= 1) - **Description**: Number of parse attempts for the hindsight JSON response before raising an error. +Hindsight evaluation uses OpenAI's `json_object` response format (`response_format={"type": "json_object"}` on `GenerationConfig`), which guarantees valid JSON and greatly reduces retries caused by malformed JSON. + #### MORALSTACK_HINDSIGHT_MAX_TOKENS - **Default**: `768` diff --git a/docs/modules/openai_params.md b/docs/modules/openai_params.md index 149d0a7..56fba08 100644 --- a/docs/modules/openai_params.md +++ b/docs/modules/openai_params.md @@ -84,6 +84,41 @@ response = client.chat.completions.create( --- +## Predicted Output Support + +Some models support the `prediction` parameter, which enables speculative decoding for faster generation when the +expected output is largely similar to a known text (e.g. a draft revision in `rewrite()`). + +### `MODELS_SUPPORTING_PREDICTED_OUTPUT` + +Tuple of model name prefixes that support the `prediction` parameter: + +```python +MODELS_SUPPORTING_PREDICTED_OUTPUT = ( + "gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", +) +``` + +### `supports_predicted_output(model: str) -> bool` + +Returns `True` if the model supports speculative decoding via predicted outputs. + +```python +from moralstack.utils.openai_params import supports_predicted_output + +supports_predicted_output("gpt-4o") # True +supports_predicted_output("gpt-4.1") # True +supports_predicted_output("o3-mini") # False (uses max_completion_tokens) +supports_predicted_output("gpt-5.2") # False +``` + +**Constraints:** Predicted outputs are incompatible with `max_completion_tokens`, `logprobs`, and `n > 1`. 
The +`rewrite()` method in `OpenAIPolicy` uses this automatically — no caller changes needed. + +**Reference:** [OpenAI Predicted Outputs](https://platform.openai.com/docs/guides/predicted-outputs) + +--- + ## Updating the Model List When OpenAI releases new models that require `max_completion_tokens`, update the tuple in @@ -93,6 +128,12 @@ When OpenAI releases new models that require `max_completion_tokens`, update the MODELS_REQUIRING_MAX_COMPLETION_TOKENS = ("o1", "o3", "o4", "gpt-5", "gpt-6") # add new prefix ``` +When OpenAI adds predicted output support to new models, update: + +```python +MODELS_SUPPORTING_PREDICTED_OUTPUT = ("gpt-4o", "gpt-4o-mini", "gpt-4.1", ...) # add new prefix +``` + **Rules:** - Use the **shortest unique prefix** that identifies the model family (e.g. `gpt-5` matches `gpt-5.2`, `gpt-5.1`, @@ -109,7 +150,8 @@ deprecated and not compatible with o-series models. All modules that call the OpenAI Chat Completions API use this utility: -- **Policy LLM** (`moralstack/models/policy.py`) — `_complete()` +- **Policy LLM** (`moralstack/models/policy.py`) — `_complete()` uses `completion_tokens_param` and + `supports_predicted_output` (the latter for `rewrite()` speculative decoding) - **Benchmark** (`scripts/benchmark_moralstack.py`) — `OpenAIClient.generate()`, `_generate_with_model()` - **Constitution Retriever** (`moralstack/constitution/retriever.py`) — direct `client.chat.completions.create` calls (used by store) - **Runtime modules** (critic, perspective, hindsight, simulator, risk estimator) — via policy diff --git a/docs/modules/orchestrator.md b/docs/modules/orchestrator.md index d576e30..137cfb4 100644 --- a/docs/modules/orchestrator.md +++ b/docs/modules/orchestrator.md @@ -119,6 +119,26 @@ config = OrchestratorConfig( | `min_hindsight_score` | 0.8 | Minimum hindsight score for convergence | | `borderline_refuse_upper` | 0.95 | Upper bound for borderline REFUSE deliberation | | `early_exit_perspectives_threshold` | 0.85 | Weighted 
approval threshold for early exit (critic PROCEED path) | +| `parallel_critic_with_modules` | `True` | When `True` and `parallel_module_calls` is `True`, the critic runs in parallel with the simulator and perspectives instead of as a sequential gate. See [Latency-oriented parameters](#latency-oriented-parameters). | +| `enable_speculative_generation` | `True` | When `True`, risk estimation and speculative draft generation run in parallel before routing. The draft is reused on benign, fast, and deliberative routes when applicable. See [Latency-oriented parameters](#latency-oriented-parameters). | + +### Latency-oriented parameters + +These flags reduce wall-clock latency **without changing routing policy** (`decide_action`, `get_route`, overlay floors, or convergence invariants). They do not change how `final_action` is computed from risk and module outputs. + +**`parallel_critic_with_modules` (default `True`)** + +- **Requires** `parallel_module_calls=True`. When both are `True`, each deliberation cycle runs **critic**, **simulator**, and **perspectives** concurrently (three parallel LLM calls per cycle when those modules are enabled). +- **When `False`**: the runner uses a two-stage layout: critic runs first; only if there is no hard violation do simulator and perspectives run in parallel. This avoids paying for simulator/perspective calls when the critic would reject the draft, but adds sequential critic latency before sim/persp start. +- **Hard violations**: If the critic reports a hard violation, simulator and perspective results from that cycle are **discarded** and do not affect merged state. Convergence and refusal logic see the same critic outcome as in the gated layout; you may pay extra token cost in the rare hard-violation case. +- **Set to `false`** if you prioritize minimizing LLM spend on hard-violation paths over latency. 
+ +**`enable_speculative_generation` (default `True`)** + +- **When `True`**: `OrchestrationController` starts **risk estimation** and a **speculative policy `generate`** (same base system prompt as normal first-pass generation) in parallel. After risk returns, routing proceeds as usual; the speculative draft is **not** used for policy routing decisions. +- **Reuse**: On benign fast path, fast path, and deliberative path, the draft is reused when it is still valid (skipping a duplicate first `generate` where implemented). On **REFUSE**, the speculative call is unused (wasted latency/token trade-off). **`SAFE_COMPLETE`** path does not reuse this draft (different system instructions). +- **Constrained generation** (`CLEARLY_HARMFUL` deliberation): the speculative draft is **not** applied as cycle-1 output; the constrained system prompt is used instead. +- **Note**: Speculative generation uses language resolution **before** the risk estimator’s `detected_language` is available (fallback path). Routing and safety decisions are unchanged; draft wording may differ slightly from a strictly sequential generate-after-risk for the same request. ### Borderline REFUSE Upper Bound @@ -178,7 +198,7 @@ For low-risk requests, the system uses an optimized path: Request → Risk Estimation (< 0.3) → Direct Generation → Response ``` -**Typical latency**: 300-500ms +**Typical latency**: ~10-12s (benchmark, 84 questions) ### Deliberative Path (risk ≥ 0.3) @@ -191,7 +211,9 @@ For significant-risk requests: 5. **Hindsight Evaluation**: Retrospective evaluation 6. **Convergence Check**: Verify termination criteria -**Typical latency**: 2-30 seconds (depends on number of cycles) +**Typical latency**: Deliberative path averages ~45-60s for standard queries, +~70-85s for sensitive-domain queries (1 cycle ~35s, 2 cycles ~65s). +Fast path averages ~10-12s. --- @@ -427,11 +449,30 @@ There is no dedicated model for the orchestrator (it is not an LLM module). 
- **Default**: `true` - **Type**: bool -- **Description**: When true, critic/simulator/perspectives run in parallel within each cycle. Their LLM calls are - persisted (with run/request/cycle context) and appear in moralstack-ui; `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`, +- **Description**: When true, the deliberation runner uses parallel execution for module calls within each cycle (see + also `MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES`). LLM calls are persisted (with run/request/cycle context) + and appear in moralstack-ui; `MORALSTACK_ORCHESTRATOR_ENABLE_PERSPECTIVES`, `MORALSTACK_ORCHESTRATOR_ENABLE_SIMULATION`, and `MORALSTACK_ORCHESTRATOR_ENABLE_HINDSIGHT` determine which modules run and thus which calls are recorded and visible in the UI. +#### MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES + +- **Default**: `true` +- **Type**: bool +- **Description**: When `true` and `MORALSTACK_ORCHESTRATOR_PARALLEL_MODULE_CALLS` is `true`, the critic runs in parallel + with the simulator and perspectives (full parallel evaluation). When `false`, the critic runs first as a gate; simulator + and perspectives run in parallel only after the critic reports no hard violation. See + [Latency-oriented parameters](#latency-oriented-parameters). + +#### MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION + +- **Default**: `true` +- **Type**: bool +- **Description**: When `true`, risk estimation and speculative first-pass draft generation run in parallel at the start + of `process()`. The draft may be reused on benign, fast, and deliberative routes; it is not used for routing. When + `false`, risk estimation runs alone, then generation follows the previous sequential pattern. See + [Latency-oriented parameters](#latency-oriented-parameters). 
+ ### Risk thresholds (path routing) #### MORALSTACK_ORCHESTRATOR_RISK_LOW_THRESHOLD diff --git a/docs/modules/perspectives.md b/docs/modules/perspectives.md index 85d99d0..9c448c8 100644 --- a/docs/modules/perspectives.md +++ b/docs/modules/perspectives.md @@ -157,6 +157,8 @@ overrides these variables.** - **Type**: int (≥ 1) - **Meaning**: Number of parse attempts per perspective JSON response before marking that perspective as failed. +Perspective evaluation uses OpenAI's `json_object` response format (`response_format={"type": "json_object"}` on `GenerationConfig`), which guarantees valid JSON and greatly reduces retries caused by malformed JSON. + #### MORALSTACK_PERSPECTIVES_MAX_TOKENS - **Default**: `512` diff --git a/docs/modules/policy.md b/docs/modules/policy.md index 357baba..6671746 100644 --- a/docs/modules/policy.md +++ b/docs/modules/policy.md @@ -33,7 +33,7 @@ models like gpt-5.x and o-series require the latter). ```python from moralstack.models.policy import OpenAIPolicy, OpenAIPolicyConfig -# From environment variables (OPENAI_API_KEY, OPENAI_MODEL, ...) +# From environment variables (OPENAI_API_KEY, OPENAI_MODEL, optional MORALSTACK_POLICY_REWRITE_MODEL, ...) policy = OpenAIPolicy() # Or with explicit overrides @@ -65,6 +65,32 @@ print(result.text) Revises a response based on feedback. +When the configured model supports it (gpt-4o, gpt-4o-mini, gpt-4.1 family), `rewrite()` automatically leverages +**OpenAI Predicted Outputs** (speculative decoding): the existing draft is provided as a prediction hint so that +unchanged portions of the text are generated significantly faster. This is transparent to the caller and does not +alter the output quality. + +**Rewrite model**: `OpenAIPolicy` may use a separate model for `rewrite()` only via `MORALSTACK_POLICY_REWRITE_MODEL`. +If unset or empty, `rewrite()` uses the same model as `generate()` (`OPENAI_MODEL`). 
This keeps the first-pass +draft on the primary model while allowing a lighter model for deliberative revisions (see `docs/architecture_spec.md`). + +The rewrite prompt also includes explicit constraints that prevent lighter models from adding +new operational content not present in the original draft. This ensures that `gpt-4.1-nano` +rewrites maintain quality comparable to `gpt-4o` rewrites on the benchmark (9.39 avg score, +0 leakage). + +--- + +## Environment variables + +| Variable | Purpose | +|----------|---------| +| `OPENAI_API_KEY` | Required API key | +| `OPENAI_MODEL` | Primary model for `generate()` and `refuse()` | +| `MORALSTACK_POLICY_REWRITE_MODEL` | Optional; model for `rewrite()` (deliberative cycle 2+). Defaults to `OPENAI_MODEL` when unset. `.env.template` sets `gpt-4.1-nano` when you copy that file. | + +See also `.env.template` and `INSTALL.md`. + ```python result = policy.rewrite( prompt="Original user request", @@ -106,6 +132,20 @@ class GenerationResult: finish_reason: str # Termination reason ("stop", "length", etc.) ``` +### GenerationConfig + +```python +@dataclass +class GenerationConfig: + max_tokens: int = 2048 + temperature: float = 0.7 + top_p: float = 0.9 + stop_sequences: list[str] = [] + response_format: Any = None # OpenAI response format constraint +``` + +The optional `response_format` field maps to OpenAI's response format (e.g. `{"type": "json_object"}`). Structured evaluation modules (Critic, Simulator, Hindsight, Perspectives) set it so the API returns guaranteed valid JSON. + --- ## Output Sanitization @@ -174,6 +214,8 @@ class PolicyLLMProtocol(Protocol): ... ``` +The `config` argument is typically a `GenerationConfig` (see [Output Structure](#output-structure)); it may include `response_format` for OpenAI structured outputs. 
+ --- ## Usage with OpenAI diff --git a/docs/modules/simulator.md b/docs/modules/simulator.md index 7c9184b..5a258df 100644 --- a/docs/modules/simulator.md +++ b/docs/modules/simulator.md @@ -283,10 +283,15 @@ configuration is the single source of configuration — no CLI or code path over benchmark create a dedicated `OpenAIPolicy` with this model for the simulator; the rest of the stack keeps using `OPENAI_MODEL`. - **Effect**: - - **Set to a model id** (e.g. `gpt-4o-mini`): The simulator uses that model. Lets you use a smaller/cheaper model + - **Set to a model id** (e.g. `gpt-4o-mini`, `gpt-4.1-nano` as in `.env.template` / `.env.minimal`): The simulator uses that model. Lets you use a smaller/cheaper model for simulation and a larger one for generation. - **Unset or empty**: The simulator uses the same policy (and model) as the rest of the pipeline. +In the recommended configuration (`.env.template`), the simulator uses `gpt-4.1-nano`. +Benchmark testing shows this reduces average deliberative latency by ~27% compared to +`gpt-4o-mini` on the simulator, with no compliance degradation (98.8% maintained) and +minimal quality impact (avg score 9.39 vs 9.36 with `gpt-4o` across all modules). + ### LLM and retry behaviour #### MORALSTACK_SIMULATOR_MAX_RETRIES @@ -295,6 +300,8 @@ configuration is the single source of configuration — no CLI or code path over - **Type**: int (>= 1) - **Description**: Number of parse attempts for the simulator JSON response before raising an error. +Simulator generation uses OpenAI's `json_object` response format (`response_format={"type": "json_object"}` on `GenerationConfig`), which guarantees valid JSON and greatly reduces retries caused by malformed JSON. 
+ #### MORALSTACK_SIMULATOR_MAX_TOKENS - **Default**: `384` diff --git a/moralstack/cli/loader.py b/moralstack/cli/loader.py index 55fc03e..71a78f7 100644 --- a/moralstack/cli/loader.py +++ b/moralstack/cli/loader.py @@ -163,8 +163,11 @@ def _load_real_modules(self) -> dict[str, Any]: model=self.config.openai_model, ) modules["policy"] = openai_policy - self.load_status["policy"] = f"✓ OpenAI ({self.config.openai_model})" - print_colored(f" ✓ policy: OpenAI ({self.config.openai_model})", "green") + policy_display = self.config.openai_model + if openai_policy.rewrite_model != openai_policy.model: + policy_display = f"{self.config.openai_model} (rewrite: {openai_policy.rewrite_model})" + self.load_status["policy"] = f"✓ OpenAI ({policy_display})" + print_colored(f" ✓ policy: OpenAI ({policy_display})", "green") # Constitution Store (requires openai_api_key) from moralstack.constitution.openai_config import OpenAIClientConfig diff --git a/moralstack/cli/report.py b/moralstack/cli/report.py index 59718dc..299c56b 100644 --- a/moralstack/cli/report.py +++ b/moralstack/cli/report.py @@ -17,7 +17,16 @@ def __init__(self, verbose: bool = False): self.calls: list[dict] = [] self.call_counter = 0 - def log_call(self, module: str, action: str, prompt: str, response: str = "", duration_ms: float = 0.0): + def log_call( + self, + module: str, + action: str, + prompt: str, + response: str = "", + duration_ms: float = 0.0, + *, + model: str | None = None, + ): """Logs an LLM call.""" self.call_counter += 1 @@ -32,6 +41,7 @@ def log_call(self, module: str, action: str, prompt: str, response: str = "", du "prompt": prompt[:prompt_limit] + "..." if len(prompt) > prompt_limit else prompt, "response": (response[:response_limit] + "..." 
if len(response) > response_limit else response), "duration_ms": duration_ms, + "model": model, # Always save complete data for revision history "full_prompt": prompt, "full_response": response, @@ -52,6 +62,9 @@ def _print_call(self, call: dict): print_colored(f"📞 LLM CALL #{call['id']}: {call['module']} → {call['action']}", header_color) print_colored(f"{'=' * 80}", border_color) + if call.get("model"): + print_colored(f"🤖 Model: {call['model']}", "cyan") + if call["prompt"]: print_colored("\n📤 PROMPT:", "blue") # Show full prompt for important modules @@ -422,6 +435,10 @@ def _detailed_phases(self, trace: DeliberationTrace, call_logger: CallLogger) -> lines.append(f"| **Duration** | `{phase.duration_ms:.0f}ms` |") lines.append(f"| **Success** | {status_icon} |") + policy_call = self._get_policy_call_dict_for_phase(phase, cycle_num, calls_by_module) + if policy_call and policy_call.get("model"): + lines.append(f"| **Model** | `{policy_call['model']}` |") + if phase.decision: lines.append(f"| **Decision** | `{phase.decision}` |") if phase.decision_reason: @@ -512,6 +529,27 @@ def _build_calls_map(self, call_logger: CallLogger) -> dict: return calls_map + def _get_policy_call_dict_for_phase(self, phase: PhaseResult, cycle: int, calls_map: dict) -> dict[str, Any] | None: + """Returns the CallLogger entry for policy generation or revision, if any.""" + if phase.phase not in (PhaseType.GENERATION, PhaseType.REVISION): + return None + if "policy" not in calls_map: + return None + module_calls = calls_map["policy"] + if phase.phase == PhaseType.GENERATION: + for call in module_calls: + if "generate" in call.get("action", "").lower(): + return call + return module_calls[0] if module_calls else None + if phase.phase == PhaseType.REVISION: + rewrite_calls = [c for c in module_calls if "rewrite" in c.get("action", "").lower()] + if cycle >= 2 and rewrite_calls: + idx = cycle - 2 + if 0 <= idx < len(rewrite_calls): + return rewrite_calls[idx] + return None + return None + 
def _get_full_data_for_phase(self, phase: PhaseResult, cycle: int, calls_map: dict) -> tuple: """ Retrieves COMPLETE (non-truncated) data for a phase from CallLogger. diff --git a/moralstack/models/base.py b/moralstack/models/base.py index f4d52af..55536c3 100644 --- a/moralstack/models/base.py +++ b/moralstack/models/base.py @@ -5,7 +5,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import List, Literal, Optional +from typing import Any, List, Literal, Optional @dataclass(frozen=True) @@ -19,6 +19,7 @@ class GenerationConfig: temperature: float = 0.7 top_p: float = 0.9 stop_sequences: List[str] = field(default_factory=list) + response_format: Any = None @dataclass(frozen=True) diff --git a/moralstack/models/policy.py b/moralstack/models/policy.py index 224c210..09163e1 100644 --- a/moralstack/models/policy.py +++ b/moralstack/models/policy.py @@ -1,24 +1,34 @@ """ -Policy LLM per MoralStack - OpenAI only. +Policy LLM for MoralStack — OpenAI only. -Unico provider: OpenAI API. Configurazione centralizzata via variabili d'ambiente: -- OPENAI_API_KEY (obbligatoria) +Single provider: OpenAI API. 
Configuration via environment variables: +- OPENAI_API_KEY (required) - OPENAI_MODEL (default: gpt-4o) -- OPENAI_BASE_URL (opzionale, per proxy/enterprise) -- OPENAI_TIMEOUT_MS (opzionale) -- OPENAI_MAX_RETRIES (opzionale) -- OPENAI_TEMPERATURE (opzionale, default 0.7) +- MORALSTACK_POLICY_REWRITE_MODEL (optional; deliberative rewrite at cycle 2+; defaults to OPENAI_MODEL) +- OPENAI_BASE_URL (optional, for proxy/enterprise) +- OPENAI_TIMEOUT_MS (optional) +- OPENAI_MAX_RETRIES (optional) +- OPENAI_TEMPERATURE (optional, default 0.7) """ from __future__ import annotations +import logging import os from dataclasses import dataclass from typing import Any from moralstack.models.base import GenerationConfig, GenerationResult -from moralstack.utils.openai_params import completion_tokens_param -from moralstack.utils.provider_errors import classify_provider_error, sleep_with_backoff +from moralstack.utils.openai_params import ( + completion_tokens_param, + supports_predicted_output, +) +from moralstack.utils.provider_errors import ( + classify_provider_error, + sleep_with_backoff, +) + +logger = logging.getLogger(__name__) def _get_openai_config( @@ -29,8 +39,8 @@ def _get_openai_config( api_key = (api_key_override or os.getenv("OPENAI_API_KEY") or "").strip() if not api_key: raise ValueError( - "OPENAI_API_KEY non impostata. Imposta la variabile d'ambiente OPENAI_API_KEY " - "o usa --openai-key da riga di comando. Esempio: export OPENAI_API_KEY=sk-..." + "OPENAI_API_KEY is not set. Set the environment variable OPENAI_API_KEY " + "or pass --openai-key from the command line. Example: export OPENAI_API_KEY=sk-..." 
) base_url = os.getenv("OPENAI_BASE_URL") or None timeout_ms_env = os.getenv("OPENAI_TIMEOUT_MS") @@ -39,6 +49,8 @@ def _get_openai_config( model = model_override or os.getenv("OPENAI_MODEL", "gpt-4o") temperature = float(os.getenv("OPENAI_TEMPERATURE", "0.7")) top_p = float(os.getenv("OPENAI_TOP_P", "0.9")) + rewrite_raw = os.getenv("MORALSTACK_POLICY_REWRITE_MODEL") + rewrite_model = rewrite_raw.strip() if rewrite_raw and rewrite_raw.strip() else None return OpenAIPolicyConfig( api_key=api_key, base_url=base_url, @@ -47,6 +59,7 @@ def _get_openai_config( model=model, temperature=temperature, top_p=top_p, + rewrite_model=rewrite_model, ) @@ -64,6 +77,7 @@ class OpenAIPolicyConfig: max_retries: int | None = None temperature: float | None = None top_p: float | None = None + rewrite_model: str | None = None class OpenAIPolicy: @@ -104,6 +118,7 @@ def __init__( final_max_retries = config.max_retries if config and config.max_retries is not None else env_cfg.max_retries final_temperature = config.temperature if config and config.temperature is not None else env_cfg.temperature final_top_p = config.top_p if config and config.top_p is not None else env_cfg.top_p + final_rewrite_model = config.rewrite_model if config and config.rewrite_model is not None else env_cfg.rewrite_model # Override espliciti da parametri __init__ (sovrascrivono tutto) if api_key is not None: @@ -119,6 +134,7 @@ def __init__( self.api_key = final_api_key self.model = final_model + self._rewrite_model = final_rewrite_model if final_rewrite_model is not None else final_model self._timeout = (final_timeout_ms / 1000.0) if final_timeout_ms is not None else 60.0 self._max_retries = final_max_retries if final_max_retries is not None else 3 self._default_temperature = final_temperature if final_temperature is not None else 0.7 @@ -127,7 +143,7 @@ def __init__( try: import openai except ImportError: - raise ImportError("Il client OpenAI è richiesto. 
Installa con: pip install openai") + raise ImportError("The OpenAI client is required. Install with: pip install openai") kwargs: dict[str, Any] = {"api_key": self.api_key} if final_base_url: @@ -135,6 +151,11 @@ def __init__( self.client = openai.OpenAI(**kwargs) self._cost_tracker: Any = None + @property + def rewrite_model(self) -> str: + """Effective model for `rewrite()` (may differ from `model` when env is set).""" + return self._rewrite_model + def set_cost_tracker(self, tracker: Any) -> None: """Imposta un TokenCostTracker per tracciare i costi delle chiamate.""" self._cost_tracker = tracker @@ -146,26 +167,42 @@ def _complete( max_tokens: int = 1024, top_p: float | None = None, response_format: Any = None, + prediction: dict[str, str] | None = None, + model_override: str | None = None, ) -> tuple[str, int, str]: """ - Chiamata completions con retry su errori transient (429/503/timeout). - Ritorna (text, tokens_used, finish_reason). Usa classifier e backoff con jitter. + Completions call with retry on transient errors (429/503/timeout). + Returns (text, tokens_used, finish_reason). Uses classifier and jittered backoff. + + Args: + prediction: Optional predicted output for speculative decoding. + Expected format: ``{"type": "content", "content": "..."}`` + Only applied when the current model supports the feature. + model_override: If set, used instead of ``self.model`` for this call. 
""" + effective_model = model_override or self.model temp = temperature if temperature is not None else self._default_temperature top_p_val = top_p if top_p is not None else self._default_top_p + use_prediction = prediction is not None and supports_predicted_output(effective_model or "") + # prediction and response_format are mutually exclusive in + # the OpenAI API; prefer response_format (structural guarantee) + if use_prediction and response_format is not None: + use_prediction = False last_error: Exception | None = None for attempt in range(self._max_retries): try: kwargs: dict[str, Any] = { - "model": self.model, + "model": effective_model, "messages": messages, "temperature": temp, "top_p": top_p_val, "timeout": self._timeout, } - kwargs.update(completion_tokens_param(self.model, max_tokens)) + kwargs.update(completion_tokens_param(effective_model, max_tokens)) if response_format is not None: kwargs["response_format"] = response_format + if use_prediction: + kwargs["prediction"] = prediction response = self.client.chat.completions.create(**kwargs) choice = response.choices[0] text = (choice.message.content or "").strip() @@ -177,7 +214,7 @@ def _complete( prompt_tokens = int(tokens * 0.7) if tokens else 0 completion_tokens = tokens - prompt_tokens if tokens else 0 if self._cost_tracker is not None and hasattr(self._cost_tracker, "add_call"): - self._cost_tracker.add_call(self.model, prompt_tokens, completion_tokens) + self._cost_tracker.add_call(effective_model, prompt_tokens, completion_tokens) reason = choice.finish_reason or "stop" return text, tokens, reason except Exception as e: @@ -198,8 +235,18 @@ def generate( prompt: str, system: str = "", config: GenerationConfig | None = None, + prediction: dict[str, str] | None = None, + model_override: str | None = None, ) -> GenerationResult: - """Genera risposta al prompt.""" + """Generate a response for the prompt. + + Args: + prediction: Optional predicted output for speculative decoding. 
+                When provided and the model supports it, the API uses
+                speculative decoding to produce faster responses when the
+                output is expected to be similar to the prediction text.
+            model_override: If set, used instead of the primary policy model for this call.
+        """
         messages: list[dict[str, str]] = []
         if system:
             messages.append({"role": "system", "content": system})
@@ -208,13 +255,21 @@
         max_tokens = 1024
         temperature = self._default_temperature
         top_p = self._default_top_p
+        response_format = None
         if config is not None:
             max_tokens = getattr(config, "max_tokens", max_tokens)
             temperature = getattr(config, "temperature", temperature)
             top_p = getattr(config, "top_p", top_p)
+            response_format = getattr(config, "response_format", None)

         text, tokens_used, finish_reason = self._complete(
-            messages, temperature=temperature, max_tokens=max_tokens, top_p=top_p
+            messages,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            top_p=top_p,
+            prediction=prediction,
+            response_format=response_format,
+            model_override=model_override,
         )
         return GenerationResult(
             text=text,
@@ -233,26 +288,59 @@ def rewrite(
         system: str = "",
         config: GenerationConfig | None = None,
     ) -> GenerationResult:
-        """Riscrive una bozza in base al feedback."""
+        """Rewrite a draft based on feedback.
+
+        Uses OpenAI predicted outputs (speculative decoding) when the model
+        supports it: the existing draft is provided as a prediction hint so
+        that unchanged portions are generated significantly faster.
+        """
         rewrite_prompt = (
             f"ORIGINAL REQUEST:\n{prompt}\n\n"
             f"CURRENT DRAFT:\n{draft}\n\n"
             f"REVISION FEEDBACK:\n{guidance}\n\n"
             "Revise the response incorporating the feedback above. "
-            "IMPORTANT: Maintain or improve the depth, structure, and multiple perspectives. "
+            "IMPORTANT: Maintain or improve the depth, structure, "
+            "and multiple perspectives. "
             "Use numbered lists and clear sections when appropriate. "
             "Keep the response comprehensive and well-reasoned. "
             "Output ONLY the revised response, no additional commentary."
         )
         rewrite_system = system or (
-            "You are an assistant that revises responses based on feedback. "
-            "When revising, MAINTAIN or IMPROVE the depth and structure of the response. "
-            "Continue to present multiple perspectives and balanced analysis. "
+            "You are an assistant that revises responses based on "
+            "feedback. "
+            "When revising, MAINTAIN or IMPROVE the depth and "
+            "structure of the response. "
+            "Continue to present multiple perspectives and balanced "
+            "analysis. "
             "Use numbered lists and clear organization. "
-            "Incorporate the feedback while keeping the response comprehensive and well-reasoned. "
+            "Incorporate the feedback while keeping the response "
+            "comprehensive and well-reasoned. "
             "Respond in the SAME LANGUAGE as the original user request."
         )
-        return self.generate(rewrite_prompt, system=rewrite_system, config=config)
+
+        # Append constraints regardless of source; the leading blank line
+        # separates them from the preceding system-prompt sentence.
+        rewrite_system += (
+            "\n\nREWRITE CONSTRAINTS:\n"
+            "- Do NOT add new examples, scenarios, or operational details "
+            "not present in the original draft.\n"
+            "- Focus on restructuring, deepening, and reframing the EXISTING "
+            "content based on the feedback.\n"
+            "- When feedback says to focus on narrative or conceptual aspects, "
+            "REMOVE operational specifics rather than adding new ones.\n"
+        )
+        draft_prediction = {"type": "content", "content": draft} if draft else None
+        logger.info(
+            "policy_rewrite using model=%s (primary=%s)",
+            self._rewrite_model,
+            self.model,
+        )
+        return self.generate(
+            rewrite_prompt,
+            system=rewrite_system,
+            config=config,
+            prediction=draft_prediction,
+            model_override=self._rewrite_model,
+        )

     def refuse(
         self,
@@ -298,7 +386,7 @@ def refuse(
             "Provide deep reasoning and helpful, specific alternatives. "
         )
         if language and str(language).strip():
-            refuse_system += f"CRITICAL: You MUST respond entirely in {language.strip()}. " "Do not add translations."
+ refuse_system += f"CRITICAL: You MUST respond entirely in {language.strip()}. Do not add translations." else: refuse_system += "Always respond in the SAME LANGUAGE as the user's request." return self.generate(refuse_prompt, system=refuse_system, config=config) diff --git a/moralstack/models/risk/estimator.py b/moralstack/models/risk/estimator.py index 2544313..d932ea0 100644 --- a/moralstack/models/risk/estimator.py +++ b/moralstack/models/risk/estimator.py @@ -229,12 +229,15 @@ def _persist_risk_llm_call( attempts: int, ) -> None: """Persist LLM call for risk estimation. Logs debug on ImportError.""" + risk_model = getattr(self.policy, "model", None) if self.policy else None + risk_model_str = str(risk_model) if risk_model is not None else None try: persist_llm_call( cycle=0, phase="risk_estimation", module="risk_estimator", action="estimate", + model=risk_model_str, started_at=started_at, duration_ms=duration_ms, prompt=prompt, @@ -411,12 +414,15 @@ def _persist_mini_llm_call( """Persist a single mini-estimator LLM call. 
Logs debug on ImportError.""" import json as _json + risk_model = getattr(self.policy, "model", None) if self.policy else None + risk_model_str = str(risk_model) if risk_model is not None else None try: persist_llm_call( cycle=0, phase="risk_estimation", module="risk_estimator", action=action, + model=risk_model_str, started_at=started_at, duration_ms=duration_ms, prompt=prompt, diff --git a/moralstack/orchestration/config_loader.py b/moralstack/orchestration/config_loader.py index 1c80cf4..d6f90ff 100644 --- a/moralstack/orchestration/config_loader.py +++ b/moralstack/orchestration/config_loader.py @@ -37,6 +37,8 @@ ENV_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD = "MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD" ENV_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD = "MORALSTACK_ORCHESTRATOR_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD" ENV_BORDERLINE_REFUSE_UPPER = "MORALSTACK_ORCHESTRATOR_BORDERLINE_REFUSE_UPPER" +ENV_PARALLEL_CRITIC_WITH_MODULES = "MORALSTACK_ORCHESTRATOR_PARALLEL_CRITIC_WITH_MODULES" +ENV_ENABLE_SPECULATIVE_GENERATION = "MORALSTACK_ORCHESTRATOR_ENABLE_SPECULATIVE_GENERATION" def load_orchestrator_config_from_env(): # -> OrchestratorConfig (imported inside) @@ -75,6 +77,8 @@ def load_orchestrator_config_from_env(): # -> OrchestratorConfig (imported insi simulator_gate_semantic_harm_threshold = get_env_float(ENV_SIMULATOR_GATE_SEMANTIC_HARM_THRESHOLD, 0.4, 0.0, 1.0) simulator_gate_delta_chars_threshold = get_env_int(ENV_SIMULATOR_GATE_DELTA_CHARS_THRESHOLD, 100, 0) borderline_refuse_upper = get_env_float(ENV_BORDERLINE_REFUSE_UPPER, 0.95, 0.0, 1.0) + parallel_critic_with_modules = get_env_bool(ENV_PARALLEL_CRITIC_WITH_MODULES, True) + enable_speculative_generation = get_env_bool(ENV_ENABLE_SPECULATIVE_GENERATION, True) return OrchestratorConfig( max_deliberation_cycles=max_deliberation_cycles, @@ -97,4 +101,6 @@ def load_orchestrator_config_from_env(): # -> OrchestratorConfig (imported insi 
simulator_gate_semantic_harm_threshold=simulator_gate_semantic_harm_threshold, simulator_gate_delta_chars_threshold=simulator_gate_delta_chars_threshold, borderline_refuse_upper=borderline_refuse_upper, + parallel_critic_with_modules=parallel_critic_with_modules, + enable_speculative_generation=enable_speculative_generation, ) diff --git a/moralstack/orchestration/controller.py b/moralstack/orchestration/controller.py index dbf0ff0..beca1b0 100644 --- a/moralstack/orchestration/controller.py +++ b/moralstack/orchestration/controller.py @@ -36,6 +36,7 @@ orch_debug_log, ) from moralstack.orchestration.domain_exclusion import generate_domain_exclusion_response +from moralstack.orchestration.language_resolver import resolve_prompt_with_language from moralstack.orchestration.overlay_policy import ( OVERLAY_SENSITIVE_RISK_FLOOR, apply_risk_floor_if_sensitive, @@ -206,6 +207,96 @@ def _estimate_risk(self, request: ProcessedRequest) -> RiskEstimation: except Exception as e: raise RiskEstimationError(f"Risk estimation failed: {e}") + def _speculative_generate( + self, + request: ProcessedRequest, + ) -> str | None: + """Generate a speculative draft in parallel with risk estimation. + + Uses the base system prompt and fallback language detection (the risk + estimator's ``detected_language`` is not available yet). Returns the + output-protected draft text, or ``None`` on any failure. 
+ """ + if self.policy is None: + return None + try: + prompt_text = resolve_prompt_with_language( + request.prompt, + "", + request.prompt, + ) + start = time.time() + try: + result = self.policy.generate( + prompt=prompt_text, + system=self._protected_system_prompt, + ) + except TypeError: + result = self.policy.generate(prompt_text) + elapsed = (time.time() - start) * 1000 + response_text = getattr(result, "text", None) or str(result) + protection = self._output_protector.validate(response_text) + prompt_used = getattr(result, "prompt_used", None) or prompt_text + system_used = getattr(result, "system_used", None) or self._protected_system_prompt + policy_model = getattr(self.policy, "model", None) + policy_model_str = str(policy_model) if policy_model is not None else None + record_llm_call( + self.logger, + None, + { + "cycle": 0, + "phase": "speculative_generate", + "module": "policy", + "action": "generate (speculative)", + "model": policy_model_str, + "started_at": int(start * 1000), + "duration_ms": elapsed, + "prompt": prompt_used, + "system_prompt": system_used or "", + "raw_response": response_text, + "sequence_in_cycle": 0, + }, + ) + return protection.cleaned + except Exception as e: + _LOG.warning( + "Speculative generation failed, will regenerate: %s", + e, + ) + return None + + def _run_speculative_overlap( + self, + request: ProcessedRequest, + ) -> tuple[str | None, RiskEstimation]: + """Run risk estimation and speculative draft generation in parallel. + + Returns ``(speculative_draft_or_None, risk_estimation)``. + If risk estimation raises, the exception propagates normally. + + Uses ``contextvars.copy_context()`` so that persistence context + (run_id, request_id, cycle) is available inside the worker threads. 
+ """ + import contextvars + from concurrent.futures import ThreadPoolExecutor + + ctx_risk = contextvars.copy_context() + ctx_spec = contextvars.copy_context() + + executor = ThreadPoolExecutor(max_workers=2) + try: + risk_fut = executor.submit(ctx_risk.run, self._estimate_risk, request) + spec_fut = executor.submit( + ctx_spec.run, + self._speculative_generate, + request, + ) + risk_estimation = risk_fut.result() + speculative_draft = spec_fut.result() + return speculative_draft, risk_estimation + finally: + executor.shutdown(wait=False) + def _handle_timeout(self, request: ProcessedRequest, error_msg: str, start_time: float) -> OrchestratorResult: processing_time = int((time.time() - start_time) * 1000) response = FinalResponse( @@ -447,6 +538,7 @@ def _route_benign( risk_estimation: RiskEstimationProtocol, start_time: float, trace: Trace, + speculative_draft: str | None = None, ) -> OrchestratorResult: request_id = request.request_id orch_debug_log( @@ -462,6 +554,7 @@ def _route_benign( start_time, decision=decision, decision_explanation=explanation, + speculative_draft=speculative_draft, ) fill_trace_from_result(trace, result) result.trace = trace @@ -515,6 +608,7 @@ def _route_fast_path( risk_estimation: RiskEstimationProtocol, start_time: float, trace: Trace, + speculative_draft: str | None = None, ) -> OrchestratorResult: request_id = request.request_id orch_debug_log( @@ -532,6 +626,7 @@ def _route_fast_path( decision=decision, constitution=constitution, decision_explanation=explanation, + speculative_draft=speculative_draft, ) result.path_taken = "fast" fill_trace_from_result(trace, result) @@ -675,6 +770,7 @@ def _route_deliberative( start_time: float, trace: Trace, pre_decision: Decision | None = None, + speculative_draft: str | None = None, ) -> OrchestratorResult: request_id = request.request_id orch_debug_log( @@ -691,6 +787,7 @@ def _route_deliberative( start_time, constrained_generation=constrained_generation, constitution=constitution, + 
speculative_draft=speculative_draft, ) modules_called: set[str] = set() if state.critiques: @@ -910,7 +1007,11 @@ def process(self, request: ProcessedRequest | str) -> OrchestratorResult: trace = self._trace_lifecycle.start_trace(request_id) try: - risk_estimation = self._estimate_risk(request) + speculative_draft: str | None = None + if self.config.enable_speculative_generation and self.policy is not None: + speculative_draft, risk_estimation = self._run_speculative_overlap(request) + else: + risk_estimation = self._estimate_risk(request) risk_score = risk_estimation.score if hasattr(risk_estimation, "score") else 0.5 risk_category = getattr(risk_estimation, "risk_category", None) constrained_generation = risk_category == RiskCategory.CLEARLY_HARMFUL @@ -1037,6 +1138,7 @@ def process(self, request: ProcessedRequest | str) -> OrchestratorResult: risk_proto, start_time, trace, + speculative_draft=speculative_draft, ) if route == "safe_complete": return self._route_safe_complete( @@ -1070,6 +1172,7 @@ def process(self, request: ProcessedRequest | str) -> OrchestratorResult: risk_proto, start_time, trace, + speculative_draft=speculative_draft, ) return self._route_deliberative( @@ -1081,6 +1184,7 @@ def process(self, request: ProcessedRequest | str) -> OrchestratorResult: start_time, trace, pre_decision=decision, + speculative_draft=speculative_draft, ) except OrchestratorTimeoutError as e: diff --git a/moralstack/orchestration/deliberation_runner.py b/moralstack/orchestration/deliberation_runner.py index ea39fd6..d8bebe3 100644 --- a/moralstack/orchestration/deliberation_runner.py +++ b/moralstack/orchestration/deliberation_runner.py @@ -56,6 +56,36 @@ _LOG = logging.getLogger(__name__) + +def _policy_llm_model_for_action(policy: Any, action: str) -> str | None: + """Effective OpenAI model name for policy generate vs rewrite (rewrite may use MORALSTACK_POLICY_REWRITE_MODEL).""" + if policy is None: + return None + if action == "rewrite": + rw = getattr(policy, 
"rewrite_model", None) + if rw is not None: + return str(rw) + m = getattr(policy, "model", None) + return str(m) if m is not None else None + + +def _module_model(module: Any) -> str | None: + """Return the OpenAI model name used by a cognitive module (critic, simulator, …). + + Each module stores its inner ``OpenAIPolicy`` as ``self.policy``; fall back + to ``module.model`` if present. + """ + if module is None: + return None + inner = getattr(module, "policy", None) + if inner is not None: + m = getattr(inner, "model", None) + if m is not None: + return str(m) + m = getattr(module, "model", None) + return str(m) if m is not None else None + + # Logical order within a deliberation cycle for journey/report display (sequence_in_cycle). SEQ_POLICY = 1 SEQ_CRITIC = 2 @@ -183,6 +213,7 @@ def run_benign_fast_path( *, decision: Decision, decision_explanation: DecisionExplanation | None = None, + speculative_draft: str | None = None, ) -> OrchestratorResult: """FAST PATH per operational_risk == NONE. 
Nessun modulo deliberativo.""" from moralstack.orchestration.diagnostics import orch_debug_log @@ -196,38 +227,65 @@ def run_benign_fast_path( ) if self.policy is not None: try: - prompt_text = resolve_prompt_with_language( - request.prompt, - risk_estimation.detected_language or "", - request.prompt, - ) - start_gen = time.time() - try: - result = self.policy.generate(prompt=prompt_text, system=self._protected_system_prompt) - except TypeError: - result = self.policy.generate(prompt_text) - elapsed = (time.time() - start_gen) * 1000 - response_text = _policy_text(result) - prompt_used = _policy_prompt_used(result, prompt_text) - system_used = _policy_system_used(result, self._protected_system_prompt or "") - record_llm_call( - self.logger, - None, - { - "cycle": 0, - "phase": "policy_generate", - "module": "policy", - "action": "generate (benign_fast_path)", - "started_at": int(start_gen * 1000), - "duration_ms": elapsed, - "prompt": prompt_used, - "system_prompt": system_used or "", - "raw_response": response_text, - "sequence_in_cycle": SEQ_POLICY, - }, - ) - protection_result = self._output_protector.validate(response_text) - content = protection_result.cleaned + if speculative_draft: + content = speculative_draft + record_llm_call( + self.logger, + None, + { + "cycle": 0, + "phase": "policy_generate", + "module": "policy", + "action": "generate (speculative-reuse," " benign_fast_path)", + "model": _policy_llm_model_for_action(self.policy, "generate"), + "duration_ms": 0.0, + "prompt": request.prompt[:200], + "raw_response": content[:200], + "sequence_in_cycle": SEQ_POLICY, + }, + ) + else: + prompt_text = resolve_prompt_with_language( + request.prompt, + risk_estimation.detected_language or "", + request.prompt, + ) + start_gen = time.time() + try: + result = self.policy.generate( + prompt=prompt_text, + system=self._protected_system_prompt, + ) + except TypeError: + result = self.policy.generate(prompt_text) + elapsed = (time.time() - start_gen) * 1000 + 
response_text = _policy_text(result) + prompt_used = _policy_prompt_used( + result, + prompt_text, + ) + system_used = _policy_system_used( + result, + self._protected_system_prompt or "", + ) + record_llm_call( + self.logger, + None, + { + "cycle": 0, + "phase": "policy_generate", + "module": "policy", + "action": "generate (benign_fast_path)", + "started_at": int(start_gen * 1000), + "duration_ms": elapsed, + "prompt": prompt_used, + "system_prompt": system_used or "", + "raw_response": response_text, + "sequence_in_cycle": SEQ_POLICY, + }, + ) + protection_result = self._output_protector.validate(response_text) + content = protection_result.cleaned except Exception as e: raise GenerationError(f"Generation failed: {e}") else: @@ -371,6 +429,7 @@ def run_fast_path( decision: Decision, constitution: Any | None = None, decision_explanation: DecisionExplanation | None = None, + speculative_draft: str | None = None, ) -> OrchestratorResult: """Path veloce: genera draft + quick check costituzionale; se fallisce passa a deliberative.""" @@ -384,73 +443,116 @@ def run_fast_path( request_id=request.request_id or "", ) if constitution is None and self.constitution_store is not None: - constitution = get_constitution_safe(self.constitution_store, request.get_domain()) + constitution = get_constitution_safe( + self.constitution_store, + request.get_domain(), + ) state = DeliberationState(cycle=0) if self.policy is not None: try: - start_gen = time.time() - prompt_text = resolve_prompt_with_language( - request.prompt, - risk_estimation.detected_language or "", - request.prompt, - ) - try: - result = self.policy.generate(prompt=prompt_text, system=self._protected_system_prompt) - except TypeError: - result = self.policy.generate(prompt_text) - elapsed = (time.time() - start_gen) * 1000 - response_text = _policy_text(result) - protection_result = self._output_protector.validate(response_text) - if protection_result.had_leakage: + if speculative_draft: + state.draft_response = 
speculative_draft + reuse_model = _policy_llm_model_for_action(self.policy, "generate") record_llm_call( self.logger, { - "module": "output_protection", - "action": "leakage_detected (fast_path)", - "prompt": f"Type: {protection_result.leakage_type}", - "response": f"Cleaned from {len(response_text)} to {len(protection_result.cleaned)} chars", + "module": "policy", + "action": "generate (speculative-reuse," " fast_path)", + "prompt": request.prompt[:200], + "response": speculative_draft[:200], "duration_ms": 0.0, + "model": reuse_model, }, { "cycle": 0, - "phase": "output_protection", - "module": "output_protection", - "action": "leakage_detected (fast_path)", + "phase": "policy_generate", + "module": "policy", + "action": "generate (speculative-reuse," " fast_path)", + "model": reuse_model, "duration_ms": 0.0, - "raw_response": { - "leakage_type": protection_result.leakage_type, - "original_len": len(response_text), - "cleaned_len": len(protection_result.cleaned), - "had_leakage": True, + "prompt": request.prompt[:200], + "raw_response": speculative_draft[:200], + "sequence_in_cycle": SEQ_POLICY, + }, + ) + else: + start_gen = time.time() + prompt_text = resolve_prompt_with_language( + request.prompt, + risk_estimation.detected_language or "", + request.prompt, + ) + try: + result = self.policy.generate( + prompt=prompt_text, + system=self._protected_system_prompt, + ) + except TypeError: + result = self.policy.generate(prompt_text) + elapsed = (time.time() - start_gen) * 1000 + response_text = _policy_text(result) + protection_result = self._output_protector.validate(response_text) + if protection_result.had_leakage: + record_llm_call( + self.logger, + { + "module": "output_protection", + "action": "leakage_detected" " (fast_path)", + "prompt": "Type: " f"{protection_result.leakage_type}", + "response": "Cleaned from " + f"{len(response_text)} to " + f"{len(protection_result.cleaned)}" + " chars", + "duration_ms": 0.0, + }, + { + "cycle": 0, + "phase": 
"output_protection", + "module": "output_protection", + "action": "leakage_detected" " (fast_path)", + "duration_ms": 0.0, + "raw_response": { + "leakage_type": protection_result.leakage_type, + "original_len": len(response_text), + "cleaned_len": len( + protection_result.cleaned, + ), + "had_leakage": True, + }, + "sequence_in_cycle": SEQ_POLICY, }, + ) + state.draft_response = protection_result.cleaned + prompt_used = _policy_prompt_used( + result, + prompt_text, + ) + system_used = _policy_system_used( + result, + self._protected_system_prompt, + ) + record_llm_call( + self.logger, + { + "module": "policy", + "action": "generate (fast_path)", + "prompt": request.prompt, + "response": state.draft_response, + "duration_ms": elapsed, + }, + { + "cycle": 0, + "phase": "policy_generate", + "module": "policy", + "action": "generate (fast_path)", + "started_at": int(start_gen * 1000), + "duration_ms": elapsed, + "prompt": prompt_used, + "system_prompt": system_used or "", + "raw_response": response_text, "sequence_in_cycle": SEQ_POLICY, }, ) - state.draft_response = protection_result.cleaned - prompt_used = _policy_prompt_used(result, prompt_text) - system_used = _policy_system_used(result, self._protected_system_prompt) - record_llm_call( - self.logger, - { - "module": "policy", - "action": "generate (fast_path)", - "prompt": request.prompt, - "response": state.draft_response, - "duration_ms": elapsed, - }, - { - "cycle": 0, - "phase": "policy_generate", - "module": "policy", - "action": "generate (fast_path)", - "started_at": int(start_gen * 1000), - "duration_ms": elapsed, - "prompt": prompt_used, - "system_prompt": system_used or "", - "raw_response": response_text, - "sequence_in_cycle": SEQ_POLICY, - }, - ) except Exception as e: raise GenerationError(f"Generation failed: {e}") else: @@ -460,7 +562,11 @@ def run_fast_path( quick_result = self.critic.quick_check(request.prompt, state.draft_response, constitution) if not quick_result.passed: state_delib, 
risk_score, outcome = self.run_deliberative_path( - request, risk_estimation, start_time, constitution=constitution + request, + risk_estimation, + start_time, + constitution=constitution, + speculative_draft=state.draft_response, ) return self._build_deliberative_result( request, @@ -627,16 +733,26 @@ def run_deliberative_path( *, constrained_generation: bool = False, constitution: Any | None = None, + speculative_draft: str | None = None, ) -> tuple[DeliberationState, float, ConvergenceOutcome]: """ Esegue cicli deliberativi. Restituisce (state, risk_score, outcome) per assemblaggio. L'unica autorità sul loop è outcome post-enforcement: "continue" non sopravvive a cicli esauriti. + + Args: + speculative_draft: Pre-generated draft from parallel overlap with + risk estimation. When provided *and* constrained_generation is + False, the draft is used as the cycle-1 starting point, + skipping the initial generation call. """ from moralstack.orchestration.diagnostics import orch_debug_log if constitution is None and self.constitution_store is not None: - constitution = get_constitution_safe(self.constitution_store, request.get_domain()) + constitution = get_constitution_safe( + self.constitution_store, + request.get_domain(), + ) request_id = request.request_id or "" orch_debug_log( "orchestrator.py:_deliberative_path", @@ -647,6 +763,13 @@ def run_deliberative_path( ) self._current_start_time = start_time state = DeliberationState(cycle=0) + # Pre-set speculative draft for cycle 1 when safe to do so. + # constrained_generation uses a different system prompt so the + # speculative draft (generated with the base prompt) is not suitable. 
+ if speculative_draft and not constrained_generation: + state.draft_response = sanitize_policy_output( + speculative_draft, + ) risk_score = risk_estimation.score max_cycles = self._effective_max_cycles(risk_estimation) # Constrained generation (clearly_harmful): the policy is already instructed to @@ -1146,9 +1269,40 @@ def _run_critique_simulate_perspectives_parallel( import moralstack.prompts.perspectives_prompt # noqa: F401 import moralstack.prompts.simulator_prompt # noqa: F401 - # STAGE 1: critic always runs first — it is the gate for the entire cycle. - # If it returns violated_hard=True, simulator and perspectives cannot change - # the outcome and would be pure waste. Run critic sequentially and short-circuit. + if self.config.parallel_critic_with_modules: + return self._run_full_parallel_evaluation( + state, + request, + delib_context=delib_context, + context_mode=context_mode, + risk_estimation=risk_estimation, + max_cycles=max_cycles, + constitution=constitution, + ) + + return self._run_critic_gated_parallel( + state, + request, + delib_context=delib_context, + context_mode=context_mode, + risk_estimation=risk_estimation, + max_cycles=max_cycles, + constitution=constitution, + ) + + def _run_critic_gated_parallel( + self, + state: DeliberationState, + request: ProcessedRequest, + *, + delib_context: DelibContext | None = None, + context_mode: str = "full", + risk_estimation: RiskEstimationProtocol | None = None, + max_cycles: int = 2, + constitution: Any | None = None, + ) -> DeliberationState: + """Original two-stage approach: critic runs first as a gate, then + simulator + perspectives run in parallel only if no hard violation.""" state = self._critique( state, request, @@ -1157,31 +1311,45 @@ def _run_critique_simulate_perspectives_parallel( constitution=constitution, ) if state.has_critical_violations or getattr(state.last_critique, "violated_hard", False): - # Hard violation confirmed: skip simulator and perspectives entirely. 
- # _deliberation_cycle will read state.has_critical_violations and set - # state.decision = REFUSE / REVISE immediately after we return. return state - # STAGE 2: critic returned PROCEED/REVISE with no hard violations. - # Simulator and perspectives are now meaningful — run them in parallel. - # Note: n_errors_after_critic is taken AFTER _critique so that fork() - # inherits critic errors and the slice correctly captures only new errors - # added by simulator / perspectives. n_errors_after_critic = len(state.errors) state2 = state.fork() state3 = state.fork() - def do_simulate(s: DeliberationState, r: ProcessedRequest) -> DeliberationState: + def do_simulate( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: if not self.config.enable_simulation or self.simulator is None: return s - if not self._should_run_simulator(s, risk_estimation, delib_context, s.cycle, max_cycles): - return s # Carry forward (s already has simulations from fork) - return self._simulate(s, r, delib_context=delib_context, context_mode=context_mode) + if not self._should_run_simulator( + s, + risk_estimation, + delib_context, + s.cycle, + max_cycles, + ): + return s + return self._simulate( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + ) - def do_perspectives(s: DeliberationState, r: ProcessedRequest) -> DeliberationState: + def do_perspectives( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: if not self.config.enable_perspectives or self.perspectives is None: return s - return self._evaluate_perspectives(s, r, delib_context=delib_context, context_mode=context_mode) + return self._evaluate_perspectives( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + ) ctx2 = contextvars.copy_context() ctx3 = contextvars.copy_context() @@ -1195,6 +1363,116 @@ def do_perspectives(s: DeliberationState, r: ProcessedRequest) -> DeliberationSt state.errors = list(state.errors) + list(s2.errors[n_errors_after_critic:]) + 
list(s3.errors[n_errors_after_critic:]) return state + def _run_full_parallel_evaluation( + self, + state: DeliberationState, + request: ProcessedRequest, + *, + delib_context: DelibContext | None = None, + context_mode: str = "full", + risk_estimation: RiskEstimationProtocol | None = None, + max_cycles: int = 2, + constitution: Any | None = None, + ) -> DeliberationState: + """Full parallel: critic, simulator, and perspectives all run + concurrently. On hard violation the sim/persp results are discarded, + paying extra LLM calls but saving wall-clock time in the common case + (no hard violation). Decision quality is identical: the convergence + logic sees exactly the same module outputs.""" + n_errors_before = len(state.errors) + state_critic = state.fork() + state_sim = state.fork() + state_persp = state.fork() + + def do_critique( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: + return self._critique( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + constitution=constitution, + ) + + def do_simulate( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: + if not self.config.enable_simulation or self.simulator is None: + return s + if not self._should_run_simulator( + s, + risk_estimation, + delib_context, + s.cycle, + max_cycles, + ): + return s + return self._simulate( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + ) + + def do_perspectives( + s: DeliberationState, + r: ProcessedRequest, + ) -> DeliberationState: + if not self.config.enable_perspectives or self.perspectives is None: + return s + return self._evaluate_perspectives( + s, + r, + delib_context=delib_context, + context_mode=context_mode, + ) + + ctx_c = contextvars.copy_context() + ctx_s = contextvars.copy_context() + ctx_p = contextvars.copy_context() + executor = self._get_executor() + fut_c = executor.submit(ctx_c.run, do_critique, state_critic, request) + fut_s = executor.submit(ctx_s.run, do_simulate, 
state_sim, request) + fut_p = executor.submit(ctx_p.run, do_perspectives, state_persp, request) + + sc = fut_c.result() + ss = fut_s.result() + sp = fut_p.result() + + # Always merge critic results + state.critiques = sc.critiques + state.errors = list(state.errors) + list(sc.errors[n_errors_before:]) + + # Propagate critic signals into delib_context (matches sequential path) + if delib_context is not None and state.last_critique is not None: + critique = state.last_critique + delib_context.critic_decision = getattr(critique, "decision", "") or "" + delib_context.critic_violated_hard = bool(getattr(critique, "violated_hard", False)) + if getattr(critique, "violations", None): + delib_context.critic_violations_summary = "; ".join( + f"{v.principle_id}:{getattr(v, 'severity', 0)}" for v in critique.violations[:5] + ) + + hard_violation = state.has_critical_violations or getattr( + state.last_critique, + "violated_hard", + False, + ) + if hard_violation: + # Discard sim/persp results — critic authority prevails. + # Wall-clock time was not wasted (parallel execution). 
+ return state + + # No hard violation: merge sim + persp results + state.simulations = ss.simulations + state.perspectives = sp.perspectives + state._perspectives_aggregation = sp._perspectives_aggregation + state.errors = list(state.errors) + list(ss.errors[n_errors_before:]) + list(sp.errors[n_errors_before:]) + return state + def _apply_constitutional_perspective_override(self, state: DeliberationState) -> None: """Applica override costituzionale sulle prospettive quando il Critic rileva violazioni HARD.""" @@ -1239,6 +1517,7 @@ def _soft_revision_pass( state.soft_revision_guidance_used = guidance prompt_used = _policy_prompt_used(result, user_prompt_with_lang) system_used = _policy_system_used(result, self._protected_system_prompt) + soft_model = _policy_llm_model_for_action(self.policy, "rewrite") record_llm_call( self.logger, { @@ -1247,11 +1526,13 @@ def _soft_revision_pass( "prompt": f"Guidance: {guidance[:200]}", "response": state.draft_response[:200], "duration_ms": elapsed, + "model": soft_model, }, { "phase": "soft_revision", "module": "policy", "action": "soft_revision", + "model": soft_model, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": prompt_used, @@ -1276,7 +1557,33 @@ def _generate_or_revise( if self.policy is None: state.draft_response = f"[Mock response to: {request.prompt[:50]}...]" return state - # Use explicit language from Risk Estimator to reduce LLM non-compliance + # Speculative draft already present from parallel generation: + # skip redundant LLM call in cycle 1. 
+ if state.cycle == 1 and state.draft_response: + reuse_model = _policy_llm_model_for_action(self.policy, "generate") + record_llm_call( + self.logger, + { + "module": "policy", + "action": "generate (speculative-reuse)", + "prompt": request.prompt[:200], + "response": state.draft_response[:200], + "duration_ms": 0.0, + "model": reuse_model, + }, + { + "cycle": 1, + "phase": "policy_generate", + "module": "policy", + "action": "generate (speculative-reuse)", + "model": reuse_model, + "duration_ms": 0.0, + "prompt": request.prompt[:200], + "raw_response": state.draft_response[:200], + "sequence_in_cycle": SEQ_POLICY, + }, + ) + return state try: start = time.time() det_iso = risk_estimation.detected_language or "" @@ -1343,6 +1650,7 @@ def _generate_or_revise( state.draft_response = sanitize_policy_output(protection_result.cleaned) prompt_used = _policy_prompt_used(result, prompt_text) system_used = _policy_system_used(result, self._protected_system_prompt) + policy_model_label = _policy_llm_model_for_action(self.policy, action) record_llm_call( self.logger, { @@ -1351,11 +1659,13 @@ def _generate_or_revise( "prompt": prompt_text, "response": state.draft_response, "duration_ms": elapsed, + "model": policy_model_label, }, { "phase": "policy_generate" if action == "generate" else "policy_rewrite", "module": "policy", "action": action, + "model": policy_model_label, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": prompt_used, @@ -1429,6 +1739,7 @@ def _critique( nv = len(critique.violations) rg = (critique.revision_guidance[:100]) if critique.revision_guidance else "N/A" response_text = f"Violations: {nv}, Guidance: {rg}" + critic_model = _module_model(self.critic) record_llm_call( self.logger, { @@ -1437,11 +1748,13 @@ def _critique( "prompt": prompt_text, "response": response_text, "duration_ms": elapsed, + "model": critic_model, }, { "phase": "critic", "module": "critic", "action": "critique", + "model": critic_model, "started_at": int(start * 
1000), "duration_ms": elapsed, "prompt": getattr(critique, "prompt", None) or prompt_text, @@ -1528,6 +1841,7 @@ def _simulate( f"Expected valence: {ev:.2f}, Semantic harm: {sem_harm:.2f}, " f"Dominant harms: {dom_harms}, Worst harm: {worst}" ) + sim_model = _module_model(self.simulator) record_llm_call( self.logger, { @@ -1536,11 +1850,13 @@ def _simulate( "prompt": f"SIMULATION\nPrompt: {request.prompt}\nResponse: {state.draft_response}", "response": response_text, "duration_ms": elapsed, + "model": sim_model, }, { "phase": "simulator", "module": "simulator", "action": "simulate", + "model": sim_model, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": getattr(simulation, "prompt", ""), @@ -1633,6 +1949,7 @@ def _evaluate_hindsight( context_mode=context_mode, ) elapsed = (time.time() - start) * 1000 + hindsight_model = _module_model(self.hindsight) record_llm_call( self.logger, { @@ -1641,11 +1958,13 @@ def _evaluate_hindsight( "prompt": f"HINDSIGHT\nPrompt: {request.prompt}\nResponse: {state.draft_response}", "response": str(hindsight_result)[:200], "duration_ms": elapsed, + "model": hindsight_model, }, { "phase": "hindsight", "module": "hindsight", "action": "evaluate", + "model": hindsight_model, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": getattr(hindsight_result, "prompt", ""), @@ -1740,6 +2059,7 @@ def _evaluate_perspectives( raw_resp = "\n---\n".join(result.raw_responses or []) if getattr(result, "raw_responses", None) else "" prompts_list = getattr(result, "prompts", []) or [] system_list = getattr(result, "system_prompts", []) or [] + persp_model = _module_model(self.perspectives) record_llm_call( self.logger, { @@ -1748,11 +2068,13 @@ def _evaluate_perspectives( "prompt": f"PERSPECTIVES\nPrompt: {request.prompt}\nResponse: {state.draft_response}", "response": str(result)[:200], "duration_ms": elapsed, + "model": persp_model, }, { "phase": "perspectives", "module": "perspectives", "action": "evaluate", + "model": 
persp_model, "started_at": int(start * 1000), "duration_ms": elapsed, "prompt": "\n---\n".join(prompts_list) if prompts_list else "", diff --git a/moralstack/orchestration/types.py b/moralstack/orchestration/types.py index ff2aee8..47617de 100644 --- a/moralstack/orchestration/types.py +++ b/moralstack/orchestration/types.py @@ -500,6 +500,15 @@ class OrchestratorConfig: soft_revision_min_suggestions: int = 1 # Minimum total suggestions to trigger soft_revision_max_approval: float = 0.95 # Skip rewrite if weighted approval exceeds this early_exit_perspectives_threshold: float = 0.85 # Early exit se critic PROCEED + perspectives >= questo + # When True and parallel_module_calls is True, critic runs in parallel + # with simulator and perspectives instead of acting as a sequential gate. + # Hard violations are still honoured: sim/persp results are discarded when + # the critic finds a hard violation. Default True for latency savings. + parallel_critic_with_modules: bool = True + # When True, risk estimation and speculative draft generation run in + # parallel. The draft is used directly for benign/fast/deliberative + # routes and discarded on REFUSE. Zero impact on decision quality. + enable_speculative_generation: bool = True # ============================================================================= diff --git a/moralstack/persistence/db.py b/moralstack/persistence/db.py index d41aac5..34a2e66 100644 --- a/moralstack/persistence/db.py +++ b/moralstack/persistence/db.py @@ -750,6 +750,58 @@ def get_all_runs(limit: int = 100) -> list[dict[str, Any]]: return [] +def get_models_used_for_run(run_id: str) -> dict[str, str]: + """Returns a mapping of ``module/action`` → model actually used, from persisted llm_calls. + + Scans all llm_calls for the run and picks the first non-empty ``model`` value + per logical slot. Returned keys: ``policy_generate``, ``policy_rewrite``, + ``risk``, ``critic``, ``simulator``, ``hindsight``, ``perspectives``. 
+ Only slots with a non-empty model value appear in the result. + """ + path = get_db_path() + if not path: + return {} + try: + conn = _get_connection(path) + rows = conn.execute( + """ + SELECT module, action, model + FROM llm_calls + WHERE run_id = ? AND model IS NOT NULL AND model != '' + ORDER BY started_at + """, + (run_id,), + ).fetchall() + conn.close() + except Exception as e: + logger.warning("persistence: get_models_used_for_run failed: %s", e) + return {} + + result: dict[str, str] = {} + for r in rows: + module = (r["module"] or "").strip() + action = (r["action"] or "").strip().lower() + model = (r["model"] or "").strip() + if not model: + continue + if module == "policy": + if "rewrite" in action: + result.setdefault("policy_rewrite", model) + elif "generate" in action: + result.setdefault("policy_generate", model) + elif module == "risk_estimator": + result.setdefault("risk", model) + elif module == "critic": + result.setdefault("critic", model) + elif module == "simulator": + result.setdefault("simulator", model) + elif module == "hindsight": + result.setdefault("hindsight", model) + elif module == "perspectives": + result.setdefault("perspectives", model) + return result + + def delete_run(run_id: str) -> bool: """Deletes a run and all related data (CASCADE). Returns True on success.""" path = get_db_path() diff --git a/moralstack/prompts/_common.py b/moralstack/prompts/_common.py index 2f77f80..c485af1 100644 --- a/moralstack/prompts/_common.py +++ b/moralstack/prompts/_common.py @@ -12,6 +12,10 @@ # Costante standard per chiusura prompt JSON OUTPUT_JSON_ONLY = "Output ONLY valid JSON." +# Prompt-cache layout (FULL prompts): longest stable instruction/schema blocks MUST come first in the +# string sent to the provider; per-request fields (request, draft, risk) MUST come last so identical +# prefixes can hit automatic prompt caches (e.g. OpenAI prompt caching on repeated long prefixes). 
+ class ThinContextSections(TypedDict): """Sezioni contesto per thin mode (summary, key_points, risk_signals, change_log).""" diff --git a/moralstack/prompts/critic_prompt.py b/moralstack/prompts/critic_prompt.py index 847dc02..24ad3dd 100644 --- a/moralstack/prompts/critic_prompt.py +++ b/moralstack/prompts/critic_prompt.py @@ -52,11 +52,20 @@ {"decision":"PROCEED","violated_hard":false,"violations":[],"revision_guidance":""} """ -CRITIC_FULL_TEMPLATE = """Evaluate the response against these principles. Output valid JSON only. +# Static rules + JSON closure first (prompt-cache friendly); per-request fields last. +CRITIC_FULL_TEMPLATE = ( + """{rules} + +""" + + OUTPUT_JSON_ONLY + + """ + +TASK: Evaluate the response in TURN CONTEXT against the listed principles. Judge the RESPONSE content, not the REQUEST topic. PRINCIPLES: {principles} +TURN CONTEXT: RISK ASSESSMENT: {risk_signals} @@ -65,9 +74,8 @@ RESPONSE: {response} - -{rules} -""" + OUTPUT_JSON_ONLY +""" +) CRITIC_THIN_TEMPLATE = """Evaluate the response against these principles. Output valid JSON only. diff --git a/moralstack/prompts/hindsight_prompt.py b/moralstack/prompts/hindsight_prompt.py index d626027..7fd86d3 100644 --- a/moralstack/prompts/hindsight_prompt.py +++ b/moralstack/prompts/hindsight_prompt.py @@ -31,16 +31,41 @@ }} """ -HINDSIGHT_BATCH_FULL_TEMPLATE = """Evaluate this AI response considering multiple potential consequences. +# FULL: static evaluation/output schema first (prompt caching); consequences + turn context last. +HINDSIGHT_BATCH_FULL_TEMPLATE = ( + """For each consequence listed under POTENTIAL CONSEQUENCES below, + produce exactly one evaluation using the same scenario_id. 
+Output JSON: +{{ + "evaluations": [ + {{ + "scenario_id": "id of the consequence", + "safety": -1.0 to 1.0, + "helpfulness": -1.0 to 1.0, + "honesty": -1.0 to 1.0, + "harm_probability": 0.0 to 1.0, + "benefit_probability": 0.0 to 1.0, + "confidence": 0.0 to 1.0, + "rationale": "Brief explanation" + }} + ] +}} +""" + + OUTPUT_JSON_ONLY + + """ + +POTENTIAL CONSEQUENCES: +{consequences} + +TURN CONTEXT: REQUEST: {request} RESPONSE: {response} RISK CONTEXT: {risk_signals} - -{instructions} -""" + OUTPUT_JSON_ONLY +""" +) HINDSIGHT_BATCH_THIN_TEMPLATE = """Evaluate this AI response considering multiple potential consequences. @@ -82,10 +107,10 @@ def build_hindsight_prompt( if mode == "full": risk_signals = context.get_risk_signals_str() or "none" return HINDSIGHT_BATCH_FULL_TEMPLATE.format( + consequences=consequences_text, request=request, response=context.draft_text_full or "", risk_signals=risk_signals, - instructions=instructions, ) sections = build_thin_context_sections(context) diff --git a/moralstack/prompts/perspectives_prompt.py b/moralstack/prompts/perspectives_prompt.py index 81f3025..f3a8e36 100644 --- a/moralstack/prompts/perspectives_prompt.py +++ b/moralstack/prompts/perspectives_prompt.py @@ -47,24 +47,29 @@ Low risk. Evaluate helpfulness, accuracy, completeness normally. """ -# Full template: perspective block + REQUEST/RESPONSE + common (kept for build_perspectives_prompt) -PERSPECTIVE_FULL_TEMPLATE = """Evaluate this AI response from the perspective of: {perspective_name} +# Full template: static interpretation + JSON contract first (prompt caching); perspective + draft last. 
+PERSPECTIVE_FULL_TEMPLATE = RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS + """ + +Evaluate this AI response from the perspective of: {perspective_name} {perspective_instructions} +TURN CONTEXT: REQUEST: {request} RESPONSE: {response} RISK CONTEXT: {risk_signals} +""" -""" + RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS +# Thin template: same static-first layout as full; perspective + thin turn context last. +PERSPECTIVE_THIN_TEMPLATE = RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS + """ -# Thin template: perspective block + request + thin sections + common -PERSPECTIVE_THIN_TEMPLATE = """Evaluate this AI response from the perspective of: {perspective_name} +Evaluate this AI response from the perspective of: {perspective_name} {perspective_instructions} +TURN CONTEXT: REQUEST: {request} RESPONSE SUMMARY (compact): @@ -75,19 +80,23 @@ RISK CONTEXT: {risk_signals} {change_log_section} +""" -""" + RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS +# Shared system prompt body: static block first, then REQUEST/RESPONSE (full) or thin sections. 
+PERSPECTIVE_SYSTEM_FULL_BODY = RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS + """ -# Shared system prompt body: REQUEST + RESPONSE (or thin) + common instructions only (no perspective) -PERSPECTIVE_SYSTEM_FULL_BODY = """REQUEST: {request} +TURN CONTEXT: +REQUEST: {request} RESPONSE: {response} RISK CONTEXT: {risk_signals} +""" -""" + RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS +PERSPECTIVE_SYSTEM_THIN_BODY = RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS + """ -PERSPECTIVE_SYSTEM_THIN_BODY = """REQUEST: {request} +TURN CONTEXT: +REQUEST: {request} RESPONSE SUMMARY (compact): {response_summary} @@ -97,8 +106,7 @@ RISK CONTEXT: {risk_signals} {change_log_section} - -""" + RISK_CONTEXT_INTERPRETATION + PERSPECTIVE_COMMON_INSTRUCTIONS +""" def build_perspectives_system_prompt( diff --git a/moralstack/prompts/simulator_prompt.py b/moralstack/prompts/simulator_prompt.py index b10be8e..8880141 100644 --- a/moralstack/prompts/simulator_prompt.py +++ b/moralstack/prompts/simulator_prompt.py @@ -85,31 +85,16 @@ DEFAULT_DOMAIN_GUIDANCE = "Consider whether the intent is educational/protective vs exploitative." -# FULL template -SIMULATOR_FULL_TEMPLATE = ( +# FULL template: longest static prefix first (prompt caching); TURN PARAMETERS and draft last. +SIMULATOR_FULL_STATIC_PREFIX = ( """ -Generate {num_scenarios} potential consequences for this AI response. - -REQUEST: -{request} - -RESPONSE: -{response} - -RISK CONTEXT: -{risk_signals} - -DOMAIN CONTEXT: -{domain} - -DOMAIN-SPECIFIC GUIDANCE: - -{domain_guidance} +CONTRACT (fixed instructions; identical across requests when this prefix is reused): Requirements: -- Generate exactly {num_scenarios} consequences. -- Return exactly {num_scenarios} objects inside "consequences". +- Generate exactly N potential consequences for one AI response, where N is the integer num_scenarios in + TURN PARAMETERS below. +- Return exactly N objects inside "consequences". 
- Include a balanced mix: - at least one positive outcome (scenario_type: positive_outcome) @@ -162,6 +147,30 @@ """ ) +SIMULATOR_FULL_TEMPLATE = SIMULATOR_FULL_STATIC_PREFIX + """ + +--- + +TURN PARAMETERS: +num_scenarios: {num_scenarios} + +REQUEST: +{request} + +RESPONSE: +{response} + +RISK CONTEXT: +{risk_signals} + +DOMAIN CONTEXT: +{domain} + +DOMAIN-SPECIFIC GUIDANCE: + +{domain_guidance} +""" + # THIN template SIMULATOR_THIN_TEMPLATE = ( diff --git a/moralstack/reports/markdown_export.py b/moralstack/reports/markdown_export.py index 4a277fa..c8e4221 100644 --- a/moralstack/reports/markdown_export.py +++ b/moralstack/reports/markdown_export.py @@ -11,12 +11,14 @@ from __future__ import annotations import json +import os from datetime import datetime from typing import Any from moralstack.persistence.config import get_db_path from moralstack.persistence.db import ( get_decision_traces_for_request, + get_models_used_for_run, get_requests_for_run, get_run, ) @@ -121,6 +123,102 @@ def _build_benchmark_comparison_section(br: dict[str, Any]) -> str: return "\n".join(lines) +def format_models_used_markdown(models_cfg: dict[str, Any], *, primary_model: str = "gpt-4o") -> str: + """ + Renders the '### Models used' markdown table from a benchmark-style ``models_config`` dict. + Matches the benchmark report header (including parallel risk rows when present). 
+ """ + ms = models_cfg.get("moralstack") or {} + m = primary_model or "gpt-4o" + baseline = models_cfg.get("baseline", m) + judge = models_cfg.get("judge", m) + if ms.get("risk_parallel"): + risk_rows = ( + f"| **MoralStack risk** | parallel mini-estimators |\n" + f"| **MoralStack risk · intent** | {ms.get('risk_intent', '—')} |\n" + f"| **MoralStack risk · signals** | {ms.get('risk_signals', '—')} |\n" + f"| **MoralStack risk · operational** | {ms.get('risk_operational', '—')} |" + ) + else: + risk_rows = f"| **MoralStack risk** | {ms.get('risk', m)} |" + pr = ms.get("policy_rewrite", ms.get("policy", m)) + return f"""### Models used + +| Component | Model | +|-----------|-------| +| **Baseline** | {baseline} | +| **Judge** | {judge} | +| **MoralStack policy** | {ms.get('policy', m)} | +| **MoralStack policy (rewrite)** | {pr} | +{risk_rows} +| **MoralStack critic** | {ms.get('critic', m)} | +| **MoralStack simulator** | {ms.get('simulator', m)} | +| **MoralStack hindsight** | {ms.get('hindsight', m)} | +| **MoralStack perspectives** | {ms.get('perspectives', m)} | +""" + + +def _resolve_models_config_for_run(run_id: str) -> tuple[dict[str, Any], str]: + """ + Returns (models_config, primary_model) for a run. + + Priority: + 1. Benchmark JSON ``models_config`` (snapshot taken at benchmark time). + 2. Persisted ``llm_calls.model`` column (actual models used at run time). + 3. Env-vars fallback (last resort; may not match the run). 
+ """ + env_policy = (os.getenv("OPENAI_MODEL") or "gpt-4o").strip() + run = get_run(run_id) + run_type = (run.get("run_type") or "").strip().lower() if run else "" + + # --- Benchmark: prefer the embedded snapshot --- + if run_type == "benchmark": + br = load_benchmark_report(run_id) + if br and isinstance(br, dict): + mc = br.get("models_config") + if mc: + _overlay_db_models(mc, run_id) + return mc, (br.get("model") or env_policy) + model = br.get("model") or env_policy + bm = br.get("baseline_model") or model + jm = br.get("judge_model") or model + cfg = _get_benchmark_models_config_fallback(bm, jm, moralstack_policy_model=model) + _overlay_db_models(cfg, run_id) + return cfg, model + + # --- Single / interactive run: build from DB --- + db_models = get_models_used_for_run(run_id) + primary = db_models.get("policy_generate") or env_policy + ms: dict[str, Any] = { + "policy": primary, + "policy_rewrite": db_models.get("policy_rewrite", primary), + "risk": db_models.get("risk", primary), + "critic": db_models.get("critic", primary), + "simulator": db_models.get("simulator", primary), + "hindsight": db_models.get("hindsight", primary), + "perspectives": db_models.get("perspectives", primary), + } + cfg: dict[str, Any] = {"baseline": "—", "judge": "—", "moralstack": ms} + return cfg, primary + + +def _overlay_db_models(cfg: dict[str, Any], run_id: str) -> None: + """Patch *cfg* in-place with actually-used models from ``llm_calls``.""" + db_models = get_models_used_for_run(run_id) + if not db_models: + return + ms = cfg.get("moralstack") + if not isinstance(ms, dict): + return + if "policy_generate" in db_models: + ms["policy"] = db_models["policy_generate"] + if "policy_rewrite" in db_models: + ms["policy_rewrite"] = db_models["policy_rewrite"] + for key in ("risk", "critic", "simulator", "hindsight", "perspectives"): + if key in db_models: + ms[key] = db_models[key] + + def export_request_markdown(run_id: str, request_id: str) -> str: """ Exports a single 
request's deliberation report as markdown. @@ -138,7 +236,12 @@ def export_request_markdown(run_id: str, request_id: str) -> str: return "# Error: No database configured (MORALSTACK_DB_PATH)" return f"# Error: Request {request_id} not found in run {run_id}" - md = render_request_report(report) + models_cfg, primary_model = _resolve_models_config_for_run(run_id) + models_used_md = format_models_used_markdown(models_cfg, primary_model=primary_model) + md = render_request_report( + report, + models_used_section=f"---\n\n{models_used_md}", + ) if report.benchmark_result: md += "\n\n" + _build_benchmark_comparison_section(report.benchmark_result) @@ -203,11 +306,15 @@ def _get_benchmark_models_config_fallback( except ImportError: risk_m = critic_m = simulator_m = hindsight_m = perspectives_m = policy_fallback + rewrite_raw = (os.getenv("MORALSTACK_POLICY_REWRITE_MODEL") or "").strip() + policy_rewrite_m = rewrite_raw if rewrite_raw else policy_fallback + return { "baseline": baseline_model, "judge": judge_model, "moralstack": { "policy": policy_fallback, + "policy_rewrite": policy_rewrite_m, "risk": risk_m, "critic": critic_m, "simulator": simulator_m, @@ -278,24 +385,9 @@ def _build_benchmark_section_from_dict(report: dict, section_builder: str) -> st models_cfg = report.get("models_config") if not models_cfg: models_cfg = _get_benchmark_models_config_fallback(baseline_model, judge_model, moralstack_policy_model=model) - ms = models_cfg.get("moralstack", {}) models_block = "" - if ms: - models_block = f""" - -### Models used - -| Component | Model | -|-----------|-------| -| **Baseline** | {models_cfg.get('baseline', model)} | -| **Judge** | {models_cfg.get('judge', judge_model)} | -| **MoralStack policy** | {ms.get('policy', model)} | -| **MoralStack risk** | {ms.get('risk', model)} | -| **MoralStack critic** | {ms.get('critic', model)} | -| **MoralStack simulator** | {ms.get('simulator', model)} | -| **MoralStack hindsight** | {ms.get('hindsight', model)} | -| 
**MoralStack perspectives** | {ms.get('perspectives', model)} | -""" + if models_cfg.get("moralstack"): + models_block = "\n\n" + format_models_used_markdown(models_cfg, primary_model=model) + "\n" return f"""# 🧪 MoralStack Benchmark Report > **Report generated**: {ts} diff --git a/moralstack/reports/renderer_markdown.py b/moralstack/reports/renderer_markdown.py index 6bada13..70ce3f6 100644 --- a/moralstack/reports/renderer_markdown.py +++ b/moralstack/reports/renderer_markdown.py @@ -488,13 +488,19 @@ def render_request_footer(report: "RequestReport") -> str: """ -def render_request_report(report: "RequestReport") -> str: - """Render full request/deliberation report markdown from RequestReport.""" +def render_request_report(report: "RequestReport", *, models_used_section: str = "") -> str: + """Render full request/deliberation report markdown from RequestReport. + + Optional ``models_used_section``: markdown block (e.g. '### Models used' table) inserted + after the request header and before the executive summary (used by UI export). 
+ """ orch = render_orchestrator_observability(report) sections = [ render_request_header(report), - render_executive_summary(report), ] + if models_used_section and models_used_section.strip(): + sections.append(models_used_section.strip()) + sections.append(render_executive_summary(report)) if orch: sections.append(orch) sections.extend( diff --git a/moralstack/runtime/modules/critic_module.py b/moralstack/runtime/modules/critic_module.py index 48a8db6..99833c9 100644 --- a/moralstack/runtime/modules/critic_module.py +++ b/moralstack/runtime/modules/critic_module.py @@ -309,6 +309,7 @@ def __init__( temperature=self.config.temperature, top_p=self.config.top_p, stop_sequences=[], + response_format={"type": "json_object"}, ) def critique( @@ -500,6 +501,7 @@ def quick_check( max_tokens=256, temperature=0.1, top_p=self.config.top_p, + response_format={"type": "json_object"}, ) try: diff --git a/moralstack/runtime/modules/hindsight_module.py b/moralstack/runtime/modules/hindsight_module.py index c2d8ac1..e96e970 100644 --- a/moralstack/runtime/modules/hindsight_module.py +++ b/moralstack/runtime/modules/hindsight_module.py @@ -509,6 +509,7 @@ def __init__( temperature=self.config.temperature, top_p=self.config.top_p, stop_sequences=[], + response_format={"type": "json_object"}, ) # Pesi per HindsightScores diff --git a/moralstack/runtime/modules/perspective_module.py b/moralstack/runtime/modules/perspective_module.py index d8bba47..3f3535f 100644 --- a/moralstack/runtime/modules/perspective_module.py +++ b/moralstack/runtime/modules/perspective_module.py @@ -419,6 +419,7 @@ def __init__( temperature=self.config.temperature, top_p=self.config.top_p, stop_sequences=[], + response_format={"type": "json_object"}, ) def evaluate( @@ -883,8 +884,7 @@ def apply_constitutional_override( # Override: cap weighted_approval to 0.2 if inner.weighted_approval > 0.2: logging.getLogger(__name__).info( - "Perspective override applied due to HARD constitutional violation: " - 
"weighted_approval capped from %.2f to 0.20", + "Perspective override applied due to HARD constitutional violation: weighted_approval capped from %.2f to 0.20", inner.weighted_approval, ) inner.weighted_approval = min(inner.weighted_approval, 0.2) diff --git a/moralstack/runtime/modules/simulator_module.py b/moralstack/runtime/modules/simulator_module.py index 85d083c..dd8cd64 100644 --- a/moralstack/runtime/modules/simulator_module.py +++ b/moralstack/runtime/modules/simulator_module.py @@ -293,6 +293,7 @@ def __init__( temperature=self.config.temperature, top_p=self.config.top_p, stop_sequences=[], + response_format={"type": "json_object"}, ) def simulate( diff --git a/moralstack/ui/app.py b/moralstack/ui/app.py index 87423eb..249b030 100644 --- a/moralstack/ui/app.py +++ b/moralstack/ui/app.py @@ -444,6 +444,23 @@ def _group_calls_into_tiers_and_enrich(calls: list) -> list[list[dict]]: all_tiers = time_tiers + merged_seq_tiers all_tiers.sort(key=lambda t: min((c.get("started_at") or 0) for c in t)) + # ── Post-merge: collapse adjacent tiers that overlap in time ──────── + # With full-parallel evaluation (critic||sim||persp) or speculative + # overlap (risk||generate), modules from different static vtiers may + # actually run concurrently. Merge them so the UI shows a single + # parallel tier instead of misleading sequential steps. 
+ if len(all_tiers) > 1: + collapsed: list[list[dict]] = [all_tiers[0]] + for tier in all_tiers[1:]: + prev = collapsed[-1] + prev_max_end = max((c.get("started_at") or 0) + (c.get("duration_ms") or 0) for c in prev) + tier_min_start = min((c.get("started_at") or 0) for c in tier) + if tier_min_start < prev_max_end: + prev.extend(tier) + else: + collapsed.append(tier) + all_tiers = collapsed + # ── Enrich with timing info ────────────────────────────────────────── processed: list[list[dict]] = [] for tier in all_tiers: diff --git a/moralstack/ui/templates/run.html b/moralstack/ui/templates/run.html index d898307..7b75ef8 100644 --- a/moralstack/ui/templates/run.html +++ b/moralstack/ui/templates/run.html @@ -69,6 +69,10 @@

Models used

MoralStack policy {{ benchmark_report.models_config.moralstack.policy }} + + MoralStack policy (rewrite) + {{ benchmark_report.models_config.moralstack.policy_rewrite | default(benchmark_report.models_config.moralstack.policy, true) }} + {% if benchmark_report.models_config.moralstack.risk_parallel %} MoralStack risk diff --git a/moralstack/utils/openai_params.py b/moralstack/utils/openai_params.py index 44c9e65..0c03b22 100644 --- a/moralstack/utils/openai_params.py +++ b/moralstack/utils/openai_params.py @@ -2,6 +2,7 @@ OpenAI API parameter helpers. Single source of truth for model-specific params. Newer models (gpt-5.x, o-series) require max_completion_tokens instead of max_tokens. +Predicted output support is determined by model compatibility. """ from __future__ import annotations @@ -10,6 +11,18 @@ MODELS_REQUIRING_MAX_COMPLETION_TOKENS = ("o1", "o3", "o4", "gpt-5") +# Models that support the ``prediction`` parameter for speculative decoding. +# Predicted outputs are incompatible with max_completion_tokens, logprobs, +# and n > 1. Only models using the legacy max_tokens param qualify. +# Reference: https://platform.openai.com/docs/guides/predicted-outputs +MODELS_SUPPORTING_PREDICTED_OUTPUT = ( + "gpt-4o", + "gpt-4o-mini", + "gpt-4.1", + "gpt-4.1-mini", + "gpt-4.1-nano", +) + def uses_max_completion_tokens(model: str) -> bool: """True if model requires max_completion_tokens instead of max_tokens.""" @@ -17,6 +30,17 @@ def uses_max_completion_tokens(model: str) -> bool: return any(m.startswith(p) for p in MODELS_REQUIRING_MAX_COMPLETION_TOKENS) +def supports_predicted_output(model: str) -> bool: + """True if model supports the ``prediction`` parameter (speculative decoding). + + Predicted outputs speed up generation when the expected output is largely + similar to a known text (e.g. a draft revision). The feature is only + available on models that use the legacy ``max_tokens`` parameter. 
+ """ + m = (model or "").lower() + return any(m.startswith(p) for p in MODELS_SUPPORTING_PREDICTED_OUTPUT) + + def completion_tokens_param(model: str, max_tokens: int) -> dict[str, Any]: """Returns the correct param dict for chat.completions.create.""" if uses_max_completion_tokens(model): diff --git a/scripts/benchmark_moralstack.py b/scripts/benchmark_moralstack.py index 878c4f3..ae5cd89 100644 --- a/scripts/benchmark_moralstack.py +++ b/scripts/benchmark_moralstack.py @@ -1468,6 +1468,7 @@ def get_benchmark_models_config( Baseline uses MORALSTACK_BENCHMARK_BASELINE_MODEL exclusively. MoralStack modules use env vars (MORALSTACK_*_MODEL); if unset, fall back to moralstack_policy_model. + Includes `policy_rewrite` from `MORALSTACK_POLICY_REWRITE_MODEL` (or primary policy when unset). When parallel mini-estimators are enabled, includes risk_intent, risk_signals, risk_operational keys. """ @@ -1512,8 +1513,12 @@ def get_benchmark_models_config( except ImportError: risk_m = critic_m = simulator_m = hindsight_m = perspectives_m = policy_fallback + rewrite_raw = (os.getenv("MORALSTACK_POLICY_REWRITE_MODEL") or "").strip() + policy_rewrite_m = rewrite_raw if rewrite_raw else policy_fallback + moralstack_cfg: dict[str, Any] = { "policy": policy_fallback, + "policy_rewrite": policy_rewrite_m, "risk": risk_m, "critic": critic_m, "simulator": simulator_m, @@ -2813,6 +2818,7 @@ def _header(self, report: BenchmarkReport) -> str: | **Baseline** | {models_cfg["baseline"]} | | **Judge** | {models_cfg["judge"]} | | **MoralStack policy** | {ms["policy"]} | +| **MoralStack policy (rewrite)** | {ms["policy_rewrite"]} | {risk_rows} | **MoralStack critic** | {ms["critic"]} | | **MoralStack simulator** | {ms["simulator"]} |