diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json index b28bbad..f8014e3 100644 --- a/.agents/plugins/marketplace.json +++ b/.agents/plugins/marketplace.json @@ -28,6 +28,18 @@ }, "category": "Full Stack" }, + { + "name": "bedrock", + "source": { + "source": "local", + "path": "./plugins/bedrock" + }, + "policy": { + "installation": "AVAILABLE", + "authentication": "ON_INSTALL" + }, + "category": "AI" + }, { "name": "aws-serverless", "source": { diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 7130193..8a37535 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -48,6 +48,24 @@ "tags": ["aws", "amplify", "fullstack"], "version": "1.0.0" }, + { + "category": "ai", + "description": "Guided Amazon Bedrock setup: IAM configuration, model access, prompt caching, observability, and cost analysis.", + "keywords": [ + "aws", + "bedrock", + "prompt-caching", + "setup", + "generative-ai", + "foundation-models", + "observability", + "cost-analysis" + ], + "name": "bedrock", + "source": "./plugins/bedrock", + "tags": ["aws", "bedrock", "generative-ai", "prompt-caching"], + "version": "0.4.1" + }, { "category": "development", "description": "Design, build, deploy, test, and debug serverless applications with AWS Serverless services.", diff --git a/README.md b/README.md index 1e51475..ed3bd7c 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ To maximize the benefits of plugin-assisted development while maintaining securi | --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------- | | **amazon-location-service** | Add maps, geocoding, routing, places search, and geospatial features to applications with Amazon Location Service | Available | | **aws-amplify** | Build 
full-stack apps with AWS Amplify Gen 2 using guided workflows for auth, data, storage, and functions | Available | +| **bedrock** | Guided Amazon Bedrock setup: IAM configuration, model access, prompt caching, observability, and cost analysis | Available | | **aws-serverless** | Build serverless applications with Lambda, API Gateway, EventBridge, Step Functions, and durable functions | Available | | **databases-on-aws** | Database guidance for the AWS database portfolio — schema design, queries, migrations, and multi-tenant patterns | Some Services Available (Aurora DSQL) | | **deploy-on-aws** | Deploy applications to AWS with architecture recommendations, cost estimates, and IaC deployment | Available | @@ -62,6 +63,12 @@ or or +```bash +/plugin install bedrock@agent-plugins-for-aws +``` + +or + ```bash /plugin install aws-serverless@agent-plugins-for-aws ``` @@ -227,6 +234,35 @@ Design, build, deploy, test, and debug serverless applications with AWS Lambda, | --------------------------- | --------------------------------------------- | --------------------------------------------- | | **SAM template validation** | After edits to `template.yaml`/`template.yml` | Runs `sam validate` and reports errors inline | +## bedrock + +Guided Amazon Bedrock setup — IAM configuration, model access, prompt caching, observability, quota optimization, and cost analysis. 
+ +### Agent Skill Triggers + +| Agent Skill | Triggers | +| ----------- | -------------------------------------------------------------------------------------------------------------------------- | +| **bedrock** | "set up bedrock", "bedrock onboarding", "prompt caching", "bedrock IAM", "bedrock costs", "bedrock quota", "bedrock usage" | + +### Commands + +| Command | Description | +| -------------------------------- | -------------------------------------------------------------- | +| `/bedrock` | Unified entry point — routes to the right capability | +| `/bedrock-setup` | Interactive onboarding: IAM, model access, caching, validation | +| `/bedrock-cache` | Set up and validate prompt caching | +| `/bedrock-cache-debug` | Diagnose prompt caching issues | +| `/bedrock-quota` | Check quota health and detect max_tokens waste | +| `/bedrock-usage` | Analyze token consumption from CloudWatch | +| `/bedrock-costs` | Analyze actual Bedrock spend from AWS Cost Explorer | +| `/bedrock-validate-model-access` | Validate IAM permissions and model access | + +### MCP Servers + +| Server | Purpose | +| --------------------- | ------------------------------ | +| **aws-documentation** | AWS documentation and guidance | + ## databases-on-aws Database guidance for the AWS database portfolio. Design schemas, execute queries, handle migrations, build applications, and choose the right database for your workload. Currently includes Aurora DSQL — a serverless, PostgreSQL-compatible distributed SQL database. 
diff --git a/plugins/bedrock/.claude-plugin/plugin.json b/plugins/bedrock/.claude-plugin/plugin.json new file mode 100644 index 0000000..153d291 --- /dev/null +++ b/plugins/bedrock/.claude-plugin/plugin.json @@ -0,0 +1,20 @@ +{ + "author": { + "name": "Amazon Web Services" + }, + "description": "Guided Amazon Bedrock setup: IAM configuration, model access, prompt caching, observability, and cost analysis.", + "homepage": "https://github.com/awslabs/agent-plugins", + "keywords": [ + "aws", + "bedrock", + "prompt-caching", + "onboarding", + "setup", + "generative-ai", + "foundation-models" + ], + "license": "Apache-2.0", + "name": "bedrock", + "repository": "https://github.com/awslabs/agent-plugins", + "version": "0.4.1" +} diff --git a/plugins/bedrock/.codex-plugin/plugin.json b/plugins/bedrock/.codex-plugin/plugin.json new file mode 100644 index 0000000..f142312 --- /dev/null +++ b/plugins/bedrock/.codex-plugin/plugin.json @@ -0,0 +1,44 @@ +{ + "name": "bedrock", + "version": "0.4.1", + "description": "Guided Amazon Bedrock setup: IAM configuration, model access, prompt caching, observability, and cost analysis.", + "author": { + "name": "Amazon Web Services", + "email": "aws-agent-plugins@amazon.com", + "url": "https://github.com/awslabs/agent-plugins" + }, + "homepage": "https://github.com/awslabs/agent-plugins", + "repository": "https://github.com/awslabs/agent-plugins", + "license": "Apache-2.0", + "keywords": [ + "aws", + "bedrock", + "prompt-caching", + "onboarding", + "setup", + "generative-ai", + "foundation-models" + ], + "skills": "./skills/", + "mcpServers": "./.mcp.json", + "interface": { + "displayName": "Amazon Bedrock", + "shortDescription": "Set up Bedrock with IAM, model access, prompt caching, and cost analysis.", + "longDescription": "Guided Amazon Bedrock setup \u2014 IAM configuration, model access, prompt caching, observability, quota optimization, and cost analysis.", + "defaultPrompt": [ + "Set up Amazon Bedrock for my AWS account.", + "How 
much have I spent on Bedrock this month?", + "Am I getting throttled on any Bedrock models?" + ], + "developerName": "Amazon Web Services", + "category": "AI", + "capabilities": [ + "Read", + "Write" + ], + "websiteURL": "https://github.com/awslabs/agent-plugins", + "privacyPolicyURL": "https://aws.amazon.com/privacy/", + "termsOfServiceURL": "https://aws.amazon.com/service-terms/", + "brandColor": "#FF9900" + } +} diff --git a/plugins/bedrock/.mcp.json b/plugins/bedrock/.mcp.json new file mode 100644 index 0000000..9cae282 --- /dev/null +++ b/plugins/bedrock/.mcp.json @@ -0,0 +1,14 @@ +{ + "mcpServers": { + "aws-documentation": { + "command": "uvx", + "args": [ + "awslabs.aws-documentation-mcp-server@latest" + ], + "env": { + "FASTMCP_LOG_LEVEL": "ERROR" + }, + "timeout": 120000 + } + } +} diff --git a/plugins/bedrock/CLAUDE.md b/plugins/bedrock/CLAUDE.md new file mode 100644 index 0000000..e7e55d4 --- /dev/null +++ b/plugins/bedrock/CLAUDE.md @@ -0,0 +1,41 @@ +# Bedrock Plugin + +## Project Overview + +Claude Code plugin (`awslabs/agent-plugins` format) for Amazon Bedrock onboarding. +Guides developers through IAM setup, model access, prompt caching, observability, and cost analysis. 
+ +## Plugin Format + +- `.claude-plugin/plugin.json` — plugin metadata +- `skills/` — SKILL.md files with YAML frontmatter (name, description, argument-hint) +- `commands/` — slash command definitions with YAML frontmatter (name, description) +- `scripts/` — executable scripts referenced via `${CLAUDE_PLUGIN_ROOT}/scripts/` +- `.mcp.json` — MCP server configuration + +## Key Conventions + +- Scripts use `${CLAUDE_PLUGIN_ROOT}` for portable paths — never hardcode absolute paths +- Default model: Claude Sonnet 4.6 (`us.anthropic.claude-sonnet-4-6`) +- Default region: `us-east-1` +- AWS profile: always ask the developer to confirm — never auto-select +- Reference docs in `skills/bedrock/references/` are loaded on-demand by topic +- Shell scripts must be POSIX-compatible and executable (`chmod +x`) +- Python scripts require Python 3.10+ and boto3 only — no other dependencies + +## Testing Changes + +1. Run `scripts/validate-bedrock-access.sh us.anthropic.claude-sonnet-4-6 us-east-1 PROFILE` — all 4 checks must pass +2. Run `python3 scripts/validate-prompt-caching.py --model-id us.anthropic.claude-sonnet-4-6 --region us-east-1 --profile PROFILE` — cache write + read confirmed +3. Run `python3 scripts/check-quota-health.py --model-id us.anthropic.claude-sonnet-4-6 --region us-east-1 --profile PROFILE` — quota analysis completes without error +4. Run `python3 scripts/debug-prompt-cache.py --model-id us.anthropic.claude-sonnet-4-6 --region us-east-1 --profile PROFILE` — all 6 diagnostic tests pass +5. Run `python3 scripts/analyze-bedrock-usage.py --model-id us.anthropic.claude-sonnet-4-6 --period 1 --profile PROFILE` — usage report generates +6. Run `python3 scripts/analyze-bedrock-costs.py --period 1 --profile PROFILE` — cost report generates (may show $0 if no traffic) +7. 
Each validation run costs ~$0.01-0.05 in Bedrock API calls; cache debug costs ~$0.02-0.08 + +## What Not to Do + +- Don't add dependencies beyond boto3 and AWS CLI +- Don't hardcode AWS credentials or account IDs in scripts +- Don't write custom IAM policies in examples — recommend `AmazonBedrockLimitedAccess` managed policy +- Don't run AWS commands without first confirming the profile with the developer diff --git a/plugins/bedrock/commands/bedrock-cache-debug.md b/plugins/bedrock/commands/bedrock-cache-debug.md new file mode 100644 index 0000000..e67838c --- /dev/null +++ b/plugins/bedrock/commands/bedrock-cache-debug.md @@ -0,0 +1,53 @@ +--- +name: bedrock-cache-debug +description: "Diagnose prompt caching issues: model support, thresholds, TTL, and cost analysis" +--- + +# Bedrock Cache Debugger + +Run 6 automated diagnostic tests to identify exactly why prompt caching is not working or is underperforming. + +## Required Permissions + +```json +{ + "Effect": "Allow", + "Action": ["bedrock:Converse"], + "Resource": "arn:aws:bedrock:::foundation-model/" +} +``` + +## Step 1: Run Cache Diagnostics + +Ask the user which model they're using (default: Claude Sonnet 4.6). + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/debug-prompt-cache.py --model-id --region +``` + +The 6 tests: + +1. **Model Support** — Does the model support caching? (silently ignored if not) +2. **Token Threshold** — Is content above the minimum? (silently ignored if below) +3. **Cache Write/Read** — Does the cache write-then-read cycle work? +4. **Prefix Sensitivity** — Demonstrates that even small content changes break the cache +5. **TTL Behavior** — Confirms cache persists within the TTL window +6. **Break-Even** — How many requests per TTL window before caching saves money? 
+ +## Step 2: Diagnose Failures + +If any test fails, load the reference doc for targeted guidance: + +Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/prompt-caching.md` and focus on: + +- "Why Isn't My Cache Working?" for the specific failure mode +- "Preventing Cache Fragmentation" if prefix sensitivity is the issue +- "Break-Even Analysis" if the cost math doesn't work for their use case + +## Step 3: Recommend Strategy + +Based on the results, advise on: + +- Simplified vs explicit caching for their model +- 5-minute vs 1-hour TTL for their request pattern +- Whether caching is cost-effective at their request volume diff --git a/plugins/bedrock/commands/bedrock-cache.md b/plugins/bedrock/commands/bedrock-cache.md new file mode 100644 index 0000000..b02f912 --- /dev/null +++ b/plugins/bedrock/commands/bedrock-cache.md @@ -0,0 +1,73 @@ +--- +name: bedrock-cache +description: "Set up and validate prompt caching (simplified or explicit)" +--- + +# Bedrock Prompt Caching Setup + +Help the developer configure and validate prompt caching on Bedrock. 
+ +## Step 1: Choose Caching Strategy + +Ask the developer: + +**Simplified caching** (recommended for Claude models): + +- Single `cachePoint` marker in the system or message blocks +- Bedrock automatically checks ~20 preceding blocks for cache hits +- Easiest to implement, fewer lines of code +- Claude models only + +**Explicit caching** (for all supported models): + +- Place `cachePoint` markers at specific positions +- Granular control over what gets cached +- Supports mixed TTL (1h + 5min) for different content sections +- Works with Claude, Nova, and other supported models + +## Required Permissions + +```json +{ + "Effect": "Allow", + "Action": ["bedrock:Converse", "bedrock:ConverseStream"], + "Resource": "arn:aws:bedrock:::foundation-model/" +} +``` + +## Step 2: Fetch Latest Implementation Guidance + +**Before giving any implementation advice**, fetch the latest prompt caching guidance from the aws-samples repo. This is the authoritative source and takes priority over any other knowledge: + +1. Use the context7 MCP tool to resolve and query `amazon-bedrock-samples` for prompt caching documentation: + - `mcp__plugin_context7_context7__resolve-library-id` with query `amazon-bedrock-samples prompt caching` + - Then `mcp__plugin_context7_context7__query-docs` for the resolved library, topic `prompt caching` +2. If context7 doesn't return results, use WebFetch to read the README at: https://raw.githubusercontent.com/aws-samples/amazon-bedrock-samples/main/introduction-to-bedrock/prompt-caching/README.md +3. 
For specific code samples, fetch directly from the repo: + - `converse_api/` — Model-agnostic examples using Converse API with `cachePoint` syntax (recommended starting point) + - `invoke_model_api/` — Model-specific examples using InvokeModel API (Anthropic `cache_control` format) + - Mixed TTL notebooks for advanced configurations + +Only after reviewing the upstream samples, read `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/prompt-caching.md` for additional conceptual guidance. + +Help the developer adapt the samples to their specific model, region, and use case. + +## Step 3: TTL Configuration + +If using Claude Sonnet 4.6, Opus 4.6, Sonnet 4.5, Opus 4.5, or Haiku 4.5, offer the 1-hour TTL option for rarely-changing content. For older Claude models that support caching, only the default 5-minute TTL is available. + +Remind: when mixing TTLs, longer durations must come before shorter ones. + +## Step 4: Validate + +Run: + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/validate-prompt-caching.py --model-id --region +``` + +Confirm cache write on first request and cache read on second. + +## Step 5: Verify Metrics (Optional) + +Offer to run `/bedrock-usage` to confirm cache metrics are appearing in CloudWatch after the first cached requests. diff --git a/plugins/bedrock/commands/bedrock-costs.md b/plugins/bedrock/commands/bedrock-costs.md new file mode 100644 index 0000000..0d49eb6 --- /dev/null +++ b/plugins/bedrock/commands/bedrock-costs.md @@ -0,0 +1,54 @@ +--- +name: bedrock-costs +description: "Analyze actual Bedrock spend from AWS Cost Explorer — real billed amounts, not estimates" +--- + +# Bedrock Cost Analysis + +Query AWS Cost Explorer for the developer's actual Bedrock charges. All cost data comes directly from the AWS bill — no hardcoded pricing. + +## Required Permissions + +```json +{ + "Effect": "Allow", + "Action": "ce:GetCostAndUsage", + "Resource": "*" +} +``` + +**Prerequisite**: Cost Explorer must be enabled in the account. 
Enable via: AWS Console > Billing > Cost Explorer > Enable Cost Explorer. Takes up to 24 hours to activate. + +## Step 1: Run Cost Analysis + +Ask the user for the time period (default: 7 days). + +Aggregate Bedrock spend: + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/analyze-bedrock-costs.py --period +``` + +Grouped by usage type (shows per-model breakdown): + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/analyze-bedrock-costs.py --period --group-by model +``` + +## Step 2: Interpret Results + +The report shows: + +- Total Amazon Bedrock spend for the period (actual billed amounts) +- Daily cost breakdown +- Per-model cost breakdown (when using `--group-by model`) + +## Step 3: Optimize + +For cost optimization guidance: + +- **Token-level metrics**: Run `/bedrock-usage` for CloudWatch token consumption data +- **Enable caching**: Run `/bedrock-cache` to set up prompt caching +- **Reduce quota waste**: Run `/bedrock-quota` for max_tokens optimization +- **Current pricing**: Use the AWS Docs MCP server to search for "Amazon Bedrock pricing" or visit https://aws.amazon.com/bedrock/pricing/ +- **Optimization strategies**: Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/cost-optimization.md` diff --git a/plugins/bedrock/commands/bedrock-quota.md b/plugins/bedrock/commands/bedrock-quota.md new file mode 100644 index 0000000..ef1b614 --- /dev/null +++ b/plugins/bedrock/commands/bedrock-quota.md @@ -0,0 +1,55 @@ +--- +name: bedrock-quota +description: "Check quota health, detect max_tokens waste, and generate quota increase data" +--- + +# Bedrock Quota Health Check + +Analyze the developer's Bedrock quota utilization, detect the max_tokens pre-reservation trap, and generate data for quota increase requests. 
+ +## Required Permissions + +```json +{ + "Effect": "Allow", + "Action": [ + "cloudwatch:GetMetricStatistics", + "servicequotas:ListServiceQuotas" + ], + "Resource": "*" +} +``` + +## Step 1: Run Quota Health Check + +Ask the user which model and region they're using (defaults: Claude Sonnet 4.6 in us-east-1). + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/check-quota-health.py --model-id --region +``` + +This checks: + +1. Current quota limits (TPM, RPM) +2. Actual token usage from CloudWatch +3. Whether max_tokens is set too high (the #1 cause of throttling) +4. Burndown rate impact (5x for Claude 3.7+ models) +5. Whether cross-region inference would help + +## Step 2: Explain Findings + +If the max_tokens trap is detected, explain using the reference doc: + +Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/quota-optimization.md` and walk the developer through: + +- Why their default max_tokens is wasting quota +- The specific max_tokens value to set based on their actual output distribution +- The burndown rate for their model + +## Step 3: Quota Increase (if needed) + +If the developer needs more quota, the script generates the exact data AWS requires. Guide them through the Service Quotas console request process. + +## Step 4: Cross-Region Inference + +Claude models are only available through cross-region inference (prefixed model IDs like `us.anthropic.claude-sonnet-4-6`). If the developer is having access issues, run `/bedrock-validate-model-access`. For cross-region IAM and SCP guidance, load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/cost-optimization.md`. 
diff --git a/plugins/bedrock/commands/bedrock-setup.md b/plugins/bedrock/commands/bedrock-setup.md new file mode 100644 index 0000000..2fa077c --- /dev/null +++ b/plugins/bedrock/commands/bedrock-setup.md @@ -0,0 +1,76 @@ +--- +name: bedrock-setup +description: "Interactive Bedrock onboarding: IAM, model access, prompt caching, and validation" +--- + +# Bedrock Setup + +Guide the developer through a complete Amazon Bedrock setup. Follow these steps in order, confirming each before proceeding. + +## Required Permissions + +The full setup requires the **`AmazonBedrockLimitedAccess`** AWS managed policy (covers inference, model discovery, and marketplace permissions) plus a supplemental inline policy for observability and cost commands. See `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/iam-permissions.md` for the complete permissions reference. + +## AWS CLI Profile Selection + +Before any AWS command, ask the developer which AWS CLI profile to use. List available profiles by running `aws configure list-profiles`. **Wait for confirmation** — do not auto-select. Then verify with `aws sts get-caller-identity --profile `. If it fails or no profiles exist, load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/profile-setup.md` to guide the developer through profile creation. + +## Step 1: Check Region + +Ask which region they want to use. Recommend `us-east-1` for broadest model availability. + +Run: `aws configure get region --profile ` + +## Step 2: IAM Permissions + +Read the IAM setup reference at `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/iam-permissions.md`. + +The developer needs two things attached to their IAM user or role: + +1. **`AmazonBedrockLimitedAccess`** managed policy — covers inference, model discovery, and marketplace permissions +2. **Supplemental inline policy** (from `iam-permissions.md`) — covers CloudWatch, Service Quotas, Cost Explorer, and STS for the plugin's observability and cost commands + +Walk the developer through attaching both. 
Default model: `us.anthropic.claude-sonnet-4-6` + +## Step 3: Enable Model Access + +Read the model access reference at `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/model-access.md`. + +Remind the developer: + +- All Bedrock serverless models are auto-enabled on first invocation with the correct AWS Marketplace permissions +- **Anthropic models (Claude)**: require a one-time use case form (First Time Use) before first invocation — submit via the Bedrock console or `PutUseCaseForModelAccess` API +- **All other models** (Amazon Nova, Meta Llama, Mistral, DeepSeek, Qwen, OpenAI): work immediately with correct IAM permissions — no marketplace subscription or use case form needed + +## Step 4: Validate Bedrock Access + +Run the validation script: + +```bash +${CLAUDE_PLUGIN_ROOT}/scripts/validate-bedrock-access.sh +``` + +All 4 checks must pass before proceeding. + +## Step 5: Configure Prompt Caching + +Ask the developer: **Do you want simplified or explicit prompt caching?** + +- **Simplified** (recommended for Claude models): Single cache point, automatic cache management +- **Explicit**: Manual checkpoint placement, works with all supported models + +Read `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/prompt-caching.md` for conceptual guidance. For working code samples, reference: https://github.com/aws-samples/amazon-bedrock-samples/tree/main/introduction-to-bedrock/prompt-caching + +## Step 6: Validate Prompt Caching + +Run the end-to-end validation: + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/validate-prompt-caching.py --model-id --region --profile +``` + +Verify that cache write succeeds on the first request and cache read succeeds on the second. + +## Step 7: Check Metrics (Optional) + +Let the developer know they can run `/bedrock-usage` at any time to pull token consumption and caching metrics from CloudWatch. Bedrock publishes metrics automatically — no setup needed. 
diff --git a/plugins/bedrock/commands/bedrock-usage.md b/plugins/bedrock/commands/bedrock-usage.md new file mode 100644 index 0000000..282947c --- /dev/null +++ b/plugins/bedrock/commands/bedrock-usage.md @@ -0,0 +1,54 @@ +--- +name: bedrock-usage +description: "Analyze Bedrock token consumption, invocation counts, and prompt caching efficiency from CloudWatch" +--- + +# Bedrock Usage Analysis + +Query CloudWatch for token consumption metrics — what the developer used, not what they paid. + +## Required Permissions + +```json +{ + "Effect": "Allow", + "Action": "cloudwatch:GetMetricStatistics", + "Resource": "*" +} +``` + +## Step 1: Run Usage Analysis + +Ask the user for the time period and model (defaults: 7 days, Claude Sonnet 4.6). + +For a specific model: + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/analyze-bedrock-usage.py --model-id --region --period +``` + +To discover and analyze all active models: + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/analyze-bedrock-usage.py --all-models --region --period +``` + +## Step 2: Interpret Results + +The report shows: + +- Invocation counts and average tokens per request +- Total input and output token consumption +- Prompt caching efficiency (cache hit ratio, write vs read tokens) +- Warnings for low cache hit ratios or missing caching + +## Step 3: Act on Findings + +Based on the results, suggest next steps: + +- **No caching detected**: Run `/bedrock-cache` to set up prompt caching +- **Low cache hit ratio (<80%)**: Run `/bedrock-cache-debug` to diagnose +- **High token consumption or throttling**: Run `/bedrock-quota` to check for max_tokens waste +- **Want to see actual costs**: Run `/bedrock-costs` for Cost Explorer data + +For deeper metric interpretation, read `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/observability.md`. 
diff --git a/plugins/bedrock/commands/bedrock-validate-model-access.md b/plugins/bedrock/commands/bedrock-validate-model-access.md new file mode 100644 index 0000000..1e3343a --- /dev/null +++ b/plugins/bedrock/commands/bedrock-validate-model-access.md @@ -0,0 +1,54 @@ +--- +name: bedrock-validate-model-access +description: "Validate IAM permissions and model access for Amazon Bedrock" +--- + +# Bedrock Model Access Validation + +Verify that the developer's IAM permissions and model access are configured correctly. + +## Required Permissions + +Attach the **`AmazonBedrockLimitedAccess`** managed policy to the developer's IAM user or role. This covers `bedrock:InvokeModel`, `bedrock:ListFoundationModels`, and `bedrock:GetFoundationModel`. The Converse API requires `bedrock:InvokeModel` (not a separate IAM action). + +For identity verification (`sts:GetCallerIdentity`), attach the supplemental policy from [iam-permissions.md](${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/iam-permissions.md). + +## Step 1: Run Access Validation + +Extract model and region from the developer's question. Build the model ID: + +1. **Resolve the base model ID** from the natural name (e.g. "sonnet 4.6" → `anthropic.claude-sonnet-4-6`). If unsure of the exact model ID, search the AWS Docs MCP server for the current model ID. +2. **Select the cross-region prefix** based on the target region. Claude models **cannot** be invoked with the bare base model ID — `anthropic.claude-sonnet-4-6` will return `ResourceNotFoundException`. You must add a prefix: + - US regions (`us-east-1`, `us-west-2`, etc.) → `us.` prefix + - EU regions (`eu-west-1`, `eu-central-1`, etc.) → `eu.` prefix + - AP regions (`ap-northeast-1`, `ap-southeast-1`, etc.) → `ap.` prefix + - `global.` prefix works for all regions (~10% cost savings via global routing) + - Amazon models (Nova, Titan) use bare model IDs with no prefix +3. **Combine**: `` (e.g. 
`us.anthropic.claude-sonnet-4-6`) + +Defaults: `us.anthropic.claude-sonnet-4-6` in `us-east-1`. + +Always use the validation script — never improvise raw AWS CLI calls: + +```bash +${CLAUDE_PLUGIN_ROOT}/scripts/validate-bedrock-access.sh +``` + +This checks: + +1. AWS credentials are valid (`sts:GetCallerIdentity`) +2. Bedrock service is accessible (`bedrock:ListFoundationModels`) +3. Target model is available (`bedrock:GetFoundationModel`) +4. Inference permissions work (`bedrock:Converse`) + +## Step 2: Diagnose Failures + +If any check fails, load the relevant reference: + +- Credentials invalid → help configure via `aws configure --profile ` +- Model not found → load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/model-access.md` +- Access denied → load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/iam-permissions.md` + +## Step 3: Report + +Summarize: pass/fail for each of the 4 checks, and specific fix instructions for any failures. diff --git a/plugins/bedrock/commands/bedrock.md b/plugins/bedrock/commands/bedrock.md new file mode 100644 index 0000000..465c593 --- /dev/null +++ b/plugins/bedrock/commands/bedrock.md @@ -0,0 +1,73 @@ +--- +name: bedrock +description: "Your starting point for Amazon Bedrock — ask anything about setup, usage, costs, caching, models, or optimization" +argument-hint: "[what do you need help with?]" +--- + +# Bedrock — Unified Entry Point + +## AWS CLI Profile Gate + +Before running ANY command that touches AWS (aws CLI, plugin scripts, boto3), you MUST: + +1. **Ask the developer** which AWS CLI profile to use. List available profiles by running `aws configure list-profiles`. **Wait for the developer to confirm** — do not auto-select a profile, even if one is suggested in CLAUDE.md or environment variables. Never read `~/.aws/credentials` directly — it contains secret keys. +2. **Verify** the confirmed profile with `aws sts get-caller-identity --profile ` and show the account ID and ARN. +3. 
**Pass `--profile `** to all subsequent AWS CLI commands and plugin scripts for the rest of the session. + +Questions that don't require AWS access (docs, architecture, code samples, pricing info) can be answered immediately without a profile. + +--- + +## Script Paths + +`${CLAUDE_PLUGIN_ROOT}` in code blocks below is a placeholder — it is NOT a shell variable. Before running any script, resolve it to the actual plugin directory. Find it by checking the path of this command file (visible in the tool call that loaded it). For example, if this file was loaded from `/Users/me/.claude/plugins/cache/agent-plugins-for-aws/bedrock/0.4.1/commands/bedrock.md`, then `${CLAUDE_PLUGIN_ROOT}` = `/Users/me/.claude/plugins/cache/agent-plugins-for-aws/bedrock/0.4.1`. + +--- + +Interpret the developer's natural language request and route to the appropriate capability. + +## Knowledge Sources + +Use these sources to answer Bedrock questions. Choose the source that best matches the query — you may consult multiple sources or skip sources that aren't relevant. + +- **AWS Docs MCP Server** (`mcp__plugin_bedrock_aws-documentation__search_documentation`, `mcp__plugin_bedrock_aws-documentation__read_documentation`) — Official AWS documentation. Best for: API reference, service limits, pricing, IAM permissions, error codes, feature availability. +- **aws-samples prompt caching repo** (https://github.com/aws-samples/amazon-bedrock-samples/tree/main/introduction-to-bedrock/prompt-caching) — Working code samples for Converse API and InvokeModel API prompt caching. Best for: caching implementation, code examples, cache configuration patterns. +- **Bedrock Central** (https://aws-samples.github.io/sample-amazon-bedrock-central/) — Curated getting-started guides, model discovery, workshops, and sample applications. Best for: onboarding, model comparison, architecture patterns, workshop walkthroughs. 
+- **Plugin reference docs** (`${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/`) — Operational runbooks for the plugin's own scripts. Best for: step-by-step guidance on IAM setup, quota optimization, observability, cost analysis, and cross-region inference. +- **Internet search** — Last resort when other sources don't cover the topic. Always state that the answer came from an internet search. + +Always cite the source that provided your answer. + +## Routing Table + +Match the developer's intent to the appropriate command: + +| Intent | Route To | +| ------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| First-time setup, onboarding, getting started | `/bedrock-setup` | +| Check if my Bedrock access works, verify permissions, can I use model X, is model available, model access in region | `/bedrock-validate-model-access`. Also load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/model-access.md` for model availability context | +| Set up prompt caching, implement caching | `/bedrock-cache` | +| Cache not working, cache debugging, no cache tokens | `/bedrock-cache-debug` | +| Caching concepts: TTL, cache duration, simplified vs explicit, mixed TTL, cachePoint, break-even, thresholds | Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/prompt-caching.md`. For code samples, also reference https://github.com/aws-samples/amazon-bedrock-samples/tree/main/introduction-to-bedrock/prompt-caching | +| Token usage, how many tokens am I using, CloudWatch metrics | `/bedrock-usage` | +| How much am I spending, cost breakdown, billing | `/bedrock-costs` | +| Quota, throttling, max_tokens, rate limits, 429 errors, retry strategy, ThrottlingException | `/bedrock-quota`. 
For retry patterns and error handling code, also load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/quota-optimization.md` (Handling ThrottlingException section) | +| Cross-region, higher throughput, failover, region prefix | Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/cost-optimization.md` (Cross-Region Inference section). For access issues, run `/bedrock-validate-model-access` | +| AWS CLI profile setup, configure profile, which account, multiple accounts | Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/profile-setup.md` | +| IAM permissions, policies, AccessDeniedException | Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/iam-permissions.md` | +| Model comparison, which model should I use | Search AWS Docs MCP for model information, then load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/model-access.md` | +| Pricing, how much does a model cost | Search AWS Docs MCP for "Amazon Bedrock pricing", link to https://aws.amazon.com/bedrock/pricing/ | +| Code samples, how to call Bedrock from my app | Fetch Bedrock Central for samples; reference https://github.com/aws-samples/amazon-bedrock-samples | +| Observability, monitoring, dashboards | Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/observability.md` | +| Provisioned throughput, dedicated capacity, reserved capacity | Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/cost-optimization.md` (When Provisioned Throughput Makes Sense section) | +| Cost optimization, reduce costs, save money | Load `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/cost-optimization.md` | + +## If No Route Matches + +If the request doesn't match any route above: + +1. Check if any plugin reference doc in `${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/` covers the topic — these contain curated, verified guidance and should take priority over external sources +2. Search the AWS Docs MCP server for the topic +3. Check Bedrock Central for relevant resources +4. 
Fall back to internet search only if needed +5. Be transparent about which source provided the answer diff --git a/plugins/bedrock/scripts/analyze-bedrock-costs.py b/plugins/bedrock/scripts/analyze-bedrock-costs.py new file mode 100644 index 0000000..a140565 --- /dev/null +++ b/plugins/bedrock/scripts/analyze-bedrock-costs.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +""" +Bedrock cost analysis using AWS Cost Explorer. + +Queries Cost Explorer for actual billed amounts filtered by Amazon Bedrock. +No hardcoded pricing — all cost data comes directly from your AWS bill. + +Prerequisites: + - Cost Explorer must be enabled in the AWS account (free, but not on by default). + Enable via: AWS Console > Billing > Cost Explorer > Enable Cost Explorer + +Required IAM permissions: + ce:GetCostAndUsage (Resource: *) + +Usage: + python3 analyze-bedrock-costs.py [--region REGION] [--profile PROFILE] [--period DAYS] [--group-by model] +""" + +import argparse +import sys +from datetime import datetime, timedelta, timezone + +try: + import boto3 + from botocore.exceptions import ClientError +except ImportError: + print("[FAIL] boto3 not installed. 
Run: pip3 install boto3") + sys.exit(1) + + +def get_bedrock_costs(session, period_days, group_by_model=False): + """Query Cost Explorer for Bedrock costs.""" + # Cost Explorer endpoint is always us-east-1 + ce = session.client("ce", region_name="us-east-1") + + end_date = datetime.now(timezone.utc).strftime("%Y-%m-%d") + start_date = (datetime.now(timezone.utc) - timedelta(days=period_days)).strftime("%Y-%m-%d") + + filter_expr = { + "Dimensions": { + "Key": "SERVICE", + "Values": ["Amazon Bedrock"], + } + } + + group_by = [] + if group_by_model: + group_by = [{"Type": "DIMENSION", "Key": "USAGE_TYPE"}] + + try: + resp = ce.get_cost_and_usage( + TimePeriod={"Start": start_date, "End": end_date}, + Granularity="DAILY", + Metrics=["UnblendedCost", "UsageQuantity"], + Filter=filter_expr, + GroupBy=group_by if group_by else None, + ) if group_by else ce.get_cost_and_usage( + TimePeriod={"Start": start_date, "End": end_date}, + Granularity="DAILY", + Metrics=["UnblendedCost", "UsageQuantity"], + Filter=filter_expr, + ) + return resp + except ClientError as e: + error_code = e.response["Error"]["Code"] + if error_code == "AccessDeniedException": + print("[FAIL] Access denied to Cost Explorer.") + print(" Ensure your IAM role has ce:GetCostAndUsage permission.") + print() + print(" Minimum IAM policy:") + print(' {"Effect": "Allow", "Action": "ce:GetCostAndUsage", "Resource": "*"}') + sys.exit(1) + elif "not subscribed" in str(e).lower() or "not enabled" in str(e).lower(): + print("[FAIL] Cost Explorer is not enabled in this account.") + print(" Enable it: AWS Console > Billing > Cost Explorer > Enable Cost Explorer") + print(" It takes up to 24 hours to activate after enabling.") + sys.exit(1) + else: + raise + + +def print_summary_report(period_days, response): + """Print aggregate Bedrock cost summary.""" + print("=== Bedrock Cost Report (AWS Cost Explorer) ===") + print(f"Period: Last {period_days} day(s)") + print(f"Source: AWS Cost Explorer (actual billed 
amounts)") + print() + + total_cost = 0.0 + daily_costs = [] + + for result in response.get("ResultsByTime", []): + date = result["TimePeriod"]["Start"] + amount = float(result["Total"]["UnblendedCost"]["Amount"]) + total_cost += amount + if amount > 0: + daily_costs.append((date, amount)) + + if total_cost == 0: + print(" [INFO] No Bedrock charges found for this period.") + print(" Cost Explorer data can take up to 24 hours to appear.") + print() + print(" For token-level usage metrics, run /bedrock-usage instead.") + return 0 + + print("--- Total Spend ---") + print(f" Amazon Bedrock: ${total_cost:>10.2f}") + if daily_costs: + avg_daily = total_cost / len(daily_costs) + print(f" Daily average: ${avg_daily:>10.2f}") + print() + + if daily_costs: + print("--- Daily Breakdown ---") + for date, amount in sorted(daily_costs): + print(f" {date}: ${amount:>10.4f}") + print() + + print("--- Next Steps ---") + print(" Run /bedrock-usage for token-level consumption metrics.") + print(" Run /bedrock-quota to check for max_tokens waste.") + print(f" Current pricing: https://aws.amazon.com/bedrock/pricing/") + print() + + return 0 + + +def print_grouped_report(period_days, response): + """Print Bedrock costs grouped by usage type (model).""" + print("=== Bedrock Cost Report by Usage Type (AWS Cost Explorer) ===") + print(f"Period: Last {period_days} day(s)") + print(f"Source: AWS Cost Explorer (actual billed amounts)") + print() + + usage_totals = {} + grand_total = 0.0 + + for result in response.get("ResultsByTime", []): + for group in result.get("Groups", []): + usage_type = group["Keys"][0] + amount = float(group["Metrics"]["UnblendedCost"]["Amount"]) + if amount > 0: + usage_totals[usage_type] = usage_totals.get(usage_type, 0) + amount + grand_total += amount + + if not usage_totals: + print(" [INFO] No Bedrock charges found for this period.") + print(" Cost Explorer data can take up to 24 hours to appear.") + return 0 + + print("--- Cost by Usage Type ---") + for 
usage_type, amount in sorted(usage_totals.items(), key=lambda x: x[1], reverse=True): + pct = (amount / grand_total * 100) if grand_total > 0 else 0 + print(f" {usage_type:<55} ${amount:>10.4f} ({pct:>5.1f}%)") + print(f" {'─' * 75}") + print(f" {'Total':<55} ${grand_total:>10.4f}") + print() + + print("--- Next Steps ---") + print(" Run /bedrock-usage for token-level consumption metrics.") + print(f" Current pricing: https://aws.amazon.com/bedrock/pricing/") + print() + + return 0 + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze Bedrock costs from AWS Cost Explorer" + ) + parser.add_argument("--region", default="us-east-1", + help="AWS region for session (default: us-east-1)") + parser.add_argument("--profile", required=True, + help="AWS CLI profile name") + parser.add_argument("--period", type=int, default=7, + help="Analysis period in days (default: 7)") + parser.add_argument("--group-by", choices=["model"], default=None, + help="Group costs by usage type (model)") + args = parser.parse_args() + + session = boto3.Session(profile_name=args.profile, region_name=args.region) + + group_by_model = args.group_by == "model" + response = get_bedrock_costs(session, args.period, group_by_model=group_by_model) + + if group_by_model: + exit_code = print_grouped_report(args.period, response) + else: + exit_code = print_summary_report(args.period, response) + + sys.exit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/plugins/bedrock/scripts/analyze-bedrock-usage.py b/plugins/bedrock/scripts/analyze-bedrock-usage.py new file mode 100644 index 0000000..e99563e --- /dev/null +++ b/plugins/bedrock/scripts/analyze-bedrock-usage.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +""" +Bedrock token usage analysis from CloudWatch metrics. + +Queries CloudWatch for token consumption, invocation counts, and prompt caching +efficiency. Does not calculate costs — use /bedrock-costs for actual spend. 
+ +Required IAM permissions: + cloudwatch:GetMetricStatistics (Resource: *) + +Usage: + python3 analyze-bedrock-usage.py [--model-id MODEL_ID] [--region REGION] [--profile PROFILE] [--period DAYS] [--all-models] +""" + +import argparse +import sys +from datetime import datetime, timedelta, timezone + +try: + import boto3 + from botocore.exceptions import ClientError +except ImportError: + print("[FAIL] boto3 not installed. Run: pip3 install boto3") + sys.exit(1) + +KNOWN_MODELS = [ + "anthropic.claude-sonnet-4-6", + "anthropic.claude-opus-4-6", + "anthropic.claude-haiku-4-5", + "anthropic.claude-sonnet-4-5", + "anthropic.claude-opus-4-5", + "anthropic.claude-3-7-sonnet", + "anthropic.claude-3-5-sonnet-v2", + "anthropic.claude-3-5-haiku", + "amazon.nova-pro", + "amazon.nova-lite", + "amazon.nova-micro", +] + + +def strip_prefix(model_id): + clean = model_id + for prefix in ("us.", "eu.", "ap.", "global."): + clean = clean.removeprefix(prefix) + return clean + + +def get_metric(cw, model_id, metric_name, start_time, end_time, period): + """Query a single CloudWatch metric and return the sum.""" + try: + resp = cw.get_metric_statistics( + Namespace="AWS/Bedrock", + MetricName=metric_name, + Dimensions=[{"Name": "ModelId", "Value": model_id}], + StartTime=start_time, + EndTime=end_time, + Period=period, + Statistics=["Sum"], + ) + datapoints = resp.get("Datapoints", []) + return sum(d.get("Sum", 0) for d in datapoints) + except ClientError: + return 0 + + +def get_all_metrics(session, region, model_id, period_days): + """Fetch all token usage metrics for a model.""" + cw = session.client("cloudwatch", region_name=region) + end_time = datetime.now(timezone.utc) + start_time = end_time - timedelta(days=period_days) + period = 3600 + + return { + "input_tokens": get_metric(cw, model_id, "InputTokenCount", start_time, end_time, period), + "output_tokens": get_metric(cw, model_id, "OutputTokenCount", start_time, end_time, period), + "cache_read_tokens": get_metric(cw, 
model_id, "CacheReadInputTokenCount", start_time, end_time, period), + "cache_write_tokens": get_metric(cw, model_id, "CacheWriteInputTokenCount", start_time, end_time, period), + "invocations": get_metric(cw, model_id, "Invocations", start_time, end_time, period), + } + + +def discover_active_models(session, region, period_days): + """Find all models with CloudWatch metrics in the given period.""" + cw = session.client("cloudwatch", region_name=region) + end_time = datetime.now(timezone.utc) + start_time = end_time - timedelta(days=period_days) + + active = [] + for base_model in KNOWN_MODELS: + for prefix in ["", "us.", "eu.", "ap.", "global."]: + model_id = f"{prefix}{base_model}" + try: + resp = cw.get_metric_statistics( + Namespace="AWS/Bedrock", + MetricName="Invocations", + Dimensions=[{"Name": "ModelId", "Value": model_id}], + StartTime=start_time, + EndTime=end_time, + Period=86400, + Statistics=["Sum"], + ) + total = sum(d.get("Sum", 0) for d in resp.get("Datapoints", [])) + if total > 0: + active.append({"model_id": model_id, "invocations": int(total)}) + except ClientError: + pass + return active + + +def print_report(model_id, region, period_days, metrics): + print(f"=== Bedrock Usage Report ===") + print(f"Model: {model_id}") + print(f"Region: {region}") + print(f"Period: Last {period_days} day(s)") + print() + + if metrics["invocations"] == 0: + print(" [INFO] No usage data found for this model/region/period.") + print(" Metrics appear automatically after your first Bedrock API call.") + print() + return 0 + + print("--- Invocations ---") + print(f" Total requests: {metrics['invocations']:>12,.0f}") + avg_input = metrics["input_tokens"] / max(metrics["invocations"], 1) + avg_output = metrics["output_tokens"] / max(metrics["invocations"], 1) + print(f" Avg input tokens: {avg_input:>12,.0f}") + print(f" Avg output tokens: {avg_output:>12,.0f}") + print() + + print("--- Token Consumption ---") + print(f" Input tokens: {metrics['input_tokens']:>12,.0f}") 
+ print(f" Output tokens: {metrics['output_tokens']:>12,.0f}") + total_tokens = metrics["input_tokens"] + metrics["output_tokens"] + print(f" Total tokens: {total_tokens:>12,.0f}") + print() + + total_cache = metrics["cache_read_tokens"] + metrics["cache_write_tokens"] + if total_cache > 0: + print("--- Prompt Caching ---") + print(f" Cache write tokens: {metrics['cache_write_tokens']:>12,.0f}") + print(f" Cache read tokens: {metrics['cache_read_tokens']:>12,.0f}") + hit_ratio = metrics["cache_read_tokens"] / total_cache if total_cache > 0 else 0 + print(f" Cache hit ratio: {hit_ratio:>11.0%}") + if hit_ratio < 0.5: + print(f" \033[1;33m[WARN]\033[0m Low cache hit ratio — more writes than reads.") + print(f" Run /bedrock-cache-debug to diagnose.") + elif hit_ratio >= 0.8: + print(f" \033[0;32m[GOOD]\033[0m High cache hit ratio — caching is working well.") + print() + else: + if metrics["input_tokens"] > 100000: + print("--- Prompt Caching ---") + print(f" \033[1;33m[WARN]\033[0m No caching detected on {metrics['invocations']:,.0f} requests.") + print(f" Run /bedrock-cache to enable prompt caching.") + print() + + print("--- For Cost Analysis ---") + print(f" Run /bedrock-costs to see actual spend from AWS Cost Explorer.") + print(f" Current pricing: https://aws.amazon.com/bedrock/pricing/") + print() + + return 0 + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze Bedrock token usage from CloudWatch metrics" + ) + parser.add_argument("--model-id", default="us.anthropic.claude-sonnet-4-6", + help="Bedrock model ID (default: us.anthropic.claude-sonnet-4-6)") + parser.add_argument("--region", default="us-east-1", + help="AWS region (default: us-east-1)") + parser.add_argument("--profile", required=True, + help="AWS CLI profile name") + parser.add_argument("--period", type=int, default=7, + help="Analysis period in days (default: 7)") + parser.add_argument("--all-models", action="store_true", + help="Scan all models with CloudWatch metrics") + 
args = parser.parse_args() + + session = boto3.Session(profile_name=args.profile, region_name=args.region) + + if args.all_models: + print("=== Discovering Active Models ===") + print(f"Region: {args.region}") + print(f"Period: Last {args.period} day(s)") + print() + + active = discover_active_models(session, args.region, args.period) + if not active: + print(" [INFO] No models with traffic found in this region/period.") + sys.exit(0) + + print(f" Found {len(active)} active model(s):") + for m in sorted(active, key=lambda x: x["invocations"], reverse=True): + print(f" {m['model_id']}: {m['invocations']:,} invocations") + print() + + for m in sorted(active, key=lambda x: x["invocations"], reverse=True): + metrics = get_all_metrics(session, args.region, m["model_id"], args.period) + print_report(m["model_id"], args.region, args.period, metrics) + else: + metrics = get_all_metrics(session, args.region, args.model_id, args.period) + exit_code = print_report(args.model_id, args.region, args.period, metrics) + sys.exit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/plugins/bedrock/scripts/check-quota-health.py b/plugins/bedrock/scripts/check-quota-health.py new file mode 100644 index 0000000..35c35cd --- /dev/null +++ b/plugins/bedrock/scripts/check-quota-health.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python3 +""" +Bedrock quota health check and max_tokens optimization analysis. + +Queries AWS Service Quotas and CloudWatch to diagnose quota utilization, +detect the max_tokens pre-reservation trap, and generate data for quota +increase requests. + +Usage: + python3 check-quota-health.py [--model-id MODEL_ID] [--region REGION] [--profile PROFILE] [--period HOURS] +""" + +import argparse +import sys +from datetime import datetime, timedelta, timezone + +try: + import boto3 + from botocore.exceptions import ClientError +except ImportError: + print("[FAIL] boto3 not installed. 
Run: pip3 install boto3") + sys.exit(1) + +# Burndown rate: Claude 3.7+ uses 5x for output tokens; all others use 1x. +# See: https://docs.aws.amazon.com/bedrock/latest/userguide/quotas-token-burndown.html +BURNDOWN_5X_MODELS = [ + "anthropic.claude-sonnet-4-6", + "anthropic.claude-opus-4-6", + "anthropic.claude-haiku-4-5", + "anthropic.claude-sonnet-4-5", + "anthropic.claude-opus-4-5", + "anthropic.claude-3-7-sonnet", +] + +# Default max_tokens when not explicitly set by the developer +DEFAULT_MAX_TOKENS = { + "anthropic.claude-sonnet-4-6": 64000, + "anthropic.claude-opus-4-6": 64000, + "anthropic.claude-haiku-4-5": 64000, + "anthropic.claude-sonnet-4-5": 64000, + "anthropic.claude-opus-4-5": 64000, + "anthropic.claude-3-7-sonnet": 64000, + "anthropic.claude-3-5-sonnet-v2": 8192, + "anthropic.claude-3-5-haiku": 8192, + "amazon.nova-pro": 5120, + "amazon.nova-lite": 5120, + "amazon.nova-micro": 5120, +} + + +def strip_cross_region_prefix(model_id): + clean = model_id + for prefix in ("us.", "eu.", "ap.", "global."): + clean = clean.removeprefix(prefix) + return clean + + +def get_burndown_rate(model_id): + """Return output token burndown multiplier for quota calculation.""" + clean = strip_cross_region_prefix(model_id) + for model in BURNDOWN_5X_MODELS: + if clean.startswith(model): + return 5 + return 1 + + +def get_default_max_tokens(model_id): + clean = strip_cross_region_prefix(model_id) + for key, val in DEFAULT_MAX_TOKENS.items(): + if clean.startswith(key): + return val + return 4096 + + +def model_display_name(model_id): + """Map model ID to the name AWS uses in Service Quotas.""" + clean = strip_cross_region_prefix(model_id) + name_map = { + "anthropic.claude-sonnet-4-6": "Claude Sonnet 4.6", + "anthropic.claude-opus-4-6": "Claude Opus 4.6", + "anthropic.claude-haiku-4-5": "Claude Haiku 4.5", + "anthropic.claude-sonnet-4-5": "Claude Sonnet 4.5 V1", + "anthropic.claude-opus-4-5": "Claude Opus 4.5", + "anthropic.claude-3-7-sonnet": "Claude 3.7 Sonnet", + 
"anthropic.claude-3-5-sonnet-v2": "Claude 3.5 Sonnet V2", + "anthropic.claude-3-5-haiku": "Claude 3.5 Haiku", + "amazon.nova-pro": "Amazon Nova Pro", + "amazon.nova-lite": "Amazon Nova Lite", + "amazon.nova-micro": "Amazon Nova Micro", + } + for key, name in name_map.items(): + if clean.startswith(key): + return name + return clean + + +def get_quota_limits(session, region, model_id): + """Query Service Quotas for Bedrock TPM and RPM limits.""" + sq = session.client("service-quotas", region_name=region) + quotas = {} + + display_name = model_display_name(model_id).lower() + # Build tight search terms from the model's display name + # e.g., "Claude Sonnet 4.6" -> match only quotas with that exact model name + try: + paginator = sq.get_paginator("list_service_quotas") + for page in paginator.paginate(ServiceCode="bedrock"): + for q in page.get("Quotas", []): + name_lower = q["QuotaName"].lower() + if not any(kw in name_lower for kw in ["tokens per minute", "requests per minute"]): + continue + # Match against the specific model display name + if display_name in name_lower: + quotas[q["QuotaName"]] = { + "value": q["Value"], + "code": q["QuotaCode"], + "adjustable": q.get("Adjustable", False), + } + except ClientError as e: + if "NoSuchResourceException" in str(e) or "AccessDeniedException" in str(e): + pass + else: + raise + + return quotas + + +def get_usage_metrics(session, region, model_id, period_hours): + """Query CloudWatch for Bedrock token usage metrics.""" + cw = session.client("cloudwatch", region_name=region) + end_time = datetime.now(timezone.utc) + start_time = end_time - timedelta(hours=period_hours) + period = 300 if period_hours <= 24 else 3600 # 5min or 1hr granularity + + metrics_config = { + "InputTokenCount": ["Sum", "Average", "Maximum"], + "OutputTokenCount": ["Sum", "Average", "Maximum"], + "Invocations": ["Sum"], + "InvocationLatency": ["Average", "Maximum"], + } + + results = {} + for metric_name, stats in metrics_config.items(): + try: + 
resp = cw.get_metric_statistics( + Namespace="AWS/Bedrock", + MetricName=metric_name, + Dimensions=[{"Name": "ModelId", "Value": model_id}], + StartTime=start_time, + EndTime=end_time, + Period=period, + Statistics=stats, + ) + datapoints = resp.get("Datapoints", []) + if datapoints: + results[metric_name] = { + "datapoints": sorted(datapoints, key=lambda x: x["Timestamp"]), + "count": len(datapoints), + } + for stat in stats: + values = [d[stat] for d in datapoints if stat in d] + if values: + results[metric_name][f"{stat}_all"] = values + results[metric_name][stat] = sum(values) / len(values) if stat == "Average" else ( + sum(values) if stat == "Sum" else max(values) + ) + except ClientError: + pass + + return results + + +def analyze(model_id, metrics, burndown_rate, default_max_tokens, period_hours): + """Analyze metrics and produce recommendations.""" + recommendations = [] + analysis = {} + + total_invocations = metrics.get("Invocations", {}).get("Sum", 0) + total_input = metrics.get("InputTokenCount", {}).get("Sum", 0) + total_output = metrics.get("OutputTokenCount", {}).get("Sum", 0) + + if total_invocations == 0: + return analysis, recommendations + + avg_input = total_input / total_invocations + avg_output = total_output / total_invocations + + # Estimate p90 output from maximum average per period + max_avg_output = metrics.get("OutputTokenCount", {}).get("Maximum", avg_output) + p90_output_estimate = max_avg_output # Conservative: use max observed per-period average + + analysis["avg_input_tokens"] = avg_input + analysis["avg_output_tokens"] = avg_output + analysis["p90_output_estimate"] = p90_output_estimate + analysis["total_invocations"] = total_invocations + + # max_tokens trap analysis + recommended_max_tokens = int(p90_output_estimate * 1.5) + recommended_max_tokens = max(recommended_max_tokens, 256) + + reserved_with_default = avg_input + (burndown_rate * default_max_tokens) + reserved_with_optimal = avg_input + (burndown_rate * 
recommended_max_tokens) + actual_usage = avg_input + (burndown_rate * avg_output) + + analysis["default_max_tokens"] = default_max_tokens + analysis["recommended_max_tokens"] = recommended_max_tokens + analysis["reserved_with_default"] = reserved_with_default + analysis["reserved_with_optimal"] = reserved_with_optimal + analysis["actual_usage"] = actual_usage + + waste_ratio = (reserved_with_default - actual_usage) / reserved_with_default if reserved_with_default > 0 else 0 + analysis["waste_ratio"] = waste_ratio + + if waste_ratio > 0.5: + recommendations.append({ + "severity": "HIGH", + "title": "Set max_tokens to reduce quota waste", + "detail": ( + f"Default max_tokens ({default_max_tokens:,}) reserves {reserved_with_default:,.0f} quota tokens/request " + f"(with {burndown_rate}x burndown), but actual usage is only {actual_usage:,.0f}. " + f"Setting max_tokens to {recommended_max_tokens:,} would reduce reservation to {reserved_with_optimal:,.0f} " + f"({waste_ratio:.0%} less waste)." + ), + }) + + if burndown_rate > 1: + recommendations.append({ + "severity": "INFO", + "title": f"This model uses {burndown_rate}x output token burndown", + "detail": ( + f"Each output token consumes {burndown_rate} tokens from your quota. " + f"With avg {avg_output:,.0f} output tokens/request, that's {avg_output * burndown_rate:,.0f} quota tokens " + f"for output alone. This is a quota management concern, not a billing concern." + ), + }) + + # Cross-region inference check + clean = strip_cross_region_prefix(model_id) + if model_id == clean: + recommendations.append({ + "severity": "MED", + "title": "Consider cross-region inference for higher throughput", + "detail": ( + f"You're using a bare model ID '{model_id}'. Claude models require a cross-region " + f"prefix (e.g., 'us.{clean}'). Use the prefixed model ID for access, higher " + f"throughput, and automatic failover. Run /bedrock-validate-model-access to test." 
+ ), + }) + + # Peak TPM analysis + input_per_period = metrics.get("InputTokenCount", {}).get("Sum_all", []) + output_per_period = metrics.get("OutputTokenCount", {}).get("Sum_all", []) + if input_per_period and output_per_period: + # Calculate tokens per minute from 5-min period sums + period_minutes = 5 if period_hours <= 24 else 60 + tpm_values = [] + for i_dp, o_dp in zip( + metrics["InputTokenCount"]["datapoints"], + metrics["OutputTokenCount"]["datapoints"] + ): + tokens_in_period = i_dp.get("Sum", 0) + o_dp.get("Sum", 0) * burndown_rate + tpm_values.append(tokens_in_period / period_minutes) + + if tpm_values: + analysis["peak_tpm"] = max(tpm_values) + analysis["avg_tpm"] = sum(tpm_values) / len(tpm_values) + + return analysis, recommendations + + +def generate_quota_increase_data(analysis, model_id, region, period_hours): + """Generate the data AWS requires for quota increase requests.""" + data = { + "model_id": model_id, + "region": region, + "observation_period": f"Last {period_hours} hours", + "total_requests": int(analysis.get("total_invocations", 0)), + "avg_input_tokens_per_request": int(analysis.get("avg_input_tokens", 0)), + "avg_output_tokens_per_request": int(analysis.get("avg_output_tokens", 0)), + "steady_state_tpm": int(analysis.get("avg_tpm", 0)), + "peak_tpm": int(analysis.get("peak_tpm", 0)), + } + return data + + +def print_report(model_id, region, period_hours, burndown_rate, quotas, metrics, analysis, recommendations): + print("=== Bedrock Quota Health Check ===") + print(f"Model: {model_id}") + print(f"Region: {region}") + print(f"Period: Last {period_hours} hours") + print(f"Burndown rate: {burndown_rate}x for output tokens") + print() + + # Quota limits + print("--- Quota Limits ---") + if quotas: + for name, info in quotas.items(): + adjustable = " (adjustable)" if info["adjustable"] else "" + print(f" {name}: {info['value']:,.0f}{adjustable}") + else: + print(" [INFO] Could not retrieve quota limits. 
Check service-quotas:ListServiceQuotas permission.") + print(" View quotas: https://console.aws.amazon.com/servicequotas/home/services/bedrock/quotas") + print() + + # Usage metrics + print("--- Current Usage ---") + total_invocations = metrics.get("Invocations", {}).get("Sum", 0) + if total_invocations == 0: + print(" [INFO] No CloudWatch metrics found for this model/region/period.") + print(" This is normal for new accounts or if model invocation logging is not yet active.") + print(" Metrics appear automatically after your first Bedrock API call.") + print() + print("=== Summary ===") + print(" [INFO] No usage data to analyze. Run some Bedrock requests first, then re-run this check.") + return 0 + + print(f" Total requests: {total_invocations:,.0f}") + print(f" Avg input tokens: {analysis.get('avg_input_tokens', 0):,.0f}") + print(f" Avg output tokens: {analysis.get('avg_output_tokens', 0):,.0f}") + if "peak_tpm" in analysis: + print(f" Peak TPM (estimated): {analysis['peak_tpm']:,.0f}") + print(f" Avg TPM: {analysis['avg_tpm']:,.0f}") + print() + + # max_tokens analysis + print("--- max_tokens Analysis ---") + default_mt = analysis.get("default_max_tokens", 0) + rec_mt = analysis.get("recommended_max_tokens", 0) + reserved_default = analysis.get("reserved_with_default", 0) + actual = analysis.get("actual_usage", 0) + waste = analysis.get("waste_ratio", 0) + + if waste > 0.5: + print(f" \033[1;33m[WARN]\033[0m Default max_tokens ({default_mt:,}) reserves {reserved_default:,.0f} quota tokens/request") + print(f" Actual usage is only {actual:,.0f} quota tokens/request ({waste:.0%} wasted)") + print(f" Recommendation: set max_tokens to {rec_mt:,}") + elif waste > 0.2: + print(f" [INFO] max_tokens reservation is {waste:.0%} above actual usage. 
Consider setting to {rec_mt:,}") + else: + print(f" \033[0;32m[PASS]\033[0m max_tokens appears well-sized for your workload") + print() + + # Recommendations + if recommendations: + print("--- Recommendations ---") + for i, rec in enumerate(recommendations, 1): + severity_color = {"HIGH": "\033[0;31m", "MED": "\033[1;33m", "INFO": "\033[0;36m"}.get(rec["severity"], "") + print(f" {i}. {severity_color}[{rec['severity']}]\033[0m {rec['title']}") + # Wrap detail text + detail = rec["detail"] + indent = " " + words = detail.split() + line = indent + for word in words: + if len(line) + len(word) + 1 > 100: + print(line) + line = indent + word + else: + line = line + " " + word if line.strip() else indent + word + if line.strip(): + print(line) + print() + + # Quota increase data + if "peak_tpm" in analysis: + qi_data = generate_quota_increase_data(analysis, model_id, region, period_hours) + print("--- Quota Increase Request Data ---") + print(" Copy-paste this data when requesting a quota increase via AWS Service Quotas console:") + print() + for key, val in qi_data.items(): + label = key.replace("_", " ").title() + print(f" {label}: {val:,}" if isinstance(val, int) else f" {label}: {val}") + print() + print(" Request quotas: https://console.aws.amazon.com/servicequotas/home/services/bedrock/quotas") + print() + + # Summary + print("=== Summary ===") + high_count = sum(1 for r in recommendations if r["severity"] == "HIGH") + med_count = sum(1 for r in recommendations if r["severity"] == "MED") + if high_count > 0: + print(f" \033[1;33m[WARN]\033[0m {high_count} high-priority optimization(s) found. See recommendations above.") + return 1 + elif med_count > 0: + print(f" [INFO] {med_count} suggestion(s) for improvement. 
See recommendations above.") + return 0 + else: + print(" \033[0;32m[PASS]\033[0m Quota utilization looks healthy.") + return 0 + + +def main(): + parser = argparse.ArgumentParser( + description="Check Bedrock quota health and detect max_tokens waste" + ) + parser.add_argument("--model-id", default="us.anthropic.claude-sonnet-4-6", + help="Bedrock model ID (default: us.anthropic.claude-sonnet-4-6)") + parser.add_argument("--region", default="us-east-1", + help="AWS region (default: us-east-1)") + parser.add_argument("--profile", required=True, + help="AWS CLI profile name") + parser.add_argument("--period", type=int, default=24, + help="Analysis period in hours (default: 24)") + args = parser.parse_args() + + session = boto3.Session(profile_name=args.profile, region_name=args.region) + burndown_rate = get_burndown_rate(args.model_id) + + quotas = get_quota_limits(session, args.region, args.model_id) + metrics = get_usage_metrics(session, args.region, args.model_id, args.period) + analysis_result, recommendations = analyze( + args.model_id, metrics, burndown_rate, + get_default_max_tokens(args.model_id), args.period + ) + + exit_code = print_report( + args.model_id, args.region, args.period, + burndown_rate, quotas, metrics, analysis_result, recommendations + ) + sys.exit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/plugins/bedrock/scripts/debug-prompt-cache.py b/plugins/bedrock/scripts/debug-prompt-cache.py new file mode 100644 index 0000000..925f2bb --- /dev/null +++ b/plugins/bedrock/scripts/debug-prompt-cache.py @@ -0,0 +1,433 @@ +#!/usr/bin/env python3 +""" +Bedrock prompt cache diagnostic suite. + +Runs 6 targeted tests to identify exactly why prompt caching is not working +or is underperforming, and provides a break-even cost analysis. 
+ +Usage: + python3 debug-prompt-cache.py [--model-id MODEL_ID] [--region REGION] [--profile PROFILE] [--verbose] +""" + +import argparse +import sys +import time + +try: + import boto3 + from botocore.exceptions import ClientError +except ImportError: + print("[FAIL] boto3 not installed. Run: pip3 install boto3") + sys.exit(1) + +MODEL_THRESHOLDS = { + "anthropic.claude-sonnet-4-6": 2048, + "anthropic.claude-opus-4-6": 4096, + "anthropic.claude-haiku-4-5": 4096, + "anthropic.claude-sonnet-4-5": 1024, + "anthropic.claude-opus-4-5": 4096, + "anthropic.claude-3-7-sonnet": 1024, + "anthropic.claude-3-5-sonnet-v2": 1024, + "anthropic.claude-3-5-haiku": 2048, + "amazon.nova-pro": 1024, + "amazon.nova-lite": 1536, + "amazon.nova-micro": 1536, +} + +HOUR_TTL_MODELS = [ + "anthropic.claude-sonnet-4-6", + "anthropic.claude-sonnet-4-5", + "anthropic.claude-opus-4-6", + "anthropic.claude-opus-4-5", + "anthropic.claude-haiku-4-5", +] + + +def strip_prefix(model_id): + clean = model_id + for prefix in ("us.", "eu.", "ap.", "global."): + clean = clean.removeprefix(prefix) + return clean + + +def get_min_tokens(model_id): + clean = strip_prefix(model_id) + for key, threshold in MODEL_THRESHOLDS.items(): + if clean.startswith(key): + return threshold + return 2048 + + +def supports_hour_ttl(model_id): + clean = strip_prefix(model_id) + return any(clean.startswith(m) for m in HOUR_TTL_MODELS) + + +def is_known_model(model_id): + clean = strip_prefix(model_id) + return any(clean.startswith(key) for key in MODEL_THRESHOLDS) + + +def generate_system_prompt(target_tokens): + """Generate a system prompt of approximately target_tokens length.""" + base = ( + "You are an expert software architect specializing in distributed systems, " + "cloud-native applications, and API design. Your role is to review code and " + "provide detailed, actionable feedback on scalability, maintainability, " + "security, and performance. 
" + ) + target_chars = target_tokens * 8 + repetitions = (target_chars // len(base)) + 1 + topics = [ + "microservices architecture", "event-driven design", "database optimization", + "caching strategies", "load balancing", "circuit breaker patterns", + "API versioning", "observability and monitoring", "security best practices", + "infrastructure as code", "container orchestration", "serverless patterns", + "data pipeline design", "message queue architecture", "rate limiting", + "authentication and authorization", "deployment strategies", "testing patterns", + "error handling", "logging and tracing", + ] + paragraphs = [] + for i in range(repetitions): + topic = topics[i % len(topics)] + paragraphs.append( + f"When reviewing {topic}, consider the following principles: {base}" + f"Apply these principles rigorously to {topic} implementations. " + f"Look for common anti-patterns in {topic} and suggest concrete improvements. " + ) + return "\n\n".join(paragraphs)[:target_chars] + + +def converse_with_cache(client, model_id, system_prompt, cache_point=None, verbose=False): + """Send a Converse request with optional cache point. 
Returns (usage, latency_ms).""" + system_blocks = [{"text": system_prompt}] + if cache_point is not None: + system_blocks.append(cache_point) + + messages = [{"role": "user", "content": [{"text": "Summarize your top 3 principles in one sentence each."}]}] + + start = time.time() + resp = client.converse( + modelId=model_id, + system=system_blocks, + messages=messages, + inferenceConfig={"maxTokens": 50}, + ) + latency = (time.time() - start) * 1000 + usage = resp.get("usage", {}) + + if verbose: + print(f" [DEBUG] Usage: {usage}") + print(f" [DEBUG] Latency: {latency:.0f}ms") + + return usage, latency + + +def test_model_supports_caching(client, model_id, min_tokens, verbose): + """Test 1: Verify model supports prompt caching.""" + print("--- Test 1: Model Caching Support ---") + + if not is_known_model(model_id): + print(f" \033[1;33m[WARN]\033[0m Model '{model_id}' is not in the known caching-supported list") + print(f" Testing via API to check actual behavior...") + + system_prompt = generate_system_prompt(min_tokens) + cache_point = {"cachePoint": {"type": "default"}} + + try: + usage, _ = converse_with_cache(client, model_id, system_prompt, cache_point, verbose) + cache_write = usage.get("cacheWriteInputTokens", 0) + cache_read = usage.get("cacheReadInputTokens", 0) + + if cache_write > 0 or cache_read > 0: + print(f" \033[0;32m[PASS]\033[0m Model supports prompt caching") + return True + else: + print(f" \033[0;31m[FAIL]\033[0m No cache activity detected. 
Model may not support caching.") + print(f" Caching is silently ignored for unsupported models -- no error is raised.") + return False + except ClientError as e: + print(f" \033[0;31m[FAIL]\033[0m API call failed: {e}") + return False + + +def test_token_threshold(client, model_id, min_tokens, verbose): + """Test 2: Verify cache respects minimum token threshold.""" + print(f"--- Test 2: Token Threshold (minimum: {min_tokens:,} tokens) ---") + + cache_point = {"cachePoint": {"type": "default"}} + + # Test below threshold (~50% of minimum) + below_tokens = max(min_tokens // 2, 100) + below_prompt = generate_system_prompt(below_tokens) + try: + usage_below, _ = converse_with_cache(client, model_id, below_prompt, cache_point, verbose) + below_write = usage_below.get("cacheWriteInputTokens", 0) + below_read = usage_below.get("cacheReadInputTokens", 0) + + if below_write == 0 and below_read == 0: + print(f" \033[0;32m[PASS]\033[0m Below-threshold content ({below_tokens} target tokens): correctly not cached") + else: + print(f" [INFO] Below-threshold content showed cache activity (write={below_write}, read={below_read})") + print(f" This may indicate the actual token count exceeded the threshold despite targeting {below_tokens}") + except ClientError as e: + print(f" \033[0;31m[FAIL]\033[0m Below-threshold test failed: {e}") + return False + + # Test above threshold + above_prompt = generate_system_prompt(min_tokens) + try: + usage_above, _ = converse_with_cache(client, model_id, above_prompt, cache_point, verbose) + above_write = usage_above.get("cacheWriteInputTokens", 0) + above_read = usage_above.get("cacheReadInputTokens", 0) + + if above_write > 0 or above_read > 0: + tokens_shown = above_write if above_write > 0 else above_read + activity = "write" if above_write > 0 else "read (already cached)" + print(f" \033[0;32m[PASS]\033[0m Above-threshold content: cache {activity} confirmed ({tokens_shown:,} tokens)") + return True + else: + print(f" \033[0;31m[FAIL]\033[0m 
Above-threshold content was not cached. Token count may still be below minimum.") + print(f" Try increasing your cached content size.") + return False + except ClientError as e: + print(f" \033[0;31m[FAIL]\033[0m Above-threshold test failed: {e}") + return False + + +def test_cache_write_read_cycle(client, model_id, min_tokens, verbose): + """Test 3: Verify cache write then cache read cycle.""" + print("--- Test 3: Cache Write/Read Cycle ---") + + system_prompt = generate_system_prompt(min_tokens) + cache_point = {"cachePoint": {"type": "default"}} + + # Request 1: expect cache write + try: + usage1, latency1 = converse_with_cache(client, model_id, system_prompt, cache_point, verbose) + write1 = usage1.get("cacheWriteInputTokens", 0) + read1 = usage1.get("cacheReadInputTokens", 0) + print(f" Request 1: cache write = {write1:,} tokens, cache read = {read1:,} tokens ({latency1:.0f}ms)") + + if write1 == 0 and read1 == 0: + print(f" \033[0;31m[FAIL]\033[0m No cache activity on request 1") + return False, 0, 0 + except ClientError as e: + print(f" \033[0;31m[FAIL]\033[0m Request 1 failed: {e}") + return False, 0, 0 + + time.sleep(1) + + # Request 2: expect cache read + try: + usage2, latency2 = converse_with_cache(client, model_id, system_prompt, cache_point, verbose) + write2 = usage2.get("cacheWriteInputTokens", 0) + read2 = usage2.get("cacheReadInputTokens", 0) + print(f" Request 2: cache write = {write2:,} tokens, cache read = {read2:,} tokens ({latency2:.0f}ms)") + + if read2 > 0: + improvement = ((latency1 - latency2) / latency1 * 100) if latency1 > 0 else 0 + print(f" \033[0;32m[PASS]\033[0m Cache working: {improvement:.0f}% latency improvement") + return True, latency1, latency2 + else: + print(f" \033[0;31m[FAIL]\033[0m No cache read on request 2. 
Cache may have been evicted.") + return False, latency1, latency2 + except ClientError as e: + print(f" \033[0;31m[FAIL]\033[0m Request 2 failed: {e}") + return False, 0, 0 + + +def test_prefix_sensitivity(client, model_id, min_tokens, verbose): + """Test 4: Demonstrate that modifying cached content causes a cache miss.""" + print("--- Test 4: Prefix Sensitivity ---") + + system_prompt = generate_system_prompt(min_tokens) + cache_point = {"cachePoint": {"type": "default"}} + + # Warm the cache with the original prompt + try: + converse_with_cache(client, model_id, system_prompt, cache_point, verbose) + time.sleep(1) + except ClientError as e: + print(f" \033[0;31m[FAIL]\033[0m Cache warm-up failed: {e}") + return False + + # Send modified prompt (append a word to break exact match) + modified_prompt = system_prompt + " Additionally, consider edge cases." + try: + usage, _ = converse_with_cache(client, model_id, modified_prompt, cache_point, verbose) + write = usage.get("cacheWriteInputTokens", 0) + read = usage.get("cacheReadInputTokens", 0) + + if write > 0 and read == 0: + print(f" \033[0;32m[PASS]\033[0m Modified prefix caused cache miss (new write: {write:,} tokens)") + print(f" [INFO] Cache requires exact byte-for-byte prefix match.") + print(f" [INFO] Even small changes (timestamps, IDs, whitespace) invalidate the cache.") + return True + elif read > 0: + print(f" [INFO] Modified prompt still got a cache read ({read:,} tokens).") + print(f" This may indicate simplified caching matched a partial prefix.") + return True + else: + print(f" [INFO] No cache activity on modified prompt (tokens may be below threshold after modification)") + return True + except ClientError as e: + print(f" \033[0;31m[FAIL]\033[0m Modified prefix test failed: {e}") + return False + + +def test_ttl_behavior(client, model_id, min_tokens, verbose): + """Test 5: Verify cache persists within TTL window.""" + print("--- Test 5: TTL Behavior ---") + + system_prompt = 
generate_system_prompt(min_tokens) + cache_point = {"cachePoint": {"type": "default"}} + + # Write cache + try: + converse_with_cache(client, model_id, system_prompt, cache_point, verbose) + except ClientError as e: + print(f" \033[0;31m[FAIL]\033[0m Cache write failed: {e}") + return False + + # Wait 3 seconds and verify cache still alive + time.sleep(3) + + try: + usage, _ = converse_with_cache(client, model_id, system_prompt, cache_point, verbose) + read = usage.get("cacheReadInputTokens", 0) + + if read > 0: + print(f" \033[0;32m[PASS]\033[0m Cache persists within TTL window (read {read:,} tokens after 3s)") + else: + print(f" \033[1;33m[WARN]\033[0m Cache was not read after 3s. May have been evicted under load.") + + print(f" [INFO] Default TTL: 5 minutes. Cache expires if no requests within this window.") + if supports_hour_ttl(model_id): + print(f" [INFO] 1-hour TTL available for this model. Use: {{\"cachePoint\": {{\"type\": \"default\", \"ttl\": \"1h\"}}}}") + else: + print(f" [INFO] This model only supports the default 5-minute TTL.") + return True + except ClientError as e: + print(f" \033[0;31m[FAIL]\033[0m TTL test failed: {e}") + return False + + +def test_break_even(min_tokens): + """Test 6: Calculate cost break-even for prompt caching.""" + print("--- Test 6: Break-Even Analysis ---") + + # Cache economics: + # - Cache write costs 25% MORE than standard input (1.25x) + # - Cache read costs 90% LESS than standard input (0.10x) + # - Standard input: 1.0x (baseline) + + print(f" Cache write premium: 25% over standard input price") + print(f" Cache read discount: 90% off standard input price") + print() + + # Break-even: 1 write (1.25) + N reads (N * 0.10) < 1 + N * 1.0 (all sending same content) + # 1.25 + 0.1N < 1 + N -> 0.25 < 0.9N -> N > 0.278 -> need >= 1 read after write + # But total cost comparison for N total requests: + # Without caching: N * 1.0 = N + # With caching: 1.25 + (N-1) * 0.10 + # Break-even: 1.25 + (N-1)*0.1 = N -> 1.25 + 0.1N - 
0.1 = N -> 1.15 = 0.9N -> N = 1.28 + # So at 2 requests, caching wins. + + print(f" {'Requests/TTL':>15} | {'Without Cache':>15} | {'With Cache':>15} | {'Savings':>10}") + print(f" {'-'*15}-+-{'-'*15}-+-{'-'*15}-+-{'-'*10}") + + for n in [1, 2, 3, 5, 10, 20]: + without = n * 1.0 + with_cache = 1.25 + max(0, (n - 1)) * 0.10 + savings = (without - with_cache) / without * 100 if without > 0 else 0 + marker = " <-- break-even" if n == 2 else (" <-- COSTS MORE" if n == 1 else "") + print(f" {n:>15} | {without:>14.2f}x | {with_cache:>14.2f}x | {savings:>8.0f}%{marker}") + + print() + print(f" [INFO] You need at least 2 requests within the TTL window to save money.") + print(f" [INFO] At 1 request, caching INCREASES cost by 25% due to the write premium.") + print(f" [INFO] For single-use content (each document analyzed once), do NOT enable caching.") + return True + + +def run_diagnostics(model_id, region, profile, verbose): + print("=== Bedrock Prompt Cache Debugger ===") + print(f"Model: {model_id}") + print(f"Region: {region}") + print() + + session = boto3.Session(profile_name=profile, region_name=region) + client = session.client("bedrock-runtime") + min_tokens = get_min_tokens(model_id) + + tests_passed = 0 + tests_total = 6 + + # Test 1: Model support + if test_model_supports_caching(client, model_id, min_tokens, verbose): + tests_passed += 1 + else: + print() + print("=== Summary ===") + print(f" \033[0;31m[FAIL]\033[0m Model does not support caching. 
Remaining tests skipped.") + print(f" Caching is available for: {', '.join(sorted(MODEL_THRESHOLDS.keys()))}") + return 1 + print() + + # Test 2: Token threshold + if test_token_threshold(client, model_id, min_tokens, verbose): + tests_passed += 1 + print() + + # Test 3: Cache write/read cycle + cycle_passed, lat1, lat2 = test_cache_write_read_cycle(client, model_id, min_tokens, verbose) + if cycle_passed: + tests_passed += 1 + print() + + # Test 4: Prefix sensitivity + if test_prefix_sensitivity(client, model_id, min_tokens, verbose): + tests_passed += 1 + print() + + # Test 5: TTL behavior + if test_ttl_behavior(client, model_id, min_tokens, verbose): + tests_passed += 1 + print() + + # Test 6: Break-even analysis (pure math, always passes) + if test_break_even(min_tokens): + tests_passed += 1 + print() + + # Summary + print("=== Summary ===") + print(f" {tests_passed}/{tests_total} tests passed") + if tests_passed == tests_total: + print(f" \033[0;32m[PASS]\033[0m Prompt caching is healthy") + return 0 + else: + print(f" \033[1;33m[WARN]\033[0m {tests_total - tests_passed} test(s) need attention. 
See details above.") + return 1 + + +def main(): + parser = argparse.ArgumentParser( + description="Diagnose Bedrock prompt caching issues with 6 automated tests" + ) + parser.add_argument("--model-id", default="us.anthropic.claude-sonnet-4-6", + help="Bedrock model ID (default: us.anthropic.claude-sonnet-4-6)") + parser.add_argument("--region", default="us-east-1", + help="AWS region (default: us-east-1)") + parser.add_argument("--profile", required=True, + help="AWS CLI profile name") + parser.add_argument("--verbose", action="store_true", + help="Show detailed API response data") + args = parser.parse_args() + + sys.exit(run_diagnostics(args.model_id, args.region, args.profile, args.verbose)) + + +if __name__ == "__main__": + main() diff --git a/plugins/bedrock/scripts/validate-bedrock-access.sh b/plugins/bedrock/scripts/validate-bedrock-access.sh new file mode 100755 index 0000000..29d23a0 --- /dev/null +++ b/plugins/bedrock/scripts/validate-bedrock-access.sh @@ -0,0 +1,123 @@ +#!/usr/bin/env bash +# Validates AWS credentials, Bedrock access, and model availability. +# Usage: validate-bedrock-access.sh [MODEL_ID] [REGION] + +set -euo pipefail + +MODEL_ID="${1:-us.anthropic.claude-sonnet-4-6}" +REGION="${2:-us-east-1}" +PROFILE="${3:-}" + +# Never read AWS_PROFILE from the environment — it may be set for unrelated purposes. +# Profile must be passed as arg 3 or confirmed via the session lock file. +if [ -z "$PROFILE" ]; then + echo "[bedrock] No AWS profile specified. Run /bedrock:bedrock-setup to configure Bedrock access." 
+ exit 1 +fi + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +pass() { echo -e "${GREEN}[PASS]${NC} $1"; } +fail() { echo -e "${RED}[FAIL]${NC} $1"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } + +echo "=== Bedrock Access Validation ===" +echo "Model: $MODEL_ID" +echo "Region: $REGION" +echo "Profile: $PROFILE" +echo "" + +ERRORS=0 + +# Step 1: AWS credentials +echo "--- Step 1: AWS Credentials ---" +if IDENTITY=$(aws sts get-caller-identity --profile "$PROFILE" --output json 2>&1); then + ACCOUNT=$(echo "$IDENTITY" | python3 -c "import sys,json; print(json.load(sys.stdin)['Account'])" 2>/dev/null || echo "unknown") + ARN=$(echo "$IDENTITY" | python3 -c "import sys,json; print(json.load(sys.stdin)['Arn'])" 2>/dev/null || echo "unknown") + pass "AWS credentials valid (Account: $ACCOUNT)" + pass "Identity: $ARN" +else + fail "AWS credentials not configured or invalid" + echo " Run: aws configure --profile $PROFILE" + exit 1 +fi +echo "" + +# Step 2: Bedrock service access +echo "--- Step 2: Bedrock Service Access ---" +if aws bedrock list-foundation-models --profile "$PROFILE" --region "$REGION" --output json >/dev/null 2>&1; then + pass "Bedrock service accessible in $REGION" +else + fail "Cannot access Bedrock service in $REGION" + echo " Check IAM policy includes bedrock:ListFoundationModels" + ERRORS=$((ERRORS + 1)) +fi +echo "" + +# Step 3: Model availability +echo "--- Step 3: Model Availability ---" +# Strip cross-region prefix for foundation model lookup +BASE_MODEL_ID="${MODEL_ID#us.}" +BASE_MODEL_ID="${BASE_MODEL_ID#eu.}" +BASE_MODEL_ID="${BASE_MODEL_ID#ap.}" +BASE_MODEL_ID="${BASE_MODEL_ID#global.}" + +if MODEL_INFO=$(aws bedrock get-foundation-model --model-identifier "$BASE_MODEL_ID" --profile "$PROFILE" --region "$REGION" --output json 2>&1); then + MODEL_NAME=$(echo "$MODEL_INFO" | python3 -c "import sys,json; print(json.load(sys.stdin)['modelDetails']['modelName'])" 2>/dev/null || echo "$BASE_MODEL_ID") + pass "Model 
found: $MODEL_NAME ($BASE_MODEL_ID)" +else + fail "Model $BASE_MODEL_ID not found in $REGION" + echo " Check model ID and region. List available models:" + echo " aws bedrock list-foundation-models --profile $PROFILE --region $REGION" + ERRORS=$((ERRORS + 1)) +fi +echo "" + +# Step 4: InvokeModel permission test (dry run via Converse with minimal input) +echo "--- Step 4: Invoke Permission Check ---" +CONVERSE_BODY=$(python3 -c " +import json +print(json.dumps({ + 'modelId': '$MODEL_ID', + 'messages': [{'role': 'user', 'content': [{'text': 'Say hello'}]}], + 'inferenceConfig': {'maxTokens': 1} +})) +") + +if CONVERSE_RESULT=$(aws bedrock-runtime converse \ + --model-id "$MODEL_ID" \ + --messages '[{"role":"user","content":[{"text":"Say hello"}]}]' \ + --inference-config '{"maxTokens":1}' \ + --profile "$PROFILE" \ + --region "$REGION" \ + --output json 2>&1); then + pass "InvokeModel/Converse permission confirmed" +else + if echo "$CONVERSE_RESULT" | grep -q "AccessDeniedException"; then + fail "Missing bedrock:Converse permission" + echo " Add bedrock:Converse and bedrock:InvokeModel to IAM policy" + ERRORS=$((ERRORS + 1)) + elif echo "$CONVERSE_RESULT" | grep -q "ResourceNotFoundException"; then + fail "Model not enabled for inference" + echo " For Claude: submit the one-time use case form in Bedrock console: https://console.aws.amazon.com/bedrock/" + echo " Ensure you're using the cross-region model ID (e.g., us.anthropic.claude-sonnet-4-6)" + ERRORS=$((ERRORS + 1)) + else + fail "Converse call failed: $CONVERSE_RESULT" + ERRORS=$((ERRORS + 1)) + fi +fi +echo "" + +# Summary +echo "=== Summary ===" +if [ "$ERRORS" -eq 0 ]; then + pass "All checks passed. Bedrock is ready to use." +else + fail "$ERRORS check(s) failed. See details above." 
+fi + +exit "$ERRORS" diff --git a/plugins/bedrock/scripts/validate-prompt-caching.py b/plugins/bedrock/scripts/validate-prompt-caching.py new file mode 100755 index 0000000..48fa11e --- /dev/null +++ b/plugins/bedrock/scripts/validate-prompt-caching.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +""" +End-to-end validation of Bedrock prompt caching. + +Sends two Converse API requests with identical cached content. +Verifies cache write on first request and cache read on second. + +Usage: + python3 validate-prompt-caching.py [--model-id MODEL_ID] [--region REGION] [--profile PROFILE] [--ttl TTL] +""" + +import argparse +import sys +import time + +try: + import boto3 +except ImportError: + print("[FAIL] boto3 not installed. Run: pip3 install boto3") + sys.exit(1) + +MODEL_THRESHOLDS = { + "anthropic.claude-sonnet-4-6": 2048, + "anthropic.claude-opus-4-6": 4096, + "anthropic.claude-haiku-4-5": 4096, + "anthropic.claude-sonnet-4-5": 1024, + "anthropic.claude-opus-4-5": 4096, + "anthropic.claude-3-7-sonnet": 1024, + "anthropic.claude-3-5-sonnet-v2": 1024, + "anthropic.claude-3-5-haiku": 2048, + "amazon.nova-pro": 1024, + "amazon.nova-lite": 1536, + "amazon.nova-micro": 1536, +} + +HOUR_TTL_MODELS = [ + "anthropic.claude-sonnet-4-6", + "anthropic.claude-sonnet-4-5", + "anthropic.claude-opus-4-6", + "anthropic.claude-opus-4-5", + "anthropic.claude-haiku-4-5", +] + + +def get_min_tokens(model_id): + """Look up minimum cache tokens for a model ID (handles cross-region prefixes).""" + clean = model_id + for prefix in ("us.", "eu.", "ap.", "global."): + clean = clean.removeprefix(prefix) + for key, threshold in MODEL_THRESHOLDS.items(): + if clean.startswith(key): + return threshold + return 2048 + + +def supports_hour_ttl(model_id): + clean = model_id + for prefix in ("us.", "eu.", "ap.", "global."): + clean = clean.removeprefix(prefix) + return any(clean.startswith(m) for m in HOUR_TTL_MODELS) + + +def generate_system_prompt(min_tokens): + """Generate a system prompt that 
exceeds the minimum token threshold.
+    Roughly 1 token ~ 4 chars, so we generate 8x chars to be safe."""
+    base = (
+        "You are an expert software architect specializing in distributed systems, "
+        "cloud-native applications, and API design. Your role is to review code and "
+        "provide detailed, actionable feedback on scalability, maintainability, "
+        "security, and performance. "
+    )
+    # ~3-4 chars per token on average; use 8x multiplier to safely exceed the threshold
+    target_chars = min_tokens * 8
+    repetitions = (target_chars // len(base)) + 1
+    paragraphs = []
+    topics = [
+        "microservices architecture", "event-driven design", "database optimization",
+        "caching strategies", "load balancing", "circuit breaker patterns",
+        "API versioning", "observability and monitoring", "security best practices",
+        "infrastructure as code", "container orchestration", "serverless patterns",
+        "data pipeline design", "message queue architecture", "rate limiting",
+        "authentication and authorization", "deployment strategies", "testing patterns",
+        "error handling", "logging and tracing",
+    ]
+    for i in range(repetitions):
+        topic = topics[i % len(topics)]
+        paragraphs.append(
+            f"When reviewing {topic}, consider the following principles: {base}"
+            f"Apply these principles rigorously to {topic} implementations. "
+            f"Look for common anti-patterns in {topic} and suggest concrete improvements. "
+        )
+    return "\n\n".join(paragraphs)[:target_chars]
+
+
+def run_validation(model_id, region, profile, ttl):
+    print("=== Bedrock Prompt Caching Validation ===")
+    print(f"Model: {model_id}")
+    print(f"Region: {region}")
+    print(f"Profile: {profile}")
+    print(f"TTL: {ttl}")
+    print()
+
+    min_tokens = get_min_tokens(model_id)
+    print(f"Minimum cache tokens for this model: {min_tokens}")
+
+    if ttl == "1h" and not supports_hour_ttl(model_id):
+        print(f"[WARN] Model {model_id} does not support 1-hour TTL. 
Falling back to default (5min).") + ttl = None + + session = boto3.Session(profile_name=profile, region_name=region) + client = session.client("bedrock-runtime") + + system_prompt = generate_system_prompt(min_tokens) + approx_tokens = len(system_prompt) // 4 + print(f"Generated system prompt: ~{approx_tokens} tokens ({len(system_prompt)} chars)") + print() + + system_blocks = [{"text": system_prompt}] + cache_point = {"cachePoint": {"type": "default"}} + if ttl: + cache_point["cachePoint"]["ttl"] = ttl + system_blocks.append(cache_point) + + messages = [ + {"role": "user", "content": [{"text": "What are the top 3 principles you follow?"}]} + ] + + inference_config = {"maxTokens": 50} + + # Request 1: should write to cache + print("--- Request 1 (expect cache write) ---") + try: + start = time.time() + resp1 = client.converse( + modelId=model_id, + system=system_blocks, + messages=messages, + inferenceConfig=inference_config, + ) + latency1 = (time.time() - start) * 1000 + + usage1 = resp1.get("usage", {}) + cache_write = usage1.get("cacheWriteInputTokens", 0) + cache_read = usage1.get("cacheReadInputTokens", 0) + input_tokens = usage1.get("inputTokens", 0) + + print(f" Latency: {latency1:.0f}ms") + print(f" Input tokens: {input_tokens}") + print(f" Cache write: {cache_write}") + print(f" Cache read: {cache_read}") + + if cache_write > 0: + print(f" [PASS] Cache write confirmed ({cache_write} tokens written)") + elif cache_read > 0: + print(f" [PASS] Cache already populated from prior run ({cache_read} tokens read)") + else: + print(f" [WARN] No cache activity detected. 
Content may be below {min_tokens} token threshold.") + except Exception as e: + print(f" [FAIL] Request 1 failed: {e}") + return 1 + + print() + + # Brief pause to ensure cache is available + time.sleep(1) + + # Request 2: should read from cache + print("--- Request 2 (expect cache read) ---") + try: + start = time.time() + resp2 = client.converse( + modelId=model_id, + system=system_blocks, + messages=messages, + inferenceConfig=inference_config, + ) + latency2 = (time.time() - start) * 1000 + + usage2 = resp2.get("usage", {}) + cache_write2 = usage2.get("cacheWriteInputTokens", 0) + cache_read2 = usage2.get("cacheReadInputTokens", 0) + input_tokens2 = usage2.get("inputTokens", 0) + + print(f" Latency: {latency2:.0f}ms") + print(f" Input tokens: {input_tokens2}") + print(f" Cache write: {cache_write2}") + print(f" Cache read: {cache_read2}") + + if cache_read2 > 0: + print(f" [PASS] Cache read confirmed ({cache_read2} tokens from cache)") + else: + print(f" [FAIL] No cache read detected. 
Cache may not be working.") + return 1 + except Exception as e: + print(f" [FAIL] Request 2 failed: {e}") + return 1 + + print() + + # Summary + print("=== Results ===") + latency_improvement = ((latency1 - latency2) / latency1) * 100 if latency1 > 0 else 0 + print(f" Latency improvement: {latency_improvement:.1f}% ({latency1:.0f}ms -> {latency2:.0f}ms)") + print(f" Cache tokens: {cache_read2} tokens served from cache") + print(f" [PASS] Prompt caching is working correctly.") + return 0 + + +def main(): + parser = argparse.ArgumentParser(description="Validate Bedrock prompt caching") + parser.add_argument("--model-id", default="us.anthropic.claude-sonnet-4-6") + parser.add_argument("--region", default="us-east-1") + parser.add_argument("--profile", required=True, help="AWS CLI profile name") + parser.add_argument("--ttl", default=None, help="Cache TTL: '1h' or omit for default 5min") + args = parser.parse_args() + + sys.exit(run_validation(args.model_id, args.region, args.profile, args.ttl)) + + +if __name__ == "__main__": + main() diff --git a/plugins/bedrock/skills/bedrock/SKILL.md b/plugins/bedrock/skills/bedrock/SKILL.md new file mode 100644 index 0000000..09309cb --- /dev/null +++ b/plugins/bedrock/skills/bedrock/SKILL.md @@ -0,0 +1,185 @@ +--- +name: bedrock +description: "Amazon Bedrock setup and operations: onboarding, IAM setup, model access, prompt caching, observability, quota optimization, cost analysis, and cross-region inference. Triggers on phrases like: set up bedrock, configure bedrock, bedrock onboarding, prompt caching, bedrock IAM, enable model access, cache management, bedrock observability, bedrock costs, bedrock quota, cross-region inference." +argument-hint: "[what do you need help with?]" +--- + +# Amazon Bedrock + +Guide developers through Amazon Bedrock — from initial setup to ongoing operations. 
Covers IAM permissions, model access, prompt caching (simplified or explicit), CloudWatch observability, quota optimization, cost analysis, and cross-region inference. + +## Knowledge Sources + +Use these sources to answer Bedrock questions. Choose the source that best matches the query — you may consult multiple sources or skip sources that aren't relevant. + +- **AWS Docs MCP Server** (`mcp__plugin_bedrock_aws-documentation__search_documentation`, `mcp__plugin_bedrock_aws-documentation__read_documentation`) — Official AWS documentation. Best for: API reference, service limits, pricing, IAM permissions, error codes, feature availability. +- **aws-samples prompt caching repo** (https://github.com/aws-samples/amazon-bedrock-samples/tree/main/introduction-to-bedrock/prompt-caching) — Working code samples for Converse API and InvokeModel API prompt caching. Best for: caching implementation, code examples, cache configuration patterns. +- **Bedrock Central** (https://aws-samples.github.io/sample-amazon-bedrock-central/) — Curated getting-started guides, model discovery, workshops, and sample applications. Best for: onboarding, model comparison, architecture patterns, workshop walkthroughs. +- **Plugin reference docs** (`${CLAUDE_PLUGIN_ROOT}/skills/bedrock/references/`) — Operational runbooks for the plugin's own scripts. Best for: step-by-step guidance on IAM setup, quota optimization, observability, cost analysis, and cross-region inference. +- **Internet search** — Last resort when other sources don't cover the topic. Always state that the answer came from an internet search. + +Always cite the source that provided your answer. 
+
+**Key capabilities:**
+
+- **IAM Setup**: Generate least-privilege IAM policies scoped to specific Bedrock models and actions
+- **Model Access**: Enable foundation model access, understand region availability, and select the right model
+- **Prompt Caching**: Configure simplified (Claude-only) or explicit cache management with the Converse API
+- **Cache Debugging**: Diagnose prompt caching failures with automated 6-test diagnostic suite
+- **Validation**: End-to-end verification that IAM, model access, and prompt caching work correctly
+- **Observability**: CloudWatch metrics for cache hit rates, token usage, and latency monitoring
+- **Quota Optimization**: Detect max_tokens waste, analyze quota utilization, generate quota increase request data
+- **Usage Analysis**: CloudWatch token consumption metrics, invocation counts, and caching efficiency
+- **Cost Analysis**: Actual billed amounts from AWS Cost Explorer — no hardcoded pricing
+- **Cross-Region Inference**: Claude models are only available through cross-region inference — guidance on IAM, SCPs, and troubleshooting
+
+## Onboarding Workflow
+
+When a user asks to set up Bedrock, follow these steps in order:
+
+### Step 1: AWS CLI Profile Selection
+
+Before any AWS command, ask the developer which AWS CLI profile to use. List available profiles by running `aws configure list-profiles`. **Wait for confirmation** — do not auto-select. Then verify with `aws sts get-caller-identity --profile <profile-name>` and show the account ID and ARN.
+
+If credentials are missing or the developer needs to create a new profile, load [references/profile-setup.md](references/profile-setup.md) for step-by-step guidance.
+
+### Step 2: IAM Permissions
+
+Load [references/iam-permissions.md](references/iam-permissions.md) and help the user create or verify their IAM policy.
+ +Run the validation script to check permissions: + +```bash +${CLAUDE_PLUGIN_ROOT}/scripts/validate-bedrock-access.sh +``` + +### Step 3: Model Access + +Load [references/model-access.md](references/model-access.md) and help the user enable their chosen model. + +Key points: + +- Since October 2025, all Bedrock serverless models are **auto-enabled** — no manual activation needed +- **Anthropic models** (Claude) require a **one-time use case form** (First Time Use) before first invocation — submit via the Bedrock console playground or `PutUseCaseForModelAccess` API +- **All other models** (Amazon Nova, Meta Llama, Mistral, etc.) work immediately with the correct IAM permissions +- Default recommendation: Claude Sonnet 4.6 (`us.anthropic.claude-sonnet-4-6`) + +### Step 4: Prompt Caching + +Ask the user: **simplified or explicit cache management?** + +- **Simplified** (Claude models only): A single `cachePoint` marker; Bedrock automatically checks ~20 preceding blocks for cache hits. Easiest to implement. +- **Explicit**: Manual placement of multiple cache checkpoints with granular TTL control. Works with all supported models. + +Load [references/prompt-caching.md](references/prompt-caching.md) for implementation details. + +### Step 5: Validation + +Run the end-to-end validation script: + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/validate-prompt-caching.py --model-id us.anthropic.claude-sonnet-4-6 +``` + +This sends two Converse API requests with identical cached content and verifies: + +1. First request: `cacheWriteInputTokens > 0` (cache was written) +2. Second request: `cacheReadInputTokens > 0` (cache was hit) +3. Latency improvement on the second request + +### Step 6: Observability + +Let the developer know Bedrock publishes metrics to CloudWatch automatically. They can run `/bedrock-usage` to pull token consumption and caching metrics. For metric details, see [references/observability.md](references/observability.md). 
+ +## When to Load Reference Files + +- **IAM**, **permissions**, **policy**, or **access denied errors** -> see [references/iam-permissions.md](references/iam-permissions.md) +- **Model selection**, **enable model**, **region**, **marketplace**, or **model access** -> see [references/model-access.md](references/model-access.md) +- **Prompt caching**, **cache point**, **TTL**, **simplified caching**, **explicit caching**, or **cache management** -> see [references/prompt-caching.md](references/prompt-caching.md) +- **Cache debug**, **cache not working**, **cache miss**, **no cache tokens**, or **cacheWriteInputTokens is zero** -> run `/bedrock-cache-debug` +- **CloudWatch**, **metrics**, **monitoring**, **dashboard**, **observability**, or **cache hit rate** -> see [references/observability.md](references/observability.md) +- **Quota**, **throttling**, **max_tokens**, **429 error**, **ThrottlingException**, **rate limit**, or **token limit** -> see [references/quota-optimization.md](references/quota-optimization.md) +- **Token usage**, **how many tokens**, **invocations**, or **CloudWatch metrics** -> run `/bedrock-usage` +- **Cost**, **spending**, **billing**, or **how much am I paying** -> run `/bedrock-costs` +- **Pricing**, **how much does a model cost**, or **per-token pricing** -> search AWS Docs MCP for "Amazon Bedrock pricing"; link to https://aws.amazon.com/bedrock/pricing/ +- **Cost optimization**, **savings**, **ROI**, or **cheaper model** -> see [references/cost-optimization.md](references/cost-optimization.md) +- **Cross-region**, **inference profile**, **SCP**, **multi-region**, or **higher throughput** -> see [references/cost-optimization.md](references/cost-optimization.md) +- **Code samples**, **how to call Bedrock**, or **example code** -> fetch Bedrock Central; reference https://github.com/aws-samples/amazon-bedrock-samples +- **IAM action names**, **ARN format**, **cross-region prefix**, **CloudWatch metric names**, or **stable Bedrock facts** 
-> see [references/bedrock-quick-reference.md](references/bedrock-quick-reference.md) + +## Best Practices + +### Security + +- Do: Use `AmazonBedrockLimitedAccess` managed policy for Bedrock permissions +- Do: Use named AWS CLI profiles with `--profile` rather than hardcoded credentials or environment variables +- Do: Scope Bedrock permissions to specific regions where you operate +- Don't: Write custom IAM policies when managed policies cover the use case +- Don't: Store AWS credentials in code or environment variables in production + +### Prompt Caching + +- Do: Use simplified caching if you only use Claude models — it requires less code and handles checkpoint placement automatically +- Do: Ensure your cached content exceeds the model's minimum token threshold (1,024–4,096 tokens for Claude models; 1,024–1,536 for Nova) +- Do: Place static content (system prompts, large documents, few-shot examples) before the cache point +- Do: Monitor `CacheReadInputTokenCount` in CloudWatch to verify cache hits +- Don't: Cache content that changes frequently — the cache key is based on exact content match +- Don't: Mix TTL durations out of order — longer TTLs must precede shorter TTLs in the message sequence + +### Cost Optimization + +- Do: Set `max_tokens` to ~1.5x your p90 actual output to avoid quota waste (the #1 optimization for throughput) +- Do: Start with prompt caching on your most repeated prompts first — it reduces costs by up to 90% and latency by up to 85% +- Do: Use the 1-hour TTL for system prompts that rarely change (supported on Claude Sonnet 4.6, Opus 4.6, Sonnet 4.5, Opus 4.5, Haiku 4.5) +- Do: Use cross-region inference (e.g., `us.anthropic.claude-sonnet-4-6`) for higher throughput at no additional routing cost +- Do: Monitor CloudWatch metrics to identify low cache hit rates and adjust accordingly +- Do: Run `/bedrock-quota` periodically to check for quota waste and throttling risks +- Don't: Cache very short content — there's a per-model minimum token 
threshold +- Don't: Cache single-use content — the 25% write premium increases cost when there are no cache reads +- Don't: Leave `max_tokens` at the default for high-concurrency workloads — it reserves up to 320K quota tokens per request + +## Supported Models for Prompt Caching + +For current supported models, minimum cache token thresholds, and TTL options, search the AWS Docs MCP Server for "Bedrock prompt caching supported models" or see the [prompt caching documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html). + +General patterns (stable): + +- Claude models: 1,024–4,096 minimum cache tokens depending on model; most support 5-minute TTL, newer models (Sonnet 4.6, Opus 4.6, Sonnet 4.5, Opus 4.5, Haiku 4.5) also support 1-hour TTL +- Amazon Nova models: 1,024–1,536 minimum cache tokens depending on model; 5-minute TTL + +For cross-region inference, add the geo prefix to base model IDs (e.g., `us.` for US regions). + +## Configuration + +### AWS CLI Setup + +This plugin requires AWS credentials configured on the host machine: + +**Verify access**: Run `aws sts get-caller-identity --profile PROFILE` to confirm credentials are valid. + +### Python Setup + +Validation scripts require Python 3.10+ with boto3: + +**Verify**: Run `python3 -c "import boto3; print(boto3.__version__)"` + +If missing: `pip3 install boto3` + +## Troubleshooting Quick Reference + +| Error | Cause | Solution | +| ------------------------------------------------- | --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `AccessDeniedException` | Missing IAM permissions | Add `bedrock:InvokeModel` and `bedrock:Converse` to the IAM policy. 
See [references/iam-permissions.md](references/iam-permissions.md) | +| `ResourceNotFoundException` | Model not enabled in region | For Claude models, submit the one-time use case form. Check region availability and ensure you're using the cross-region model ID (e.g., `us.` prefix) | +| `ValidationException: cache tokens below minimum` | Cached content too short | Increase cached content to exceed the model's minimum token threshold | +| `ThrottlingException` | Rate limit exceeded | Run `/bedrock-quota` to diagnose; likely `max_tokens` is too high or you need cross-region inference. See [references/quota-optimization.md](references/quota-optimization.md) | +| No `cacheReadInputTokens` in response | Cache miss | Verify the cached content is identical between requests and within the TTL window | + +## Resources + +- [Bedrock Central](https://aws-samples.github.io/sample-amazon-bedrock-central/) — Getting started, model discovery, code samples, workshops +- [Amazon Bedrock Documentation](https://docs.aws.amazon.com/bedrock/) — Official AWS docs (also available via AWS Docs MCP server) +- [Prompt Caching Documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html) +- [Prompt Caching Code Samples](https://github.com/aws-samples/amazon-bedrock-samples/tree/main/introduction-to-bedrock/prompt-caching) +- [Bedrock Pricing](https://aws.amazon.com/bedrock/pricing/) +- [Bedrock Service Quotas](https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html) diff --git a/plugins/bedrock/skills/bedrock/references/bedrock-quick-reference.md b/plugins/bedrock/skills/bedrock/references/bedrock-quick-reference.md new file mode 100644 index 0000000..5efc78d --- /dev/null +++ b/plugins/bedrock/skills/bedrock/references/bedrock-quick-reference.md @@ -0,0 +1,80 @@ +# Bedrock Quick Reference + +Stable facts for frequent lookups. 
These rarely change but ALWAYS verify against the AWS Docs MCP Server if the information seems outdated or if more than 6 months have passed since the "Last verified" date. + +Last verified: 2026-04-08 + +## IAM Action Names for Bedrock + +| Action | Used For | +| --------------------------------------- | ------------------------------ | +| `bedrock:InvokeModel` | Single inference request (also authorizes the Converse API) | +| `bedrock:InvokeModelWithResponseStream` | Streaming inference (also authorizes the ConverseStream API) | +| `bedrock:ListFoundationModels` | Model discovery | +| `bedrock:GetFoundationModel` | Model details | +| `bedrock:ListInferenceProfiles` | Cross-region profile discovery | +| `bedrock:ListModelAccessStatus` | Check model enablement status | + +There is no separate `bedrock:Converse` or `bedrock:ConverseStream` IAM action — the Converse and ConverseStream APIs are authorized by `bedrock:InvokeModel` and `bedrock:InvokeModelWithResponseStream` respectively. + +These are API action names tied to the Bedrock API surface. New actions are added when AWS ships new API operations, but existing actions are not renamed or removed. + +## ARN Format Patterns + +| Resource | Pattern | +| ------------------ | ---------------------------------------------------------------- | +| Foundation model | `arn:aws:bedrock:REGION::foundation-model/MODEL_ID` | +| Cross-region model | `arn:aws:bedrock:REGION::foundation-model/PREFIX.MODEL_ID` | +| Custom model | `arn:aws:bedrock:REGION:ACCOUNT_ID:custom-model/MODEL_NAME` | +| Inference profile | `arn:aws:bedrock:REGION:ACCOUNT_ID:inference-profile/PROFILE_ID` | + +## Cross-Region Prefix Conventions + +| Prefix | Region Group | Regions | +| --------- | ------------ | ---------------------------------------------- | +| `us.` | US | us-east-1, us-east-2, us-west-2 | +| `eu.` | Europe | eu-central-1, eu-west-1, eu-west-3 | +| `ap.` | Asia Pacific | ap-northeast-1, ap-southeast-1, ap-southeast-2 | +| `global.` | Global | All Bedrock regions | + +Example: `us.anthropic.claude-sonnet-4-6` routes to any US region. 
+ +## CloudWatch Metrics + +Namespace: `AWS/Bedrock` + +| Metric | Description | +| --------------------------- | ------------------------------------ | +| `Invocations` | Number of InvokeModel/Converse calls | +| `InputTokenCount` | Input tokens per request | +| `OutputTokenCount` | Output tokens per request | +| `CacheReadInputTokenCount` | Tokens read from cache | +| `CacheWriteInputTokenCount` | Tokens written to cache | +| `InvocationLatency` | End-to-end latency in ms | +| `InvocationClientErrors` | 4xx errors | +| `InvocationServerErrors` | 5xx errors | +| `InvocationThrottles` | Throttled requests | + +## Default Recommendations + +- **Region**: `us-east-1` (broadest model availability and earliest feature launches) +- **Caching approach**: Simplified for Claude-only workloads; explicit for multi-model +- **Inference routing**: Cross-region (`us.` prefix) for startups — better availability, no routing cost + +## What Is NOT in This File + +The following change frequently. Always query the AWS Docs MCP Server for current values: + +- Model IDs, model capabilities, minimum cache tokens, supported TTL durations +- Per-token pricing +- Service quota default values +- Feature availability dates and region rollout status +- Specific model version strings + +## Verification Instructions + +If any fact above seems wrong or a user reports an issue: + +1. Search the AWS Docs MCP Server for the specific topic +2. Update this file with corrected information and a new "Last verified" date +3. 
Cite the AWS documentation URL that confirmed the correction diff --git a/plugins/bedrock/skills/bedrock/references/cost-optimization.md b/plugins/bedrock/skills/bedrock/references/cost-optimization.md new file mode 100644 index 0000000..5847aeb --- /dev/null +++ b/plugins/bedrock/skills/bedrock/references/cost-optimization.md @@ -0,0 +1,155 @@ +# Cost Optimization and Cross-Region Inference + +## Bedrock Pricing + +For current per-token pricing, consult the official sources: + +- **AWS Bedrock Pricing Page**: https://aws.amazon.com/bedrock/pricing/ +- **AWS Docs MCP Server**: Search for "Amazon Bedrock pricing" using `mcp__aws-documentation__search_documentation` + +Bedrock charges per token processed. Input tokens are cheaper than output tokens (typically 3-5x). Prompt caching adds a 25% write premium but offers 90% savings on cache reads. + +### Cost Levers (Ranked by Impact) + +1. **Model selection**: Haiku 4.5 is ~4x cheaper than Sonnet for input, ~4x cheaper for output. Use Haiku for classification, routing, and short-response tasks. +2. **Prompt caching**: Up to 90% savings on cached input tokens. See [prompt-caching.md](prompt-caching.md). +3. **max_tokens optimization**: Doesn't affect billing directly but prevents throttling that blocks revenue-generating requests. See [quota-optimization.md](quota-optimization.md). +4. **Output length control**: Output tokens cost 5x more than input tokens for Claude models. Instruct the model to be concise when full responses aren't needed. + +## Cross-Region Inference + +Cross-region inference distributes requests across multiple AWS regions for higher throughput and availability. It uses inference profile IDs with a region prefix. + +### Inference Profile IDs + +| Type | Prefix | Description | +| --------------- | --------- | ----------------------------------------------------------------------- | +| Geographic (US) | `us.` | Routes within US regions (us-east-1, us-east-2, us-west-2, etc.) 
| +| Geographic (EU) | `eu.` | Routes within EU regions (eu-central-1, eu-west-1, eu-west-3, etc.) | +| Geographic (AP) | `ap.` | Routes within APAC regions | +| Global | `global.` | Routes to any supported commercial region worldwide (~10% cost savings) | + +Claude models require a cross-region prefix. Use the prefix that matches your target geography: + +```python +# Cross-region (US group): +model_id = "us.anthropic.claude-sonnet-4-6" + +# Cross-region (EU group): +model_id = "eu.anthropic.claude-sonnet-4-6" + +# Global (any supported region, ~10% cost savings): +model_id = "global.anthropic.claude-sonnet-4-6" +``` + +### Benefits + +- **Higher throughput**: Distributes requests across multiple regions for better capacity +- **Higher availability**: Automatic failover if one region is under load +- **No additional routing cost**: Price is calculated based on the source region. Global inference offers ~10% savings. +- **No data storage**: Inference data is processed in transit but not stored in destination regions. All data stays on the AWS network. +- **Prompt caching works**: Cache is maintained per-region but cross-region inference is compatible + +### Usage + +Claude models are only available through cross-region inference. Just use the prefixed model ID — no setup required: + +```python +# Use the prefixed model ID directly +model_id = "us.anthropic.claude-sonnet-4-6" +``` + +To validate access, run `/bedrock-validate-model-access`. + +### SCP Gotcha (The #1 Failure Cause) + +Cross-region inference requires Bedrock permissions in **ALL regions** in the inference profile group. If a Service Control Policy (SCP) blocks Bedrock in even ONE target region, the entire cross-region feature fails silently with `AccessDeniedException`. + +Example: Using `us.anthropic.claude-sonnet-4-6` in `us-east-1`. The model is available in us-east-1, us-east-2, and us-west-2. 
If your SCP blocks `us-east-2` for all services, cross-region inference fails — even though us-east-1 and us-west-2 are allowed. + +**Fix**: If your SCP restricts regions via a Deny statement, add a condition that exempts Bedrock actions from the region restriction: + +```json +{ + "Sid": "DenyNonApprovedRegions", + "Effect": "Deny", + "NotAction": [ + "bedrock:InvokeModel", + "bedrock:InvokeModelWithResponseStream" + ], + "Resource": "*", + "Condition": { + "StringNotEquals": { + "aws:RequestedRegion": ["us-east-1"] + } + } +} +``` + +This denies all non-Bedrock actions outside `us-east-1` while allowing Bedrock inference to reach cross-region destinations (us-east-2, us-west-2, etc.). Alternatively, add all destination regions to your existing SCP's allowed region list. + +### IAM Requirements + +Cross-region inference requires a two-statement IAM policy: one granting access to the inference profile, and one granting access to the foundation model in all destination regions (with an `InferenceProfileArn` condition for least-privilege): + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "AllowCrossRegionInferenceProfile", + "Effect": "Allow", + "Action": ["bedrock:InvokeModel", "bedrock:InvokeModelWithResponseStream"], + "Resource": [ + "arn:aws:bedrock:us-east-1:ACCOUNT_ID:inference-profile/us.anthropic.claude-sonnet-4-6" + ] + }, + { + "Sid": "AllowFoundationModelInDestinationRegions", + "Effect": "Allow", + "Action": ["bedrock:InvokeModel", "bedrock:InvokeModelWithResponseStream"], + "Resource": [ + "arn:aws:bedrock:us-east-1::foundation-model/anthropic.claude-sonnet-4-6", + "arn:aws:bedrock:us-east-2::foundation-model/anthropic.claude-sonnet-4-6", + "arn:aws:bedrock:us-west-2::foundation-model/anthropic.claude-sonnet-4-6" + ], + "Condition": { + "StringEquals": { + "bedrock:InferenceProfileArn": "arn:aws:bedrock:us-east-1:ACCOUNT_ID:inference-profile/us.anthropic.claude-sonnet-4-6" + } + } + } + ] +} +``` + +Replace `ACCOUNT_ID` with your AWS account ID. Note there is no `bedrock:Converse` IAM action — the Converse API is authorized by `bedrock:InvokeModel`. 
+ +## When Provisioned Throughput Makes Sense + +On-demand inference (default) is billed per token with shared quota. Provisioned throughput reserves dedicated capacity at a fixed hourly rate. + +Consider provisioned throughput when: + +- You consistently hit on-demand quota limits even after optimization +- You need guaranteed latency SLAs +- Your workload exceeds 1M+ tokens per minute sustained +- You've already optimized max_tokens and enabled cross-region inference + +Provisioned throughput is significantly more expensive for low/variable workloads. Start with on-demand and optimize before considering provisioned. + +## Tracking Costs and Usage + +Use the plugin commands to pull metrics directly: + +```bash +# Token usage from CloudWatch +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/analyze-bedrock-usage.py --all-models --period 7 + +# Actual costs from Cost Explorer +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/analyze-bedrock-costs.py --period 7 --group-by model +``` + +For details on available CloudWatch metrics, see [observability.md](observability.md). diff --git a/plugins/bedrock/skills/bedrock/references/iam-permissions.md b/plugins/bedrock/skills/bedrock/references/iam-permissions.md new file mode 100644 index 0000000..d9e40e6 --- /dev/null +++ b/plugins/bedrock/skills/bedrock/references/iam-permissions.md @@ -0,0 +1,103 @@ +# IAM Permissions Reference + +What Bedrock permissions are needed to use this plugin's commands, and how to troubleshoot when they're wrong. + +## Quick Start: AWS Managed Policy + +Attach the **`AmazonBedrockLimitedAccess`** managed policy to the IAM user or role used with this plugin. 
This covers: + +- `bedrock:InvokeModel`, `bedrock:InvokeModelWithResponseStream` — inference via Converse and InvokeModel APIs +- `bedrock:Get*`, `bedrock:List*` — model discovery and metadata +- `aws-marketplace:Subscribe`, `aws-marketplace:ViewSubscriptions` — Anthropic model auto-enablement (conditioned on `CalledViaLast: bedrock.amazonaws.com`) + +Attach via CLI: + +```bash +aws iam attach-user-policy \ + --user-name \ + --policy-arn arn:aws:iam::aws:policy/AmazonBedrockLimitedAccess \ + --profile + +# Or for a role: +aws iam attach-role-policy \ + --role-name \ + --policy-arn arn:aws:iam::aws:policy/AmazonBedrockLimitedAccess \ + --profile +``` + +For full admin access (includes model management, provisioned throughput, guardrails), use **`AmazonBedrockFullAccess`** instead. + +## Supplemental Policy for Observability and Cost Commands + +`AmazonBedrockLimitedAccess` does not include CloudWatch, Service Quotas, Cost Explorer, or STS permissions. If you want to use `/bedrock-usage`, `/bedrock-quota`, or `/bedrock-costs`, create and attach this inline policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "BedrockPluginObservability", + "Effect": "Allow", + "Action": [ + "cloudwatch:GetMetricStatistics", + "servicequotas:ListServiceQuotas", + "ce:GetCostAndUsage", + "sts:GetCallerIdentity" + ], + "Resource": "*" + } + ] +} +``` + +`ce:GetCostAndUsage` requires Cost Explorer to be enabled in the account (free, but not on by default). Enable via: AWS Console > Billing > Cost Explorer > Enable Cost Explorer. + +## Per-Command Permissions + +Each `/bedrock-*` command requires specific IAM actions. 
Use this to understand what the managed policy and supplemental policy cover: + +| Command | IAM Actions Required | Covered By | +| -------------------------------- | ------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------- | +| `/bedrock-validate-model-access` | `sts:GetCallerIdentity`, `bedrock:ListFoundationModels`, `bedrock:GetFoundationModel`, `bedrock:InvokeModel` | Managed policy (Bedrock actions) + supplemental (STS) | +| `/bedrock-cache` | `bedrock:InvokeModel`, `bedrock:InvokeModelWithResponseStream` | Managed policy | +| `/bedrock-cache-debug` | `bedrock:InvokeModel` | Managed policy | +| `/bedrock-usage` | `cloudwatch:GetMetricStatistics` | Supplemental policy | +| `/bedrock-costs` | `ce:GetCostAndUsage` | Supplemental policy | +| `/bedrock-quota` | `cloudwatch:GetMetricStatistics`, `servicequotas:ListServiceQuotas` | Supplemental policy | +| `/bedrock-setup` | All of the above | Both policies | + +Note: The Converse and ConverseStream APIs require `bedrock:InvokeModel` and `bedrock:InvokeModelWithResponseStream` respectively — they are not separate IAM actions. + +### Cross-Region Inference Permissions + +Cross-region model IDs (e.g., `us.anthropic.claude-sonnet-4-6`) require IAM policies that grant access to both the inference profile ARN and the foundation model ARN in destination regions. See [cost-optimization.md](cost-optimization.md) (IAM Requirements section) for the full policy template. 
+ +## Anthropic Model Prerequisites + +In addition to IAM permissions, Anthropic models require: + +- `aws-marketplace:Subscribe` and `aws-marketplace:ViewSubscriptions` IAM permissions for auto-enablement (included in `AmazonBedrockLimitedAccess`) +- A one-time use case form (First Time Use) before first invocation — submit via the [Bedrock console](https://console.aws.amazon.com/bedrock/) or `PutUseCaseForModelAccess` API +- If submitted from the AWS Organization management account, applies to all member accounts + +Models from Amazon, Meta, Mistral, DeepSeek, Qwen, and OpenAI work immediately with correct IAM permissions — no marketplace subscription or use case form needed. + +## Common Permission Errors + +| Error | Likely Cause | Fix | +| -------------------------------------------------------------------------------- | -------------------------------------- | --------------------------------------------------------------------------- | +| `AccessDeniedException` on InvokeModel | Missing `bedrock:InvokeModel` | Verify `AmazonBedrockLimitedAccess` is attached | +| `AccessDeniedException` on Converse | Missing `bedrock:InvokeModel` | Converse requires `bedrock:InvokeModel` — verify managed policy is attached | +| `AccessDeniedException` on ListFoundationModels | Missing `bedrock:ListFoundationModels` | Verify `AmazonBedrockLimitedAccess` is attached | +| `AccessDeniedException` on cross-region model ID (e.g., `us.anthropic.claude-*`) | SCP or IAM blocks destination regions | See [cost-optimization.md](cost-optimization.md) SCP Gotcha section | +| Model not in list despite permissions | Model not enabled | Submit use case form for Anthropic models, or check region availability | + +## Validation Passes but Application Still Fails + +If `validate-bedrock-access.sh` passes but your application still gets errors: + +1. 
**IAM principal mismatch**: The validation script runs under your CLI user/role, but your app may use a different principal (Lambda execution role, ECS task role, etc.). Check with `aws sts get-caller-identity` from your app's runtime. +2. **API action mismatch**: The script tests via Converse (`bedrock:InvokeModel`), but your app may use streaming (`bedrock:InvokeModelWithResponseStream`). Ensure the managed policy is attached (it includes both). +3. **SCP restrictions**: Organization-level Service Control Policies may block Bedrock in certain regions or for certain principals, even when IAM allows it. SCPs are invisible in IAM policy simulation. +4. **Streaming permissions**: If your app uses streaming (`ConverseStream` or `InvokeModelWithResponseStream`), ensure the policy includes `bedrock:InvokeModelWithResponseStream` (included in `AmazonBedrockLimitedAccess`). +5. **Model ID format**: Cross-region model IDs (e.g., `us.anthropic.claude-sonnet-4-6`) require IAM policies that grant access to both the inference profile ARN and the foundation model ARN in all destination regions. See [cost-optimization.md](cost-optimization.md) IAM Requirements section. diff --git a/plugins/bedrock/skills/bedrock/references/model-access.md b/plugins/bedrock/skills/bedrock/references/model-access.md new file mode 100644 index 0000000..095b19a --- /dev/null +++ b/plugins/bedrock/skills/bedrock/references/model-access.md @@ -0,0 +1,128 @@ +# Model Access and Selection + +## Recommended Starting Model + +**Claude Sonnet 4.6** (`us.anthropic.claude-sonnet-4-6`) + +- Best balance of capability and cost for startups +- Available via cross-region inference (`us.` prefix for geographic, `global.` for global routing) +- 1M token context window, 64K max output + +## Enabling Model Access + +Since October 2025, all Bedrock serverless models are **auto-enabled** — no manual toggle in the console needed. However, some models have one-time prerequisites before your first API call will succeed. 
You need: + +1. IAM policy allowing `bedrock:InvokeModel` and `bedrock:InvokeModelWithResponseStream` (these also authorize the Converse and ConverseStream APIs — see [iam-permissions.md](iam-permissions.md)) +2. `aws-marketplace:Subscribe` and `aws-marketplace:ViewSubscriptions` IAM permissions for auto-enablement to succeed on first invocation + +### Anthropic Models (Claude) + +In addition to auto-enablement, Anthropic requires a **one-time use case form** (First Time Use) before first invocation: + +- Submit by selecting any Anthropic model in the [Bedrock console](https://console.aws.amazon.com/bedrock/) playground +- Or via the `PutUseCaseForModelAccess` API +- If submitted from the AWS Organization management account, it applies to all member accounts +- Once submitted, access to all Anthropic models is granted immediately + +### Models Without Marketplace Product IDs + +Models from Amazon (Nova, Titan), Meta (Llama), Mistral AI, DeepSeek, Qwen, and OpenAI don't have AWS Marketplace product IDs and work immediately with the correct IAM permissions — no marketplace subscription or use case form needed. By invoking a model for the first time, you agree to its applicable EULA (see [AWS Service Terms](https://aws.amazon.com/service-terms/)). + +### Via CLI + +```bash +# List available models +aws bedrock list-foundation-models \ + --profile PROFILE \ + --region us-east-1 \ + --query 'modelSummaries[*].[modelId,modelName,providerName]' \ + --output table + +# Check model availability status +aws bedrock get-foundation-model-availability \ + --model-id anthropic.claude-sonnet-4-6 \ + --profile PROFILE \ + --region us-east-1 + +# Enable model access programmatically (third-party models) +aws bedrock create-foundation-model-agreement \ + --model-id MODEL_ID \ + --offer-token OFFER_TOKEN \ + --profile PROFILE \ + --region us-east-1 +``` + +See [Access Amazon Bedrock foundation models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) for details. 
+ +## Model Selection Guide + +### For Prompt Caching + +| Priority | Model | Min Tokens | 1h TTL | Cost Tier | Best For | +| -------- | ----------------- | ---------- | ------ | --------- | ----------------------------------------------------------------- | +| 1 | Claude Sonnet 4.6 | 2,048 | Yes | Mid | General purpose, best cost/capability ratio (recommended default) | +| 2 | Claude Haiku 4.5 | 4,096 | Yes | Low | High-volume, cost-sensitive workloads | +| 3 | Nova Pro | 1,000 | No | Low | No use case form needed | +| 4 | Claude Opus 4.6 | 4,096 | Yes | High | Complex reasoning tasks | + +### Model Access by Provider + +**Auto-enabled, no form needed** (Amazon, Meta, Mistral, DeepSeek, Qwen, OpenAI): + +- Work immediately with correct IAM permissions +- Simplest setup + +**Anthropic (Claude)**: + +- Requires `aws-marketplace:Subscribe` and `aws-marketplace:ViewSubscriptions` IAM permissions for auto-enablement +- One-time use case form (First Time Use) required per account (or per AWS Organization from management account) +- After form submission, all Anthropic models are available immediately + +## Cross-Region Inference + +Claude models are **only available through cross-region inference** — you must use a prefixed model ID (e.g., `us.anthropic.claude-sonnet-4-6`). The bare base model ID (`anthropic.claude-sonnet-4-6`) **cannot be used for inference** and will return `ResourceNotFoundException`. No inference profile needs to be created; AWS provides system-defined inference profiles automatically. 
+ +| Prefix | Use when target region is | Example | +| --------- | ------------------------------------------------------- | ------------------------------------ | +| `us.` | `us-east-1`, `us-east-2`, `us-west-2`, or any US region | `us.anthropic.claude-sonnet-4-6` | +| `eu.` | `eu-west-1`, `eu-central-1`, or any EU region | `eu.anthropic.claude-sonnet-4-6` | +| `ap.` | `ap-northeast-1`, `ap-southeast-1`, or any AP region | `ap.anthropic.claude-sonnet-4-6` | +| `global.` | Any region (~10% cost savings via global routing) | `global.anthropic.claude-sonnet-4-6` | + +Both the geographic prefix (`us.`, `eu.`, `ap.`) and `global.` are valid for any region in that geography. Choose `global.` for cost savings or geographic prefix for data residency. + +Non-Claude models (Amazon Nova, Titan, etc.) do not support cross-region inference and use bare model IDs with no prefix (e.g., `amazon.nova-pro-v1:0`). + +For IAM and SCP requirements for cross-region inference, see [cost-optimization.md](cost-optimization.md). + +## Region Availability + +Primary regions with broadest model availability: + +- `us-east-1` (N. 
Virginia) — most models available first +- `us-west-2` (Oregon) — second broadest availability +- `eu-west-1` (Ireland) — for EU data residency requirements +- `ap-northeast-1` (Tokyo) — for APAC + +Check current availability: + +```bash +aws bedrock list-foundation-models \ + --profile PROFILE \ + --region us-east-1 \ + --query 'modelSummaries[?contains(modelId, `claude`) || contains(modelId, `nova`)].modelId' \ + --output table +``` + +## Verifying Model Access + +```bash +# Check if a specific model is accessible (will fail with ResourceNotFoundException if not enabled) +aws bedrock get-foundation-model \ + --model-identifier anthropic.claude-sonnet-4-6 \ + --profile PROFILE \ + --region us-east-1 + +# Or run the full validation script +${CLAUDE_PLUGIN_ROOT}/scripts/validate-bedrock-access.sh +``` diff --git a/plugins/bedrock/skills/bedrock/references/observability.md b/plugins/bedrock/skills/bedrock/references/observability.md new file mode 100644 index 0000000..df734db --- /dev/null +++ b/plugins/bedrock/skills/bedrock/references/observability.md @@ -0,0 +1,90 @@ +# Bedrock Observability + +## Default CloudWatch Metrics + +Amazon Bedrock automatically publishes metrics to CloudWatch under the `AWS/Bedrock` namespace. No setup required — these are available as soon as you make inference calls. 
+
+### Available Metrics
+
+| Metric                      | Description                            | Unit         | What It Tells You                |
+| --------------------------- | -------------------------------------- | ------------ | -------------------------------- |
+| `Invocations`               | Number of InvokeModel/Converse calls   | Count        | Request volume                   |
+| `InputTokenCount`           | Input tokens per request               | Count        | How much context you're sending  |
+| `OutputTokenCount`          | Output tokens per request              | Count        | How much the model is generating |
+| `CacheReadInputTokenCount`  | Tokens read from cache (cache hits)    | Count        | Cache effectiveness              |
+| `CacheWriteInputTokenCount` | Tokens written to cache (cache misses) | Count        | Cache churn                      |
+| `InvocationLatency`         | End-to-end request latency             | Milliseconds | Response time                    |
+| `InvocationClientErrors`    | 4xx errors                             | Count        | Permission/validation issues     |
+| `InvocationServerErrors`    | 5xx errors                             | Count        | Service-side issues              |
+| `InvocationThrottles`       | Throttled requests (429s)              | Count        | Quota pressure                   |
+
+### Dimensions
+
+- `ModelId`: Filter by specific model (e.g., `us.anthropic.claude-sonnet-4-6`)
+
+## Pulling Metrics with the Plugin
+
+The fastest way to see your Bedrock metrics:
+
+```bash
+# Usage for a specific model (default: 7 days)
+python3 ${CLAUDE_PLUGIN_ROOT}/scripts/analyze-bedrock-usage.py --model-id MODEL_ID --region REGION --period DAYS
+
+# Discover and analyze all active models
+python3 ${CLAUDE_PLUGIN_ROOT}/scripts/analyze-bedrock-usage.py --all-models --region REGION --period DAYS
+```
+
+The script pulls directly from CloudWatch and reports:
+
+- Invocation counts and average tokens per request
+- Total input and output token consumption
+- Prompt caching efficiency (cache hit ratio, write vs read tokens)
+- Warnings for low cache hit ratios or missing caching
+
+## Key Metrics to Watch
+
+### Cache Hit Rate
+
+```
+Cache Hit Rate = CacheReadInputTokens / (CacheReadInputTokens + CacheWriteInputTokens + non-cached InputTokens)
+```
+
+A healthy cache hit rate for repeated system prompts should be > 80%. 
If it's low, run `/bedrock-cache-debug` to diagnose. + +### Throttle Rate + +If `InvocationThrottles` is non-zero, you're hitting quota limits. Run `/bedrock-quota` to diagnose — most likely `max_tokens` is set too high. + +### Output-to-Input Ratio + +High `OutputTokenCount` relative to `InputTokenCount` means output tokens dominate your costs (output is 3-5x more expensive). Consider instructing the model to be concise when full responses aren't needed. + +## Querying Metrics Directly via CLI + +For quick spot-checks without the plugin script: + +```bash +# Total invocations for a model over the last 24 hours +aws cloudwatch get-metric-statistics \ + --namespace AWS/Bedrock \ + --metric-name Invocations \ + --dimensions Name=ModelId,Value=us.anthropic.claude-sonnet-4-6 \ + --start-time $(date -u -v-1d +%Y-%m-%dT%H:%M:%S) \ + --end-time $(date -u +%Y-%m-%dT%H:%M:%S) \ + --period 3600 \ + --statistics Sum \ + --profile PROFILE \ + --region us-east-1 + +# Check for throttling +aws cloudwatch get-metric-statistics \ + --namespace AWS/Bedrock \ + --metric-name InvocationThrottles \ + --dimensions Name=ModelId,Value=us.anthropic.claude-sonnet-4-6 \ + --start-time $(date -u -v-1d +%Y-%m-%dT%H:%M:%S) \ + --end-time $(date -u +%Y-%m-%dT%H:%M:%S) \ + --period 3600 \ + --statistics Sum \ + --profile PROFILE \ + --region us-east-1 +``` diff --git a/plugins/bedrock/skills/bedrock/references/profile-setup.md b/plugins/bedrock/skills/bedrock/references/profile-setup.md new file mode 100644 index 0000000..70db1ec --- /dev/null +++ b/plugins/bedrock/skills/bedrock/references/profile-setup.md @@ -0,0 +1,77 @@ +# AWS CLI Profile Setup + +How to configure a named AWS CLI profile for use with this plugin. + +## Creating a Profile + +A named profile keeps your plugin's AWS access separate from other tools. 
Create one with:
+
+```bash
+aws configure --profile my-bedrock-dev
+```
+
+You'll be prompted for:
+
+- **AWS Access Key ID** and **Secret Access Key** — from your IAM user or from your organization's credential vending process
+- **Default region** — `us-east-1` recommended for broadest Bedrock model availability
+- **Output format** — `json` recommended
+
+### If You Use IAM Identity Center (SSO)
+
+```bash
+aws configure sso --profile my-bedrock-dev
+```
+
+This walks through SSO login configuration. After setup, authenticate with:
+
+```bash
+aws sso login --profile my-bedrock-dev
+```
+
+### If You Use Role Assumption
+
+If your organization provides a base identity and you assume a role for Bedrock access, configure the profile in `~/.aws/config`:
+
+```
+[profile my-bedrock-dev]
+role_arn = arn:aws:iam::ACCOUNT_ID:role/ROLE_NAME
+source_profile = BASE_PROFILE
+region = us-east-1
+```
+
+Replace `ACCOUNT_ID`, `ROLE_NAME`, and `BASE_PROFILE` with your values. The role should have the Bedrock permissions listed in [iam-permissions.md](iam-permissions.md).
+
+## Verifying the Profile
+
+```bash
+# List all configured profiles
+aws configure list-profiles
+
+# Verify credentials work and confirm which account you're in
+aws sts get-caller-identity --profile my-bedrock-dev
+```
+
+Expected output shows the account ID and IAM principal (user or assumed role ARN).
+
+## Multiple Accounts
+
+A common setup for startups:
+
+| Profile           | Purpose                                       | Account           |
+| ----------------- | --------------------------------------------- | ----------------- |
+| `claude-code`     | Claude Code inference (powers the CLI itself) | Inference account |
+| `my-bedrock-dev`  | Application development with Bedrock          | Dev account       |
+| `my-bedrock-prod` | Production Bedrock access                     | Prod account      |
+
+Each profile points to a different account. When the plugin asks which profile to use, pick the one matching the account you want to operate in. The plugin passes `--profile` explicitly to every AWS command — it never reads `AWS_PROFILE` from the environment. 
+ +## What the Plugin Needs + +The profile you select must have IAM permissions for the plugin commands you want to use. See [iam-permissions.md](iam-permissions.md) for the full permissions reference. + +At minimum, for basic validation: + +- `sts:GetCallerIdentity` +- `bedrock:ListFoundationModels` +- `bedrock:GetFoundationModel` +- `bedrock:Converse` diff --git a/plugins/bedrock/skills/bedrock/references/prompt-caching.md b/plugins/bedrock/skills/bedrock/references/prompt-caching.md new file mode 100644 index 0000000..155df21 --- /dev/null +++ b/plugins/bedrock/skills/bedrock/references/prompt-caching.md @@ -0,0 +1,168 @@ +# Prompt Caching on Amazon Bedrock + +Prompt caching stores frequently used input content so subsequent requests can reuse it, reducing latency by up to 85% and costs by up to 90%. + +## Two Approaches + +### Simplified Cache Management (Claude Models Only) + +A single `cachePoint` marker tells Bedrock to automatically check the preceding ~20 content blocks for cacheable content. No need to manually manage multiple checkpoints. + +On the first request, `cacheWriteInputTokens` will be > 0 (cache populated). +On subsequent identical requests within the TTL, `cacheReadInputTokens` will be > 0 (cache hit). + +### Explicit Cache Management (All Supported Models) + +Place multiple `cachePoint` markers at specific positions for granular control. Supports mixed TTL (1h + 5min) for different content sections. 
+ +## Code Samples + +For working code samples covering both approaches (Converse API and InvokeModel API), see the official AWS samples repository: + +**https://github.com/aws-samples/amazon-bedrock-samples/tree/main/introduction-to-bedrock/prompt-caching** + +The samples include: + +- `converse_api/` — Model-agnostic examples using the Converse and ConverseStream APIs with `cachePoint` syntax +- `invoke_model_api/` — Model-specific examples using the InvokeModel API (Anthropic `cache_control` format, Nova native format) +- Mixed TTL notebooks demonstrating longer TTL checkpoints preceding shorter ones + +## Key Concepts + +### Cache Point Placement + +The `cachePoint` is a standalone content block placed **after** the content to cache. In the Converse API, it looks like `{"cachePoint": {"type": "default"}}`. For 1-hour TTL, add `"ttl": "1h"`. + +### TTL Configuration + +| TTL | Supported Models | Use Case | +| --------------- | ------------------------------------------------------------ | ------------------------------------ | +| 5 min (default) | All supported models | Dynamic content, short conversations | +| 1 hour | Claude Sonnet 4.6, Opus 4.6, Sonnet 4.5, Opus 4.5, Haiku 4.5 | System prompts, reference docs | + +When using multiple cache points with different TTLs, longer durations must precede shorter ones. + +### Response Fields + +Cache metrics appear in the Converse API `usage` object: + +- `cacheWriteInputTokens > 0`: Cache was populated (first request or cache expired) +- `cacheReadInputTokens > 0`: Cache was hit (subsequent requests within TTL) +- Both zero: Content didn't meet minimum threshold or caching not supported + +For InvokeModel API (Anthropic format), the fields are `cache_creation_input_tokens` and `cache_read_input_tokens`. 
+ +## Minimum Token Thresholds + +Content before a cache point must meet the model's minimum token count: + +| Model | Minimum Tokens | +| -------------------- | -------------- | +| Claude Sonnet 4.6 | 2,048 | +| Claude Opus 4.6 | 4,096 | +| Claude Opus 4.5 | 4,096 | +| Claude Haiku 4.5 | 4,096 | +| Claude Sonnet 4.5 | 1,024 | +| Claude Opus 4.1 | 1,024 | +| Claude Opus 4 | 1,024 | +| Claude Sonnet 4 | 1,024 | +| Claude 3.7 Sonnet | 1,024 | +| Claude 3.5 Sonnet v2 | 1,024 | +| Claude 3.5 Haiku | 2,048 | +| Amazon Nova Pro | 1,024 | +| Amazon Nova Lite | 1,536 | +| Amazon Nova Micro | 1,536 | + +If content is below the threshold, the cache point is ignored (no error, just no caching). + +## What to Cache + +**Good candidates (static, reused across requests):** + +- System prompts +- Few-shot examples +- Reference documents / knowledge bases +- Tool definitions +- Long code files for review + +**Poor candidates (change frequently):** + +- User messages that vary each request +- Dynamic context that updates per call +- Very short content below the token threshold + +## Why Isn't My Cache Working? + +Prompt caching fails silently in several scenarios. Walk through this checklist: + +1. **Is the model supported?** Caching is silently ignored for unsupported models. No error, no warning. Check the supported models table above. + +2. **Does content exceed the minimum token threshold?** If content before the cache point is below the model's minimum (e.g., 1,024 for Claude Sonnet 4), the cache point is ignored. The request succeeds normally — you just don't get caching. + +3. **Is the cached content identical between requests?** Cache keys are based on exact byte-for-byte prefix match. Even small changes invalidate the cache: + - Timestamps or request IDs in the system prompt + - Whitespace differences + - Reordered JSON keys + - Session tokens or user-specific content before the cache point + +4. **Has the TTL expired?** Default TTL is 5 minutes. 
If more than 5 minutes pass between requests with the same prefix, the cache expires and the next request is a cache write (not a read). + +5. **Is the cache point in the right place?** The `cachePoint` must be a separate content block placed **after** the content to cache, not embedded within it. + +Run `/bedrock-cache-debug` for automated diagnosis of all these issues. + +## Break-Even Analysis + +Cache writes cost **25% more** than standard input tokens. Cache reads cost **90% less**. This means caching only saves money if you have enough reads per write. + +| Requests per TTL Window | Cost Without Cache | Cost With Cache | Savings | +| ----------------------- | ------------------ | --------------- | --------------------- | +| 1 (write only) | 1.00x | 1.25x | **-25% (costs MORE)** | +| 2 | 2.00x | 1.35x | 32% | +| 5 | 5.00x | 1.65x | 67% | +| 10 | 10.00x | 2.15x | 78% | +| 20 | 20.00x | 3.15x | 84% | + +**Key takeaway**: You need at least **2 requests within the TTL window** to break even. For single-use content (each document analyzed once), do NOT enable caching — it increases cost by 25%. + +## Preventing Cache Fragmentation + +Cache fragmentation occurs when "static" content varies slightly between requests, causing cache misses. Common causes and fixes: + +- **Timestamps in system prompts**: Move timestamps AFTER the cache point. +- **Dynamic user context mixed with static content**: Separate static and dynamic parts. +- **Non-deterministic formatting**: Use sorted keys in JSON, consistent whitespace, and fixed-format strings. +- **Session-specific tokens**: Keep session IDs, user IDs, and auth tokens after the cache point. + +## 1-Hour TTL + +For content that rarely changes (system prompts, reference docs, tool definitions), 1-hour TTL reduces cache writes and keeps the cache alive across longer idle periods. + +Supported models: Claude Sonnet 4.6, Opus 4.6, Sonnet 4.5, Opus 4.5, Haiku 4.5. 
+ +When to use 1-hour TTL: + +- System prompts that don't change between sessions +- Reference documents loaded for RAG +- Tool definitions that are stable across requests +- Any content where 5 minutes between requests is too short + +When to keep 5-minute TTL: + +- Content that changes every few minutes +- High-frequency request patterns where 5 minutes is already sufficient +- When you want caches to expire quickly to pick up content updates + +## Validation + +Run the validation script to verify prompt caching works end-to-end: + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/validate-prompt-caching.py --model-id us.anthropic.claude-sonnet-4-6 +``` + +For detailed diagnostics, run the cache debugger: + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/scripts/debug-prompt-cache.py --model-id us.anthropic.claude-sonnet-4-6 +``` diff --git a/plugins/bedrock/skills/bedrock/references/quota-optimization.md b/plugins/bedrock/skills/bedrock/references/quota-optimization.md new file mode 100644 index 0000000..df2b419 --- /dev/null +++ b/plugins/bedrock/skills/bedrock/references/quota-optimization.md @@ -0,0 +1,146 @@ +# Quota Optimization and Throttling Guide + +Bedrock enforces per-model quotas on tokens per minute (TPM), tokens per day (TPD), and requests per minute (RPM). Understanding how these quotas work — especially the `max_tokens` pre-reservation and burndown rates — is the single most impactful optimization for production throughput. + +## The max_tokens Trap + +When you send a request, Bedrock **immediately reserves** quota for the maximum possible response: + +``` +Initial reservation = total_input_tokens + max_tokens +``` + +Where `total_input_tokens` includes input tokens, cache read tokens, and cache write tokens. 
After the response completes, the final adjusted deduction is calculated with the burndown rate applied to actual output: + +``` +Final deduction = input_tokens + cache_write_tokens + (actual_output_tokens × burndown_rate) +``` + +Cache read tokens don't count toward the final deduction. Unused reserved tokens are returned to your quota, but during processing the initial reservation blocks other concurrent requests. + +### Example: The Default Disaster + +Assume: 8,000 input tokens, actual output 1,200 tokens, default `max_tokens` of 64,000, Claude Sonnet 4.6 (5x burndown): + +| Scenario | Initial Reservation | Final Deduction | Wasted Reservation | +| ---------------------------- | --------------------------- | ------------------------ | ------------------------------ | +| Default max_tokens (64,000) | 8,000 + 64,000 = **72,000** | 8,000 + 5×1,200 = 14,000 | 80% | +| Optimized max_tokens (1,800) | 8,000 + 1,800 = **9,800** | 8,000 + 5×1,200 = 14,000 | 0% (final > initial, no waste) | + +With the default, a single request temporarily consumes **72,000 quota tokens**, leaving less room for concurrent requests. With an optimized `max_tokens`, the reservation is much smaller. Note: when the final deduction exceeds the initial reservation (due to burndown rate on output), the extra tokens are still deducted from your quota. + +### How to Right-Size max_tokens + +1. Run `python3 ${CLAUDE_PLUGIN_ROOT}/scripts/check-quota-health.py` to see your actual output distribution +2. Set `max_tokens` to approximately **1.5× your p90 actual output**: + +| Typical Output | Recommended max_tokens | +| ------------------- | ---------------------- | +| < 500 tokens | 750 | +| 500–2,000 tokens | 3,000 | +| 2,000–5,000 tokens | 7,500 | +| 5,000–10,000 tokens | 15,000 | +| > 10,000 tokens | p90 × 1.5 | + +Minimum floor: 256 tokens. Always leave headroom above your p90 to avoid truncating long responses. + +## Burndown Rates + +Not all models consume quota equally. 
Claude 3.7 and later models use a **5x burndown rate** for output tokens: + +| Model Family | Input Rate | Output Rate | Impact | +| --------------------------------------- | ---------- | ----------- | ------------------------------- | +| Claude Sonnet 4.6, Opus 4.6, Haiku 4.5 | 1:1 | **1:5** | 1 output token = 5 quota tokens | +| Claude Sonnet 4.5, Opus 4.5, 3.7 Sonnet | 1:1 | **1:5** | 1 output token = 5 quota tokens | +| Claude 3.5 Sonnet v2, 3.5 Haiku | 1:1 | 1:1 | Standard rate | +| Amazon Nova Pro, Lite, Micro | 1:1 | 1:1 | Standard rate | + +The 5x burndown is a **quota management** concern, not a billing concern. You are billed for actual tokens at standard rates. The multiplier only affects how fast you consume your per-minute quota. + +## Cross-Region Inference for Quota Relief + +Cross-region inference profiles (e.g., `us.anthropic.claude-sonnet-4-6` instead of `anthropic.claude-sonnet-4-6`) provide higher throughput by distributing requests across multiple regions. Check default quotas for cross-region profiles in [Amazon Bedrock service quotas](https://docs.aws.amazon.com/general/latest/gr/bedrock.html#limits_bedrock). + +Benefits: + +- Higher TPM and RPM quota than single-region inference +- Automatic regional failover for higher availability +- Same per-token pricing (no additional cost) +- Prompt caching works with cross-region inference + +For cross-region IAM and SCP guidance, see [cost-optimization.md](cost-optimization.md). To validate access, run `/bedrock-validate-model-access`. + +## Requesting a Quota Increase + +AWS requires specific data when reviewing quota increase requests. 
Run the quota health check to generate this data automatically:
+
+```bash
+python3 ${CLAUDE_PLUGIN_ROOT}/scripts/check-quota-health.py --model-id MODEL_ID --region REGION
+```
+
+The script generates these fields, which you paste into the AWS Service Quotas console:
+
+- **Steady-state TPM**: Average tokens per minute during normal operation
+- **Peak TPM**: Maximum tokens per minute during traffic spikes
+- **Average input tokens per request**: Helps AWS understand request shape
+- **Average output tokens per request**: Affects burndown rate impact
+- **Total requests in observation period**: Demonstrates actual demand
+
+Request quotas at: https://console.aws.amazon.com/servicequotas/home/services/bedrock/quotas
+
+## Handling ThrottlingException
+
+A 429 `ThrottlingException` means your request was rejected because quota is exhausted. This is not a transient error — retrying immediately will fail again.
+
+### Retry Pattern
+
+```python
+import time
+import random
+
+import boto3
+from botocore.exceptions import ClientError
+from botocore.config import Config
+
+# Configure SDK-level retries with adaptive backoff
+config = Config(retries={"max_attempts": 5, "mode": "adaptive"})
+client = boto3.client("bedrock-runtime", config=config)
+
+# For application-level retries beyond SDK defaults:
+def invoke_with_backoff(client, max_retries=5, **kwargs):
+    for attempt in range(max_retries):
+        try:
+            return client.converse(**kwargs)
+        except ClientError as e:
+            if e.response["Error"]["Code"] == "ThrottlingException":
+                if attempt == max_retries - 1:
+                    raise
+                # Exponential backoff with jitter, aligned to 60s quota window
+                delay = min(60, (2 ** attempt) + random.uniform(0, 1))
+                time.sleep(delay)
+            else:
+                raise
+```
+
+Key points:
+
+- Quota refreshes every **60 seconds** — retrying within the same minute rarely helps
+- Use **exponential backoff with jitter** to avoid thundering herd effects
+- Set `max_tokens` properly to reduce the reservation that causes throttling in the first 
place +- Consider queuing requests with SQS to smooth out spikes + +## Monitoring Quota Usage + +Set up CloudWatch alarms to catch throttling before it impacts users: + +```bash +aws cloudwatch put-metric-alarm \ + --alarm-name "BedrockThrottling" \ + --namespace "AWS/Bedrock" \ + --metric-name "InvocationThrottles" \ + --dimensions Name=ModelId,Value=us.anthropic.claude-sonnet-4-6 \ + --statistic Sum \ + --period 300 \ + --evaluation-periods 1 \ + --threshold 1 \ + --comparison-operator GreaterThanOrEqualToThreshold \ + --alarm-actions "YOUR_SNS_TOPIC_ARN" +``` diff --git a/tools/generate_codex_manifests.py b/tools/generate_codex_manifests.py index 70ed1ce..73702c2 100644 --- a/tools/generate_codex_manifests.py +++ b/tools/generate_codex_manifests.py @@ -23,6 +23,16 @@ } INTERFACE_METADATA = { + "bedrock": { + "displayName": "Amazon Bedrock", + "shortDescription": "Set up Bedrock with IAM, model access, prompt caching, and cost analysis.", + "longDescription": "Guided Amazon Bedrock setup — IAM configuration, model access, prompt caching, observability, quota optimization, and cost analysis.", + "defaultPrompt": [ + "Set up Amazon Bedrock for my AWS account.", + "How much have I spent on Bedrock this month?", + "Am I getting throttled on any Bedrock models?", + ], + }, "amazon-location-service": { "displayName": "Amazon Location Service", "shortDescription": "Build maps, routing, geocoding, and places workflows on AWS.",