From 34dedb7a6bf407ca82d5c4a3749bac869486e773 Mon Sep 17 00:00:00 2001 From: Saurya Velagapudi Date: Mon, 13 Apr 2026 12:00:53 -0700 Subject: [PATCH 1/6] Add dual staging environment infrastructure (pathroute & subdomain) This adds infrastructure for two parallel staging environments that deploy from the main branch: - staging-pathroute: Uses path-based routing (e.g., /api/automation) - staging-subdomain: Uses subdomain-based routing (production pattern) Changes: - Add GitHub Actions workflow for dual environment deployment - Supports deploying to both, pathroute only, or subdomain only - Includes SOPS secret decryption and Helm deployment - Dry-run mode for validation - Add envs/staging-pathroute/ with values.yaml and README - Add envs/staging-subdomain/ with values.yaml and README - Add .sops.yaml for GCP KMS encryption of secrets - Add helper scripts for secret management Co-authored-by: openhands --- .github/workflows/deploy-staging.yml | 315 +++++++++++++++++++++++ .sops.yaml | 17 ++ envs/staging-pathroute/README.md | 104 ++++++++ envs/staging-pathroute/secrets/.gitkeep | 2 + envs/staging-pathroute/values.yaml | 313 +++++++++++++++++++++++ envs/staging-subdomain/README.md | 114 +++++++++ envs/staging-subdomain/secrets/.gitkeep | 1 + envs/staging-subdomain/values.yaml | 322 ++++++++++++++++++++++++ scripts/deploy/decrypt.sh | 20 ++ scripts/deploy/safe-apply-secrets.sh | 22 ++ 10 files changed, 1230 insertions(+) create mode 100644 .github/workflows/deploy-staging.yml create mode 100644 .sops.yaml create mode 100644 envs/staging-pathroute/README.md create mode 100644 envs/staging-pathroute/secrets/.gitkeep create mode 100644 envs/staging-pathroute/values.yaml create mode 100644 envs/staging-subdomain/README.md create mode 100644 envs/staging-subdomain/secrets/.gitkeep create mode 100644 envs/staging-subdomain/values.yaml create mode 100644 scripts/deploy/decrypt.sh create mode 100644 scripts/deploy/safe-apply-secrets.sh diff --git 
a/.github/workflows/deploy-staging.yml b/.github/workflows/deploy-staging.yml new file mode 100644 index 00000000..94f2b66d --- /dev/null +++ b/.github/workflows/deploy-staging.yml @@ -0,0 +1,315 @@ +name: Deploy to Staging + +on: + # Manual trigger only; inputs select the image tag, target environment(s), and whether to skip secrets or just render templates + workflow_dispatch: + inputs: + image_tag: + description: 'OpenHands image tag to deploy' + required: true + default: 'main' + environment: + description: 'Environment to deploy' + required: true + type: choice + options: + - both + - pathroute + - subdomain + default: 'both' + skip_secrets: + description: 'Skip applying secrets (use existing)' + type: boolean + default: false + dry_run: + description: 'Dry run (template only, no deploy)' + type: boolean + default: false + +env: + GCP_PROJECT: staging-092324 + GCP_ZONE: us-central1 + GCP_CLUSTER: staging-core-application + +jobs: + deploy-pathroute: + name: Deploy to staging-pathroute + if: ${{ inputs.environment == 'both' || inputs.environment == 'pathroute' }} + runs-on: ubuntu-24.04 + permissions: + contents: read + id-token: write + env: + NAMESPACE: openhands-pathroute + HELM_RELEASE: openhands-pathroute + ENV_DIR: envs/staging-pathroute + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install SOPS + run: | + curl -fsSL "https://github.com/getsops/sops/releases/download/v3.9.1/sops-v3.9.1.linux.amd64" -o sops # -f: fail on HTTP errors instead of saving an error page as the binary + chmod +x sops + sudo mv sops /usr/local/bin/sops + sops --version + + - name: Install Helm + uses: azure/setup-helm@v3 + with: + version: 'v3.16.2' # pinned (was 'latest') so deploys are reproducible and never silently pick up a new Helm major + + - name: Authenticate with Google Cloud + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_SERVICE_KEY }} + + - name: Set up Google Cloud SDK + uses: google-github-actions/setup-gcloud@v2 + + - name: Install gke-gcloud-auth-plugin + run: | + gcloud components install gke-gcloud-auth-plugin + + - name: Configure kubectl + run: | + gcloud container clusters get-credentials ${{ env.GCP_CLUSTER }} \ + --zone ${{ env.GCP_ZONE }} \ + --project 
${{ env.GCP_PROJECT }} + + - name: Create namespace if not exists + if: ${{ !inputs.dry_run }} + run: | + kubectl create namespace ${{ env.NAMESPACE }} --dry-run=client -o yaml | kubectl apply -f - + + - name: Decrypt and apply secrets + if: ${{ !inputs.skip_secrets && !inputs.dry_run }} + run: | + SECRETS_DIR="${{ env.ENV_DIR }}/secrets" + + if [[ -d "$SECRETS_DIR" ]]; then + echo "Applying secrets from $SECRETS_DIR" + for file in "$SECRETS_DIR"/*.yaml; do + # Skip .gitkeep or non-existent files + [[ -e "$file" ]] || continue + [[ "$(basename "$file")" == ".gitkeep" ]] && continue + + echo "Decrypting and applying: $file" + sops --decrypt "$file" | kubectl apply -n ${{ env.NAMESPACE }} -f - + done + echo "All secrets applied successfully" + else + echo "No secrets directory found at $SECRETS_DIR" + fi + + - name: Update Helm dependencies + run: | + helm dependency update charts/openhands + + - name: Helm template (dry run) + if: ${{ inputs.dry_run }} + run: | + helm template ${{ env.HELM_RELEASE }} charts/openhands \ + --namespace ${{ env.NAMESPACE }} \ + --values ${{ env.ENV_DIR }}/values.yaml \ + --set image.tag=${{ inputs.image_tag }} \ + --debug + + - name: Deploy with Helm + if: ${{ !inputs.dry_run }} + run: | + # Check current release status + if helm status ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} &>/dev/null; then + status=$(helm status ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} -o json | jq -r '.info.status') + if [[ "$status" != "deployed" ]]; then + echo "Found release in non-deployed state ($status). Attempting rollback..." 
+ helm rollback ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} || true + fi + fi + + helm upgrade --install \ + --wait \ + --timeout 10m \ + ${{ env.HELM_RELEASE }} \ + charts/openhands \ + --namespace ${{ env.NAMESPACE }} \ + --values ${{ env.ENV_DIR }}/values.yaml \ + --set image.tag=${{ inputs.image_tag }} \ + --debug + + - name: Get deployment info + if: ${{ !inputs.dry_run }} + id: deployment_info + run: | + echo "## Deployment Summary (pathroute)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **Environment:** staging-pathroute" >> $GITHUB_STEP_SUMMARY + echo "- **Namespace:** ${{ env.NAMESPACE }}" >> $GITHUB_STEP_SUMMARY + echo "- **Image Tag:** ${{ inputs.image_tag }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + # Get ingress hostname + hostname=$(kubectl get ing -n ${{ env.NAMESPACE }} -o jsonpath='{.items[0].spec.rules[0].host}' 2>/dev/null || echo "N/A") + echo "- **Hostname:** https://$hostname" >> $GITHUB_STEP_SUMMARY + echo "hostname=$hostname" >> $GITHUB_OUTPUT + + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Pods Status" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + kubectl get pods -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + + - name: Verify deployment health + if: ${{ !inputs.dry_run }} + run: | + echo "Waiting for deployment to stabilize..." 
+ kubectl rollout status deployment -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} --timeout=5m || echo "::warning::Rollout status check failed or timed out" # best-effort, but surfaced in the run instead of silently ignored + + echo "" + echo "Current pod status:" + kubectl get pods -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} + + outputs: + hostname: ${{ steps.deployment_info.outputs.hostname }} + + deploy-subdomain: + name: Deploy to staging-subdomain + if: ${{ inputs.environment == 'both' || inputs.environment == 'subdomain' }} + runs-on: ubuntu-24.04 + permissions: + contents: read + id-token: write + env: + NAMESPACE: openhands-subdomain + HELM_RELEASE: openhands-subdomain + ENV_DIR: envs/staging-subdomain + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install SOPS + run: | + curl -fsSL "https://github.com/getsops/sops/releases/download/v3.9.1/sops-v3.9.1.linux.amd64" -o sops # -f: fail on HTTP errors instead of saving an error page as the binary + chmod +x sops + sudo mv sops /usr/local/bin/sops + sops --version + + - name: Install Helm + uses: azure/setup-helm@v3 + with: + version: 'v3.16.2' # pinned (was 'latest') so deploys are reproducible and never silently pick up a new Helm major + + - name: Authenticate with Google Cloud + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_SERVICE_KEY }} + + - name: Set up Google Cloud SDK + uses: google-github-actions/setup-gcloud@v2 + + - name: Install gke-gcloud-auth-plugin + run: | + gcloud components install gke-gcloud-auth-plugin + + - name: Configure kubectl + run: | + gcloud container clusters get-credentials ${{ env.GCP_CLUSTER }} \ + --zone ${{ env.GCP_ZONE }} \ + --project ${{ env.GCP_PROJECT }} + + - name: Create namespace if not exists + if: ${{ !inputs.dry_run }} + run: | + kubectl create namespace ${{ env.NAMESPACE }} --dry-run=client -o yaml | kubectl apply -f - + + - name: Decrypt and apply secrets + if: ${{ !inputs.skip_secrets && !inputs.dry_run }} + run: | + SECRETS_DIR="${{ env.ENV_DIR }}/secrets" + + if [[ -d "$SECRETS_DIR" ]]; then + echo "Applying secrets from $SECRETS_DIR" + for file in "$SECRETS_DIR"/*.yaml; do + # Skip .gitkeep or non-existent files + 
[[ -e "$file" ]] || continue + [[ "$(basename "$file")" == ".gitkeep" ]] && continue + + echo "Decrypting and applying: $file" + sops --decrypt "$file" | kubectl apply -n ${{ env.NAMESPACE }} -f - + done + echo "All secrets applied successfully" + else + echo "No secrets directory found at $SECRETS_DIR" + fi + + - name: Update Helm dependencies + run: | + helm dependency update charts/openhands + + - name: Helm template (dry run) + if: ${{ inputs.dry_run }} + run: | + helm template ${{ env.HELM_RELEASE }} charts/openhands \ + --namespace ${{ env.NAMESPACE }} \ + --values ${{ env.ENV_DIR }}/values.yaml \ + --set image.tag=${{ inputs.image_tag }} \ + --debug + + - name: Deploy with Helm + if: ${{ !inputs.dry_run }} + run: | + # Check current release status + if helm status ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} &>/dev/null; then + status=$(helm status ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} -o json | jq -r '.info.status') + if [[ "$status" != "deployed" ]]; then + echo "Found release in non-deployed state ($status). Attempting rollback..." 
+ helm rollback ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} || true + fi + fi + + helm upgrade --install \ + --wait \ + --timeout 10m \ + ${{ env.HELM_RELEASE }} \ + charts/openhands \ + --namespace ${{ env.NAMESPACE }} \ + --values ${{ env.ENV_DIR }}/values.yaml \ + --set image.tag=${{ inputs.image_tag }} \ + --debug + + - name: Get deployment info + if: ${{ !inputs.dry_run }} + id: deployment_info + run: | + echo "## Deployment Summary (subdomain)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **Environment:** staging-subdomain" >> $GITHUB_STEP_SUMMARY + echo "- **Namespace:** ${{ env.NAMESPACE }}" >> $GITHUB_STEP_SUMMARY + echo "- **Image Tag:** ${{ inputs.image_tag }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + # Get ingress hostname + hostname=$(kubectl get ing -n ${{ env.NAMESPACE }} -o jsonpath='{.items[0].spec.rules[0].host}' 2>/dev/null || echo "N/A") + echo "- **Hostname:** https://$hostname" >> $GITHUB_STEP_SUMMARY + echo "hostname=$hostname" >> $GITHUB_OUTPUT + + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Pods Status" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + kubectl get pods -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + + - name: Verify deployment health + if: ${{ !inputs.dry_run }} + run: | + echo "Waiting for deployment to stabilize..." 
+ kubectl rollout status deployment -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} --timeout=5m || echo "::warning::Rollout status check failed or timed out" # best-effort, but surfaced in the run instead of silently ignored + + echo "" + echo "Current pod status:" + kubectl get pods -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} + + outputs: + hostname: ${{ steps.deployment_info.outputs.hostname }} diff --git a/.sops.yaml b/.sops.yaml new file mode 100644 index 00000000..b4876856 --- /dev/null +++ b/.sops.yaml @@ -0,0 +1,17 @@ +# SOPS configuration for OpenHands-Cloud +# This file tells SOPS which encryption keys to use for different file patterns (rules are matched against the path of the file being encrypted) +creation_rules: + # Staging path-route environment secrets - use GCP KMS + - path_regex: envs/staging-pathroute/.*secrets.*\.yaml$ + gcp_kms: projects/global-432717/locations/global/keyRings/sops-key-ring/cryptoKeys/sops-key + encrypted_regex: "^(data|stringData|config)$" + + # Staging subdomain environment secrets - use GCP KMS + - path_regex: envs/staging-subdomain/.*secrets.*\.yaml$ + gcp_kms: projects/global-432717/locations/global/keyRings/sops-key-ring/cryptoKeys/sops-key + encrypted_regex: "^(data|stringData|config)$" + + # Production environment secrets (future use) - NOTE(review): same KMS key as the staging rules above, so any identity that can decrypt staging could decrypt production; consider a dedicated key before use + - path_regex: envs/production/.*secrets.*\.yaml$ + gcp_kms: projects/global-432717/locations/global/keyRings/sops-key-ring/cryptoKeys/sops-key + encrypted_regex: "^(data|stringData|config)$" diff --git a/envs/staging-pathroute/README.md b/envs/staging-pathroute/README.md new file mode 100644 index 00000000..eaccfda9 --- /dev/null +++ b/envs/staging-pathroute/README.md @@ -0,0 +1,104 @@ +# Staging Path-Route Environment Configuration + +This directory contains the configuration for deploying OpenHands to the **staging-pathroute** environment. 
+ +## Environment Overview + +This environment uses **path-based routing**: +- Main app: `https://staging-pathroute.all-hands.dev/` +- Automation API: `https://staging-pathroute.all-hands.dev/api/automation` +- Integrations: `https://staging-pathroute.all-hands.dev/integration/*` +- MCP: `https://staging-pathroute.all-hands.dev/mcp/mcp` + +## Directory Structure + +``` +envs/staging-pathroute/ +├── README.md # This file +├── values.yaml # Helm values (non-secret configuration) +└── secrets/ # SOPS-encrypted Kubernetes secrets + └── *.yaml # Individual secret files +``` + +## Kubernetes Details + +- **Namespace:** `openhands-pathroute` +- **Helm Release:** `openhands-pathroute` +- **GCP Project:** `staging-092324` +- **GKE Cluster:** `staging-core-application` +- **Region:** `us-central1` (passed to the workflow as `GCP_ZONE`) + +## Secrets Management + +Secrets are encrypted using [SOPS](https://github.com/getsops/sops) with GCP KMS encryption. + +### Required Secrets + +The following secrets must be created in `secrets/` before deployment: + +| Secret Name | Description | Required Keys | +|-------------|-------------|---------------| +| `ghcr-login-secret` | GitHub Container Registry pull credentials | `.dockerconfigjson` | +| `lite-llm-api-key` | LiteLLM API key | `api-key` | +| `stripe-api-key` | Stripe API key | `api-key` | +| `resend-api-key` | Resend email API key | `api-key` | +| `bitbucket-app` | Bitbucket OAuth app credentials | `client-id`, `client-secret` | +| `automation-service-key` | Automation service authentication key | `automation-service-key` | +| `automation-db-secret` | Automation database password | `db-password` | +| `keycloak-realm` | Keycloak realm credentials | `client-id`, `client-secret` | + +### Creating/Editing Secrets + +```bash +# Create a new SOPS-encrypted secret (--filename-override makes SOPS match the .sops.yaml rule for the destination path; delete the /tmp file afterwards) +cat <<EOF > /tmp/my-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: my-secret +type: Opaque +stringData: + key: "value" +EOF +sops --encrypt --filename-override envs/staging-pathroute/secrets/my-secret.yaml /tmp/my-secret.yaml > 
envs/staging-pathroute/secrets/my-secret.yaml + +# Edit (decrypts, opens editor, re-encrypts on save) +sops envs/staging-pathroute/secrets/my-secret.yaml + +# View decrypted content +sops --decrypt envs/staging-pathroute/secrets/my-secret.yaml +``` + +## Deployment + +Use the GitHub Actions workflow: + +1. Go to **Actions** → **Deploy to Staging** +2. Click **Run workflow** +3. Select environment: `pathroute` or `both` +4. Enter the image tag to deploy + +### Workflow Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `image_tag` | OpenHands image tag to deploy | `main` | +| `environment` | Which environment(s) to deploy | `both` | +| `skip_secrets` | Skip applying secrets | `false` | +| `dry_run` | Template only, don't deploy | `false` | + +## Troubleshooting + +```bash +# Check pods +kubectl get pods -n openhands-pathroute + +# Check Helm release +helm history openhands-pathroute -n openhands-pathroute + +# Check ingress +kubectl get ingress -n openhands-pathroute + +# GCP auth for SOPS +gcloud auth application-default login +``` diff --git a/envs/staging-pathroute/secrets/.gitkeep b/envs/staging-pathroute/secrets/.gitkeep new file mode 100644 index 00000000..c0a5d67a --- /dev/null +++ b/envs/staging-pathroute/secrets/.gitkeep @@ -0,0 +1,2 @@ +# This directory contains SOPS-encrypted Kubernetes Secret files +# See ../README.md for instructions on managing secrets diff --git a/envs/staging-pathroute/values.yaml b/envs/staging-pathroute/values.yaml new file mode 100644 index 00000000..9417a413 --- /dev/null +++ b/envs/staging-pathroute/values.yaml @@ -0,0 +1,313 @@ +# Staging (Path-Based Routing) environment values for OpenHands +# This environment uses path-based routing: /api/automation, /integration/*, etc. 
+# Secrets are managed separately in envs/staging-pathroute/secrets/ + +imagePullSecrets: + - name: ghcr-login-secret + +databaseMigrations: + waitForDatabase: false + +ingress: + host: staging-pathroute.all-hands.dev + enabled: true + class: traefik + root: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + +helm-release-pruner: + enabled: true + job: + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 100m + memory: 256Mi + +sandbox: + apiHostname: https://runtime.staging.all-hands.dev + +runtime-api: + enabled: false + +filestore: + type: google_cloud + bucket: staging-openhands-sessions + +serviceAccount: + annotations: + iam.gke.io/gcp-service-account: openhands-sa@staging-092324.iam.gserviceaccount.com + +migrationJob: + enabled: true + initContainer: + enabled: false + +github: + enabled: true + +env: + ENABLE_BILLING: "true" + OH_APP_MODE: "saas" + OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_BILLING: "true" + OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_JIRA: "true" + OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_USERS_PAGE: "false" + OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_BILLING_PAGE: "false" + OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_INTEGRATIONS_PAGE: "false" + OH_WEB_CLIENT_PROVIDERS_CONFIGURED: '["github","gitlab","bitbucket"]' + OH_WEB_CLIENT_GITHUB_APP_SLUG: "openhands-staging" + OH_APP_CONVERSATION_INFO_KIND: "server.utils.saas_app_conversation_info_injector.SaasAppConversationInfoServiceInjector" + HIDE_LLM_SETTINGS: "false" + GOOGLE_CLOUD_PROJECT: staging-092324 + GCP_PROJECT: staging-092324 + RECAPTCHA_PROJECT_ID: staging-092324 + RECAPTCHA_SITE_KEY: "" + RECAPTCHA_HMAC_SECRET: "qgfN+prMC1iMziHP3YndNicZjgK5IMXITUuVQOnEe9o=" # NOTE(review): HMAC secret committed in plaintext although this file's header says secrets live in the SOPS-encrypted secrets/ directory; presumably it should move there and be rotated - confirm + RECAPTCHA_BLOCK_THRESHOLD: "0.3" + GCP_REGION: us-central1 + GCP_DB_INSTANCE: application-db + DB_USER: openhands-user + DB_NAME: openhands + MAX_CONCURRENT_CONVERSATIONS: "10" + DB_POOL_SIZE: "25" + DB_MAX_OVERFLOW: "30" + ENABLE_PROACTIVE_CONVERSATION_STARTERS: "false" + ENABLE_SOLVABILITY_ANALYSIS: "true" + ENABLE_MCP_SEARCH_ENGINE: 
"true" + ENABLE_EXPERIMENT_MANAGER: "true" + EXPERIMENT_SYSTEM_PROMPT_EXPERIMENT: "three_system_prompt_experiment" + EXPERIMENT_CONDENSER_MAX_STEP: "condenser_max_step_experiment" + CONVERSATION_MANAGER_CLASS: "server.saas_nested_conversation_manager.SaasNestedConversationManager" + INIT_GIT_IN_EMPTY_WORKSPACE: "1" + RUNTIME_URL_PATTERN: "https://{runtime_id}.staging-runtime.all-hands.dev" + OPENHANDS_PROVIDER_BASE_URL: "https://llm-proxy.staging.all-hands.dev/" + JIRA_WEBHOOKS_ENABLED: "true" + EMAIL_PATTERN_BLACKLIST: "%" + EMAIL_PATTERN_WHITELIST: "%@openhands.dev,%@all-hands.dev" + OH_USER_AUTHORIZER_PREVENT_DUPLICATES: "false" + V1_ENABLED: "true" + ENABLE_V1_SLACK_RESOLVER: "true" + ENABLE_V1_GITHUB_RESOLVER: "true" + DUPLICATE_EMAIL_CHECK: "false" + OPENHANDS_SUPPRESS_BANNER: "1" + +litellm: + enabled: true + url: https://llm-proxy.staging.all-hands.dev + teamId: 62ea39c4-8886-44f3-b7ce-07ed4fe42d2c + auth: + existingSecret: lite-llm-api-key + envVars: + JSON_LOGS: "true" + +keycloak: + enabled: false + url: "http://keycloak.keycloak" + +laminar: + enabled: true + global: + cloudProvider: "gcp" + clickhouse: + s3: + enabled: false + appServer: + ingress: + hostname: "laminar-api.staging.all-hands.dev" + frontend: + extraEnv: + - name: AUTH_KEYCLOAK_ID + valueFrom: + secretKeyRef: + name: keycloak-realm + key: client-id + - name: AUTH_KEYCLOAK_SECRET + valueFrom: + secretKeyRef: + name: keycloak-realm + key: client-secret + - name: AUTH_KEYCLOAK_ISSUER + value: "https://auth.staging.all-hands.dev/realms/allhands" + ingress: + hostname: "laminar.staging.all-hands.dev" + env: + nextauthUrl: "https://laminar.staging.all-hands.dev" + nextPublicUrl: "https://laminar.staging.all-hands.dev" + storage: + storageClass: + type: "hyperdisk-balanced" + +stripe: + enabled: true + auth: + existingSecret: stripe-api-key + +resend: + enabled: true + auth: + existingSecret: resend-api-key + +gitlabWebhookInstallation: + enabled: true + resources: + requests: + memory: "512Mi" 
+ cpu: "200m" + limits: + memory: "512Mi" + cpu: "200m" + +bitbucket: + enabled: true + auth: + existingSecret: bitbucket-app + +jira: + enabled: true + +enrichUserInteractionData: + enabled: true + +githubProxy: + endpointsEnabled: true + +gitlab: + enabled: true + +integrationEvents: + deployment: + replicas: 2 + resources: + requests: + memory: 2.5Gi + cpu: 1000m + limits: + memory: 2.5Gi + cpu: 1000m + uvicorn: + workers: 2 + +proactiveConvoClean: + enabled: true + schedule: "*/15 * * * *" + successfulJobsHistoryLimit: 3 + backoffLimit: 3 + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "100m" + +slack: + enabled: true + clientId: "7477886716822.8865243365329" + +debuggingRoutes: + enabled: true + +deployment: + replicas: 2 + resources: + requests: + memory: 5Gi + cpu: 1000m + limits: + memory: 5Gi + cpu: 1000m + +commonRoomSync: + enabled: true + schedule: "0 * * * *" + +datadog: + enabled: true + env: "staging" + service: "deploy" + agentHost: "datadog-agent.all-hands-system.svc.cluster.local" + +appConfig: + POSTHOG_CLIENT_KEY: "phc_Wj1DvqGQgl5ml0bkZvPr55sxIvJWjlGmoHYZrxdh5qD" + POSTHOG_HOST: "https://us.i.posthog.com" + +tavily: + enabled: true + +postgresql: + enabled: false + +redis: + master: + resources: + requests: + memory: 1Gi + cpu: 500m + limits: + memory: 1Gi + cpu: 500m + +gcpMonitoring: + enabled: true + +automationServiceKey: + enabled: true + existingSecret: automation-service-key + secretKey: automation-service-key + +automation: + enabled: true + image: + repository: ghcr.io/openhands/automation + imagePullSecrets: + - name: ghcr-login-secret + deployment: + replicas: 3 + resources: + requests: + memory: 512Mi + cpu: 200m + limits: + memory: 1Gi + cpu: 500m + serviceAccount: + name: automation-sa + annotations: + iam.gke.io/gcp-service-account: automation-sa@staging-092324.iam.gserviceaccount.com + openhandsApiUrl: "https://staging-pathroute.all-hands.dev" + automationBaseUrl: 
"https://staging-pathroute.all-hands.dev" + postgresql: + enabled: false + database: + host: "" + port: "5432" + user: "automation_user" + name: "automations" + createDatabaseUser: false + secretName: "automation-db-secret" + secretKey: "db-password" + gcp: + dbInstance: "application-db" + project: "staging-092324" + region: "us-central1" + filestore: + ephemeral: false + bucket: "staging-openhands-sessions" + type: gcs + minio: + enabled: false + serviceKeyFromSecret: + name: automation-service-key + key: automation-service-key + datadog: + env: "staging" + env: + AUTOMATION_SCHEDULER_INTERVAL_SECONDS: "30" + AUTOMATION_LOG_LEVEL: "info" + GCS_BUCKET_NAME: "staging-openhands-sessions" + +runtime: + runAsRoot: true diff --git a/envs/staging-subdomain/README.md b/envs/staging-subdomain/README.md new file mode 100644 index 00000000..b84eabde --- /dev/null +++ b/envs/staging-subdomain/README.md @@ -0,0 +1,114 @@ +# Staging Subdomain Environment Configuration + +This directory contains the configuration for deploying OpenHands to the **staging-subdomain** environment. + +## Environment Overview + +This environment uses **subdomain-based routing** (same as production pattern): +- Main app: `https://staging-subdomain.all-hands.dev/` +- Integrations still use path-based routes on the main domain (GitHub/GitLab webhooks, Stripe, etc.) +- MCP: `https://staging-subdomain.all-hands.dev/mcp/mcp` + +The key difference from staging-pathroute is that this environment tests the production-like subdomain pattern for services that will eventually move to subdomains. 
+ +## Directory Structure + +``` +envs/staging-subdomain/ +├── README.md # This file +├── values.yaml # Helm values (non-secret configuration) +└── secrets/ # SOPS-encrypted Kubernetes secrets + └── *.yaml # Individual secret files +``` + +## Kubernetes Details + +- **Namespace:** `openhands-subdomain` +- **Helm Release:** `openhands-subdomain` +- **GCP Project:** `staging-092324` +- **GKE Cluster:** `staging-core-application` +- **Region:** `us-central1` (passed to the workflow as `GCP_ZONE`) + +## Secrets Management + +Secrets are encrypted using [SOPS](https://github.com/getsops/sops) with GCP KMS encryption. + +### Required Secrets + +The following secrets must be created in `secrets/` before deployment: + +| Secret Name | Description | Required Keys | +|-------------|-------------|---------------| +| `ghcr-login-secret` | GitHub Container Registry pull credentials | `.dockerconfigjson` | +| `lite-llm-api-key` | LiteLLM API key | `api-key` | +| `stripe-api-key` | Stripe API key | `api-key` | +| `resend-api-key` | Resend email API key | `api-key` | +| `bitbucket-app` | Bitbucket OAuth app credentials | `client-id`, `client-secret` | +| `automation-service-key` | Automation service authentication key | `automation-service-key` | +| `automation-db-secret` | Automation database password | `db-password` | +| `keycloak-realm` | Keycloak realm credentials | `client-id`, `client-secret` | + +### Creating/Editing Secrets + +```bash +# Create a new SOPS-encrypted secret (--filename-override makes SOPS match the .sops.yaml rule for the destination path; delete the /tmp file afterwards) +cat <<EOF > /tmp/my-secret.yaml +apiVersion: v1 +kind: Secret +metadata: + name: my-secret +type: Opaque +stringData: + key: "value" +EOF +sops --encrypt --filename-override envs/staging-subdomain/secrets/my-secret.yaml /tmp/my-secret.yaml > envs/staging-subdomain/secrets/my-secret.yaml + +# Edit (decrypts, opens editor, re-encrypts on save) +sops envs/staging-subdomain/secrets/my-secret.yaml + +# View decrypted content +sops --decrypt envs/staging-subdomain/secrets/my-secret.yaml +``` + +## Deployment + +Use the GitHub Actions workflow: + +1. Go to **Actions** → **Deploy to Staging** +2. Click **Run workflow** +3. 
Select environment: `subdomain` or `both` +4. Enter the image tag to deploy + +### Workflow Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `image_tag` | OpenHands image tag to deploy | `main` | +| `environment` | Which environment(s) to deploy | `both` | +| `skip_secrets` | Skip applying secrets | `false` | +| `dry_run` | Template only, don't deploy | `false` | + +## Troubleshooting + +```bash +# Check pods +kubectl get pods -n openhands-subdomain + +# Check Helm release +helm history openhands-subdomain -n openhands-subdomain + +# Check ingress +kubectl get ingress -n openhands-subdomain + +# GCP auth for SOPS +gcloud auth application-default login +``` + +## Comparison with staging-pathroute + +| Aspect | staging-pathroute | staging-subdomain | +|--------|-------------------|-------------------| +| Main URL | `staging-pathroute.all-hands.dev` | `staging-subdomain.all-hands.dev` | +| Routing | Path-based | Subdomain-based (future) | +| Purpose | Test path routing | Test production-like subdomain pattern | +| Namespace | `openhands-pathroute` | `openhands-subdomain` | diff --git a/envs/staging-subdomain/secrets/.gitkeep b/envs/staging-subdomain/secrets/.gitkeep new file mode 100644 index 00000000..3a3b56d0 --- /dev/null +++ b/envs/staging-subdomain/secrets/.gitkeep @@ -0,0 +1 @@ +# Placeholder for SOPS-encrypted secrets diff --git a/envs/staging-subdomain/values.yaml b/envs/staging-subdomain/values.yaml new file mode 100644 index 00000000..f309e88e --- /dev/null +++ b/envs/staging-subdomain/values.yaml @@ -0,0 +1,322 @@ +# Staging (Subdomain-Based Routing) environment values for OpenHands +# This environment uses subdomain-based routing instead of path-based +# e.g., automation.staging-subdomain.all-hands.dev instead of staging.all-hands.dev/api/automation +# Secrets are managed separately in envs/staging-subdomain/secrets/ + +imagePullSecrets: + - name: ghcr-login-secret + +databaseMigrations: + waitForDatabase: false 
+ +ingress: + host: staging-subdomain.all-hands.dev + enabled: true + class: traefik + root: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + # Subdomain-based routing: disable path-based ingresses for integrations/automation + # These will be handled by separate subdomain ingresses (NOTE(review): the integrations/mcp entries below only set cert-manager annotations; no key that disables a path-based ingress is visible here - confirm this matches the intent) + integrations: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + mcp: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + +helm-release-pruner: + enabled: true + job: + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 100m + memory: 256Mi + +sandbox: + apiHostname: https://runtime.staging.all-hands.dev + +runtime-api: + enabled: false + +filestore: + type: google_cloud + bucket: staging-openhands-sessions + +serviceAccount: + annotations: + iam.gke.io/gcp-service-account: openhands-sa@staging-092324.iam.gserviceaccount.com + +migrationJob: + enabled: true + initContainer: + enabled: false + +github: + enabled: true + +env: + ENABLE_BILLING: "true" + OH_APP_MODE: "saas" + OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_BILLING: "true" + OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_JIRA: "true" + OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_USERS_PAGE: "false" + OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_BILLING_PAGE: "false" + OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_INTEGRATIONS_PAGE: "false" + OH_WEB_CLIENT_PROVIDERS_CONFIGURED: '["github","gitlab","bitbucket"]' + OH_WEB_CLIENT_GITHUB_APP_SLUG: "openhands-staging" + OH_APP_CONVERSATION_INFO_KIND: "server.utils.saas_app_conversation_info_injector.SaasAppConversationInfoServiceInjector" + HIDE_LLM_SETTINGS: "false" + GOOGLE_CLOUD_PROJECT: staging-092324 + GCP_PROJECT: staging-092324 + RECAPTCHA_PROJECT_ID: staging-092324 + RECAPTCHA_SITE_KEY: "" + RECAPTCHA_HMAC_SECRET: "qgfN+prMC1iMziHP3YndNicZjgK5IMXITUuVQOnEe9o=" # NOTE(review): HMAC secret committed in plaintext in a values file; presumably it should live in this environment's SOPS-encrypted secrets/ directory and be rotated - confirm + RECAPTCHA_BLOCK_THRESHOLD: "0.3" + GCP_REGION: us-central1 + GCP_DB_INSTANCE: application-db + DB_USER: openhands-user + DB_NAME: openhands + MAX_CONCURRENT_CONVERSATIONS: "10" + DB_POOL_SIZE: "25" + DB_MAX_OVERFLOW: 
"30" + ENABLE_PROACTIVE_CONVERSATION_STARTERS: "false" + ENABLE_SOLVABILITY_ANALYSIS: "true" + ENABLE_MCP_SEARCH_ENGINE: "true" + ENABLE_EXPERIMENT_MANAGER: "true" + EXPERIMENT_SYSTEM_PROMPT_EXPERIMENT: "three_system_prompt_experiment" + EXPERIMENT_CONDENSER_MAX_STEP: "condenser_max_step_experiment" + CONVERSATION_MANAGER_CLASS: "server.saas_nested_conversation_manager.SaasNestedConversationManager" + INIT_GIT_IN_EMPTY_WORKSPACE: "1" + RUNTIME_URL_PATTERN: "https://{runtime_id}.staging-runtime.all-hands.dev" + OPENHANDS_PROVIDER_BASE_URL: "https://llm-proxy.staging.all-hands.dev/" + JIRA_WEBHOOKS_ENABLED: "true" + EMAIL_PATTERN_BLACKLIST: "%" + EMAIL_PATTERN_WHITELIST: "%@openhands.dev,%@all-hands.dev" + OH_USER_AUTHORIZER_PREVENT_DUPLICATES: "false" + V1_ENABLED: "true" + ENABLE_V1_SLACK_RESOLVER: "true" + ENABLE_V1_GITHUB_RESOLVER: "true" + DUPLICATE_EMAIL_CHECK: "false" + OPENHANDS_SUPPRESS_BANNER: "1" + +litellm: + enabled: true + url: https://llm-proxy.staging.all-hands.dev + teamId: 62ea39c4-8886-44f3-b7ce-07ed4fe42d2c + auth: + existingSecret: lite-llm-api-key + envVars: + JSON_LOGS: "true" + +keycloak: + enabled: false + url: "http://keycloak.keycloak" + +laminar: + enabled: true + global: + cloudProvider: "gcp" + clickhouse: + s3: + enabled: false + appServer: + ingress: + hostname: "laminar-api.staging.all-hands.dev" + frontend: + extraEnv: + - name: AUTH_KEYCLOAK_ID + valueFrom: + secretKeyRef: + name: keycloak-realm + key: client-id + - name: AUTH_KEYCLOAK_SECRET + valueFrom: + secretKeyRef: + name: keycloak-realm + key: client-secret + - name: AUTH_KEYCLOAK_ISSUER + value: "https://auth.staging.all-hands.dev/realms/allhands" + ingress: + hostname: "laminar.staging.all-hands.dev" + env: + nextauthUrl: "https://laminar.staging.all-hands.dev" + nextPublicUrl: "https://laminar.staging.all-hands.dev" + storage: + storageClass: + type: "hyperdisk-balanced" + +stripe: + enabled: true + auth: + existingSecret: stripe-api-key + +resend: + enabled: true + auth: 
+ existingSecret: resend-api-key + +gitlabWebhookInstallation: + enabled: true + resources: + requests: + memory: "512Mi" + cpu: "200m" + limits: + memory: "512Mi" + cpu: "200m" + +bitbucket: + enabled: true + auth: + existingSecret: bitbucket-app + +jira: + enabled: true + +enrichUserInteractionData: + enabled: true + +githubProxy: + endpointsEnabled: true + +gitlab: + enabled: true + +integrationEvents: + deployment: + replicas: 2 + resources: + requests: + memory: 2.5Gi + cpu: 1000m + limits: + memory: 2.5Gi + cpu: 1000m + uvicorn: + workers: 2 + +proactiveConvoClean: + enabled: true + schedule: "*/15 * * * *" + successfulJobsHistoryLimit: 3 + backoffLimit: 3 + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "100m" + +slack: + enabled: true + clientId: "7477886716822.8865243365329" + +debuggingRoutes: + enabled: true + +deployment: + replicas: 2 + resources: + requests: + memory: 5Gi + cpu: 1000m + limits: + memory: 5Gi + cpu: 1000m + +commonRoomSync: + enabled: true + schedule: "0 * * * *" + +datadog: + enabled: true + env: "staging" + service: "deploy" + agentHost: "datadog-agent.all-hands-system.svc.cluster.local" + +appConfig: + POSTHOG_CLIENT_KEY: "phc_Wj1DvqGQgl5ml0bkZvPr55sxIvJWjlGmoHYZrxdh5qD" + POSTHOG_HOST: "https://us.i.posthog.com" + +tavily: + enabled: true + +postgresql: + enabled: false + +redis: + master: + resources: + requests: + memory: 1Gi + cpu: 500m + limits: + memory: 1Gi + cpu: 500m + +gcpMonitoring: + enabled: true + +automationServiceKey: + enabled: true + existingSecret: automation-service-key + secretKey: automation-service-key + +automation: + enabled: true + image: + repository: ghcr.io/openhands/automation + imagePullSecrets: + - name: ghcr-login-secret + deployment: + replicas: 3 + resources: + requests: + memory: 512Mi + cpu: 200m + limits: + memory: 1Gi + cpu: 500m + serviceAccount: + name: automation-sa + annotations: + iam.gke.io/gcp-service-account: 
automation-sa@staging-092324.iam.gserviceaccount.com + openhandsApiUrl: "https://staging-subdomain.all-hands.dev" + automationBaseUrl: "https://staging-subdomain.all-hands.dev" + postgresql: + enabled: false + database: + host: "" + port: "5432" + user: "automation_user" + name: "automations" + createDatabaseUser: false + secretName: "automation-db-secret" + secretKey: "db-password" + gcp: + dbInstance: "application-db" + project: "staging-092324" + region: "us-central1" + filestore: + ephemeral: false + bucket: "staging-openhands-sessions" + type: gcs + minio: + enabled: false + serviceKeyFromSecret: + name: automation-service-key + key: automation-service-key + datadog: + env: "staging" + env: + AUTOMATION_SCHEDULER_INTERVAL_SECONDS: "30" + AUTOMATION_LOG_LEVEL: "info" + GCS_BUCKET_NAME: "staging-openhands-sessions" + +runtime: + runAsRoot: true diff --git a/scripts/deploy/decrypt.sh b/scripts/deploy/decrypt.sh new file mode 100644 index 00000000..0a8c3d13 --- /dev/null +++ b/scripts/deploy/decrypt.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Decrypt a SOPS-encrypted file +set -eo pipefail + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +input_file="$1" +decrypted_file="decrypted.yaml" + +if [ ! -f "$input_file" ]; then + echo "Error: File $input_file not found" + exit 1 +fi + +sops --decrypt "$input_file" > "$decrypted_file" + +echo "File decrypted and saved as $decrypted_file" diff --git a/scripts/deploy/safe-apply-secrets.sh b/scripts/deploy/safe-apply-secrets.sh new file mode 100644 index 00000000..93e8e2b7 --- /dev/null +++ b/scripts/deploy/safe-apply-secrets.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Safely apply Kubernetes secrets (create or update without overwriting unchanged data) +set -eo pipefail + +if [ "$#" -lt 1 ]; then + echo "Usage: $0 [kubectl_args...]" + exit 1 +fi + +secret_file="$1" +shift +kubectl_args="$@" + +if [ ! 
-f "$secret_file" ]; then + echo "Error: File $secret_file not found" + exit 1 +fi + +# Apply the secret using server-side apply for idempotent updates +kubectl apply -f "$secret_file" $kubectl_args + +echo "Secret applied successfully" From 4dd4352a4b593118a2806935312adbcc0671cb72 Mon Sep 17 00:00:00 2001 From: Saurya Velagapudi Date: Mon, 13 Apr 2026 13:02:43 -0700 Subject: [PATCH 2/6] Address PR review: eliminate duplication with matrix strategy and common values - Refactor workflow to use matrix strategy (eliminates ~150 lines duplication) - Extract common staging config to envs/common/values.yaml - Slim down environment-specific values to only host/URL overrides - Remove unnecessary wrapper scripts (decrypt.sh, safe-apply-secrets.sh) - Update READMEs to document new structure Co-authored-by: openhands --- .github/workflows/deploy-staging.yml | 219 ++++-------------- envs/common/values.yaml | 315 ++++++++++++++++++++++++++ envs/staging-pathroute/README.md | 17 +- envs/staging-pathroute/values.yaml | 310 +------------------------- envs/staging-subdomain/README.md | 17 +- envs/staging-subdomain/values.yaml | 319 +-------------------------- scripts/deploy/decrypt.sh | 20 -- scripts/deploy/safe-apply-secrets.sh | 22 -- 8 files changed, 387 insertions(+), 852 deletions(-) create mode 100644 envs/common/values.yaml delete mode 100644 scripts/deploy/decrypt.sh delete mode 100644 scripts/deploy/safe-apply-secrets.sh diff --git a/.github/workflows/deploy-staging.yml b/.github/workflows/deploy-staging.yml index 94f2b66d..8e0f7950 100644 --- a/.github/workflows/deploy-staging.yml +++ b/.github/workflows/deploy-staging.yml @@ -1,7 +1,6 @@ name: Deploy to Staging on: - # Manual trigger with options workflow_dispatch: inputs: image_tag: @@ -32,17 +31,26 @@ env: GCP_CLUSTER: staging-core-application jobs: - deploy-pathroute: - name: Deploy to staging-pathroute - if: ${{ inputs.environment == 'both' || inputs.environment == 'pathroute' }} + deploy: + name: Deploy to 
staging-${{ matrix.env.name }} runs-on: ubuntu-24.04 permissions: contents: read id-token: write - env: - NAMESPACE: openhands-pathroute - HELM_RELEASE: openhands-pathroute - ENV_DIR: envs/staging-pathroute + strategy: + fail-fast: false + matrix: + env: + - name: pathroute + namespace: openhands-pathroute + helm_release: openhands-pathroute + env_dir: envs/staging-pathroute + - name: subdomain + namespace: openhands-subdomain + helm_release: openhands-subdomain + env_dir: envs/staging-subdomain + # Only run for selected environment(s) + if: ${{ inputs.environment == 'both' || inputs.environment == matrix.env.name }} steps: - name: Checkout repository @@ -69,149 +77,7 @@ jobs: uses: google-github-actions/setup-gcloud@v2 - name: Install gke-gcloud-auth-plugin - run: | - gcloud components install gke-gcloud-auth-plugin - - - name: Configure kubectl - run: | - gcloud container clusters get-credentials ${{ env.GCP_CLUSTER }} \ - --zone ${{ env.GCP_ZONE }} \ - --project ${{ env.GCP_PROJECT }} - - - name: Create namespace if not exists - if: ${{ !inputs.dry_run }} - run: | - kubectl create namespace ${{ env.NAMESPACE }} --dry-run=client -o yaml | kubectl apply -f - - - - name: Decrypt and apply secrets - if: ${{ !inputs.skip_secrets && !inputs.dry_run }} - run: | - SECRETS_DIR="${{ env.ENV_DIR }}/secrets" - - if [[ -d "$SECRETS_DIR" ]]; then - echo "Applying secrets from $SECRETS_DIR" - for file in "$SECRETS_DIR"/*.yaml; do - # Skip .gitkeep or non-existent files - [[ -e "$file" ]] || continue - [[ "$(basename "$file")" == ".gitkeep" ]] && continue - - echo "Decrypting and applying: $file" - sops --decrypt "$file" | kubectl apply -n ${{ env.NAMESPACE }} -f - - done - echo "All secrets applied successfully" - else - echo "No secrets directory found at $SECRETS_DIR" - fi - - - name: Update Helm dependencies - run: | - helm dependency update charts/openhands - - - name: Helm template (dry run) - if: ${{ inputs.dry_run }} - run: | - helm template ${{ env.HELM_RELEASE }} 
charts/openhands \ - --namespace ${{ env.NAMESPACE }} \ - --values ${{ env.ENV_DIR }}/values.yaml \ - --set image.tag=${{ inputs.image_tag }} \ - --debug - - - name: Deploy with Helm - if: ${{ !inputs.dry_run }} - run: | - # Check current release status - if helm status ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} &>/dev/null; then - status=$(helm status ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} -o json | jq -r '.info.status') - if [[ "$status" != "deployed" ]]; then - echo "Found release in non-deployed state ($status). Attempting rollback..." - helm rollback ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} || true - fi - fi - - helm upgrade --install \ - --wait \ - --timeout 10m \ - ${{ env.HELM_RELEASE }} \ - charts/openhands \ - --namespace ${{ env.NAMESPACE }} \ - --values ${{ env.ENV_DIR }}/values.yaml \ - --set image.tag=${{ inputs.image_tag }} \ - --debug - - - name: Get deployment info - if: ${{ !inputs.dry_run }} - id: deployment_info - run: | - echo "## Deployment Summary (pathroute)" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Environment:** staging-pathroute" >> $GITHUB_STEP_SUMMARY - echo "- **Namespace:** ${{ env.NAMESPACE }}" >> $GITHUB_STEP_SUMMARY - echo "- **Image Tag:** ${{ inputs.image_tag }}" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - # Get ingress hostname - hostname=$(kubectl get ing -n ${{ env.NAMESPACE }} -o jsonpath='{.items[0].spec.rules[0].host}' 2>/dev/null || echo "N/A") - echo "- **Hostname:** https://$hostname" >> $GITHUB_STEP_SUMMARY - echo "hostname=$hostname" >> $GITHUB_OUTPUT - - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Pods Status" >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - kubectl get pods -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - - - name: Verify deployment health - if: ${{ !inputs.dry_run }} - run: | - echo "Waiting for deployment to stabilize..." 
- kubectl rollout status deployment -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} --timeout=5m || true - - echo "" - echo "Current pod status:" - kubectl get pods -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} - - outputs: - hostname: ${{ steps.deployment_info.outputs.hostname }} - - deploy-subdomain: - name: Deploy to staging-subdomain - if: ${{ inputs.environment == 'both' || inputs.environment == 'subdomain' }} - runs-on: ubuntu-24.04 - permissions: - contents: read - id-token: write - env: - NAMESPACE: openhands-subdomain - HELM_RELEASE: openhands-subdomain - ENV_DIR: envs/staging-subdomain - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install SOPS - run: | - curl -L "https://github.com/mozilla/sops/releases/download/v3.9.1/sops-v3.9.1.linux.amd64" -o sops - chmod +x sops - sudo mv sops /usr/local/bin/sops - sops --version - - - name: Install Helm - uses: azure/setup-helm@v3 - with: - version: 'latest' - - - name: Authenticate with Google Cloud - uses: google-github-actions/auth@v2 - with: - credentials_json: ${{ secrets.GCP_SERVICE_KEY }} - - - name: Set up Google Cloud SDK - uses: google-github-actions/setup-gcloud@v2 - - - name: Install gke-gcloud-auth-plugin - run: | - gcloud components install gke-gcloud-auth-plugin + run: gcloud components install gke-gcloud-auth-plugin - name: Configure kubectl run: | @@ -222,60 +88,57 @@ jobs: - name: Create namespace if not exists if: ${{ !inputs.dry_run }} run: | - kubectl create namespace ${{ env.NAMESPACE }} --dry-run=client -o yaml | kubectl apply -f - + kubectl create namespace ${{ matrix.env.namespace }} --dry-run=client -o yaml | kubectl apply -f - - name: Decrypt and apply secrets if: ${{ !inputs.skip_secrets && !inputs.dry_run }} run: | - SECRETS_DIR="${{ env.ENV_DIR }}/secrets" - + SECRETS_DIR="${{ matrix.env.env_dir }}/secrets" if [[ -d "$SECRETS_DIR" ]]; then echo "Applying secrets from $SECRETS_DIR" for file 
in "$SECRETS_DIR"/*.yaml; do - # Skip .gitkeep or non-existent files [[ -e "$file" ]] || continue [[ "$(basename "$file")" == ".gitkeep" ]] && continue - echo "Decrypting and applying: $file" - sops --decrypt "$file" | kubectl apply -n ${{ env.NAMESPACE }} -f - + sops --decrypt "$file" | kubectl apply -n ${{ matrix.env.namespace }} -f - done - echo "All secrets applied successfully" else echo "No secrets directory found at $SECRETS_DIR" fi - name: Update Helm dependencies - run: | - helm dependency update charts/openhands + run: helm dependency update charts/openhands - name: Helm template (dry run) if: ${{ inputs.dry_run }} run: | - helm template ${{ env.HELM_RELEASE }} charts/openhands \ - --namespace ${{ env.NAMESPACE }} \ - --values ${{ env.ENV_DIR }}/values.yaml \ + helm template ${{ matrix.env.helm_release }} charts/openhands \ + --namespace ${{ matrix.env.namespace }} \ + --values envs/common/values.yaml \ + --values ${{ matrix.env.env_dir }}/values.yaml \ --set image.tag=${{ inputs.image_tag }} \ --debug - name: Deploy with Helm if: ${{ !inputs.dry_run }} run: | - # Check current release status - if helm status ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} &>/dev/null; then - status=$(helm status ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} -o json | jq -r '.info.status') + # Check current release status and rollback if needed + if helm status ${{ matrix.env.helm_release }} -n ${{ matrix.env.namespace }} &>/dev/null; then + status=$(helm status ${{ matrix.env.helm_release }} -n ${{ matrix.env.namespace }} -o json | jq -r '.info.status') if [[ "$status" != "deployed" ]]; then echo "Found release in non-deployed state ($status). Attempting rollback..." 
- helm rollback ${{ env.HELM_RELEASE }} -n ${{ env.NAMESPACE }} || true + helm rollback ${{ matrix.env.helm_release }} -n ${{ matrix.env.namespace }} || true fi fi helm upgrade --install \ --wait \ --timeout 10m \ - ${{ env.HELM_RELEASE }} \ + ${{ matrix.env.helm_release }} \ charts/openhands \ - --namespace ${{ env.NAMESPACE }} \ - --values ${{ env.ENV_DIR }}/values.yaml \ + --namespace ${{ matrix.env.namespace }} \ + --values envs/common/values.yaml \ + --values ${{ matrix.env.env_dir }}/values.yaml \ --set image.tag=${{ inputs.image_tag }} \ --debug @@ -283,33 +146,31 @@ jobs: if: ${{ !inputs.dry_run }} id: deployment_info run: | - echo "## Deployment Summary (subdomain)" >> $GITHUB_STEP_SUMMARY + echo "## Deployment Summary (${{ matrix.env.name }})" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Environment:** staging-subdomain" >> $GITHUB_STEP_SUMMARY - echo "- **Namespace:** ${{ env.NAMESPACE }}" >> $GITHUB_STEP_SUMMARY + echo "- **Environment:** staging-${{ matrix.env.name }}" >> $GITHUB_STEP_SUMMARY + echo "- **Namespace:** ${{ matrix.env.namespace }}" >> $GITHUB_STEP_SUMMARY echo "- **Image Tag:** ${{ inputs.image_tag }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - # Get ingress hostname - hostname=$(kubectl get ing -n ${{ env.NAMESPACE }} -o jsonpath='{.items[0].spec.rules[0].host}' 2>/dev/null || echo "N/A") + hostname=$(kubectl get ing -n ${{ matrix.env.namespace }} -o jsonpath='{.items[0].spec.rules[0].host}' 2>/dev/null || echo "N/A") echo "- **Hostname:** https://$hostname" >> $GITHUB_STEP_SUMMARY echo "hostname=$hostname" >> $GITHUB_OUTPUT echo "" >> $GITHUB_STEP_SUMMARY echo "### Pods Status" >> $GITHUB_STEP_SUMMARY echo '```' >> $GITHUB_STEP_SUMMARY - kubectl get pods -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} >> $GITHUB_STEP_SUMMARY + kubectl get pods -n ${{ matrix.env.namespace }} -l app.kubernetes.io/instance=${{ matrix.env.helm_release }} >> $GITHUB_STEP_SUMMARY echo '```' 
>> $GITHUB_STEP_SUMMARY - name: Verify deployment health if: ${{ !inputs.dry_run }} run: | echo "Waiting for deployment to stabilize..." - kubectl rollout status deployment -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} --timeout=5m || true - + kubectl rollout status deployment -n ${{ matrix.env.namespace }} -l app.kubernetes.io/instance=${{ matrix.env.helm_release }} --timeout=5m || true echo "" echo "Current pod status:" - kubectl get pods -n ${{ env.NAMESPACE }} -l app.kubernetes.io/instance=${{ env.HELM_RELEASE }} + kubectl get pods -n ${{ matrix.env.namespace }} -l app.kubernetes.io/instance=${{ matrix.env.helm_release }} outputs: hostname: ${{ steps.deployment_info.outputs.hostname }} diff --git a/envs/common/values.yaml b/envs/common/values.yaml new file mode 100644 index 00000000..29f6c042 --- /dev/null +++ b/envs/common/values.yaml @@ -0,0 +1,315 @@ +# Common staging environment values for OpenHands +# Environment-specific values (host, URLs) are in envs/staging-*/values.yaml + +imagePullSecrets: + - name: ghcr-login-secret + +databaseMigrations: + waitForDatabase: false + +ingress: + enabled: true + class: traefik + root: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + integrations: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + mcp: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + +helm-release-pruner: + enabled: true + job: + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 100m + memory: 256Mi + +sandbox: + apiHostname: https://runtime.staging.all-hands.dev + +runtime-api: + enabled: false + +filestore: + type: google_cloud + bucket: staging-openhands-sessions + +serviceAccount: + annotations: + iam.gke.io/gcp-service-account: openhands-sa@staging-092324.iam.gserviceaccount.com + +migrationJob: + enabled: true + initContainer: + enabled: false + +github: + enabled: true + +env: + ENABLE_BILLING: "true" + OH_APP_MODE: "saas" + 
OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_BILLING: "true" + OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_JIRA: "true" + OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_USERS_PAGE: "false" + OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_BILLING_PAGE: "false" + OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_INTEGRATIONS_PAGE: "false" + OH_WEB_CLIENT_PROVIDERS_CONFIGURED: '["github","gitlab","bitbucket"]' + OH_WEB_CLIENT_GITHUB_APP_SLUG: "openhands-staging" + OH_APP_CONVERSATION_INFO_KIND: "server.utils.saas_app_conversation_info_injector.SaasAppConversationInfoServiceInjector" + HIDE_LLM_SETTINGS: "false" + GOOGLE_CLOUD_PROJECT: staging-092324 + GCP_PROJECT: staging-092324 + RECAPTCHA_PROJECT_ID: staging-092324 + RECAPTCHA_SITE_KEY: "" + RECAPTCHA_HMAC_SECRET: "qgfN+prMC1iMziHP3YndNicZjgK5IMXITUuVQOnEe9o=" + RECAPTCHA_BLOCK_THRESHOLD: "0.3" + GCP_REGION: us-central1 + GCP_DB_INSTANCE: application-db + DB_USER: openhands-user + DB_NAME: openhands + MAX_CONCURRENT_CONVERSATIONS: "10" + DB_POOL_SIZE: "25" + DB_MAX_OVERFLOW: "30" + ENABLE_PROACTIVE_CONVERSATION_STARTERS: "false" + ENABLE_SOLVABILITY_ANALYSIS: "true" + ENABLE_MCP_SEARCH_ENGINE: "true" + ENABLE_EXPERIMENT_MANAGER: "true" + EXPERIMENT_SYSTEM_PROMPT_EXPERIMENT: "three_system_prompt_experiment" + EXPERIMENT_CONDENSER_MAX_STEP: "condenser_max_step_experiment" + CONVERSATION_MANAGER_CLASS: "server.saas_nested_conversation_manager.SaasNestedConversationManager" + INIT_GIT_IN_EMPTY_WORKSPACE: "1" + RUNTIME_URL_PATTERN: "https://{runtime_id}.staging-runtime.all-hands.dev" + OPENHANDS_PROVIDER_BASE_URL: "https://llm-proxy.staging.all-hands.dev/" + JIRA_WEBHOOKS_ENABLED: "true" + EMAIL_PATTERN_BLACKLIST: "%" + EMAIL_PATTERN_WHITELIST: "%@openhands.dev,%@all-hands.dev" + OH_USER_AUTHORIZER_PREVENT_DUPLICATES: "false" + V1_ENABLED: "true" + ENABLE_V1_SLACK_RESOLVER: "true" + ENABLE_V1_GITHUB_RESOLVER: "true" + DUPLICATE_EMAIL_CHECK: "false" + OPENHANDS_SUPPRESS_BANNER: "1" + +litellm: + enabled: true + url: https://llm-proxy.staging.all-hands.dev + teamId: 
62ea39c4-8886-44f3-b7ce-07ed4fe42d2c + auth: + existingSecret: lite-llm-api-key + envVars: + JSON_LOGS: "true" + +keycloak: + enabled: false + url: "http://keycloak.keycloak" + +laminar: + enabled: true + global: + cloudProvider: "gcp" + clickhouse: + s3: + enabled: false + appServer: + ingress: + hostname: "laminar-api.staging.all-hands.dev" + frontend: + extraEnv: + - name: AUTH_KEYCLOAK_ID + valueFrom: + secretKeyRef: + name: keycloak-realm + key: client-id + - name: AUTH_KEYCLOAK_SECRET + valueFrom: + secretKeyRef: + name: keycloak-realm + key: client-secret + - name: AUTH_KEYCLOAK_ISSUER + value: "https://auth.staging.all-hands.dev/realms/allhands" + ingress: + hostname: "laminar.staging.all-hands.dev" + env: + nextauthUrl: "https://laminar.staging.all-hands.dev" + nextPublicUrl: "https://laminar.staging.all-hands.dev" + storage: + storageClass: + type: "hyperdisk-balanced" + +stripe: + enabled: true + auth: + existingSecret: stripe-api-key + +resend: + enabled: true + auth: + existingSecret: resend-api-key + +gitlabWebhookInstallation: + enabled: true + resources: + requests: + memory: "512Mi" + cpu: "200m" + limits: + memory: "512Mi" + cpu: "200m" + +bitbucket: + enabled: true + auth: + existingSecret: bitbucket-app + +jira: + enabled: true + +enrichUserInteractionData: + enabled: true + +githubProxy: + endpointsEnabled: true + +gitlab: + enabled: true + +integrationEvents: + deployment: + replicas: 2 + resources: + requests: + memory: 2.5Gi + cpu: 1000m + limits: + memory: 2.5Gi + cpu: 1000m + uvicorn: + workers: 2 + +proactiveConvoClean: + enabled: true + schedule: "*/15 * * * *" + successfulJobsHistoryLimit: 3 + backoffLimit: 3 + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "100m" + +slack: + enabled: true + clientId: "7477886716822.8865243365329" + +debuggingRoutes: + enabled: true + +deployment: + replicas: 2 + resources: + requests: + memory: 5Gi + cpu: 1000m + limits: + memory: 5Gi + cpu: 1000m + 
+commonRoomSync: + enabled: true + schedule: "0 * * * *" + +datadog: + enabled: true + env: "staging" + service: "deploy" + agentHost: "datadog-agent.all-hands-system.svc.cluster.local" + +appConfig: + POSTHOG_CLIENT_KEY: "phc_Wj1DvqGQgl5ml0bkZvPr55sxIvJWjlGmoHYZrxdh5qD" + POSTHOG_HOST: "https://us.i.posthog.com" + +tavily: + enabled: true + +postgresql: + enabled: false + +redis: + master: + resources: + requests: + memory: 1Gi + cpu: 500m + limits: + memory: 1Gi + cpu: 500m + +gcpMonitoring: + enabled: true + +automationServiceKey: + enabled: true + existingSecret: automation-service-key + secretKey: automation-service-key + +automation: + enabled: true + image: + repository: ghcr.io/openhands/automation + imagePullSecrets: + - name: ghcr-login-secret + deployment: + replicas: 3 + resources: + requests: + memory: 512Mi + cpu: 200m + limits: + memory: 1Gi + cpu: 500m + serviceAccount: + name: automation-sa + annotations: + iam.gke.io/gcp-service-account: automation-sa@staging-092324.iam.gserviceaccount.com + postgresql: + enabled: false + database: + host: "" + port: "5432" + user: "automation_user" + name: "automations" + createDatabaseUser: false + secretName: "automation-db-secret" + secretKey: "db-password" + gcp: + dbInstance: "application-db" + project: "staging-092324" + region: "us-central1" + filestore: + ephemeral: false + bucket: "staging-openhands-sessions" + type: gcs + minio: + enabled: false + serviceKeyFromSecret: + name: automation-service-key + key: automation-service-key + datadog: + env: "staging" + env: + AUTOMATION_SCHEDULER_INTERVAL_SECONDS: "30" + AUTOMATION_LOG_LEVEL: "info" + GCS_BUCKET_NAME: "staging-openhands-sessions" + +runtime: + runAsRoot: true diff --git a/envs/staging-pathroute/README.md b/envs/staging-pathroute/README.md index eaccfda9..83ab1948 100644 --- a/envs/staging-pathroute/README.md +++ b/envs/staging-pathroute/README.md @@ -13,11 +13,18 @@ This environment uses **path-based routing**: ## Directory Structure ``` 
-envs/staging-pathroute/ -├── README.md # This file -├── values.yaml # Helm values (non-secret configuration) -└── secrets/ # SOPS-encrypted Kubernetes secrets - └── *.yaml # Individual secret files +envs/ +├── common/ +│ └── values.yaml # Shared staging config (base) +└── staging-pathroute/ + ├── README.md # This file + ├── values.yaml # Environment-specific overrides (host, URLs) + └── secrets/ # SOPS-encrypted Kubernetes secrets +``` + +Helm is invoked with both values files: +```bash +helm upgrade ... -f envs/common/values.yaml -f envs/staging-pathroute/values.yaml ``` ## Kubernetes Details diff --git a/envs/staging-pathroute/values.yaml b/envs/staging-pathroute/values.yaml index 9417a413..f5d36ff2 100644 --- a/envs/staging-pathroute/values.yaml +++ b/envs/staging-pathroute/values.yaml @@ -1,313 +1,11 @@ -# Staging (Path-Based Routing) environment values for OpenHands -# This environment uses path-based routing: /api/automation, /integration/*, etc. -# Secrets are managed separately in envs/staging-pathroute/secrets/ - -imagePullSecrets: - - name: ghcr-login-secret - -databaseMigrations: - waitForDatabase: false +# Staging (Path-Based Routing) environment values +# Uses path-based routing: /api/automation, /integration/*, etc. 
+# Base config: envs/common/values.yaml +# Secrets: envs/staging-pathroute/secrets/ ingress: host: staging-pathroute.all-hands.dev - enabled: true - class: traefik - root: - annotations: - cert-manager.io/cluster-issuer: letsencrypt - -helm-release-pruner: - enabled: true - job: - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - cpu: 100m - memory: 256Mi - -sandbox: - apiHostname: https://runtime.staging.all-hands.dev - -runtime-api: - enabled: false - -filestore: - type: google_cloud - bucket: staging-openhands-sessions - -serviceAccount: - annotations: - iam.gke.io/gcp-service-account: openhands-sa@staging-092324.iam.gserviceaccount.com - -migrationJob: - enabled: true - initContainer: - enabled: false - -github: - enabled: true - -env: - ENABLE_BILLING: "true" - OH_APP_MODE: "saas" - OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_BILLING: "true" - OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_JIRA: "true" - OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_USERS_PAGE: "false" - OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_BILLING_PAGE: "false" - OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_INTEGRATIONS_PAGE: "false" - OH_WEB_CLIENT_PROVIDERS_CONFIGURED: '["github","gitlab","bitbucket"]' - OH_WEB_CLIENT_GITHUB_APP_SLUG: "openhands-staging" - OH_APP_CONVERSATION_INFO_KIND: "server.utils.saas_app_conversation_info_injector.SaasAppConversationInfoServiceInjector" - HIDE_LLM_SETTINGS: "false" - GOOGLE_CLOUD_PROJECT: staging-092324 - GCP_PROJECT: staging-092324 - RECAPTCHA_PROJECT_ID: staging-092324 - RECAPTCHA_SITE_KEY: "" - RECAPTCHA_HMAC_SECRET: "qgfN+prMC1iMziHP3YndNicZjgK5IMXITUuVQOnEe9o=" - RECAPTCHA_BLOCK_THRESHOLD: "0.3" - GCP_REGION: us-central1 - GCP_DB_INSTANCE: application-db - DB_USER: openhands-user - DB_NAME: openhands - MAX_CONCURRENT_CONVERSATIONS: "10" - DB_POOL_SIZE: "25" - DB_MAX_OVERFLOW: "30" - ENABLE_PROACTIVE_CONVERSATION_STARTERS: "false" - ENABLE_SOLVABILITY_ANALYSIS: "true" - ENABLE_MCP_SEARCH_ENGINE: "true" - ENABLE_EXPERIMENT_MANAGER: "true" - EXPERIMENT_SYSTEM_PROMPT_EXPERIMENT: 
"three_system_prompt_experiment" - EXPERIMENT_CONDENSER_MAX_STEP: "condenser_max_step_experiment" - CONVERSATION_MANAGER_CLASS: "server.saas_nested_conversation_manager.SaasNestedConversationManager" - INIT_GIT_IN_EMPTY_WORKSPACE: "1" - RUNTIME_URL_PATTERN: "https://{runtime_id}.staging-runtime.all-hands.dev" - OPENHANDS_PROVIDER_BASE_URL: "https://llm-proxy.staging.all-hands.dev/" - JIRA_WEBHOOKS_ENABLED: "true" - EMAIL_PATTERN_BLACKLIST: "%" - EMAIL_PATTERN_WHITELIST: "%@openhands.dev,%@all-hands.dev" - OH_USER_AUTHORIZER_PREVENT_DUPLICATES: "false" - V1_ENABLED: "true" - ENABLE_V1_SLACK_RESOLVER: "true" - ENABLE_V1_GITHUB_RESOLVER: "true" - DUPLICATE_EMAIL_CHECK: "false" - OPENHANDS_SUPPRESS_BANNER: "1" - -litellm: - enabled: true - url: https://llm-proxy.staging.all-hands.dev - teamId: 62ea39c4-8886-44f3-b7ce-07ed4fe42d2c - auth: - existingSecret: lite-llm-api-key - envVars: - JSON_LOGS: "true" - -keycloak: - enabled: false - url: "http://keycloak.keycloak" - -laminar: - enabled: true - global: - cloudProvider: "gcp" - clickhouse: - s3: - enabled: false - appServer: - ingress: - hostname: "laminar-api.staging.all-hands.dev" - frontend: - extraEnv: - - name: AUTH_KEYCLOAK_ID - valueFrom: - secretKeyRef: - name: keycloak-realm - key: client-id - - name: AUTH_KEYCLOAK_SECRET - valueFrom: - secretKeyRef: - name: keycloak-realm - key: client-secret - - name: AUTH_KEYCLOAK_ISSUER - value: "https://auth.staging.all-hands.dev/realms/allhands" - ingress: - hostname: "laminar.staging.all-hands.dev" - env: - nextauthUrl: "https://laminar.staging.all-hands.dev" - nextPublicUrl: "https://laminar.staging.all-hands.dev" - storage: - storageClass: - type: "hyperdisk-balanced" - -stripe: - enabled: true - auth: - existingSecret: stripe-api-key - -resend: - enabled: true - auth: - existingSecret: resend-api-key - -gitlabWebhookInstallation: - enabled: true - resources: - requests: - memory: "512Mi" - cpu: "200m" - limits: - memory: "512Mi" - cpu: "200m" - -bitbucket: - enabled: 
true - auth: - existingSecret: bitbucket-app - -jira: - enabled: true - -enrichUserInteractionData: - enabled: true - -githubProxy: - endpointsEnabled: true - -gitlab: - enabled: true - -integrationEvents: - deployment: - replicas: 2 - resources: - requests: - memory: 2.5Gi - cpu: 1000m - limits: - memory: 2.5Gi - cpu: 1000m - uvicorn: - workers: 2 - -proactiveConvoClean: - enabled: true - schedule: "*/15 * * * *" - successfulJobsHistoryLimit: 3 - backoffLimit: 3 - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "256Mi" - cpu: "100m" - -slack: - enabled: true - clientId: "7477886716822.8865243365329" - -debuggingRoutes: - enabled: true - -deployment: - replicas: 2 - resources: - requests: - memory: 5Gi - cpu: 1000m - limits: - memory: 5Gi - cpu: 1000m - -commonRoomSync: - enabled: true - schedule: "0 * * * *" - -datadog: - enabled: true - env: "staging" - service: "deploy" - agentHost: "datadog-agent.all-hands-system.svc.cluster.local" - -appConfig: - POSTHOG_CLIENT_KEY: "phc_Wj1DvqGQgl5ml0bkZvPr55sxIvJWjlGmoHYZrxdh5qD" - POSTHOG_HOST: "https://us.i.posthog.com" - -tavily: - enabled: true - -postgresql: - enabled: false - -redis: - master: - resources: - requests: - memory: 1Gi - cpu: 500m - limits: - memory: 1Gi - cpu: 500m - -gcpMonitoring: - enabled: true - -automationServiceKey: - enabled: true - existingSecret: automation-service-key - secretKey: automation-service-key automation: - enabled: true - image: - repository: ghcr.io/openhands/automation - imagePullSecrets: - - name: ghcr-login-secret - deployment: - replicas: 3 - resources: - requests: - memory: 512Mi - cpu: 200m - limits: - memory: 1Gi - cpu: 500m - serviceAccount: - name: automation-sa - annotations: - iam.gke.io/gcp-service-account: automation-sa@staging-092324.iam.gserviceaccount.com openhandsApiUrl: "https://staging-pathroute.all-hands.dev" automationBaseUrl: "https://staging-pathroute.all-hands.dev" - postgresql: - enabled: false - database: - host: "" - port: "5432" 
- user: "automation_user" - name: "automations" - createDatabaseUser: false - secretName: "automation-db-secret" - secretKey: "db-password" - gcp: - dbInstance: "application-db" - project: "staging-092324" - region: "us-central1" - filestore: - ephemeral: false - bucket: "staging-openhands-sessions" - type: gcs - minio: - enabled: false - serviceKeyFromSecret: - name: automation-service-key - key: automation-service-key - datadog: - env: "staging" - env: - AUTOMATION_SCHEDULER_INTERVAL_SECONDS: "30" - AUTOMATION_LOG_LEVEL: "info" - GCS_BUCKET_NAME: "staging-openhands-sessions" - -runtime: - runAsRoot: true diff --git a/envs/staging-subdomain/README.md b/envs/staging-subdomain/README.md index b84eabde..faed1666 100644 --- a/envs/staging-subdomain/README.md +++ b/envs/staging-subdomain/README.md @@ -14,11 +14,18 @@ The key difference from staging-pathroute is that this environment tests the pro ## Directory Structure ``` -envs/staging-subdomain/ -├── README.md # This file -├── values.yaml # Helm values (non-secret configuration) -└── secrets/ # SOPS-encrypted Kubernetes secrets - └── *.yaml # Individual secret files +envs/ +├── common/ +│ └── values.yaml # Shared staging config (base) +└── staging-subdomain/ + ├── README.md # This file + ├── values.yaml # Environment-specific overrides (host, URLs) + └── secrets/ # SOPS-encrypted Kubernetes secrets +``` + +Helm is invoked with both values files: +```bash +helm upgrade ... 
-f envs/common/values.yaml -f envs/staging-subdomain/values.yaml ``` ## Kubernetes Details diff --git a/envs/staging-subdomain/values.yaml b/envs/staging-subdomain/values.yaml index f309e88e..f8622e48 100644 --- a/envs/staging-subdomain/values.yaml +++ b/envs/staging-subdomain/values.yaml @@ -1,322 +1,11 @@ -# Staging (Subdomain-Based Routing) environment values for OpenHands -# This environment uses subdomain-based routing instead of path-based -# e.g., automation.staging-subdomain.all-hands.dev instead of staging.all-hands.dev/api/automation -# Secrets are managed separately in envs/staging-subdomain/secrets/ - -imagePullSecrets: - - name: ghcr-login-secret - -databaseMigrations: - waitForDatabase: false +# Staging (Subdomain-Based Routing) environment values +# Uses subdomain-based routing: automation.staging-subdomain.all-hands.dev +# Base config: envs/common/values.yaml +# Secrets: envs/staging-subdomain/secrets/ ingress: host: staging-subdomain.all-hands.dev - enabled: true - class: traefik - root: - annotations: - cert-manager.io/cluster-issuer: letsencrypt - # Subdomain-based routing: disable path-based ingresses for integrations/automation - # These will be handled by separate subdomain ingresses - integrations: - annotations: - cert-manager.io/cluster-issuer: letsencrypt - mcp: - annotations: - cert-manager.io/cluster-issuer: letsencrypt - -helm-release-pruner: - enabled: true - job: - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - cpu: 100m - memory: 256Mi - -sandbox: - apiHostname: https://runtime.staging.all-hands.dev - -runtime-api: - enabled: false - -filestore: - type: google_cloud - bucket: staging-openhands-sessions - -serviceAccount: - annotations: - iam.gke.io/gcp-service-account: openhands-sa@staging-092324.iam.gserviceaccount.com - -migrationJob: - enabled: true - initContainer: - enabled: false - -github: - enabled: true - -env: - ENABLE_BILLING: "true" - OH_APP_MODE: "saas" - OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_BILLING: "true" 
- OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_JIRA: "true" - OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_USERS_PAGE: "false" - OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_BILLING_PAGE: "false" - OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_INTEGRATIONS_PAGE: "false" - OH_WEB_CLIENT_PROVIDERS_CONFIGURED: '["github","gitlab","bitbucket"]' - OH_WEB_CLIENT_GITHUB_APP_SLUG: "openhands-staging" - OH_APP_CONVERSATION_INFO_KIND: "server.utils.saas_app_conversation_info_injector.SaasAppConversationInfoServiceInjector" - HIDE_LLM_SETTINGS: "false" - GOOGLE_CLOUD_PROJECT: staging-092324 - GCP_PROJECT: staging-092324 - RECAPTCHA_PROJECT_ID: staging-092324 - RECAPTCHA_SITE_KEY: "" - RECAPTCHA_HMAC_SECRET: "qgfN+prMC1iMziHP3YndNicZjgK5IMXITUuVQOnEe9o=" - RECAPTCHA_BLOCK_THRESHOLD: "0.3" - GCP_REGION: us-central1 - GCP_DB_INSTANCE: application-db - DB_USER: openhands-user - DB_NAME: openhands - MAX_CONCURRENT_CONVERSATIONS: "10" - DB_POOL_SIZE: "25" - DB_MAX_OVERFLOW: "30" - ENABLE_PROACTIVE_CONVERSATION_STARTERS: "false" - ENABLE_SOLVABILITY_ANALYSIS: "true" - ENABLE_MCP_SEARCH_ENGINE: "true" - ENABLE_EXPERIMENT_MANAGER: "true" - EXPERIMENT_SYSTEM_PROMPT_EXPERIMENT: "three_system_prompt_experiment" - EXPERIMENT_CONDENSER_MAX_STEP: "condenser_max_step_experiment" - CONVERSATION_MANAGER_CLASS: "server.saas_nested_conversation_manager.SaasNestedConversationManager" - INIT_GIT_IN_EMPTY_WORKSPACE: "1" - RUNTIME_URL_PATTERN: "https://{runtime_id}.staging-runtime.all-hands.dev" - OPENHANDS_PROVIDER_BASE_URL: "https://llm-proxy.staging.all-hands.dev/" - JIRA_WEBHOOKS_ENABLED: "true" - EMAIL_PATTERN_BLACKLIST: "%" - EMAIL_PATTERN_WHITELIST: "%@openhands.dev,%@all-hands.dev" - OH_USER_AUTHORIZER_PREVENT_DUPLICATES: "false" - V1_ENABLED: "true" - ENABLE_V1_SLACK_RESOLVER: "true" - ENABLE_V1_GITHUB_RESOLVER: "true" - DUPLICATE_EMAIL_CHECK: "false" - OPENHANDS_SUPPRESS_BANNER: "1" - -litellm: - enabled: true - url: https://llm-proxy.staging.all-hands.dev - teamId: 62ea39c4-8886-44f3-b7ce-07ed4fe42d2c - auth: - existingSecret: 
lite-llm-api-key - envVars: - JSON_LOGS: "true" - -keycloak: - enabled: false - url: "http://keycloak.keycloak" - -laminar: - enabled: true - global: - cloudProvider: "gcp" - clickhouse: - s3: - enabled: false - appServer: - ingress: - hostname: "laminar-api.staging.all-hands.dev" - frontend: - extraEnv: - - name: AUTH_KEYCLOAK_ID - valueFrom: - secretKeyRef: - name: keycloak-realm - key: client-id - - name: AUTH_KEYCLOAK_SECRET - valueFrom: - secretKeyRef: - name: keycloak-realm - key: client-secret - - name: AUTH_KEYCLOAK_ISSUER - value: "https://auth.staging.all-hands.dev/realms/allhands" - ingress: - hostname: "laminar.staging.all-hands.dev" - env: - nextauthUrl: "https://laminar.staging.all-hands.dev" - nextPublicUrl: "https://laminar.staging.all-hands.dev" - storage: - storageClass: - type: "hyperdisk-balanced" - -stripe: - enabled: true - auth: - existingSecret: stripe-api-key - -resend: - enabled: true - auth: - existingSecret: resend-api-key - -gitlabWebhookInstallation: - enabled: true - resources: - requests: - memory: "512Mi" - cpu: "200m" - limits: - memory: "512Mi" - cpu: "200m" - -bitbucket: - enabled: true - auth: - existingSecret: bitbucket-app - -jira: - enabled: true - -enrichUserInteractionData: - enabled: true - -githubProxy: - endpointsEnabled: true - -gitlab: - enabled: true - -integrationEvents: - deployment: - replicas: 2 - resources: - requests: - memory: 2.5Gi - cpu: 1000m - limits: - memory: 2.5Gi - cpu: 1000m - uvicorn: - workers: 2 - -proactiveConvoClean: - enabled: true - schedule: "*/15 * * * *" - successfulJobsHistoryLimit: 3 - backoffLimit: 3 - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "256Mi" - cpu: "100m" - -slack: - enabled: true - clientId: "7477886716822.8865243365329" - -debuggingRoutes: - enabled: true - -deployment: - replicas: 2 - resources: - requests: - memory: 5Gi - cpu: 1000m - limits: - memory: 5Gi - cpu: 1000m - -commonRoomSync: - enabled: true - schedule: "0 * * * *" - -datadog: - 
enabled: true - env: "staging" - service: "deploy" - agentHost: "datadog-agent.all-hands-system.svc.cluster.local" - -appConfig: - POSTHOG_CLIENT_KEY: "phc_Wj1DvqGQgl5ml0bkZvPr55sxIvJWjlGmoHYZrxdh5qD" - POSTHOG_HOST: "https://us.i.posthog.com" - -tavily: - enabled: true - -postgresql: - enabled: false - -redis: - master: - resources: - requests: - memory: 1Gi - cpu: 500m - limits: - memory: 1Gi - cpu: 500m - -gcpMonitoring: - enabled: true - -automationServiceKey: - enabled: true - existingSecret: automation-service-key - secretKey: automation-service-key automation: - enabled: true - image: - repository: ghcr.io/openhands/automation - imagePullSecrets: - - name: ghcr-login-secret - deployment: - replicas: 3 - resources: - requests: - memory: 512Mi - cpu: 200m - limits: - memory: 1Gi - cpu: 500m - serviceAccount: - name: automation-sa - annotations: - iam.gke.io/gcp-service-account: automation-sa@staging-092324.iam.gserviceaccount.com openhandsApiUrl: "https://staging-subdomain.all-hands.dev" automationBaseUrl: "https://staging-subdomain.all-hands.dev" - postgresql: - enabled: false - database: - host: "" - port: "5432" - user: "automation_user" - name: "automations" - createDatabaseUser: false - secretName: "automation-db-secret" - secretKey: "db-password" - gcp: - dbInstance: "application-db" - project: "staging-092324" - region: "us-central1" - filestore: - ephemeral: false - bucket: "staging-openhands-sessions" - type: gcs - minio: - enabled: false - serviceKeyFromSecret: - name: automation-service-key - key: automation-service-key - datadog: - env: "staging" - env: - AUTOMATION_SCHEDULER_INTERVAL_SECONDS: "30" - AUTOMATION_LOG_LEVEL: "info" - GCS_BUCKET_NAME: "staging-openhands-sessions" - -runtime: - runAsRoot: true diff --git a/scripts/deploy/decrypt.sh b/scripts/deploy/decrypt.sh deleted file mode 100644 index 0a8c3d13..00000000 --- a/scripts/deploy/decrypt.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Decrypt a SOPS-encrypted file -set -eo pipefail - 
-if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit 1 -fi - -input_file="$1" -decrypted_file="decrypted.yaml" - -if [ ! -f "$input_file" ]; then - echo "Error: File $input_file not found" - exit 1 -fi - -sops --decrypt "$input_file" > "$decrypted_file" - -echo "File decrypted and saved as $decrypted_file" diff --git a/scripts/deploy/safe-apply-secrets.sh b/scripts/deploy/safe-apply-secrets.sh deleted file mode 100644 index 93e8e2b7..00000000 --- a/scripts/deploy/safe-apply-secrets.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Safely apply Kubernetes secrets (create or update without overwriting unchanged data) -set -eo pipefail - -if [ "$#" -lt 1 ]; then - echo "Usage: $0 [kubectl_args...]" - exit 1 -fi - -secret_file="$1" -shift -kubectl_args="$@" - -if [ ! -f "$secret_file" ]; then - echo "Error: File $secret_file not found" - exit 1 -fi - -# Apply the secret using server-side apply for idempotent updates -kubectl apply -f "$secret_file" $kubectl_args - -echo "Secret applied successfully" From e03a9267bc362665832fefc2621dd710c6d998fe Mon Sep 17 00:00:00 2001 From: Saurya Velagapudi Date: Thu, 16 Apr 2026 17:53:52 -0700 Subject: [PATCH 3/6] Add testbed deployment scripts for Platform Team Sandbox - deploy.sh: Script to deploy OpenHands Cloud to shared testbed - setup-shared-cluster.sh: One-time cluster setup script - README.md: Comprehensive guide for team members The testbed is intentionally private (uses /etc/hosts for DNS). Each developer can deploy to their own namespace. 
Co-authored-by: openhands --- scripts/testbed/README.md | 322 ++++++++++++ scripts/testbed/deploy.sh | 605 +++++++++++++++++++++++ scripts/testbed/setup-shared-cluster.sh | 410 +++++++++++++++ scripts/testbed/values-testbed-test.yaml | 125 +++++ 4 files changed, 1462 insertions(+) create mode 100644 scripts/testbed/README.md create mode 100755 scripts/testbed/deploy.sh create mode 100755 scripts/testbed/setup-shared-cluster.sh create mode 100644 scripts/testbed/values-testbed-test.yaml diff --git a/scripts/testbed/README.md b/scripts/testbed/README.md new file mode 100644 index 00000000..ea8e751f --- /dev/null +++ b/scripts/testbed/README.md @@ -0,0 +1,322 @@ +# OpenHands Cloud Testbed + +Deploy OpenHands Cloud to an **internal testbed environment** for testing and development. + +> ⚠️ **Private Environment**: This testbed is NOT publicly accessible. It runs in the +> Platform Team Sandbox GCP project and uses `/etc/hosts` for DNS resolution. + +## Overview + +The testbed provides two deployment modes: + +1. **Shared Testbed** - Multiple developers deploy to namespaces on a shared GKE cluster +2. **Isolated Testbed** - Create your own GKE cluster for complete isolation + +## Quick Start (For Team Members) + +### Prerequisites + +- `gcloud` CLI authenticated (`gcloud auth login`) +- `kubectl` installed +- `helm` v3 installed +- Access to `platform-team-sandbox-62793` GCP project (request via Platform Team) +- An Anthropic API key (get from 1Password or request from team lead) + +### Step 1: Connect to the Shared Cluster + +```bash +# Authenticate with GCP +gcloud auth login +gcloud config set project platform-team-sandbox-62793 + +# Connect to the testbed cluster +gcloud container clusters get-credentials openhands-testbed --region us-central1 +``` + +### Step 2: Deploy Your Instance + +```bash +# Set your API keys +export ANTHROPIC_API_KEY="sk-ant-..." # Required for LLM +export GITHUB_TOKEN="ghp_..." 
# Optional: for pulling latest images + +# Deploy to your own namespace (use your name or feature name) +./deploy.sh --name <name> + +# Examples: +./deploy.sh --name saurya +./deploy.sh --name feature-xyz +``` + +This creates: +- Namespace: `testbed-<name>` +- App hostname: `testbed-<name>.sandbox.all-hands.dev` +- Auth hostname: `auth-testbed-<name>.sandbox.all-hands.dev` +- Runtime hostname: `runtime-testbed-<name>.sandbox.all-hands.dev` + +### Step 3: Configure Local Access + +Since this is a private environment, add entries to your `/etc/hosts`: + +```bash +# Get the LoadBalancer IP +TRAEFIK_IP=$(kubectl get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +echo "LoadBalancer IP: $TRAEFIK_IP" + +# Add to /etc/hosts (replace <name> with your testbed name) +sudo bash -c "echo '$TRAEFIK_IP testbed-<name>.sandbox.all-hands.dev auth-testbed-<name>.sandbox.all-hands.dev runtime-testbed-<name>.sandbox.all-hands.dev' >> /etc/hosts" + +# Example for testbed-saurya: +# sudo bash -c "echo '34.28.75.102 testbed-saurya.sandbox.all-hands.dev auth-testbed-saurya.sandbox.all-hands.dev runtime-testbed-saurya.sandbox.all-hands.dev' >> /etc/hosts" +``` + +### Step 4: Access Your Testbed + +**Option A: Browser with /etc/hosts (recommended)** + +After adding `/etc/hosts` entries, open Chrome/Firefox: +``` +https://testbed-<name>.sandbox.all-hands.dev +``` + +> 💡 **Chrome HTTPS Warning**: When you see the certificate warning, click anywhere on the page +> and type `thisisunsafe` (you won't see it appear). This bypasses the self-signed cert warning. + +**Option B: Port Forward (simplest, but limited)** + +```bash +kubectl port-forward svc/openhands-service 3000:3000 -n testbed-<name> +# Open http://localhost:3000 +``` + +Note: Port forwarding won't work with OAuth callbacks. Use /etc/hosts for full functionality. + +### Step 5: Clean Up When Done + +```bash +./deploy.sh --name <name> --destroy +``` + +**Important**: Please destroy your testbed when you're done to save cluster resources! 
+ +## Deployment Modes + +### Shared Cluster (Default) + +Multiple developers share one GKE cluster with separate namespaces: + +```bash +./deploy.sh --name alice # Creates testbed-alice namespace +./deploy.sh --name bob # Creates testbed-bob namespace +``` + +**Pros:** +- Faster deployment (cluster already exists) +- Lower cost (shared infrastructure) +- Simpler DNS setup (one wildcard domain) + +**Cons:** +- Shared cluster resources +- Potential resource contention + +### Isolated Cluster + +Create your own GKE cluster: + +```bash +./deploy.sh --name mytest --create-cluster +``` + +**Pros:** +- Complete isolation +- Can test cluster-level changes +- No resource contention + +**Cons:** +- Slower setup (~10 minutes for cluster creation) +- Higher cost +- Requires separate DNS setup + +## Configuration + +### Environment Variables + +| Variable | Required | Description | +|----------|----------|-------------| +| `ANTHROPIC_API_KEY` | Yes* | Anthropic API key for LLM | +| `OPENAI_API_KEY` | No | OpenAI API key (alternative) | +| `GITHUB_TOKEN` | Recommended | For pulling images from ghcr.io | +| `GCP_PROJECT` | No | GCP project (default: platform-team-sandbox-62793) | +| `GCP_REGION` | No | GCP region (default: us-central1) | + +*At least one LLM API key is required for the agent to function. 
+ +### Custom Values + +Override values by editing the generated values file (note: `deploy.sh` rewrites `values-testbed-<name>.yaml` on every deploy, so simply re-running it discards manual edits): + +```bash +# Generate default values +./deploy.sh --name mytest --dry-run + +# Edit the generated values +vim values-testbed-mytest.yaml + +# Deploy with custom values (deploy.sh regenerates the file first; apply your edits with helm directly to keep them) +./deploy.sh --name mytest +``` + +## Troubleshooting + +### Check Deployment Status + +```bash +# View all pods +kubectl get pods -n testbed-<name> + +# View logs +kubectl logs -f deployment/openhands -n testbed-<name> + +# View events +kubectl get events -n testbed-<name> --sort-by=.lastTimestamp +``` + +### Certificate Issues + +```bash +# Check certificate status +kubectl get certificates -n testbed-<name> +kubectl describe certificate -n testbed-<name> + +# Check cert-manager logs +kubectl logs -n cert-manager deployment/cert-manager +``` + +### Database Issues + +```bash +# Check PostgreSQL +kubectl get pods -n testbed-<name> -l app.kubernetes.io/name=postgresql + +# Connect to database +kubectl exec -it -n testbed-<name> \ + $(kubectl get pod -n testbed-<name> -l app.kubernetes.io/name=postgresql -o name) \ + -- psql -U postgres +``` + +### Ingress Issues + +```bash +# Check Traefik +kubectl get svc -n traefik +kubectl logs -n traefik deployment/traefik + +# Check ingress +kubectl get ingress -n testbed-<name> +kubectl describe ingress -n testbed-<name> +``` + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Platform Team Sandbox GCP │ +│ (platform-team-sandbox-62793) │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ GKE: openhands-testbed │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ traefik │ │ cert-manager│ │ DNS zone │ │ │ +│ │ │ (ingress) │ │ (TLS certs) │ │ sandbox. │ │ │ +│ │ │ │ │ │ │ all-hands. 
│ │ │ +│ │ │ │ │ │ │ dev │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────────────────────────────────────────────┐ │ │ +│ │ │ Namespace: testbed-alice │ │ │ +│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ +│ │ │ │openhands│ │keycloak │ │litellm │ │postgres │ │ │ │ +│ │ │ │ │ │ (auth) │ │ (llm) │ │ (db) │ │ │ │ +│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │ │ +│ │ └──────────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────────────────────────────────────────────┐ │ │ +│ │ │ Namespace: testbed-bob │ │ │ +│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ +│ │ │ │openhands│ │keycloak │ │litellm │ │postgres │ │ │ │ +│ │ │ │ │ │ (auth) │ │ (llm) │ │ (db) │ │ │ │ +│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │ │ +│ │ └──────────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ └────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Cost Considerations + +### Shared Cluster (Recommended) + +- GKE cluster: ~$72/month (control plane) + ~$100/month (nodes) +- Split across all users + +### Isolated Cluster + +- Same costs per cluster +- Consider deleting when not in use: + ```bash + ./deploy.sh --name mytest --create-cluster --destroy # --create-cluster is required for the cluster itself to be deleted + ``` + +## Network Access (Private by Design) + +This testbed is intentionally **NOT publicly accessible**. Access requires: + +1. GCP project access (`platform-team-sandbox-62793`) +2. kubectl credentials for the cluster +3. Local `/etc/hosts` configuration pointing to the LoadBalancer IP + +### Why No Public DNS? 
+ +- **Security**: Experimental features and internal testing shouldn't be public +- **Simplicity**: No need to manage SSL certificates via Let's Encrypt +- **Isolation**: Each developer's /etc/hosts is independent + +### Getting the LoadBalancer IP + +```bash +# Current LoadBalancer IP +kubectl get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' +# Example output: 34.28.75.102 (may differ; always use the value this command prints) +``` + +### /etc/hosts Configuration + +Add entries for your testbed, replacing `34.28.75.102` with the LoadBalancer IP returned by the command above: + +```bash +# For testbed named "mytest" +34.28.75.102 testbed-mytest.sandbox.all-hands.dev auth-testbed-mytest.sandbox.all-hands.dev runtime-testbed-mytest.sandbox.all-hands.dev + +# For testbed named "saurya" +34.28.75.102 testbed-saurya.sandbox.all-hands.dev auth-testbed-saurya.sandbox.all-hands.dev runtime-testbed-saurya.sandbox.all-hands.dev +``` + +### Optional: Future Public DNS + +If we ever want to make this publicly accessible (with proper authentication), there's +a prepared DNS delegation in the infra repo that can be merged: +- PR: [Add sandbox.all-hands.dev DNS delegation](https://github.com/All-Hands-AI/infra/pull/1165) + +This would enable Let's Encrypt certificates and public DNS resolution. + +## Contributing + +When adding features to the testbed scripts: + +1. Test with `--dry-run` first +2. Ensure cleanup works properly +3. Update this README +4. 
Consider backwards compatibility with existing testbeds diff --git a/scripts/testbed/deploy.sh b/scripts/testbed/deploy.sh new file mode 100755 index 00000000..05215dcc --- /dev/null +++ b/scripts/testbed/deploy.sh @@ -0,0 +1,605 @@ +#!/bin/bash +set -euo pipefail + +# OpenHands Cloud Testbed Deployment Script +# ========================================== +# Deploy OpenHands Cloud to a testbed environment in Platform Team Sandbox +# +# Usage: +# ./deploy.sh # Deploy to shared testbed +# ./deploy.sh --name mytest # Deploy to isolated environment "mytest" +# ./deploy.sh --create-cluster # Create new GKE cluster and deploy +# ./deploy.sh --destroy # Destroy your testbed environment + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +# Defaults +GCP_PROJECT="${GCP_PROJECT:-platform-team-sandbox-62793}" +GCP_REGION="${GCP_REGION:-us-central1}" +SHARED_CLUSTER_NAME="openhands-testbed" +NAMESPACE_PREFIX="testbed" +DNS_DOMAIN="sandbox.all-hands.dev" +CREATE_CLUSTER=false +DESTROY=false +TESTBED_NAME="" +DRY_RUN=false + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } +log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +show_usage() { + cat << EOF +OpenHands Cloud Testbed Deployment + +Usage: $(basename "$0") [OPTIONS] + +Options: + --name NAME Deploy to isolated namespace 'testbed-NAME' (default: shared testbed) + --create-cluster Create a new GKE cluster for this testbed + --destroy Destroy the testbed environment + --dry-run Show what would be done without making changes + --cluster NAME Use specific cluster name (default: $SHARED_CLUSTER_NAME) + --project PROJECT GCP project ID (default: $GCP_PROJECT) + --region REGION GCP region (default: $GCP_REGION) + --help Show this help message 
+ +Examples: + # Deploy current changes to the shared testbed + ./deploy.sh + + # Deploy to your own isolated namespace + ./deploy.sh --name saurya + + # Create a new cluster and deploy (for completely isolated testing) + ./deploy.sh --name mytest --create-cluster + + # Destroy your isolated testbed + ./deploy.sh --name saurya --destroy + +Environment Variables: + GCP_PROJECT GCP project ID (default: platform-team-sandbox-62793) + GCP_REGION GCP region (default: us-central1) + ANTHROPIC_API_KEY API key for Anthropic (required for LLM) + OPENAI_API_KEY API key for OpenAI (optional) + GITHUB_TOKEN GitHub token for image pulls + +EOF + exit 0 +} + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --name) + TESTBED_NAME="$2" + shift 2 + ;; + --create-cluster) + CREATE_CLUSTER=true + shift + ;; + --destroy) + DESTROY=true + shift + ;; + --dry-run) + DRY_RUN=true + shift + ;; + --cluster) + SHARED_CLUSTER_NAME="$2" + shift 2 + ;; + --project) + GCP_PROJECT="$2" + shift 2 + ;; + --region) + GCP_REGION="$2" + shift 2 + ;; + --help|-h) + show_usage + ;; + *) + log_error "Unknown option: $1" + show_usage + ;; + esac +done + +# Determine namespace and cluster names +if [[ -n "$TESTBED_NAME" ]]; then + NAMESPACE="${NAMESPACE_PREFIX}-${TESTBED_NAME}" + if [[ "$CREATE_CLUSTER" == "true" ]]; then + CLUSTER_NAME="testbed-${TESTBED_NAME}" + else + CLUSTER_NAME="$SHARED_CLUSTER_NAME" + fi +else + NAMESPACE="${NAMESPACE_PREFIX}-shared" + CLUSTER_NAME="$SHARED_CLUSTER_NAME" +fi + +HOST_PREFIX="${NAMESPACE}" +APP_HOST="${HOST_PREFIX}.${DNS_DOMAIN}" +RUNTIME_HOST="runtime-${HOST_PREFIX}.${DNS_DOMAIN}" +AUTH_HOST="auth-${HOST_PREFIX}.${DNS_DOMAIN}" + +log_info "Configuration:" +log_info " GCP Project: $GCP_PROJECT" +log_info " GCP Region: $GCP_REGION" +log_info " Cluster: $CLUSTER_NAME" +log_info " Namespace: $NAMESPACE" +log_info " App URL: https://$APP_HOST" + +# Check prerequisites +check_prerequisites() { + log_info "Checking prerequisites..." 
+ + local missing=() + + command -v gcloud >/dev/null 2>&1 || missing+=("gcloud") + command -v kubectl >/dev/null 2>&1 || missing+=("kubectl") + command -v helm >/dev/null 2>&1 || missing+=("helm") + + if [[ ${#missing[@]} -gt 0 ]]; then + log_error "Missing required tools: ${missing[*]}" + exit 1 + fi + + # Check gcloud auth + if ! gcloud auth list --filter=status:ACTIVE --format="value(account)" | grep -q .; then + log_error "Not authenticated with gcloud. Run: gcloud auth login" + exit 1 + fi + + log_success "Prerequisites check passed" +} + +# Create GKE cluster +create_cluster() { + log_info "Creating GKE cluster '$CLUSTER_NAME'..." + + if gcloud container clusters describe "$CLUSTER_NAME" --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then + log_warn "Cluster '$CLUSTER_NAME' already exists" + return 0 + fi + + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY-RUN] Would create cluster '$CLUSTER_NAME'" + return 0 + fi + + gcloud container clusters create "$CLUSTER_NAME" \ + --project="$GCP_PROJECT" \ + --region="$GCP_REGION" \ + --machine-type=e2-standard-4 \ + --num-nodes=1 \ + --enable-autoscaling \ + --min-nodes=1 \ + --max-nodes=5 \ + --disk-size=100 \ + --disk-type=pd-standard \ + --enable-ip-alias \ + --workload-pool="${GCP_PROJECT}.svc.id.goog" \ + --release-channel=regular \ + --no-enable-basic-auth \ + --metadata disable-legacy-endpoints=true + + log_success "Cluster '$CLUSTER_NAME' created" +} + +# Connect to cluster +connect_cluster() { + log_info "Connecting to cluster '$CLUSTER_NAME'..." + + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY-RUN] Would connect to cluster '$CLUSTER_NAME'" + return 0 + fi + + gcloud container clusters get-credentials "$CLUSTER_NAME" \ + --project="$GCP_PROJECT" \ + --region="$GCP_REGION" + + log_success "Connected to cluster" +} + +# Install third-party dependencies (Traefik, cert-manager) +install_dependencies() { + log_info "Installing cluster dependencies..." 
+ + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY-RUN] Would install Traefik and cert-manager" + return 0 + fi + + # Check if Traefik is already installed + if ! helm list -n traefik 2>/dev/null | grep -q traefik; then + log_info "Installing Traefik..." + helm repo add traefik https://traefik.github.io/charts 2>/dev/null || true + helm repo update + kubectl create namespace traefik 2>/dev/null || true + helm upgrade --install traefik traefik/traefik \ + --namespace traefik \ + --set service.type=LoadBalancer \ + --wait + else + log_info "Traefik already installed" + fi + + # Check if cert-manager is already installed + if ! helm list -n cert-manager 2>/dev/null | grep -q cert-manager; then + log_info "Installing cert-manager..." + helm repo add jetstack https://charts.jetstack.io 2>/dev/null || true + helm repo update + kubectl create namespace cert-manager 2>/dev/null || true + helm upgrade --install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --set crds.enabled=true \ + --wait + + # Create ClusterIssuer for Let's Encrypt + kubectl apply -f - << 'ISSUER_EOF' +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: platform-team@all-hands.dev + privateKeySecretRef: + name: letsencrypt-account-key + solvers: + - http01: + ingress: + class: traefik +ISSUER_EOF + else + log_info "cert-manager already installed" + fi + + log_success "Dependencies installed" +} + +# Create namespace and secrets +setup_namespace() { + log_info "Setting up namespace '$NAMESPACE'..." 
+ + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY-RUN] Would create namespace and secrets" + return 0 + fi + + kubectl create namespace "$NAMESPACE" 2>/dev/null || log_info "Namespace already exists" + + # Generate random secrets + GLOBAL_SECRET=$(head /dev/urandom | LC_ALL=C tr -dc 'A-Za-z0-9' | head -c 32) + + # Create secrets if they don't exist + kubectl get secret jwt-secret -n "$NAMESPACE" >/dev/null 2>&1 || \ + kubectl create secret generic jwt-secret -n "$NAMESPACE" \ + --from-literal=jwt-secret="$GLOBAL_SECRET" + + kubectl get secret keycloak-realm -n "$NAMESPACE" >/dev/null 2>&1 || \ + kubectl create secret generic keycloak-realm -n "$NAMESPACE" \ + --from-literal=realm-name=allhands \ + --from-literal=server-url=http://keycloak \ + --from-literal=client-id=allhands \ + --from-literal=client-secret="$GLOBAL_SECRET" \ + --from-literal=smtp-password="" + + kubectl get secret keycloak-admin -n "$NAMESPACE" >/dev/null 2>&1 || \ + kubectl create secret generic keycloak-admin -n "$NAMESPACE" \ + --from-literal=admin-password="$GLOBAL_SECRET" + + kubectl get secret postgres-password -n "$NAMESPACE" >/dev/null 2>&1 || \ + kubectl create secret generic postgres-password -n "$NAMESPACE" \ + --from-literal=username=postgres \ + --from-literal=password="$GLOBAL_SECRET" \ + --from-literal=postgres-password="$GLOBAL_SECRET" + + kubectl get secret redis -n "$NAMESPACE" >/dev/null 2>&1 || \ + kubectl create secret generic redis -n "$NAMESPACE" \ + --from-literal=redis-password="$GLOBAL_SECRET" + + kubectl get secret lite-llm-api-key -n "$NAMESPACE" >/dev/null 2>&1 || \ + kubectl create secret generic lite-llm-api-key -n "$NAMESPACE" \ + --from-literal=lite-llm-api-key="$GLOBAL_SECRET" + + kubectl get secret admin-password -n "$NAMESPACE" >/dev/null 2>&1 || \ + kubectl create secret generic admin-password -n "$NAMESPACE" \ + --from-literal=admin-password="$GLOBAL_SECRET" + + kubectl get secret default-api-key -n "$NAMESPACE" >/dev/null 2>&1 || \ + kubectl create 
secret generic default-api-key -n "$NAMESPACE" \ + --from-literal=default-api-key="$GLOBAL_SECRET" + + kubectl get secret sandbox-api-key -n "$NAMESPACE" >/dev/null 2>&1 || \ + kubectl create secret generic sandbox-api-key -n "$NAMESPACE" \ + --from-literal=sandbox-api-key="$GLOBAL_SECRET" + + # Create LiteLLM env secrets if API keys are provided + if [[ -n "${ANTHROPIC_API_KEY:-}" ]] || [[ -n "${OPENAI_API_KEY:-}" ]]; then + local litellm_args=() + [[ -n "${ANTHROPIC_API_KEY:-}" ]] && litellm_args+=(--from-literal=ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY") + [[ -n "${OPENAI_API_KEY:-}" ]] && litellm_args+=(--from-literal=OPENAI_API_KEY="$OPENAI_API_KEY") + + kubectl delete secret litellm-env-secrets -n "$NAMESPACE" 2>/dev/null || true + kubectl create secret generic litellm-env-secrets -n "$NAMESPACE" "${litellm_args[@]}" + log_info "Created LiteLLM secrets with provided API keys" + else + log_warn "No ANTHROPIC_API_KEY or OPENAI_API_KEY provided. LLM functionality will not work." + log_warn "Set environment variables and re-run, or create secret manually:" + log_warn " kubectl create secret generic litellm-env-secrets -n $NAMESPACE --from-literal=ANTHROPIC_API_KEY=" + fi + + # Create GitHub image pull secret if token provided + if [[ -n "${GITHUB_TOKEN:-}" ]]; then + kubectl delete secret ghcr-login-secret -n "$NAMESPACE" 2>/dev/null || true + kubectl create secret docker-registry ghcr-login-secret -n "$NAMESPACE" \ + --docker-server=ghcr.io \ + --docker-username=openhands \ + --docker-password="$GITHUB_TOKEN" + log_info "Created GitHub container registry secret" + fi + + log_success "Namespace and secrets configured" +} + +# Generate values file for this deployment +generate_values() { + log_info "Generating Helm values..." 
>&2 + + local values_file="$SCRIPT_DIR/values-${NAMESPACE}.yaml" + + cat > "$values_file" << YAML_EOF +# Auto-generated testbed values for $NAMESPACE +# Generated: $(date) + +# Use in-cluster databases (no external dependencies) +postgresql: + enabled: true + primary: + persistence: + enabled: true + size: 10Gi + +redis: + enabled: true + +# Keycloak for authentication (no GitHub App required) +keycloak: + enabled: true + url: "https://${AUTH_HOST}" + ingress: + enabled: true + hostname: "${AUTH_HOST}" + annotations: + cert-manager.io/cluster-issuer: letsencrypt + +# Disable GitHub auth (use Keycloak email auth instead) +github: + enabled: false + +gitlab: + enabled: false + +bitbucket: + enabled: false + +# Main application ingress +ingress: + enabled: true + host: "${APP_HOST}" + class: traefik + root: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + +tls: + enabled: true + +# Runtime API (for sandbox execution) +runtime-api: + enabled: true + runtimeInSameCluster: true + ingress: + enabled: true + host: "${RUNTIME_HOST}" + annotations: + cert-manager.io/cluster-issuer: letsencrypt + env: + RUNTIME_BASE_URL: "${HOST_PREFIX}.${DNS_DOMAIN}" + STORAGE_CLASS: "standard-rwo" + GCP_PROJECT: "${GCP_PROJECT}" + GCP_REGION: "${GCP_REGION}" + +sandbox: + apiHostname: "https://${RUNTIME_HOST}" + +# LiteLLM proxy for LLM access +litellm: + enabled: true + url: "http://litellm:4000" + +litellm-helm: + enabled: true + ingress: + enabled: false # Internal only for testbed + proxy_config: + environment_variables: + OR_APP_NAME: "OpenHands Testbed" + model_list: + - model_name: "anthropic/claude-sonnet-4-20250514" + litellm_params: + model: "anthropic/claude-sonnet-4-20250514" + api_key: "os.environ/ANTHROPIC_API_KEY" + +# Simplified environment for testbed +env: + OH_APP_MODE: "saas" + LITELLM_DEFAULT_MODEL: "litellm_proxy/anthropic/claude-sonnet-4-20250514" + HIDE_LLM_SETTINGS: "false" + GCP_PROJECT: "${GCP_PROJECT}" + GCP_REGION: "${GCP_REGION}" + +# Filestore - 
use ephemeral for testbed (simpler) +filestore: + ephemeral: true + +# Minimal resources for testbed +deployment: + replicas: 1 + resources: + requests: + memory: 1Gi + cpu: 500m + limits: + memory: 2Gi + cpu: 1000m + +# Disable production features +datadog: + enabled: false + +stripe: + enabled: false + +resend: + enabled: false + +automation: + enabled: false + +laminar: + enabled: false +YAML_EOF + + log_success "Values file generated: $values_file" >&2 + echo "$values_file" +} + +# Deploy OpenHands +deploy_openhands() { + log_info "Deploying OpenHands..." + + local values_file + values_file=$(generate_values) + + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY-RUN] Would deploy OpenHands with values:" + cat "$values_file" + return 0 + fi + + # Build helm dependencies + cd "$REPO_ROOT/charts/openhands" + helm dependency update + + # Deploy + helm upgrade --install openhands . \ + --namespace "$NAMESPACE" \ + --values "$values_file" \ + --wait \ + --timeout 10m + + log_success "OpenHands deployed!" +} + +# Destroy testbed +destroy_testbed() { + log_info "Destroying testbed '$NAMESPACE'..." + + if [[ "$DRY_RUN" == "true" ]]; then + log_info "[DRY-RUN] Would destroy namespace '$NAMESPACE'" + if [[ "$CREATE_CLUSTER" == "true" ]] && [[ -n "$TESTBED_NAME" ]]; then + log_info "[DRY-RUN] Would delete cluster '$CLUSTER_NAME'" + fi + return 0 + fi + + # Delete helm release + helm uninstall openhands --namespace "$NAMESPACE" 2>/dev/null || true + + # Delete namespace (this deletes all resources in it) + kubectl delete namespace "$NAMESPACE" --wait=true 2>/dev/null || true + + # Delete cluster if it was created for this testbed + if [[ "$CREATE_CLUSTER" == "true" ]] && [[ -n "$TESTBED_NAME" ]]; then + log_info "Deleting cluster '$CLUSTER_NAME'..." 
+ gcloud container clusters delete "$CLUSTER_NAME" \ + --project="$GCP_PROJECT" \ + --region="$GCP_REGION" \ + --quiet + fi + + # Clean up values file + rm -f "$SCRIPT_DIR/values-${NAMESPACE}.yaml" + + log_success "Testbed destroyed" +} + +# Show deployment info +show_info() { + log_success "==========================================" + log_success "OpenHands Testbed Deployed!" + log_success "==========================================" + echo "" + echo "Application URL: https://$APP_HOST" + echo "Auth (Keycloak): https://$AUTH_HOST" + echo "Runtime API: https://$RUNTIME_HOST" + echo "" + echo "Namespace: $NAMESPACE" + echo "Cluster: $CLUSTER_NAME" + echo "" + echo "To access:" + echo " 1. Wait for LoadBalancer IP: kubectl get svc -n traefik" + echo " 2. Add DNS records pointing to the LoadBalancer IP" + echo " Or use port-forward: kubectl port-forward svc/openhands-service 3000:3000 -n $NAMESPACE" + echo "" + echo "To destroy:" + echo " ./deploy.sh --name $TESTBED_NAME --destroy" + echo "" + echo "To view logs:" + echo " kubectl logs -f deployment/openhands -n $NAMESPACE" +} + +# Main +main() { + check_prerequisites + + if [[ "$DESTROY" == "true" ]]; then + connect_cluster + destroy_testbed + exit 0 + fi + + if [[ "$CREATE_CLUSTER" == "true" ]]; then + create_cluster + fi + + connect_cluster + + if [[ "$CREATE_CLUSTER" == "true" ]]; then + install_dependencies + fi + + setup_namespace + deploy_openhands + show_info +} + +main diff --git a/scripts/testbed/setup-shared-cluster.sh b/scripts/testbed/setup-shared-cluster.sh new file mode 100755 index 00000000..e22b0595 --- /dev/null +++ b/scripts/testbed/setup-shared-cluster.sh @@ -0,0 +1,410 @@ +#!/bin/bash +set -euo pipefail + +# Setup Shared OpenHands Testbed Cluster +# ====================================== +# One-time setup script to create the shared testbed infrastructure +# in Platform Team Sandbox. Run this once to set up the cluster, +# then use deploy.sh for individual deployments. 
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Configuration
+# GCP_PROJECT, GCP_REGION and CLUSTER_NAME may be overridden from the
+# environment; the DNS zone name and domain are fixed.
+GCP_PROJECT="${GCP_PROJECT:-platform-team-sandbox-62793}"
+GCP_REGION="${GCP_REGION:-us-central1}"
+CLUSTER_NAME="${CLUSTER_NAME:-openhands-testbed}"
+DNS_ZONE_NAME="sandbox-all-hands-dev"
+DNS_DOMAIN="sandbox.all-hands.dev"
+
+# Colors (ANSI escape sequences, interpreted by `echo -e` in the log helpers)
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+# Logging helpers: print "$1" with a colored level prefix.
+log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
+log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
+log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
+
+# Print the usage text and exit.
+#   $1 - exit status (optional, default 0). Pass a non-zero value when the
+#        usage is shown because of an invalid invocation.
+show_usage() {
+    cat << EOF
+Setup Shared OpenHands Testbed Cluster
+
+This script creates the shared GKE cluster and DNS infrastructure
+for the OpenHands testbed environment. Run this once before using
+deploy.sh for individual deployments.
+
+Usage: $(basename "$0") [OPTIONS]
+
+Options:
+  --skip-cluster    Skip GKE cluster creation (only setup addons)
+  --skip-dns        Skip DNS zone creation
+  --destroy         Destroy the shared cluster (CAUTION!)
+  --help            Show this help message
+
+Environment Variables:
+  GCP_PROJECT       GCP project ID (default: $GCP_PROJECT)
+  GCP_REGION        GCP region (default: $GCP_REGION)
+  CLUSTER_NAME      GKE cluster name (default: $CLUSTER_NAME)
+
+EOF
+    exit "${1:-0}"
+}
+
+# Flags set by the command-line options parsed below
+SKIP_CLUSTER=false
+SKIP_DNS=false
+DESTROY=false
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --skip-cluster)
+            SKIP_CLUSTER=true
+            shift
+            ;;
+        --skip-dns)
+            SKIP_DNS=true
+            shift
+            ;;
+        --destroy)
+            DESTROY=true
+            shift
+            ;;
+        --help|-h)
+            show_usage
+            ;;
+        *)
+            log_error "Unknown option: $1"
+            # Invalid invocation: still show the usage text, but exit non-zero
+            # so that callers and CI do not treat the typo as success.
+            show_usage 1
+            ;;
+    esac
+done
+
+# Enable required APIs
+enable_apis() {
+    log_info "Enabling required GCP APIs..."
+
+    local apis=(
+        "container.googleapis.com"
+        "dns.googleapis.com"
+        "compute.googleapis.com"
+        "iam.googleapis.com"
+        "cloudresourcemanager.googleapis.com"
+    )
+
+    # Best-effort: a failure to enable one API does not abort the setup, but
+    # it is reported instead of being silently swallowed and then announced
+    # as a success.
+    local api
+    local failed=0
+    for api in "${apis[@]}"; do
+        if ! gcloud services enable "$api" --project="$GCP_PROJECT" 2>/dev/null; then
+            log_warn "Could not enable API '$api' (continuing)"
+            failed=$((failed + 1))
+        fi
+    done
+
+    if [[ $failed -eq 0 ]]; then
+        log_success "APIs enabled"
+    else
+        log_warn "$failed API(s) could not be enabled; later steps may fail"
+    fi
+}
+
+# Create GKE cluster
+# Idempotent: returns early when the cluster already exists, and only creates
+# the "<cluster>-network" VPC when it is missing.
+create_cluster() {
+    log_info "Creating GKE cluster '$CLUSTER_NAME'..."
+
+    if gcloud container clusters describe "$CLUSTER_NAME" \
+        --project="$GCP_PROJECT" \
+        --region="$GCP_REGION" >/dev/null 2>&1; then
+        log_warn "Cluster '$CLUSTER_NAME' already exists"
+        return 0
+    fi
+
+    # Create VPC network
+    local network_name="${CLUSTER_NAME}-network"
+    if ! gcloud compute networks describe "$network_name" --project="$GCP_PROJECT" >/dev/null 2>&1; then
+        log_info "Creating VPC network '$network_name'..."
+        gcloud compute networks create "$network_name" \
+            --project="$GCP_PROJECT" \
+            --subnet-mode=auto
+    fi
+
+    # Create GKE cluster
+    # NOTE(review): with --region the node counts below presumably apply per
+    # zone, so the effective maximum may be a multiple of 10 - confirm sizing.
+    gcloud container clusters create "$CLUSTER_NAME" \
+        --project="$GCP_PROJECT" \
+        --region="$GCP_REGION" \
+        --network="$network_name" \
+        --machine-type=e2-standard-4 \
+        --num-nodes=1 \
+        --enable-autoscaling \
+        --min-nodes=1 \
+        --max-nodes=10 \
+        --disk-size=100 \
+        --disk-type=pd-standard \
+        --enable-ip-alias \
+        --workload-pool="${GCP_PROJECT}.svc.id.goog" \
+        --release-channel=regular \
+        --no-enable-basic-auth \
+        --metadata disable-legacy-endpoints=true \
+        --addons=HttpLoadBalancing,HorizontalPodAutoscaling \
+        --labels=environment=testbed,team=platform
+
+    log_success "Cluster '$CLUSTER_NAME' created"
+}
+
+# Connect to cluster
+# Fetches kubeconfig credentials for $CLUSTER_NAME so that the kubectl/helm
+# calls made later in this script target it.
+connect_cluster() {
+    log_info "Connecting to cluster..."
+    gcloud container clusters get-credentials "$CLUSTER_NAME" \
+        --project="$GCP_PROJECT" \
+        --region="$GCP_REGION"
+    log_success "Connected to cluster"
+}
+
+# Create DNS zone
+create_dns_zone() {
+    log_info "Setting up DNS zone for '$DNS_DOMAIN'..."
+ + if gcloud dns managed-zones describe "$DNS_ZONE_NAME" \ + --project="$GCP_PROJECT" >/dev/null 2>&1; then + log_warn "DNS zone '$DNS_ZONE_NAME' already exists" + return 0 + fi + + gcloud dns managed-zones create "$DNS_ZONE_NAME" \ + --project="$GCP_PROJECT" \ + --description="DNS zone for OpenHands testbed" \ + --dns-name="${DNS_DOMAIN}." + + log_success "DNS zone created" + + # Show NS records + log_info "DNS zone NS records (delegate these from parent zone):" + gcloud dns managed-zones describe "$DNS_ZONE_NAME" \ + --project="$GCP_PROJECT" \ + --format="value(nameServers)" +} + +# Install Traefik +install_traefik() { + log_info "Installing Traefik ingress controller..." + + if helm list -n traefik 2>/dev/null | grep -q traefik; then + log_warn "Traefik already installed" + return 0 + fi + + helm repo add traefik https://traefik.github.io/charts 2>/dev/null || true + helm repo update + + kubectl create namespace traefik 2>/dev/null || true + + helm upgrade --install traefik traefik/traefik \ + --namespace traefik \ + --set service.type=LoadBalancer \ + --set service.annotations."cloud\.google\.com/load-balancer-type"=External \ + --set ingressClass.enabled=true \ + --set ingressClass.isDefaultClass=true \ + --set providers.kubernetesIngress.publishedService.enabled=true \ + --wait + + log_success "Traefik installed" + + # Wait for LoadBalancer IP + log_info "Waiting for LoadBalancer IP..." + local max_wait=120 + local waited=0 + local lb_ip="" + + while [[ -z "$lb_ip" ]] && [[ $waited -lt $max_wait ]]; do + lb_ip=$(kubectl get svc traefik -n traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) + if [[ -z "$lb_ip" ]]; then + sleep 5 + waited=$((waited + 5)) + fi + done + + if [[ -n "$lb_ip" ]]; then + log_success "LoadBalancer IP: $lb_ip" + echo "" + echo "Add wildcard DNS record: *.${DNS_DOMAIN} -> $lb_ip" + else + log_warn "LoadBalancer IP not yet assigned. 
Check later with:" + log_warn " kubectl get svc traefik -n traefik" + fi +} + +# Install cert-manager +install_cert_manager() { + log_info "Installing cert-manager..." + + if helm list -n cert-manager 2>/dev/null | grep -q cert-manager; then + log_warn "cert-manager already installed" + else + helm repo add jetstack https://charts.jetstack.io 2>/dev/null || true + helm repo update + + kubectl create namespace cert-manager 2>/dev/null || true + + helm upgrade --install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --set crds.enabled=true \ + --wait + + log_success "cert-manager installed" + fi + + # Create ClusterIssuers + log_info "Creating ClusterIssuers..." + + kubectl apply -f - << 'EOF' +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: platform-team@all-hands.dev + privateKeySecretRef: + name: letsencrypt-account-key + solvers: + - http01: + ingress: + class: traefik +--- +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + server: https://acme-staging-v02.api.letsencrypt.org/directory + email: platform-team@all-hands.dev + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - http01: + ingress: + class: traefik +EOF + + log_success "ClusterIssuers created" +} + +# Create storage class +create_storage_class() { + log_info "Checking storage classes..." + + # Check if standard-rwo already exists (it's created by default in GKE) + if kubectl get storageclass standard-rwo >/dev/null 2>&1; then + log_info "Storage class 'standard-rwo' already exists" + else + log_info "Creating storage class 'standard-rwo'..." 
+ kubectl apply -f - << 'EOF' +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: standard-rwo + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: pd.csi.storage.gke.io +parameters: + type: pd-standard +volumeBindingMode: WaitForFirstConsumer +allowVolumeExpansion: true +EOF + fi + + log_success "Storage classes ready" +} + +# Destroy everything +destroy_cluster() { + log_warn "This will destroy the shared testbed cluster and all deployments!" + read -p "Are you sure? (type 'yes' to confirm): " confirm + + if [[ "$confirm" != "yes" ]]; then + log_info "Aborted" + exit 1 + fi + + log_info "Destroying cluster '$CLUSTER_NAME'..." + + # Delete cluster + gcloud container clusters delete "$CLUSTER_NAME" \ + --project="$GCP_PROJECT" \ + --region="$GCP_REGION" \ + --quiet || true + + # Delete DNS zone + log_info "Deleting DNS zone..." + gcloud dns managed-zones delete "$DNS_ZONE_NAME" \ + --project="$GCP_PROJECT" \ + --quiet || true + + # Delete VPC network + local network_name="${CLUSTER_NAME}-network" + log_info "Deleting VPC network..." + gcloud compute networks delete "$network_name" \ + --project="$GCP_PROJECT" \ + --quiet || true + + log_success "Shared cluster destroyed" +} + +# Show status +show_status() { + echo "" + log_success "==========================================" + log_success "Shared Testbed Cluster Ready!" + log_success "==========================================" + echo "" + echo "Cluster: $CLUSTER_NAME" + echo "Project: $GCP_PROJECT" + echo "Region: $GCP_REGION" + echo "DNS Domain: $DNS_DOMAIN" + echo "" + + local lb_ip + lb_ip=$(kubectl get svc traefik -n traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "pending") + echo "LoadBalancer IP: $lb_ip" + echo "" + + if [[ "$lb_ip" != "pending" ]]; then + echo "DNS Setup Required:" + echo " Add wildcard A record: *.${DNS_DOMAIN} -> $lb_ip" + echo "" + fi + + echo "Next Steps:" + echo " 1. 
Set up DNS wildcard record (see above)" + echo " 2. Deploy your testbed:" + echo " cd $(dirname "$SCRIPT_DIR")" + echo " ./testbed/deploy.sh --name " + echo "" +} + +# Main +main() { + log_info "Setting up shared OpenHands testbed cluster..." + log_info "Project: $GCP_PROJECT" + log_info "Region: $GCP_REGION" + log_info "Cluster: $CLUSTER_NAME" + echo "" + + if [[ "$DESTROY" == "true" ]]; then + destroy_cluster + exit 0 + fi + + enable_apis + + if [[ "$SKIP_DNS" != "true" ]]; then + create_dns_zone + fi + + if [[ "$SKIP_CLUSTER" != "true" ]]; then + create_cluster + fi + + connect_cluster + create_storage_class + install_traefik + install_cert_manager + + show_status +} + +main diff --git a/scripts/testbed/values-testbed-test.yaml b/scripts/testbed/values-testbed-test.yaml new file mode 100644 index 00000000..f7c2f3a5 --- /dev/null +++ b/scripts/testbed/values-testbed-test.yaml @@ -0,0 +1,125 @@ +# Auto-generated testbed values for testbed-test +# Generated: Thu Apr 16 16:35:44 PDT 2026 + +# Use in-cluster databases (no external dependencies) +postgresql: + enabled: true + auth: + username: postgres + database: openhands + primary: + persistence: + enabled: true + size: 10Gi + +redis: + enabled: true + +# Keycloak for authentication (no GitHub App required) +# Using internal URL for init container to work without DNS +# For production with DNS, change url to https://auth-testbed-test.sandbox.all-hands.dev +keycloak: + enabled: true + url: "http://keycloak" + ingress: + enabled: true + hostname: "auth-testbed-test.sandbox.all-hands.dev" + annotations: + cert-manager.io/cluster-issuer: letsencrypt + +# Disable GitHub auth (use Keycloak email auth instead) +github: + enabled: false + +gitlab: + enabled: false + +bitbucket: + enabled: false + +# Main application ingress +ingress: + enabled: true + host: "testbed-test.sandbox.all-hands.dev" + class: traefik + root: + annotations: + cert-manager.io/cluster-issuer: letsencrypt + +tls: + enabled: true + +# Runtime API (for 
sandbox execution) +runtime-api: + enabled: true + runtimeInSameCluster: true + ingress: + enabled: true + host: "runtime-testbed-test.sandbox.all-hands.dev" + annotations: + cert-manager.io/cluster-issuer: letsencrypt + env: + RUNTIME_BASE_URL: "testbed-test.sandbox.all-hands.dev" + STORAGE_CLASS: "standard-rwo" + GCP_PROJECT: "platform-team-sandbox-62793" + GCP_REGION: "us-central1" + +sandbox: + apiHostname: "https://runtime-testbed-test.sandbox.all-hands.dev" + +# LiteLLM proxy for LLM access +litellm: + enabled: true + url: "http://litellm:4000" + +litellm-helm: + enabled: true + ingress: + enabled: false # Internal only for testbed + proxy_config: + environment_variables: + OR_APP_NAME: "OpenHands Testbed" + model_list: + - model_name: "anthropic/claude-sonnet-4-20250514" + litellm_params: + model: "anthropic/claude-sonnet-4-20250514" + api_key: "os.environ/ANTHROPIC_API_KEY" + +# Simplified environment for testbed +env: + OH_APP_MODE: "saas" + LITELLM_DEFAULT_MODEL: "litellm_proxy/anthropic/claude-sonnet-4-20250514" + HIDE_LLM_SETTINGS: "false" + GCP_PROJECT: "platform-team-sandbox-62793" + GCP_REGION: "us-central1" + +# Filestore - use ephemeral for testbed (simpler) +filestore: + ephemeral: true + +# Minimal resources for testbed +deployment: + replicas: 1 + resources: + requests: + memory: 1Gi + cpu: 500m + limits: + memory: 2Gi + cpu: 1000m + +# Disable production features +datadog: + enabled: false + +stripe: + enabled: false + +resend: + enabled: false + +automation: + enabled: false + +laminar: + enabled: false From e58da0c2d777493c250e3da840b5e669d4b1709f Mon Sep 17 00:00:00 2001 From: Saurya Velagapudi Date: Fri, 17 Apr 2026 13:06:49 -0700 Subject: [PATCH 4/6] Add PRD: Enterprise Staging Environments Document the strategy for 4 staging environments: - 2 CI environments (pathroute + subdomain routing) - 2 Dev environments (pathroute + subdomain routing) Covers: - TLS/cert-manager setup - Helm chart installation complexity - Incremental deployment 
strategy - SAML IdP (Keycloak) setup - Integration test suite requirements - External DNS routing - Why Replicated isn't the solution for this use case Co-authored-by: openhands --- docs/PRD-staging-environments.md | 593 +++++++++++++++++++++++++++++++ 1 file changed, 593 insertions(+) create mode 100644 docs/PRD-staging-environments.md diff --git a/docs/PRD-staging-environments.md b/docs/PRD-staging-environments.md new file mode 100644 index 00000000..d47057be --- /dev/null +++ b/docs/PRD-staging-environments.md @@ -0,0 +1,593 @@ +# PRD: Enterprise Staging Environments + +**Author:** Saurya Velagapudi +**Date:** 2026-04-14 +**Status:** Draft +**Stakeholders:** Engineering, DevOps, QA + +--- + +## Executive Summary + +We need staging environments that accurately replicate what enterprise customers experience when running OpenHands in production. This PRD defines a four-environment staging infrastructure that enables both automated CI testing and individual developer validation of customer-facing features. + +--- + +## Problem Statement + +### Current State + +Today, we lack staging environments that accurately reflect enterprise deployments: + +1. **No CI environment for integration testing** - We cannot run end-to-end tests against a real Kubernetes deployment with enterprise features (SAML, HA, TLS). + +2. **No developer sandbox for customer issue reproduction** - When debugging customer issues, engineers have no environment that mirrors customer infrastructure. + +3. **No validation of routing patterns** - Customers deploy with either path-based routing (`app.example.com/api/automation`) or subdomain-based routing (`automation.app.example.com`). We have no way to test both patterns. 
+ +### Why Replicated Is Not the Solution + +Replicated is valuable for customer POCs and single-VM deployments, but it does not solve this problem: + +| Requirement | Replicated | Our Staging Infra | +|-------------|------------|-------------------| +| High-availability deployment | ❌ Single VM | ✅ Multi-node K8s cluster | +| Scale-up customer simulation | ❌ Not supported yet | ✅ Mirrors production topology | +| Rapid iteration on infra changes | ❌ Full redeploy cycle | ✅ Incremental Helm updates | +| CI/CD integration | ❌ Manual process | ✅ GitHub Actions automation | +| Multiple concurrent environments | ❌ One at a time | ✅ Namespace isolation | + +Replicated remains our solution for customer self-hosted POCs. These staging environments are for **internal engineering validation** of features that enterprise customers depend on. + +--- + +## Goals + +### Primary Goals + +1. **Validate enterprise features before release** - SAML SSO, TLS, high-availability configurations +2. **Enable CI integration testing** - Automated tests against real infrastructure +3. **Support customer issue debugging** - Quickly spin up environments that mirror customer setups +4. 
**Test both routing patterns** - Path-based and subdomain-based routing + +### Non-Goals + +- Replacing Replicated for customer deployments +- Full production parity (we accept some cost optimizations) +- Multi-region testing (future scope) + +--- + +## Proposed Solution + +### Four Staging Environments + +| Environment | Purpose | Routing | Deployment Trigger | +|-------------|---------|---------|-------------------| +| `staging-ci-pathroute` | Automated CI testing | Path-based | On PR merge to main | +| `staging-ci-subdomain` | Automated CI testing | Subdomain-based | On PR merge to main | +| `staging-dev-pathroute` | Developer sandbox | Path-based | Manual / feature branch | +| `staging-dev-subdomain` | Developer sandbox | Subdomain-based | Manual / feature branch | + +### Architecture + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ GCP Project: staging-092324 │ +├─────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ staging-ci- │ │ staging-ci- │ CI Environments │ +│ │ pathroute │ │ subdomain │ (auto-deployed) │ +│ │ namespace │ │ namespace │ │ +│ └────────┬────────┘ └────────┬────────┘ │ +│ │ │ │ +│ ┌────────┴────────┐ ┌────────┴────────┐ │ +│ │ staging-dev- │ │ staging-dev- │ Dev Environments │ +│ │ pathroute │ │ subdomain │ (manual deploy) │ +│ │ namespace │ │ namespace │ │ +│ └─────────────────┘ └─────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────┐ │ +│ │ Shared Infrastructure │ │ +│ │ • cert-manager (ClusterIssuer) │ │ +│ │ • external-dns │ │ +│ │ • traefik ingress controller │ │ +│ │ • Keycloak (SAML IdP) │ │ +│ └──────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### DNS Structure + +**Path-based routing environments:** +``` +staging-ci-pathroute.all-hands.dev + └── / → openhands-service + └── /api/automation → automation-service + └── /integration/* → 
integration-events-service + └── /mcp/mcp → mcp-service + +staging-dev-pathroute.all-hands.dev + └── (same structure) +``` + +**Subdomain-based routing environments:** +``` +staging-ci-subdomain.all-hands.dev → openhands-service +automation.staging-ci-subdomain.all-hands.dev → automation-service +integrations.staging-ci-subdomain.all-hands.dev → integration-events-service +mcp.staging-ci-subdomain.all-hands.dev → mcp-service + +staging-dev-subdomain.all-hands.dev + └── (same structure with subdomains) +``` + +--- + +## Technical Requirements + +### 1. TLS Certificates and cert-manager + +**Requirement:** All four environments must have valid TLS certificates. + +**Implementation:** +```yaml +# Cluster-scoped ClusterIssuer (shared across namespaces) +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: devops@all-hands.dev + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - http01: + ingress: + class: traefik +``` + +**Wildcard certificates for subdomain routing:** + +Wildcard names can only be validated with a DNS-01 challenge, so the ClusterIssuer above must also define a `dns01` solver before this Certificate can be issued (see Tasks below). + +```yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: staging-ci-subdomain-wildcard + namespace: staging-ci-subdomain +spec: + secretName: staging-ci-subdomain-tls + issuerRef: + name: letsencrypt-staging + kind: ClusterIssuer + dnsNames: + - "staging-ci-subdomain.all-hands.dev" + - "*.staging-ci-subdomain.all-hands.dev" +``` + +**Tasks:** +- [ ] Install cert-manager in staging cluster +- [ ] Create ClusterIssuer for Let's Encrypt +- [ ] Configure DNS-01 solver for wildcard certs (subdomain envs) +- [ ] Configure HTTP-01 solver for standard certs (pathroute envs) + +**Estimated Effort:** 1 day + +--- + +### 2. 
Helm Chart Installation Complexity + +**Current Assessment:** + +The OpenHands Helm chart (`charts/openhands/`) has the following dependencies: + +| Dependency | Required | Notes | +|------------|----------|-------| +| PostgreSQL | Yes | External or in-cluster | +| Redis | Yes | External or in-cluster | +| Keycloak | Yes | For authentication | +| LiteLLM Proxy | Yes | For LLM routing | +| cert-manager | Yes | For TLS | +| traefik | Yes | Ingress controller | + +**Installation Order:** +1. **Cluster prerequisites** (one-time): + - cert-manager + - traefik ingress controller + - external-dns + - Keycloak (shared SAML IdP) + +2. **Per-environment** (each namespace): + - PostgreSQL (or connection to shared instance) + - Redis (or connection to shared instance) + - OpenHands chart + - LiteLLM Proxy + +**Complexity Assessment:** + +| Task | Complexity | Notes | +|------|------------|-------| +| Fresh cluster setup | High | ~2-3 days for prerequisites | +| New environment in existing cluster | Medium | ~2-4 hours | +| Updating existing environment | Low | ~5-10 minutes | + +**Tasks:** +- [ ] Document cluster prerequisites installation +- [ ] Create shared infrastructure Helm chart or Terraform +- [ ] Validate Helm chart works with namespace isolation + +**Estimated Effort:** 3-5 days (one-time cluster setup) + +--- + +### 3. Incremental Deployment Strategy + +**Problem:** How do we deploy only the components that changed instead of redeploying everything? 
+ +**Solution:** Helm upgrade with selective value overrides + +```bash +# Deploy only if openhands chart changed +helm upgrade --install openhands-staging charts/openhands \ + -n staging-ci-pathroute \ + -f envs/common/values.yaml \ + -f envs/staging-ci-pathroute/values.yaml \ + --set image.tag=$NEW_TAG + +# Deploy only runtime-api if that changed +helm upgrade --install runtime-api-staging charts/runtime-api \ + -n staging-ci-pathroute \ + -f envs/staging-ci-pathroute/runtime-api-values.yaml \ + --set image.tag=$NEW_TAG +``` + +**GitHub Actions Integration:** + +```yaml +jobs: + detect-changes: + outputs: + openhands: ${{ steps.changes.outputs.openhands }} + runtime-api: ${{ steps.changes.outputs.runtime-api }} + automation: ${{ steps.changes.outputs.automation }} + steps: + - uses: dorny/paths-filter@v2 + id: changes + with: + filters: | + openhands: + - 'charts/openhands/**' + - 'envs/**/values.yaml' + runtime-api: + - 'charts/runtime-api/**' + automation: + - 'charts/automation/**' + + deploy-openhands: + needs: detect-changes + if: needs.detect-changes.outputs.openhands == 'true' + # ... deploy only openhands chart +``` + +**Tasks:** +- [ ] Implement path-based change detection in CI +- [ ] Create per-chart deployment jobs +- [ ] Add rollback on failure + +**Estimated Effort:** 2 days + +--- + +### 4. SAML Identity Provider Setup + +**Requirement:** Enterprise customers use SAML SSO. We need a SAML IdP in staging to test this flow. 
+ +**Options:** + +| Option | Pros | Cons | +|--------|------|------| +| **Keycloak** (recommended) | Full-featured, widely used, already in our stack | More complex setup | +| Mock SAML IdP | Simple, fast | Not production-realistic | +| Okta Developer | Real IdP | External dependency, cost | + +**Keycloak Implementation:** + +```yaml +# Shared Keycloak deployment (one per cluster) +apiVersion: v1 +kind: Namespace +metadata: + name: keycloak +``` + +```bash +# Keycloak Helm deployment +helm install keycloak bitnami/keycloak \ + -n keycloak \ + --set auth.adminUser=admin \ + --set auth.adminPassword="$KEYCLOAK_ADMIN_PASSWORD" \ + --set ingress.enabled=true \ + --set ingress.hostname=auth.staging.all-hands.dev +``` + +**SAML Realm Configuration:** +- Create realm: `openhands-staging` +- Create client for each environment (4 total) +- Configure SAML assertions with required attributes +- Create test users with various roles + +**Optional: GitHub/GitLab OAuth:** +```yaml +# In Keycloak, configure identity providers: +# - GitHub OAuth App +# - GitLab OAuth App +# - Google OAuth (if needed) +``` + +**Tasks:** +- [ ] Deploy Keycloak to staging cluster +- [ ] Create SAML realm and clients +- [ ] Configure test users with various enterprise roles +- [ ] Document SAML configuration for each environment +- [ ] (Optional) Add GitHub/GitLab OAuth providers + +**Estimated Effort:** 2-3 days + +--- + +### 5. Integration Test Suite + +**Requirement:** Create integration tests that validate enterprise features using the staging environments. 
+ +**Test Categories:** + +| Category | Tests | Environment | +|----------|-------|-------------| +| **Authentication** | SAML login, logout, session refresh | All | +| **Authorization** | Role-based access, team permissions | All | +| **Routing** | Path-based ingress, subdomain routing | Split by type | +| **Conversations** | Create, resume, attach runtime | All | +| **Integrations** | GitHub webhooks, GitLab webhooks | All | +| **Billing** | Stripe webhook handling | CI only | + +**Test Framework:** + +```python +# tests/integration/conftest.py +import os + +import pytest + +@pytest.fixture +def staging_env(): + """Configure test to run against staging environment.""" + return { + "base_url": os.environ.get("STAGING_URL", "https://staging-ci-pathroute.all-hands.dev"), + "saml_idp": os.environ.get("SAML_IDP_URL", "https://auth.staging.all-hands.dev"), + "test_user": os.environ.get("TEST_USER_EMAIL"), + "test_password": os.environ.get("TEST_USER_PASSWORD"), + } + +@pytest.fixture +async def authenticated_client(staging_env): + """Get authenticated client via SAML.""" + client = OpenHandsClient(staging_env["base_url"]) + await client.login_saml( + idp_url=staging_env["saml_idp"], + username=staging_env["test_user"], + password=staging_env["test_password"], + ) + return client +``` + +**Example Test:** + +```python +# tests/integration/test_saml_auth.py +import os + +import pytest + +@pytest.mark.integration +async def test_saml_login_creates_session(authenticated_client): + """Verify SAML login creates valid session.""" + user = await authenticated_client.get_current_user() + assert user is not None + assert user.email == os.environ["TEST_USER_EMAIL"] + +@pytest.mark.integration +async def test_saml_logout_invalidates_session(authenticated_client): + """Verify SAML logout invalidates session.""" + await authenticated_client.logout() + with pytest.raises(AuthenticationError): + await authenticated_client.get_current_user() +``` + +**CI Integration:** + +```yaml +# 
.github/workflows/integration-tests.yml +name: Integration Tests + +on: + workflow_run: + workflows: ["Deploy to Staging"] + types: [completed] + +jobs: + integration-tests: + if: ${{ github.event.workflow_run.conclusion == 'success' }} + runs-on: ubuntu-latest + strategy: + matrix: + env: [staging-ci-pathroute, staging-ci-subdomain] + steps: + - uses: actions/checkout@v4 + + - name: Run integration tests + env: + STAGING_URL: https://${{ matrix.env }}.all-hands.dev + SAML_IDP_URL: https://auth.staging.all-hands.dev + TEST_USER_EMAIL: ${{ secrets.STAGING_TEST_USER }} + TEST_USER_PASSWORD: ${{ secrets.STAGING_TEST_PASSWORD }} + run: | + pytest tests/integration/ -v --tb=short +``` + +**Tasks:** +- [ ] Define integration test framework (pytest-asyncio recommended) +- [ ] Implement SAML authentication helper +- [ ] Write core authentication tests +- [ ] Write routing validation tests +- [ ] Write conversation lifecycle tests +- [ ] Integrate with CI pipeline +- [ ] Extract patterns from Tim's SaaS feature tests + +**Estimated Effort:** 5-7 days + +--- + +### 6. External DNS Routing + +**Requirement:** DNS records should be automatically created/updated when ingresses are created. 
+ +**Implementation: external-dns** + +```yaml +# external-dns deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: external-dns + namespace: kube-system +spec: + template: + spec: + containers: + - name: external-dns + image: registry.k8s.io/external-dns/external-dns:v0.14.0 + args: + - --source=ingress + - --domain-filter=all-hands.dev + - --provider=google + - --google-project=staging-092324 + - --registry=txt + - --txt-owner-id=staging-cluster + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /etc/secrets/gcp-credentials.json +``` + +**DNS Records Created:** + +| Ingress Host | DNS Record | Type | +|--------------|------------|------| +| `staging-ci-pathroute.all-hands.dev` | → Load Balancer IP | A | +| `staging-ci-subdomain.all-hands.dev` | → Load Balancer IP | A | +| `*.staging-ci-subdomain.all-hands.dev` | → Load Balancer IP | A | +| `auth.staging.all-hands.dev` | → Keycloak LB IP | A | + +**Tasks:** +- [ ] Deploy external-dns to staging cluster +- [ ] Configure GCP Cloud DNS permissions +- [ ] Validate automatic DNS record creation +- [ ] Set appropriate TTL values (low for staging) + +**Estimated Effort:** 1 day + +--- + +## Implementation Plan + +### Phase 1: Foundation (Week 1) +- [ ] Deploy cert-manager with ClusterIssuer +- [ ] Deploy external-dns +- [ ] Configure traefik ingress controller +- [ ] Create 4 namespaces with base RBAC + +### Phase 2: CI Environments (Week 2) +- [ ] Deploy `staging-ci-pathroute` environment +- [ ] Deploy `staging-ci-subdomain` environment +- [ ] Validate deployments with smoke tests +- [ ] Integrate with GitHub Actions + +### Phase 3: Authentication (Week 2-3) +- [ ] Deploy shared Keycloak instance +- [ ] Configure SAML realm and clients +- [ ] Create test users +- [ ] Validate SAML login flow + +### Phase 4: Dev Environments (Week 3) +- [ ] Deploy `staging-dev-pathroute` environment +- [ ] Deploy `staging-dev-subdomain` environment +- [ ] Create manual deployment workflow +- [ ] Document feature 
branch deployment process + +### Phase 5: Integration Tests (Week 3-4) +- [ ] Set up test framework +- [ ] Implement core test suite +- [ ] Integrate with CI pipeline +- [ ] Document test coverage + +--- + +## Success Metrics + +| Metric | Target | Measurement | +|--------|--------|-------------| +| CI test pass rate | >95% | GitHub Actions | +| Deployment time | <10 min | Workflow duration | +| Environment availability | >99% | Uptime monitoring | +| Customer issue repro time | <1 hour | Engineering feedback | + +--- + +## Risks and Mitigations + +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| Certificate rate limiting | Medium | High | Use Let's Encrypt staging for dev envs | +| Resource costs | Medium | Medium | Auto-scale down dev envs after hours | +| Configuration drift | High | Medium | GitOps with ArgoCD (future) | +| Keycloak complexity | Medium | Medium | Start with minimal SAML config | + +--- + +## Open Questions + +1. **Shared vs isolated databases?** + - Shared PostgreSQL cluster with per-env databases? + - Or isolated PostgreSQL per environment? + +2. **LiteLLM Proxy sharing?** + - One LiteLLM proxy for all staging envs? + - Or per-environment (higher cost but better isolation)? + +3. **Runtime cluster?** + - Same cluster as application? + - Separate cluster (mirrors production)? + +4. **Cost budget?** + - What's the monthly budget for staging infrastructure? + +--- + +## Appendix + +### A. 
Environment URLs + +| Environment | Main URL | Automation URL | +|-------------|----------|----------------| +| staging-ci-pathroute | https://staging-ci-pathroute.all-hands.dev | https://staging-ci-pathroute.all-hands.dev/api/automation | +| staging-ci-subdomain | https://staging-ci-subdomain.all-hands.dev | https://automation.staging-ci-subdomain.all-hands.dev | +| staging-dev-pathroute | https://staging-dev-pathroute.all-hands.dev | https://staging-dev-pathroute.all-hands.dev/api/automation | +| staging-dev-subdomain | https://staging-dev-subdomain.all-hands.dev | https://automation.staging-dev-subdomain.all-hands.dev | + +### B. Related Documents + +- [PR #542: Staging Infrastructure](https://github.com/All-Hands-AI/OpenHands-Cloud/pull/542) +- [ARCHITECTURE.md](./ARCHITECTURE.md) + +### C. Glossary + +- **Path-based routing**: All services accessed via paths on a single domain +- **Subdomain-based routing**: Each service gets its own subdomain +- **ClusterIssuer**: Cluster-wide certificate issuer (cert-manager) +- **external-dns**: Kubernetes operator that creates DNS records from Ingress resources From 98f8ffc819639e69b86cf0e795bb35747b495f97 Mon Sep 17 00:00:00 2001 From: Saurya Velagapudi Date: Fri, 17 Apr 2026 13:27:23 -0700 Subject: [PATCH 5/6] Update PRD with completed testbed infrastructure progress - Add 'Current Progress' section documenting completed testbed work - Update status from 'Draft' to 'In Progress' - Add Phase 0 (Developer Testbed) to Implementation Plan - Mark completed tasks in all phases - Update Appendix with testbed URLs and infrastructure details - Add links to testbed README and deploy script Co-authored-by: openhands --- docs/PRD-staging-environments.md | 135 ++++++++++++++++++++++++++++--- 1 file changed, 125 insertions(+), 10 deletions(-) diff --git a/docs/PRD-staging-environments.md b/docs/PRD-staging-environments.md index d47057be..39b87ed3 100644 --- a/docs/PRD-staging-environments.md +++ b/docs/PRD-staging-environments.md @@ 
-2,7 +2,8 @@ **Author:** Saurya Velagapudi **Date:** 2026-04-14 -**Status:** Draft +**Updated:** 2026-04-16 +**Status:** In Progress **Stakeholders:** Engineering, DevOps, QA --- @@ -13,6 +14,87 @@ We need staging environments that accurately replicate what enterprise customers --- +## Current Progress (as of 2026-04-16) + +### ✅ Completed: Developer Testbed Infrastructure + +We have implemented a **developer testbed** in the Platform Team Sandbox GCP project that addresses Goal #3 (customer issue debugging) and provides a foundation for the full staging environment. + +**What's Deployed:** + +| Component | Status | Details | +|-----------|--------|---------| +| GKE Cluster | ✅ Running | `openhands-testbed` in `platform-team-sandbox-62793` | +| Traefik Ingress | ✅ Running | LoadBalancer IP: `34.28.75.102` | +| cert-manager | ✅ Running | ClusterIssuer configured for Let's Encrypt | +| DNS Zone | ✅ Created | `sandbox.all-hands.dev` (private, no NS delegation) | +| Deployment Scripts | ✅ Committed | `scripts/testbed/deploy.sh` | +| Documentation | ✅ Written | `scripts/testbed/README.md` | + +**Architecture:** + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Platform Team Sandbox GCP Project │ +│ (platform-team-sandbox-62793) │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ GKE: openhands-testbed │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ traefik │ │ cert-manager│ Shared Services │ │ +│ │ │ (ingress) │ │ (TLS certs) │ │ │ +│ │ └─────────────┘ └─────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────────────────────────────────────────────┐ │ │ +│ │ │ Namespace: testbed- │ │ │ +│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ +│ │ │ │openhands│ │keycloak │ │litellm │ │postgres │ │ │ │ +│ │ │ │ │ │ (auth) │ │ (llm) │ │ (db) │ │ │ │ +│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │ │ +│ │ 
└──────────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ └────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +**Key Design Decisions:** + +1. **Private by design** - No public DNS; access via `/etc/hosts` only +2. **Namespace isolation** - Each developer gets their own namespace +3. **In-cluster databases** - PostgreSQL and Redis per namespace (simpler, disposable) +4. **Keycloak auth** - No GitHub App required for testbed +5. **Self-signed TLS** - Traefik default cert (no Let's Encrypt for private env) + +**Usage:** + +```bash +# Deploy your instance +export ANTHROPIC_API_KEY="sk-ant-..." +./scripts/testbed/deploy.sh --name + +# Add to /etc/hosts +echo "34.28.75.102 testbed-.sandbox.all-hands.dev auth-testbed-.sandbox.all-hands.dev" | sudo tee -a /etc/hosts + +# Access +https://testbed-.sandbox.all-hands.dev +``` + +See [`scripts/testbed/README.md`](../scripts/testbed/README.md) for full documentation. + +### 🔄 Next Steps + +The testbed provides a foundation. To complete the full staging environment vision: + +1. **Add CI environments** - `staging-ci-pathroute` and `staging-ci-subdomain` +2. **Enable public DNS** - For CI environments that need webhook testing +3. **Add SAML/SSO** - Shared Keycloak with SAML realm +4. **Add routing variations** - Path-based vs subdomain-based routing +5. 
**CI integration** - GitHub Actions workflow for automated testing + +--- + ## Problem Statement ### Current State @@ -496,10 +578,19 @@ spec: ## Implementation Plan +### Phase 0: Developer Testbed (✅ COMPLETED 2026-04-16) +- [x] Create GKE cluster (`openhands-testbed`) in Platform Team Sandbox +- [x] Deploy cert-manager with ClusterIssuer +- [x] Configure traefik ingress controller +- [x] Create DNS zone (`sandbox.all-hands.dev`) +- [x] Create deployment scripts (`scripts/testbed/deploy.sh`) +- [x] Deploy test instance and validate OpenHands functionality +- [x] Write documentation (`scripts/testbed/README.md`) + ### Phase 1: Foundation (Week 1) -- [ ] Deploy cert-manager with ClusterIssuer +- [x] Deploy cert-manager with ClusterIssuer *(done in Phase 0)* - [ ] Deploy external-dns -- [ ] Configure traefik ingress controller +- [x] Configure traefik ingress controller *(done in Phase 0)* - [ ] Create 4 namespaces with base RBAC ### Phase 2: CI Environments (Week 2) @@ -509,16 +600,17 @@ spec: - [ ] Integrate with GitHub Actions ### Phase 3: Authentication (Week 2-3) -- [ ] Deploy shared Keycloak instance -- [ ] Configure SAML realm and clients +- [x] Deploy Keycloak instance *(per-namespace in testbed)* +- [ ] Configure shared SAML realm and clients - [ ] Create test users - [ ] Validate SAML login flow ### Phase 4: Dev Environments (Week 3) -- [ ] Deploy `staging-dev-pathroute` environment -- [ ] Deploy `staging-dev-subdomain` environment -- [ ] Create manual deployment workflow -- [ ] Document feature branch deployment process +- [x] Deploy developer testbed *(done in Phase 0)* +- [ ] Deploy `staging-dev-pathroute` environment (routing variant) +- [ ] Deploy `staging-dev-subdomain` environment (routing variant) +- [x] Create manual deployment workflow *(done in Phase 0)* +- [x] Document feature branch deployment process *(done in Phase 0)* ### Phase 5: Integration Tests (Week 3-4) - [ ] Set up test framework @@ -573,6 +665,14 @@ spec: ### A. 
Environment URLs +**Developer Testbed (✅ LIVE):** + +| Environment | Main URL | Access | +|-------------|----------|--------| +| testbed-{name} | https://testbed-{name}.sandbox.all-hands.dev | Private (`/etc/hosts` + GCP access) | + +**Planned CI/Staging Environments:** + | Environment | Main URL | Automation URL | |-------------|----------|----------------| | staging-ci-pathroute | https://staging-ci-pathroute.all-hands.dev | https://staging-ci-pathroute.all-hands.dev/api/automation | @@ -584,10 +684,25 @@ spec: - [PR #542: Staging Infrastructure](https://github.com/All-Hands-AI/OpenHands-Cloud/pull/542) - [ARCHITECTURE.md](./ARCHITECTURE.md) +- [Testbed README](../scripts/testbed/README.md) - Developer testbed documentation +- [Testbed Deploy Script](../scripts/testbed/deploy.sh) - One-command deployment + +### C. Infrastructure Details + +**Developer Testbed (Platform Team Sandbox):** + +| Resource | Value | +|----------|-------| +| GCP Project | `platform-team-sandbox-62793` | +| GKE Cluster | `openhands-testbed` | +| Region | `us-central1` | +| LoadBalancer IP | `34.28.75.102` | +| DNS Zone | `sandbox.all-hands.dev` (private) | -### C. Glossary +### D. 
Glossary - **Path-based routing**: All services accessed via paths on a single domain - **Subdomain-based routing**: Each service gets its own subdomain - **ClusterIssuer**: Cluster-wide certificate issuer (cert-manager) - **external-dns**: Kubernetes operator that creates DNS records from Ingress resources +- **Testbed**: Developer sandbox environment for testing OpenHands deployments From 8f18771958d8b1fd46893b592d8b1073626cea1e Mon Sep 17 00:00:00 2001 From: openhands Date: Sun, 10 May 2026 18:58:58 +0000 Subject: [PATCH 6/6] refactor: Use platform-team-sandbox infrastructure from PR #580 This PR now uses the infrastructure created in PR #580 (SV-OHE-staging-Deploy-Infra): - GCP Project: platform-team-sandbox - GKE Cluster: ohe-staging-cluster - Domain: ohe-staging.platform-team.all-hands.dev Changes: - Update workflow to target platform-team-sandbox cluster - Use testenv-charts/helm/environments/staging/base-values.yaml as base config - Copy secrets from all-hands-system namespace (not SOPS-encrypted) - Update environment values to use new domain structure: - pathroute.ohe-staging.platform-team.all-hands.dev - subdomain.ohe-staging.platform-team.all-hands.dev - Remove obsolete envs/common/values.yaml (now using testenv-charts base) - Remove obsolete scripts/testbed/ (superseded by PR #580) - Update documentation to reflect new infrastructure Deployed URLs: - https://pathroute.ohe-staging.platform-team.all-hands.dev (path-based routing) - https://subdomain.ohe-staging.platform-team.all-hands.dev (subdomain routing) --- .github/workflows/deploy-staging.yml | 70 ++- .sops.yaml | 18 +- docs/PRD-staging-environments.md | 125 +++-- envs/common/values.yaml | 315 ------------ envs/staging-pathroute/README.md | 157 +++--- envs/staging-pathroute/values.yaml | 32 +- envs/staging-subdomain/README.md | 164 +++--- envs/staging-subdomain/values.yaml | 30 +- scripts/testbed/README.md | 322 ------------ scripts/testbed/deploy.sh | 605 ----------------------- 
scripts/testbed/setup-shared-cluster.sh | 410 --------------- scripts/testbed/values-testbed-test.yaml | 125 ----- 12 files changed, 341 insertions(+), 2032 deletions(-) delete mode 100644 envs/common/values.yaml delete mode 100644 scripts/testbed/README.md delete mode 100755 scripts/testbed/deploy.sh delete mode 100755 scripts/testbed/setup-shared-cluster.sh delete mode 100644 scripts/testbed/values-testbed-test.yaml diff --git a/.github/workflows/deploy-staging.yml b/.github/workflows/deploy-staging.yml index 8e0f7950..1927dd8c 100644 --- a/.github/workflows/deploy-staging.yml +++ b/.github/workflows/deploy-staging.yml @@ -26,9 +26,11 @@ on: default: false env: - GCP_PROJECT: staging-092324 - GCP_ZONE: us-central1 - GCP_CLUSTER: staging-core-application + # Platform Team Sandbox infrastructure (shared with PR #580) + GCP_PROJECT: platform-team-sandbox + GCP_REGION: us-central1 + GCP_CLUSTER: ohe-staging-cluster + BASE_DOMAIN: ohe-staging.platform-team.all-hands.dev jobs: deploy: @@ -44,11 +46,13 @@ jobs: - name: pathroute namespace: openhands-pathroute helm_release: openhands-pathroute - env_dir: envs/staging-pathroute + branch_name: pathroute + values_dir: envs/staging-pathroute - name: subdomain namespace: openhands-subdomain helm_release: openhands-subdomain - env_dir: envs/staging-subdomain + branch_name: subdomain + values_dir: envs/staging-subdomain # Only run for selected environment(s) if: ${{ inputs.environment == 'both' || inputs.environment == matrix.env.name }} @@ -82,7 +86,7 @@ jobs: - name: Configure kubectl run: | gcloud container clusters get-credentials ${{ env.GCP_CLUSTER }} \ - --zone ${{ env.GCP_ZONE }} \ + --region ${{ env.GCP_REGION }} \ --project ${{ env.GCP_PROJECT }} - name: Create namespace if not exists @@ -90,21 +94,39 @@ jobs: run: | kubectl create namespace ${{ matrix.env.namespace }} --dry-run=client -o yaml | kubectl apply -f - - - name: Decrypt and apply secrets + - name: Copy secrets from all-hands-system namespace if: ${{ 
!inputs.skip_secrets && !inputs.dry_run }} run: | - SECRETS_DIR="${{ matrix.env.env_dir }}/secrets" - if [[ -d "$SECRETS_DIR" ]]; then - echo "Applying secrets from $SECRETS_DIR" - for file in "$SECRETS_DIR"/*.yaml; do - [[ -e "$file" ]] || continue - [[ "$(basename "$file")" == ".gitkeep" ]] && continue - echo "Decrypting and applying: $file" - sops --decrypt "$file" | kubectl apply -n ${{ matrix.env.namespace }} -f - - done - else - echo "No secrets directory found at $SECRETS_DIR" - fi + # Secrets are managed in the all-hands-system namespace (from PR #580 infrastructure) + # Copy required secrets to the deployment namespace + echo "Copying secrets from all-hands-system namespace..." + SECRETS=( + "ghcr-login-secret" + "postgres-password" + "redis" + "keycloak-admin" + "keycloak-db-secret" + "lite-llm-api-key" + "stripe-api-key" + "resend-api-key" + "github-app" + "bitbucket-app" + "gitlab-auth" + "automation-webhook-secret" + "automation-service-key" + "automation-db-secret" + ) + + for secret in "${SECRETS[@]}"; do + if kubectl get secret "$secret" -n all-hands-system &>/dev/null; then + kubectl get secret "$secret" -n all-hands-system -o yaml | \ + sed "s/namespace: all-hands-system/namespace: ${{ matrix.env.namespace }}/" | \ + kubectl apply -n ${{ matrix.env.namespace }} -f - + echo "✓ Copied secret: $secret" + else + echo "⚠ Secret not found: $secret (may be optional)" + fi + done - name: Update Helm dependencies run: helm dependency update charts/openhands @@ -114,9 +136,10 @@ jobs: run: | helm template ${{ matrix.env.helm_release }} charts/openhands \ --namespace ${{ matrix.env.namespace }} \ - --values envs/common/values.yaml \ - --values ${{ matrix.env.env_dir }}/values.yaml \ + --values testenv-charts/helm/environments/staging/base-values.yaml \ + --values ${{ matrix.env.values_dir }}/values.yaml \ --set image.tag=${{ inputs.image_tag }} \ + --set branchSanitized=${{ matrix.env.branch_name }} \ --debug - name: Deploy with Helm @@ -137,9 +160,10 @@ 
jobs: ${{ matrix.env.helm_release }} \ charts/openhands \ --namespace ${{ matrix.env.namespace }} \ - --values envs/common/values.yaml \ - --values ${{ matrix.env.env_dir }}/values.yaml \ + --values testenv-charts/helm/environments/staging/base-values.yaml \ + --values ${{ matrix.env.values_dir }}/values.yaml \ --set image.tag=${{ inputs.image_tag }} \ + --set branchSanitized=${{ matrix.env.branch_name }} \ --debug - name: Get deployment info diff --git a/.sops.yaml b/.sops.yaml index b4876856..9ea3e443 100644 --- a/.sops.yaml +++ b/.sops.yaml @@ -1,14 +1,14 @@ # SOPS configuration for OpenHands-Cloud -# This file tells SOPS which encryption keys to use for different file patterns +# ============================================================================= +# NOTE: For the staging environments deployed to platform-team-sandbox, +# secrets are managed in the all-hands-system namespace and copied to +# deployment namespaces at deploy time. This SOPS config is kept for future +# use with production deployments or environments requiring repo-managed secrets. 
+# ============================================================================= creation_rules: - # Staging path-route environment secrets - use GCP KMS - - path_regex: envs/staging-pathroute/.*secrets.*\.yaml$ - gcp_kms: projects/global-432717/locations/global/keyRings/sops-key-ring/cryptoKeys/sops-key - encrypted_regex: "^(data|stringData|config)$" - - # Staging subdomain environment secrets - use GCP KMS - - path_regex: envs/staging-subdomain/.*secrets.*\.yaml$ - gcp_kms: projects/global-432717/locations/global/keyRings/sops-key-ring/cryptoKeys/sops-key + # Platform Team Sandbox secrets (if needed in future) + - path_regex: envs/staging-.*/.*secrets.*\.yaml$ + gcp_kms: projects/platform-team-sandbox/locations/global/keyRings/sops-key-ring/cryptoKeys/sops-key encrypted_regex: "^(data|stringData|config)$" # Production environment secrets (future use) diff --git a/docs/PRD-staging-environments.md b/docs/PRD-staging-environments.md index 39b87ed3..7d73af8d 100644 --- a/docs/PRD-staging-environments.md +++ b/docs/PRD-staging-environments.md @@ -2,96 +2,95 @@ **Author:** Saurya Velagapudi **Date:** 2026-04-14 -**Updated:** 2026-04-16 -**Status:** In Progress +**Updated:** 2026-05-10 +**Status:** Implemented **Stakeholders:** Engineering, DevOps, QA --- ## Executive Summary -We need staging environments that accurately replicate what enterprise customers experience when running OpenHands in production. This PRD defines a four-environment staging infrastructure that enables both automated CI testing and individual developer validation of customer-facing features. +We need staging environments that accurately replicate what enterprise customers experience when running OpenHands in production. This PRD defines the staging infrastructure that enables both automated CI testing and individual developer validation of customer-facing features. 
--- -## Current Progress (as of 2026-04-16) +## Current Progress (as of 2026-05-10) -### ✅ Completed: Developer Testbed Infrastructure +### ✅ Completed: Full Staging Infrastructure -We have implemented a **developer testbed** in the Platform Team Sandbox GCP project that addresses Goal #3 (customer issue debugging) and provides a foundation for the full staging environment. +We have implemented a complete staging environment on the **Platform Team Sandbox** infrastructure (PR #580), with two continuously-deployed routing environments. -**What's Deployed:** +**Infrastructure (from PR #580 - `SV-OHE-staging-Deploy-Infra`):** | Component | Status | Details | |-----------|--------|---------| -| GKE Cluster | ✅ Running | `openhands-testbed` in `platform-team-sandbox-62793` | -| Traefik Ingress | ✅ Running | LoadBalancer IP: `34.28.75.102` | -| cert-manager | ✅ Running | ClusterIssuer configured for Let's Encrypt | -| DNS Zone | ✅ Created | `sandbox.all-hands.dev` (private, no NS delegation) | -| Deployment Scripts | ✅ Committed | `scripts/testbed/deploy.sh` | -| Documentation | ✅ Written | `scripts/testbed/README.md` | +| GKE Cluster | ✅ Running | `ohe-staging-cluster` in `platform-team-sandbox` | +| Traefik Ingress | ✅ Running | LoadBalancer with wildcard TLS | +| cert-manager | ✅ Running | ClusterIssuer with Let's Encrypt DNS-01 | +| external-dns | ✅ Running | Automatic DNS record management | +| Cloud DNS | ✅ Configured | `ohe-staging.platform-team.all-hands.dev` | +| Shared Keycloak | ✅ Running | `auth.ohe-staging.platform-team.all-hands.dev` | + +**Continuous Deployment Environments:** + +| Environment | URL | Routing | Namespace | +|-------------|-----|---------|-----------| +| **pathroute** | `pathroute.ohe-staging.platform-team.all-hands.dev` | Path-based | `openhands-pathroute` | +| **subdomain** | `subdomain.ohe-staging.platform-team.all-hands.dev` | Subdomain-based | `openhands-subdomain` | **Architecture:** ``` 
-┌─────────────────────────────────────────────────────────────────┐ -│ Platform Team Sandbox GCP Project │ -│ (platform-team-sandbox-62793) │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌────────────────────────────────────────────────────────────┐ │ -│ │ GKE: openhands-testbed │ │ -│ │ │ │ -│ │ ┌─────────────┐ ┌─────────────┐ │ │ -│ │ │ traefik │ │ cert-manager│ Shared Services │ │ -│ │ │ (ingress) │ │ (TLS certs) │ │ │ -│ │ └─────────────┘ └─────────────┘ │ │ -│ │ │ │ -│ │ ┌──────────────────────────────────────────────────────┐ │ │ -│ │ │ Namespace: testbed- │ │ │ -│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ -│ │ │ │openhands│ │keycloak │ │litellm │ │postgres │ │ │ │ -│ │ │ │ │ │ (auth) │ │ (llm) │ │ (db) │ │ │ │ -│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │ │ -│ │ └──────────────────────────────────────────────────────┘ │ │ -│ │ │ │ -│ └────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Platform Team Sandbox GCP Project │ +│ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ GKE: ohe-staging-cluster │ │ +│ │ │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ traefik │ │ cert-manager │ │ external-dns │ │ │ +│ │ │ namespace │ │ namespace │ │ namespace │ │ │ +│ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────┐ ┌─────────────────────────────────────────────────┐ │ │ +│ │ │ shared-auth │ │ openhands-pathroute namespace │ │ │ +│ │ │ (Keycloak) │ │ pathroute.ohe-staging.platform-team.all-hands.dev│ │ │ +│ │ │ │ └─────────────────────────────────────────────────┘ │ │ +│ │ │ auth.ohe- │ │ │ +│ │ │ staging... 
│ ┌─────────────────────────────────────────────────┐ │ │ +│ │ │ │ │ openhands-subdomain namespace │ │ │ +│ │ └──────────────┘ │ subdomain.ohe-staging.platform-team.all-hands.dev │ │ │ +│ │ └─────────────────────────────────────────────────┘ │ │ +│ └────────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌────────────────────────────────────────────────────────────────────────┐ │ +│ │ Cloud DNS Zone: ohe-staging.platform-team.all-hands.dev │ │ +│ │ └── *.ohe-staging.platform-team.all-hands.dev → Traefik LB IP │ │ +│ └────────────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ ``` **Key Design Decisions:** -1. **Private by design** - No public DNS; access via `/etc/hosts` only -2. **Namespace isolation** - Each developer gets their own namespace -3. **In-cluster databases** - PostgreSQL and Redis per namespace (simpler, disposable) -4. **Keycloak auth** - No GitHub App required for testbed -5. **Self-signed TLS** - Traefik default cert (no Let's Encrypt for private env) - -**Usage:** - -```bash -# Deploy your instance -export ANTHROPIC_API_KEY="sk-ant-..." -./scripts/testbed/deploy.sh --name - -# Add to /etc/hosts -echo "34.28.75.102 testbed-.sandbox.all-hands.dev auth-testbed-.sandbox.all-hands.dev" | sudo tee -a /etc/hosts - -# Access -https://testbed-.sandbox.all-hands.dev -``` +1. **Shared infrastructure** - Both environments run on the same cluster as developer branch deployments +2. **Namespace isolation** - Each environment in its own namespace +3. **Shared Keycloak** - Single authentication provider for all deployments +4. **Branch-like deployment** - Uses the same `branchSanitized` mechanism as developer deployments +5. **Secrets from all-hands-system** - Centralized secret management, copied to namespaces at deploy time -See [`scripts/testbed/README.md`](../scripts/testbed/README.md) for full documentation. 
+**Deployment:** -### 🔄 Next Steps +Via GitHub Actions workflow: +1. Go to **Actions** → **Deploy to Staging** +2. Click **Run workflow** +3. Select environment: `both`, `pathroute`, or `subdomain` +4. Enter the image tag to deploy -The testbed provides a foundation. To complete the full staging environment vision: +**Related Documentation:** -1. **Add CI environments** - `staging-ci-pathroute` and `staging-ci-subdomain` -2. **Enable public DNS** - For CI environments that need webhook testing -3. **Add SAML/SSO** - Shared Keycloak with SAML realm -4. **Add routing variations** - Path-based vs subdomain-based routing -5. **CI integration** - GitHub Actions workflow for automated testing +- [Branch Deployments Guide](../testenv-charts/BRANCH_DEPLOYMENTS.md) +- [Full Deployment Guide](../testenv-charts/FULL_DEPLOYMENT_GUIDE.md) +- [Staging Base Values](../testenv-charts/helm/environments/staging/base-values.yaml) --- diff --git a/envs/common/values.yaml b/envs/common/values.yaml deleted file mode 100644 index 29f6c042..00000000 --- a/envs/common/values.yaml +++ /dev/null @@ -1,315 +0,0 @@ -# Common staging environment values for OpenHands -# Environment-specific values (host, URLs) are in envs/staging-*/values.yaml - -imagePullSecrets: - - name: ghcr-login-secret - -databaseMigrations: - waitForDatabase: false - -ingress: - enabled: true - class: traefik - root: - annotations: - cert-manager.io/cluster-issuer: letsencrypt - integrations: - annotations: - cert-manager.io/cluster-issuer: letsencrypt - mcp: - annotations: - cert-manager.io/cluster-issuer: letsencrypt - -helm-release-pruner: - enabled: true - job: - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - cpu: 100m - memory: 256Mi - -sandbox: - apiHostname: https://runtime.staging.all-hands.dev - -runtime-api: - enabled: false - -filestore: - type: google_cloud - bucket: staging-openhands-sessions - -serviceAccount: - annotations: - iam.gke.io/gcp-service-account: 
openhands-sa@staging-092324.iam.gserviceaccount.com - -migrationJob: - enabled: true - initContainer: - enabled: false - -github: - enabled: true - -env: - ENABLE_BILLING: "true" - OH_APP_MODE: "saas" - OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_BILLING: "true" - OH_WEB_CLIENT_FEATURE_FLAGS_ENABLE_JIRA: "true" - OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_USERS_PAGE: "false" - OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_BILLING_PAGE: "false" - OH_WEB_CLIENT_FEATURE_FLAGS_HIDE_INTEGRATIONS_PAGE: "false" - OH_WEB_CLIENT_PROVIDERS_CONFIGURED: '["github","gitlab","bitbucket"]' - OH_WEB_CLIENT_GITHUB_APP_SLUG: "openhands-staging" - OH_APP_CONVERSATION_INFO_KIND: "server.utils.saas_app_conversation_info_injector.SaasAppConversationInfoServiceInjector" - HIDE_LLM_SETTINGS: "false" - GOOGLE_CLOUD_PROJECT: staging-092324 - GCP_PROJECT: staging-092324 - RECAPTCHA_PROJECT_ID: staging-092324 - RECAPTCHA_SITE_KEY: "" - RECAPTCHA_HMAC_SECRET: "qgfN+prMC1iMziHP3YndNicZjgK5IMXITUuVQOnEe9o=" - RECAPTCHA_BLOCK_THRESHOLD: "0.3" - GCP_REGION: us-central1 - GCP_DB_INSTANCE: application-db - DB_USER: openhands-user - DB_NAME: openhands - MAX_CONCURRENT_CONVERSATIONS: "10" - DB_POOL_SIZE: "25" - DB_MAX_OVERFLOW: "30" - ENABLE_PROACTIVE_CONVERSATION_STARTERS: "false" - ENABLE_SOLVABILITY_ANALYSIS: "true" - ENABLE_MCP_SEARCH_ENGINE: "true" - ENABLE_EXPERIMENT_MANAGER: "true" - EXPERIMENT_SYSTEM_PROMPT_EXPERIMENT: "three_system_prompt_experiment" - EXPERIMENT_CONDENSER_MAX_STEP: "condenser_max_step_experiment" - CONVERSATION_MANAGER_CLASS: "server.saas_nested_conversation_manager.SaasNestedConversationManager" - INIT_GIT_IN_EMPTY_WORKSPACE: "1" - RUNTIME_URL_PATTERN: "https://{runtime_id}.staging-runtime.all-hands.dev" - OPENHANDS_PROVIDER_BASE_URL: "https://llm-proxy.staging.all-hands.dev/" - JIRA_WEBHOOKS_ENABLED: "true" - EMAIL_PATTERN_BLACKLIST: "%" - EMAIL_PATTERN_WHITELIST: "%@openhands.dev,%@all-hands.dev" - OH_USER_AUTHORIZER_PREVENT_DUPLICATES: "false" - V1_ENABLED: "true" - ENABLE_V1_SLACK_RESOLVER: "true" - 
ENABLE_V1_GITHUB_RESOLVER: "true" - DUPLICATE_EMAIL_CHECK: "false" - OPENHANDS_SUPPRESS_BANNER: "1" - -litellm: - enabled: true - url: https://llm-proxy.staging.all-hands.dev - teamId: 62ea39c4-8886-44f3-b7ce-07ed4fe42d2c - auth: - existingSecret: lite-llm-api-key - envVars: - JSON_LOGS: "true" - -keycloak: - enabled: false - url: "http://keycloak.keycloak" - -laminar: - enabled: true - global: - cloudProvider: "gcp" - clickhouse: - s3: - enabled: false - appServer: - ingress: - hostname: "laminar-api.staging.all-hands.dev" - frontend: - extraEnv: - - name: AUTH_KEYCLOAK_ID - valueFrom: - secretKeyRef: - name: keycloak-realm - key: client-id - - name: AUTH_KEYCLOAK_SECRET - valueFrom: - secretKeyRef: - name: keycloak-realm - key: client-secret - - name: AUTH_KEYCLOAK_ISSUER - value: "https://auth.staging.all-hands.dev/realms/allhands" - ingress: - hostname: "laminar.staging.all-hands.dev" - env: - nextauthUrl: "https://laminar.staging.all-hands.dev" - nextPublicUrl: "https://laminar.staging.all-hands.dev" - storage: - storageClass: - type: "hyperdisk-balanced" - -stripe: - enabled: true - auth: - existingSecret: stripe-api-key - -resend: - enabled: true - auth: - existingSecret: resend-api-key - -gitlabWebhookInstallation: - enabled: true - resources: - requests: - memory: "512Mi" - cpu: "200m" - limits: - memory: "512Mi" - cpu: "200m" - -bitbucket: - enabled: true - auth: - existingSecret: bitbucket-app - -jira: - enabled: true - -enrichUserInteractionData: - enabled: true - -githubProxy: - endpointsEnabled: true - -gitlab: - enabled: true - -integrationEvents: - deployment: - replicas: 2 - resources: - requests: - memory: 2.5Gi - cpu: 1000m - limits: - memory: 2.5Gi - cpu: 1000m - uvicorn: - workers: 2 - -proactiveConvoClean: - enabled: true - schedule: "*/15 * * * *" - successfulJobsHistoryLimit: 3 - backoffLimit: 3 - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "256Mi" - cpu: "100m" - -slack: - enabled: true - clientId: 
"7477886716822.8865243365329" - -debuggingRoutes: - enabled: true - -deployment: - replicas: 2 - resources: - requests: - memory: 5Gi - cpu: 1000m - limits: - memory: 5Gi - cpu: 1000m - -commonRoomSync: - enabled: true - schedule: "0 * * * *" - -datadog: - enabled: true - env: "staging" - service: "deploy" - agentHost: "datadog-agent.all-hands-system.svc.cluster.local" - -appConfig: - POSTHOG_CLIENT_KEY: "phc_Wj1DvqGQgl5ml0bkZvPr55sxIvJWjlGmoHYZrxdh5qD" - POSTHOG_HOST: "https://us.i.posthog.com" - -tavily: - enabled: true - -postgresql: - enabled: false - -redis: - master: - resources: - requests: - memory: 1Gi - cpu: 500m - limits: - memory: 1Gi - cpu: 500m - -gcpMonitoring: - enabled: true - -automationServiceKey: - enabled: true - existingSecret: automation-service-key - secretKey: automation-service-key - -automation: - enabled: true - image: - repository: ghcr.io/openhands/automation - imagePullSecrets: - - name: ghcr-login-secret - deployment: - replicas: 3 - resources: - requests: - memory: 512Mi - cpu: 200m - limits: - memory: 1Gi - cpu: 500m - serviceAccount: - name: automation-sa - annotations: - iam.gke.io/gcp-service-account: automation-sa@staging-092324.iam.gserviceaccount.com - postgresql: - enabled: false - database: - host: "" - port: "5432" - user: "automation_user" - name: "automations" - createDatabaseUser: false - secretName: "automation-db-secret" - secretKey: "db-password" - gcp: - dbInstance: "application-db" - project: "staging-092324" - region: "us-central1" - filestore: - ephemeral: false - bucket: "staging-openhands-sessions" - type: gcs - minio: - enabled: false - serviceKeyFromSecret: - name: automation-service-key - key: automation-service-key - datadog: - env: "staging" - env: - AUTOMATION_SCHEDULER_INTERVAL_SECONDS: "30" - AUTOMATION_LOG_LEVEL: "info" - GCS_BUCKET_NAME: "staging-openhands-sessions" - -runtime: - runAsRoot: true diff --git a/envs/staging-pathroute/README.md b/envs/staging-pathroute/README.md index 83ab1948..58d1c26b 
100644 --- a/envs/staging-pathroute/README.md +++ b/envs/staging-pathroute/README.md @@ -1,98 +1,103 @@ # Staging Path-Route Environment Configuration -This directory contains the configuration for deploying OpenHands to the **staging-pathroute** environment. +This directory contains the configuration for deploying OpenHands to the **staging-pathroute** environment on the Platform Team Sandbox infrastructure. ## Environment Overview -This environment uses **path-based routing**: -- Main app: `https://staging-pathroute.all-hands.dev/` -- Automation API: `https://staging-pathroute.all-hands.dev/api/automation` -- Integrations: `https://staging-pathroute.all-hands.dev/integration/*` -- MCP: `https://staging-pathroute.all-hands.dev/mcp/mcp` +This environment uses **path-based routing** and deploys to the Platform Team Sandbox cluster: + +- **URL:** `https://pathroute.ohe-staging.platform-team.all-hands.dev/` +- **Auth:** `https://auth.ohe-staging.platform-team.all-hands.dev` (shared Keycloak) +- **Automation API:** `https://pathroute.ohe-staging.platform-team.all-hands.dev/api/automation` +- **Integrations:** `https://pathroute.ohe-staging.platform-team.all-hands.dev/integration/*` +- **MCP:** `https://pathroute.ohe-staging.platform-team.all-hands.dev/mcp/mcp` + +## Infrastructure + +This environment shares infrastructure with PR #580 (`SV-OHE-staging-Deploy-Infra`): + +| Component | Details | +|-----------|---------| +| **GCP Project** | `platform-team-sandbox` | +| **GKE Cluster** | `ohe-staging-cluster` | +| **Region** | `us-central1` | +| **Base Domain** | `ohe-staging.platform-team.all-hands.dev` | +| **Namespace** | `openhands-pathroute` | +| **Helm Release** | `openhands-pathroute` | ## Directory Structure ``` -envs/ -├── common/ -│ └── values.yaml # Shared staging config (base) -└── staging-pathroute/ - ├── README.md # This file - ├── values.yaml # Environment-specific overrides (host, URLs) - └── secrets/ # SOPS-encrypted Kubernetes secrets 
+envs/staging-pathroute/ +├── README.md # This file +├── values.yaml # Environment-specific overrides (routing, URLs) +└── secrets/ # (unused - secrets are managed in all-hands-system namespace) + +testenv-charts/helm/environments/staging/ +└── base-values.yaml # Base configuration for all staging deployments ``` -Helm is invoked with both values files: +Helm is invoked with: ```bash -helm upgrade ... -f envs/common/values.yaml -f envs/staging-pathroute/values.yaml +helm upgrade ... \ + -f testenv-charts/helm/environments/staging/base-values.yaml \ + -f envs/staging-pathroute/values.yaml \ + --set branchSanitized=pathroute ``` -## Kubernetes Details - -- **Namespace:** `openhands-pathroute` -- **Helm Release:** `openhands-pathroute` -- **GCP Project:** `staging-092324` -- **GKE Cluster:** `staging-core-application` -- **Zone:** `us-central1` - ## Secrets Management -Secrets are encrypted using [SOPS](https://github.com/getsops/sops) with GCP KMS encryption. - -### Required Secrets - -The following secrets must be created in `secrets/` before deployment: - -| Secret Name | Description | Required Keys | -|-------------|-------------|---------------| -| `ghcr-login-secret` | GitHub Container Registry pull credentials | `.dockerconfigjson` | -| `lite-llm-api-key` | LiteLLM API key | `api-key` | -| `stripe-api-key` | Stripe API key | `api-key` | -| `resend-api-key` | Resend email API key | `api-key` | -| `bitbucket-app` | Bitbucket OAuth app credentials | `client-id`, `client-secret` | -| `automation-service-key` | Automation service authentication key | `automation-service-key` | -| `automation-db-secret` | Automation database password | `db-password` | -| `keycloak-realm` | Keycloak realm credentials | `client-id`, `client-secret` | - -### Creating/Editing Secrets - -```bash -# Create a new SOPS-encrypted secret -cat < /tmp/my-secret.yaml -apiVersion: v1 -kind: Secret -metadata: - name: my-secret -type: Opaque -stringData: - key: "value" -EOF -sops --encrypt 
/tmp/my-secret.yaml > envs/staging-pathroute/secrets/my-secret.yaml - -# Edit (decrypts, opens editor, re-encrypts on save) -sops envs/staging-pathroute/secrets/my-secret.yaml - -# View decrypted content -sops --decrypt envs/staging-pathroute/secrets/my-secret.yaml -``` +Secrets are **managed in the `all-hands-system` namespace** on the cluster and copied to the deployment namespace at deploy time. This follows the same pattern as branch deployments described in `testenv-charts/BRANCH_DEPLOYMENTS.md`. + +Required secrets in `all-hands-system`: +- `ghcr-login-secret` +- `postgres-password` +- `redis` +- `keycloak-admin` +- `keycloak-db-secret` +- `lite-llm-api-key` +- `stripe-api-key` +- `resend-api-key` +- `github-app` +- `bitbucket-app` +- `gitlab-auth` +- `automation-webhook-secret` +- `automation-service-key` +- `automation-db-secret` ## Deployment -Use the GitHub Actions workflow: +### Via GitHub Actions (Recommended) 1. Go to **Actions** → **Deploy to Staging** 2. Click **Run workflow** 3. Select environment: `pathroute` or `both` 4. 
Enter the image tag to deploy -### Workflow Parameters +### Manual Deployment -| Parameter | Description | Default | -|-----------|-------------|---------| -| `image_tag` | OpenHands image tag to deploy | `main` | -| `environment` | Which environment(s) to deploy | `both` | -| `skip_secrets` | Skip applying secrets | `false` | -| `dry_run` | Template only, don't deploy | `false` | +```bash +# Get cluster credentials +gcloud container clusters get-credentials ohe-staging-cluster \ + --region us-central1 \ + --project platform-team-sandbox + +# Create namespace and copy secrets +kubectl create namespace openhands-pathroute +for secret in ghcr-login-secret postgres-password redis keycloak-admin keycloak-db-secret lite-llm-api-key; do + kubectl get secret $secret -n all-hands-system -o yaml | \ + sed 's/namespace: all-hands-system/namespace: openhands-pathroute/' | \ + kubectl apply -n openhands-pathroute -f - +done + +# Deploy +helm upgrade --install openhands-pathroute ./charts/openhands \ + --namespace openhands-pathroute \ + --values testenv-charts/helm/environments/staging/base-values.yaml \ + --values envs/staging-pathroute/values.yaml \ + --set branchSanitized=pathroute \ + --set image.tag=main +``` ## Troubleshooting @@ -106,6 +111,16 @@ helm history openhands-pathroute -n openhands-pathroute # Check ingress kubectl get ingress -n openhands-pathroute -# GCP auth for SOPS -gcloud auth application-default login +# View logs +kubectl logs -n openhands-pathroute -l app=openhands -f + +# Get cluster credentials +gcloud container clusters get-credentials ohe-staging-cluster \ + --region us-central1 --project platform-team-sandbox ``` + +## Related Documentation + +- [Branch Deployments Guide](../../testenv-charts/BRANCH_DEPLOYMENTS.md) +- [Full Deployment Guide](../../testenv-charts/FULL_DEPLOYMENT_GUIDE.md) +- [Staging Base Values](../../testenv-charts/helm/environments/staging/base-values.yaml) diff --git a/envs/staging-pathroute/values.yaml 
b/envs/staging-pathroute/values.yaml index f5d36ff2..4d71ee92 100644 --- a/envs/staging-pathroute/values.yaml +++ b/envs/staging-pathroute/values.yaml @@ -1,11 +1,27 @@ -# Staging (Path-Based Routing) environment values -# Uses path-based routing: /api/automation, /integration/*, etc. -# Base config: envs/common/values.yaml -# Secrets: envs/staging-pathroute/secrets/ +# ============================================================================= +# Staging Path-Route Environment (CI continuous deployment) +# ============================================================================= +# Deployed to: pathroute.ohe-staging.platform-team.all-hands.dev +# Infrastructure: Platform Team Sandbox GCP project (shared with #580) +# Base config: testenv-charts/helm/environments/staging/base-values.yaml +# +# This environment tests path-based routing where services are accessed via +# URL paths: /api/automation, /integration/*, etc. +# ============================================================================= +# Path-based routing configuration ingress: - host: staging-pathroute.all-hands.dev - + routingMode: path + pathPrefix: "" + +# Environment-specific URLs (branchSanitized=pathroute set via --set) automation: - openhandsApiUrl: "https://staging-pathroute.all-hands.dev" - automationBaseUrl: "https://staging-pathroute.all-hands.dev" + openhandsApiUrl: "https://pathroute.ohe-staging.platform-team.all-hands.dev" + automationBaseUrl: "https://pathroute.ohe-staging.platform-team.all-hands.dev" + +automationService: + url: "https://pathroute.ohe-staging.platform-team.all-hands.dev/api/automation" + +# Override runtime URL pattern for this environment +env: + RUNTIME_URL_PATTERN: "https://pathroute.ohe-staging.platform-team.all-hands.dev/runtime/{runtime_id}" diff --git a/envs/staging-subdomain/README.md b/envs/staging-subdomain/README.md index faed1666..f29b0470 100644 --- a/envs/staging-subdomain/README.md +++ b/envs/staging-subdomain/README.md @@ -1,99 +1,105 @@ # Staging 
Subdomain Environment Configuration -This directory contains the configuration for deploying OpenHands to the **staging-subdomain** environment. +This directory contains the configuration for deploying OpenHands to the **staging-subdomain** environment on the Platform Team Sandbox infrastructure. ## Environment Overview -This environment uses **subdomain-based routing** (same as production pattern): -- Main app: `https://staging-subdomain.all-hands.dev/` -- Integrations still use path-based routes on the main domain (GitHub/GitLab webhooks, Stripe, etc.) -- MCP: `https://staging-subdomain.all-hands.dev/mcp/mcp` +This environment uses **subdomain-based routing** (production-like pattern) and deploys to the Platform Team Sandbox cluster: -The key difference from staging-pathroute is that this environment tests the production-like subdomain pattern for services that will eventually move to subdomains. +- **URL:** `https://subdomain.ohe-staging.platform-team.all-hands.dev/` +- **Auth:** `https://auth.ohe-staging.platform-team.all-hands.dev` (shared Keycloak) +- **Automation API:** `https://subdomain.ohe-staging.platform-team.all-hands.dev/api/automation` +- **Integrations:** `https://subdomain.ohe-staging.platform-team.all-hands.dev/integration/*` +- **MCP:** `https://subdomain.ohe-staging.platform-team.all-hands.dev/mcp/mcp` + +The key difference from staging-pathroute is that this environment tests the production-like subdomain routing pattern. 
+ +## Infrastructure + +This environment shares infrastructure with PR #580 (`SV-OHE-staging-Deploy-Infra`): + +| Component | Details | +|-----------|---------| +| **GCP Project** | `platform-team-sandbox` | +| **GKE Cluster** | `ohe-staging-cluster` | +| **Region** | `us-central1` | +| **Base Domain** | `ohe-staging.platform-team.all-hands.dev` | +| **Namespace** | `openhands-subdomain` | +| **Helm Release** | `openhands-subdomain` | ## Directory Structure ``` -envs/ -├── common/ -│ └── values.yaml # Shared staging config (base) -└── staging-subdomain/ - ├── README.md # This file - ├── values.yaml # Environment-specific overrides (host, URLs) - └── secrets/ # SOPS-encrypted Kubernetes secrets +envs/staging-subdomain/ +├── README.md # This file +├── values.yaml # Environment-specific overrides (routing, URLs) +└── secrets/ # (unused - secrets are managed in all-hands-system namespace) + +testenv-charts/helm/environments/staging/ +└── base-values.yaml # Base configuration for all staging deployments ``` -Helm is invoked with both values files: +Helm is invoked with: ```bash -helm upgrade ... -f envs/common/values.yaml -f envs/staging-subdomain/values.yaml +helm upgrade ... \ + -f testenv-charts/helm/environments/staging/base-values.yaml \ + -f envs/staging-subdomain/values.yaml \ + --set branchSanitized=subdomain ``` -## Kubernetes Details - -- **Namespace:** `openhands-subdomain` -- **Helm Release:** `openhands-subdomain` -- **GCP Project:** `staging-092324` -- **GKE Cluster:** `staging-core-application` -- **Zone:** `us-central1` - ## Secrets Management -Secrets are encrypted using [SOPS](https://github.com/getsops/sops) with GCP KMS encryption. 
- -### Required Secrets - -The following secrets must be created in `secrets/` before deployment: - -| Secret Name | Description | Required Keys | -|-------------|-------------|---------------| -| `ghcr-login-secret` | GitHub Container Registry pull credentials | `.dockerconfigjson` | -| `lite-llm-api-key` | LiteLLM API key | `api-key` | -| `stripe-api-key` | Stripe API key | `api-key` | -| `resend-api-key` | Resend email API key | `api-key` | -| `bitbucket-app` | Bitbucket OAuth app credentials | `client-id`, `client-secret` | -| `automation-service-key` | Automation service authentication key | `automation-service-key` | -| `automation-db-secret` | Automation database password | `db-password` | -| `keycloak-realm` | Keycloak realm credentials | `client-id`, `client-secret` | - -### Creating/Editing Secrets - -```bash -# Create a new SOPS-encrypted secret -cat < /tmp/my-secret.yaml -apiVersion: v1 -kind: Secret -metadata: - name: my-secret -type: Opaque -stringData: - key: "value" -EOF -sops --encrypt /tmp/my-secret.yaml > envs/staging-subdomain/secrets/my-secret.yaml - -# Edit (decrypts, opens editor, re-encrypts on save) -sops envs/staging-subdomain/secrets/my-secret.yaml - -# View decrypted content -sops --decrypt envs/staging-subdomain/secrets/my-secret.yaml -``` +Secrets are **managed in the `all-hands-system` namespace** on the cluster and copied to the deployment namespace at deploy time. This follows the same pattern as branch deployments described in `testenv-charts/BRANCH_DEPLOYMENTS.md`. + +Required secrets in `all-hands-system`: +- `ghcr-login-secret` +- `postgres-password` +- `redis` +- `keycloak-admin` +- `keycloak-db-secret` +- `lite-llm-api-key` +- `stripe-api-key` +- `resend-api-key` +- `github-app` +- `bitbucket-app` +- `gitlab-auth` +- `automation-webhook-secret` +- `automation-service-key` +- `automation-db-secret` ## Deployment -Use the GitHub Actions workflow: +### Via GitHub Actions (Recommended) 1. 
Go to **Actions** → **Deploy to Staging** 2. Click **Run workflow** 3. Select environment: `subdomain` or `both` 4. Enter the image tag to deploy -### Workflow Parameters +### Manual Deployment -| Parameter | Description | Default | -|-----------|-------------|---------| -| `image_tag` | OpenHands image tag to deploy | `main` | -| `environment` | Which environment(s) to deploy | `both` | -| `skip_secrets` | Skip applying secrets | `false` | -| `dry_run` | Template only, don't deploy | `false` | +```bash +# Get cluster credentials +gcloud container clusters get-credentials ohe-staging-cluster \ + --region us-central1 \ + --project platform-team-sandbox + +# Create namespace and copy secrets +kubectl create namespace openhands-subdomain +for secret in ghcr-login-secret postgres-password redis keycloak-admin keycloak-db-secret lite-llm-api-key; do + kubectl get secret $secret -n all-hands-system -o yaml | \ + sed 's/namespace: all-hands-system/namespace: openhands-subdomain/' | \ + kubectl apply -n openhands-subdomain -f - +done + +# Deploy +helm upgrade --install openhands-subdomain ./charts/openhands \ + --namespace openhands-subdomain \ + --values testenv-charts/helm/environments/staging/base-values.yaml \ + --values envs/staging-subdomain/values.yaml \ + --set branchSanitized=subdomain \ + --set image.tag=main +``` ## Troubleshooting @@ -107,15 +113,25 @@ helm history openhands-subdomain -n openhands-subdomain # Check ingress kubectl get ingress -n openhands-subdomain -# GCP auth for SOPS -gcloud auth application-default login +# View logs +kubectl logs -n openhands-subdomain -l app=openhands -f + +# Get cluster credentials +gcloud container clusters get-credentials ohe-staging-cluster \ + --region us-central1 --project platform-team-sandbox ``` +## Related Documentation + +- [Branch Deployments Guide](../../testenv-charts/BRANCH_DEPLOYMENTS.md) +- [Full Deployment Guide](../../testenv-charts/FULL_DEPLOYMENT_GUIDE.md) +- [Staging Base 
Values](../../testenv-charts/helm/environments/staging/base-values.yaml) + ## Comparison with staging-pathroute | Aspect | staging-pathroute | staging-subdomain | |--------|-------------------|-------------------| -| Main URL | `staging-pathroute.all-hands.dev` | `staging-subdomain.all-hands.dev` | -| Routing | Path-based | Subdomain-based (future) | -| Purpose | Test path routing | Test production-like subdomain pattern | +| Main URL | `pathroute.ohe-staging.platform-team.all-hands.dev` | `subdomain.ohe-staging.platform-team.all-hands.dev` | +| Routing Mode | Path-based (`routingMode: path`) | Subdomain-based (`routingMode: subdomain`) | +| Purpose | Test path routing pattern | Test production-like subdomain pattern | | Namespace | `openhands-pathroute` | `openhands-subdomain` | diff --git a/envs/staging-subdomain/values.yaml b/envs/staging-subdomain/values.yaml index f8622e48..6cbfc8ac 100644 --- a/envs/staging-subdomain/values.yaml +++ b/envs/staging-subdomain/values.yaml @@ -1,11 +1,27 @@ -# Staging (Subdomain-Based Routing) environment values -# Uses subdomain-based routing: automation.staging-subdomain.all-hands.dev -# Base config: envs/common/values.yaml -# Secrets: envs/staging-subdomain/secrets/ +# ============================================================================= +# Staging Subdomain Environment (CI continuous deployment) +# ============================================================================= +# Deployed to: subdomain.ohe-staging.platform-team.all-hands.dev +# Infrastructure: Platform Team Sandbox GCP project (shared with #580) +# Base config: testenv-charts/helm/environments/staging/base-values.yaml +# +# This environment tests subdomain-based routing where services have their +# own subdomains (production-like pattern). 
+# ============================================================================= +# Subdomain-based routing configuration (default) ingress: - host: staging-subdomain.all-hands.dev + routingMode: subdomain + serviceRoutingMode: path +# Environment-specific URLs (branchSanitized=subdomain set via --set) automation: - openhandsApiUrl: "https://staging-subdomain.all-hands.dev" - automationBaseUrl: "https://staging-subdomain.all-hands.dev" + openhandsApiUrl: "https://subdomain.ohe-staging.platform-team.all-hands.dev" + automationBaseUrl: "https://subdomain.ohe-staging.platform-team.all-hands.dev" + +automationService: + url: "https://subdomain.ohe-staging.platform-team.all-hands.dev/api/automation" + +# Override runtime URL pattern for this environment +env: + RUNTIME_URL_PATTERN: "https://subdomain.ohe-staging.platform-team.all-hands.dev/runtime/{runtime_id}" diff --git a/scripts/testbed/README.md b/scripts/testbed/README.md deleted file mode 100644 index ea8e751f..00000000 --- a/scripts/testbed/README.md +++ /dev/null @@ -1,322 +0,0 @@ -# OpenHands Cloud Testbed - -Deploy OpenHands Cloud to an **internal testbed environment** for testing and development. - -> ⚠️ **Private Environment**: This testbed is NOT publicly accessible. It runs in the -> Platform Team Sandbox GCP project and uses `/etc/hosts` for DNS resolution. - -## Overview - -The testbed provides two deployment modes: - -1. **Shared Testbed** - Multiple developers deploy to namespaces on a shared GKE cluster -2. 
**Isolated Testbed** - Create your own GKE cluster for complete isolation - -## Quick Start (For Team Members) - -### Prerequisites - -- `gcloud` CLI authenticated (`gcloud auth login`) -- `kubectl` installed -- `helm` v3 installed -- Access to `platform-team-sandbox-62793` GCP project (request via Platform Team) -- An Anthropic API key (get from 1Password or request from team lead) - -### Step 1: Connect to the Shared Cluster - -```bash -# Authenticate with GCP -gcloud auth login -gcloud config set project platform-team-sandbox-62793 - -# Connect to the testbed cluster -gcloud container clusters get-credentials openhands-testbed --region us-central1 -``` - -### Step 2: Deploy Your Instance - -```bash -# Set your API keys -export ANTHROPIC_API_KEY="sk-ant-..." # Required for LLM -export GITHUB_TOKEN="ghp_..." # Optional: for pulling latest images - -# Deploy to your own namespace (use your name or feature name) -./deploy.sh --name - -# Examples: -./deploy.sh --name saurya -./deploy.sh --name feature-xyz -``` - -This creates: -- Namespace: `testbed-` -- App hostname: `testbed-.sandbox.all-hands.dev` -- Auth hostname: `auth-testbed-.sandbox.all-hands.dev` -- Runtime hostname: `runtime-testbed-.sandbox.all-hands.dev` - -### Step 3: Configure Local Access - -Since this is a private environment, add entries to your `/etc/hosts`: - -```bash -# Get the LoadBalancer IP -TRAEFIK_IP=$(kubectl get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}') -echo "LoadBalancer IP: $TRAEFIK_IP" - -# Add to /etc/hosts (replace with your testbed name) -sudo bash -c "echo '$TRAEFIK_IP testbed-.sandbox.all-hands.dev auth-testbed-.sandbox.all-hands.dev runtime-testbed-.sandbox.all-hands.dev' >> /etc/hosts" - -# Example for testbed-saurya: -# sudo bash -c "echo '34.28.75.102 testbed-saurya.sandbox.all-hands.dev auth-testbed-saurya.sandbox.all-hands.dev runtime-testbed-saurya.sandbox.all-hands.dev' >> /etc/hosts" -``` - -### Step 4: Access Your Testbed - -**Option A: 
Browser with /etc/hosts (recommended)** - -After adding `/etc/hosts` entries, open Chrome/Firefox: -``` -https://testbed-.sandbox.all-hands.dev -``` - -> 💡 **Chrome HTTPS Warning**: When you see the certificate warning, click anywhere on the page -> and type `thisisunsafe` (you won't see it appear). This bypasses the self-signed cert warning. - -**Option B: Port Forward (simplest, but limited)** - -```bash -kubectl port-forward svc/openhands-service 3000:3000 -n testbed- -# Open http://localhost:3000 -``` - -Note: Port forwarding won't work with OAuth callbacks. Use /etc/hosts for full functionality. - -### Step 5: Clean Up When Done - -```bash -./deploy.sh --name --destroy -``` - -**Important**: Please destroy your testbed when you're done to save cluster resources! - -## Deployment Modes - -### Shared Cluster (Default) - -Multiple developers share one GKE cluster with separate namespaces: - -```bash -./deploy.sh --name alice # Creates testbed-alice namespace -./deploy.sh --name bob # Creates testbed-bob namespace -``` - -**Pros:** -- Faster deployment (cluster already exists) -- Lower cost (shared infrastructure) -- Simpler DNS setup (one wildcard domain) - -**Cons:** -- Shared cluster resources -- Potential resource contention - -### Isolated Cluster - -Create your own GKE cluster: - -```bash -./deploy.sh --name mytest --create-cluster -``` - -**Pros:** -- Complete isolation -- Can test cluster-level changes -- No resource contention - -**Cons:** -- Slower setup (~10 minutes for cluster creation) -- Higher cost -- Requires separate DNS setup - -## Configuration - -### Environment Variables - -| Variable | Required | Description | -|----------|----------|-------------| -| `ANTHROPIC_API_KEY` | Yes* | Anthropic API key for LLM | -| `OPENAI_API_KEY` | No | OpenAI API key (alternative) | -| `GITHUB_TOKEN` | Recommended | For pulling images from ghcr.io | -| `GCP_PROJECT` | No | GCP project (default: platform-team-sandbox-62793) | -| `GCP_REGION` | No | GCP region 
(default: us-central1) | - -*At least one LLM API key is required for the agent to function. - -### Custom Values - -Override values by creating a custom values file: - -```bash -# Generate default values -./deploy.sh --name mytest --dry-run - -# Edit the generated values -vim values-testbed-mytest.yaml - -# Deploy with custom values -./deploy.sh --name mytest -``` - -## Troubleshooting - -### Check Deployment Status - -```bash -# View all pods -kubectl get pods -n testbed- - -# View logs -kubectl logs -f deployment/openhands -n testbed- - -# View events -kubectl get events -n testbed- --sort-by=.lastTimestamp -``` - -### Certificate Issues - -```bash -# Check certificate status -kubectl get certificates -n testbed- -kubectl describe certificate -n testbed- - -# Check cert-manager logs -kubectl logs -n cert-manager deployment/cert-manager -``` - -### Database Issues - -```bash -# Check PostgreSQL -kubectl get pods -n testbed- -l app.kubernetes.io/name=postgresql - -# Connect to database -kubectl exec -it -n testbed- \ - $(kubectl get pod -n testbed- -l app.kubernetes.io/name=postgresql -o name) \ - -- psql -U postgres -``` - -### Ingress Issues - -```bash -# Check Traefik -kubectl get svc -n traefik -kubectl logs -n traefik deployment/traefik - -# Check ingress -kubectl get ingress -n testbed- -kubectl describe ingress -n testbed- -``` - -## Architecture - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Platform Team Sandbox GCP │ -│ (platform-team-sandbox-62793) │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ ┌────────────────────────────────────────────────────────────┐ │ -│ │ GKE: openhands-testbed │ │ -│ │ │ │ -│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ -│ │ │ traefik │ │ cert-manager│ │ DNS zone │ │ │ -│ │ │ (ingress) │ │ (TLS certs) │ │ sandbox. │ │ │ -│ │ │ │ │ │ │ all-hands. 
│ │ │ -│ │ │ │ │ │ │ dev │ │ │ -│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ -│ │ │ │ -│ │ ┌──────────────────────────────────────────────────────┐ │ │ -│ │ │ Namespace: testbed-alice │ │ │ -│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ -│ │ │ │openhands│ │keycloak │ │litellm │ │postgres │ │ │ │ -│ │ │ │ │ │ (auth) │ │ (llm) │ │ (db) │ │ │ │ -│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │ │ -│ │ └──────────────────────────────────────────────────────┘ │ │ -│ │ │ │ -│ │ ┌──────────────────────────────────────────────────────┐ │ │ -│ │ │ Namespace: testbed-bob │ │ │ -│ │ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ │ -│ │ │ │openhands│ │keycloak │ │litellm │ │postgres │ │ │ │ -│ │ │ │ │ │ (auth) │ │ (llm) │ │ (db) │ │ │ │ -│ │ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │ │ -│ │ └──────────────────────────────────────────────────────┘ │ │ -│ │ │ │ -│ └────────────────────────────────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -## Cost Considerations - -### Shared Cluster (Recommended) - -- GKE cluster: ~$72/month (control plane) + ~$100/month (nodes) -- Split across all users - -### Isolated Cluster - -- Same costs per cluster -- Consider deleting when not in use: - ```bash - ./deploy.sh --name mytest --destroy # Includes cluster deletion - ``` - -## Network Access (Private by Design) - -This testbed is intentionally **NOT publicly accessible**. Access requires: - -1. GCP project access (`platform-team-sandbox-62793`) -2. kubectl credentials for the cluster -3. Local `/etc/hosts` configuration pointing to the LoadBalancer IP - -### Why No Public DNS? 
- -- **Security**: Experimental features and internal testing shouldn't be public -- **Simplicity**: No need to manage SSL certificates via Let's Encrypt -- **Isolation**: Each developer's /etc/hosts is independent - -### Getting the LoadBalancer IP - -```bash -# Current LoadBalancer IP -kubectl get svc -n traefik traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' -# Output: 34.28.75.102 (as of last deployment) -``` - -### /etc/hosts Configuration - -Add entries for your testbed: - -```bash -# For testbed named "mytest" -34.28.75.102 testbed-mytest.sandbox.all-hands.dev auth-testbed-mytest.sandbox.all-hands.dev runtime-testbed-mytest.sandbox.all-hands.dev - -# For testbed named "saurya" -34.28.75.102 testbed-saurya.sandbox.all-hands.dev auth-testbed-saurya.sandbox.all-hands.dev runtime-testbed-saurya.sandbox.all-hands.dev -``` - -### Optional: Future Public DNS - -If we ever want to make this publicly accessible (with proper authentication), there's -a prepared DNS delegation in the infra repo that can be merged: -- PR: [Add sandbox.all-hands.dev DNS delegation](https://github.com/All-Hands-AI/infra/pull/1165) - -This would enable Let's Encrypt certificates and public DNS resolution. - -## Contributing - -When adding features to the testbed scripts: - -1. Test with `--dry-run` first -2. Ensure cleanup works properly -3. Update this README -4. 
Consider backwards compatibility with existing testbeds diff --git a/scripts/testbed/deploy.sh b/scripts/testbed/deploy.sh deleted file mode 100755 index 05215dcc..00000000 --- a/scripts/testbed/deploy.sh +++ /dev/null @@ -1,605 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# OpenHands Cloud Testbed Deployment Script -# ========================================== -# Deploy OpenHands Cloud to a testbed environment in Platform Team Sandbox -# -# Usage: -# ./deploy.sh # Deploy to shared testbed -# ./deploy.sh --name mytest # Deploy to isolated environment "mytest" -# ./deploy.sh --create-cluster # Create new GKE cluster and deploy -# ./deploy.sh --destroy # Destroy your testbed environment - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" - -# Defaults -GCP_PROJECT="${GCP_PROJECT:-platform-team-sandbox-62793}" -GCP_REGION="${GCP_REGION:-us-central1}" -SHARED_CLUSTER_NAME="openhands-testbed" -NAMESPACE_PREFIX="testbed" -DNS_DOMAIN="sandbox.all-hands.dev" -CREATE_CLUSTER=false -DESTROY=false -TESTBED_NAME="" -DRY_RUN=false - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } -log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } -log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } -log_error() { echo -e "${RED}[ERROR]${NC} $1"; } - -show_usage() { - cat << EOF -OpenHands Cloud Testbed Deployment - -Usage: $(basename "$0") [OPTIONS] - -Options: - --name NAME Deploy to isolated namespace 'testbed-NAME' (default: shared testbed) - --create-cluster Create a new GKE cluster for this testbed - --destroy Destroy the testbed environment - --dry-run Show what would be done without making changes - --cluster NAME Use specific cluster name (default: $SHARED_CLUSTER_NAME) - --project PROJECT GCP project ID (default: $GCP_PROJECT) - --region REGION GCP region (default: $GCP_REGION) - --help Show this help 
message - -Examples: - # Deploy current changes to the shared testbed - ./deploy.sh - - # Deploy to your own isolated namespace - ./deploy.sh --name saurya - - # Create a new cluster and deploy (for completely isolated testing) - ./deploy.sh --name mytest --create-cluster - - # Destroy your isolated testbed - ./deploy.sh --name saurya --destroy - -Environment Variables: - GCP_PROJECT GCP project ID (default: platform-team-sandbox-62793) - GCP_REGION GCP region (default: us-central1) - ANTHROPIC_API_KEY API key for Anthropic (required for LLM) - OPENAI_API_KEY API key for OpenAI (optional) - GITHUB_TOKEN GitHub token for image pulls - -EOF - exit 0 -} - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - --name) - TESTBED_NAME="$2" - shift 2 - ;; - --create-cluster) - CREATE_CLUSTER=true - shift - ;; - --destroy) - DESTROY=true - shift - ;; - --dry-run) - DRY_RUN=true - shift - ;; - --cluster) - SHARED_CLUSTER_NAME="$2" - shift 2 - ;; - --project) - GCP_PROJECT="$2" - shift 2 - ;; - --region) - GCP_REGION="$2" - shift 2 - ;; - --help|-h) - show_usage - ;; - *) - log_error "Unknown option: $1" - show_usage - ;; - esac -done - -# Determine namespace and cluster names -if [[ -n "$TESTBED_NAME" ]]; then - NAMESPACE="${NAMESPACE_PREFIX}-${TESTBED_NAME}" - if [[ "$CREATE_CLUSTER" == "true" ]]; then - CLUSTER_NAME="testbed-${TESTBED_NAME}" - else - CLUSTER_NAME="$SHARED_CLUSTER_NAME" - fi -else - NAMESPACE="${NAMESPACE_PREFIX}-shared" - CLUSTER_NAME="$SHARED_CLUSTER_NAME" -fi - -HOST_PREFIX="${NAMESPACE}" -APP_HOST="${HOST_PREFIX}.${DNS_DOMAIN}" -RUNTIME_HOST="runtime-${HOST_PREFIX}.${DNS_DOMAIN}" -AUTH_HOST="auth-${HOST_PREFIX}.${DNS_DOMAIN}" - -log_info "Configuration:" -log_info " GCP Project: $GCP_PROJECT" -log_info " GCP Region: $GCP_REGION" -log_info " Cluster: $CLUSTER_NAME" -log_info " Namespace: $NAMESPACE" -log_info " App URL: https://$APP_HOST" - -# Check prerequisites -check_prerequisites() { - log_info "Checking prerequisites..." 
- - local missing=() - - command -v gcloud >/dev/null 2>&1 || missing+=("gcloud") - command -v kubectl >/dev/null 2>&1 || missing+=("kubectl") - command -v helm >/dev/null 2>&1 || missing+=("helm") - - if [[ ${#missing[@]} -gt 0 ]]; then - log_error "Missing required tools: ${missing[*]}" - exit 1 - fi - - # Check gcloud auth - if ! gcloud auth list --filter=status:ACTIVE --format="value(account)" | grep -q .; then - log_error "Not authenticated with gcloud. Run: gcloud auth login" - exit 1 - fi - - log_success "Prerequisites check passed" -} - -# Create GKE cluster -create_cluster() { - log_info "Creating GKE cluster '$CLUSTER_NAME'..." - - if gcloud container clusters describe "$CLUSTER_NAME" --project="$GCP_PROJECT" --region="$GCP_REGION" >/dev/null 2>&1; then - log_warn "Cluster '$CLUSTER_NAME' already exists" - return 0 - fi - - if [[ "$DRY_RUN" == "true" ]]; then - log_info "[DRY-RUN] Would create cluster '$CLUSTER_NAME'" - return 0 - fi - - gcloud container clusters create "$CLUSTER_NAME" \ - --project="$GCP_PROJECT" \ - --region="$GCP_REGION" \ - --machine-type=e2-standard-4 \ - --num-nodes=1 \ - --enable-autoscaling \ - --min-nodes=1 \ - --max-nodes=5 \ - --disk-size=100 \ - --disk-type=pd-standard \ - --enable-ip-alias \ - --workload-pool="${GCP_PROJECT}.svc.id.goog" \ - --release-channel=regular \ - --no-enable-basic-auth \ - --metadata disable-legacy-endpoints=true - - log_success "Cluster '$CLUSTER_NAME' created" -} - -# Connect to cluster -connect_cluster() { - log_info "Connecting to cluster '$CLUSTER_NAME'..." - - if [[ "$DRY_RUN" == "true" ]]; then - log_info "[DRY-RUN] Would connect to cluster '$CLUSTER_NAME'" - return 0 - fi - - gcloud container clusters get-credentials "$CLUSTER_NAME" \ - --project="$GCP_PROJECT" \ - --region="$GCP_REGION" - - log_success "Connected to cluster" -} - -# Install third-party dependencies (Traefik, cert-manager) -install_dependencies() { - log_info "Installing cluster dependencies..." 
- - if [[ "$DRY_RUN" == "true" ]]; then - log_info "[DRY-RUN] Would install Traefik and cert-manager" - return 0 - fi - - # Check if Traefik is already installed - if ! helm list -n traefik 2>/dev/null | grep -q traefik; then - log_info "Installing Traefik..." - helm repo add traefik https://traefik.github.io/charts 2>/dev/null || true - helm repo update - kubectl create namespace traefik 2>/dev/null || true - helm upgrade --install traefik traefik/traefik \ - --namespace traefik \ - --set service.type=LoadBalancer \ - --wait - else - log_info "Traefik already installed" - fi - - # Check if cert-manager is already installed - if ! helm list -n cert-manager 2>/dev/null | grep -q cert-manager; then - log_info "Installing cert-manager..." - helm repo add jetstack https://charts.jetstack.io 2>/dev/null || true - helm repo update - kubectl create namespace cert-manager 2>/dev/null || true - helm upgrade --install cert-manager jetstack/cert-manager \ - --namespace cert-manager \ - --set crds.enabled=true \ - --wait - - # Create ClusterIssuer for Let's Encrypt - kubectl apply -f - << 'ISSUER_EOF' -apiVersion: cert-manager.io/v1 -kind: ClusterIssuer -metadata: - name: letsencrypt -spec: - acme: - server: https://acme-v02.api.letsencrypt.org/directory - email: platform-team@all-hands.dev - privateKeySecretRef: - name: letsencrypt-account-key - solvers: - - http01: - ingress: - class: traefik -ISSUER_EOF - else - log_info "cert-manager already installed" - fi - - log_success "Dependencies installed" -} - -# Create namespace and secrets -setup_namespace() { - log_info "Setting up namespace '$NAMESPACE'..." 
- - if [[ "$DRY_RUN" == "true" ]]; then - log_info "[DRY-RUN] Would create namespace and secrets" - return 0 - fi - - kubectl create namespace "$NAMESPACE" 2>/dev/null || log_info "Namespace already exists" - - # Generate random secrets - GLOBAL_SECRET=$(head /dev/urandom | LC_ALL=C tr -dc 'A-Za-z0-9' | head -c 32) - - # Create secrets if they don't exist - kubectl get secret jwt-secret -n "$NAMESPACE" >/dev/null 2>&1 || \ - kubectl create secret generic jwt-secret -n "$NAMESPACE" \ - --from-literal=jwt-secret="$GLOBAL_SECRET" - - kubectl get secret keycloak-realm -n "$NAMESPACE" >/dev/null 2>&1 || \ - kubectl create secret generic keycloak-realm -n "$NAMESPACE" \ - --from-literal=realm-name=allhands \ - --from-literal=server-url=http://keycloak \ - --from-literal=client-id=allhands \ - --from-literal=client-secret="$GLOBAL_SECRET" \ - --from-literal=smtp-password="" - - kubectl get secret keycloak-admin -n "$NAMESPACE" >/dev/null 2>&1 || \ - kubectl create secret generic keycloak-admin -n "$NAMESPACE" \ - --from-literal=admin-password="$GLOBAL_SECRET" - - kubectl get secret postgres-password -n "$NAMESPACE" >/dev/null 2>&1 || \ - kubectl create secret generic postgres-password -n "$NAMESPACE" \ - --from-literal=username=postgres \ - --from-literal=password="$GLOBAL_SECRET" \ - --from-literal=postgres-password="$GLOBAL_SECRET" - - kubectl get secret redis -n "$NAMESPACE" >/dev/null 2>&1 || \ - kubectl create secret generic redis -n "$NAMESPACE" \ - --from-literal=redis-password="$GLOBAL_SECRET" - - kubectl get secret lite-llm-api-key -n "$NAMESPACE" >/dev/null 2>&1 || \ - kubectl create secret generic lite-llm-api-key -n "$NAMESPACE" \ - --from-literal=lite-llm-api-key="$GLOBAL_SECRET" - - kubectl get secret admin-password -n "$NAMESPACE" >/dev/null 2>&1 || \ - kubectl create secret generic admin-password -n "$NAMESPACE" \ - --from-literal=admin-password="$GLOBAL_SECRET" - - kubectl get secret default-api-key -n "$NAMESPACE" >/dev/null 2>&1 || \ - kubectl create 
secret generic default-api-key -n "$NAMESPACE" \ - --from-literal=default-api-key="$GLOBAL_SECRET" - - kubectl get secret sandbox-api-key -n "$NAMESPACE" >/dev/null 2>&1 || \ - kubectl create secret generic sandbox-api-key -n "$NAMESPACE" \ - --from-literal=sandbox-api-key="$GLOBAL_SECRET" - - # Create LiteLLM env secrets if API keys are provided - if [[ -n "${ANTHROPIC_API_KEY:-}" ]] || [[ -n "${OPENAI_API_KEY:-}" ]]; then - local litellm_args=() - [[ -n "${ANTHROPIC_API_KEY:-}" ]] && litellm_args+=(--from-literal=ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY") - [[ -n "${OPENAI_API_KEY:-}" ]] && litellm_args+=(--from-literal=OPENAI_API_KEY="$OPENAI_API_KEY") - - kubectl delete secret litellm-env-secrets -n "$NAMESPACE" 2>/dev/null || true - kubectl create secret generic litellm-env-secrets -n "$NAMESPACE" "${litellm_args[@]}" - log_info "Created LiteLLM secrets with provided API keys" - else - log_warn "No ANTHROPIC_API_KEY or OPENAI_API_KEY provided. LLM functionality will not work." - log_warn "Set environment variables and re-run, or create secret manually:" - log_warn " kubectl create secret generic litellm-env-secrets -n $NAMESPACE --from-literal=ANTHROPIC_API_KEY=" - fi - - # Create GitHub image pull secret if token provided - if [[ -n "${GITHUB_TOKEN:-}" ]]; then - kubectl delete secret ghcr-login-secret -n "$NAMESPACE" 2>/dev/null || true - kubectl create secret docker-registry ghcr-login-secret -n "$NAMESPACE" \ - --docker-server=ghcr.io \ - --docker-username=openhands \ - --docker-password="$GITHUB_TOKEN" - log_info "Created GitHub container registry secret" - fi - - log_success "Namespace and secrets configured" -} - -# Generate values file for this deployment -generate_values() { - log_info "Generating Helm values..." 
>&2 - - local values_file="$SCRIPT_DIR/values-${NAMESPACE}.yaml" - - cat > "$values_file" << YAML_EOF -# Auto-generated testbed values for $NAMESPACE -# Generated: $(date) - -# Use in-cluster databases (no external dependencies) -postgresql: - enabled: true - primary: - persistence: - enabled: true - size: 10Gi - -redis: - enabled: true - -# Keycloak for authentication (no GitHub App required) -keycloak: - enabled: true - url: "https://${AUTH_HOST}" - ingress: - enabled: true - hostname: "${AUTH_HOST}" - annotations: - cert-manager.io/cluster-issuer: letsencrypt - -# Disable GitHub auth (use Keycloak email auth instead) -github: - enabled: false - -gitlab: - enabled: false - -bitbucket: - enabled: false - -# Main application ingress -ingress: - enabled: true - host: "${APP_HOST}" - class: traefik - root: - annotations: - cert-manager.io/cluster-issuer: letsencrypt - -tls: - enabled: true - -# Runtime API (for sandbox execution) -runtime-api: - enabled: true - runtimeInSameCluster: true - ingress: - enabled: true - host: "${RUNTIME_HOST}" - annotations: - cert-manager.io/cluster-issuer: letsencrypt - env: - RUNTIME_BASE_URL: "${HOST_PREFIX}.${DNS_DOMAIN}" - STORAGE_CLASS: "standard-rwo" - GCP_PROJECT: "${GCP_PROJECT}" - GCP_REGION: "${GCP_REGION}" - -sandbox: - apiHostname: "https://${RUNTIME_HOST}" - -# LiteLLM proxy for LLM access -litellm: - enabled: true - url: "http://litellm:4000" - -litellm-helm: - enabled: true - ingress: - enabled: false # Internal only for testbed - proxy_config: - environment_variables: - OR_APP_NAME: "OpenHands Testbed" - model_list: - - model_name: "anthropic/claude-sonnet-4-20250514" - litellm_params: - model: "anthropic/claude-sonnet-4-20250514" - api_key: "os.environ/ANTHROPIC_API_KEY" - -# Simplified environment for testbed -env: - OH_APP_MODE: "saas" - LITELLM_DEFAULT_MODEL: "litellm_proxy/anthropic/claude-sonnet-4-20250514" - HIDE_LLM_SETTINGS: "false" - GCP_PROJECT: "${GCP_PROJECT}" - GCP_REGION: "${GCP_REGION}" - -# Filestore - 
use ephemeral for testbed (simpler) -filestore: - ephemeral: true - -# Minimal resources for testbed -deployment: - replicas: 1 - resources: - requests: - memory: 1Gi - cpu: 500m - limits: - memory: 2Gi - cpu: 1000m - -# Disable production features -datadog: - enabled: false - -stripe: - enabled: false - -resend: - enabled: false - -automation: - enabled: false - -laminar: - enabled: false -YAML_EOF - - log_success "Values file generated: $values_file" >&2 - echo "$values_file" -} - -# Deploy OpenHands -deploy_openhands() { - log_info "Deploying OpenHands..." - - local values_file - values_file=$(generate_values) - - if [[ "$DRY_RUN" == "true" ]]; then - log_info "[DRY-RUN] Would deploy OpenHands with values:" - cat "$values_file" - return 0 - fi - - # Build helm dependencies - cd "$REPO_ROOT/charts/openhands" - helm dependency update - - # Deploy - helm upgrade --install openhands . \ - --namespace "$NAMESPACE" \ - --values "$values_file" \ - --wait \ - --timeout 10m - - log_success "OpenHands deployed!" -} - -# Destroy testbed -destroy_testbed() { - log_info "Destroying testbed '$NAMESPACE'..." - - if [[ "$DRY_RUN" == "true" ]]; then - log_info "[DRY-RUN] Would destroy namespace '$NAMESPACE'" - if [[ "$CREATE_CLUSTER" == "true" ]] && [[ -n "$TESTBED_NAME" ]]; then - log_info "[DRY-RUN] Would delete cluster '$CLUSTER_NAME'" - fi - return 0 - fi - - # Delete helm release - helm uninstall openhands --namespace "$NAMESPACE" 2>/dev/null || true - - # Delete namespace (this deletes all resources in it) - kubectl delete namespace "$NAMESPACE" --wait=true 2>/dev/null || true - - # Delete cluster if it was created for this testbed - if [[ "$CREATE_CLUSTER" == "true" ]] && [[ -n "$TESTBED_NAME" ]]; then - log_info "Deleting cluster '$CLUSTER_NAME'..." 
- gcloud container clusters delete "$CLUSTER_NAME" \ - --project="$GCP_PROJECT" \ - --region="$GCP_REGION" \ - --quiet - fi - - # Clean up values file - rm -f "$SCRIPT_DIR/values-${NAMESPACE}.yaml" - - log_success "Testbed destroyed" -} - -# Show deployment info -show_info() { - log_success "==========================================" - log_success "OpenHands Testbed Deployed!" - log_success "==========================================" - echo "" - echo "Application URL: https://$APP_HOST" - echo "Auth (Keycloak): https://$AUTH_HOST" - echo "Runtime API: https://$RUNTIME_HOST" - echo "" - echo "Namespace: $NAMESPACE" - echo "Cluster: $CLUSTER_NAME" - echo "" - echo "To access:" - echo " 1. Wait for LoadBalancer IP: kubectl get svc -n traefik" - echo " 2. Add DNS records pointing to the LoadBalancer IP" - echo " Or use port-forward: kubectl port-forward svc/openhands-service 3000:3000 -n $NAMESPACE" - echo "" - echo "To destroy:" - echo " ./deploy.sh --name $TESTBED_NAME --destroy" - echo "" - echo "To view logs:" - echo " kubectl logs -f deployment/openhands -n $NAMESPACE" -} - -# Main -main() { - check_prerequisites - - if [[ "$DESTROY" == "true" ]]; then - connect_cluster - destroy_testbed - exit 0 - fi - - if [[ "$CREATE_CLUSTER" == "true" ]]; then - create_cluster - fi - - connect_cluster - - if [[ "$CREATE_CLUSTER" == "true" ]]; then - install_dependencies - fi - - setup_namespace - deploy_openhands - show_info -} - -main diff --git a/scripts/testbed/setup-shared-cluster.sh b/scripts/testbed/setup-shared-cluster.sh deleted file mode 100755 index e22b0595..00000000 --- a/scripts/testbed/setup-shared-cluster.sh +++ /dev/null @@ -1,410 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Setup Shared OpenHands Testbed Cluster -# ====================================== -# One-time setup script to create the shared testbed infrastructure -# in Platform Team Sandbox. Run this once to set up the cluster, -# then use deploy.sh for individual deployments. 
- -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Configuration -GCP_PROJECT="${GCP_PROJECT:-platform-team-sandbox-62793}" -GCP_REGION="${GCP_REGION:-us-central1}" -CLUSTER_NAME="${CLUSTER_NAME:-openhands-testbed}" -DNS_ZONE_NAME="sandbox-all-hands-dev" -DNS_DOMAIN="sandbox.all-hands.dev" - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } -log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; } -log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } -log_error() { echo -e "${RED}[ERROR]${NC} $1"; } - -show_usage() { - cat << EOF -Setup Shared OpenHands Testbed Cluster - -This script creates the shared GKE cluster and DNS infrastructure -for the OpenHands testbed environment. Run this once before using -deploy.sh for individual deployments. - -Usage: $(basename "$0") [OPTIONS] - -Options: - --skip-cluster Skip GKE cluster creation (only setup addons) - --skip-dns Skip DNS zone creation - --destroy Destroy the shared cluster (CAUTION!) - --help Show this help message - -Environment Variables: - GCP_PROJECT GCP project ID (default: $GCP_PROJECT) - GCP_REGION GCP region (default: $GCP_REGION) - CLUSTER_NAME GKE cluster name (default: $CLUSTER_NAME) - -EOF - exit 0 -} - -SKIP_CLUSTER=false -SKIP_DNS=false -DESTROY=false - -while [[ $# -gt 0 ]]; do - case $1 in - --skip-cluster) - SKIP_CLUSTER=true - shift - ;; - --skip-dns) - SKIP_DNS=true - shift - ;; - --destroy) - DESTROY=true - shift - ;; - --help|-h) - show_usage - ;; - *) - log_error "Unknown option: $1" - show_usage - ;; - esac -done - -# Enable required APIs -enable_apis() { - log_info "Enabling required GCP APIs..." 
- - local apis=( - "container.googleapis.com" - "dns.googleapis.com" - "compute.googleapis.com" - "iam.googleapis.com" - "cloudresourcemanager.googleapis.com" - ) - - for api in "${apis[@]}"; do - gcloud services enable "$api" --project="$GCP_PROJECT" 2>/dev/null || true - done - - log_success "APIs enabled" -} - -# Create GKE cluster -create_cluster() { - log_info "Creating GKE cluster '$CLUSTER_NAME'..." - - if gcloud container clusters describe "$CLUSTER_NAME" \ - --project="$GCP_PROJECT" \ - --region="$GCP_REGION" >/dev/null 2>&1; then - log_warn "Cluster '$CLUSTER_NAME' already exists" - return 0 - fi - - # Create VPC network - local network_name="${CLUSTER_NAME}-network" - if ! gcloud compute networks describe "$network_name" --project="$GCP_PROJECT" >/dev/null 2>&1; then - log_info "Creating VPC network '$network_name'..." - gcloud compute networks create "$network_name" \ - --project="$GCP_PROJECT" \ - --subnet-mode=auto - fi - - # Create GKE cluster - gcloud container clusters create "$CLUSTER_NAME" \ - --project="$GCP_PROJECT" \ - --region="$GCP_REGION" \ - --network="$network_name" \ - --machine-type=e2-standard-4 \ - --num-nodes=1 \ - --enable-autoscaling \ - --min-nodes=1 \ - --max-nodes=10 \ - --disk-size=100 \ - --disk-type=pd-standard \ - --enable-ip-alias \ - --workload-pool="${GCP_PROJECT}.svc.id.goog" \ - --release-channel=regular \ - --no-enable-basic-auth \ - --metadata disable-legacy-endpoints=true \ - --addons=HttpLoadBalancing,HorizontalPodAutoscaling \ - --labels=environment=testbed,team=platform - - log_success "Cluster '$CLUSTER_NAME' created" -} - -# Connect to cluster -connect_cluster() { - log_info "Connecting to cluster..." - gcloud container clusters get-credentials "$CLUSTER_NAME" \ - --project="$GCP_PROJECT" \ - --region="$GCP_REGION" - log_success "Connected to cluster" -} - -# Create DNS zone -create_dns_zone() { - log_info "Setting up DNS zone for '$DNS_DOMAIN'..." 
- - if gcloud dns managed-zones describe "$DNS_ZONE_NAME" \ - --project="$GCP_PROJECT" >/dev/null 2>&1; then - log_warn "DNS zone '$DNS_ZONE_NAME' already exists" - return 0 - fi - - gcloud dns managed-zones create "$DNS_ZONE_NAME" \ - --project="$GCP_PROJECT" \ - --description="DNS zone for OpenHands testbed" \ - --dns-name="${DNS_DOMAIN}." - - log_success "DNS zone created" - - # Show NS records - log_info "DNS zone NS records (delegate these from parent zone):" - gcloud dns managed-zones describe "$DNS_ZONE_NAME" \ - --project="$GCP_PROJECT" \ - --format="value(nameServers)" -} - -# Install Traefik -install_traefik() { - log_info "Installing Traefik ingress controller..." - - if helm list -n traefik 2>/dev/null | grep -q traefik; then - log_warn "Traefik already installed" - return 0 - fi - - helm repo add traefik https://traefik.github.io/charts 2>/dev/null || true - helm repo update - - kubectl create namespace traefik 2>/dev/null || true - - helm upgrade --install traefik traefik/traefik \ - --namespace traefik \ - --set service.type=LoadBalancer \ - --set service.annotations."cloud\.google\.com/load-balancer-type"=External \ - --set ingressClass.enabled=true \ - --set ingressClass.isDefaultClass=true \ - --set providers.kubernetesIngress.publishedService.enabled=true \ - --wait - - log_success "Traefik installed" - - # Wait for LoadBalancer IP - log_info "Waiting for LoadBalancer IP..." - local max_wait=120 - local waited=0 - local lb_ip="" - - while [[ -z "$lb_ip" ]] && [[ $waited -lt $max_wait ]]; do - lb_ip=$(kubectl get svc traefik -n traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || true) - if [[ -z "$lb_ip" ]]; then - sleep 5 - waited=$((waited + 5)) - fi - done - - if [[ -n "$lb_ip" ]]; then - log_success "LoadBalancer IP: $lb_ip" - echo "" - echo "Add wildcard DNS record: *.${DNS_DOMAIN} -> $lb_ip" - else - log_warn "LoadBalancer IP not yet assigned. 
Check later with:" - log_warn " kubectl get svc traefik -n traefik" - fi -} - -# Install cert-manager -install_cert_manager() { - log_info "Installing cert-manager..." - - if helm list -n cert-manager 2>/dev/null | grep -q cert-manager; then - log_warn "cert-manager already installed" - else - helm repo add jetstack https://charts.jetstack.io 2>/dev/null || true - helm repo update - - kubectl create namespace cert-manager 2>/dev/null || true - - helm upgrade --install cert-manager jetstack/cert-manager \ - --namespace cert-manager \ - --set crds.enabled=true \ - --wait - - log_success "cert-manager installed" - fi - - # Create ClusterIssuers - log_info "Creating ClusterIssuers..." - - kubectl apply -f - << 'EOF' -apiVersion: cert-manager.io/v1 -kind: ClusterIssuer -metadata: - name: letsencrypt -spec: - acme: - server: https://acme-v02.api.letsencrypt.org/directory - email: platform-team@all-hands.dev - privateKeySecretRef: - name: letsencrypt-account-key - solvers: - - http01: - ingress: - class: traefik ---- -apiVersion: cert-manager.io/v1 -kind: ClusterIssuer -metadata: - name: letsencrypt-staging -spec: - acme: - server: https://acme-staging-v02.api.letsencrypt.org/directory - email: platform-team@all-hands.dev - privateKeySecretRef: - name: letsencrypt-staging-account-key - solvers: - - http01: - ingress: - class: traefik -EOF - - log_success "ClusterIssuers created" -} - -# Create storage class -create_storage_class() { - log_info "Checking storage classes..." - - # Check if standard-rwo already exists (it's created by default in GKE) - if kubectl get storageclass standard-rwo >/dev/null 2>&1; then - log_info "Storage class 'standard-rwo' already exists" - else - log_info "Creating storage class 'standard-rwo'..." 
- kubectl apply -f - << 'EOF' -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: standard-rwo - annotations: - storageclass.kubernetes.io/is-default-class: "true" -provisioner: pd.csi.storage.gke.io -parameters: - type: pd-standard -volumeBindingMode: WaitForFirstConsumer -allowVolumeExpansion: true -EOF - fi - - log_success "Storage classes ready" -} - -# Destroy everything -destroy_cluster() { - log_warn "This will destroy the shared testbed cluster and all deployments!" - read -p "Are you sure? (type 'yes' to confirm): " confirm - - if [[ "$confirm" != "yes" ]]; then - log_info "Aborted" - exit 1 - fi - - log_info "Destroying cluster '$CLUSTER_NAME'..." - - # Delete cluster - gcloud container clusters delete "$CLUSTER_NAME" \ - --project="$GCP_PROJECT" \ - --region="$GCP_REGION" \ - --quiet || true - - # Delete DNS zone - log_info "Deleting DNS zone..." - gcloud dns managed-zones delete "$DNS_ZONE_NAME" \ - --project="$GCP_PROJECT" \ - --quiet || true - - # Delete VPC network - local network_name="${CLUSTER_NAME}-network" - log_info "Deleting VPC network..." - gcloud compute networks delete "$network_name" \ - --project="$GCP_PROJECT" \ - --quiet || true - - log_success "Shared cluster destroyed" -} - -# Show status -show_status() { - echo "" - log_success "==========================================" - log_success "Shared Testbed Cluster Ready!" - log_success "==========================================" - echo "" - echo "Cluster: $CLUSTER_NAME" - echo "Project: $GCP_PROJECT" - echo "Region: $GCP_REGION" - echo "DNS Domain: $DNS_DOMAIN" - echo "" - - local lb_ip - lb_ip=$(kubectl get svc traefik -n traefik -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo "pending") - echo "LoadBalancer IP: $lb_ip" - echo "" - - if [[ "$lb_ip" != "pending" ]]; then - echo "DNS Setup Required:" - echo " Add wildcard A record: *.${DNS_DOMAIN} -> $lb_ip" - echo "" - fi - - echo "Next Steps:" - echo " 1. 
Set up DNS wildcard record (see above)" - echo " 2. Deploy your testbed:" - echo " cd $(dirname "$SCRIPT_DIR")" - echo " ./testbed/deploy.sh --name " - echo "" -} - -# Main -main() { - log_info "Setting up shared OpenHands testbed cluster..." - log_info "Project: $GCP_PROJECT" - log_info "Region: $GCP_REGION" - log_info "Cluster: $CLUSTER_NAME" - echo "" - - if [[ "$DESTROY" == "true" ]]; then - destroy_cluster - exit 0 - fi - - enable_apis - - if [[ "$SKIP_DNS" != "true" ]]; then - create_dns_zone - fi - - if [[ "$SKIP_CLUSTER" != "true" ]]; then - create_cluster - fi - - connect_cluster - create_storage_class - install_traefik - install_cert_manager - - show_status -} - -main diff --git a/scripts/testbed/values-testbed-test.yaml b/scripts/testbed/values-testbed-test.yaml deleted file mode 100644 index f7c2f3a5..00000000 --- a/scripts/testbed/values-testbed-test.yaml +++ /dev/null @@ -1,125 +0,0 @@ -# Auto-generated testbed values for testbed-test -# Generated: Thu Apr 16 16:35:44 PDT 2026 - -# Use in-cluster databases (no external dependencies) -postgresql: - enabled: true - auth: - username: postgres - database: openhands - primary: - persistence: - enabled: true - size: 10Gi - -redis: - enabled: true - -# Keycloak for authentication (no GitHub App required) -# Using internal URL for init container to work without DNS -# For production with DNS, change url to https://auth-testbed-test.sandbox.all-hands.dev -keycloak: - enabled: true - url: "http://keycloak" - ingress: - enabled: true - hostname: "auth-testbed-test.sandbox.all-hands.dev" - annotations: - cert-manager.io/cluster-issuer: letsencrypt - -# Disable GitHub auth (use Keycloak email auth instead) -github: - enabled: false - -gitlab: - enabled: false - -bitbucket: - enabled: false - -# Main application ingress -ingress: - enabled: true - host: "testbed-test.sandbox.all-hands.dev" - class: traefik - root: - annotations: - cert-manager.io/cluster-issuer: letsencrypt - -tls: - enabled: true - -# Runtime API 
(for sandbox execution) -runtime-api: - enabled: true - runtimeInSameCluster: true - ingress: - enabled: true - host: "runtime-testbed-test.sandbox.all-hands.dev" - annotations: - cert-manager.io/cluster-issuer: letsencrypt - env: - RUNTIME_BASE_URL: "testbed-test.sandbox.all-hands.dev" - STORAGE_CLASS: "standard-rwo" - GCP_PROJECT: "platform-team-sandbox-62793" - GCP_REGION: "us-central1" - -sandbox: - apiHostname: "https://runtime-testbed-test.sandbox.all-hands.dev" - -# LiteLLM proxy for LLM access -litellm: - enabled: true - url: "http://litellm:4000" - -litellm-helm: - enabled: true - ingress: - enabled: false # Internal only for testbed - proxy_config: - environment_variables: - OR_APP_NAME: "OpenHands Testbed" - model_list: - - model_name: "anthropic/claude-sonnet-4-20250514" - litellm_params: - model: "anthropic/claude-sonnet-4-20250514" - api_key: "os.environ/ANTHROPIC_API_KEY" - -# Simplified environment for testbed -env: - OH_APP_MODE: "saas" - LITELLM_DEFAULT_MODEL: "litellm_proxy/anthropic/claude-sonnet-4-20250514" - HIDE_LLM_SETTINGS: "false" - GCP_PROJECT: "platform-team-sandbox-62793" - GCP_REGION: "us-central1" - -# Filestore - use ephemeral for testbed (simpler) -filestore: - ephemeral: true - -# Minimal resources for testbed -deployment: - replicas: 1 - resources: - requests: - memory: 1Gi - cpu: 500m - limits: - memory: 2Gi - cpu: 1000m - -# Disable production features -datadog: - enabled: false - -stripe: - enabled: false - -resend: - enabled: false - -automation: - enabled: false - -laminar: - enabled: false