diff --git a/.env.example b/.env.example index 8d91968..3ed36de 100644 --- a/.env.example +++ b/.env.example @@ -32,6 +32,8 @@ API_KEY_CACHE_TTL=300 RATE_LIMIT_ENABLED=true # Redis Configuration +# Deployment mode: standalone (default), cluster, or sentinel +REDIS_MODE=standalone REDIS_HOST=localhost REDIS_PORT=6379 REDIS_PASSWORD= @@ -42,6 +44,33 @@ REDIS_MAX_CONNECTIONS=20 REDIS_SOCKET_TIMEOUT=5 REDIS_SOCKET_CONNECT_TIMEOUT=5 +# Optional key prefix — useful when sharing a Redis instance across environments +# All keys will be stored as <prefix><key> (e.g. "prod:sessions:abc") +REDIS_KEY_PREFIX= + +# Redis Cluster Mode (REDIS_MODE=cluster) +# Comma-separated list of host:port pairs for cluster startup nodes +# REDIS_CLUSTER_NODES=node1:6379,node2:6379,node3:6379 + +# Redis Sentinel Mode (REDIS_MODE=sentinel) +# Comma-separated list of host:port pairs for Sentinel instances +# REDIS_SENTINEL_NODES=sentinel1:26379,sentinel2:26379,sentinel3:26379 +# REDIS_SENTINEL_MASTER=mymaster +# REDIS_SENTINEL_PASSWORD= + +# Redis TLS/SSL Configuration +# Required for most managed Redis services (GCP Memorystore, AWS ElastiCache, Azure Cache) +REDIS_TLS_ENABLED=false +# REDIS_TLS_CA_CERT_FILE=/path/to/ca.crt +# REDIS_TLS_CERT_FILE=/path/to/client.crt +# REDIS_TLS_KEY_FILE=/path/to/client.key +# REDIS_TLS_INSECURE=false +# Hostname verification is off by default because managed Redis services +# and Redis Cluster mode expose node IPs that don't match cert CN/SAN. +# The CA certificate chain is still fully verified. Enable hostname +# checking when your Redis server hostnames match certificate CN/SAN. 
+# REDIS_TLS_CHECK_HOSTNAME=false + # MinIO/S3 Configuration MINIO_ENDPOINT=localhost:9000 MINIO_ACCESS_KEY=minioadmin @@ -144,6 +173,37 @@ METRICS_ARCHIVE_RETENTION_DAYS=90 ENABLE_NETWORK_ISOLATION=true ENABLE_FILESYSTEM_ISOLATION=true +# Kubernetes Execution Configuration +# Execution mode: 'agent' (default, recommended) or 'nsenter' (legacy) +# agent: Executor-agent binary runs inside the main container. +# No nsenter, no capabilities, no privilege escalation. +# Compatible with GKE Sandbox (gVisor) and restricted Pod Security Standards. +# nsenter: Sidecar uses nsenter to enter the main container's mount namespace. +# Requires shareProcessNamespace, SYS_PTRACE/SYS_ADMIN/SYS_CHROOT caps, +# and allowPrivilegeEscalation: true. NOT compatible with GKE Sandbox. +K8S_EXECUTION_MODE=agent +# K8S_EXECUTOR_PORT=9090 # Port for the executor-agent HTTP server (agent mode only) + +# Sidecar image — must match the execution mode: +# agent mode: aronmuon/kubecoderun-sidecar-agent:latest (default) +# nsenter mode: aronmuon/kubecoderun-sidecar-nsenter:latest +# K8S_SIDECAR_IMAGE=aronmuon/kubecoderun-sidecar-agent:latest + +# Image pull policy for execution pods (Always, IfNotPresent, Never) +# K8S_IMAGE_PULL_POLICY=Always + +# Image pull secrets for private container registries (comma-separated secret names) +# These Kubernetes secrets must already exist in the execution namespace. +# Leave empty or unset if not using private registries. 
+# K8S_IMAGE_PULL_SECRETS=my-registry-secret,another-secret + +# GKE Sandbox (gVisor) Configuration +# Requires K8S_EXECUTION_MODE=agent (nsenter is incompatible with gVisor) +# GKE_SANDBOX_ENABLED=false +# GKE_SANDBOX_RUNTIME_CLASS=gvisor +# GKE_SANDBOX_NODE_SELECTOR={} +# GKE_SANDBOX_CUSTOM_TOLERATIONS=[] + # WAN Network Access Configuration # When enabled, execution containers can access the public internet # but are blocked from accessing host, other containers, and private networks diff --git a/.gitignore b/.gitignore index 557bbf9..86d1a16 100644 --- a/.gitignore +++ b/.gitignore @@ -200,3 +200,5 @@ config/local.py # Hatch auto-generated version file _version.py + +.pdm-python diff --git a/docker-compose.redis-cluster-tls.yml b/docker-compose.redis-cluster-tls.yml new file mode 100644 index 0000000..019984f --- /dev/null +++ b/docker-compose.redis-cluster-tls.yml @@ -0,0 +1,235 @@ +# Redis Cluster with TLS for integration testing +# +# This mimics a production GCP Memorystore Redis Cluster setup: +# - 6-node cluster (3 masters + 3 replicas) with TLS enabled +# - No authentication (no password) +# - Server-side TLS with CA verification (no mutual TLS / no client certs) +# - Accessible on localhost ports 6380-6385 (TLS) +# +# Prerequisites: +# cd tests/tls-certs && ./generate.sh (generates CA + server certificates) +# cd tests/tls-certs && ./cleanup.sh (removes generated certs) +# +# Usage: +# docker compose -f docker-compose.redis-cluster-tls.yml up -d +# +# Test with: +# redis-cli -c -p 6380 --tls --cacert tests/tls-certs/ca.crt CLUSTER INFO + +services: + redis-tls-node-0: + image: redis:7-alpine + container_name: redis-tls-cluster-0 + ports: + - "127.0.0.1:6380:6380" + - "127.0.0.1:16380:16380" + volumes: + - redis-tls-cluster-0:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6380 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + 
--tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6380", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-1: + image: redis:7-alpine + container_name: redis-tls-cluster-1 + ports: + - "127.0.0.1:6381:6381" + - "127.0.0.1:16381:16381" + volumes: + - redis-tls-cluster-1:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6381 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6381", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-2: + image: redis:7-alpine + container_name: redis-tls-cluster-2 + ports: + - "127.0.0.1:6382:6382" + - "127.0.0.1:16382:16382" + volumes: + - redis-tls-cluster-2:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6382 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6382", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-3: + image: redis:7-alpine + container_name: redis-tls-cluster-3 + ports: + 
- "127.0.0.1:6383:6383" + - "127.0.0.1:16383:16383" + volumes: + - redis-tls-cluster-3:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6383 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6383", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-4: + image: redis:7-alpine + container_name: redis-tls-cluster-4 + ports: + - "127.0.0.1:6384:6384" + - "127.0.0.1:16384:16384" + volumes: + - redis-tls-cluster-4:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6384 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6384", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + redis-tls-node-5: + image: redis:7-alpine + container_name: redis-tls-cluster-5 + ports: + - "127.0.0.1:6385:6385" + - "127.0.0.1:16385:16385" + volumes: + - redis-tls-cluster-5:/data + - ./tests/tls-certs:/tls:ro + command: > + redis-server + --port 0 + --tls-port 6385 + --tls-cert-file /tls/redis.crt + --tls-key-file /tls/redis.key + --tls-ca-cert-file /tls/ca.crt + --tls-auth-clients no + --tls-replication yes + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + 
--protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "6385", "--tls", "--cert", "/tls/redis.crt", "--key", "/tls/redis.key", "--cacert", "/tls/ca.crt", "ping"] + interval: 5s + timeout: 3s + retries: 10 + + # Initializer: creates TLS cluster from the 6 nodes + redis-tls-cluster-init: + image: redis:7-alpine + container_name: redis-tls-cluster-init + volumes: + - ./tests/tls-certs:/tls:ro + depends_on: + redis-tls-node-0: + condition: service_healthy + redis-tls-node-1: + condition: service_healthy + redis-tls-node-2: + condition: service_healthy + redis-tls-node-3: + condition: service_healthy + redis-tls-node-4: + condition: service_healthy + redis-tls-node-5: + condition: service_healthy + restart: "no" + entrypoint: + - sh + - -c + - | + echo 'Creating Redis TLS Cluster...' && + redis-cli --cluster create redis-tls-node-0:6380 redis-tls-node-1:6381 redis-tls-node-2:6382 redis-tls-node-3:6383 redis-tls-node-4:6384 redis-tls-node-5:6385 --cluster-replicas 1 --cluster-yes --tls --cert /tls/redis.crt --key /tls/redis.key --cacert /tls/ca.crt && + echo 'Redis TLS Cluster created successfully' && + redis-cli -h redis-tls-node-0 -p 6380 --tls --cert /tls/redis.crt --key /tls/redis.key --cacert /tls/ca.crt CLUSTER INFO + +volumes: + redis-tls-cluster-0: + redis-tls-cluster-1: + redis-tls-cluster-2: + redis-tls-cluster-3: + redis-tls-cluster-4: + redis-tls-cluster-5: diff --git a/docker-compose.redis-cluster.yml b/docker-compose.redis-cluster.yml new file mode 100644 index 0000000..7f62c52 --- /dev/null +++ b/docker-compose.redis-cluster.yml @@ -0,0 +1,182 @@ +# Redis Cluster for integration testing +# +# Usage: +# docker compose -f docker-compose.redis-cluster.yml up -d +# +# This creates a 6-node Redis Cluster (3 masters + 3 replicas) +# accessible on localhost ports 7000-7005. 
+# +# Test with: redis-cli -c -p 7000 CLUSTER INFO + +services: + redis-node-0: + image: redis:7-alpine + container_name: redis-cluster-0 + ports: + - "127.0.0.1:7000:7000" + - "127.0.0.1:17000:17000" + volumes: + - redis-cluster-0:/data + command: > + redis-server + --port 7000 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7000", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-1: + image: redis:7-alpine + container_name: redis-cluster-1 + ports: + - "127.0.0.1:7001:7001" + - "127.0.0.1:17001:17001" + volumes: + - redis-cluster-1:/data + command: > + redis-server + --port 7001 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7001", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-2: + image: redis:7-alpine + container_name: redis-cluster-2 + ports: + - "127.0.0.1:7002:7002" + - "127.0.0.1:17002:17002" + volumes: + - redis-cluster-2:/data + command: > + redis-server + --port 7002 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7002", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-3: + image: redis:7-alpine + container_name: redis-cluster-3 + ports: + - "127.0.0.1:7003:7003" + - "127.0.0.1:17003:17003" + volumes: + - redis-cluster-3:/data + command: > + redis-server + --port 7003 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7003", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-4: + image: redis:7-alpine + 
container_name: redis-cluster-4 + ports: + - "127.0.0.1:7004:7004" + - "127.0.0.1:17004:17004" + volumes: + - redis-cluster-4:/data + command: > + redis-server + --port 7004 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7004", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + redis-node-5: + image: redis:7-alpine + container_name: redis-cluster-5 + ports: + - "127.0.0.1:7005:7005" + - "127.0.0.1:17005:17005" + volumes: + - redis-cluster-5:/data + command: > + redis-server + --port 7005 + --cluster-enabled yes + --cluster-config-file nodes.conf + --cluster-node-timeout 5000 + --appendonly yes + --bind 0.0.0.0 + --protected-mode no + healthcheck: + test: ["CMD", "redis-cli", "-p", "7005", "ping"] + interval: 5s + timeout: 3s + retries: 5 + + # Initializer: creates cluster from the 6 nodes + redis-cluster-init: + image: redis:7-alpine + container_name: redis-cluster-init + depends_on: + redis-node-0: + condition: service_healthy + redis-node-1: + condition: service_healthy + redis-node-2: + condition: service_healthy + redis-node-3: + condition: service_healthy + redis-node-4: + condition: service_healthy + redis-node-5: + condition: service_healthy + restart: "no" + entrypoint: > + sh -c " + echo 'Creating Redis Cluster...' 
&& + redis-cli --cluster create redis-node-0:7000 redis-node-1:7001 redis-node-2:7002 redis-node-3:7003 redis-node-4:7004 redis-node-5:7005 --cluster-replicas 1 --cluster-yes && + echo 'Redis Cluster created successfully' && + redis-cli -h redis-node-0 -p 7000 CLUSTER INFO + " + +volumes: + redis-cluster-0: + redis-cluster-1: + redis-cluster-2: + redis-cluster-3: + redis-cluster-4: + redis-cluster-5: diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 30049f5..ee57b72 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -135,18 +135,23 @@ Manages API key authentication and security. ### Redis Configuration -Redis is used for session management and caching. - -| Variable | Default | Description | -| ------------------------------ | ----------- | -------------------------------------------------- | -| `REDIS_HOST` | `localhost` | Redis server hostname | -| `REDIS_PORT` | `6379` | Redis server port | -| `REDIS_PASSWORD` | - | Redis password (if required) | -| `REDIS_DB` | `0` | Redis database number | -| `REDIS_URL` | - | Complete Redis URL (overrides individual settings) | -| `REDIS_MAX_CONNECTIONS` | `20` | Maximum connections in pool | -| `REDIS_SOCKET_TIMEOUT` | `5` | Socket timeout (seconds) | -| `REDIS_SOCKET_CONNECT_TIMEOUT` | `5` | Connection timeout (seconds) | +Redis is used for session management and caching. Three deployment modes are supported: +**standalone** (default), **cluster**, and **sentinel** — all with optional TLS/SSL. 
+ +#### Connection Settings + +| Variable | Default | Description | +| ------------------------------ | ------------- | -------------------------------------------------------- | +| `REDIS_MODE` | `standalone` | Deployment mode: `standalone`, `cluster`, or `sentinel` | +| `REDIS_HOST` | `localhost` | Redis server hostname | +| `REDIS_PORT` | `6379` | Redis server port | +| `REDIS_PASSWORD` | - | Redis password (if required) | +| `REDIS_DB` | `0` | Redis database number (standalone/sentinel only) | +| `REDIS_URL` | - | Complete Redis URL (overrides individual settings) | +| `REDIS_MAX_CONNECTIONS` | `20` | Maximum connections in pool | +| `REDIS_SOCKET_TIMEOUT` | `5` | Socket timeout (seconds) | +| `REDIS_SOCKET_CONNECT_TIMEOUT` | `5` | Connection timeout (seconds) | +| `REDIS_KEY_PREFIX` | - | Optional prefix prepended to every Redis key (e.g. `prod:`) | **Example Redis URL:** @@ -154,6 +159,83 @@ Redis is used for session management and caching. REDIS_URL=redis://password@localhost:6379/0 ``` +#### Redis Cluster Mode + +Use `REDIS_MODE=cluster` when running against a Redis Cluster deployment (e.g. GCP Memorystore Cluster, AWS ElastiCache Cluster Mode). + +| Variable | Default | Description | +| ---------------------- | ------- | --------------------------------------------------------------------------- | +| `REDIS_CLUSTER_NODES` | - | Comma-separated `host:port` pairs for cluster startup nodes | + +> **Note:** `REDIS_DB` is ignored in cluster mode (Redis Cluster only supports database 0). + +**Example:** + +```bash +REDIS_MODE=cluster +REDIS_CLUSTER_NODES=node1:6379,node2:6379,node3:6379 +REDIS_PASSWORD=your-cluster-password +``` + +#### Redis Sentinel Mode + +Use `REDIS_MODE=sentinel` for high-availability setups with Redis Sentinel. 
+ +| Variable | Default | Description | +| -------------------------- | ---------- | ------------------------------------------------------------ | +| `REDIS_SENTINEL_NODES` | - | Comma-separated `host:port` pairs for Sentinel instances | +| `REDIS_SENTINEL_MASTER` | `mymaster` | Name of the Sentinel-monitored master | +| `REDIS_SENTINEL_PASSWORD` | - | Password for authenticating to Sentinel instances | + +**Example:** + +```bash +REDIS_MODE=sentinel +REDIS_SENTINEL_NODES=sentinel1:26379,sentinel2:26379,sentinel3:26379 +REDIS_SENTINEL_MASTER=mymaster +REDIS_PASSWORD=your-redis-password +REDIS_SENTINEL_PASSWORD=your-sentinel-password +``` + +#### Redis TLS/SSL + +Enable TLS for encrypted connections. Required by most managed Redis services (GCP Memorystore, AWS ElastiCache, Azure Cache for Redis). + +| Variable | Default | Description | +| ------------------------------ | ------- | ---------------------------------------------------------------- | +| `REDIS_TLS_ENABLED` | `false` | Enable TLS/SSL for Redis connections | +| `REDIS_TLS_CA_CERT_FILE` | - | Path to CA certificate for verifying the server | +| `REDIS_TLS_CERT_FILE` | - | Path to client TLS certificate (mutual TLS) | +| `REDIS_TLS_KEY_FILE` | - | Path to client TLS private key (mutual TLS) | +| `REDIS_TLS_INSECURE` | `false` | Skip TLS certificate verification (NOT recommended) | +| `REDIS_TLS_CHECK_HOSTNAME` | `false` | Verify server hostname against certificate CN/SAN | + +> When `REDIS_TLS_ENABLED=true` the generated URL uses the `rediss://` scheme automatically. +> +> **Security note:** `REDIS_TLS_CHECK_HOSTNAME` is `false` by default because managed Redis services +> (GCP Memorystore, AWS ElastiCache) and Redis Cluster node discovery expose IP addresses +> that do not match certificate CN/SAN entries. The CA certificate chain is still fully +> validated. For environments where Redis hostnames match their certificates, set +> `REDIS_TLS_CHECK_HOSTNAME=true` for stronger TLS authentication. 
+ +**Example — GCP Memorystore with TLS:** + +```bash +REDIS_HOST=10.0.0.3 +REDIS_PORT=6378 +REDIS_TLS_ENABLED=true +REDIS_TLS_CA_CERT_FILE=/etc/ssl/redis/server-ca.pem +``` + +**Example — GCP Memorystore Cluster:** + +```bash +REDIS_MODE=cluster +REDIS_CLUSTER_NODES=10.0.0.3:6379,10.0.0.4:6379,10.0.0.5:6379 +REDIS_TLS_ENABLED=true +REDIS_TLS_CA_CERT_FILE=/etc/ssl/redis/server-ca.pem +``` + ### MinIO/S3 Configuration MinIO provides S3-compatible object storage for files. @@ -175,22 +257,128 @@ Kubernetes is used for secure code execution in isolated pods. | Variable | Default | Description | | ---------------------- | -------------------------------------------- | ---------------------------------------- | | `K8S_NAMESPACE` | `""` (uses API's namespace) | Namespace for execution pods | -| `K8S_SIDECAR_IMAGE` | `aronmuon/kubecoderun-sidecar:latest` | HTTP sidecar image for pod communication | +| `K8S_SIDECAR_IMAGE` | `aronmuon/kubecoderun-sidecar-agent:latest` | HTTP sidecar image for pod communication | | `K8S_IMAGE_REGISTRY` | `aronmuon/kubecoderun` | Registry prefix for language images | | `K8S_IMAGE_TAG` | `latest` | Image tag for language images | | `K8S_CPU_LIMIT` | `1` | CPU limit per execution pod | | `K8S_MEMORY_LIMIT` | `512Mi` | Memory limit per execution pod | | `K8S_CPU_REQUEST` | `100m` | CPU request per execution pod | | `K8S_MEMORY_REQUEST` | `128Mi` | Memory request per execution pod | +| `K8S_EXECUTION_MODE` | `agent` | Execution mode: `agent` (default) or `nsenter` | +| `K8S_EXECUTOR_PORT` | `9090` | Port for the executor HTTP server inside the main container | +| `K8S_IMAGE_PULL_POLICY`| `Always` | Image pull policy for execution pods (`Always`, `IfNotPresent`, `Never`) | +| `K8S_IMAGE_PULL_SECRETS`| `""` | Comma-separated list of Kubernetes secret names for pulling images from private registries | + +**Image Pull Secrets:** + +When using private container registries, create Kubernetes secrets in the execution namespace and reference them via 
`K8S_IMAGE_PULL_SECRETS`: + +```bash +# Create the secret +kubectl create secret docker-registry my-registry-secret \ + --docker-server=ghcr.io \ + --docker-username=<username> \ + --docker-password=<access-token> \ + -n <execution-namespace> + +# Configure the API +K8S_IMAGE_PULL_SECRETS=my-registry-secret +# Multiple secrets: K8S_IMAGE_PULL_SECRETS=secret1,secret2 +``` + +The secrets are applied to all dynamically created execution pods (both warm pool pods and on-demand Job pods). + +**Execution Modes:** + +- **`agent` (default):** A lightweight Go HTTP server runs inside the main container. The sidecar forwards execution requests via localhost. No `nsenter`, no capabilities, no privilege escalation. Compatible with GKE Sandbox (gVisor) and restricted Pod Security Standards. +- **`nsenter` (legacy):** The sidecar uses `nsenter` to enter the main container's mount namespace. Requires `shareProcessNamespace`, `SYS_PTRACE`/`SYS_ADMIN`/`SYS_CHROOT` capabilities, and `allowPrivilegeEscalation: true`. Use only on clusters that allow privilege escalation. **Security Notes:** - Both containers run with `runAsNonRoot: true` and `runAsUser: 65532` -- The sidecar uses file capabilities (`setcap`) on the `nsenter` binary to allow non-root users to enter namespaces -- Required pod capabilities (SYS_PTRACE, SYS_ADMIN, SYS_CHROOT) must be in the bounding set with `allowPrivilegeEscalation: true` +- In agent mode: all capabilities are dropped, `allowPrivilegeEscalation: false` for all containers +- In nsenter mode: the sidecar uses file capabilities (`setcap`) on the `nsenter` binary to allow non-root namespace entry - Network policies deny all egress by default - Pods are destroyed immediately after execution -- See [SECURITY.md](SECURITY.md) for detailed explanation of the nsenter privilege model +- See [SECURITY.md](SECURITY.md) for detailed explanation of the security model + +#### Sidecar Container Images + +The sidecar Dockerfile produces two distinct images via Docker build targets. 
Use the image that matches your configured `K8S_EXECUTION_MODE`: + +| Build Target | Image Name | Execution Mode | Description | +|-------------|------------|---------------|-------------| +| `sidecar-agent` (default) | `kubecoderun-sidecar-agent` | `agent` | Contains executor-agent binary; no nsenter, no capabilities | +| `sidecar-nsenter` | `kubecoderun-sidecar-nsenter` | `nsenter` | Contains nsenter with file capabilities (setcap) | + +**Building the images:** + +```bash +# Agent mode sidecar (default, recommended): +docker build --target sidecar-agent \ + -t kubecoderun-sidecar-agent:latest \ + -f docker/sidecar/Dockerfile docker/sidecar/ + +# nsenter mode sidecar (legacy): +docker build --target sidecar-nsenter \ + -t kubecoderun-sidecar-nsenter:latest \ + -f docker/sidecar/Dockerfile docker/sidecar/ + +# Or use the build script (builds both automatically): +./scripts/build-images.sh sidecar-agent # agent mode sidecar +./scripts/build-images.sh sidecar-nsenter # nsenter mode sidecar +./scripts/build-images.sh # all images (both sidecars) +``` + +**Helm chart configuration:** + +Update `values.yaml` to use the correct sidecar image for your execution mode: + +```yaml +execution: + executionMode: "agent" # or "nsenter" + sidecar: + # For agent mode (default): + repository: ghcr.io/your-org/kubecoderun-sidecar-agent + # For nsenter mode: + # repository: ghcr.io/your-org/kubecoderun-sidecar-nsenter +``` + +### GKE Sandbox (gVisor) Configuration + +[GKE Sandbox](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods) provides kernel-level isolation using gVisor to protect the host kernel from untrusted code. It is **only compatible with agent execution mode**. 
+ +| Variable | Default | Description | +| ----------------------------------- | --------- | -------------------------------------------------- | +| `GKE_SANDBOX_ENABLED` | `false` | Enable GKE Sandbox (gVisor) for execution pods | +| `GKE_SANDBOX_RUNTIME_CLASS` | `gvisor` | RuntimeClass name for sandboxed pods | +| `GKE_SANDBOX_NODE_SELECTOR` | `{}` | JSON node selector for sandbox nodes | +| `GKE_SANDBOX_CUSTOM_TOLERATIONS` | `[]` | JSON array of custom tolerations for sandbox nodes | + +**Requirements:** + +- `K8S_EXECUTION_MODE=agent` (nsenter is **incompatible** with gVisor) +- GKE cluster with a sandbox-enabled node pool (`--sandbox type=gvisor`) +- At least two node pools — one with GKE Sandbox enabled, one without +- Container-Optimized OS with containerd (`cos_containerd`) node image + +**Example configuration:** + +```bash +K8S_EXECUTION_MODE=agent +GKE_SANDBOX_ENABLED=true +GKE_SANDBOX_RUNTIME_CLASS=gvisor +# Schedule on specific sandbox node pool: +GKE_SANDBOX_NODE_SELECTOR={"pool":"sandbox"} +GKE_SANDBOX_CUSTOM_TOLERATIONS=[{"key":"pool","value":"sandbox","operator":"Equal","effect":"NoSchedule"}] +``` + +**Key limitations of GKE Sandbox** (see [GKE docs](https://docs.cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods#limitations)): + +- Incompatible with `nsenter` execution mode, privileged containers, and `shareProcessNamespace` (all avoided in agent mode) +- Seccomp, AppArmor, and SELinux not applicable inside the sandbox +- HostPath volumes and port-forwarding not supported +- Container-level memory metrics not available (pod-level metrics are) ### Resource Limits @@ -395,6 +583,10 @@ if validate_configuration(): - [ ] Deploy Kubernetes NetworkPolicy to deny egress - [ ] Configure pod security context (non-root user) - [ ] Review and adjust resource limits +- [ ] Choose execution mode (`K8S_EXECUTION_MODE=agent` recommended) +- [ ] Ensure sidecar image matches execution mode (`sidecar-agent` for agent, `sidecar-nsenter` for nsenter) +- [ 
] Configure `K8S_IMAGE_PULL_SECRETS` if using private registries +- [ ] Enable GKE Sandbox for additional kernel isolation if running on GKE (`GKE_SANDBOX_ENABLED=true`) ### Performance diff --git a/helm-deployments/kubecoderun/templates/configmap.yaml b/helm-deployments/kubecoderun/templates/configmap.yaml index a20df80..3c0fd29 100644 --- a/helm-deployments/kubecoderun/templates/configmap.yaml +++ b/helm-deployments/kubecoderun/templates/configmap.yaml @@ -36,15 +36,32 @@ data: K8S_IMAGE_REGISTRY: {{ .Values.execution.imageRegistry | quote }} K8S_IMAGE_TAG: {{ $imageTag | quote }} K8S_IMAGE_PULL_POLICY: {{ .Values.execution.imagePullPolicy | quote }} + {{- if .Values.execution.imagePullSecrets }} + K8S_IMAGE_PULL_SECRETS: {{ join "," (pluck "name" .Values.execution.imagePullSecrets) | quote }} + {{- else }} + K8S_IMAGE_PULL_SECRETS: "" + {{- end }} K8S_CPU_LIMIT: {{ .Values.execution.resources.limits.cpu | quote }} K8S_MEMORY_LIMIT: {{ .Values.execution.resources.limits.memory | quote }} K8S_CPU_REQUEST: {{ .Values.execution.resources.requests.cpu | quote }} K8S_MEMORY_REQUEST: {{ .Values.execution.resources.requests.memory | quote }} K8S_RUN_AS_USER: {{ .Values.execution.securityContext.runAsUser | quote }} + K8S_EXECUTION_MODE: {{ .Values.execution.executionMode | quote }} + K8S_EXECUTOR_PORT: {{ .Values.execution.executorPort | quote }} K8S_SECCOMP_PROFILE_TYPE: {{ .Values.execution.securityContext.seccompProfile.type | quote }} K8S_JOB_TTL_SECONDS: {{ .Values.execution.jobs.ttlSecondsAfterFinished | quote }} K8S_JOB_DEADLINE_SECONDS: {{ .Values.execution.jobs.activeDeadlineSeconds | quote }} + # GKE Sandbox Configuration + GKE_SANDBOX_ENABLED: {{ .Values.execution.gkeSandbox.enabled | quote }} + GKE_SANDBOX_RUNTIME_CLASS: {{ .Values.execution.gkeSandbox.runtimeClassName | quote }} + {{- if .Values.execution.gkeSandbox.nodeSelector }} + GKE_SANDBOX_NODE_SELECTOR: {{ .Values.execution.gkeSandbox.nodeSelector | toJson | quote }} + {{- end }} + {{- if 
.Values.execution.gkeSandbox.customTolerations }} + GKE_SANDBOX_CUSTOM_TOLERATIONS: {{ .Values.execution.gkeSandbox.customTolerations | toJson | quote }} + {{- end }} + # Pod Lifecycle POD_TTL_MINUTES: {{ .Values.execution.podTtlMinutes | quote }} POD_CLEANUP_INTERVAL_MINUTES: {{ .Values.execution.podCleanupIntervalMinutes | quote }} @@ -302,10 +319,41 @@ data: WAN_NETWORK_NAME: {{ .Values.network.wan.networkName | quote }} WAN_DNS_SERVERS: {{ .Values.network.wan.dnsServers | toJson | quote }} - # Redis Advanced Configuration + # Redis Configuration + REDIS_MODE: {{ .Values.redis.mode | quote }} + {{- if .Values.redis.host }} + REDIS_HOST: {{ .Values.redis.host | quote }} + {{- end }} + REDIS_PORT: {{ .Values.redis.port | quote }} + REDIS_DB: {{ .Values.redis.db | quote }} REDIS_MAX_CONNECTIONS: {{ .Values.redis.maxConnections | quote }} REDIS_SOCKET_TIMEOUT: {{ .Values.redis.socketTimeout | quote }} REDIS_SOCKET_CONNECT_TIMEOUT: {{ .Values.redis.socketConnectTimeout | quote }} + {{- if .Values.redis.keyPrefix }} + REDIS_KEY_PREFIX: {{ .Values.redis.keyPrefix | quote }} + {{- end }} + {{- if .Values.redis.clusterNodes }} + REDIS_CLUSTER_NODES: {{ .Values.redis.clusterNodes | quote }} + {{- end }} + {{- if .Values.redis.sentinelNodes }} + REDIS_SENTINEL_NODES: {{ .Values.redis.sentinelNodes | quote }} + {{- end }} + REDIS_SENTINEL_MASTER: {{ .Values.redis.sentinelMaster | quote }} + {{- if .Values.redis.sentinelPassword }} + REDIS_SENTINEL_PASSWORD: {{ .Values.redis.sentinelPassword | quote }} + {{- end }} + REDIS_TLS_ENABLED: {{ .Values.redis.tls.enabled | quote }} + {{- if .Values.redis.tls.caCertFile }} + REDIS_TLS_CA_CERT_FILE: {{ .Values.redis.tls.caCertFile | quote }} + {{- end }} + {{- if .Values.redis.tls.certFile }} + REDIS_TLS_CERT_FILE: {{ .Values.redis.tls.certFile | quote }} + {{- end }} + {{- if .Values.redis.tls.keyFile }} + REDIS_TLS_KEY_FILE: {{ .Values.redis.tls.keyFile | quote }} + {{- end }} + REDIS_TLS_INSECURE: {{ .Values.redis.tls.insecure | 
quote }} + REDIS_TLS_CHECK_HOSTNAME: {{ .Values.redis.tls.checkHostname | quote }} # MinIO/S3 Configuration {{- if not .Values.secretsStore.enabled }} diff --git a/helm-deployments/kubecoderun/templates/secret.yaml b/helm-deployments/kubecoderun/templates/secret.yaml index 2e22a17..8003502 100644 --- a/helm-deployments/kubecoderun/templates/secret.yaml +++ b/helm-deployments/kubecoderun/templates/secret.yaml @@ -20,8 +20,11 @@ stringData: {{- end }} {{- end }} {{- if not .Values.redis.existingSecret }} - # Redis URL + # Redis URL (standalone mode) and password (all modes) REDIS_URL: {{ include "kubecoderun.redisUrl" . | quote }} + {{- if .Values.redis.password }} + REDIS_PASSWORD: {{ .Values.redis.password | quote }} + {{- end }} {{- end }} {{- if and (not .Values.minio.existingSecret) (not .Values.minio.useIAM) }} # S3-Compatible Storage Credentials (Garage/MinIO/S3) diff --git a/helm-deployments/kubecoderun/values.yaml b/helm-deployments/kubecoderun/values.yaml index e06bbd6..956aa4a 100644 --- a/helm-deployments/kubecoderun/values.yaml +++ b/helm-deployments/kubecoderun/values.yaml @@ -99,6 +99,10 @@ redis: # When set, the url/host/port/password/db fields below are ignored # Expected secret key: REDIS_URL (full connection string) existingSecret: "" + + # Deployment mode: standalone (default), cluster, or sentinel + mode: "standalone" + # External Redis URL (required unless existingSecret is set) url: "redis://redis:6379/0" # Or specify individual fields @@ -111,6 +115,35 @@ redis: socketTimeout: 5 socketConnectTimeout: 5 + # Optional key prefix prepended to every Redis key. + # Useful when sharing a Redis instance across environments. 
+ keyPrefix: "" + + # Redis Cluster mode (mode: cluster) + # Comma-separated host:port pairs for cluster startup nodes + clusterNodes: "" + + # Redis Sentinel mode (mode: sentinel) + # Comma-separated host:port pairs for sentinel instances + sentinelNodes: "" + sentinelMaster: "mymaster" + sentinelPassword: "" + + # TLS/SSL settings (all modes) + tls: + enabled: false + # Path to CA certificate inside the container + caCertFile: "" + # Client certificate and key for mutual TLS + certFile: "" + keyFile: "" + # Skip server certificate verification (NOT recommended for production) + insecure: false + # Verify server hostname against certificate CN/SAN. + # Off by default because managed Redis services and cluster mode + # expose node IPs that typically don't match certificate names. + checkHostname: false + minio: # Reference an existing Kubernetes Secret containing S3 credentials # When set, the accessKey/secretKey fields below are ignored @@ -172,17 +205,36 @@ execution: # Image pull policy for execution pods (IfNotPresent, Always, Never) imagePullPolicy: "IfNotPresent" + # Image pull secrets for private registries (applies to execution pods) + # Example: + # imagePullSecrets: + # - name: secret-for-registry + # - name: another-secret + imagePullSecrets: [] + # Service account for execution pods (with pod/job create permissions) serviceAccount: create: true name: "kubecoderun-executor" annotations: {} + # Execution mode: "agent" (default) or "nsenter" (legacy) + # - agent: Executor agent runs inside main container. No nsenter, no capabilities, + # no privilege escalation. Compatible with GKE Sandbox (gVisor). Requires the + # sidecar-agent image (default build target). + # - nsenter: Sidecar uses nsenter to enter the main container's namespace. Requires + # the sidecar-nsenter image, shareProcessNamespace, SYS_PTRACE/SYS_ADMIN/SYS_CHROOT + # capabilities, and allowPrivilegeEscalation: true. 
+ executionMode: "agent" + + # Port for the executor HTTP server inside the main container + executorPort: 9090 + # Sidecar container configuration - # CRITICAL: User code runs in sidecar's cgroup via nsenter (Issue #32) - # These resource limits apply to user code execution, not the main container + # In nsenter mode: user code runs in sidecar's cgroup via nsenter + # In agent mode: sidecar only proxies requests, user code runs in main container's cgroup sidecar: - repository: ghcr.io/aron-muon/kubecoderun-sidecar + repository: ghcr.io/aron-muon/kubecoderun-sidecar-agent # tag defaults to Chart.AppVersion if not specified tag: "" port: 8080 @@ -285,6 +337,40 @@ execution: enabled: true denyEgress: true + # GKE Sandbox (gVisor) Configuration + # Provides additional kernel isolation for untrusted workloads using gVisor + # See: https://docs.cloud.google.com/kubernetes-engine/docs/concepts/sandbox-pods + gkeSandbox: + # Enable GKE Sandbox for execution pods. + # WARNING: When enabled, pods require nodes with the gVisor runtime class. + # Pods will stay Pending on clusters without sandbox-enabled node pools. 
+ enabled: false + + # Runtime class name (default: gvisor for GKE) + runtimeClassName: "gvisor" + + # Node selector for sandbox-enabled nodes + # GKE automatically adds sandbox.gke.io/runtime=gvisor to sandbox nodes + # Add additional selectors here if needed (e.g., for specific node pools) + nodeSelector: {} + # Example: + # sandbox.gke.io/runtime: gvisor + # cloud.google.com/gke-nodepool: sandbox-pool + + # Custom tolerations for node pool taints + # GKE automatically adds toleration for sandbox.gke.io/runtime=gvisor + # Use this for additional custom taints (e.g., dedicated sandbox node pools) + customTolerations: [] + # Example: + # - key: pool + # operator: Equal + # value: sandbox + # effect: NoSchedule + # - key: sandbox.gke.io/runtime + # operator: Equal + # value: gvisor + # effect: NoSchedule + # Resource Limits Configuration resourceLimits: # Execution limits diff --git a/src/config/__init__.py b/src/config/__init__.py index aee6bfa..48a55b0 100644 --- a/src/config/__init__.py +++ b/src/config/__init__.py @@ -88,15 +88,71 @@ class Settings(BaseSettings): rate_limit_enabled: bool = Field(default=True, description="Enable per-key rate limiting for Redis-managed keys") # Redis Configuration + redis_mode: Literal["standalone", "cluster", "sentinel"] = Field( + default="standalone", + description="Redis deployment mode: standalone, cluster, or sentinel", + ) redis_host: str = Field(default="localhost") redis_port: int = Field(default=6379, ge=1, le=65535) - redis_password: str | None = Field(default=None) + redis_password: str | None = Field(default=None, description="Redis password (empty string treated as no password)") redis_db: int = Field(default=0, ge=0, le=15) redis_url: str | None = Field(default=None) redis_max_connections: int = Field(default=20, ge=1) redis_socket_timeout: int = Field(default=5, ge=1) redis_socket_connect_timeout: int = Field(default=5, ge=1) + # Redis Cluster + redis_cluster_nodes: str | None = Field( + default=None, + 
description="Comma-separated host:port pairs for Redis Cluster startup nodes", + ) + + # Redis Sentinel + redis_sentinel_nodes: str | None = Field( + default=None, + description="Comma-separated host:port pairs for Sentinel instances", + ) + redis_sentinel_master: str = Field( + default="mymaster", + description="Name of the Sentinel-monitored master", + ) + redis_sentinel_password: str | None = Field( + default=None, + description="Password for authenticating to Sentinel instances", + ) + + # Redis Key Prefix + redis_key_prefix: str = Field( + default="", + description="Optional prefix prepended to every Redis key (e.g. 'prod:', 'kubecoderun:')", + ) + + # Redis TLS/SSL + redis_tls_enabled: bool = Field( + default=False, + description="Enable TLS/SSL for Redis connections", + ) + redis_tls_cert_file: str | None = Field( + default=None, + description="Path to client TLS certificate (mutual TLS)", + ) + redis_tls_key_file: str | None = Field( + default=None, + description="Path to client TLS private key (mutual TLS)", + ) + redis_tls_ca_cert_file: str | None = Field( + default=None, + description="Path to CA certificate for verifying the server", + ) + redis_tls_insecure: bool = Field( + default=False, + description="Skip TLS certificate verification (NOT recommended for production)", + ) + redis_tls_check_hostname: bool = Field( + default=False, + description="Enable TLS hostname verification (off by default for managed Redis / cluster)", + ) + # MinIO/S3 Configuration minio_endpoint: str = Field(default="localhost:9000") minio_access_key: str | None = Field(default=None) @@ -407,6 +463,12 @@ def _set_supported_languages(cls, data): } return data + # Service Version Override (set at deploy time to override build-time version) + service_version: str | None = Field( + default=None, + description="Runtime version override (e.g. '2.1.4'). 
Falls back to build-time version from _version.py.", + ) + # Logging Configuration log_level: str = Field(default="INFO") log_format: str = Field(default="json") @@ -435,6 +497,38 @@ def parse_api_keys(cls, v): """Parse comma-separated API keys into a list.""" return [key.strip() for key in v.split(",") if key.strip()] if v else None + @field_validator("redis_host", mode="before") + @classmethod + def sanitize_redis_host(cls, v): + """Strip accidental URL scheme from Redis host.""" + if isinstance(v, str): + for scheme in ("rediss://", "redis://"): + if v.lower().startswith(scheme): + v = v[len(scheme) :].rstrip("/") + break + return v + + @field_validator("redis_password", "redis_sentinel_password", mode="before") + @classmethod + def sanitize_redis_password(cls, v): + """Convert empty password strings to None. + + Kubernetes / Helm often set REDIS_PASSWORD="" which pydantic reads + as empty string. Passing an empty password to redis-py sends + AUTH "" which fails when the server has no auth configured. 
+ """ + if isinstance(v, str) and v.strip() == "": + return None + return v + + @field_validator("redis_cluster_nodes", "redis_sentinel_nodes", mode="before") + @classmethod + def sanitize_redis_nodes(cls, v): + """Convert empty node lists to None so code falls back to host:port.""" + if isinstance(v, str) and v.strip() == "": + return None + return v + @field_validator("minio_endpoint") @classmethod def validate_minio_endpoint(cls, v): @@ -470,6 +564,7 @@ def api(self) -> APIConfig: def redis(self) -> RedisConfig: """Access Redis configuration group.""" return RedisConfig( + redis_mode=self.redis_mode, redis_host=self.redis_host, redis_port=self.redis_port, redis_password=self.redis_password, @@ -478,6 +573,17 @@ def redis(self) -> RedisConfig: redis_max_connections=self.redis_max_connections, redis_socket_timeout=self.redis_socket_timeout, redis_socket_connect_timeout=self.redis_socket_connect_timeout, + redis_cluster_nodes=self.redis_cluster_nodes, + redis_sentinel_nodes=self.redis_sentinel_nodes, + redis_sentinel_master=self.redis_sentinel_master, + redis_sentinel_password=self.redis_sentinel_password, + redis_key_prefix=self.redis_key_prefix, + redis_tls_enabled=self.redis_tls_enabled, + redis_tls_cert_file=self.redis_tls_cert_file, + redis_tls_key_file=self.redis_tls_key_file, + redis_tls_ca_cert_file=self.redis_tls_ca_cert_file, + redis_tls_insecure=self.redis_tls_insecure, + redis_tls_check_hostname=self.redis_tls_check_hostname, ) @property diff --git a/src/config/redis.py b/src/config/redis.py index 153f11e..f668528 100644 --- a/src/config/redis.py +++ b/src/config/redis.py @@ -1,11 +1,25 @@ -"""Redis configuration.""" +"""Redis configuration. -from pydantic import Field +Supports three deployment modes: +- **standalone** (default): Single Redis instance. +- **cluster**: Redis Cluster with automatic slot routing. +- **sentinel**: Redis Sentinel for high-availability failover. 
+ +TLS/SSL is supported in all modes and is required for most managed Redis +services such as GCP Memorystore, AWS ElastiCache, and Azure Cache for Redis. +""" + +from typing import Literal + +from pydantic import Field, field_validator from pydantic_settings import BaseSettings, SettingsConfigDict class RedisConfig(BaseSettings): - """Redis connection settings.""" + """Redis connection settings. + + Supports standalone, cluster, and sentinel modes with optional TLS. + """ model_config = SettingsConfigDict( env_prefix="", @@ -13,6 +27,14 @@ class RedisConfig(BaseSettings): populate_by_name=True, ) + # -- Connection mode ------------------------------------------------------- + mode: Literal["standalone", "cluster", "sentinel"] = Field( + default="standalone", + alias="redis_mode", + description="Redis deployment mode: standalone, cluster, or sentinel", + ) + + # -- Basic connection (standalone / single-entry for cluster & sentinel) --- host: str = Field(default="localhost", alias="redis_host") port: int = Field(default=6379, ge=1, le=65535, alias="redis_port") password: str | None = Field(default=None, alias="redis_password") @@ -22,9 +44,193 @@ class RedisConfig(BaseSettings): socket_timeout: int = Field(default=5, ge=1, alias="redis_socket_timeout") socket_connect_timeout: int = Field(default=5, ge=1, alias="redis_socket_connect_timeout") + # -- Cluster mode ---------------------------------------------------------- + cluster_nodes: str | None = Field( + default=None, + alias="redis_cluster_nodes", + description=( + "Comma-separated list of host:port pairs for Redis Cluster startup nodes. " + "Example: 'node1:6379,node2:6379,node3:6379'" + ), + ) + + # -- Sentinel mode --------------------------------------------------------- + sentinel_nodes: str | None = Field( + default=None, + alias="redis_sentinel_nodes", + description=( + "Comma-separated list of host:port pairs for Sentinel instances. 
" + "Example: 'sentinel1:26379,sentinel2:26379,sentinel3:26379'" + ), + ) + sentinel_master: str = Field( + default="mymaster", + alias="redis_sentinel_master", + description="Name of the Sentinel-monitored master.", + ) + sentinel_password: str | None = Field( + default=None, + alias="redis_sentinel_password", + description="Password for authenticating to Sentinel instances (if different from Redis password).", + ) + + # -- Key prefix ------------------------------------------------------------ + key_prefix: str = Field( + default="", + alias="redis_key_prefix", + description=( + "Optional prefix prepended to every Redis key. " + "Useful for sharing a single Redis instance across multiple environments " + "or applications (e.g. 'prod:', 'staging:', 'kubecoderun:'). " + "Must end with a separator like ':' if you want one." + ), + ) + + # -- TLS / SSL ------------------------------------------------------------- + tls_enabled: bool = Field( + default=False, + alias="redis_tls_enabled", + description="Enable TLS/SSL for Redis connections.", + ) + tls_cert_file: str | None = Field( + default=None, + alias="redis_tls_cert_file", + description="Path to client TLS certificate file (mutual TLS).", + ) + tls_key_file: str | None = Field( + default=None, + alias="redis_tls_key_file", + description="Path to client TLS private key file (mutual TLS).", + ) + tls_ca_cert_file: str | None = Field( + default=None, + alias="redis_tls_ca_cert_file", + description="Path to CA certificate file for verifying the server.", + ) + tls_insecure: bool = Field( + default=False, + alias="redis_tls_insecure", + description="Skip TLS certificate verification (NOT recommended for production).", + ) + tls_check_hostname: bool = Field( + default=False, + alias="redis_tls_check_hostname", + description=( + "Enable TLS hostname verification. 
Disabled by default because " + "managed Redis services (GCP Memorystore, AWS ElastiCache) and " + "Redis Cluster mode expose node IPs that typically do not match " + "the certificate CN/SAN entries. The certificate chain is still " + "verified against the CA when tls_insecure is False." + ), + ) + + # -- Validators ------------------------------------------------------------ + + @field_validator("host", mode="before") + @classmethod + def _sanitize_host(cls, v: str) -> str: + """Strip an accidental URL scheme from the host value. + + Users sometimes set ``REDIS_HOST=rediss://hostname`` instead of just + ``REDIS_HOST=hostname``. This validator normalises the value so that + downstream code always receives a plain hostname or IP. + """ + if isinstance(v, str): + for scheme in ("rediss://", "redis://"): + if v.lower().startswith(scheme): + v = v[len(scheme) :] + # Drop any trailing slash left over + v = v.rstrip("/") + break + return v + + @field_validator("password", "sentinel_password", mode="before") + @classmethod + def _empty_string_to_none(cls, v: str | None) -> str | None: + """Convert empty strings to ``None``. + + Kubernetes ConfigMaps and Helm values often set ``REDIS_PASSWORD: ""`` + which pydantic-settings reads as ``""`` rather than ``None``. Passing + an empty password to redis-py causes it to send ``AUTH ""`` which + fails when the server has no authentication configured. + """ + if isinstance(v, str) and v.strip() == "": + return None + return v + + @field_validator("cluster_nodes", "sentinel_nodes", mode="before") + @classmethod + def _empty_nodes_to_none(cls, v: str | None) -> str | None: + """Convert empty/whitespace-only node lists to ``None``. + + Helm values default to ``clusterNodes: ""`` which renders in the + ConfigMap as an empty string. This validator treats it the same + as "not set" so the code falls back to ``host:port``. 
+ """ + if isinstance(v, str) and v.strip() == "": + return None + return v + + # -- Helpers --------------------------------------------------------------- + def get_url(self) -> str: - """Get Redis connection URL.""" + """Get Redis connection URL (standalone mode only). + + For cluster/sentinel modes the URL is not used; startup nodes are + provided separately. This method honours an explicit ``url`` and + automatically switches between the ``redis://`` and ``rediss://`` + scheme based on the ``tls_enabled`` flag. + """ if self.url: return self.url + scheme = "rediss" if self.tls_enabled else "redis" password_part = f":{self.password}@" if self.password else "" - return f"redis://{password_part}{self.host}:{self.port}/{self.db}" + return f"{scheme}://{password_part}{self.host}:{self.port}/{self.db}" + + def get_tls_kwargs(self) -> dict: + """Build keyword arguments for redis-py SSL/TLS configuration. + + Returns an empty dict when TLS is disabled so callers can safely + unpack the result: ``redis.Redis(**config.get_tls_kwargs())``. + """ + if not self.tls_enabled: + return {} + + import ssl + + kwargs: dict = {"ssl": True} + + if self.tls_insecure: + kwargs["ssl_cert_reqs"] = ssl.CERT_NONE + kwargs["ssl_check_hostname"] = False + else: + kwargs["ssl_cert_reqs"] = ssl.CERT_REQUIRED + # Hostname checking is off by default because managed Redis + # services (GCP Memorystore, AWS ElastiCache) and Redis + # Cluster node discovery return IPs that do not match the + # certificate CN/SAN. The certificate chain is still fully + # validated against the CA. 
+ kwargs["ssl_check_hostname"] = self.tls_check_hostname + + if self.tls_ca_cert_file: + kwargs["ssl_ca_certs"] = self.tls_ca_cert_file + if self.tls_cert_file: + kwargs["ssl_certfile"] = self.tls_cert_file + if self.tls_key_file: + kwargs["ssl_keyfile"] = self.tls_key_file + + return kwargs + + def parse_nodes(self, raw: str) -> list[tuple[str, int]]: + """Parse a comma-separated ``host:port`` string into a list of tuples.""" + nodes: list[tuple[str, int]] = [] + for entry in raw.split(","): + entry = entry.strip() + if not entry: + continue + if ":" in entry: + h, p = entry.rsplit(":", 1) + nodes.append((h.strip(), int(p.strip()))) + else: + nodes.append((entry, self.port)) + return nodes diff --git a/src/core/pool.py b/src/core/pool.py index 21e9baa..26629cc 100644 --- a/src/core/pool.py +++ b/src/core/pool.py @@ -2,15 +2,33 @@ This module provides centralized connection pools for external services, allowing efficient resource sharing across the application. + +Supported Redis deployment modes: +- **standalone** (default): Single Redis server with ``ConnectionPool``. +- **cluster**: Redis Cluster via ``RedisCluster``. +- **sentinel**: Redis Sentinel via ``Sentinel`` for HA failover. + +All modes support optional TLS/SSL for managed services such as +GCP Memorystore, AWS ElastiCache, and Azure Cache for Redis. """ -from typing import Optional +from __future__ import annotations + +from typing import TYPE_CHECKING import redis.asyncio as redis import structlog +from redis.asyncio.cluster import RedisCluster +from redis.asyncio.sentinel import Sentinel +from redis.backoff import ExponentialBackoff +from redis.exceptions import ConnectionError, TimeoutError +from redis.retry import Retry from ..config import settings +if TYPE_CHECKING: + from ..config.redis import RedisConfig + logger = structlog.get_logger(__name__) @@ -18,76 +36,209 @@ class RedisPool: """Centralized async Redis connection pool. 
Provides a shared connection pool for all services that need Redis, - avoiding the overhead of multiple separate pools. + avoiding the overhead of multiple separate pools. Supports standalone, + cluster, and sentinel modes with optional TLS. Usage: client = redis_pool.get_client() await client.set("key", "value") """ - def __init__(self): + def __init__(self) -> None: self._pool: redis.ConnectionPool | None = None - self._client: redis.Redis | None = None - self._initialized = False + self._client: redis.Redis | RedisCluster | None = None + self._sentinel: Sentinel | None = None + self._initialized: bool = False + self._mode: str = "standalone" + self._key_prefix: str = "" def _initialize(self) -> None: - """Initialize the connection pool lazily.""" + """Initialize the connection pool lazily based on the configured mode.""" if self._initialized: return try: - redis_url = settings.get_redis_url() - self._pool = redis.ConnectionPool.from_url( - redis_url, - max_connections=20, # Shared across all services - decode_responses=True, - socket_timeout=5.0, - socket_connect_timeout=5.0, - retry_on_timeout=True, - ) - self._client = redis.Redis(connection_pool=self._pool) + redis_cfg = settings.redis + self._mode = redis_cfg.mode + self._key_prefix = redis_cfg.key_prefix + tls_kwargs = redis_cfg.get_tls_kwargs() + max_conns = redis_cfg.max_connections + socket_timeout = float(redis_cfg.socket_timeout) + socket_connect_timeout = float(redis_cfg.socket_connect_timeout) + + if self._mode == "cluster": + self._init_cluster(redis_cfg, tls_kwargs, max_conns, socket_timeout, socket_connect_timeout) + elif self._mode == "sentinel": + self._init_sentinel(redis_cfg, tls_kwargs, max_conns, socket_timeout, socket_connect_timeout) + else: + self._init_standalone(redis_cfg, tls_kwargs, max_conns, socket_timeout, socket_connect_timeout) + self._initialized = True - logger.info( - "Redis connection pool initialized", - max_connections=20, - url=redis_url.split("@")[-1], # Don't log 
password - ) except Exception as e: - logger.error("Failed to initialize Redis pool", error=str(e)) - # Create a fallback client - self._client = redis.from_url("redis://localhost:6379/0", decode_responses=True) - self._initialized = True - - def get_client(self) -> redis.Redis: + logger.error( + "Failed to initialize Redis pool", + error=str(e), + mode=self._mode, + ) + raise + + # -- Mode-specific initialisers ------------------------------------------- + + def _init_standalone( + self, + cfg: RedisConfig, + tls_kwargs: dict, + max_conns: int, + socket_timeout: float, + socket_connect_timeout: float, + ) -> None: + redis_url = cfg.get_url() + self._pool = redis.ConnectionPool.from_url( + redis_url, + max_connections=max_conns, + decode_responses=True, + socket_timeout=socket_timeout, + socket_connect_timeout=socket_connect_timeout, + retry_on_timeout=True, + **tls_kwargs, + ) + self._client = redis.Redis(connection_pool=self._pool) + logger.info( + "Redis standalone connection pool initialized", + max_connections=max_conns, + tls=cfg.tls_enabled, + url=redis_url.split("@")[-1], + ) + + def _init_cluster( + self, + cfg: RedisConfig, + tls_kwargs: dict, + max_conns: int, + socket_timeout: float, + socket_connect_timeout: float, + ) -> None: + if cfg.cluster_nodes: + startup_nodes = [redis.cluster.ClusterNode(host=h, port=p) for h, p in cfg.parse_nodes(cfg.cluster_nodes)] + else: + startup_nodes = [redis.cluster.ClusterNode(host=cfg.host, port=cfg.port)] + + self._client = RedisCluster( + startup_nodes=startup_nodes, + password=cfg.password, + decode_responses=True, + max_connections=max_conns, + socket_timeout=socket_timeout, + socket_connect_timeout=socket_connect_timeout, + retry=Retry(ExponentialBackoff(), retries=3), + retry_on_error=[ConnectionError, TimeoutError], + **tls_kwargs, + ) + logger.info( + "Redis cluster connection initialized", + startup_nodes=[ + f"{h}:{p}" + for h, p in (cfg.parse_nodes(cfg.cluster_nodes) if cfg.cluster_nodes else [(cfg.host, 
cfg.port)]) + ], + tls=cfg.tls_enabled, + ) + + def _init_sentinel( + self, + cfg: RedisConfig, + tls_kwargs: dict, + max_conns: int, + socket_timeout: float, + socket_connect_timeout: float, + ) -> None: + if cfg.sentinel_nodes: + sentinel_hosts = cfg.parse_nodes(cfg.sentinel_nodes) + else: + sentinel_hosts = [(cfg.host, 26379)] + + self._sentinel = Sentinel( + sentinels=sentinel_hosts, + password=cfg.sentinel_password, + socket_timeout=socket_timeout, + socket_connect_timeout=socket_connect_timeout, + **tls_kwargs, + ) + self._client = self._sentinel.master_for( + service_name=cfg.sentinel_master, + password=cfg.password, + decode_responses=True, + socket_timeout=socket_timeout, + socket_connect_timeout=socket_connect_timeout, + max_connections=max_conns, + retry_on_timeout=True, + **tls_kwargs, + ) + logger.info( + "Redis sentinel connection initialized", + sentinel_nodes=[f"{h}:{p}" for h, p in sentinel_hosts], + master=cfg.sentinel_master, + tls=cfg.tls_enabled, + ) + + # -- Public API ----------------------------------------------------------- + + def get_client(self) -> redis.Redis | RedisCluster: """Get an async Redis client from the shared pool. Returns: - Async Redis client instance connected to the shared pool + Async Redis client instance connected to the shared pool. + For cluster mode this is a ``RedisCluster`` instance which + exposes the same command interface. """ if not self._initialized: self._initialize() assert self._client is not None, "Redis client not initialized" return self._client + @property + def key_prefix(self) -> str: + """Return the configured Redis key prefix (may be empty).""" + if not self._initialized: + self._initialize() + return self._key_prefix + + def make_key(self, key: str) -> str: + """Prepend the configured key prefix to *key*. + + Returns *key* unchanged when no prefix is configured. 
+ """ + prefix = self.key_prefix + if prefix: + return f"{prefix}{key}" + return key + @property def pool_stats(self) -> dict: """Get connection pool statistics.""" - if not self._pool: - return {"initialized": False} + if not self._pool and self._mode == "standalone": + return {"initialized": self._initialized, "mode": self._mode} + + stats: dict = {"initialized": self._initialized, "mode": self._mode} + + if self._key_prefix: + stats["key_prefix"] = self._key_prefix + + if self._pool: + stats["max_connections"] = self._pool.max_connections - return { - "initialized": True, - "max_connections": self._pool.max_connections, - } + return stats async def close(self) -> None: """Close the connection pool and release all connections.""" if self._client: await self._client.close() - logger.info("Redis connection pool closed") + logger.info("Redis connection pool closed", mode=self._mode) self._pool = None self._client = None + self._sentinel = None self._initialized = False + self._mode = "standalone" + self._key_prefix = "" # Global Redis pool instance diff --git a/src/main.py b/src/main.py index d1d46dd..e9db6fa 100644 --- a/src/main.py +++ b/src/main.py @@ -33,6 +33,9 @@ from .utils.logging import setup_logging from .utils.shutdown import setup_graceful_shutdown, shutdown_handler +# Resolve effective version: runtime SERVICE_VERSION overrides build-time _version.py +effective_version: str = settings.service_version or __version__ + # Setup logging setup_logging() logger = structlog.get_logger() @@ -42,7 +45,7 @@ async def lifespan(app: FastAPI): """Application lifespan manager.""" # Startup - logger.info("Starting Code Interpreter API", version=__version__) + logger.info("Starting Code Interpreter API", version=effective_version) # Setup graceful shutdown callbacks (uvicorn handles signals) setup_graceful_shutdown() @@ -249,7 +252,7 @@ async def lifespan(app: FastAPI): app = FastAPI( title="Code Interpreter API", description="A secure API for executing code in 
isolated Kubernetes pods", - version=__version__, + version=effective_version, docs_url="/docs" if settings.enable_docs else None, redoc_url="/redoc" if settings.enable_docs else None, debug=settings.api_debug, @@ -287,7 +290,7 @@ async def health_check(): """Health check endpoint for liveness probe.""" return { "status": "healthy", - "version": __version__, + "version": effective_version, "config": { "debug": settings.api_debug, "docs_enabled": settings.enable_docs, diff --git a/src/services/api_key_manager.py b/src/services/api_key_manager.py index 3fb0249..463c873 100644 --- a/src/services/api_key_manager.py +++ b/src/services/api_key_manager.py @@ -31,12 +31,12 @@ class ApiKeyManagerService: """Manages API keys stored in Redis.""" - # Redis key prefixes - RECORD_PREFIX = "api_keys:records:" - VALID_CACHE_PREFIX = "api_keys:valid:" - USAGE_PREFIX = "api_keys:usage:" - INDEX_KEY = "api_keys:index" - ENV_KEYS_INDEX = "api_keys:env_index" # Separate index for env keys + # Base Redis key prefixes (before application-level prefix) + _RECORD_PREFIX = "api_keys:records:" + _VALID_CACHE_PREFIX = "api_keys:valid:" + _USAGE_PREFIX = "api_keys:usage:" + _INDEX_KEY = "api_keys:index" + _ENV_KEYS_INDEX = "api_keys:env_index" # Separate index for env keys # Cache TTL VALIDATION_CACHE_TTL = 300 # 5 minutes @@ -49,6 +49,14 @@ def __init__(self, redis_client: redis.Redis | None = None): """ self._redis = redis_client + # Compute prefixed keys once so every method uses the prefix + mk = redis_pool.make_key + self.RECORD_PREFIX = mk(self._RECORD_PREFIX) + self.VALID_CACHE_PREFIX = mk(self._VALID_CACHE_PREFIX) + self.USAGE_PREFIX = mk(self._USAGE_PREFIX) + self.INDEX_KEY = mk(self._INDEX_KEY) + self.ENV_KEYS_INDEX = mk(self._ENV_KEYS_INDEX) + @property def redis(self) -> redis.Redis: """Get Redis client, initializing if needed.""" @@ -130,8 +138,9 @@ async def _ensure_single_env_key_record(self, api_key: str, name: str) -> ApiKey source="environment", ) - # Store in Redis - pipe = 
self.redis.pipeline(transaction=True) + # Store in Redis (transaction=False for Redis Cluster compatibility + # — record key and index key hash to different slots) + pipe = self.redis.pipeline(transaction=False) pipe.hset(record_key, mapping=record.to_redis_hash()) pipe.sadd(self.ENV_KEYS_INDEX, key_hash) await pipe.execute() @@ -237,9 +246,10 @@ async def create_key( metadata=metadata or {}, ) - # Store in Redis + # Store in Redis (transaction=False for Redis Cluster compatibility + # — record key and index key hash to different slots) record_key = f"{self.RECORD_PREFIX}{key_hash}" - pipe = self.redis.pipeline(transaction=True) + pipe = self.redis.pipeline(transaction=False) pipe.hset(record_key, mapping=record.to_redis_hash()) pipe.sadd(self.INDEX_KEY, key_hash) await pipe.execute() @@ -358,8 +368,9 @@ async def revoke_key(self, key_hash: str) -> bool: if not exists: return False - # Delete from Redis - pipe = self.redis.pipeline(transaction=True) + # Delete from Redis (transaction=False for Redis Cluster compatibility + # — keys hash to different slots) + pipe = self.redis.pipeline(transaction=False) pipe.delete(record_key) pipe.srem(self.INDEX_KEY, key_hash) pipe.delete(f"{self.VALID_CACHE_PREFIX}{self._short_hash(key_hash)}") diff --git a/src/services/detailed_metrics.py b/src/services/detailed_metrics.py index e3cb82b..31b578e 100644 --- a/src/services/detailed_metrics.py +++ b/src/services/detailed_metrics.py @@ -31,12 +31,12 @@ class DetailedMetricsService: """Service for collecting and querying detailed execution metrics.""" - # Redis key prefixes - BUFFER_KEY = "metrics:detailed:buffer" - HOURLY_PREFIX = "metrics:detailed:hourly:" - DAILY_PREFIX = "metrics:detailed:daily:" - POOL_STATS_KEY = "metrics:pool:stats" - API_KEY_HOURLY_PREFIX = "metrics:api_key:" + # Base Redis key prefixes (before application-level prefix) + _BUFFER_KEY = "metrics:detailed:buffer" + _HOURLY_PREFIX = "metrics:detailed:hourly:" + _DAILY_PREFIX = "metrics:detailed:daily:" + 
_POOL_STATS_KEY = "metrics:pool:stats" + _API_KEY_HOURLY_PREFIX = "metrics:api_key:" # Buffer and retention settings MAX_BUFFER_SIZE = 10000 @@ -52,6 +52,16 @@ def __init__(self, redis_client: redis.Redis | None = None): self._redis = redis_client self._in_memory_buffer: list[DetailedExecutionMetrics] = [] + # Compute prefixed keys once + from ..core.pool import redis_pool + + mk = redis_pool.make_key + self.BUFFER_KEY = mk(self._BUFFER_KEY) + self.HOURLY_PREFIX = mk(self._HOURLY_PREFIX) + self.DAILY_PREFIX = mk(self._DAILY_PREFIX) + self.POOL_STATS_KEY = mk(self._POOL_STATS_KEY) + self.API_KEY_HOURLY_PREFIX = mk(self._API_KEY_HOURLY_PREFIX) + def register_event_handlers(self) -> None: """Register event handlers for pool metrics.""" from ..core.events import ( diff --git a/src/services/file.py b/src/services/file.py index 7c9d7cf..1554398 100644 --- a/src/services/file.py +++ b/src/services/file.py @@ -29,8 +29,11 @@ def __init__(self): # which handles IAM vs static credentials automatically self.minio_client = settings.minio.create_client() - # Initialize Redis client - self.redis_client = redis.from_url(settings.get_redis_url(), decode_responses=True) + # Initialize Redis client via the shared connection pool so that + # cluster, sentinel, and TLS modes are handled automatically. 
+ from ..core.pool import redis_pool + + self.redis_client = redis_pool.get_client() self.bucket_name = settings.minio_bucket @@ -55,11 +58,15 @@ def _get_file_key(self, session_id: str, file_id: str, file_type: str = "uploads def _get_file_metadata_key(self, session_id: str, file_id: str) -> str: """Generate Redis key for file metadata.""" - return f"files:{session_id}:{file_id}" + from ..core.pool import redis_pool + + return redis_pool.make_key(f"files:{session_id}:{file_id}") def _get_session_files_key(self, session_id: str) -> str: """Generate Redis key for session file list.""" - return f"session_files:{session_id}" + from ..core.pool import redis_pool + + return redis_pool.make_key(f"session_files:{session_id}") async def _store_file_metadata(self, session_id: str, file_id: str, metadata: dict[str, Any]) -> None: """Store file metadata in Redis.""" diff --git a/src/services/health.py b/src/services/health.py index 083d04d..af90608 100644 --- a/src/services/health.py +++ b/src/services/health.py @@ -142,16 +142,16 @@ async def check_redis(self) -> HealthCheckResult: try: # Use shared connection pool - if not self._redis_client: - from ..core.pool import redis_pool + from ..core.pool import redis_pool + if not self._redis_client: self._redis_client = redis_pool.get_client() # Test basic connectivity await self._redis_client.ping() # Test read/write operations - test_key = "health_check:test" + test_key = redis_pool.make_key("health_check:test") test_value = f"test_{int(time.time())}" await self._redis_client.set(test_key, test_value, ex=60) diff --git a/src/services/metrics.py b/src/services/metrics.py index cb5281e..7a1f40a 100644 --- a/src/services/metrics.py +++ b/src/services/metrics.py @@ -391,8 +391,10 @@ async def _persist_metrics_to_redis(self) -> None: } # Store in Redis with TTL + from ..core.pool import redis_pool + await self._redis_client.setex( - "metrics:current", + redis_pool.make_key("metrics:current"), 86400, str(metrics_data), # 24 hours TTL 
) @@ -400,7 +402,7 @@ async def _persist_metrics_to_redis(self) -> None: # Store historical data (keep last 24 hours) hour_key = datetime.now(UTC).strftime("%Y-%m-%d-%H") await self._redis_client.setex( - f"metrics:hourly:{hour_key}", + redis_pool.make_key(f"metrics:hourly:{hour_key}"), 86400 * 7, # 7 days TTL for hourly data str(metrics_data), ) @@ -417,7 +419,9 @@ async def _load_metrics_from_redis(self) -> None: try: # Load current metrics - current_data = await self._redis_client.get("metrics:current") + from ..core.pool import redis_pool + + current_data = await self._redis_client.get(redis_pool.make_key("metrics:current")) if current_data: # In a full implementation, we would parse and restore the metrics # For now, just log that we found existing data diff --git a/src/services/session.py b/src/services/session.py index 4dbbd9b..0288d20 100644 --- a/src/services/session.py +++ b/src/services/session.py @@ -122,15 +122,15 @@ def _generate_session_id(self) -> str: def _session_key(self, session_id: str) -> str: """Generate Redis key for session data.""" - return f"sessions:{session_id}" + return redis_pool.make_key(f"sessions:{session_id}") def _session_index_key(self) -> str: """Generate Redis key for session index.""" - return "sessions:index" + return redis_pool.make_key("sessions:index") def _entity_sessions_key(self, entity_id: str) -> str: """Generate Redis key for entity-based session grouping.""" - return f"entity_sessions:{entity_id}" + return redis_pool.make_key(f"entity_sessions:{entity_id}") async def create_session(self, request: SessionCreate) -> Session: """Create a new code execution session.""" @@ -169,8 +169,9 @@ async def create_session(self, request: SessionCreate) -> Session: # Extract entity_id from metadata if provided entity_id = request.metadata.get("entity_id") if request.metadata else None - # Use Redis transaction to ensure atomicity - pipe = await self.redis.pipeline(transaction=True) + # Use pipeline for batching (transaction=False 
for Redis Cluster + # compatibility — keys span different hash slots) + pipe = self.redis.pipeline(transaction=False) try: # Store session data pipe.hset(session_key, mapping=session_data) @@ -307,8 +308,9 @@ async def delete_session(self, session_id: str) -> bool: ) # Continue with session deletion even if file cleanup fails - # Use transaction to ensure atomicity - pipe = await self.redis.pipeline(transaction=True) + # Use pipeline for batching (transaction=False for Redis Cluster + # compatibility — keys span different hash slots) + pipe = self.redis.pipeline(transaction=False) try: # Remove session data pipe.delete(session_key) diff --git a/src/services/state.py b/src/services/state.py index 29d83c9..654c34c 100644 --- a/src/services/state.py +++ b/src/services/state.py @@ -55,19 +55,19 @@ def __init__(self, redis_client: redis.Redis | None = None): def _state_key(self, session_id: str) -> str: """Generate Redis key for session state.""" - return f"{self.KEY_PREFIX}{session_id}" + return redis_pool.make_key(f"{self.KEY_PREFIX}{session_id}") def _hash_key(self, session_id: str) -> str: """Generate Redis key for state hash.""" - return f"{self.HASH_KEY_PREFIX}{session_id}" + return redis_pool.make_key(f"{self.HASH_KEY_PREFIX}{session_id}") def _meta_key(self, session_id: str) -> str: """Generate Redis key for state metadata.""" - return f"{self.META_KEY_PREFIX}{session_id}" + return redis_pool.make_key(f"{self.META_KEY_PREFIX}{session_id}") def _upload_marker_key(self, session_id: str) -> str: """Generate Redis key for upload marker.""" - return f"{self.UPLOAD_MARKER_PREFIX}{session_id}" + return redis_pool.make_key(f"{self.UPLOAD_MARKER_PREFIX}{session_id}") @staticmethod def compute_hash(raw_bytes: bytes) -> str: @@ -133,8 +133,9 @@ async def save_state( state_hash = self.compute_hash(raw_bytes) now = datetime.now(UTC) - # Use pipeline for atomic operations - pipe = self.redis.pipeline(transaction=True) + # Use pipeline for batching (transaction=False for Redis 
Cluster + # compatibility — state/hash/meta keys hash to different slots) + pipe = self.redis.pipeline(transaction=False) # Save state pipe.setex(self._state_key(session_id), ttl_seconds, state_b64) diff --git a/src/utils/config_validator.py b/src/utils/config_validator.py index 53328aa..80e7fe2 100644 --- a/src/utils/config_validator.py +++ b/src/utils/config_validator.py @@ -5,6 +5,8 @@ import redis from minio.error import S3Error +from redis.cluster import ClusterNode, RedisCluster +from redis.sentinel import Sentinel from ..config import settings @@ -94,18 +96,74 @@ def _validate_file_config(self): self.errors.append(f"File extension must start with dot: {ext}") def _validate_redis_connection(self): - """Validate Redis connection.""" + """Validate Redis connection. + + Uses the correct client type depending on REDIS_MODE (standalone, + cluster, or sentinel) and forwards TLS kwargs so that managed + services with custom CA certificates are validated correctly. + """ try: - # Use Redis URL from settings - client = redis.from_url( - settings.get_redis_url(), - socket_timeout=settings.redis_socket_timeout, - socket_connect_timeout=settings.redis_socket_connect_timeout, - max_connections=settings.redis_max_connections, - ) - - # Test connection - client.ping() + redis_cfg = settings.redis + tls_kwargs = redis_cfg.get_tls_kwargs() + # ``ssl`` is implied by the ``rediss://`` scheme for standalone; + # for cluster/sentinel it's passed directly. 
+ tls_standalone = {k: v for k, v in tls_kwargs.items() if k != "ssl"} + + if redis_cfg.mode == "cluster": + # --- Cluster mode --- + if redis_cfg.cluster_nodes: + startup_nodes = [ + ClusterNode(host=h, port=p) for h, p in redis_cfg.parse_nodes(redis_cfg.cluster_nodes) + ] + else: + startup_nodes = [ClusterNode(host=redis_cfg.host, port=redis_cfg.port)] + + client = RedisCluster( + startup_nodes=startup_nodes, + password=redis_cfg.password, + socket_timeout=redis_cfg.socket_timeout, + socket_connect_timeout=redis_cfg.socket_connect_timeout, + **tls_kwargs, + ) + client.ping() + client.close() + + elif redis_cfg.mode == "sentinel": + # --- Sentinel mode --- + if redis_cfg.sentinel_nodes: + sentinel_hosts = redis_cfg.parse_nodes(redis_cfg.sentinel_nodes) + else: + sentinel_hosts = [(redis_cfg.host, 26379)] + + sentinel = Sentinel( + sentinels=sentinel_hosts, + password=redis_cfg.sentinel_password, + socket_timeout=redis_cfg.socket_timeout, + socket_connect_timeout=redis_cfg.socket_connect_timeout, + **tls_kwargs, + ) + master = sentinel.master_for( + service_name=redis_cfg.sentinel_master, + password=redis_cfg.password, + socket_timeout=redis_cfg.socket_timeout, + socket_connect_timeout=redis_cfg.socket_connect_timeout, + **tls_kwargs, + ) + master.ping() + master.close() + # Sentinel doesn't expose a close method, but it doesn't maintain persistent connections + + else: + # --- Standalone mode --- + client = redis.from_url( + redis_cfg.get_url(), + socket_timeout=redis_cfg.socket_timeout, + socket_connect_timeout=redis_cfg.socket_connect_timeout, + max_connections=settings.redis_max_connections, + **tls_standalone, + ) + client.ping() + client.close() except redis.ConnectionError as e: # Treat as warning in development mode to allow startup without Redis diff --git a/src/utils/logging.py b/src/utils/logging.py index 37c6347..54ceaee 100644 --- a/src/utils/logging.py +++ b/src/utils/logging.py @@ -110,7 +110,7 @@ def configure_third_party_loggers() -> None: def 
add_service_context(logger, method_name, event_dict): """Add service context information to log entries.""" event_dict["service"] = "kubecoderun-api" - event_dict["version"] = __version__ + event_dict["version"] = settings.service_version or __version__ return event_dict diff --git a/tests/integration/test_redis_cluster.py b/tests/integration/test_redis_cluster.py new file mode 100644 index 0000000..679cc4f --- /dev/null +++ b/tests/integration/test_redis_cluster.py @@ -0,0 +1,295 @@ +"""Integration test for Redis Cluster connectivity. + +Requires a running Redis Cluster on localhost:7000-7005. +Start with: docker compose -f docker-compose.redis-cluster.yml up -d + +Usage: + uv run python -m pytest tests/integration/test_redis_cluster.py -v +""" + +import asyncio +import os + +import pytest +import redis as sync_redis +import redis.asyncio as async_redis +from redis.asyncio.cluster import RedisCluster as AsyncRedisCluster +from redis.cluster import ClusterNode, RedisCluster + +# Only run when cluster is available +CLUSTER_HOST = os.environ.get("REDIS_CLUSTER_HOST", "127.0.0.1") +CLUSTER_PORT = int(os.environ.get("REDIS_CLUSTER_PORT", "7000")) + +pytestmark = pytest.mark.integration + + +def _cluster_available() -> bool: + """Check if a Redis Cluster is reachable.""" + try: + rc = RedisCluster( + startup_nodes=[ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT)], + decode_responses=True, + socket_timeout=2, + socket_connect_timeout=2, + ) + rc.ping() + rc.close() + return True + except Exception: + return False + + +skip_no_cluster = pytest.mark.skipif( + not _cluster_available(), + reason=f"Redis Cluster not available at {CLUSTER_HOST}:{CLUSTER_PORT}", +) + + +# ── Synchronous (validator path) ────────────────────────────────────────── + + +@skip_no_cluster +class TestSyncRedisCluster: + """Tests using synchronous redis-py RedisCluster (same as config_validator).""" + + def test_connect_with_single_startup_node(self): + """Cluster discovery works from a single 
startup node.""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT)], + decode_responses=True, + socket_timeout=5, + socket_connect_timeout=5, + ) + assert rc.ping() is True + # Verify the cluster is operational via a targeted node + node_info = rc.cluster_info(target_nodes=RedisCluster.RANDOM) + assert node_info.get("cluster_state") == "ok" + rc.close() + + def test_connect_with_multiple_startup_nodes(self): + """Cluster discovery works from multiple startup nodes.""" + nodes = [ + ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT), + ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT + 1), + ] + rc = RedisCluster( + startup_nodes=nodes, + decode_responses=True, + socket_timeout=5, + socket_connect_timeout=5, + ) + assert rc.ping() is True + rc.close() + + def test_connect_with_no_password(self): + """Cluster connects with password=None (no AUTH).""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT)], + password=None, + decode_responses=True, + socket_timeout=5, + ) + assert rc.ping() is True + rc.close() + + def test_empty_password_converted_to_none(self): + """Our validator converts empty password to None to avoid spurious AUTH. + + Redis servers without requirepass accept AUTH with any string, + so we can't observe the bug via an error. Instead, verify that + our Settings validator normalises empty password to None. 
+ """ + from src.config import Settings + + s = Settings(redis_password="") + assert s.redis_password is None + + s2 = Settings(redis_password=" ") + assert s2.redis_password is None + + s3 = Settings(redis_password="real-password") + assert s3.redis_password == "real-password" + + def test_set_get_operations(self): + """Basic SET/GET across cluster slots.""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT)], + decode_responses=True, + ) + # These keys hash to different slots + for i in range(10): + key = f"test:cluster:{i}" + rc.set(key, f"value-{i}") + assert rc.get(key) == f"value-{i}" + rc.delete(key) + rc.close() + + +# ── Asynchronous (pool path) ───────────────────────────────────────────── + + +@skip_no_cluster +class TestAsyncRedisCluster: + """Tests using async redis-py RedisCluster (same as RedisPool._init_cluster).""" + + @pytest.mark.asyncio + async def test_async_connect_and_ping(self): + """Async cluster client connects and pings.""" + from redis.backoff import ExponentialBackoff + from redis.exceptions import ConnectionError, TimeoutError + from redis.retry import Retry + + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT), + ], + password=None, + decode_responses=True, + max_connections=20, + socket_timeout=5.0, + socket_connect_timeout=5.0, + retry=Retry(ExponentialBackoff(), retries=3), + retry_on_error=[ConnectionError, TimeoutError], + ) + result = await rc.ping() + assert result is True + await rc.aclose() + + @pytest.mark.asyncio + async def test_async_set_get(self): + """Async SET/GET across cluster slots.""" + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=CLUSTER_HOST, port=CLUSTER_PORT), + ], + decode_responses=True, + ) + for i in range(10): + key = f"test:async:cluster:{i}" + await rc.set(key, f"value-{i}") + val = await rc.get(key) + assert val == f"value-{i}" + await rc.delete(key) + await 
rc.aclose() + + +# ── RedisPool integration ──────────────────────────────────────────────── + + +@skip_no_cluster +class TestRedisPoolClusterMode: + """Test RedisPool with actual cluster backend.""" + + @pytest.mark.asyncio + async def test_pool_cluster_mode(self, monkeypatch): + """RedisPool initializes in cluster mode and can SET/GET.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") # empty = no auth + monkeypatch.setenv("REDIS_TLS_ENABLED", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") # empty = fallback to host:port + + # Re-import to pick up new env + from src.config import Settings + + settings_obj = Settings() + cfg = settings_obj.redis + + # Verify our validators worked + assert cfg.password is None, f"Expected None, got {cfg.password!r}" + assert cfg.cluster_nodes is None, f"Expected None, got {cfg.cluster_nodes!r}" + + from src.core.pool import RedisPool + + pool = RedisPool() + # Inject our test settings + monkeypatch.setattr("src.core.pool.settings", settings_obj) + pool._initialize() + + client = pool.get_client() + assert isinstance(client, AsyncRedisCluster) + + # Test operations + await client.set("test:pool:cluster", "works") + val = await client.get("test:pool:cluster") + assert val == "works" + await client.delete("test:pool:cluster") + await client.aclose() + + @pytest.mark.asyncio + async def test_pool_cluster_mode_with_explicit_nodes(self, monkeypatch): + """RedisPool uses REDIS_CLUSTER_NODES when provided.""" + nodes_str = f"{CLUSTER_HOST}:{CLUSTER_PORT},{CLUSTER_HOST}:{CLUSTER_PORT + 1}" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_CLUSTER_NODES", nodes_str) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "false") + + from src.config import Settings + + settings_obj = Settings() + cfg = settings_obj.redis + + assert 
cfg.cluster_nodes == nodes_str + assert cfg.password is None + + from src.core.pool import RedisPool + + pool = RedisPool() + monkeypatch.setattr("src.core.pool.settings", settings_obj) + pool._initialize() + + client = pool.get_client() + result = await client.ping() + assert result is True + await client.aclose() + + +# ── Config Validator integration ───────────────────────────────────────── + + +@skip_no_cluster +class TestConfigValidatorClusterMode: + """Test ConfigValidator._validate_redis_connection with real cluster.""" + + def test_validator_cluster_succeeds(self, monkeypatch): + """Config validator passes with a real cluster.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") + + from src.config import Settings + + settings_obj = Settings() + monkeypatch.setattr("src.utils.config_validator.settings", settings_obj) + + from src.utils.config_validator import ConfigValidator + + validator = ConfigValidator() + validator._validate_redis_connection() + + assert not validator.errors, f"Unexpected errors: {validator.errors}" + + def test_validator_cluster_with_explicit_nodes(self, monkeypatch): + """Config validator passes with explicit cluster nodes.""" + nodes_str = f"{CLUSTER_HOST}:{CLUSTER_PORT},{CLUSTER_HOST}:{CLUSTER_PORT + 1},{CLUSTER_HOST}:{CLUSTER_PORT + 2}" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_CLUSTER_NODES", nodes_str) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "false") + + from src.config import Settings + + settings_obj = Settings() + monkeypatch.setattr("src.utils.config_validator.settings", settings_obj) + + from src.utils.config_validator import ConfigValidator + + validator = ConfigValidator() + 
validator._validate_redis_connection() + + assert not validator.errors, f"Unexpected errors: {validator.errors}" diff --git a/tests/integration/test_redis_cluster_tls.py b/tests/integration/test_redis_cluster_tls.py new file mode 100644 index 0000000..cd898bb --- /dev/null +++ b/tests/integration/test_redis_cluster_tls.py @@ -0,0 +1,454 @@ +"""Integration tests for Redis Cluster with TLS. + +Tests a production-like GCP Memorystore for Redis Cluster with TLS configuration: +- REDIS_MODE=cluster +- REDIS_TLS_ENABLED=true +- REDIS_TLS_CA_CERT_FILE=/path/to/ca.crt (server verification) +- REDIS_TLS_CERT_FILE="" (no client cert / no mTLS) +- REDIS_TLS_KEY_FILE="" (no client key / no mTLS) +- REDIS_TLS_INSECURE=false (certificate chain verified) +- REDIS_TLS_CHECK_HOSTNAME not set (defaults to false) +- REDIS_PASSWORD="" (no authentication) +- REDIS_CLUSTER_NODES not set (falls back to host:port) +- REDIS_KEY_PREFIX=kubecoderun: + +Requires a running TLS Redis Cluster on localhost:6380-6385. +Start with: docker compose -f docker-compose.redis-cluster-tls.yml up -d + +Usage: + uv run python -m pytest tests/integration/test_redis_cluster_tls.py -v +""" + +import os +import ssl as ssl_mod +from pathlib import Path + +import pytest +import redis as sync_redis +import redis.asyncio as async_redis +from redis.asyncio.cluster import RedisCluster as AsyncRedisCluster +from redis.cluster import ClusterNode, RedisCluster + +# ── Configuration matching production ──────────────────────────────────── + +TLS_CLUSTER_HOST = os.environ.get("REDIS_TLS_CLUSTER_HOST", "127.0.0.1") +TLS_CLUSTER_PORT = int(os.environ.get("REDIS_TLS_CLUSTER_PORT", "6380")) + +# CA cert path (relative to project root, same concept as production +# REDIS_TLS_CA_CERT_FILE=/app/api/cache/redis-ca.crt) +CERTS_DIR = Path(__file__).resolve().parent.parent / "tls-certs" +CA_CERT_FILE = str(CERTS_DIR / "ca.crt") + +pytestmark = pytest.mark.integration + + +def _tls_kwargs_production() -> dict: + """Build TLS kwargs 
matching production config. + + This mirrors what RedisConfig.get_tls_kwargs() produces with: + REDIS_TLS_ENABLED=true + REDIS_TLS_INSECURE=false + REDIS_TLS_CHECK_HOSTNAME=false (default) + REDIS_TLS_CA_CERT_FILE=/path/to/ca.crt + REDIS_TLS_CERT_FILE="" -> None + REDIS_TLS_KEY_FILE="" -> None + """ + return { + "ssl": True, + "ssl_cert_reqs": ssl_mod.CERT_REQUIRED, + "ssl_check_hostname": False, + "ssl_ca_certs": CA_CERT_FILE, + } + + +def _tls_cluster_available() -> bool: + """Check if a TLS Redis Cluster is reachable.""" + try: + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + decode_responses=True, + socket_timeout=3, + socket_connect_timeout=3, + **_tls_kwargs_production(), + ) + rc.ping() + rc.close() + return True + except Exception: + return False + + +skip_no_tls_cluster = pytest.mark.skipif( + not _tls_cluster_available(), + reason=f"TLS Redis Cluster not available at {TLS_CLUSTER_HOST}:{TLS_CLUSTER_PORT}", +) + + +# ── Synchronous TLS Cluster tests ──────────────────────────────────────── + + +@skip_no_tls_cluster +class TestSyncTlsCluster: + """Synchronous redis-py with TLS (same path as config_validator).""" + + def test_connect_single_startup_node_tls(self): + """TLS cluster discovery from a single startup node.""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + decode_responses=True, + socket_timeout=5, + socket_connect_timeout=5, + **_tls_kwargs_production(), + ) + assert rc.ping() is True + node_info = rc.cluster_info(target_nodes=RedisCluster.RANDOM) + assert node_info.get("cluster_state") == "ok" + rc.close() + + def test_connect_no_password_tls(self): + """TLS cluster with password=None (production has REDIS_PASSWORD='').""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + password=None, + decode_responses=True, + socket_timeout=5, + **_tls_kwargs_production(), + ) + assert rc.ping() is True + 
rc.close() + + def test_set_get_across_slots_tls(self): + """SET/GET across cluster slots over TLS.""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + decode_responses=True, + **_tls_kwargs_production(), + ) + for i in range(10): + key = f"test:tls:cluster:{i}" + rc.set(key, f"value-{i}") + assert rc.get(key) == f"value-{i}" + rc.delete(key) + rc.close() + + def test_key_prefix_operations_tls(self): + """Operations with kubecoderun: prefix (matching production key_prefix).""" + rc = RedisCluster( + startup_nodes=[ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT)], + decode_responses=True, + **_tls_kwargs_production(), + ) + prefix = "kubecoderun:" + key = f"{prefix}session:test-abc" + rc.set(key, "session-data") + assert rc.get(key) == "session-data" + rc.delete(key) + rc.close() + + +# ── Asynchronous TLS Cluster tests ─────────────────────────────────────── + + +@skip_no_tls_cluster +class TestAsyncTlsCluster: + """Async redis-py with TLS (same path as RedisPool._init_cluster).""" + + @pytest.mark.asyncio + async def test_async_connect_tls(self): + """Async TLS cluster client connects and pings.""" + from redis.backoff import ExponentialBackoff + from redis.exceptions import ConnectionError, TimeoutError + from redis.retry import Retry + + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT), + ], + password=None, + decode_responses=True, + max_connections=20, + socket_timeout=5.0, + socket_connect_timeout=5.0, + retry=Retry(ExponentialBackoff(), retries=3), + retry_on_error=[ConnectionError, TimeoutError], + **_tls_kwargs_production(), + ) + assert await rc.ping() is True + await rc.aclose() + + @pytest.mark.asyncio + async def test_async_set_get_tls(self): + """Async SET/GET over TLS cluster.""" + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT), + ], + 
decode_responses=True, + **_tls_kwargs_production(), + ) + for i in range(10): + key = f"test:async:tls:{i}" + await rc.set(key, f"tls-value-{i}") + val = await rc.get(key) + assert val == f"tls-value-{i}" + await rc.delete(key) + await rc.aclose() + + @pytest.mark.asyncio + async def test_async_prefixed_operations_tls(self): + """Async operations with production-like key prefix over TLS.""" + rc = AsyncRedisCluster( + startup_nodes=[ + async_redis.cluster.ClusterNode(host=TLS_CLUSTER_HOST, port=TLS_CLUSTER_PORT), + ], + decode_responses=True, + **_tls_kwargs_production(), + ) + prefix = "kubecoderun:" + keys = [f"{prefix}session:{i}" for i in range(5)] + for key in keys: + await rc.set(key, "data") + assert await rc.get(key) == "data" + for key in keys: + await rc.delete(key) + await rc.aclose() + + +# ── RedisPool with TLS Cluster ─────────────────────────────────────────── + + +@skip_no_tls_cluster +class TestRedisPoolTlsCluster: + """Test RedisPool with TLS cluster backend — mirrors production config.""" + + @pytest.mark.asyncio + async def test_pool_tls_cluster_production_config(self, monkeypatch): + """RedisPool initializes with the exact production configuration. 
+ + Env vars set here match the user's Helm values: + REDIS_MODE: "cluster" + REDIS_HOST: + REDIS_PORT: "6380" + REDIS_PASSWORD: "" + REDIS_DB: "0" + REDIS_MAX_CONNECTIONS: "20" + REDIS_SOCKET_TIMEOUT: "5" + REDIS_SOCKET_CONNECT_TIMEOUT: "5" + REDIS_KEY_PREFIX: "kubecoderun:" + REDIS_TLS_ENABLED: "true" + REDIS_TLS_CA_CERT_FILE: + REDIS_TLS_CERT_FILE: "" + REDIS_TLS_KEY_FILE: "" + REDIS_TLS_INSECURE: "false" + """ + # Set env vars exactly as Helm renders them in production + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", TLS_CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(TLS_CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") # empty -> None via validator + monkeypatch.setenv("REDIS_DB", "0") + monkeypatch.setenv("REDIS_MAX_CONNECTIONS", "20") + monkeypatch.setenv("REDIS_SOCKET_TIMEOUT", "5") + monkeypatch.setenv("REDIS_SOCKET_CONNECT_TIMEOUT", "5") + monkeypatch.setenv("REDIS_KEY_PREFIX", "kubecoderun:") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", CA_CERT_FILE) + monkeypatch.setenv("REDIS_TLS_CERT_FILE", "") # no client cert + monkeypatch.setenv("REDIS_TLS_KEY_FILE", "") # no client key + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") # empty -> None, fallback to host:port + + from src.config import Settings + + settings_obj = Settings() + cfg = settings_obj.redis + + # Verify validators worked correctly + assert cfg.mode == "cluster" + assert cfg.host == TLS_CLUSTER_HOST + assert cfg.port == TLS_CLUSTER_PORT + assert cfg.password is None, f"Expected None, got {cfg.password!r}" + assert cfg.cluster_nodes is None, f"Expected None, got {cfg.cluster_nodes!r}" + assert cfg.tls_enabled is True + assert cfg.tls_ca_cert_file == CA_CERT_FILE + assert cfg.tls_cert_file is None or cfg.tls_cert_file == "" + assert cfg.tls_key_file is None or cfg.tls_key_file == "" + assert cfg.tls_insecure is False + assert cfg.tls_check_hostname 
is False # default + assert cfg.key_prefix == "kubecoderun:" + + # Verify TLS kwargs + tls_kwargs = cfg.get_tls_kwargs() + assert tls_kwargs["ssl"] is True + assert tls_kwargs["ssl_cert_reqs"] == ssl_mod.CERT_REQUIRED + assert tls_kwargs["ssl_check_hostname"] is False + assert tls_kwargs["ssl_ca_certs"] == CA_CERT_FILE + assert "ssl_certfile" not in tls_kwargs # no client cert + assert "ssl_keyfile" not in tls_kwargs # no client key + + # Initialize pool + from src.core.pool import RedisPool + + pool = RedisPool() + monkeypatch.setattr("src.core.pool.settings", settings_obj) + pool._initialize() + + client = pool.get_client() + assert isinstance(client, AsyncRedisCluster) + assert pool.key_prefix == "kubecoderun:" + + # Test operations with prefix + full_key = pool.make_key("session:test-tls") + assert full_key == "kubecoderun:session:test-tls" + + await client.set(full_key, "tls-session-data") + val = await client.get(full_key) + assert val == "tls-session-data" + await client.delete(full_key) + + await pool.close() + + @pytest.mark.asyncio + async def test_pool_tls_cluster_without_key_prefix(self, monkeypatch): + """RedisPool works in TLS cluster mode without key prefix.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", TLS_CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(TLS_CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_KEY_PREFIX", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", CA_CERT_FILE) + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") + + from src.config import Settings + from src.core.pool import RedisPool + + settings_obj = Settings() + pool = RedisPool() + monkeypatch.setattr("src.core.pool.settings", settings_obj) + pool._initialize() + + client = pool.get_client() + assert pool.key_prefix == "" + assert pool.make_key("mykey") == "mykey" + + await 
client.set("test:no-prefix:tls", "ok") + assert await client.get("test:no-prefix:tls") == "ok" + await client.delete("test:no-prefix:tls") + await pool.close() + + +# ── ConfigValidator with TLS Cluster ───────────────────────────────────── + + +@skip_no_tls_cluster +class TestConfigValidatorTlsCluster: + """Test ConfigValidator._validate_redis_connection with TLS cluster.""" + + def test_validator_tls_cluster_production_config(self, monkeypatch): + """Config validator passes with production-like TLS cluster config.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", TLS_CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(TLS_CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", CA_CERT_FILE) + monkeypatch.setenv("REDIS_TLS_CERT_FILE", "") + monkeypatch.setenv("REDIS_TLS_KEY_FILE", "") + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") + + from src.config import Settings + + settings_obj = Settings() + monkeypatch.setattr("src.utils.config_validator.settings", settings_obj) + + from src.utils.config_validator import ConfigValidator + + validator = ConfigValidator() + validator._validate_redis_connection() + + assert not validator.errors, f"Unexpected errors: {validator.errors}" + + def test_validator_tls_cluster_bad_ca_cert_fails(self, monkeypatch): + """Config validator fails when CA cert path is wrong.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_HOST", TLS_CLUSTER_HOST) + monkeypatch.setenv("REDIS_PORT", str(TLS_CLUSTER_PORT)) + monkeypatch.setenv("REDIS_PASSWORD", "") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", "/nonexistent/ca.crt") + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + monkeypatch.setenv("REDIS_CLUSTER_NODES", "") + + from src.config import Settings + + settings_obj = Settings() + 
monkeypatch.setattr("src.utils.config_validator.settings", settings_obj) + + from src.utils.config_validator import ConfigValidator + + validator = ConfigValidator() + validator._validate_redis_connection() + + assert len(validator.errors) > 0, "Expected validation error for bad CA cert" + + +# ── RedisConfig TLS kwargs verification ────────────────────────────────── + + +@skip_no_tls_cluster +class TestRedisConfigTlsKwargs: + """Verify RedisConfig.get_tls_kwargs() produces correct kwargs for production.""" + + def test_production_tls_kwargs(self, monkeypatch): + """get_tls_kwargs() output matches what RedisCluster needs for TLS.""" + monkeypatch.setenv("REDIS_MODE", "cluster") + monkeypatch.setenv("REDIS_TLS_ENABLED", "true") + monkeypatch.setenv("REDIS_TLS_CA_CERT_FILE", CA_CERT_FILE) + monkeypatch.setenv("REDIS_TLS_CERT_FILE", "") + monkeypatch.setenv("REDIS_TLS_KEY_FILE", "") + monkeypatch.setenv("REDIS_TLS_INSECURE", "false") + + from src.config.redis import RedisConfig + + cfg = RedisConfig( + redis_mode="cluster", + redis_tls_enabled=True, + redis_tls_ca_cert_file=CA_CERT_FILE, + redis_tls_cert_file="", + redis_tls_key_file="", + redis_tls_insecure=False, + ) + kwargs = cfg.get_tls_kwargs() + + assert kwargs["ssl"] is True + assert kwargs["ssl_cert_reqs"] == ssl_mod.CERT_REQUIRED + assert kwargs["ssl_check_hostname"] is False + assert kwargs["ssl_ca_certs"] == CA_CERT_FILE + # Empty string cert/key files should NOT be in kwargs + assert "ssl_certfile" not in kwargs + assert "ssl_keyfile" not in kwargs + + def test_tls_insecure_kwargs(self, monkeypatch): + """get_tls_kwargs() with insecure mode skips cert verification.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig( + redis_mode="cluster", + redis_tls_enabled=True, + redis_tls_insecure=True, + ) + kwargs = cfg.get_tls_kwargs() + + assert kwargs["ssl"] is True + assert kwargs["ssl_cert_reqs"] == ssl_mod.CERT_NONE + assert kwargs["ssl_check_hostname"] is False + + def 
test_tls_disabled_returns_empty(self): + """get_tls_kwargs() returns empty dict when TLS is off.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig(redis_tls_enabled=False) + assert cfg.get_tls_kwargs() == {} diff --git a/tests/tls-certs/.gitignore b/tests/tls-certs/.gitignore new file mode 100644 index 0000000..3ba3676 --- /dev/null +++ b/tests/tls-certs/.gitignore @@ -0,0 +1,6 @@ +# Generated TLS certificates — do not commit +*.key +*.crt +*.csr +*.srl +*.cnf diff --git a/tests/tls-certs/cleanup.sh b/tests/tls-certs/cleanup.sh new file mode 100644 index 0000000..3128b51 --- /dev/null +++ b/tests/tls-certs/cleanup.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Remove generated TLS certificates from tests/tls-certs/. +# See also: generate.sh, docker-compose.redis-cluster-tls.yml +# +# Usage: +# cd tests/tls-certs && ./cleanup.sh +set -euo pipefail +cd "$(dirname "$0")" + +rm -f ca.key ca.crt ca.srl ca-ext.cnf +rm -f redis.key redis.crt redis.csr redis-ext.cnf + +echo "TLS certificates cleaned up." diff --git a/tests/tls-certs/generate.sh b/tests/tls-certs/generate.sh new file mode 100644 index 0000000..53874ba --- /dev/null +++ b/tests/tls-certs/generate.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +# Generate self-signed TLS certificates for Redis Cluster integration testing. +# +# Creates: +# ca.key / ca.crt — Certificate Authority (with keyUsage extensions for Python 3.14+) +# redis.key / redis.crt — Server cert signed by the CA (SANs for localhost + docker IPs) +# +# Used by: docker-compose.redis-cluster-tls.yml +# +# Usage: +# cd tests/tls-certs && ./generate.sh +set -euo pipefail +cd "$(dirname "$0")" + +echo "Generating CA key + certificate..." 
+cat > ca-ext.cnf << 'EOF' +[req] +default_bits = 4096 +prompt = no +distinguished_name = dn +x509_extensions = v3_ca + +[dn] +C = PT +ST = Lisboa +L = Lisboa +O = NOS Testing +CN = Redis Test CA + +[v3_ca] +subjectKeyIdentifier = hash +authorityKeyIdentifier = keyid:always,issuer +basicConstraints = critical, CA:TRUE +keyUsage = critical, keyCertSign, cRLSign +EOF + +openssl genrsa -out ca.key 4096 2>/dev/null +openssl req -x509 -new -nodes -key ca.key -sha256 -days 3650 \ + -out ca.crt -config ca-ext.cnf 2>/dev/null + +echo "Generating server key + certificate..." +cat > redis-ext.cnf << 'EOF' +[req] +default_bits = 2048 +prompt = no +distinguished_name = dn +req_extensions = v3_req + +[dn] +C = PT +ST = Lisboa +L = Lisboa +O = NOS Testing +CN = redis-node + +[v3_req] +subjectAltName = @alt_names +basicConstraints = CA:FALSE +keyUsage = digitalSignature, keyEncipherment +extendedKeyUsage = serverAuth, clientAuth + +[alt_names] +DNS.1 = redis-tls-node-0 +DNS.2 = redis-tls-node-1 +DNS.3 = redis-tls-node-2 +DNS.4 = redis-tls-node-3 +DNS.5 = redis-tls-node-4 +DNS.6 = redis-tls-node-5 +DNS.7 = localhost +IP.1 = 127.0.0.1 +IP.2 = 172.17.0.1 +IP.3 = 172.18.0.1 +IP.4 = 172.19.0.1 +IP.5 = 172.20.0.1 +IP.6 = 172.21.0.1 +IP.7 = 172.22.0.1 +IP.8 = 172.23.0.1 +IP.9 = 172.24.0.1 +IP.10 = 172.25.0.1 +EOF + +openssl genrsa -out redis.key 2048 2>/dev/null +openssl req -new -key redis.key -out redis.csr -config redis-ext.cnf 2>/dev/null +openssl x509 -req -in redis.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + -out redis.crt -days 3650 -sha256 \ + -extfile redis-ext.cnf -extensions v3_req 2>/dev/null + +# Redis needs world-readable key files (containers run as redis user) +chmod 644 redis.key +# CA private key should stay restricted — it is not needed by Redis containers +chmod 600 ca.key + +echo "Verifying certificate chain..." +openssl verify -CAfile ca.crt redis.crt + +echo "Done. 
Certificates generated in $(pwd)/" diff --git a/tests/unit/test_cluster_pipeline_compat.py b/tests/unit/test_cluster_pipeline_compat.py new file mode 100644 index 0000000..b308e8f --- /dev/null +++ b/tests/unit/test_cluster_pipeline_compat.py @@ -0,0 +1,244 @@ +"""Unit tests verifying that all Redis pipelines use transaction=False. + +Redis Cluster does not support MULTI/EXEC transactions across keys in +different hash slots. Every pipeline that touches keys with different +prefixes (e.g. session data + session index) MUST use transaction=False +so redis-py's ClusterPipeline can split commands by node. + +These tests act as a safety net: if someone accidentally changes a +pipeline back to transaction=True, the test will catch it. +""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from src.models.session import SessionCreate +from src.services.api_key_manager import ApiKeyManagerService +from src.services.session import SessionService +from src.services.state import StateService + +# ── Session Service ───────────────────────────────────────────────────── + + +@pytest.fixture +def mock_redis_session(): + """Mock Redis client for session tests.""" + redis_mock = AsyncMock() + + pipeline_mock = AsyncMock() + pipeline_mock.hset = MagicMock() + pipeline_mock.expire = MagicMock() + pipeline_mock.sadd = MagicMock() + pipeline_mock.delete = MagicMock() + pipeline_mock.srem = MagicMock() + pipeline_mock.execute = AsyncMock(return_value=[True, True, True]) + pipeline_mock.reset = AsyncMock() + + redis_mock.pipeline = MagicMock(return_value=pipeline_mock) + redis_mock.hgetall = AsyncMock(return_value={}) + return redis_mock + + +@pytest.fixture +def session_service(mock_redis_session): + return SessionService(redis_client=mock_redis_session) + + +@pytest.mark.asyncio +async def test_session_create_uses_non_transactional_pipeline(session_service, mock_redis_session): + """create_session() must use transaction=False for cluster compat.""" + 
request = SessionCreate(metadata={"test": "value"}) + await session_service.create_session(request) + + mock_redis_session.pipeline.assert_called_once_with(transaction=False) + + +@pytest.mark.asyncio +async def test_session_delete_uses_non_transactional_pipeline(session_service, mock_redis_session): + """delete_session() must use transaction=False for cluster compat.""" + session_id = "session-to-delete" + # Provide minimal session data so delete_session finds the session + mock_redis_session.hgetall.return_value = { + "session_id": session_id, + "status": "active", + "created_at": "2025-01-01T00:00:00", + "last_activity": "2025-01-01T00:00:00", + "expires_at": "2026-01-01T00:00:00", + "files": "{}", + "metadata": "{}", + "working_directory": "/workspace", + } + + pipeline_mock = mock_redis_session.pipeline.return_value + pipeline_mock.execute = AsyncMock(return_value=[1, 1]) + + await session_service.delete_session(session_id) + + mock_redis_session.pipeline.assert_called_with(transaction=False) + + +# ── API Key Manager ───────────────────────────────────────────────────── + + +@pytest.fixture +def mock_redis_apikey(): + """Mock Redis client for API key manager tests.""" + redis_mock = AsyncMock() + redis_mock.hgetall = AsyncMock(return_value={}) + redis_mock.hset = AsyncMock(return_value=1) + redis_mock.exists = AsyncMock(return_value=True) + redis_mock.delete = AsyncMock(return_value=1) + redis_mock.sadd = AsyncMock(return_value=1) + redis_mock.srem = AsyncMock(return_value=1) + redis_mock.smembers = AsyncMock(return_value=set()) + redis_mock.get = AsyncMock(return_value=None) + redis_mock.setex = AsyncMock(return_value=True) + redis_mock.incr = AsyncMock(return_value=1) + redis_mock.expire = AsyncMock(return_value=True) + redis_mock.hincrby = AsyncMock(return_value=1) + + pipeline_mock = AsyncMock() + pipeline_mock.hset = MagicMock() + pipeline_mock.sadd = MagicMock() + pipeline_mock.delete = MagicMock() + pipeline_mock.srem = MagicMock() + pipeline_mock.incr 
= MagicMock() + pipeline_mock.expire = MagicMock() + pipeline_mock.hincrby = MagicMock() + pipeline_mock.execute = AsyncMock(return_value=[True, True, True]) + redis_mock.pipeline = MagicMock(return_value=pipeline_mock) + + return redis_mock + + +@pytest.fixture +def api_key_manager(mock_redis_apikey): + return ApiKeyManagerService(redis_client=mock_redis_apikey) + + +@pytest.mark.asyncio +async def test_create_key_uses_non_transactional_pipeline(api_key_manager, mock_redis_apikey): + """create_key() must use transaction=False for cluster compat.""" + result = await api_key_manager.create_key( + name="test-key", + ) + + # create_key calls pipeline at least once + mock_redis_apikey.pipeline.assert_called() + for call in mock_redis_apikey.pipeline.call_args_list: + assert call == ((), {"transaction": False}), f"Expected pipeline(transaction=False), got {call}" + + +@pytest.mark.asyncio +async def test_ensure_single_env_key_uses_non_transactional_pipeline(api_key_manager, mock_redis_apikey): + """_ensure_single_env_key_record() must use transaction=False.""" + # Call the internal method directly + await api_key_manager._ensure_single_env_key_record("test-hash", "test-env") + + mock_redis_apikey.pipeline.assert_called() + for call in mock_redis_apikey.pipeline.call_args_list: + assert call == ((), {"transaction": False}), f"Expected pipeline(transaction=False), got {call}" + + +@pytest.mark.asyncio +async def test_revoke_key_uses_non_transactional_pipeline(api_key_manager, mock_redis_apikey): + """revoke_key() must use transaction=False for cluster compat.""" + # Setup: make the key "exist" so revoke proceeds + mock_redis_apikey.hgetall.return_value = { + "name": "test-key", + "key_hash": "abc123", + "environment": "test", + "status": "active", + "created_at": "2025-01-01T00:00:00+00:00", + } + mock_redis_apikey.exists.return_value = True + + await api_key_manager.revoke_key("abc123") + + mock_redis_apikey.pipeline.assert_called() + for call in 
mock_redis_apikey.pipeline.call_args_list: + assert call == ((), {"transaction": False}), f"Expected pipeline(transaction=False), got {call}" + + +# ── State Service ─────────────────────────────────────────────────────── + + +@pytest.fixture +def mock_redis_state(): + """Mock Redis client for state service tests.""" + client = AsyncMock() + client.get = AsyncMock(return_value=None) + client.setex = AsyncMock() + client.delete = AsyncMock() + client.strlen = AsyncMock(return_value=0) + client.ttl = AsyncMock(return_value=-1) + client.expire = AsyncMock() + + pipeline_mock = AsyncMock() + pipeline_mock.set = MagicMock() + pipeline_mock.setex = MagicMock() + pipeline_mock.expire = MagicMock() + pipeline_mock.execute = AsyncMock(return_value=[True, True, True, True, True]) + client.pipeline = MagicMock(return_value=pipeline_mock) + + return client + + +@pytest.fixture +def state_service(mock_redis_state): + with patch("src.services.state.redis_pool") as mock_pool: + mock_pool.get_client.return_value = mock_redis_state + service = StateService(redis_client=mock_redis_state) + return service + + +@pytest.mark.asyncio +async def test_save_state_uses_non_transactional_pipeline(state_service, mock_redis_state): + """save_state() must use transaction=False for cluster compat.""" + import base64 + + session_id = "state-test-session" + raw_bytes = b"\x02test state data" + state_b64 = base64.b64encode(raw_bytes).decode("utf-8") + + await state_service.save_state(session_id, state_b64) + + mock_redis_state.pipeline.assert_called() + for call in mock_redis_state.pipeline.call_args_list: + assert call == ((), {"transaction": False}), f"Expected pipeline(transaction=False), got {call}" + + +# ── Version resolution ────────────────────────────────────────────────── + + +class TestVersionResolution: + """Tests for SERVICE_VERSION env var override.""" + + def test_logging_uses_service_version_when_set(self): + """add_service_context should prefer settings.service_version.""" + with ( 
+ patch("src.utils.logging.settings") as mock_settings, + patch("src.utils.logging.__version__", "0.0.0.dev0"), + ): + mock_settings.service_version = "2.1.4" + from src.utils.logging import add_service_context + + event_dict = {} + add_service_context(None, None, event_dict) + + assert event_dict["version"] == "2.1.4" + + def test_logging_falls_back_to_build_version(self): + """add_service_context should fall back to __version__ when SERVICE_VERSION unset.""" + with ( + patch("src.utils.logging.settings") as mock_settings, + patch("src.utils.logging.__version__", "1.2.3"), + ): + mock_settings.service_version = None + from src.utils.logging import add_service_context + + event_dict = {} + add_service_context(None, None, event_dict) + + assert event_dict["version"] == "1.2.3" diff --git a/tests/unit/test_core_pool.py b/tests/unit/test_core_pool.py index 9d486a8..8d46d60 100644 --- a/tests/unit/test_core_pool.py +++ b/tests/unit/test_core_pool.py @@ -51,25 +51,60 @@ def test_initialize_creates_pool(self): assert pool._initialized is True assert pool._client is not None - def test_initialize_fallback_on_error(self): - """Test _initialize creates fallback client on error.""" + def test_initialize_raises_on_error(self): + """Test _initialize propagates errors instead of silently falling back.""" pool = RedisPool() with patch("src.core.pool.settings") as mock_settings: - mock_settings.get_redis_url.side_effect = Exception("Connection failed") - - with patch("src.core.pool.redis.from_url") as mock_from_url: - mock_from_url.return_value = MagicMock() - + mock_settings.redis.mode = "standalone" + mock_settings.redis.get_url.side_effect = Exception("Connection failed") + mock_settings.redis.get_tls_kwargs.return_value = {} + mock_settings.redis.key_prefix = "" + mock_settings.redis.max_connections = 20 + mock_settings.redis.socket_timeout = 5 + mock_settings.redis.socket_connect_timeout = 5 + + with pytest.raises(Exception, match="Connection failed"): pool._initialize() - 
assert pool._initialized is True - assert pool._client is not None + assert pool._initialized is False + assert pool._client is None class TestGetClient: """Tests for get_client method.""" + def test_init_cluster_does_not_pass_retry_on_timeout(self): + """Test _init_cluster uses retry/retry_on_error instead of retry_on_timeout. + + RedisCluster (async) does not accept retry_on_timeout as a kwarg. + """ + pool = RedisPool() + + with patch("src.core.pool.settings") as mock_settings: + cfg = mock_settings.redis + cfg.mode = "cluster" + cfg.host = "redis-host" + cfg.port = 6379 + cfg.password = None + cfg.cluster_nodes = None + cfg.key_prefix = "" + cfg.tls_enabled = False + cfg.max_connections = 20 + cfg.socket_timeout = 5 + cfg.socket_connect_timeout = 5 + cfg.get_tls_kwargs.return_value = {} + + with patch("src.core.pool.RedisCluster") as mock_cluster: + mock_cluster.return_value = MagicMock() + pool._initialize() + + mock_cluster.assert_called_once() + call_kwargs = mock_cluster.call_args[1] + assert "retry_on_timeout" not in call_kwargs, "RedisCluster does not accept retry_on_timeout" + assert "retry" in call_kwargs + assert "retry_on_error" in call_kwargs + def test_get_client_initializes_if_needed(self): """Test get_client initializes the pool if not initialized.""" pool = RedisPool() @@ -106,7 +141,7 @@ def test_pool_stats_not_initialized(self): stats = pool.pool_stats - assert stats == {"initialized": False} + assert stats == {"initialized": False, "mode": "standalone"} def test_pool_stats_initialized(self): """Test pool_stats when pool is initialized.""" @@ -114,10 +149,12 @@ def test_pool_stats_initialized(self): mock_pool = MagicMock() mock_pool.max_connections = 20 pool._pool = mock_pool + pool._initialized = True stats = pool.pool_stats assert stats["initialized"] is True + assert stats["mode"] == "standalone" assert stats["max_connections"] == 20 diff --git a/tests/unit/test_session_service.py b/tests/unit/test_session_service.py index 579e571..f4e205a 
100644 --- a/tests/unit/test_session_service.py +++ b/tests/unit/test_session_service.py @@ -26,8 +26,8 @@ def mock_redis(): pipeline_mock.execute = AsyncMock(return_value=[True, True, True]) pipeline_mock.reset = AsyncMock() - # Make pipeline() return the pipeline mock when awaited - redis_mock.pipeline = AsyncMock(return_value=pipeline_mock) + # Make pipeline() return the pipeline mock (synchronous, like redis.asyncio) + redis_mock.pipeline = MagicMock(return_value=pipeline_mock) return redis_mock diff --git a/tests/unit/test_settings_validators.py b/tests/unit/test_settings_validators.py index 01744c4..d07d602 100644 --- a/tests/unit/test_settings_validators.py +++ b/tests/unit/test_settings_validators.py @@ -42,3 +42,87 @@ def test_default_is_runtime_default(self): """Test that the default seccomp profile type is RuntimeDefault.""" settings = Settings() assert settings.k8s_seccomp_profile_type == "RuntimeDefault" + + +class TestRedisPasswordValidator: + """Tests for empty-string-to-None password sanitization.""" + + def test_empty_password_becomes_none(self): + """Empty string REDIS_PASSWORD is converted to None.""" + settings = Settings(redis_password="") + assert settings.redis_password is None + + def test_whitespace_password_becomes_none(self): + """Whitespace-only REDIS_PASSWORD is converted to None.""" + settings = Settings(redis_password=" ") + assert settings.redis_password is None + + def test_real_password_preserved(self): + """Non-empty password is kept as-is.""" + settings = Settings(redis_password="s3cret") + assert settings.redis_password == "s3cret" + + def test_none_password_stays_none(self): + """None password stays None.""" + settings = Settings(redis_password=None) + assert settings.redis_password is None + + def test_empty_sentinel_password_becomes_none(self): + """Empty sentinel password is converted to None.""" + settings = Settings(redis_sentinel_password="") + assert settings.redis_sentinel_password is None + + +class 
TestRedisClusterNodesValidator: + """Tests for empty-string-to-None cluster/sentinel node sanitization.""" + + def test_empty_cluster_nodes_becomes_none(self): + """Empty REDIS_CLUSTER_NODES is converted to None.""" + settings = Settings(redis_cluster_nodes="") + assert settings.redis_cluster_nodes is None + + def test_whitespace_cluster_nodes_becomes_none(self): + """Whitespace-only REDIS_CLUSTER_NODES is converted to None.""" + settings = Settings(redis_cluster_nodes=" ") + assert settings.redis_cluster_nodes is None + + def test_real_cluster_nodes_preserved(self): + """Valid node list is kept.""" + settings = Settings(redis_cluster_nodes="node1:7000,node2:7001") + assert settings.redis_cluster_nodes == "node1:7000,node2:7001" + + def test_empty_sentinel_nodes_becomes_none(self): + """Empty REDIS_SENTINEL_NODES is converted to None.""" + settings = Settings(redis_sentinel_nodes="") + assert settings.redis_sentinel_nodes is None + + def test_real_sentinel_nodes_preserved(self): + """Valid sentinel node list is kept.""" + settings = Settings(redis_sentinel_nodes="sent1:26379,sent2:26379") + assert settings.redis_sentinel_nodes == "sent1:26379,sent2:26379" + + +class TestRedisConfigValidators: + """Tests for RedisConfig-level validators (password + nodes).""" + + def test_redis_config_empty_password_to_none(self): + """RedisConfig also converts empty password to None.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig(redis_password="") + assert cfg.password is None + + def test_redis_config_empty_cluster_nodes_to_none(self): + """RedisConfig also converts empty cluster nodes to None.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig(redis_cluster_nodes="") + assert cfg.cluster_nodes is None + + def test_redis_config_real_values_preserved(self): + """Non-empty values pass through.""" + from src.config.redis import RedisConfig + + cfg = RedisConfig(redis_password="pass", redis_cluster_nodes="h:7000") + assert cfg.password == "pass" 
+ assert cfg.cluster_nodes == "h:7000"