Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e-brev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ concurrency:

jobs:
e2e-brev:
if: github.repository == 'NVIDIA/NemoClaw'
if: github.repository == 'NVIDIA/NemoClaw' || github.repository == 'jyaunches/NemoClaw'
runs-on: ubuntu-latest
timeout-minutes: 90
steps:
Expand Down
63 changes: 58 additions & 5 deletions scripts/brev-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,28 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
export NEEDRESTART_MODE=a
export DEBIAN_FRONTEND=noninteractive

# ── Acquire exclusive apt access ──────────────────────────────────
# Cloud VMs run multiple apt processes on boot: cloud-init, apt-daily,
# and unattended-upgrades. We must wait for cloud-init AND disable the
# background services before we can safely use apt ourselves.
# Every step below is deliberately best-effort (`|| true`): a missing
# unit, process, or lock file must not abort the whole bootstrap.
if command -v cloud-init >/dev/null 2>&1; then
info "Waiting for cloud-init to finish..."
# --wait blocks until cloud-init reaches a terminal state; output is
# discarded and a non-zero exit (e.g. "degraded") is tolerated.
cloud-init status --wait >/dev/null 2>&1 || true
info "cloud-init done"
fi
# Stop background apt services that run independently of cloud-init
info "Disabling background apt services..."
sudo systemctl stop apt-daily.timer apt-daily-upgrade.timer unattended-upgrades 2>/dev/null || true
sudo systemctl disable apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true
# Kill any straggler apt/dpkg processes
# NOTE(review): SIGKILLing dpkg mid-transaction can leave the package
# database half-written; the `dpkg --configure -a` below exists to
# repair exactly that state, so keep these steps in this order.
sudo pkill -9 -x apt-get 2>/dev/null || true
sudo pkill -9 -x apt 2>/dev/null || true
sudo pkill -9 -x dpkg 2>/dev/null || true
# Release any stale locks left by killed processes
sudo rm -f /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock 2>/dev/null || true
# Finish any interrupted package configuration so later installs succeed.
sudo dpkg --configure -a 2>/dev/null || true
info "apt is now exclusively ours"

# --- 0. Node.js (needed for services) ---
if ! command -v node >/dev/null 2>&1; then
info "Installing Node.js..."
Expand All @@ -55,9 +77,16 @@ if ! command -v node >/dev/null 2>&1; then
else
fail "No SHA-256 verification tool found (need sha256sum or shasum)"
fi
sudo -E bash "$tmpdir/setup_node.sh" >/dev/null 2>&1
sudo -E bash "$tmpdir/setup_node.sh"
)
sudo apt-get install -y -qq nodejs >/dev/null 2>&1
# Retry the nodejs install up to 5 times: apt can fail transiently
# right after boot while repository mirrors / locks settle.
attempt=0
until sudo apt-get install -y -qq nodejs; do
  attempt=$((attempt + 1))
  if [ "$attempt" -ge 5 ]; then
    fail "Node.js install failed after 5 attempts"
  fi
  info "Node.js install failed (attempt $attempt/5), retrying in 30s..."
  sleep 30
done
info "Node.js $(node --version) installed"
else
info "Node.js already installed: $(node --version)"
Expand All @@ -66,16 +95,24 @@ fi
# --- 1. Docker ---
if ! command -v docker >/dev/null 2>&1; then
info "Installing Docker..."
sudo apt-get update -qq >/dev/null 2>&1
sudo apt-get install -y -qq docker.io >/dev/null 2>&1
# Retry update+install up to 5 times: transient apt/mirror failures are
# common immediately after instance boot.
attempt=0
until sudo apt-get update -qq && sudo apt-get install -y -qq docker.io; do
  attempt=$((attempt + 1))
  if [ "$attempt" -ge 5 ]; then
    fail "Docker install failed after 5 attempts"
  fi
  info "Docker install failed (attempt $attempt/5), retrying in 30s..."
  sleep 30
done
sudo usermod -aG docker "$(whoami)"
info "Docker installed"
else
info "Docker already installed"
fi

# --- 2. NVIDIA Container Toolkit (if GPU present) ---
if command -v nvidia-smi >/dev/null 2>&1; then
# Check that nvidia-smi actually works, not just that the binary exists.
# Brev GPU images ship nvidia-smi even on CPU-only instances.
if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
Comment on lines +113 to +115
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use one GPU capability flag across all GPU-gated sections.

This block correctly checks runnable nvidia-smi, but later vLLM gating still checks only binary presence (Line 168). On CPU-only images that still ship nvidia-smi, that can trigger wrong behavior.

Proposed fix
+# Detect GPU availability once and reuse everywhere.
+HAS_GPU=false
+if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
+  HAS_GPU=true
+fi
+
 # --- 2. NVIDIA Container Toolkit (if GPU present) ---
-if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
+if [ "$HAS_GPU" = true ]; then
   ...
 fi

 # --- 4. vLLM (local inference, if GPU present) ---
 ...
-elif command -v nvidia-smi >/dev/null 2>&1; then
+elif [ "$HAS_GPU" = true ]; then
   ...
 fi
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/brev-setup.sh` around lines 86 - 88, The script currently tests a
runnable nvidia-smi in one place but later re-checks only binary presence when
gating vLLM; define and export a single GPU capability flag (e.g.,
GPU_AVAILABLE) right after the runnable check (the block using command -v
nvidia-smi and nvidia-smi) and then replace other ad-hoc checks (the later vLLM
gating that only calls command -v nvidia-smi) to consult that single flag;
ensure the flag is set to true only when nvidia-smi successfully runs and false
otherwise, and use that variable everywhere the script needs to decide GPU vs
CPU behavior (including vLLM gating).

if ! dpkg -s nvidia-container-toolkit >/dev/null 2>&1; then
info "Installing NVIDIA Container Toolkit..."
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
Expand All @@ -91,6 +128,22 @@ if command -v nvidia-smi >/dev/null 2>&1; then
else
info "NVIDIA Container Toolkit already installed"
fi
else
# CPU-only instance: ensure Docker uses runc (not nvidia) as default runtime.
# Brev GPU images pre-configure nvidia as the default Docker runtime even on
# CPU instances, causing "nvidia-container-cli: nvml error: driver not loaded"
# when starting containers.
# Only rewrite daemon.json when it currently pins nvidia as the default
# runtime. grep is line-based; this assumes the key and value sit on the
# same line — true for JSON written with standard indentation.
if grep -q '"default-runtime".*nvidia' /etc/docker/daemon.json 2>/dev/null; then
info "Resetting Docker default runtime to runc (no GPU detected)..."
# Drop the "default-runtime" key in place; Docker then falls back to runc.
# NOTE(review): json.load raises on malformed daemon.json, which would
# make this sudo python3 call fail (and abort the script if `set -e` is
# active upstream) — confirm that is the intended failure mode.
sudo python3 -c "
import json
with open('/etc/docker/daemon.json') as f: cfg = json.load(f)
cfg.pop('default-runtime', None)
with open('/etc/docker/daemon.json', 'w') as f: json.dump(cfg, f, indent=2)
"
# Restart so dockerd re-reads the edited configuration.
sudo systemctl restart docker
info "Docker runtime reset to runc"
fi
fi

# --- 3. openshell CLI (binary release, not pip) ---
Expand Down
3 changes: 2 additions & 1 deletion scripts/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ info "Starting OpenShell gateway..."
openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | grep . && docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | xargs docker volume rm || true
GATEWAY_ARGS=(--name nemoclaw)
command -v nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
# Only enable GPU if nvidia-smi actually works (driver loaded), not just present on PATH
nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
if ! openshell gateway start "${GATEWAY_ARGS[@]}" 2>&1 | grep -E "Gateway|✓|Error|error"; then
warn "Gateway start failed. Cleaning up stale state..."
openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
Expand Down
40 changes: 23 additions & 17 deletions test/e2e/brev-e2e.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
*
* Optional env vars:
* TEST_SUITE — which test to run: full (default), credential-sanitization, telegram-injection, all
* BREV_MIN_VCPU — Minimum vCPUs for CPU instance (default: 4)
* BREV_MIN_RAM — Minimum RAM in GB for CPU instance (default: 16)
* BREV_INSTANCE_TYPE — Brev/GCP instance type (default: n2-standard-4)
*/

import { describe, it, expect, beforeAll, afterAll } from "vitest";
Expand All @@ -28,8 +27,8 @@ import { homedir } from "node:os";
import path from "node:path";

// CPU instance specs: min vCPUs and RAM for the instance search
const BREV_MIN_VCPU = parseInt(process.env.BREV_MIN_VCPU || "4", 10);
const BREV_MIN_RAM = parseInt(process.env.BREV_MIN_RAM || "16", 10);
// Use a known CPU-only GCP instance type to avoid GPU images with broken nvidia runtime
const BREV_INSTANCE_TYPE = process.env.BREV_INSTANCE_TYPE || "n2-standard-4";
const INSTANCE_NAME = process.env.INSTANCE_NAME;
const TEST_SUITE = process.env.TEST_SUITE || "full";
const REPO_DIR = path.resolve(import.meta.dirname, "../..");
Expand Down Expand Up @@ -58,19 +57,15 @@ function ssh(cmd, { timeout = 120_000, stream = false } = {}) {
return stream ? "" : result.trim();
}

/**
* Escape a value for safe inclusion in a single-quoted shell string.
* Replaces single quotes with the shell-safe sequence: '\''
*/
function shellEscape(value) {
return String(value).replace(/'/g, "'\\''");
/**
 * Quote a value for safe inclusion in a POSIX shell command line.
 * Wraps the value in single quotes, escaping each embedded single
 * quote with the standard '\'' sequence.
 */
function shellQuote(value) {
  const escaped = String(value).split("'").join("'\\''");
  return "'" + escaped + "'";
}

/** Run a command on the remote VM with env vars set for NemoClaw. */
function sshEnv(cmd, { timeout = 600_000, stream = false } = {}) {
const envPrefix = [
`export NVIDIA_API_KEY='${shellEscape(process.env.NVIDIA_API_KEY)}'`,
`export GITHUB_TOKEN='${shellEscape(process.env.GITHUB_TOKEN)}'`,
`export NVIDIA_API_KEY=${shellQuote(process.env.NVIDIA_API_KEY)}`,
`export GITHUB_TOKEN=${shellQuote(process.env.GITHUB_TOKEN)}`,
`export NEMOCLAW_NON_INTERACTIVE=1`,
`export NEMOCLAW_SANDBOX_NAME=e2e-test`,
].join(" && ");
Expand Down Expand Up @@ -131,12 +126,18 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
);
brev("login", "--token", process.env.BREV_API_TOKEN);

// Create bare CPU instance via brev search cpu | brev create
console.log(`[${elapsed()}] Creating CPU instance via brev search cpu | brev create...`);
console.log(`[${elapsed()}] min-vcpu: ${BREV_MIN_VCPU}, min-ram: ${BREV_MIN_RAM}GB`);
// Delete any leftover instance from a previous failed run
try {
brev("delete", INSTANCE_NAME);
console.log(`[${elapsed()}] Deleted leftover instance "${INSTANCE_NAME}"`);
} catch {
// Expected — no leftover instance
}

// Create CPU instance with a known GCP instance type
console.log(`[${elapsed()}] Creating CPU instance (type: ${BREV_INSTANCE_TYPE})...`);
execSync(
`brev search cpu --min-vcpu ${BREV_MIN_VCPU} --min-ram ${BREV_MIN_RAM} --sort price | ` +
`brev create ${INSTANCE_NAME} --detached`,
`brev create --type ${BREV_INSTANCE_TYPE} ${INSTANCE_NAME} --detached`,
{ encoding: "utf-8", timeout: 180_000, stdio: ["pipe", "inherit", "inherit"] },
);
instanceCreated = true;
Expand All @@ -161,6 +162,11 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
);
console.log(`[${elapsed()}] Code synced`);

// Wait for cloud-init to finish — Brev instances run apt provisioning on boot
console.log(`[${elapsed()}] Waiting for cloud-init to finish...`);
ssh(`cloud-init status --wait 2>/dev/null || true`, { timeout: 600_000, stream: true });
console.log(`[${elapsed()}] cloud-init done`);

// Bootstrap VM — stream output to CI log so we can see progress
console.log(`[${elapsed()}] Running brev-setup.sh (bootstrap)...`);
sshEnv(`cd ${remoteDir} && SKIP_VLLM=1 bash scripts/brev-setup.sh`, {
Expand Down
Loading
Loading