Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e-brev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ concurrency:

jobs:
e2e-brev:
if: github.repository == 'NVIDIA/NemoClaw'
if: github.repository == 'NVIDIA/NemoClaw' || github.repository == 'jyaunches/NemoClaw'
runs-on: ubuntu-latest
timeout-minutes: 90
steps:
Expand Down
63 changes: 58 additions & 5 deletions scripts/brev-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,28 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
export NEEDRESTART_MODE=a
export DEBIAN_FRONTEND=noninteractive

# ── Acquire exclusive apt access ──────────────────────────────────
# Cloud VMs run multiple apt processes on boot: cloud-init, apt-daily,
# and unattended-upgrades. We must wait for cloud-init AND disable the
# background services before we can safely use apt ourselves.
# Every step below is deliberately best-effort (`|| true`): a missing
# unit, process, or lock file must not abort the whole bootstrap.
if command -v cloud-init >/dev/null 2>&1; then
info "Waiting for cloud-init to finish..."
# --wait blocks until cloud-init reaches a terminal state; output is
# discarded and a non-zero exit (e.g. "degraded") is tolerated.
cloud-init status --wait >/dev/null 2>&1 || true
info "cloud-init done"
fi
# Stop background apt services that run independently of cloud-init
info "Disabling background apt services..."
sudo systemctl stop apt-daily.timer apt-daily-upgrade.timer unattended-upgrades 2>/dev/null || true
sudo systemctl disable apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true
# Kill any straggler apt/dpkg processes
# NOTE(review): SIGKILLing dpkg mid-transaction can leave the package
# database half-written; the `dpkg --configure -a` below exists to
# repair exactly that state, so keep these steps in this order.
sudo pkill -9 -x apt-get 2>/dev/null || true
sudo pkill -9 -x apt 2>/dev/null || true
sudo pkill -9 -x dpkg 2>/dev/null || true
# Release any stale locks left by killed processes
sudo rm -f /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock 2>/dev/null || true
# Finish any interrupted package configuration so later installs succeed.
sudo dpkg --configure -a 2>/dev/null || true
info "apt is now exclusively ours"

# --- 0. Node.js (needed for services) ---
if ! command -v node >/dev/null 2>&1; then
info "Installing Node.js..."
Expand All @@ -55,9 +77,16 @@ if ! command -v node >/dev/null 2>&1; then
else
fail "No SHA-256 verification tool found (need sha256sum or shasum)"
fi
sudo -E bash "$tmpdir/setup_node.sh" >/dev/null 2>&1
sudo -E bash "$tmpdir/setup_node.sh"
)
sudo apt-get install -y -qq nodejs >/dev/null 2>&1
# Retry the nodejs install up to 5 times: apt can fail transiently
# right after boot while repository mirrors / locks settle.
attempt=0
until sudo apt-get install -y -qq nodejs; do
  attempt=$((attempt + 1))
  if [ "$attempt" -ge 5 ]; then
    fail "Node.js install failed after 5 attempts"
  fi
  info "Node.js install failed (attempt $attempt/5), retrying in 30s..."
  sleep 30
done
info "Node.js $(node --version) installed"
else
info "Node.js already installed: $(node --version)"
Expand All @@ -66,16 +95,24 @@ fi
# --- 1. Docker ---
if ! command -v docker >/dev/null 2>&1; then
info "Installing Docker..."
sudo apt-get update -qq >/dev/null 2>&1
sudo apt-get install -y -qq docker.io >/dev/null 2>&1
# Retry update+install up to 5 times: transient apt/mirror failures are
# common immediately after instance boot.
attempt=0
until sudo apt-get update -qq && sudo apt-get install -y -qq docker.io; do
  attempt=$((attempt + 1))
  if [ "$attempt" -ge 5 ]; then
    fail "Docker install failed after 5 attempts"
  fi
  info "Docker install failed (attempt $attempt/5), retrying in 30s..."
  sleep 30
done
sudo usermod -aG docker "$(whoami)"
info "Docker installed"
else
info "Docker already installed"
fi

# --- 2. NVIDIA Container Toolkit (if GPU present) ---
if command -v nvidia-smi >/dev/null 2>&1; then
# Check that nvidia-smi actually works, not just that the binary exists.
# Brev GPU images ship nvidia-smi even on CPU-only instances.
if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
Comment on lines +113 to +115
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use one GPU capability flag across all GPU-gated sections.

This block correctly checks runnable nvidia-smi, but later vLLM gating still checks only binary presence (Line 168). On CPU-only images that still ship nvidia-smi, that can trigger wrong behavior.

Proposed fix
+# Detect GPU availability once and reuse everywhere.
+HAS_GPU=false
+if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
+  HAS_GPU=true
+fi
+
 # --- 2. NVIDIA Container Toolkit (if GPU present) ---
-if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
+if [ "$HAS_GPU" = true ]; then
   ...
 fi

 # --- 4. vLLM (local inference, if GPU present) ---
 ...
-elif command -v nvidia-smi >/dev/null 2>&1; then
+elif [ "$HAS_GPU" = true ]; then
   ...
 fi
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/brev-setup.sh` around lines 86 - 88, The script currently tests a
runnable nvidia-smi in one place but later re-checks only binary presence when
gating vLLM; define and export a single GPU capability flag (e.g.,
GPU_AVAILABLE) right after the runnable check (the block using command -v
nvidia-smi and nvidia-smi) and then replace other ad-hoc checks (the later vLLM
gating that only calls command -v nvidia-smi) to consult that single flag;
ensure the flag is set to true only when nvidia-smi successfully runs and false
otherwise, and use that variable everywhere the script needs to decide GPU vs
CPU behavior (including vLLM gating).

if ! dpkg -s nvidia-container-toolkit >/dev/null 2>&1; then
info "Installing NVIDIA Container Toolkit..."
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
Expand All @@ -91,6 +128,22 @@ if command -v nvidia-smi >/dev/null 2>&1; then
else
info "NVIDIA Container Toolkit already installed"
fi
else
# CPU-only instance: ensure Docker uses runc (not nvidia) as default runtime.
# Brev GPU images pre-configure nvidia as the default Docker runtime even on
# CPU instances, causing "nvidia-container-cli: nvml error: driver not loaded"
# when starting containers.
# Only rewrite daemon.json when it currently pins nvidia as the default
# runtime. grep is line-based; this assumes the key and value sit on the
# same line — true for JSON written with standard indentation.
if grep -q '"default-runtime".*nvidia' /etc/docker/daemon.json 2>/dev/null; then
info "Resetting Docker default runtime to runc (no GPU detected)..."
# Drop the "default-runtime" key in place; Docker then falls back to runc.
# NOTE(review): json.load raises on malformed daemon.json, which would
# make this sudo python3 call fail (and abort the script if `set -e` is
# active upstream) — confirm that is the intended failure mode.
sudo python3 -c "
import json
with open('/etc/docker/daemon.json') as f: cfg = json.load(f)
cfg.pop('default-runtime', None)
with open('/etc/docker/daemon.json', 'w') as f: json.dump(cfg, f, indent=2)
"
# Restart so dockerd re-reads the edited configuration.
sudo systemctl restart docker
info "Docker runtime reset to runc"
fi
fi

# --- 3. openshell CLI (binary release, not pip) ---
Expand Down
3 changes: 2 additions & 1 deletion scripts/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ info "Starting OpenShell gateway..."
openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | grep . && docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | xargs docker volume rm || true
GATEWAY_ARGS=(--name nemoclaw)
command -v nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
# Only enable GPU if nvidia-smi actually works (driver loaded), not just present on PATH
nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
if ! openshell gateway start "${GATEWAY_ARGS[@]}" 2>&1 | grep -E "Gateway|✓|Error|error"; then
warn "Gateway start failed. Cleaning up stale state..."
openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
Expand Down
40 changes: 23 additions & 17 deletions test/e2e/brev-e2e.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
*
* Optional env vars:
* TEST_SUITE — which test to run: full (default), credential-sanitization, telegram-injection, all
* BREV_MIN_VCPU — Minimum vCPUs for CPU instance (default: 4)
* BREV_MIN_RAM — Minimum RAM in GB for CPU instance (default: 16)
* BREV_INSTANCE_TYPE — Brev/GCP instance type (default: n2-standard-4)
*/

import { describe, it, expect, beforeAll, afterAll } from "vitest";
Expand All @@ -28,8 +27,8 @@ import { homedir } from "node:os";
import path from "node:path";

// CPU instance specs: min vCPUs and RAM for the instance search
const BREV_MIN_VCPU = parseInt(process.env.BREV_MIN_VCPU || "4", 10);
const BREV_MIN_RAM = parseInt(process.env.BREV_MIN_RAM || "16", 10);
// Use a known CPU-only GCP instance type to avoid GPU images with broken nvidia runtime
const BREV_INSTANCE_TYPE = process.env.BREV_INSTANCE_TYPE || "n2-standard-4";
const INSTANCE_NAME = process.env.INSTANCE_NAME;
const TEST_SUITE = process.env.TEST_SUITE || "full";
const REPO_DIR = path.resolve(import.meta.dirname, "../..");
Expand Down Expand Up @@ -58,19 +57,15 @@ function ssh(cmd, { timeout = 120_000, stream = false } = {}) {
return stream ? "" : result.trim();
}

/**
* Escape a value for safe inclusion in a single-quoted shell string.
* Replaces single quotes with the shell-safe sequence: '\''
*/
function shellEscape(value) {
return String(value).replace(/'/g, "'\\''");
/**
 * Quote a value for safe inclusion in a POSIX shell command line.
 * Wraps the value in single quotes, escaping each embedded single
 * quote with the standard '\'' sequence.
 */
function shellQuote(value) {
  const escaped = String(value).split("'").join("'\\''");
  return "'" + escaped + "'";
}

/** Run a command on the remote VM with env vars set for NemoClaw. */
function sshEnv(cmd, { timeout = 600_000, stream = false } = {}) {
const envPrefix = [
`export NVIDIA_API_KEY='${shellEscape(process.env.NVIDIA_API_KEY)}'`,
`export GITHUB_TOKEN='${shellEscape(process.env.GITHUB_TOKEN)}'`,
`export NVIDIA_API_KEY=${shellQuote(process.env.NVIDIA_API_KEY)}`,
`export GITHUB_TOKEN=${shellQuote(process.env.GITHUB_TOKEN)}`,
`export NEMOCLAW_NON_INTERACTIVE=1`,
`export NEMOCLAW_SANDBOX_NAME=e2e-test`,
].join(" && ");
Expand Down Expand Up @@ -131,12 +126,18 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
);
brev("login", "--token", process.env.BREV_API_TOKEN);

// Create bare CPU instance via brev search cpu | brev create
console.log(`[${elapsed()}] Creating CPU instance via brev search cpu | brev create...`);
console.log(`[${elapsed()}] min-vcpu: ${BREV_MIN_VCPU}, min-ram: ${BREV_MIN_RAM}GB`);
// Delete any leftover instance from a previous failed run
try {
brev("delete", INSTANCE_NAME);
console.log(`[${elapsed()}] Deleted leftover instance "${INSTANCE_NAME}"`);
} catch {
// Expected — no leftover instance
}

// Create CPU instance with a known GCP instance type
console.log(`[${elapsed()}] Creating CPU instance (type: ${BREV_INSTANCE_TYPE})...`);
execSync(
`brev search cpu --min-vcpu ${BREV_MIN_VCPU} --min-ram ${BREV_MIN_RAM} --sort price | ` +
`brev create ${INSTANCE_NAME} --detached`,
`brev create --type ${BREV_INSTANCE_TYPE} ${INSTANCE_NAME} --detached`,
{ encoding: "utf-8", timeout: 180_000, stdio: ["pipe", "inherit", "inherit"] },
);
instanceCreated = true;
Expand All @@ -161,6 +162,11 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
);
console.log(`[${elapsed()}] Code synced`);

// Wait for cloud-init to finish — Brev instances run apt provisioning on boot
console.log(`[${elapsed()}] Waiting for cloud-init to finish...`);
ssh(`cloud-init status --wait 2>/dev/null || true`, { timeout: 600_000, stream: true });
console.log(`[${elapsed()}] cloud-init done`);

// Bootstrap VM — stream output to CI log so we can see progress
console.log(`[${elapsed()}] Running brev-setup.sh (bootstrap)...`);
sshEnv(`cd ${remoteDir} && SKIP_VLLM=1 bash scripts/brev-setup.sh`, {
Expand Down
Loading
Loading