Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/e2e-brev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ concurrency:

jobs:
e2e-brev:
if: github.repository == 'NVIDIA/NemoClaw'
# Note: Remove this condition when testing on forks
# if: github.repository == 'NVIDIA/NemoClaw'
runs-on: ubuntu-latest
timeout-minutes: 90
steps:
Expand Down
64 changes: 59 additions & 5 deletions scripts/brev-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,29 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
export NEEDRESTART_MODE=a
export DEBIAN_FRONTEND=noninteractive

# Wait for any existing apt locks (e.g. Brev's own provisioning on boot).
#
# Blocks until the dpkg/apt lock files are free AND no apt/dpkg process is
# still running. Shares a single ${max_wait}s budget across both phases and
# calls fail() (which exits) if it is exceeded.
#
# Globals:   none
# Arguments: none
# Outputs:   progress via info(); aborts via fail()
# Returns:   0 once apt is quiescent
wait_for_apt() {
  local max_wait=300 # 5 minutes
  local waited=0
  while fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock >/dev/null 2>&1; do
    if [ "$waited" -ge "$max_wait" ]; then
      fail "apt lock still held after ${max_wait}s — another process is stuck"
    fi
    info "Waiting for apt lock to be released... (${waited}s)"
    sleep 10
    waited=$((waited + 10))
  done
  # Wait for any apt processes to fully exit — locks can be released
  # momentarily between apt operations in a multi-step provisioning sequence.
  # NOTE: pgrep patterns are extended regexes, so alternation is a bare '|'.
  # The previous BRE-style "apt-get\|apt\|dpkg" combined with -x could only
  # match a process literally named "apt-get|apt|dpkg" — i.e. never — which
  # silently disabled this safeguard.
  while pgrep -x "apt-get|apt|dpkg" >/dev/null 2>&1; do
    if [ "$waited" -ge "$max_wait" ]; then
      fail "apt/dpkg processes still running after ${max_wait}s"
    fi
    info "Waiting for apt/dpkg processes to finish..."
    sleep 5
    waited=$((waited + 5))
  done
}

# --- 0. Node.js (needed for services) ---
if ! command -v node >/dev/null 2>&1; then
wait_for_apt
info "Installing Node.js..."
NODESOURCE_URL="https://deb.nodesource.com/setup_22.x"
NODESOURCE_SHA256="575583bbac2fccc0b5edd0dbc03e222d9f9dc8d724da996d22754d6411104fd1"
Expand All @@ -55,27 +76,44 @@ if ! command -v node >/dev/null 2>&1; then
else
fail "No SHA-256 verification tool found (need sha256sum or shasum)"
fi
sudo -E bash "$tmpdir/setup_node.sh" >/dev/null 2>&1
sudo -E bash "$tmpdir/setup_node.sh"
)
sudo apt-get install -y -qq nodejs >/dev/null 2>&1
# Retry apt-get install — Brev provisioning can grab the lock between operations
for attempt in 1 2 3 4 5; do
wait_for_apt
if sudo apt-get install -y -qq nodejs 2>&1; then
break
fi
info "apt-get install nodejs failed (attempt $attempt/5), retrying..."
sleep 10
done
info "Node.js $(node --version) installed"
else
info "Node.js already installed: $(node --version)"
fi

# --- 1. Docker ---
if ! command -v docker >/dev/null 2>&1; then
wait_for_apt
info "Installing Docker..."
sudo apt-get update -qq >/dev/null 2>&1
sudo apt-get install -y -qq docker.io >/dev/null 2>&1
for attempt in 1 2 3 4 5; do
wait_for_apt
if sudo apt-get update -qq >/dev/null 2>&1 && sudo apt-get install -y -qq docker.io >/dev/null 2>&1; then
break
fi
info "Docker install failed (attempt $attempt/5), retrying..."
sleep 10
done
sudo usermod -aG docker "$(whoami)"
info "Docker installed"
else
info "Docker already installed"
fi

# --- 2. NVIDIA Container Toolkit (if GPU present) ---
if command -v nvidia-smi >/dev/null 2>&1; then
# Check that nvidia-smi actually works, not just that the binary exists.
# Brev GPU images ship nvidia-smi even on CPU-only instances.
if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
Comment on lines +113 to +115
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use one GPU capability flag across all GPU-gated sections.

This block correctly checks runnable nvidia-smi, but later vLLM gating still checks only binary presence (Line 168). On CPU-only images that still ship nvidia-smi, that can trigger wrong behavior.

Proposed fix
+# Detect GPU availability once and reuse everywhere.
+HAS_GPU=false
+if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
+  HAS_GPU=true
+fi
+
 # --- 2. NVIDIA Container Toolkit (if GPU present) ---
-if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
+if [ "$HAS_GPU" = true ]; then
   ...
 fi

 # --- 4. vLLM (local inference, if GPU present) ---
 ...
-elif command -v nvidia-smi >/dev/null 2>&1; then
+elif [ "$HAS_GPU" = true ]; then
   ...
 fi
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/brev-setup.sh` around lines 86 - 88, The script currently tests a
runnable nvidia-smi in one place but later re-checks only binary presence when
gating vLLM; define and export a single GPU capability flag (e.g.,
GPU_AVAILABLE) right after the runnable check (the block using command -v
nvidia-smi and nvidia-smi) and then replace other ad-hoc checks (the later vLLM
gating that only calls command -v nvidia-smi) to consult that single flag;
ensure the flag is set to true only when nvidia-smi successfully runs and false
otherwise, and use that variable everywhere the script needs to decide GPU vs
CPU behavior (including vLLM gating).

if ! dpkg -s nvidia-container-toolkit >/dev/null 2>&1; then
info "Installing NVIDIA Container Toolkit..."
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
Expand All @@ -91,6 +129,22 @@ if command -v nvidia-smi >/dev/null 2>&1; then
else
info "NVIDIA Container Toolkit already installed"
fi
else
# CPU-only instance: ensure Docker uses runc (not nvidia) as default runtime.
# Brev GPU images pre-configure nvidia as the default Docker runtime even on
# CPU instances, causing "nvidia-container-cli: nvml error: driver not loaded"
# when starting containers.
if grep -q '"default-runtime".*nvidia' /etc/docker/daemon.json 2>/dev/null; then
info "Resetting Docker default runtime to runc (no GPU detected)..."
sudo python3 -c "
import json
with open('/etc/docker/daemon.json') as f: cfg = json.load(f)
cfg.pop('default-runtime', None)
with open('/etc/docker/daemon.json', 'w') as f: json.dump(cfg, f, indent=2)
"
sudo systemctl restart docker
info "Docker runtime reset to runc"
fi
fi

# --- 3. openshell CLI (binary release, not pip) ---
Expand Down
3 changes: 2 additions & 1 deletion scripts/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ info "Starting OpenShell gateway..."
openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | grep . && docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | xargs docker volume rm || true
GATEWAY_ARGS=(--name nemoclaw)
command -v nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
# Only enable GPU if nvidia-smi actually works (driver loaded), not just present on PATH
nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
if ! openshell gateway start "${GATEWAY_ARGS[@]}" 2>&1 | grep -E "Gateway|✓|Error|error"; then
warn "Gateway start failed. Cleaning up stale state..."
openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
Expand Down
34 changes: 24 additions & 10 deletions test/e2e/brev-e2e.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,11 @@ function ssh(cmd, { timeout = 120_000, stream = false } = {}) {
return stream ? "" : result.trim();
}

/**
* Escape a value for safe inclusion in a single-quoted shell string.
* Replaces single quotes with the shell-safe sequence: '\''
*/
function shellEscape(value) {
return String(value).replace(/'/g, "'\\''");
}

/** Run a command on the remote VM with env vars set for NemoClaw. */
function sshEnv(cmd, { timeout = 600_000, stream = false } = {}) {
const envPrefix = [
`export NVIDIA_API_KEY='${shellEscape(process.env.NVIDIA_API_KEY)}'`,
`export GITHUB_TOKEN='${shellEscape(process.env.GITHUB_TOKEN)}'`,
`export NVIDIA_API_KEY='${process.env.NVIDIA_API_KEY}'`,
`export GITHUB_TOKEN='${process.env.GITHUB_TOKEN}'`,
`export NEMOCLAW_NON_INTERACTIVE=1`,
`export NEMOCLAW_SANDBOX_NAME=e2e-test`,
].join(" && ");
Expand Down Expand Up @@ -131,6 +123,14 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
);
brev("login", "--token", process.env.BREV_API_TOKEN);

// Delete any leftover instance from a previous failed run
try {
brev("delete", INSTANCE_NAME);
console.log(`[${elapsed()}] Deleted leftover instance "${INSTANCE_NAME}"`);
} catch {
// Expected — no leftover instance
}

// Create bare CPU instance via brev search cpu | brev create
console.log(`[${elapsed()}] Creating CPU instance via brev search cpu | brev create...`);
console.log(`[${elapsed()}] min-vcpu: ${BREV_MIN_VCPU}, min-ram: ${BREV_MIN_RAM}GB`);
Expand Down Expand Up @@ -161,6 +161,20 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
);
console.log(`[${elapsed()}] Code synced`);

// Wait for Brev's provisioning to finish — it runs apt-get on boot
// and the locks can be released between operations, so we wait for
// both locks AND processes to be gone.
console.log(`[${elapsed()}] Waiting for Brev provisioning (apt) to finish...`);
ssh(
`for i in $(seq 1 120); do ` +
`if fuser /var/lib/dpkg/lock-frontend /var/lib/apt/lists/lock /var/cache/apt/archives/lock >/dev/null 2>&1 || ` +
`pgrep -x "apt-get|apt|dpkg" >/dev/null 2>&1; then ` +
`echo "apt still running... ($i/120)"; sleep 5; ` +
`else break; fi; done`,
{ timeout: 660_000, stream: true },
);
console.log(`[${elapsed()}] apt done`);

// Bootstrap VM — stream output to CI log so we can see progress
console.log(`[${elapsed()}] Running brev-setup.sh (bootstrap)...`);
sshEnv(`cd ${remoteDir} && SKIP_VLLM=1 bash scripts/brev-setup.sh`, {
Expand Down
Loading
Loading