Skip to content

Commit 4487598

Browse files
BenediktSchackenbergbenediktLastLaughingLobster
authored
fix(onboard): force chat completions API for vLLM and NIM-local providers (#980)
* fix(onboard): force chat completions API for vLLM and NIM-local providers

  vLLM's /v1/responses endpoint does not run the --tool-call-parser, so tool calls arrive as raw text in the response content instead of the structured tool_calls array. The probe picks openai-responses because vLLM accepts the request, but tool-call parsing only works on /v1/chat/completions. Override preferredInferenceApi to openai-completions after validation for both the vllm and nim-local provider paths.

  Fixes #976

* fix: log when overriding probe result to chat completions

  Surface the API override during onboard so users see why the responses API was not selected.

* test: add regression test for vLLM chat completions override

  Verifies that setupNim() forces preferredInferenceApi to openai-completions for the vLLM provider path even when the probe detects openai-responses first. This locks down the fix for #976 so the tool-call-parser override cannot silently regress.

* test: add NIM-local chat completions override regression test

  Companion test for the vLLM case: verifies that setupNim() forces openai-completions for the NIM-local path too, since NIM uses vLLM internally and has the same tool-call-parser limitation.

---------

Co-authored-by: Benedikt Schackenberg <[email protected]>
Co-authored-by: Benedikt Schackenberg <[email protected]>
1 parent a1a93c2 commit 4487598

File tree

2 files changed

+220
-0
lines changed

2 files changed

+220
-0
lines changed

bin/lib/onboard.js

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2180,6 +2180,12 @@ async function setupNim(gpu) {
21802180
if (!preferredInferenceApi) {
21812181
continue selectionLoop;
21822182
}
2183+
// NIM uses vLLM internally — same tool-call-parser limitation
2184+
// applies to /v1/responses. Force chat completions.
2185+
if (preferredInferenceApi !== "openai-completions") {
2186+
console.log(" ℹ Using chat completions API (tool-call-parser requires /v1/chat/completions)");
2187+
}
2188+
preferredInferenceApi = "openai-completions";
21832189
}
21842190
}
21852191
break;
@@ -2296,6 +2302,13 @@ async function setupNim(gpu) {
22962302
if (!preferredInferenceApi) {
22972303
continue selectionLoop;
22982304
}
2305+
// Force chat completions — vLLM's /v1/responses endpoint does not
2306+
// run the --tool-call-parser, so tool calls arrive as raw text.
2307+
// See: https://github.com/NVIDIA/NemoClaw/issues/976
2308+
if (preferredInferenceApi !== "openai-completions") {
2309+
console.log(" ℹ Using chat completions API (tool-call-parser requires /v1/chat/completions)");
2310+
}
2311+
preferredInferenceApi = "openai-completions";
22992312
break;
23002313
}
23012314
}

test/onboard-selection.test.js

Lines changed: 207 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1303,4 +1303,211 @@ const { setupNim } = require(${onboardPath});
13031303
assert.ok(payload.lines.some((line) => line.includes("Please choose a provider/model again")));
13041304
assert.equal(payload.messages.filter((message) => /Choose \[/.test(message)).length, 2);
13051305
});
1306+
1307+
// Regression test for the vLLM provider path: runs setupNim() in a child
// Node process with a fake `curl` first on PATH and stubbed prompt/runner
// helpers, then asserts that the returned preferredInferenceApi is
// "openai-completions" and that the override notice was logged.
it("forces openai-completions for vLLM even when probe detects openai-responses", () => {
1308+
const repoRoot = path.join(import.meta.dirname, "..");
1309+
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-vllm-override-"));
1310+
const fakeBin = path.join(tmpDir, "bin");
1311+
const scriptPath = path.join(tmpDir, "vllm-override-check.js");
1312+
// JSON.stringify turns each absolute path into a quoted JS string literal so
// it can be interpolated directly into the require(...) calls of the
// generated script below.
const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js"));
1313+
const credentialsPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "credentials.js"));
1314+
const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js"));
1315+
1316+
fs.mkdirSync(fakeBin, { recursive: true });
1317+
// Fake curl: /v1/responses returns 200 (so probe detects openai-responses),
1318+
// /v1/models returns a vLLM model list
// The script treats the argument after -o as the output file and every other
// argument as the URL (the last one wins). It writes the canned body for the
// matching endpoint to the output file and prints the status ("200") on
// stdout; URLs that match none of the three endpoints get an empty body.
1319+
fs.writeFileSync(
1320+
path.join(fakeBin, "curl"),
1321+
`#!/usr/bin/env bash
1322+
body=''
1323+
status="200"
1324+
outfile=""
1325+
url=""
1326+
while [ "$#" -gt 0 ]; do
1327+
case "$1" in
1328+
-o) outfile="$2"; shift 2 ;;
1329+
*) url="$1"; shift ;;
1330+
esac
1331+
done
1332+
if echo "$url" | grep -q '/v1/models'; then
1333+
body='{"data":[{"id":"meta-llama/Llama-3.3-70B-Instruct"}]}'
1334+
elif echo "$url" | grep -q '/v1/responses'; then
1335+
body='{"id":"resp_123","output":[{"type":"message","content":[{"type":"output_text","text":"ok"}]}]}'
1336+
elif echo "$url" | grep -q '/v1/chat/completions'; then
1337+
body='{"id":"chatcmpl-123","choices":[{"message":{"content":"ok"}}]}'
1338+
fi
1339+
printf '%s' "$body" > "$outfile"
1340+
printf '%s' "$status"
1341+
`,
1342+
{ mode: 0o755 },
1343+
);
1344+
1345+
// vLLM is option 7 (build, openai, custom, anthropic, anthropicCompatible, gemini, vllm)
// The generated script replaces credentials.prompt with a stub that records
// each prompt message and replays the queued answers (empty string once the
// queue is exhausted), makes credentials.ensureApiKey a no-op, and replaces
// runner.runCapture so that only the localhost:8000/v1/models command returns
// a model list. console.log output is collected into `lines` while setupNim
// runs, and { result, messages, lines } is printed as JSON through the saved
// original console.log. Any rejection is logged to stderr and exits with 1.
1346+
const script = String.raw`
1347+
const credentials = require(${credentialsPath});
1348+
const runner = require(${runnerPath});
1349+
1350+
const answers = ["7"];
1351+
const messages = [];
1352+
1353+
credentials.prompt = async (message) => {
1354+
messages.push(message);
1355+
return answers.shift() || "";
1356+
};
1357+
credentials.ensureApiKey = async () => {};
1358+
runner.runCapture = (command) => {
1359+
if (command.includes("command -v ollama")) return "";
1360+
if (command.includes("localhost:11434")) return "";
1361+
if (command.includes("localhost:8000/v1/models")) return JSON.stringify({ data: [{ id: "meta-llama/Llama-3.3-70B-Instruct" }] });
1362+
return "";
1363+
};
1364+
1365+
const { setupNim } = require(${onboardPath});
1366+
1367+
(async () => {
1368+
const originalLog = console.log;
1369+
const lines = [];
1370+
console.log = (...args) => lines.push(args.join(" "));
1371+
try {
1372+
const result = await setupNim(null);
1373+
originalLog(JSON.stringify({ result, messages, lines }));
1374+
} finally {
1375+
console.log = originalLog;
1376+
}
1377+
})().catch((error) => {
1378+
console.error(error);
1379+
process.exit(1);
1380+
});
1381+
`;
1382+
fs.writeFileSync(scriptPath, script);
1383+
1384+
// Run the generated script with the current Node binary. HOME points at the
// temp dir and fakeBin is prepended to PATH so the fake curl is resolved
// before any real one.
// NOTE(review): NEMOCLAW_EXPERIMENTAL=1 presumably exposes the local vLLM
// entry in the provider menu — confirm against onboard.js.
const result = spawnSync(process.execPath, [scriptPath], {
1385+
cwd: repoRoot,
1386+
encoding: "utf-8",
1387+
env: {
1388+
...process.env,
1389+
HOME: tmpDir,
1390+
PATH: `${fakeBin}:${process.env.PATH || ""}`,
1391+
NEMOCLAW_EXPERIMENTAL: "1",
1392+
},
1393+
});
1394+
1395+
// The child's stderr is used as the failure message when it exits non-zero;
// on success its stdout is expected to be the single JSON payload line.
assert.equal(result.status, 0, result.stderr);
1396+
const payload = JSON.parse(result.stdout.trim());
1397+
assert.equal(payload.result.provider, "vllm-local");
1398+
assert.equal(payload.result.model, "meta-llama/Llama-3.3-70B-Instruct");
1399+
// Key assertion: even though probe detected openai-responses, the override
1400+
// forces openai-completions so tool-call-parser works correctly.
1401+
assert.equal(payload.result.preferredInferenceApi, "openai-completions");
1402+
assert.ok(payload.lines.some((line) => line.includes("Using existing vLLM")));
1403+
// The override notice must have been logged, i.e. the probe result really
// was something other than openai-completions before it was overridden.
assert.ok(payload.lines.some((line) => line.includes("tool-call-parser requires")));
1404+
});
1405+
1406+
// Companion regression test for the NIM-local provider path: runs setupNim()
// in a child Node process with a fake `curl` first on PATH, a stubbed nim
// module and stubbed prompt/runner helpers, then asserts that the returned
// preferredInferenceApi is "openai-completions" and that the override notice
// was logged.
it("forces openai-completions for NIM-local even when probe detects openai-responses", () => {
1407+
const repoRoot = path.join(import.meta.dirname, "..");
1408+
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-nim-override-"));
1409+
const fakeBin = path.join(tmpDir, "bin");
1410+
const scriptPath = path.join(tmpDir, "nim-override-check.js");
1411+
// JSON.stringify turns each absolute path into a quoted JS string literal so
// it can be interpolated directly into the require(...) calls of the
// generated script below.
const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js"));
1412+
const credentialsPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "credentials.js"));
1413+
const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js"));
1414+
const nimPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "nim.js"));
1415+
1416+
fs.mkdirSync(fakeBin, { recursive: true });
1417+
// Fake curl: /v1/responses returns 200 (probe detects openai-responses)
// The script treats the argument after -o as the output file and every other
// argument as the URL (the last one wins). It writes the canned body for the
// matching endpoint to the output file and prints the status ("200") on
// stdout; URLs that match none of the three endpoints get an empty body.
1418+
fs.writeFileSync(
1419+
path.join(fakeBin, "curl"),
1420+
`#!/usr/bin/env bash
1421+
body=''
1422+
status="200"
1423+
outfile=""
1424+
url=""
1425+
while [ "$#" -gt 0 ]; do
1426+
case "$1" in
1427+
-o) outfile="$2"; shift 2 ;;
1428+
*) url="$1"; shift ;;
1429+
esac
1430+
done
1431+
if echo "$url" | grep -q '/v1/models'; then
1432+
body='{"data":[{"id":"nvidia/nemotron-3-nano"}]}'
1433+
elif echo "$url" | grep -q '/v1/responses'; then
1434+
body='{"id":"resp_123","output":[{"type":"message","content":[{"type":"output_text","text":"ok"}]}]}'
1435+
elif echo "$url" | grep -q '/v1/chat/completions'; then
1436+
body='{"id":"chatcmpl-123","choices":[{"message":{"content":"ok"}}]}'
1437+
fi
1438+
printf '%s' "$body" > "$outfile"
1439+
printf '%s' "$status"
1440+
`,
1441+
{ mode: 0o755 },
1442+
);
1443+
1444+
// NIM-local is option 7 (build, openai, custom, anthropic, anthropicCompatible, gemini, nim-local)
1445+
// No ollama, no vLLM — only NIM-local shows up as experimental option
// The generated script overrides listModels, pullNimImage, containerName,
// startNimContainerByName and waitForNimHealth on the nim module object with
// canned implementations before onboard.js is required. It replaces
// credentials.prompt with a stub that records each prompt message and replays
// the queued answers ("7", then "1"), makes credentials.ensureApiKey a no-op,
// and replaces runner.runCapture with a stub that returns "" for every
// command. console.log output is collected into `lines` while setupNim runs,
// and { result, messages, lines } is printed as JSON through the saved
// original console.log. Any rejection is logged to stderr and exits with 1.
1446+
const script = String.raw`
1447+
const credentials = require(${credentialsPath});
1448+
const runner = require(${runnerPath});
1449+
1450+
// Mock nim module before onboard.js requires it
1451+
const nimMod = require(${nimPath});
1452+
nimMod.listModels = () => [{ name: "nvidia/nemotron-3-nano", image: "fake", minGpuMemoryMB: 8000 }];
1453+
nimMod.pullNimImage = () => {};
1454+
nimMod.containerName = () => "nemoclaw-nim-test";
1455+
nimMod.startNimContainerByName = () => "container-123";
1456+
nimMod.waitForNimHealth = () => true;
1457+
1458+
// Select option 7 (nim-local), then model 1
1459+
const answers = ["7", "1"];
1460+
const messages = [];
1461+
1462+
credentials.prompt = async (message) => {
1463+
messages.push(message);
1464+
return answers.shift() || "";
1465+
};
1466+
credentials.ensureApiKey = async () => {};
1467+
runner.runCapture = (command) => {
1468+
if (command.includes("command -v ollama")) return "";
1469+
if (command.includes("localhost:11434")) return "";
1470+
if (command.includes("localhost:8000/v1/models")) return "";
1471+
return "";
1472+
};
1473+
1474+
const { setupNim } = require(${onboardPath});
1475+
1476+
(async () => {
1477+
const originalLog = console.log;
1478+
const lines = [];
1479+
console.log = (...args) => lines.push(args.join(" "));
1480+
try {
1481+
// Pass a GPU object with nimCapable: true
1482+
const result = await setupNim({ type: "nvidia", totalMemoryMB: 16000, nimCapable: true });
1483+
originalLog(JSON.stringify({ result, messages, lines }));
1484+
} finally {
1485+
console.log = originalLog;
1486+
}
1487+
})().catch((error) => {
1488+
console.error(error);
1489+
process.exit(1);
1490+
});
1491+
`;
1492+
fs.writeFileSync(scriptPath, script);
1493+
1494+
// Run the generated script with the current Node binary. HOME points at the
// temp dir and fakeBin is prepended to PATH so the fake curl is resolved
// before any real one.
// NOTE(review): NEMOCLAW_EXPERIMENTAL=1 presumably exposes the NIM-local
// entry in the provider menu — confirm against onboard.js.
const result = spawnSync(process.execPath, [scriptPath], {
1495+
cwd: repoRoot,
1496+
encoding: "utf-8",
1497+
env: {
1498+
...process.env,
1499+
HOME: tmpDir,
1500+
PATH: `${fakeBin}:${process.env.PATH || ""}`,
1501+
NEMOCLAW_EXPERIMENTAL: "1",
1502+
},
1503+
});
1504+
1505+
// The child's stderr is used as the failure message when it exits non-zero;
// on success its stdout is expected to be the single JSON payload line.
assert.equal(result.status, 0, result.stderr);
1506+
const payload = JSON.parse(result.stdout.trim());
1507+
// NOTE(review): the NIM-local path is asserted to report provider
// "vllm-local" — presumably setupNim() reuses that provider id for NIM
// containers; confirm against onboard.js.
assert.equal(payload.result.provider, "vllm-local");
1508+
assert.equal(payload.result.model, "nvidia/nemotron-3-nano");
1509+
// Key assertion: NIM uses vLLM internally — same override must apply.
1510+
assert.equal(payload.result.preferredInferenceApi, "openai-completions");
1511+
// The override notice must have been logged, i.e. the probe result really
// was something other than openai-completions before it was overridden.
assert.ok(payload.lines.some((line) => line.includes("tool-call-parser requires")));
1512+
});
13061513
});

0 commit comments

Comments
 (0)