fix(onboard): force chat completions API for vLLM and NIM-local providers (#980)

BenediktSchackenberg · benedikt · LastLaughingLobster · web-flow · commit 448759894339 · 2026-03-30T08:59:33.000-07:00
* fix(onboard): force chat completions API for vLLM and NIM-local providers

  vLLM's /v1/responses endpoint does not run the --tool-call-parser, so tool calls arrive as raw text in the response content instead of the structured tool_calls array. The probe picks openai-responses because vLLM accepts the request, but tool-call parsing only works on /v1/chat/completions.

  Override preferredInferenceApi to openai-completions after validation for both the vllm and nim-local provider paths.

  Fixes #976

* fix: log when overriding probe result to chat completions

  Surface the API override during onboard so users see why the responses API was not selected.

* test: add regression test for vLLM chat completions override

  Verifies that setupNim() forces preferredInferenceApi to openai-completions for the vLLM provider path even when the probe detects openai-responses first. This locks down the fix for #976 so the tool-call-parser override cannot silently regress.

* test: add NIM-local chat completions override regression test

  Companion test for the vLLM case: verifies that setupNim() forces openai-completions for the NIM-local path too, since NIM uses vLLM internally and has the same tool-call-parser limitation.

---------

Co-authored-by: Benedikt Schackenberg <benedikt@users.noreply.github.com>
Co-authored-by: Benedikt Schackenberg <69834303+BenediktSchackenberg@users.noreply.github.com>
diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js
@@ -2180,6 +2180,12 @@ async function setupNim(gpu) {
           if (!preferredInferenceApi) {
             continue selectionLoop;
           }
+          // NIM uses vLLM internally — same tool-call-parser limitation
+          // applies to /v1/responses. Force chat completions.
+          if (preferredInferenceApi !== "openai-completions") {
+            console.log("  ℹ Using chat completions API (tool-call-parser requires /v1/chat/completions)");
+          }
+          preferredInferenceApi = "openai-completions";
         }
       }
       break;
@@ -2296,6 +2302,13 @@ async function setupNim(gpu) {
       if (!preferredInferenceApi) {
         continue selectionLoop;
       }
+      // Force chat completions — vLLM's /v1/responses endpoint does not
+      // run the --tool-call-parser, so tool calls arrive as raw text.
+      // See: https://github.com/NVIDIA/NemoClaw/issues/976
+      if (preferredInferenceApi !== "openai-completions") {
+        console.log("  ℹ Using chat completions API (tool-call-parser requires /v1/chat/completions)");
+      }
+      preferredInferenceApi = "openai-completions";
       break;
     }
   }
diff --git a/test/onboard-selection.test.js b/test/onboard-selection.test.js
@@ -1303,4 +1303,211 @@ const { setupNim } = require(${onboardPath});
     assert.ok(payload.lines.some((line) => line.includes("Please choose a provider/model again")));
     assert.equal(payload.messages.filter((message) => /Choose \[/.test(message)).length, 2);
   });
+
+  it("forces openai-completions for vLLM even when probe detects openai-responses", () => {
+    const repoRoot = path.join(import.meta.dirname, "..");
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-vllm-override-"));
+    const fakeBin = path.join(tmpDir, "bin");
+    const scriptPath = path.join(tmpDir, "vllm-override-check.js");
+    const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js"));
+    const credentialsPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "credentials.js"));
+    const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js"));
+
+    fs.mkdirSync(fakeBin, { recursive: true });
+    // Fake curl: /v1/responses returns 200 (so probe detects openai-responses),
+    // /v1/models returns a vLLM model list
+    fs.writeFileSync(
+      path.join(fakeBin, "curl"),
+      `#!/usr/bin/env bash
+body=''
+status="200"
+outfile=""
+url=""
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    -o) outfile="$2"; shift 2 ;;
+    *) url="$1"; shift ;;
+  esac
+done
+if echo "$url" | grep -q '/v1/models'; then
+  body='{"data":[{"id":"meta-llama/Llama-3.3-70B-Instruct"}]}'
+elif echo "$url" | grep -q '/v1/responses'; then
+  body='{"id":"resp_123","output":[{"type":"message","content":[{"type":"output_text","text":"ok"}]}]}'
+elif echo "$url" | grep -q '/v1/chat/completions'; then
+  body='{"id":"chatcmpl-123","choices":[{"message":{"content":"ok"}}]}'
+fi
+printf '%s' "$body" > "$outfile"
+printf '%s' "$status"
+`,
+      { mode: 0o755 },
+    );
+
+    // vLLM is option 7 (build, openai, custom, anthropic, anthropicCompatible, gemini, vllm)
+    const script = String.raw`
+const credentials = require(${credentialsPath});
+const runner = require(${runnerPath});
+
+const answers = ["7"];
+const messages = [];
+
+credentials.prompt = async (message) => {
+  messages.push(message);
+  return answers.shift() || "";
+};
+credentials.ensureApiKey = async () => {};
+runner.runCapture = (command) => {
+  if (command.includes("command -v ollama")) return "";
+  if (command.includes("localhost:11434")) return "";
+  if (command.includes("localhost:8000/v1/models")) return JSON.stringify({ data: [{ id: "meta-llama/Llama-3.3-70B-Instruct" }] });
+  return "";
+};
+
+const { setupNim } = require(${onboardPath});
+
+(async () => {
+  const originalLog = console.log;
+  const lines = [];
+  console.log = (...args) => lines.push(args.join(" "));
+  try {
+    const result = await setupNim(null);
+    originalLog(JSON.stringify({ result, messages, lines }));
+  } finally {
+    console.log = originalLog;
+  }
+})().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
+`;
+    fs.writeFileSync(scriptPath, script);
+
+    const result = spawnSync(process.execPath, [scriptPath], {
+      cwd: repoRoot,
+      encoding: "utf-8",
+      env: {
+        ...process.env,
+        HOME: tmpDir,
+        PATH: `${fakeBin}:${process.env.PATH || ""}`,
+        NEMOCLAW_EXPERIMENTAL: "1",
+      },
+    });
+
+    assert.equal(result.status, 0, result.stderr);
+    const payload = JSON.parse(result.stdout.trim());
+    assert.equal(payload.result.provider, "vllm-local");
+    assert.equal(payload.result.model, "meta-llama/Llama-3.3-70B-Instruct");
+    // Key assertion: even though probe detected openai-responses, the override
+    // forces openai-completions so tool-call-parser works correctly.
+    assert.equal(payload.result.preferredInferenceApi, "openai-completions");
+    assert.ok(payload.lines.some((line) => line.includes("Using existing vLLM")));
+    assert.ok(payload.lines.some((line) => line.includes("tool-call-parser requires")));
+  });
+
+  it("forces openai-completions for NIM-local even when probe detects openai-responses", () => {
+    const repoRoot = path.join(import.meta.dirname, "..");
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-onboard-nim-override-"));
+    const fakeBin = path.join(tmpDir, "bin");
+    const scriptPath = path.join(tmpDir, "nim-override-check.js");
+    const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js"));
+    const credentialsPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "credentials.js"));
+    const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js"));
+    const nimPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "nim.js"));
+
+    fs.mkdirSync(fakeBin, { recursive: true });
+    // Fake curl: /v1/responses returns 200 (probe detects openai-responses)
+    fs.writeFileSync(
+      path.join(fakeBin, "curl"),
+      `#!/usr/bin/env bash
+body=''
+status="200"
+outfile=""
+url=""
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    -o) outfile="$2"; shift 2 ;;
+    *) url="$1"; shift ;;
+  esac
+done
+if echo "$url" | grep -q '/v1/models'; then
+  body='{"data":[{"id":"nvidia/nemotron-3-nano"}]}'
+elif echo "$url" | grep -q '/v1/responses'; then
+  body='{"id":"resp_123","output":[{"type":"message","content":[{"type":"output_text","text":"ok"}]}]}'
+elif echo "$url" | grep -q '/v1/chat/completions'; then
+  body='{"id":"chatcmpl-123","choices":[{"message":{"content":"ok"}}]}'
+fi
+printf '%s' "$body" > "$outfile"
+printf '%s' "$status"
+`,
+      { mode: 0o755 },
+    );
+
+    // NIM-local is option 7 (build, openai, custom, anthropic, anthropicCompatible, gemini, nim-local)
+    // No ollama, no vLLM — only NIM-local shows up as experimental option
+    const script = String.raw`
+const credentials = require(${credentialsPath});
+const runner = require(${runnerPath});
+
+// Mock nim module before onboard.js requires it
+const nimMod = require(${nimPath});
+nimMod.listModels = () => [{ name: "nvidia/nemotron-3-nano", image: "fake", minGpuMemoryMB: 8000 }];
+nimMod.pullNimImage = () => {};
+nimMod.containerName = () => "nemoclaw-nim-test";
+nimMod.startNimContainerByName = () => "container-123";
+nimMod.waitForNimHealth = () => true;
+
+// Select option 7 (nim-local), then model 1
+const answers = ["7", "1"];
+const messages = [];
+
+credentials.prompt = async (message) => {
+  messages.push(message);
+  return answers.shift() || "";
+};
+credentials.ensureApiKey = async () => {};
+runner.runCapture = (command) => {
+  if (command.includes("command -v ollama")) return "";
+  if (command.includes("localhost:11434")) return "";
+  if (command.includes("localhost:8000/v1/models")) return "";
+  return "";
+};
+
+const { setupNim } = require(${onboardPath});
+
+(async () => {
+  const originalLog = console.log;
+  const lines = [];
+  console.log = (...args) => lines.push(args.join(" "));
+  try {
+    // Pass a GPU object with nimCapable: true
+    const result = await setupNim({ type: "nvidia", totalMemoryMB: 16000, nimCapable: true });
+    originalLog(JSON.stringify({ result, messages, lines }));
+  } finally {
+    console.log = originalLog;
+  }
+})().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
+`;
+    fs.writeFileSync(scriptPath, script);
+
+    const result = spawnSync(process.execPath, [scriptPath], {
+      cwd: repoRoot,
+      encoding: "utf-8",
+      env: {
+        ...process.env,
+        HOME: tmpDir,
+        PATH: `${fakeBin}:${process.env.PATH || ""}`,
+        NEMOCLAW_EXPERIMENTAL: "1",
+      },
+    });
+
+    assert.equal(result.status, 0, result.stderr);
+    const payload = JSON.parse(result.stdout.trim());
+    assert.equal(payload.result.provider, "vllm-local");
+    assert.equal(payload.result.model, "nvidia/nemotron-3-nano");
+    // Key assertion: NIM uses vLLM internally — same override must apply.
+    assert.equal(payload.result.preferredInferenceApi, "openai-completions");
+    assert.ok(payload.lines.some((line) => line.includes("tool-call-parser requires")));
+  });
 });

Original file line number	Diff line number	Diff line change
`@@ -2180,6 +2180,12 @@ async function setupNim(gpu) {`
`2180`	`2180`	`if (!preferredInferenceApi) {`
`2181`	`2181`	`continue selectionLoop;`
`2182`	`2182`	`}`
	`2183`	`+ // NIM uses vLLM internally — same tool-call-parser limitation`
	`2184`	`+ // applies to /v1/responses. Force chat completions.`
	`2185`	`+ if (preferredInferenceApi !== "openai-completions") {`
	`2186`	`+ console.log("  ℹ Using chat completions API (tool-call-parser requires /v1/chat/completions)");`
	`2187`	`+ }`
	`2188`	`+ preferredInferenceApi = "openai-completions";`
`2183`	`2189`	`}`
`2184`	`2190`	`}`
`2185`	`2191`	`break;`
`@@ -2296,6 +2302,13 @@ async function setupNim(gpu) {`
`2296`	`2302`	`if (!preferredInferenceApi) {`
`2297`	`2303`	`continue selectionLoop;`
`2298`	`2304`	`}`
	`2305`	`+ // Force chat completions — vLLM's /v1/responses endpoint does not`
	`2306`	`+ // run the --tool-call-parser, so tool calls arrive as raw text.`
	`2307`	`+ // See: https://github.com/NVIDIA/NemoClaw/issues/976`
	`2308`	`+ if (preferredInferenceApi !== "openai-completions") {`
	`2309`	`+ console.log("  ℹ Using chat completions API (tool-call-parser requires /v1/chat/completions)");`
	`2310`	`+ }`
	`2311`	`+ preferredInferenceApi = "openai-completions";`
`2299`	`2312`	`break;`
`2300`	`2313`	`}`
`2301`	`2314`	`}`