Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
258 changes: 3 additions & 255 deletions bin/lib/nim.js
Original file line number Diff line number Diff line change
@@ -1,259 +1,7 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// NIM container management — pull, start, stop, health-check NIM images.
// Thin re-export shim — the implementation lives in src/lib/nim.ts,
// compiled to dist/lib/nim.js.

const { run, runCapture, shellQuote } = require("./runner");
const nimImages = require("./nim-images.json");
// GPU name fragments that detectGpu's fallback path matches (case-insensitively)
// against nvidia-smi device names to recognise unified-memory NVIDIA devices.
const UNIFIED_MEMORY_GPU_TAGS = ["GB10", "Thor", "Orin", "Xavier"];

/**
 * Derive the container name used for a sandbox's NIM instance.
 *
 * @param {string} sandboxName - Sandbox identifier to embed in the name.
 * @returns {string} The sandbox name prefixed with "nemoclaw-nim-".
 */
function containerName(sandboxName) {
  const prefix = "nemoclaw-nim-";
  return `${prefix}${sandboxName}`;
}

/**
 * Look up the container image registered for a model in nim-images.json.
 *
 * @param {string} modelName - Exact model name to search for.
 * @returns {*} The `image` field of the first matching entry, or null when no
 *   entry has that name.
 */
function getImageForModel(modelName) {
  for (const candidate of nimImages.models) {
    if (candidate.name === modelName) {
      return candidate.image;
    }
  }
  return null;
}

/**
 * List every model known to nim-images.json in a trimmed-down form.
 *
 * @returns {Array<{name: *, image: *, minGpuMemoryMB: *}>} One fresh object per
 *   catalogue entry carrying only the name, image and minGpuMemoryMB fields.
 */
function listModels() {
  const summaries = [];
  for (const { name, image, minGpuMemoryMB } of nimImages.models) {
    summaries.push({ name, image, minGpuMemoryMB });
  }
  return summaries;
}

/**
 * Check whether at least one catalogued model fits in the given memory budget.
 *
 * @param {number} totalMemoryMB - Available memory in megabytes.
 * @returns {boolean} True when some model's minGpuMemoryMB is <= totalMemoryMB.
 */
function canRunNimWithMemory(totalMemoryMB) {
  for (const { minGpuMemoryMB } of nimImages.models) {
    if (minGpuMemoryMB <= totalMemoryMB) {
      return true;
    }
  }
  return false;
}

/**
 * Probe for NVIDIA GPUs whose memory size nvidia-smi reports numerically.
 *
 * @returns {object|null} GPU summary, or null when nvidia-smi produced no
 *   output or no line parsed as an integer.
 */
function probeDiscreteNvidiaGpu() {
  const output = runCapture("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits", {
    ignoreError: true,
  });
  if (!output) return null;

  // One line per GPU; non-numeric lines (blank, "[N/A]", ...) are discarded.
  const perGpuMB = output
    .split("\n")
    .map((line) => Number.parseInt(line.trim(), 10))
    .filter((mb) => !Number.isNaN(mb));
  if (perGpuMB.length === 0) return null;

  const totalMemoryMB = perGpuMB.reduce((sum, mb) => sum + mb, 0);
  return {
    type: "nvidia",
    count: perGpuMB.length,
    totalMemoryMB,
    // Reports the first GPU's size as the per-GPU figure.
    perGpuMB: perGpuMB[0],
    nimCapable: canRunNimWithMemory(totalMemoryMB),
  };
}

/**
 * Probe for unified-memory NVIDIA devices (names matching
 * UNIFIED_MEMORY_GPU_TAGS), using total system RAM from `free -m` as the
 * memory figure.
 *
 * @returns {object|null} GPU summary, or null when no device name matches.
 */
function probeUnifiedMemoryNvidiaGpu() {
  const nameOutput = runCapture("nvidia-smi --query-gpu=name --format=csv,noheader,nounits", {
    ignoreError: true,
  });
  if (!nameOutput) return null;

  // Build the case-insensitive matchers once instead of once per GPU name.
  const tagPatterns = UNIFIED_MEMORY_GPU_TAGS.map((tag) => new RegExp(tag, "i"));
  const unifiedGpuNames = nameOutput
    .split("\n")
    .map((line) => line.trim())
    .filter(Boolean)
    .filter((name) => tagPatterns.some((pattern) => pattern.test(name)));
  if (unifiedGpuNames.length === 0) return null;

  let totalMemoryMB = 0;
  try {
    const memLine = runCapture("free -m | awk '/Mem:/ {print $2}'", { ignoreError: true });
    if (memLine) totalMemoryMB = Number.parseInt(memLine.trim(), 10) || 0;
  } catch {
    /* best-effort: leave totalMemoryMB at 0 when RAM cannot be read */
  }

  const count = unifiedGpuNames.length;
  const perGpuMB = Math.floor(totalMemoryMB / count);
  return {
    type: "nvidia",
    name: unifiedGpuNames[0],
    count,
    totalMemoryMB,
    // A zero share (total smaller than the device count) falls back to the total.
    perGpuMB: perGpuMB || totalMemoryMB,
    nimCapable: canRunNimWithMemory(totalMemoryMB),
    unifiedMemory: true,
    spark: unifiedGpuNames.some((name) => /GB10/i.test(name)),
  };
}

/**
 * Probe for a GPU on macOS via `system_profiler SPDisplaysDataType`.
 *
 * @returns {object|null} GPU summary (never NIM-capable), or null when no
 *   "Chipset Model" line is present in the profiler output.
 */
function probeAppleGpu() {
  const spOutput = runCapture("system_profiler SPDisplaysDataType 2>/dev/null", {
    ignoreError: true,
  });
  if (!spOutput) return null;

  const chipMatch = spOutput.match(/Chipset Model:\s*(.+)/);
  if (!chipMatch) return null;
  const vramMatch = spOutput.match(/VRAM.*?:\s*(\d+)\s*(MB|GB)/i);
  const coresMatch = spOutput.match(/Total Number of Cores:\s*(\d+)/);

  let memoryMB = 0;
  if (vramMatch) {
    memoryMB = Number.parseInt(vramMatch[1], 10);
    if (vramMatch[2].toUpperCase() === "GB") memoryMB *= 1024;
  } else {
    // No VRAM line (Apple Silicon shares system RAM): read total memory instead.
    try {
      const memBytes = Number.parseInt(runCapture("sysctl -n hw.memsize", { ignoreError: true }), 10);
      // Guard against non-numeric output so NaN never leaks into the result.
      if (Number.isFinite(memBytes)) memoryMB = Math.floor(memBytes / 1024 / 1024);
    } catch {
      /* best-effort: leave memoryMB at 0 when sysctl is unavailable */
    }
  }

  return {
    type: "apple",
    name: chipMatch[1].trim(),
    count: 1,
    cores: coresMatch ? Number.parseInt(coresMatch[1], 10) : null,
    totalMemoryMB: memoryMB,
    perGpuMB: memoryMB,
    nimCapable: false,
  };
}

/**
 * Detect the host GPU by running a sequence of best-effort probes and
 * returning the first one that yields a result.
 *
 * Probe order: NVIDIA with queryable VRAM, unified-memory NVIDIA devices, and
 * (only when process.platform is "darwin") the macOS system profiler.
 *
 * @returns {object|null} An object with at least `type`, `count`,
 *   `totalMemoryMB`, `perGpuMB` and `nimCapable`; null when every probe fails
 *   or finds nothing. Never throws because of a failing probe.
 */
function detectGpu() {
  const probes = [probeDiscreteNvidiaGpu, probeUnifiedMemoryNvidiaGpu];
  if (process.platform === "darwin") probes.push(probeAppleGpu);

  for (const probe of probes) {
    try {
      const gpu = probe();
      if (gpu) return gpu;
    } catch {
      /* detection is deliberately best-effort: a failing probe falls through to the next */
    }
  }
  return null;
}

/**
 * Pull the NIM container image registered for a model.
 *
 * Logs an error and calls process.exit(1) when the model has no registered
 * image.
 *
 * @param {string} model - Model name to resolve through getImageForModel.
 * @returns {*} The image reference that was passed to `docker pull`.
 */
function pullNimImage(model) {
  const resolvedImage = getImageForModel(model);
  if (!resolvedImage) {
    console.error(` Unknown model: ${model}`);
    process.exit(1);
  }

  console.log(` Pulling NIM image: ${resolvedImage}`);
  const pullCommand = `docker pull ${shellQuote(resolvedImage)}`;
  run(pullCommand);
  return resolvedImage;
}

/**
 * Start the NIM container belonging to a sandbox.
 *
 * @param {string} sandboxName - Sandbox whose container name is derived via containerName.
 * @param {string} model - Model name forwarded to startNimContainerByName.
 * @param {number} [port=8000] - Host port forwarded to startNimContainerByName.
 * @returns {*} Whatever startNimContainerByName returns.
 */
function startNimContainer(sandboxName, model, port = 8000) {
  return startNimContainerByName(containerName(sandboxName), model, port);
}

/**
 * Start a detached NIM container under an explicit container name.
 *
 * Any container already using the name is force-removed first. Logs an error
 * and calls process.exit(1) when the model has no registered image.
 *
 * @param {string} name - Container name to (re)create.
 * @param {string} model - Model name resolved through getImageForModel.
 * @param {number} [port=8000] - Host port mapped to container port 8000.
 * @returns {string} The container name that was passed in.
 */
function startNimContainerByName(name, model, port = 8000) {
  const resolvedImage = getImageForModel(model);
  if (!resolvedImage) {
    console.error(` Unknown model: ${model}`);
    process.exit(1);
  }

  const quotedName = shellQuote(name);

  // Clear out a previous container with the same name; failures are ignored.
  const removeCommand = `docker rm -f ${quotedName} 2>/dev/null || true`;
  run(removeCommand, { ignoreError: true });

  console.log(` Starting NIM container: ${name}`);
  const portMapping = `${Number(port)}:8000`;
  const quotedImage = shellQuote(resolvedImage);
  const startCommand = `docker run -d --gpus all -p ${portMapping} --name ${quotedName} --shm-size 16g ${quotedImage}`;
  run(startCommand);
  return name;
}

/**
 * Block the current thread for roughly `ms` milliseconds.
 *
 * Uses Atomics.wait on a private buffer that nothing ever notifies, so the
 * call simply times out. This avoids spawning an external `sleep` process.
 *
 * @param {number} ms - Positive number of milliseconds to pause.
 */
function sleepSyncMs(ms) {
  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
}

/**
 * Poll the NIM `/v1/models` endpoint on localhost until it answers or the
 * timeout elapses.
 *
 * The endpoint is probed with `curl -sf` through runCapture; any non-empty
 * output counts as healthy and a throwing probe counts as "not yet healthy".
 * Between probes the function sleeps synchronously for up to 5 seconds, never
 * longer than the time remaining, so the call returns close to `timeout`
 * rather than up to one polling interval late.
 *
 * @param {number} [port=8000] - Host port to probe (coerced with Number()).
 * @param {number} [timeout=300] - Maximum time to wait, in seconds.
 * @returns {boolean} True as soon as a probe succeeds, false on timeout.
 */
function waitForNimHealth(port = 8000, timeout = 300) {
  const startedAt = Date.now();
  const intervalMs = 5000;
  const timeoutMs = timeout * 1000;
  const hostPort = Number(port);
  console.log(` Waiting for NIM health on port ${hostPort} (timeout: ${timeout}s)...`);

  while (Date.now() - startedAt < timeoutMs) {
    try {
      const result = runCapture(`curl -sf http://localhost:${hostPort}/v1/models`, {
        ignoreError: true,
      });
      if (result) {
        console.log(" NIM is healthy.");
        return true;
      }
    } catch {
      /* a failed probe just means the service is not up yet; keep polling */
    }

    // Sleep one interval, but never past the deadline.
    const remainingMs = timeoutMs - (Date.now() - startedAt);
    if (remainingMs <= 0) break;
    sleepSyncMs(Math.min(intervalMs, remainingMs));
  }
  console.error(` NIM did not become healthy within ${timeout}s.`);
  return false;
}

/**
 * Stop and remove the NIM container belonging to a sandbox.
 *
 * @param {string} sandboxName - Sandbox whose container name is derived via containerName.
 * @returns {void}
 */
function stopNimContainer(sandboxName) {
  stopNimContainerByName(containerName(sandboxName));
}

/**
 * Stop and then remove a container by its explicit name.
 *
 * Both docker commands suppress stderr and are run with ignoreError, so a
 * missing container is not treated as a failure.
 *
 * @param {string} name - Container name to stop and remove.
 * @returns {void}
 */
function stopNimContainerByName(name) {
  const quotedName = shellQuote(name);
  console.log(` Stopping NIM container: ${name}`);
  for (const action of ["stop", "rm"]) {
    run(`docker ${action} ${quotedName} 2>/dev/null || true`, { ignoreError: true });
  }
}

/**
 * Report the status of the NIM container belonging to a sandbox.
 *
 * @param {string} sandboxName - Sandbox whose container name is derived via containerName.
 * @param {number} [port] - Optional host port forwarded to nimStatusByName.
 * @returns {*} Whatever nimStatusByName returns.
 */
function nimStatus(sandboxName, port) {
  return nimStatusByName(containerName(sandboxName), port);
}

/**
 * Inspect a container by name and, when it is running, probe NIM's
 * `/v1/models` endpoint on localhost.
 *
 * The host port is taken from `port` when it is a non-zero number; otherwise it
 * is read from `docker port <name> 8000`, defaulting to 8000 when no mapping
 * can be parsed.
 *
 * @param {string} name - Container name to inspect.
 * @param {number} [port] - Optional host port to probe.
 * @returns {{running: boolean, container: string, healthy?: boolean, state?: string}}
 *   `{ running: false, container }` when the container state is empty or any
 *   step throws; otherwise running, healthy, container and state.
 */
function nimStatusByName(name, port) {
  const unavailable = () => ({ running: false, container: name });

  try {
    const quotedName = shellQuote(name);
    const state = runCapture(`docker inspect --format '{{.State.Status}}' ${quotedName} 2>/dev/null`, {
      ignoreError: true,
    });
    if (!state) return unavailable();

    // Only a running container is worth probing for health.
    if (state !== "running") {
      return { running: false, healthy: false, container: name, state };
    }

    let hostPort = port == null ? 0 : Number(port);
    if (!hostPort) {
      const mapping = runCapture(`docker port ${quotedName} 8000 2>/dev/null`, {
        ignoreError: true,
      });
      const portMatch = mapping && mapping.match(/:(\d+)\s*$/);
      hostPort = portMatch ? Number(portMatch[1]) : 8000;
    }

    const probeOutput = runCapture(
      `curl -sf http://localhost:${hostPort}/v1/models 2>/dev/null`,
      { ignoreError: true },
    );
    return { running: true, healthy: Boolean(probeOutput), container: name, state };
  } catch {
    return unavailable();
  }
}

// Export object listing each helper function of this module by name.
module.exports = {
containerName,
getImageForModel,
listModels,
canRunNimWithMemory,
detectGpu,
pullNimImage,
startNimContainer,
startNimContainerByName,
waitForNimHealth,
stopNimContainer,
stopNimContainerByName,
nimStatus,
nimStatusByName,
};
// NOTE(review): as shown here, this second assignment replaces the export object
// above with whatever ../../dist/lib/nim exports, and require() throws if that
// path cannot be resolved (presumably before a build has run) — confirm intended.
module.exports = require("../../dist/lib/nim");
Comment on lines +4 to +7
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

This shim now breaks bin/ execution before a build.

bin/lib/onboard.js and bin/nemoclaw.js require this module at runtime, so redirecting it to ../../dist/lib/nim turns a clean source checkout into MODULE_NOT_FOUND unless the build has already run. That regresses the CLI entry path this shim is supposed to keep stable. As per coding guidelines, "The bin/ directory uses CommonJS intentionally — it's the CLI entry point that must work without a build step."

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@bin/lib/nim.js` around lines 4-7, the shim currently hard-codes
module.exports = require("../../dist/lib/nim"), which breaks runtime usage before
a build. Update the shim to try requiring the compiled dist module first and
fall back to the source implementation when dist is missing (i.e., wrap the
require in a try/catch and require the source module on failure), so that
module.exports still resolves for the CLI entry points (referenced by
bin/lib/onboard.js and bin/nemoclaw.js) without forcing a prior build.

Loading
Loading