Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 184 additions & 0 deletions bin/nemoclaw.js
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,167 @@ function stripAnsi(value = "") {
return String(value).replace(/\x1b\[[0-9;]*m/g, "");
}

// ── Sandbox process health (OpenClaw gateway inside the sandbox) ─────────

/**
* Run a command inside the sandbox via SSH and return { status, stdout, stderr }.
* Returns null if SSH config cannot be obtained.
*/
function executeSandboxCommand(sandboxName, command) {
const sshConfigResult = captureOpenshell(["sandbox", "ssh-config", sandboxName], {
ignoreError: true,
});
if (sshConfigResult.status !== 0) return null;

const tmpFile = path.join(os.tmpdir(), `nemoclaw-ssh-${process.pid}-${Date.now()}.conf`);
fs.writeFileSync(tmpFile, sshConfigResult.output, { mode: 0o600 });
try {
const result = spawnSync(
"ssh",
[
"-F",
tmpFile,
"-o",
"StrictHostKeyChecking=no",
"-o",
"UserKnownHostsFile=/dev/null",
"-o",
"ConnectTimeout=5",
"-o",
"LogLevel=ERROR",
`openshell-${sandboxName}`,
command,
],
{ encoding: "utf-8", stdio: ["ignore", "pipe", "pipe"], timeout: 15000 },
);
return {
status: result.status ?? 1,
stdout: (result.stdout || "").trim(),
stderr: (result.stderr || "").trim(),
};
} catch {
return null;
} finally {
try {
fs.unlinkSync(tmpFile);
} catch {
/* ignore */
}
}
}

/**
* Check whether the OpenClaw gateway process is running inside the sandbox.
* Uses the gateway's HTTP endpoint (port 18789) as the source of truth,
* since the gateway runs as a separate user and pgrep may not see it.
* Returns true (running), false (stopped), or null (cannot determine).
*/
function isSandboxGatewayRunning(sandboxName) {
const result = executeSandboxCommand(
sandboxName,
"curl -sf --max-time 3 http://127.0.0.1:18789/ > /dev/null 2>&1 && echo RUNNING || echo STOPPED",
);
if (!result) return null;
if (result.stdout === "RUNNING") return true;
if (result.stdout === "STOPPED") return false;
return null;
}

/**
* Restart the OpenClaw gateway process inside the sandbox after a pod restart.
* Cleans stale lock/temp files, sources proxy config, and launches the gateway
* in the background. Returns true on success.
*/
function recoverSandboxProcesses(sandboxName) {
// The recovery script runs as the sandbox user (non-root). This matches
// the non-root fallback path in nemoclaw-start.sh — no privilege
// separation, but the gateway runs and inference works.
const script = [
// Source proxy config (written to .bashrc by nemoclaw-start on first boot)
"[ -f ~/.bashrc ] && . ~/.bashrc 2>/dev/null;",
// Re-check liveness before touching anything — another caller may have
// already recovered the gateway between our initial check and now (TOCTOU).
"if curl -sf --max-time 3 http://127.0.0.1:18789/ > /dev/null 2>&1; then echo ALREADY_RUNNING; exit 0; fi;",
// Clean stale lock files from the previous run (gateway checks these)
"rm -rf /tmp/openclaw-*/gateway.*.lock 2>/dev/null;",
// Clean stale temp files from the previous run
"rm -f /tmp/gateway.log /tmp/auto-pair.log;",
"touch /tmp/gateway.log; chmod 600 /tmp/gateway.log;",
"touch /tmp/auto-pair.log; chmod 600 /tmp/auto-pair.log;",
// Resolve and start gateway
'OPENCLAW="$(command -v openclaw)";',
'if [ -z "$OPENCLAW" ]; then echo OPENCLAW_MISSING; exit 1; fi;',
'nohup "$OPENCLAW" gateway run > /tmp/gateway.log 2>&1 &',
"GPID=$!; sleep 2;",
// Verify the gateway actually started (didn't crash immediately)
'if kill -0 "$GPID" 2>/dev/null; then echo "GATEWAY_PID=$GPID"; else echo GATEWAY_FAILED; cat /tmp/gateway.log 2>/dev/null | tail -5; fi',
].join(" ");

const result = executeSandboxCommand(sandboxName, script);
if (!result) return false;
return (
result.status === 0 &&
(result.stdout.includes("GATEWAY_PID=") || result.stdout.includes("ALREADY_RUNNING"))
);
}

/**
* Re-establish the dashboard port forward (18789) to the sandbox.
*/
function ensureSandboxPortForward(sandboxName) {
runOpenshell(["forward", "stop", DASHBOARD_FORWARD_PORT], { ignoreError: true });
runOpenshell(["forward", "start", "--background", DASHBOARD_FORWARD_PORT, sandboxName], {
ignoreError: true,
});
}

/**
* Detect and recover from a sandbox that survived a gateway restart but
* whose OpenClaw processes are not running. Returns an object describing
* the outcome: { checked, wasRunning, recovered }.
*/
function checkAndRecoverSandboxProcesses(sandboxName, { quiet = false } = {}) {
const running = isSandboxGatewayRunning(sandboxName);
if (running === null) {
return { checked: false, wasRunning: null, recovered: false };
}
if (running) {
return { checked: true, wasRunning: true, recovered: false };
}

// Gateway not running — attempt recovery
if (!quiet) {
console.log("");
console.log(" OpenClaw gateway is not running inside the sandbox (sandbox likely restarted).");
console.log(" Recovering...");
}

const recovered = recoverSandboxProcesses(sandboxName);
if (recovered) {
// Wait for gateway to bind its HTTP port before declaring success
spawnSync("sleep", ["3"]);
if (isSandboxGatewayRunning(sandboxName) !== true) {
// Gateway process started but HTTP endpoint never came up
if (!quiet) {
console.error(" Gateway process started but is not responding.");
console.error(" Check /tmp/gateway.log inside the sandbox for details.");
}
return { checked: true, wasRunning: false, recovered: false };
}
ensureSandboxPortForward(sandboxName);
if (!quiet) {
console.log(` ${G}✓${R} OpenClaw gateway restarted inside sandbox.`);
console.log(` ${G}✓${R} Dashboard port forward re-established.`);
}
} else if (!quiet) {
console.error(" Could not restart OpenClaw gateway automatically.");
console.error(" Connect to the sandbox and run manually:");
console.error(" nohup openclaw gateway run > /tmp/gateway.log 2>&1 &");
}

return { checked: true, wasRunning: false, recovered };
}

function buildRecoveredSandboxEntry(name, metadata = {}) {
return {
name,
Expand Down Expand Up @@ -961,6 +1122,7 @@ async function listSandboxes() {

async function sandboxConnect(sandboxName) {
await ensureLiveSandboxOrExit(sandboxName);
checkAndRecoverSandboxProcesses(sandboxName);
const result = spawnSync(getOpenshellBinary(), ["sandbox", "connect", sandboxName], {
stdio: "inherit",
cwd: ROOT,
Expand Down Expand Up @@ -1050,6 +1212,28 @@ async function sandboxStatus(sandboxName) {
printGatewayLifecycleHint(lookup.output, sandboxName, console.log);
}

// OpenClaw process health inside the sandbox
if (lookup.state === "present") {
const processCheck = checkAndRecoverSandboxProcesses(sandboxName, { quiet: true });
if (processCheck.checked) {
if (processCheck.wasRunning) {
console.log(` OpenClaw: ${G}running${R}`);
} else if (processCheck.recovered) {
console.log(` OpenClaw: ${G}recovered${R} (gateway restarted after sandbox restart)`);
} else {
console.log(` OpenClaw: ${_RD}not running${R}`);
console.log("");
console.log(" The sandbox is alive but the OpenClaw gateway process is not running.");
console.log(" This typically happens after a gateway restart (e.g., laptop close/open).");
console.log("");
console.log(" To recover, run:");
console.log(` ${D}nemoclaw ${sandboxName} connect${R} (auto-recovers on connect)`);
console.log(" Or manually inside the sandbox:");
console.log(` ${D}nohup openclaw gateway run > /tmp/gateway.log 2>&1 &${R}`);
}
}
}

// NIM health
const nimStat =
sb && sb.nimContainer ? nim.nimStatusByName(sb.nimContainer) : nim.nimStatus(sandboxName);
Expand Down
5 changes: 3 additions & 2 deletions scripts/install-openshell.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ esac

info "Detected $OS_LABEL ($ARCH_LABEL)"

# Minimum version required for cgroup v2 fix (NVIDIA/OpenShell#329)
MIN_VERSION="0.0.7"
# Minimum version required for sandbox persistence across gateway restarts
# (deterministic k3s node name + workspace PVC: NVIDIA/OpenShell#739, #488)
MIN_VERSION="0.0.22"

version_gte() {
# Returns 0 (true) if $1 >= $2 — portable, no sort -V (BSD compat)
Expand Down
Loading
Loading