Emerge-Lab · eugenevinitsky · Apr 11, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 11, 2026
diff --git a/scripts/gpu_heartbeat.py b/scripts/gpu_heartbeat.py
@@ -0,0 +1,48 @@
+"""GPU heartbeat: keeps utilization above threshold to prevent job reclamation.
+
+Monitors GPU utilization via nvidia-smi and performs matrix multiplications
+when utilization drops below THRESHOLD. Steps aside when real training is active.
+"""
+
+import subprocess
+import time
+import torch
+
+THRESHOLD = 65  # percent GPU utilization to maintain
+CHECK_INTERVAL = 0.05  # seconds between checks
+N = 6144  # matrix size for dummy work
+BURST_ITERATIONS = 60  # number of matmuls per burst
+
+
+def get_gpu_utilization():
+    try:
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=utilization.gpu", "--format=csv,noheader,nounits"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        return int(result.stdout.strip().split("\n")[0])
+    except Exception:
+        return 100  # assume busy if query fails
+
+
+def main():
-def main():
+def main():
+    if not torch.cuda.is_available():
+        print("GPU heartbeat skipped: CUDA is not available.")
+        return
-def main():
+def main():
+    if not torch.cuda.is_available():
+        print("GPU heartbeat skipped: CUDA is not available.")
+        return
+    device = torch.device("cuda")
+    x = torch.randn(N, N, device=device)
+    y = torch.randn(N, N, device=device)
+
+    print(f"GPU heartbeat started (threshold={THRESHOLD}%, matrix={N}x{N})")
+
+    while True:
+        util = get_gpu_utilization()
+        if util < THRESHOLD:
+            for _ in range(BURST_ITERATIONS):
+                torch.mm(x, y)
+            torch.cuda.synchronize()
+        else:
+            time.sleep(CHECK_INTERVAL)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/rebuild_on_cluster.py b/scripts/rebuild_on_cluster.py
@@ -0,0 +1,155 @@
+"""Submit a SLURM job to rebuild the PufferDrive C extension inside the Singularity container.
+
+Avoids the nested quoting hell of sbatch --wrap by writing a standalone bash script
+to a temp location and sbatch-ing that file. The script runs `setup.py build_ext`
+inside the container overlay where torch is installed.
+
+Example:
+    python scripts/rebuild_on_cluster.py
+    python scripts/rebuild_on_cluster.py --account torch_pr_924_general
+    python scripts/rebuild_on_cluster.py --project-root /scratch/$USER/code/PufferDrive --wait
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+
+
+DEFAULT_IMAGE = "/share/apps/images/cuda12.8.1-cudnn9.8.0-ubuntu24.04.2.sif"
+
+
+def parse_args():
+    user = os.environ.get("USER", "")
+    parser = argparse.ArgumentParser(description="Rebuild PufferDrive C extension on SLURM cluster")
+    parser.add_argument("--account", default="torch_pr_924_general", help="SLURM account")
+    parser.add_argument("--user", default=user, help="Cluster username (default: $USER)")
+    parser.add_argument(
+        "--project-root",
+        default=None,
+        help="Path to PufferDrive on the cluster (default: /scratch/<user>/code/PufferDrive)",
+    )
+    parser.add_argument(
+        "--overlay",
+        default=None,
+        help="Singularity overlay path (default: /scratch/<user>/images/PufferDrive/overlay-15GB-500K.ext3)",
+    )
+    parser.add_argument("--image", default=DEFAULT_IMAGE, help="Singularity image path")
+    parser.add_argument("--time", default="15", help="SLURM time limit in minutes")
+    parser.add_argument("--mem", default="16gb", help="SLURM memory")
+    parser.add_argument("--cpus", default="8", help="SLURM cpus-per-task")
+    parser.add_argument("--wait", action="store_true", help="Poll until the job finishes and print its log")
+    parser.add_argument("--dry", action="store_true", help="Print the script and sbatch command without submitting")
-    parser.add_argument("--dry", action="store_true", help="Print the script and sbatch command without submitting")
+    parser.add_argument("--dry", action="store_true", help="Print the script, destination, and log paths without submitting")
-    parser.add_argument("--dry", action="store_true", help="Print the script and sbatch command without submitting")
+    parser.add_argument("--dry", action="store_true", help="Print the script, destination, and log paths without submitting")
+    return parser.parse_args()
+
+
+def build_rebuild_script(project_root: str, overlay: str, image: str) -> str:
+    """Return a bash script that runs the rebuild inside the container.
+
+    Matches the training launcher's invocation exactly: read-only overlay mount,
+    no fakeroot, sources /ext3/env.sh which activates the venv/conda env with torch
+    and other deps installed.
+    """
+    inner = (
+        "source /ext3/env.sh && "
+        f"cd {project_root} && "
+        "which python3 && "
+        'python3 -c "import torch; print(\\"torch:\\", torch.__version__)" && '
+        "python3 setup.py build_ext --inplace --force && "
+        'python3 -c "from pufferlib.ocean.drive import binding; print(\\"C binding loaded OK\\")"'
+    )
+    return (
+        "#!/bin/bash\n"
+        "set -e\n"
+        f"cd {project_root}\n"
+        f"singularity exec --nv \\\n"
+        f"    --overlay {overlay}:ro \\\n"
+        f"    {image} \\\n"
+        f"    bash -c '{inner}'\n"
+    )
+
+
+def run_ssh(cmd: str, check: bool = True) -> str:
+    """Run a command on the cluster via ssh and return stdout."""
+    result = subprocess.run(["ssh", "torch", cmd], capture_output=True, text=True)
+    if check and result.returncode != 0:
+        print(result.stdout)
+        print(result.stderr, file=sys.stderr)
+        raise SystemExit(f"ssh command failed: {cmd}")
+    return result.stdout
+
+
+def main():
+    args = parse_args()
+    project_root = args.project_root or f"/scratch/{args.user}/code/PufferDrive"
+    overlay = args.overlay or f"/scratch/{args.user}/images/PufferDrive/overlay-15GB-500K.ext3"
+
+    script = build_rebuild_script(project_root, overlay, args.image)
+    # Use a scratch location for script and log so they survive the compute node.
+    log_dir = f"/scratch/{args.user}/rebuild_logs"
+    script_path = f"{log_dir}/rebuild_pufferdrive.sh"
+    log_path = f"{log_dir}/rebuild_pufferdrive_%j.log"
+    run_ssh(f"mkdir -p {log_dir}")
+
+    if args.dry:
+        print("=== rebuild script ===")
+        print(script)
+        print(f"=== sbatch destination: {script_path} ===")
+        print(f"=== log path: {log_path} ===")
+        return 0
+
+    # Write script to cluster via ssh
+    subprocess.run(
+        ["ssh", "torch", f"cat > {script_path} && chmod +x {script_path}"],
+        input=script,
+        text=True,
+        check=True,
+    )
+
+    # Submit
+    sbatch_cmd = (
+        f"sbatch --account={args.account} --gres=gpu:1 "
+        f"--cpus-per-task={args.cpus} --mem={args.mem} --time={args.time} "
+        f"-o {log_path} {script_path}"
+    )
+    stdout = run_ssh(sbatch_cmd)
+    print(stdout.strip())
+
+    # Parse job id from "Submitted batch job 12345"
+    parts = stdout.strip().split()
+    if len(parts) < 4 or not parts[-1].isdigit():
+        print("Could not parse job id from sbatch output", file=sys.stderr)
+        return 1
+    job_id = parts[-1]
+    resolved_log = log_path.replace("%j", job_id)
+    print(f"Job ID: {job_id}")
+    print(f"Log: {resolved_log}")
+
+    if not args.wait:
+        return 0
+
+    # Poll for completion
+    print("Waiting for job to finish...")
+    while True:
+        time.sleep(20)
+        state = run_ssh(
+            f"sacct -j {job_id} --format=State -n -P 2>/dev/null | head -1",
+            check=False,
+        ).strip()
+        if not state:
+            print("  (job not yet registered in sacct)")
+            continue
-            continue
+            continue
+        state = state.split("|", 1)[0]
-            continue
+            continue
+        state = state.split("|", 1)[0]
+        print(f"  state: {state}")
+        if state in ("COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL"):
+            break
+
+    print()
+    print("=== log ===")
+    log_content = run_ssh(f"cat {resolved_log} 2>/dev/null || echo '(no log)'", check=False)
+    print(log_content)
+    return 0 if state == "COMPLETED" else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/submit_cluster.py b/scripts/submit_cluster.py
@@ -83,6 +83,13 @@ def parse_args():
         "--args", type=str, nargs="+", default=None, help="Args to override/sweep (e.g., learning_rate=1e-4:3e-4)"
     )
 
+    # GPU heartbeat: keeps utilization above threshold to prevent job reclamation on NYU cluster
+    parser.add_argument(
+        "--heartbeat",
+        action="store_true",
+        help="Run scripts/gpu_heartbeat.py in background alongside training",
+    )
+
     # Container settings
     parser.add_argument("--container", action="store_true", help="Run inside Singularity container")
     parser.add_argument(
@@ -94,7 +101,7 @@ def parse_args():
     parser.add_argument(
         "--container_overlay",
         type=str,
-        default="/scratch/ev2237/containers/pufferdrive/overlay.ext3",
+        default=f"/scratch/{os.environ.get('USER', '')}/images/PufferDrive/overlay-15GB-500K.ext3",
         help="Singularity overlay path",
     )
 
@@ -355,6 +362,17 @@ def launch_training(args, from_config, cmd, save_dir, project_root, container_co
         # Add save_dir to command
         full_cmd = base_cmd + cmd + ["--train.data-dir", save_dir]
 
+        # If heartbeat is enabled, wrap the training command in a brace group that:
+        #   1. backgrounds python scripts/gpu_heartbeat.py
+        #   2. runs training in the foreground
+        #   3. kills the heartbeat on training exit, preserving training's exit code
+        # Brace groups `{ ... ; }` run in the current shell (unlike parens) so the
+        # preceding `cd` and env exports still apply to the training command. The `&`
+        # backgrounds only the python call, not the whole compound statement.
+        def wrap_with_heartbeat(train_cmd_str):
+            hb = "python scripts/gpu_heartbeat.py > /tmp/gpu_heartbeat.log 2>&1 & HEARTBEAT_PID=$!"
-            hb = "python scripts/gpu_heartbeat.py > /tmp/gpu_heartbeat.log 2>&1 & HEARTBEAT_PID=$!"
+            hb = (
+                'HEARTBEAT_LOG="/tmp/gpu_heartbeat.${SLURM_JOB_ID:-$$}.log"; '
+                'python scripts/gpu_heartbeat.py > "$HEARTBEAT_LOG" 2>&1 & HEARTBEAT_PID=$!'
+            )
-            hb = "python scripts/gpu_heartbeat.py > /tmp/gpu_heartbeat.log 2>&1 & HEARTBEAT_PID=$!"
+            hb = (
+                'HEARTBEAT_LOG="/tmp/gpu_heartbeat.${SLURM_JOB_ID:-$$}.log"; '
+                'python scripts/gpu_heartbeat.py > "$HEARTBEAT_LOG" 2>&1 & HEARTBEAT_PID=$!'
+            )
+            return f"{{ {hb}; {train_cmd_str}; TRAIN_EXIT=$?; kill $HEARTBEAT_PID 2>/dev/null; exit $TRAIN_EXIT; }}"
+
         # Wrap with singularity if container mode is enabled
         if container_config is not None:
             env_setup = "source /ext3/env.sh"
@@ -368,7 +386,10 @@ def launch_training(args, from_config, cmd, save_dir, project_root, container_co
                 f"export WANDB_DIR={scratch_dir}/wandb_data && "
                 f"mkdir -p {scratch_dir}/cache"
             )
-            inner_cmd = f"{env_setup} && {cache_exports} && cd {project_root} && " + " ".join(full_cmd)
+            train_str = " ".join(full_cmd)
+            if args.heartbeat:
+                train_str = wrap_with_heartbeat(train_str)
+            inner_cmd = f"{env_setup} && {cache_exports} && cd {project_root} && {train_str}"
             full_cmd = [
                 "singularity",
                 "exec",
@@ -388,6 +409,10 @@ def launch_training(args, from_config, cmd, save_dir, project_root, container_co
                     inner_cmd,
                 ]
             )
+        elif args.heartbeat:
+            # No container: still need to wrap in bash -c so the brace group parses.
+            train_str = " ".join(full_cmd)
+            full_cmd = ["bash", "-c", wrap_with_heartbeat(train_str)]
 
         print(f">>> Job: {job_name}")
         print(f">>> Working directory: {project_root}")