From ccfde8f0affd206ce07b137bc52d5b5dd1d3e9a3 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 18 May 2026 18:05:48 -0700 Subject: [PATCH 1/2] docs(k8s): filter trainer torchrun logs to rank 0 instead of silencing The trainer torchrun snippet in docs/kubernetes.md previously omitted --local-ranks-filter / --tee / --redirect / --log-dir, so every rank's stdout reached the pod's console and Loki ingested N copies of each line (visible as the duplicated lines in the dashboard's Trainer log tab on an N-GPU trainer pod). Add the same flags the local launcher and SLURM templates use: --local-ranks-filter=0 + --tee=3 keep only rank 0 on the pod console, while --redirect=3 + --log-dir=/data/outputs/logs/trainer/torchrun still writes every rank's stdout/stderr to per-rank files under the mounted PVC for debugging. This is an alternative to #2550 (which fixed the same dashboard duplication by silencing loguru on non-zero ranks in setup_logger). Doing it at torchrun keeps per-rank logs available on disk and avoids the in-process rank_zero_only flag. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/kubernetes.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/kubernetes.md b/docs/kubernetes.md index f718f1df01..70dc203ac9 100644 --- a/docs/kubernetes.md +++ b/docs/kubernetes.md @@ -258,9 +258,19 @@ torchrun \ --node-rank=$RANK \ --nproc-per-node=8 \ --rdzv-endpoint=my-exp-trainer-0.$HEADLESS_SERVICE:29501 \ + --log-dir=/data/outputs/logs/trainer/torchrun \ + --local-ranks-filter=0 \ + --redirect=3 \ + --tee=3 \ src/prime_rl/trainer/sft/train.py @ configs/train.toml ``` +`--local-ranks-filter=0 --tee=3` keeps only rank 0's stdout/stderr on the pod's +console (so Loki/the dashboard see each log line once instead of N times for an +N-GPU pod), while `--redirect=3 --log-dir=...` still writes every rank's +stdout/stderr to per-rank files under the mounted PVC for debugging. This +matches what the launcher does on single-node / SLURM deployments. 
+ ## Troubleshooting ### Can't access shared storage From 52f9a20ac2b0959f18567acbacd4186e01d32146 Mon Sep 17 00:00:00 2001 From: samsja Date: Mon, 18 May 2026 18:09:49 -0700 Subject: [PATCH 2/2] docs(k8s): point helm values.yaml comment at the multi-GPU torchrun pattern Mention the --local-ranks-filter=0 --tee=3 --redirect=3 --log-dir=... torchrun flags directly in the values.yaml trainer.command comment so users authoring multi-GPU helm values see them without having to read docs/kubernetes.md. Co-Authored-By: Claude Opus 4.7 (1M context) --- k8s/prime-rl/values.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/k8s/prime-rl/values.yaml b/k8s/prime-rl/values.yaml index 7fc3b48989..fc5a4b88ec 100644 --- a/k8s/prime-rl/values.yaml +++ b/k8s/prime-rl/values.yaml @@ -99,7 +99,11 @@ trainer: # Auto-start configuration (set to false to use sleep infinity for debugging) autoStart: false - command: "" # e.g., "uv run trainer @ /app/examples/reverse_text/rl/train.toml --output-dir /data/outputs" + # Single GPU: "uv run trainer @ /app/examples/reverse_text/rl/train.toml --output-dir /data/outputs" + # Multi-GPU: use torchrun and pass --local-ranks-filter=0 --tee=3 --redirect=3 --log-dir=... so + # only rank 0's stdout/stderr reach the pod console (Loki/dashboard see each line once) while every + # rank's stdout/stderr is still written to per-rank files. See docs/kubernetes.md. + command: "" # Helps reduce CUDA memory fragmentation with PyTorch allocator pytorchCudaAllocConf: "expandable_segments:True"