From ccfde8f0affd206ce07b137bc52d5b5dd1d3e9a3 Mon Sep 17 00:00:00 2001
From: samsja <sami@primeintellect.ai>
Date: Mon, 18 May 2026 18:05:48 -0700
Subject: [PATCH 1/2] docs(k8s): filter trainer torchrun logs to rank 0 instead
 of silencing

The trainer torchrun snippet in docs/kubernetes.md previously omitted
--local-ranks-filter / --tee / --redirect / --log-dir, so every rank's
stdout reached the pod's console and Loki ingested N copies of each line
(visible as the duplicated lines in the dashboard's Trainer log tab on
an N-GPU trainer pod).

Add the same flags the local launcher and SLURM templates use:
--local-ranks-filter=0 + --tee=3 keep only rank 0 on the pod console,
while --redirect=3 + --log-dir=/data/outputs/logs/trainer/torchrun
still writes every rank's stdout/stderr to per-rank files under the
mounted PVC for debugging.

This is an alternative to #2550 (which fixed the same dashboard
duplication by silencing loguru on non-zero ranks in setup_logger).
Doing it at torchrun keeps per-rank logs available on disk and avoids
the in-process rank_zero_only flag.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/kubernetes.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/kubernetes.md b/docs/kubernetes.md
index f718f1df01..70dc203ac9 100644
--- a/docs/kubernetes.md
+++ b/docs/kubernetes.md
@@ -258,9 +258,19 @@ torchrun \
   --node-rank=$RANK \
   --nproc-per-node=8 \
   --rdzv-endpoint=my-exp-trainer-0.$HEADLESS_SERVICE:29501 \
+  --log-dir=/data/outputs/logs/trainer/torchrun \
+  --local-ranks-filter=0 \
+  --redirect=3 \
+  --tee=3 \
   src/prime_rl/trainer/sft/train.py @ configs/train.toml
 ```
 
+`--local-ranks-filter=0 --tee=3` keep only local rank 0's stdout/stderr on the
+pod's console (so Loki/the dashboard see each log line once instead of N times
+for an N-GPU pod), while `--redirect=3 --log-dir=...` still write every rank's
+stdout/stderr to per-rank files under the mounted PVC for debugging. This
+matches what the launcher does on single-node / SLURM deployments.
+
 ## Troubleshooting
 
 ### Can't access shared storage

From 52f9a20ac2b0959f18567acbacd4186e01d32146 Mon Sep 17 00:00:00 2001
From: samsja <sami@primeintellect.ai>
Date: Mon, 18 May 2026 18:09:49 -0700
Subject: [PATCH 2/2] docs(k8s): point helm values.yaml comment at the
 multi-GPU torchrun pattern

Mention the --local-ranks-filter=0 --tee=3 --redirect=3 --log-dir=...
torchrun flags directly in the values.yaml trainer.command comment so
users authoring multi-GPU helm values see them without having to read
docs/kubernetes.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 k8s/prime-rl/values.yaml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/k8s/prime-rl/values.yaml b/k8s/prime-rl/values.yaml
index 7fc3b48989..fc5a4b88ec 100644
--- a/k8s/prime-rl/values.yaml
+++ b/k8s/prime-rl/values.yaml
@@ -99,7 +99,11 @@ trainer:
 
   # Auto-start configuration (set to false to use sleep infinity for debugging)
   autoStart: false
-  command: ""  # e.g., "uv run trainer @ /app/examples/reverse_text/rl/train.toml --output-dir /data/outputs"
+  # Single GPU: "uv run trainer @ /app/examples/reverse_text/rl/train.toml --output-dir /data/outputs"
+  # Multi-GPU: use torchrun and pass --local-ranks-filter=0 --tee=3 --redirect=3 --log-dir=... so
+  # only local rank 0's stdout/stderr reaches the pod console (Loki/dashboard see each line once)
+  # while every rank's stdout/stderr is still written to per-rank files. See docs/kubernetes.md.
+  command: ""
 
   # Helps reduce CUDA memory fragmentation with PyTorch allocator
   pytorchCudaAllocConf: "expandable_segments:True"