PrimeIntellect-ai · samsja · May 19, 2026 · May 19, 2026
diff --git a/docs/kubernetes.md b/docs/kubernetes.md
@@ -258,9 +258,19 @@ torchrun \
   --node-rank=$RANK \
   --nproc-per-node=8 \
   --rdzv-endpoint=my-exp-trainer-0.$HEADLESS_SERVICE:29501 \
+  --log-dir=/data/outputs/logs/trainer/torchrun \
+  --local-ranks-filter=0 \
+  --redirect=3 \
+  --tee=3 \
   src/prime_rl/trainer/sft/train.py @ configs/train.toml
 ```
 
+`--local-ranks-filter=0 --tee=3` keeps only local rank 0's stdout/stderr on each
+pod's console (so Loki/the dashboard see each log line once instead of N times for
+an N-GPU pod), while `--redirect=3 --log-dir=...` still writes every rank's
+stdout/stderr to per-rank files under the mounted PVC for debugging. This
+matches what the launcher does on single-node / SLURM deployments.
+
 ## Troubleshooting
 
 ### Can't access shared storage

diff --git a/k8s/prime-rl/values.yaml b/k8s/prime-rl/values.yaml
@@ -99,7 +99,11 @@ trainer:
 
   # Auto-start configuration (set to false to use sleep infinity for debugging)
   autoStart: false
-  command: ""  # e.g., "uv run trainer @ /app/examples/reverse_text/rl/train.toml --output-dir /data/outputs"
+  # Single GPU: "uv run trainer @ /app/examples/reverse_text/rl/train.toml --output-dir /data/outputs"
+  # Multi-GPU: use torchrun and pass --local-ranks-filter=0 --tee=3 --redirect=3 --log-dir=... so
+  # only local rank 0's stdout/stderr reaches the pod console (Loki/dashboard see each line once)
+  # while every rank's stdout/stderr is still written to per-rank files. See docs/kubernetes.md.
+  command: ""
 
   # Helps reduce CUDA memory fragmentation with PyTorch allocator
   pytorchCudaAllocConf: "expandable_segments:True"