From 0f9b246d7fcfe9f1a85444e5be51d73ea765e2c6 Mon Sep 17 00:00:00 2001
From: Ogulcan Aydogan <ogulcanaydogan@hotmail.com>
Date: Fri, 6 Mar 2026 09:08:24 +0000
Subject: [PATCH 1/3] a100-runtime-reconcile: add managed v8 stable-reset flow

---
 .../turkcell_7b_a100_v5_recovery_low_lr.yaml  |  25 ++
 ...urkcell_7b_a100_v6_recovery_reset_opt.yaml |  14 +
 .../turkcell_7b_a100_v7_balanced_stable.yaml  |  12 +
 .../turkcell_7b_a100_v8_stable_reset.yaml     |  17 +
 ...ell_7b_a100_v8b_ultra_stable_fallback.yaml |  10 +
 configs/serving/vllm_a100_v6_merged.yaml      |  13 +
 configs/serving/vllm_a100_v8_merged.yaml      |  13 +
 deploy/systemd/forge-posttrain.path           |  11 +
 deploy/systemd/forge-posttrain.service        |  16 +
 deploy/systemd/forge-training-monitor.service |  19 ++
 .../systemd/forge-training-watchdog.service   |  18 ++
 deploy/systemd/forge-training.service         |  18 ++
 scripts/generate_training_manifest.py         | 122 ++++++++
 scripts/install_training_services.sh          |  81 +++++
 scripts/monitor_a100_training.sh              | 230 ++++++++++++++
 scripts/post_training_pipeline.sh             | 174 ++++++++++
 scripts/run_posttrain_if_complete.sh          | 176 +++++++++++
 scripts/start_a100_training.sh                | 102 ++++++
 scripts/training_watchdog.py                  | 296 ++++++++++++++++++
 src/forge/training/callbacks.py               |  96 +++++-
 src/forge/training/trainer.py                 |  67 +++-
 src/forge/utils/config.py                     |   2 +
 22 files changed, 1514 insertions(+), 18 deletions(-)
 create mode 100644 configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml
 create mode 100644 configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml
 create mode 100644 configs/models/turkcell_7b_a100_v7_balanced_stable.yaml
 create mode 100644 configs/models/turkcell_7b_a100_v8_stable_reset.yaml
 create mode 100644 configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml
 create mode 100644 configs/serving/vllm_a100_v6_merged.yaml
 create mode 100644 configs/serving/vllm_a100_v8_merged.yaml
 create mode 100644 deploy/systemd/forge-posttrain.path
 create mode 100644 deploy/systemd/forge-posttrain.service
 create mode 100644 deploy/systemd/forge-training-monitor.service
 create mode 100644 deploy/systemd/forge-training-watchdog.service
 create mode 100644 deploy/systemd/forge-training.service
 create mode 100755 scripts/generate_training_manifest.py
 create mode 100755 scripts/install_training_services.sh
 create mode 100755 scripts/monitor_a100_training.sh
 create mode 100644 scripts/post_training_pipeline.sh
 create mode 100755 scripts/run_posttrain_if_complete.sh
 create mode 100755 scripts/start_a100_training.sh
 create mode 100755 scripts/training_watchdog.py

diff --git a/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml b/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml
new file mode 100644
index 0000000..f650172
--- /dev/null
+++ b/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml
@@ -0,0 +1,25 @@
+# Turkcell-7B A100 recovery profile after NaN stop.
+_base: "./turkcell_7b.yaml"
+
+model:
+  max_seq_length: 2048
+
+data:
+  train_path: "data/processed/turkish_sft_v3_clean.jsonl"
+  eval_path: "data/processed/turkish_eval.jsonl"
+
+training:
+  num_epochs: 1
+  learning_rate: 2.0e-5
+  lr_scheduler_type: "cosine"
+  warmup_ratio: 0.05
+  max_grad_norm: 1.0
+  per_device_train_batch_size: 8
+  gradient_accumulation_steps: 2
+  eval_steps: 500
+  save_steps: 500
+  fp16: false
+  bf16: true
+
+wandb:
+  run_name: "turkcell-7b-sft-v5-a100-bf16-recovery-low-lr"
diff --git a/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml b/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml
new file mode 100644
index 0000000..0b57077
--- /dev/null
+++ b/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml
@@ -0,0 +1,14 @@
+# Turkcell-7B A100 recovery profile with optimizer reset.
+# Use adapter warm-start from checkpoint-500 without resuming optimizer state.
+_base: "./turkcell_7b_a100_v5_recovery_low_lr.yaml"
+
+training:
+  learning_rate: 1.0e-5
+  warmup_ratio: 0.08
+  max_grad_norm: 0.3
+  eval_steps: 200
+  save_steps: 100
+  adapter_init_path: "artifacts/training/turkcell-7b-sft-v3-a100-bf16-stable/checkpoint-500"
+
+wandb:
+  run_name: "turkcell-7b-sft-v6-a100-bf16-recovery-reset-opt"
diff --git a/configs/models/turkcell_7b_a100_v7_balanced_stable.yaml b/configs/models/turkcell_7b_a100_v7_balanced_stable.yaml
new file mode 100644
index 0000000..16d2d40
--- /dev/null
+++ b/configs/models/turkcell_7b_a100_v7_balanced_stable.yaml
@@ -0,0 +1,12 @@
+# Turkcell-7B A100 balanced-stable profile.
+# Goal: reduce NaN risk without excessive eval/checkpoint overhead.
+_base: "./turkcell_7b_a100_v5_recovery_low_lr.yaml"
+
+training:
+  learning_rate: 1.0e-5
+  max_grad_norm: 0.3
+  eval_steps: 500
+  save_steps: 250
+
+wandb:
+  run_name: "turkcell-7b-sft-v7-a100-bf16-balanced-stable"
diff --git a/configs/models/turkcell_7b_a100_v8_stable_reset.yaml b/configs/models/turkcell_7b_a100_v8_stable_reset.yaml
new file mode 100644
index 0000000..a8c2623
--- /dev/null
+++ b/configs/models/turkcell_7b_a100_v8_stable_reset.yaml
@@ -0,0 +1,17 @@
+# Turkcell-7B A100 stable-reset profile.
+# Resume from v7 checkpoint weights via adapter_init_path only.
+# Do not resume optimizer/scheduler state.
+_base: "./turkcell_7b_a100_v7_balanced_stable.yaml"
+
+training:
+  learning_rate: 5.0e-6
+  max_grad_norm: 0.3
+  warmup_ratio: 0.10
+  eval_steps: 500
+  save_steps: 250
+  fp16: false
+  bf16: true
+  adapter_init_path: "artifacts/training/turkcell-7b-sft-v7-a100-bf16-balanced-stable/checkpoint-1000"
+
+wandb:
+  run_name: "turkcell-7b-sft-v8-a100-bf16-stable-reset"
diff --git a/configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml b/configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml
new file mode 100644
index 0000000..f7c0619
--- /dev/null
+++ b/configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml
@@ -0,0 +1,10 @@
+# Turkcell-7B A100 ultra-stable fallback profile.
+# Use only when v8 fails the first 300-step stability gate.
+_base: "./turkcell_7b_a100_v8_stable_reset.yaml"
+
+training:
+  learning_rate: 3.0e-6
+  max_grad_norm: 0.2
+
+wandb:
+  run_name: "turkcell-7b-sft-v8b-a100-bf16-ultra-stable-fallback"
diff --git a/configs/serving/vllm_a100_v6_merged.yaml b/configs/serving/vllm_a100_v6_merged.yaml
new file mode 100644
index 0000000..5a3b0bd
--- /dev/null
+++ b/configs/serving/vllm_a100_v6_merged.yaml
@@ -0,0 +1,13 @@
+# vLLM serving configuration for A100 merged v6 model
+
+model_path: "artifacts/merged/turkcell-7b-a100-v6-recovery-reset-opt"
+host: "0.0.0.0"
+port: 18020
+tensor_parallel_size: 1
+gpu_memory_utilization: 0.85
+max_model_len: 4096
+dtype: "float16"
+enable_prefix_caching: true
+trust_remote_code: false
+enforce_eager: false
+max_num_seqs: 64
diff --git a/configs/serving/vllm_a100_v8_merged.yaml b/configs/serving/vllm_a100_v8_merged.yaml
new file mode 100644
index 0000000..bcf4121
--- /dev/null
+++ b/configs/serving/vllm_a100_v8_merged.yaml
@@ -0,0 +1,13 @@
+# vLLM serving configuration for A100 merged v8 model
+
+model_path: "artifacts/merged/turkcell-7b-a100-v8-stable-reset"
+host: "0.0.0.0"
+port: 18030
+tensor_parallel_size: 1
+gpu_memory_utilization: 0.85
+max_model_len: 4096
+dtype: "float16"
+enable_prefix_caching: true
+trust_remote_code: false
+enforce_eager: false
+max_num_seqs: 64
diff --git a/deploy/systemd/forge-posttrain.path b/deploy/systemd/forge-posttrain.path
new file mode 100644
index 0000000..cfcd1be
--- /dev/null
+++ b/deploy/systemd/forge-posttrain.path
@@ -0,0 +1,11 @@
+[Unit]
+Description=Watch training status changes and trigger post-training pipeline
+After=forge-training-monitor.service
+Wants=forge-training-monitor.service
+
+[Path]
+PathModified=%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_status_a100.txt
+Unit=forge-posttrain.service
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-posttrain.service b/deploy/systemd/forge-posttrain.service
new file mode 100644
index 0000000..3bbcf0d
--- /dev/null
+++ b/deploy/systemd/forge-posttrain.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=LowResource-LLM-Forge Post-Training Pipeline Trigger
+After=forge-training-monitor.service forge-training.service
+Wants=forge-training-monitor.service
+
+[Service]
+Type=oneshot
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/run_posttrain_if_complete.sh
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/posttrain_v8.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/posttrain_v8.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-training-monitor.service b/deploy/systemd/forge-training-monitor.service
new file mode 100644
index 0000000..92d1a9b
--- /dev/null
+++ b/deploy/systemd/forge-training-monitor.service
@@ -0,0 +1,19 @@
+[Unit]
+Description=LowResource-LLM-Forge Training Progress Monitor
+After=forge-training.service
+Wants=forge-training.service
+PartOf=forge-training.service
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/monitor_a100_training.sh
+Restart=on-failure
+RestartSec=20
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-training-watchdog.service b/deploy/systemd/forge-training-watchdog.service
new file mode 100644
index 0000000..4032595
--- /dev/null
+++ b/deploy/systemd/forge-training-watchdog.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=LowResource-LLM-Forge Training Watchdog
+After=forge-training.service
+Wants=forge-training.service
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/training_watchdog.py --service forge-training.service
+Restart=always
+RestartSec=10
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-training.service b/deploy/systemd/forge-training.service
new file mode 100644
index 0000000..188040f
--- /dev/null
+++ b/deploy/systemd/forge-training.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=LowResource-LLM-Forge A100 Training
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/start_a100_training.sh
+Restart=on-failure
+RestartSec=20
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=default.target
diff --git a/scripts/generate_training_manifest.py b/scripts/generate_training_manifest.py
new file mode 100755
index 0000000..de8e620
--- /dev/null
+++ b/scripts/generate_training_manifest.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""Generate a deterministic training manifest for a completed run."""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import re
+import subprocess
+from datetime import UTC, datetime
+from pathlib import Path
+
+from forge.utils.config import load_training_config
+
+TIMESTAMP_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)")
+
+
+def _utc_now() -> str:
+    return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+def _sha256_file(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _line_count(path: Path) -> int:
+    if not path.exists():
+        return 0
+    with path.open("rb") as handle:
+        return sum(1 for _ in handle)
+
+
+def _git_commit() -> str:
+    """Return the HEAD commit hash, or "unknown" if git is missing or fails."""
+    try:
+        cmd = ["git", "rev-parse", "HEAD"]
+        proc = subprocess.run(cmd, check=False, capture_output=True, text=True)
+    except OSError:  # git not installed / not executable: degrade, do not crash
+        return "unknown"
+    return proc.stdout.strip() if proc.returncode == 0 else "unknown"
+
+
+def _extract_log_times(log_file: Path) -> tuple[str, str]:
+    if not log_file.exists():
+        return "unknown", "unknown"
+
+    start_ts = "unknown"
+    end_ts = "unknown"
+    with log_file.open(encoding="utf-8", errors="ignore") as handle:
+        for line in handle:
+            if "training_started" in line and start_ts == "unknown":
+                match = TIMESTAMP_RE.match(line.strip())
+                if match:
+                    start_ts = match.group(1)
+            if "training_complete" in line or "Training complete. Adapter saved to" in line:
+                match = TIMESTAMP_RE.match(line.strip())
+                if match:
+                    end_ts = match.group(1)
+    return start_ts, end_ts
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Generate training manifest JSON.")
+    parser.add_argument("--config", required=True, help="Training config path.")
+    parser.add_argument("--run-dir", required=True, help="Training run directory.")
+    parser.add_argument("--log-file", required=True, help="Training log file path.")
+    parser.add_argument(
+        "--output",
+        default=None,
+        help="Output manifest path (defaults to <run-dir>/manifest.json).",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+
+    config_path = Path(args.config).resolve()
+    run_dir = Path(args.run_dir).resolve()
+    log_file = Path(args.log_file).resolve()
+    output_path = Path(args.output).resolve() if args.output else run_dir / "manifest.json"
+
+    cfg = load_training_config(config_path)
+    train_path = Path(cfg.train_data_path).resolve()
+    eval_path = Path(cfg.eval_data_path).resolve()
+
+    final_dir = run_dir / "final"
+    checkpoints = sorted(p.name for p in run_dir.glob("checkpoint-*") if p.is_dir())
+    start_ts, end_ts = _extract_log_times(log_file)
+
+    manifest = {
+        "created_utc": _utc_now(),
+        "git_commit": _git_commit(),
+        "config_path": str(config_path),
+        "config_sha256": _sha256_file(config_path),
+        "run_dir": str(run_dir),
+        "log_file": str(log_file),
+        "run_start_utc": start_ts,
+        "run_end_utc": end_ts,
+        "model_name": cfg.model.name,
+        "run_name": cfg.wandb.run_name,
+        "train_data_path": str(train_path),
+        "eval_data_path": str(eval_path),
+        "train_records": _line_count(train_path),
+        "eval_records": _line_count(eval_path),
+        "final_dir_exists": final_dir.exists(),
+        "checkpoint_dirs": checkpoints,
+    }
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    payload = json.dumps(manifest, indent=2, ensure_ascii=False) + "\n"
+    output_path.write_text(payload, encoding="utf-8")
+    print(f"manifest_written={output_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/install_training_services.sh b/scripts/install_training_services.sh
new file mode 100755
index 0000000..a0d76d1
--- /dev/null
+++ b/scripts/install_training_services.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+SYSTEMD_USER_DIR="${SYSTEMD_USER_DIR:-$HOME/.config/systemd/user}"
+FORGE_ENV_DIR="${FORGE_ENV_DIR:-$HOME/.config/forge}"
+FORGE_ENV_FILE="${FORGE_ENV_FILE:-$FORGE_ENV_DIR/training.env}"
+
+mkdir -p "$SYSTEMD_USER_DIR" "$PROJECT_ROOT/artifacts/logs" "$FORGE_ENV_DIR"
+
+if [[ ! -f "$FORGE_ENV_FILE" ]]; then
+    cat >"$FORGE_ENV_FILE" <<'EOF'
+# Required for training with WandB.
+# Set your real key before starting forge-training.service.
+WANDB_API_KEY=
+
+# Optional overrides:
+# TRAIN_CONFIG=configs/models/turkcell_7b_a100_v8_stable_reset.yaml
+# TRAIN_RUN_DIR=artifacts/training/turkcell-7b-sft-v8-a100-bf16-stable-reset
+# TRAIN_LOG=artifacts/logs/training_turkcell_7b_a100_v8_stable_reset.log
+# TARGET_STEPS=8601
+# SAVE_STEPS=250
+# ENABLE_RESUME=0
+# REQUIRE_WANDB=0
+# BOOTSTRAP_CHECKPOINT=
+EOF
+    chmod 600 "$FORGE_ENV_FILE"
+fi
+
+install -m 0644 \
+    "$PROJECT_ROOT/deploy/systemd/forge-training.service" \
+    "$SYSTEMD_USER_DIR/forge-training.service"
+install -m 0644 \
+    "$PROJECT_ROOT/deploy/systemd/forge-training-watchdog.service" \
+    "$SYSTEMD_USER_DIR/forge-training-watchdog.service"
+install -m 0644 \
+    "$PROJECT_ROOT/deploy/systemd/forge-training-monitor.service" \
+    "$SYSTEMD_USER_DIR/forge-training-monitor.service"
+install -m 0644 \
+    "$PROJECT_ROOT/deploy/systemd/forge-posttrain.service" \
+    "$SYSTEMD_USER_DIR/forge-posttrain.service"
+install -m 0644 \
+    "$PROJECT_ROOT/deploy/systemd/forge-posttrain.path" \
+    "$SYSTEMD_USER_DIR/forge-posttrain.path"
+
+chmod +x \
+    "$PROJECT_ROOT/scripts/start_a100_training.sh" \
+    "$PROJECT_ROOT/scripts/monitor_a100_training.sh" \
+    "$PROJECT_ROOT/scripts/training_watchdog.py" \
+    "$PROJECT_ROOT/scripts/run_posttrain_if_complete.sh"
+
+systemctl --user daemon-reload
+systemctl --user enable forge-training.service
+systemctl --user enable forge-training-watchdog.service
+systemctl --user enable forge-training-monitor.service
+systemctl --user enable forge-posttrain.path
+
+require_wandb="$(grep -E '^REQUIRE_WANDB=' "$FORGE_ENV_FILE" | tail -n 1 | cut -d '=' -f2 | tr -d '[:space:]' || true)"
+require_wandb="${require_wandb:-1}"
+
+if [[ "$require_wandb" == "0" ]] || grep -qE '^WANDB_API_KEY=.+$' "$FORGE_ENV_FILE"; then
+    systemctl --user restart forge-training.service
+    systemctl --user restart forge-training-watchdog.service
+    systemctl --user restart forge-training-monitor.service
+    systemctl --user restart forge-posttrain.path
+else
+    systemctl --user stop forge-posttrain.path || true
+    systemctl --user stop forge-posttrain.service || true
+    systemctl --user stop forge-training-monitor.service || true
+    systemctl --user stop forge-training-watchdog.service || true
+    systemctl --user stop forge-training.service || true
+fi
+
+systemctl --user --no-pager --lines=20 status forge-training.service || true
+systemctl --user --no-pager --lines=20 status forge-training-watchdog.service || true
+systemctl --user --no-pager --lines=20 status forge-training-monitor.service || true
+systemctl --user --no-pager --lines=20 status forge-posttrain.path || true
+systemctl --user --no-pager --lines=20 status forge-posttrain.service || true
+echo
+echo "Edit $FORGE_ENV_FILE and set WANDB_API_KEY before starting training."
+echo "Or run: scripts/set_wandb_key.sh"
diff --git a/scripts/monitor_a100_training.sh b/scripts/monitor_a100_training.sh
new file mode 100755
index 0000000..f8b1e01
--- /dev/null
+++ b/scripts/monitor_a100_training.sh
@@ -0,0 +1,230 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cd "${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"  # honor PROJECT_ROOT like the sibling scripts; no hard-coded home directory
+
+TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_a100_v8_stable_reset.yaml}"
+CONFIG_BASENAME="$(basename "$TRAIN_CONFIG")"
+CONFIG_SLUG="${CONFIG_BASENAME%.*}"
+LOG_FILE="${LOG_FILE:-${TRAIN_LOG:-artifacts/logs/training_${CONFIG_SLUG}.log}}"
+STATUS_FILE="${STATUS_FILE:-artifacts/logs/training_monitor_status_a100.txt}"
+ETA_STATE_FILE="${ETA_STATE_FILE:-artifacts/logs/training_monitor_eta_state_${CONFIG_SLUG}.env}"
+TARGET_STEPS="${TARGET_STEPS:-8601}"
+PATTERN="${PATTERN:-run_training.py --config ${TRAIN_CONFIG}}"
+SLEEP_SECS="${SLEEP_SECS:-60}"
+SAVE_STEPS="${SAVE_STEPS:-250}"
+CHECKPOINT_EVENT_FILE="${CHECKPOINT_EVENT_FILE:-artifacts/logs/training_checkpoint_events_${CONFIG_SLUG}.log}"
+CHECKPOINT_STATE_FILE="${CHECKPOINT_STATE_FILE:-artifacts/logs/training_checkpoint_state_${CONFIG_SLUG}.env}"
+TRAIN_RUN_DIR="${TRAIN_RUN_DIR:-}"
+
+mkdir -p artifacts/logs
+
+prev_ts=0
+prev_step=0
+ema_sps=""
+speed_source="none"
+last_announced_checkpoint_step=0
+
+if [[ -z "$TRAIN_RUN_DIR" ]] && [[ -f "$LOG_FILE" ]]; then
+    run_dir_from_log="$(grep -a -oE "output_dir=artifacts/training/[^[:space:]]+" "$LOG_FILE" | tail -n 1 | cut -d '=' -f 2 || true)"
+    if [[ -n "$run_dir_from_log" ]]; then
+        TRAIN_RUN_DIR="$run_dir_from_log"
+    fi
+fi
+
+if [[ -z "$TRAIN_RUN_DIR" ]]; then
+    TRAIN_RUN_DIR="artifacts/training/${CONFIG_SLUG}"
+fi
+
+if [[ -f "$ETA_STATE_FILE" ]]; then
+    # shellcheck disable=SC1090
+    source "$ETA_STATE_FILE"
+fi
+
+if [[ -f "$CHECKPOINT_STATE_FILE" ]]; then
+    # shellcheck disable=SC1090
+    source "$CHECKPOINT_STATE_FILE"
+fi
+
+if [[ ! "$last_announced_checkpoint_step" =~ ^[0-9]+$ ]]; then
+    last_announced_checkpoint_step=0
+fi
+
+# Print the highest numeric checkpoint-N step directly under TRAIN_RUN_DIR that
+# is <= the step passed as $1 (no upper bound if $1 is empty or non-numeric).
+# Prints 0 when the run directory is missing or no checkpoint qualifies.
+find_latest_checkpoint_step() {
+    local current_step="$1"
+    # Keep the result function-local so it cannot clobber a global "latest".
+    local latest
+    if [[ ! -d "$TRAIN_RUN_DIR" ]]; then
+        echo "0"
+        return
+    fi
+
+    latest="$(find "$TRAIN_RUN_DIR" -maxdepth 1 -type d -name "checkpoint-*" 2>/dev/null \
+        | sed -E 's#.*/checkpoint-##' \
+        | grep -E '^[0-9]+$' \
+        | awk -v s="$current_step" 's == "" || s !~ /^[0-9]+$/ || $1 <= s' \
+        | sort -n \
+        | tail -n 1 || true)"
+    echo "${latest:-0}"
+}
+
+while true; do
+    ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    now_epoch="$(date -u +%s)"
+
+    running="no"
+    if pgrep -f "$PATTERN" >/dev/null 2>&1 || pgrep -f "scripts/run_training.py" >/dev/null 2>&1; then
+        running="yes"
+    fi
+
+    progress="none"
+    log_start_line=1
+    if [[ -f "$LOG_FILE" ]]; then
+        marker_line="$(grep -a -n "forge-training-start" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)"
+        if [[ "$marker_line" =~ ^[0-9]+$ ]] && [[ "$marker_line" -gt 0 ]]; then
+            log_start_line=$((marker_line + 1))
+        fi
+
+        progress="$(tail -n +"$log_start_line" "$LOG_FILE" | grep -a -oE "[0-9]+/${TARGET_STEPS}" | tail -n 1 || true)"
+        if [[ -z "$progress" ]]; then
+            progress="none"
+        fi
+    fi
+
+    step="0"
+    if [[ "$progress" != "none" ]]; then
+        step="${progress%%/*}"
+    fi
+
+    pct="0"
+    if [[ "$step" =~ ^[0-9]+$ ]] && [[ $TARGET_STEPS -gt 0 ]]; then
+        pct=$((step * 100 / TARGET_STEPS))
+    fi
+
+    nan_count="0"
+    if [[ -f "$LOG_FILE" ]]; then
+        # Count only real NaN/Inf metric values and explicit NaN guard events.
+        metric_nan_count="$(tail -n +"$log_start_line" "$LOG_FILE" 2>/dev/null | grep -a -E -i "'(loss|grad_norm|eval_loss)':[[:space:]]*'?(nan|inf)'?" | wc -l | tr -d '[:space:]' || true)"
+        guard_nan_count="$(tail -n +"$log_start_line" "$LOG_FILE" 2>/dev/null | grep -a -E -c "nan_guard_detected|nan_guard_stopping_training" || true)"
+        metric_nan_count="${metric_nan_count:-0}"
+        guard_nan_count="${guard_nan_count:-0}"
+        nan_count=$((metric_nan_count + guard_nan_count))
+    fi
+
+    gpu_line="$(nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total --format=csv,noheader 2>/dev/null | head -n1 || echo unknown)"  # silence nvidia-smi itself (missing/failed driver), not head
+
+    steps_per_hour="unknown"
+    eta_seconds="unknown"
+    eta_utc="unknown"
+    remaining_steps="unknown"
+    latest_checkpoint_step="$(find_latest_checkpoint_step "$step")"
+    next_checkpoint_step="unknown"
+    steps_to_next_checkpoint="unknown"
+    checkpoint_eta_utc="unknown"
+
+    if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$step" -lt "$TARGET_STEPS" ]]; then
+        remaining_steps=$((TARGET_STEPS - step))
+    fi
+
+    if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$SAVE_STEPS" =~ ^[0-9]+$ ]] && [[ "$SAVE_STEPS" -gt 0 ]]; then
+        next_checkpoint_step=$((((step / SAVE_STEPS) + 1) * SAVE_STEPS))
+        if [[ "$next_checkpoint_step" -le "$TARGET_STEPS" ]]; then
+            steps_to_next_checkpoint=$((next_checkpoint_step - step))
+        else
+            next_checkpoint_step="none"
+            steps_to_next_checkpoint="none"
+        fi
+    fi
+
+    if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$prev_ts" =~ ^[0-9]+$ ]] && [[ "$prev_step" =~ ^[0-9]+$ ]]; then
+        if [[ $prev_ts -gt 0 ]] && [[ $now_epoch -gt $prev_ts ]] && [[ $step -gt $prev_step ]]; then
+            delta_steps=$((step - prev_step))
+            delta_secs=$((now_epoch - prev_ts))
+            # Skip the first large jump after resume (e.g. 0 -> 750) to avoid bogus ETA.
+            if [[ -z "$ema_sps" ]] && [[ "$prev_step" -eq 0 ]] && [[ "$delta_steps" -gt 1 ]]; then
+                speed_source="bootstrap_skip"
+            elif [[ "$delta_secs" -gt 0 ]]; then
+                instant_sps="$(awk -v ds="$delta_steps" -v dt="$delta_secs" 'BEGIN { printf "%.8f", ds / dt }')"
+
+                if [[ -n "$ema_sps" ]]; then
+                    ema_sps="$(awk -v e="$ema_sps" -v i="$instant_sps" 'BEGIN { printf "%.8f", (0.7 * e) + (0.3 * i) }')"
+                    speed_source="ema"
+                else
+                    ema_sps="$instant_sps"
+                    speed_source="instant"
+                fi
+            fi
+        fi
+    fi
+
+    if [[ -n "$ema_sps" ]] && awk -v s="$ema_sps" 'BEGIN { exit !(s > 0) }'; then
+        steps_per_hour="$(awk -v s="$ema_sps" 'BEGIN { printf "%.1f", s * 3600 }')"
+        if [[ "$remaining_steps" =~ ^[0-9]+$ ]]; then
+            eta_seconds="$(awk -v rem="$remaining_steps" -v s="$ema_sps" 'BEGIN { printf "%.0f", rem / s }')"
+            if [[ "$eta_seconds" =~ ^[0-9]+$ ]]; then
+                eta_utc="$(date -u -d "@$((now_epoch + eta_seconds))" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo unknown)"
+            fi
+        fi
+        if [[ "$steps_to_next_checkpoint" =~ ^[0-9]+$ ]]; then
+            checkpoint_eta_seconds="$(awk -v rem="$steps_to_next_checkpoint" -v s="$ema_sps" 'BEGIN { printf "%.0f", rem / s }')"
+            if [[ "$checkpoint_eta_seconds" =~ ^[0-9]+$ ]]; then
+                checkpoint_eta_utc="$(date -u -d "@$((now_epoch + checkpoint_eta_seconds))" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo unknown)"
+            fi
+        fi
+    fi
+
+    if [[ "$latest_checkpoint_step" =~ ^[0-9]+$ ]] && [[ "$latest_checkpoint_step" -gt "$last_announced_checkpoint_step" ]]; then
+        echo "${ts} checkpoint_saved step=${latest_checkpoint_step} run_dir=${TRAIN_RUN_DIR}" >>"$CHECKPOINT_EVENT_FILE"
+        last_announced_checkpoint_step="$latest_checkpoint_step"
+    fi
+
+    {
+        echo "timestamp_utc=$ts"
+        echo "running=$running"
+        echo "step=$step"
+        echo "target_steps=$TARGET_STEPS"
+        echo "progress=$progress"
+        echo "percent=$pct"
+        echo "remaining_steps=$remaining_steps"
+        echo "steps_per_hour=$steps_per_hour"
+        echo "eta_seconds=$eta_seconds"
+        echo "eta_utc=$eta_utc"
+        echo "speed_source=$speed_source"
+        echo "nan_count=$nan_count"
+        echo "gpu=$gpu_line"
+        echo "save_steps=$SAVE_STEPS"
+        echo "latest_checkpoint_step=$latest_checkpoint_step"
+        echo "next_checkpoint_step=$next_checkpoint_step"
+        echo "steps_to_next_checkpoint=$steps_to_next_checkpoint"
+        echo "checkpoint_eta_utc=$checkpoint_eta_utc"
+    } >"$STATUS_FILE"
+
+    prev_ts="$now_epoch"
+    prev_step="$step"
+    {
+        echo "prev_ts=$prev_ts"
+        echo "prev_step=$prev_step"
+        echo "ema_sps=$ema_sps"
+        echo "speed_source=$speed_source"
+    } >"$ETA_STATE_FILE"
+    {
+        echo "last_announced_checkpoint_step=$last_announced_checkpoint_step"
+        echo "train_run_dir=$TRAIN_RUN_DIR"
+    } >"$CHECKPOINT_STATE_FILE"
+
+    if [[ "$step" =~ ^[0-9]+$ ]] && [[ $step -ge $TARGET_STEPS ]]; then
+        echo "state=completed" >>"$STATUS_FILE"
+        exit 0
+    fi
+    # Completion is tested first: a finished run has usually exited by the next poll.
+    if [[ "$running" == "no" ]]; then
+        echo "state=stopped" >>"$STATUS_FILE"
+        exit 0
+    fi
+
+    echo "state=running" >>"$STATUS_FILE"
+    sleep "$SLEEP_SECS"
+done
diff --git a/scripts/post_training_pipeline.sh b/scripts/post_training_pipeline.sh
new file mode 100644
index 0000000..1f93b34
--- /dev/null
+++ b/scripts/post_training_pipeline.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+cd "$PROJECT_ROOT"
+
+UV_BIN="${UV_BIN:-$HOME/.local/bin/uv}"
+TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_a100_v8_stable_reset.yaml}"
+RUN_DIR="${RUN_DIR:-artifacts/training/turkcell-7b-sft-v8-a100-bf16-stable-reset}"
+TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_turkcell_7b_a100_v8_stable_reset.log}"
+ADAPTER_DIR="${ADAPTER_DIR:-$RUN_DIR/final}"
+
+BASE_MODEL="${BASE_MODEL:-TURKCELL/Turkcell-LLM-7b-v1}"
+MERGED_OUTPUT="${MERGED_OUTPUT:-artifacts/merged/turkcell-7b-a100-v8-stable-reset}"
+EVAL_OUTPUT_ROOT="${EVAL_OUTPUT_ROOT:-artifacts/eval/turkcell-7b-a100-v8-stable-reset}"
+
+PUSH_TO_HUB="${PUSH_TO_HUB:-0}"
+HUB_REPO="${HUB_REPO:-}"
+
+SERVE_BASE_URL="${SERVE_BASE_URL:-}"
+SERVE_API_KEY="${SERVE_API_KEY:-}"
+SERVE_CONFIG="${SERVE_CONFIG:-configs/serving/vllm_a100_v8_merged.yaml}"
+SERVE_TIMEOUT="${SERVE_TIMEOUT:-240}"
+AUTO_START_SERVE="${AUTO_START_SERVE:-1}"
+BENCHMARK_NUM_REQUESTS="${BENCHMARK_NUM_REQUESTS:-50}"
+BENCHMARK_CONCURRENCY="${BENCHMARK_CONCURRENCY:-5}"
+BENCHMARK_OUTPUT_DIR="${BENCHMARK_OUTPUT_DIR:-artifacts/benchmarks/turkcell-7b-a100-v8}"
+BENCHMARK_OUTPUT="${BENCHMARK_OUTPUT:-$BENCHMARK_OUTPUT_DIR/benchmark_$(date -u +%Y%m%dT%H%M%SZ).json}"
+SERVE_LOG="${SERVE_LOG:-artifacts/logs/posttrain_serve_v8.log}"
+
+if [[ ! -x "$UV_BIN" ]]; then
+    echo "UV executable not found: $UV_BIN" >&2
+    exit 1
+fi
+
+if [[ ! -d "$ADAPTER_DIR" ]]; then
+    echo "Adapter directory not found: $ADAPTER_DIR" >&2
+    exit 1
+fi
+
+if [[ "$PUSH_TO_HUB" == "1" ]] && [[ -z "$HUB_REPO" ]]; then
+    echo "HUB_REPO is required when PUSH_TO_HUB=1." >&2
+    exit 1
+fi
+
+mkdir -p "$BENCHMARK_OUTPUT_DIR" artifacts/logs
+
+echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] post-training-pipeline-start"
+echo "train_config=$TRAIN_CONFIG"
+echo "run_dir=$RUN_DIR"
+echo "adapter_dir=$ADAPTER_DIR"
+
+echo
+echo "[1/4] Generate training manifest"
+"$UV_BIN" run python scripts/generate_training_manifest.py \
+    --config "$TRAIN_CONFIG" \
+    --run-dir "$RUN_DIR" \
+    --log-file "$TRAIN_LOG"
+
+echo
+echo "[2/4] Run offline evaluations (mmlu_tr, perplexity, generation)"
+for bench in mmlu_tr perplexity generation; do
+    out_dir="$EVAL_OUTPUT_ROOT/$bench"
+    mkdir -p "$out_dir"
+    echo "  - benchmark=$bench output=$out_dir"
+    "$UV_BIN" run python scripts/run_eval.py \
+        --model "$ADAPTER_DIR" \
+        --benchmark "$bench" \
+        --output-dir "$out_dir"
+done
+
+echo
+echo "[3/4] Merge adapters into base model"
+merge_cmd=(
+    "$UV_BIN" run python scripts/merge_and_push.py
+    --base-model "$BASE_MODEL"
+    --adapter "$ADAPTER_DIR"
+    --output "$MERGED_OUTPUT"
+)
+if [[ "$PUSH_TO_HUB" == "1" ]]; then
+    merge_cmd+=(--push --hub-repo "$HUB_REPO")
+fi
+"${merge_cmd[@]}"
+
+echo
+echo "[4/4] Optional serving smoke/benchmark"
+run_endpoint_checks() {
+    local base_url="$1"
+    smoke_cmd=("$UV_BIN" run python scripts/smoke_serve.py --base-url "$base_url")
+    bench_cmd=(
+        "$UV_BIN" run python scripts/benchmark_openai_endpoint.py
+        --base-url "$base_url"
+        --num-requests "$BENCHMARK_NUM_REQUESTS"
+        --concurrency "$BENCHMARK_CONCURRENCY"
+        --output "$BENCHMARK_OUTPUT"
+    )
+    if [[ -n "$SERVE_API_KEY" ]]; then
+        smoke_cmd+=(--api-key "$SERVE_API_KEY")
+        bench_cmd+=(--api-key "$SERVE_API_KEY")
+    fi
+    "${smoke_cmd[@]}"
+    "${bench_cmd[@]}"
+}
+
+if [[ -n "$SERVE_BASE_URL" ]]; then
+    echo "  - using external endpoint: $SERVE_BASE_URL"
+    run_endpoint_checks "$SERVE_BASE_URL"
+elif [[ "$AUTO_START_SERVE" == "1" ]]; then
+    if [[ ! -f "$SERVE_CONFIG" ]]; then
+        echo "Serving config not found: $SERVE_CONFIG" >&2
+        exit 1
+    fi
+
+    serve_host="$(grep -E '^host:' "$SERVE_CONFIG" | head -n 1 | cut -d ':' -f2- | tr -d ' "' || true)"
+    serve_port="$(grep -E '^port:' "$SERVE_CONFIG" | head -n 1 | cut -d ':' -f2- | tr -d ' ' || true)"
+    if [[ -z "$serve_host" ]]; then
+        serve_host="127.0.0.1"
+    fi
+    if [[ "$serve_host" == "0.0.0.0" ]] || [[ "$serve_host" == "::" ]]; then
+        serve_host="127.0.0.1"
+    fi
+    if [[ -z "$serve_port" ]]; then
+        serve_port="18020"
+    fi
+    serve_base_url="http://${serve_host}:${serve_port}"
+
+    serve_cmd=(
+        "$UV_BIN" run python scripts/run_serve.py
+        --config "$SERVE_CONFIG"
+        --no-wait
+        --timeout "$SERVE_TIMEOUT"
+    )
+    if [[ -n "$SERVE_API_KEY" ]]; then
+        serve_cmd+=(--api-key "$SERVE_API_KEY")
+    fi
+
+    "${serve_cmd[@]}" >"$SERVE_LOG" 2>&1 &
+    serve_pid="$!"
+    cleanup_serve() {
+        if [[ -n "${serve_pid:-}" ]] && kill -0 "$serve_pid" >/dev/null 2>&1; then
+            kill -INT "$serve_pid" >/dev/null 2>&1 || true
+            sleep 1
+            if kill -0 "$serve_pid" >/dev/null 2>&1; then
+                kill "$serve_pid" >/dev/null 2>&1 || true
+            fi
+            wait "$serve_pid" >/dev/null 2>&1 || true
+        fi
+    }
+    trap cleanup_serve EXIT
+
+    ready="0"
+    for _ in $(seq 1 "$SERVE_TIMEOUT"); do
+        if curl -fsS "${serve_base_url}/health" >/dev/null 2>&1; then
+            ready="1"
+            break
+        fi
+        sleep 1
+    done
+    if [[ "$ready" != "1" ]]; then
+        echo "vLLM did not become healthy in ${SERVE_TIMEOUT}s (${serve_base_url})" >&2
+        exit 1
+    fi
+
+    echo "  - started local vLLM endpoint: $serve_base_url"
+    run_endpoint_checks "$serve_base_url"
+    cleanup_serve
+    trap - EXIT
+else
+    echo "  - SERVE_BASE_URL not set and AUTO_START_SERVE=0; skipping serve smoke + benchmark"
+fi
+
+echo
+echo "benchmark_output=$BENCHMARK_OUTPUT"
+echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] post-training-pipeline-complete"
diff --git a/scripts/run_posttrain_if_complete.sh b/scripts/run_posttrain_if_complete.sh
new file mode 100755
index 0000000..ce54427
--- /dev/null
+++ b/scripts/run_posttrain_if_complete.sh
@@ -0,0 +1,176 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+cd "$PROJECT_ROOT"
+
+STATUS_FILE="${STATUS_FILE:-artifacts/logs/training_monitor_status_a100.txt}"
+LOCK_FILE="${LOCK_FILE:-artifacts/logs/posttrain_v8.lock}"
+DONE_FILE="${DONE_FILE:-artifacts/logs/posttrain_v8.done}"
+SUMMARY_FILE="${SUMMARY_FILE:-artifacts/logs/posttrain_v8_summary.md}"
+POSTTRAIN_LOG="${POSTTRAIN_LOG:-artifacts/logs/posttrain_v8.log}"
+
+TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_a100_v8_stable_reset.yaml}"
+RUN_DIR="${RUN_DIR:-${TRAIN_RUN_DIR:-artifacts/training/turkcell-7b-sft-v8-a100-bf16-stable-reset}}"
+TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_turkcell_7b_a100_v8_stable_reset.log}"
+ADAPTER_DIR="${ADAPTER_DIR:-$RUN_DIR/final}"
+MERGED_OUTPUT="${MERGED_OUTPUT:-artifacts/merged/turkcell-7b-a100-v8-stable-reset}"
+EVAL_OUTPUT_ROOT="${EVAL_OUTPUT_ROOT:-artifacts/eval/turkcell-7b-a100-v8-stable-reset}"
+SERVE_CONFIG="${SERVE_CONFIG:-configs/serving/vllm_a100_v8_merged.yaml}"
+BENCHMARK_OUTPUT_DIR="${BENCHMARK_OUTPUT_DIR:-artifacts/benchmarks/turkcell-7b-a100-v8}"
+BENCHMARK_OUTPUT="${BENCHMARK_OUTPUT:-$BENCHMARK_OUTPUT_DIR/benchmark_$(date -u +%Y%m%dT%H%M%SZ).json}"
+
+mkdir -p "$(dirname "$LOCK_FILE")" "$(dirname "$DONE_FILE")" "$(dirname "$SUMMARY_FILE")" "$BENCHMARK_OUTPUT_DIR"
+
+resolve_serve_endpoint() {  # Print "http://host:port" built from the first host:/port: lines of $SERVE_CONFIG.
+    local host
+    local port  # NOTE(review): the port parse below strips spaces only; unlike host, quotes are kept
+    host="$(grep -E '^host:' "$SERVE_CONFIG" | head -n 1 | cut -d ':' -f2- | tr -d ' "' || true)"
+    port="$(grep -E '^port:' "$SERVE_CONFIG" | head -n 1 | cut -d ':' -f2- | tr -d ' ' || true)"
+    if [[ -z "$host" ]]; then
+        host="127.0.0.1"  # no host line found (or config unreadable): fall back to loopback
+    fi
+    if [[ "$host" == "0.0.0.0" ]] || [[ "$host" == "::" ]]; then
+        host="127.0.0.1"  # wildcard bind address: report loopback as the client-facing host
+    fi
+    if [[ -z "$port" ]]; then
+        port="18020"  # fallback when the config has no port line
+    fi
+    echo "http://${host}:${port}"
+}
+
+collect_eval_status_lines() {  # Print one markdown bullet per benchmark: PASS/FAIL (passed/total) or MISSING.
+    local bench
+    local results_json
+    local pass_total  # built with %-formatting: \" inside an f-string replacement field is a Python SyntaxError
+    local status  # both lookups fall back to a placeholder so a bad results.json cannot abort the summary (set -e)
+    for bench in mmlu_tr perplexity generation; do
+        results_json="${EVAL_OUTPUT_ROOT}/${bench}/results.json"
+        if [[ -f "$results_json" ]]; then
+            pass_total="$(python3 -c 'import json,sys; s=json.load(open(sys.argv[1])).get("summary",{}); print("%d/%d" % (int(s.get("passed",0)), int(s.get("total",0))))' "$results_json" || echo "?/?")"
+            status="$(python3 -c 'import json,sys; b=(json.load(open(sys.argv[1])).get("benchmarks") or [{}])[0]; print("PASS" if b.get("passed") else "FAIL")' "$results_json" || echo "UNREADABLE")"
+            echo "- ${bench}: ${status} (${pass_total})"
+        else
+            echo "- ${bench}: MISSING (${results_json})"
+        fi
+    done
+}
+
+if [[ -f "$DONE_FILE" ]]; then
+    echo "posttrain_already_done file=$DONE_FILE"
+    exit 0
+fi
+
+if [[ ! -f "$STATUS_FILE" ]]; then
+    echo "posttrain_waiting status_file_missing=$STATUS_FILE"
+    exit 0
+fi
+
+state="$(awk -F '=' '$1=="state" {print $2}' "$STATUS_FILE" | tail -n 1 || true)"
+step="$(awk -F '=' '$1=="step" {print $2}' "$STATUS_FILE" | tail -n 1 || true)"
+target_steps="$(awk -F '=' '$1=="target_steps" {print $2}' "$STATUS_FILE" | tail -n 1 || true)"
+
+if [[ ! "$step" =~ ^[0-9]+$ ]]; then
+    step=0
+fi
+if [[ ! "$target_steps" =~ ^[0-9]+$ ]] || [[ "$target_steps" -le 0 ]]; then
+    target_steps=8601
+fi
+
+if [[ "$state" != "completed" ]] && [[ "$step" -lt "$target_steps" ]]; then
+    echo "posttrain_waiting state=${state:-unknown} step=$step target_steps=$target_steps"
+    exit 0
+fi
+
+if [[ -f "$LOCK_FILE" ]]; then  # an existing lock is honored only while its recorded pid is alive
+    locked_pid="$(awk -F '=' '$1=="pid" {print $2}' "$LOCK_FILE" | tail -n 1 || true)"
+    if [[ "$locked_pid" =~ ^[0-9]+$ ]] && kill -0 "$locked_pid" >/dev/null 2>&1; then
+        echo "posttrain_lock_active pid=$locked_pid file=$LOCK_FILE"
+        exit 0
+    fi
+    rm -f "$LOCK_FILE"  # stale lock: pid missing, malformed, or no longer running
+fi
+
+{  # NOTE(review): check-then-write is not atomic; two simultaneous starts could both pass the check above
+    echo "pid=$$"
+    echo "started_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+} >"$LOCK_FILE"
+
+cleanup_lock() {  # remove the lock file; installed as the EXIT trap on the next statement
+    rm -f "$LOCK_FILE"
+}
+trap cleanup_lock EXIT
+
+if [[ -f "$DONE_FILE" ]]; then
+    echo "posttrain_already_done file=$DONE_FILE"
+    exit 0
+fi
+
+echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] posttrain_trigger_start" | tee -a "$POSTTRAIN_LOG"
+echo "status_state=$state step=$step target_steps=$target_steps" | tee -a "$POSTTRAIN_LOG"
+serve_endpoint="$(resolve_serve_endpoint)"
+
+set +e
+(
+    export TRAIN_CONFIG
+    export RUN_DIR
+    export TRAIN_LOG
+    export ADAPTER_DIR
+    export MERGED_OUTPUT
+    export EVAL_OUTPUT_ROOT
+    export PUSH_TO_HUB=0
+    export SERVE_CONFIG
+    export AUTO_START_SERVE=1
+    export BENCHMARK_OUTPUT_DIR
+    export BENCHMARK_OUTPUT
+    bash scripts/post_training_pipeline.sh
+) >>"$POSTTRAIN_LOG" 2>&1
+pipeline_rc=$?
+set -e
+
+if [[ "$pipeline_rc" -ne 0 ]]; then
+    {
+        echo "# Post-Training v8 Summary"
+        echo
+        echo "- status: FAILED"
+        echo "- finished_utc: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
+        echo "- pipeline_exit_code: $pipeline_rc"
+        echo "- train_config: $TRAIN_CONFIG"
+        echo "- run_dir: $RUN_DIR"
+        echo "- serve_endpoint: $serve_endpoint"
+        echo "- eval_results:"
+        collect_eval_status_lines
+        echo "- log: $POSTTRAIN_LOG"
+    } >"$SUMMARY_FILE"
+    echo "posttrain_failed rc=$pipeline_rc log=$POSTTRAIN_LOG summary=$SUMMARY_FILE"
+    exit "$pipeline_rc"
+fi
+
+{
+    echo "completed_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    echo "train_config=$TRAIN_CONFIG"
+    echo "run_dir=$RUN_DIR"
+    echo "merged_output=$MERGED_OUTPUT"
+    echo "eval_output_root=$EVAL_OUTPUT_ROOT"
+    echo "benchmark_output=$BENCHMARK_OUTPUT"
+    echo "serve_endpoint=$serve_endpoint"
+    echo "posttrain_log=$POSTTRAIN_LOG"
+} >"$DONE_FILE"
+
+{
+    echo "# Post-Training v8 Summary"
+    echo
+    echo "- status: SUCCESS"
+    echo "- completed_utc: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    echo "- train_config: $TRAIN_CONFIG"
+    echo "- run_dir: $RUN_DIR"
+    echo "- eval_results:"
+    collect_eval_status_lines
+    echo "- merged_model: $MERGED_OUTPUT"
+    echo "- serve_endpoint: $serve_endpoint"
+    echo "- eval_output_root: $EVAL_OUTPUT_ROOT"
+    echo "- benchmark_output: $BENCHMARK_OUTPUT"
+    echo "- posttrain_log: $POSTTRAIN_LOG"
+} >"$SUMMARY_FILE"
+
+echo "posttrain_complete done_file=$DONE_FILE summary=$SUMMARY_FILE"
diff --git a/scripts/start_a100_training.sh b/scripts/start_a100_training.sh
new file mode 100755
index 0000000..8bca40a
--- /dev/null
+++ b/scripts/start_a100_training.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+cd "$PROJECT_ROOT"
+
+TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_a100_v8_stable_reset.yaml}"
+CONFIG_BASENAME="$(basename "$TRAIN_CONFIG")"
+CONFIG_SLUG="${CONFIG_BASENAME%.*}"
+TRAIN_RUN_DIR="${TRAIN_RUN_DIR:-artifacts/training/${CONFIG_SLUG}}"
+TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_${CONFIG_SLUG}.log}"
+BOOTSTRAP_CHECKPOINT="${BOOTSTRAP_CHECKPOINT:-}"
+ENABLE_RESUME="${ENABLE_RESUME:-0}"
+HF_HOME_DIR="${HF_HOME_DIR:-$PROJECT_ROOT/.hf_cache}"
+HF_DATASETS_CACHE_DIR="${HF_DATASETS_CACHE_DIR:-$HF_HOME_DIR/datasets}"
+HF_HUB_CACHE_DIR="${HF_HUB_CACHE_DIR:-$HF_HOME_DIR/hub}"
+UV_BIN="${UV_BIN:-$HOME/.local/bin/uv}"
+REQUIRE_WANDB="${REQUIRE_WANDB:-1}"
+
+mkdir -p \
+    "$(dirname "$TRAIN_RUN_DIR")" \
+    "$(dirname "$TRAIN_LOG")" \
+    "$HF_HOME_DIR" \
+    "$HF_DATASETS_CACHE_DIR" \
+    "$HF_HUB_CACHE_DIR" \
+    artifacts/logs
+
+# Keep a durable per-run log even when systemd unit output targets change.
+exec > >(tee -a "$TRAIN_LOG") 2>&1
+
+if [[ ! -x "$UV_BIN" ]]; then
+    echo "UV executable not found: $UV_BIN" >&2
+    exit 1
+fi
+
+if [[ "$REQUIRE_WANDB" == "1" ]] && [[ -z "${WANDB_API_KEY:-}" ]]; then
+    echo "WANDB_API_KEY is required for this run (REQUIRE_WANDB=1)." >&2
+    exit 1
+fi
+
+find_latest_checkpoint() {  # Print the checkpoint-* directory to resume from; prints nothing when none exists.
+    if [[ ! -d "$TRAIN_RUN_DIR" ]]; then
+        return
+    fi
+
+    monitor_status_file="artifacts/logs/training_monitor_status_a100.txt"  # NOTE(review): not declared local
+    current_step=""
+    if [[ -f "$monitor_status_file" ]]; then
+        current_step="$(grep -E '^step=' "$monitor_status_file" | tail -n 1 | cut -d '=' -f 2 || true)"
+    fi
+    # Prefer the highest-numbered checkpoint whose step is <= the step last recorded in the monitor status file.
+    if [[ "$current_step" =~ ^[0-9]+$ ]] && [[ "$current_step" -gt 0 ]]; then
+        filtered_checkpoint="$(find "$TRAIN_RUN_DIR" -maxdepth 1 -type d -name "checkpoint-*" 2>/dev/null \
+            | sed -E 's#(.*checkpoint-)([0-9]+)$#\2 \1\2#' \
+            | awk -v s="$current_step" '$1 <= s' \
+            | sort -n \
+            | tail -n 1 \
+            | cut -d ' ' -f 2- || true)"
+        if [[ -n "$filtered_checkpoint" ]]; then
+            echo "$filtered_checkpoint"
+            return
+        fi
+    fi
+    # Otherwise (no usable step, or no checkpoint at or below it) take the last entry in version-sort order.
+    find "$TRAIN_RUN_DIR" -maxdepth 1 -type d -name "checkpoint-*" | sort -V | tail -n 1
+}
+
+resume_from=""
+if [[ "$ENABLE_RESUME" == "1" ]]; then
+    latest_checkpoint="$(find_latest_checkpoint || true)"
+    if [[ -n "$latest_checkpoint" ]]; then
+        resume_from="$latest_checkpoint"
+    elif [[ -n "$BOOTSTRAP_CHECKPOINT" ]] && [[ -d "$BOOTSTRAP_CHECKPOINT" ]]; then
+        resume_from="$BOOTSTRAP_CHECKPOINT"
+    fi
+fi
+
+cmd=("$UV_BIN" "run" "python" "scripts/run_training.py" "--config" "$TRAIN_CONFIG")
+if [[ -n "$resume_from" ]]; then
+    cmd+=("--resume-from" "$resume_from")
+fi
+
+echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] forge-training-start"
+echo "project_root=$PROJECT_ROOT"
+echo "train_config=$TRAIN_CONFIG"
+echo "config_slug=$CONFIG_SLUG"
+echo "train_run_dir=$TRAIN_RUN_DIR"
+echo "train_log=$TRAIN_LOG"
+echo "resume_from=${resume_from:-none}"
+echo "enable_resume=$ENABLE_RESUME"
+echo "require_wandb=$REQUIRE_WANDB"
+echo "hf_home=$HF_HOME_DIR"
+echo "hf_datasets_cache=$HF_DATASETS_CACHE_DIR"
+echo "hf_hub_cache=$HF_HUB_CACHE_DIR"
+echo "command=${cmd[*]}"
+
+exec env \
+    FORGE_EXECUTION_CONTEXT=remote \
+    HF_HOME="$HF_HOME_DIR" \
+    HF_DATASETS_CACHE="$HF_DATASETS_CACHE_DIR" \
+    HUGGINGFACE_HUB_CACHE="$HF_HUB_CACHE_DIR" \
+    "${cmd[@]}"
diff --git a/scripts/training_watchdog.py b/scripts/training_watchdog.py
new file mode 100755
index 0000000..992b3db
--- /dev/null
+++ b/scripts/training_watchdog.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+"""Watchdog for long-running training on remote GPU hosts.
+
+Restarts a user-level systemd training service when:
+1) Too many consecutive metric lines contain NaN.
+2) Training step does not advance for a configured stall timeout.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import subprocess
+import time
+from dataclasses import asdict, dataclass
+from pathlib import Path
+
+
+@dataclass
+class WatchdogState:
+    """Persisted state between watchdog loops."""
+
+    last_metric_hash: str = ""  # SHA-256 hex of the last metric line already counted
+    nan_consecutive: int = 0  # current streak of distinct metric lines containing "nan"
+    last_step: int = 0  # most recent step parsed from the training log
+    last_step_change_ts: float = 0.0  # time.time() when last_step last changed; 0.0 = no progress seen yet
+
+
+def _config_slug() -> str:  # stem of $TRAIN_CONFIG; default is the same v8 config start_a100_training.sh uses
+    train_config = os.getenv("TRAIN_CONFIG", "configs/models/turkcell_7b_a100_v8_stable_reset.yaml")
+    return Path(train_config).stem
+
+
+def _int_env(name: str, default: int) -> int:  # int(env[name]); `default` if unset, blank, or not an integer
+    value = os.getenv(name)
+    if value is None or value.strip() == "":
+        return default
+    try:
+        return int(value)
+    except ValueError:
+        return default
+
+
+def _load_status_file(path: Path) -> dict[str, str]:  # parse key=value lines into a dict; {} if file is missing
+    if not path.exists():
+        return {}
+
+    result: dict[str, str] = {}
+    for line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
+        if "=" not in line:
+            continue
+        key, value = line.split("=", 1)  # split on the first "=" only, so values may contain "="
+        result[key.strip()] = value.strip()  # a repeated key keeps its last value
+    return result
+
+
+def _status_completed(status: dict[str, str], target_steps: int) -> bool:
+    """Return True if status says "completed" or its step reached a positive target."""
+    try:
+        step = int(status.get("step", "0"))
+    except ValueError:
+        step = 0
+    if status.get("state", "") == "completed":
+        return True
+    # A non-positive target is a misconfiguration; it must not read as "already done".
+    return target_steps > 0 and step >= target_steps
+
+
+def parse_args() -> argparse.Namespace:  # CLI flags; most defaults come from env vars, then built-in fallbacks
+    slug = _config_slug()
+    default_log_file = os.getenv("TRAIN_LOG", f"artifacts/logs/training_{slug}.log")
+    default_state_file = os.getenv(
+        "TRAIN_WATCHDOG_STATE_FILE",
+        f"artifacts/logs/training_watchdog_state_{slug}.json",
+    )
+    default_status_file = os.getenv(
+        "TRAIN_WATCHDOG_STATUS_FILE",
+        f"artifacts/logs/training_watchdog_status_{slug}.txt",
+    )
+    default_monitor_status_file = os.getenv(
+        "TRAIN_MONITOR_STATUS_FILE",
+        "artifacts/logs/training_monitor_status_a100.txt",
+    )
+    default_target_steps = _int_env("TARGET_STEPS", 8601)
+    default_poll_seconds = _int_env("WATCHDOG_POLL_SECONDS", 60)
+    default_stall_seconds = _int_env("WATCHDOG_STALL_SECONDS", 5400)
+    default_nan_limit = _int_env("WATCHDOG_NAN_CONSECUTIVE_LIMIT", 5)
+
+    parser = argparse.ArgumentParser(description="Monitor training and auto-restart on failures.")
+    parser.add_argument("--service", default="forge-training.service")
+    parser.add_argument("--log-file", default=default_log_file)
+    parser.add_argument("--state-file", default=default_state_file)
+    parser.add_argument("--status-file", default=default_status_file)
+    parser.add_argument("--monitor-status-file", default=default_monitor_status_file)
+    parser.add_argument("--target-steps", type=int, default=default_target_steps)
+    parser.add_argument("--poll-seconds", type=int, default=default_poll_seconds)
+    parser.add_argument("--nan-consecutive-limit", type=int, default=default_nan_limit)
+    parser.add_argument("--stall-seconds", type=int, default=default_stall_seconds)
+    parser.add_argument("--max-read-bytes", type=int, default=2_000_000)  # cap on log bytes read per poll
+    return parser.parse_args()
+
+
+def _run_systemctl(*args: str) -> subprocess.CompletedProcess[str]:  # run `systemctl --user <args>`
+    return subprocess.run(
+        ["systemctl", "--user", *args],
+        check=False,  # callers inspect returncode; a non-zero exit does not raise
+        text=True,
+        capture_output=True,
+    )
+
+
+def is_service_active(service: str) -> bool:  # True when `systemctl --user is-active --quiet` exits 0
+    proc = _run_systemctl("is-active", "--quiet", service)
+    return proc.returncode == 0
+
+
+def restart_service(service: str) -> bool:  # True when `systemctl --user restart` exits 0
+    proc = _run_systemctl("restart", service)
+    return proc.returncode == 0
+
+
+def start_service(service: str) -> bool:  # True when `systemctl --user start` exits 0
+    proc = _run_systemctl("start", service)
+    return proc.returncode == 0
+
+
+def read_tail_text(path: Path, max_bytes: int) -> str:  # text of the last max_bytes bytes; "" if file missing
+    if not path.exists():
+        return ""
+    with path.open("rb") as handle:
+        handle.seek(0, os.SEEK_END)
+        size = handle.tell()
+        handle.seek(max(0, size - max_bytes), os.SEEK_SET)  # clamp so short files are read from the start
+        return handle.read().decode("utf-8", errors="ignore")  # undecodable bytes are dropped
+
+
+def parse_training_tail(text: str, target_steps: int) -> tuple[int, str]:  # -> (last step, last metric line)
+    if not text:
+        return 0, ""
+    # Only look at output after the most recent "forge-training-start" marker, when one is present.
+    marker_idx = text.rfind("forge-training-start")
+    if marker_idx != -1:
+        text = text[marker_idx:]
+    # Steps are read from "<digits>/<target_steps>" occurrences (presumably progress-bar output — confirm).
+    step_pattern = re.compile(rf"(\d+)/{target_steps}\b")
+    steps = [int(match.group(1)) for match in step_pattern.finditer(text)]
+    last_step = steps[-1] if steps else 0
+    # Metric lines: those with both 'loss': and 'grad_norm':, or any line with 'eval_loss':.
+    metric_lines: list[str] = []
+    for line in text.splitlines():
+        if ("'loss':" in line and "'grad_norm':" in line) or "'eval_loss':" in line:
+            metric_lines.append(line)
+    last_metric = metric_lines[-1] if metric_lines else ""
+    return last_step, last_metric
+
+
+def metric_hash(metric_line: str) -> str:  # SHA-256 hex digest of the line; "" for an empty line
+    if not metric_line:
+        return ""
+    return hashlib.sha256(metric_line.encode("utf-8", errors="ignore")).hexdigest()
+
+
+def load_state(path: Path) -> WatchdogState:
+    """Load persisted watchdog state, returning defaults when the file is unusable."""
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+        return WatchdogState(
+            last_metric_hash=str(payload.get("last_metric_hash", "")),
+            nan_consecutive=int(payload.get("nan_consecutive", 0)),
+            last_step=int(payload.get("last_step", 0)),
+            last_step_change_ts=float(payload.get("last_step_change_ts", 0.0)),
+        )
+    # Missing file (OSError), bad JSON/values, or a non-object payload (.get -> AttributeError): start fresh.
+    except (json.JSONDecodeError, OSError, ValueError, TypeError, AttributeError):
+        return WatchdogState()
+
+
+def save_state(path: Path, state: WatchdogState) -> None:  # write state as indented JSON, creating parent dirs
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(asdict(state), indent=2), encoding="utf-8")  # NOTE(review): not an atomic write
+
+
+def write_status(  # overwrite `path` with a key=value snapshot of the current watchdog view
+    path: Path,
+    *,
+    service: str,
+    active: bool,
+    step: int,
+    target_steps: int,
+    metric_line: str,
+    state: WatchdogState,
+    action: str,
+) -> None:
+    pct = int((step * 100) / target_steps) if target_steps > 0 else 0  # truncated; 0 without a positive target
+    timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+    status_lines = [
+        f"timestamp_utc={timestamp}",
+        f"service={service}",
+        f"active={'yes' if active else 'no'}",
+        f"step={step}",
+        f"target_steps={target_steps}",
+        f"percent={pct}",
+        f"nan_consecutive={state.nan_consecutive}",
+        f"last_step_change_ts={int(state.last_step_change_ts)}",
+        f"last_metric_contains_nan={'yes' if 'nan' in metric_line.lower() else 'no'}",
+        f"action={action}",
+    ]
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text("\n".join(status_lines) + "\n", encoding="utf-8")
+
+
+def main() -> None:  # poll forever: start/restart the service as needed, persist state, write a status file
+    args = parse_args()
+    log_file = Path(args.log_file)
+    state_file = Path(args.state_file)
+    status_file = Path(args.status_file)
+    monitor_status_file = Path(args.monitor_status_file)
+    state = load_state(state_file)
+
+    while True:
+        now = time.time()
+        action = "none"
+        active = is_service_active(args.service)
+        monitor_status = _load_status_file(monitor_status_file)
+        training_completed = _status_completed(monitor_status, args.target_steps)
+
+        if not active:  # service is down: start it unless the monitor status says training is done
+            if training_completed:
+                action = "completed_no_restart"
+            else:
+                started = start_service(args.service)
+                action = "start_service" if started else "start_failed"
+                active = is_service_active(args.service)
+
+        tail_text = read_tail_text(log_file, args.max_read_bytes)
+        step, last_metric = parse_training_tail(tail_text, args.target_steps)
+        current_metric_hash = metric_hash(last_metric)
+
+        if step < state.last_step:
+            # Service likely restarted and step counter reset.
+            state.last_step = step
+            state.last_step_change_ts = now
+            state.nan_consecutive = 0
+            state.last_metric_hash = current_metric_hash
+        elif step > state.last_step:
+            state.last_step = step
+            state.last_step_change_ts = now
+        elif state.last_step_change_ts == 0.0 and step > 0:  # progress seen but clock unset: start the stall clock
+            state.last_step_change_ts = now
+
+        if current_metric_hash and current_metric_hash != state.last_metric_hash:  # count each new line once
+            state.last_metric_hash = current_metric_hash
+            if "nan" in last_metric.lower():
+                state.nan_consecutive += 1
+            else:
+                state.nan_consecutive = 0
+        # NOTE(review): on fresh state the stall clock stays 0 while step is 0, so a hang before step 1 never stalls.
+        stalled = (
+            active
+            and state.last_step_change_ts > 0
+            and (now - state.last_step_change_ts) >= args.stall_seconds
+        )
+        nan_limit_hit = state.nan_consecutive >= args.nan_consecutive_limit
+
+        if (nan_limit_hit or stalled) and not training_completed:
+            restarted = restart_service(args.service)
+            if nan_limit_hit:
+                action = "restart_nan_limit_hit" if restarted else "restart_nan_failed"
+            else:
+                action = "restart_stall_timeout" if restarted else "restart_stall_failed"
+            state.nan_consecutive = 0
+            state.last_metric_hash = ""
+            state.last_step_change_ts = now  # restart the stall clock after the restart attempt
+            active = is_service_active(args.service)
+        elif training_completed:
+            action = "completed_no_restart"
+
+        save_state(state_file, state)
+        write_status(
+            status_file,
+            service=args.service,
+            active=active,
+            step=step,
+            target_steps=args.target_steps,
+            metric_line=last_metric,
+            state=state,
+            action=action,
+        )
+        time.sleep(args.poll_seconds)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/forge/training/callbacks.py b/src/forge/training/callbacks.py
index 124481d..10bbc37 100644
--- a/src/forge/training/callbacks.py
+++ b/src/forge/training/callbacks.py
@@ -2,14 +2,23 @@
 
 from __future__ import annotations
 
+import math
 from typing import Any
 
 from forge.utils.logging import get_logger
 
+try:
+    from transformers import TrainerCallback
+except Exception:  # pragma: no cover - fallback for non-training environments
+    class TrainerCallback:  # type: ignore[no-redef]
+        """Fallback base class when transformers is unavailable."""
+
+        pass
+
 logger = get_logger(__name__)
 
 
-class EarlyStoppingOnPlateau:
+class EarlyStoppingOnPlateau(TrainerCallback):
     """Stop training when eval loss plateaus for `patience` eval steps.
 
     Compatible with the ``transformers.TrainerCallback`` protocol.
@@ -57,3 +66,88 @@ def on_evaluate(
                 logger.info("early_stopping", step=state.global_step)
                 # HF Trainer checks this flag after eval
                 control.should_training_stop = True
+
+
+def _is_non_finite(value: object) -> bool:
+    """Return True when a metric value (number or numeric string) is NaN/Inf."""
+    if isinstance(value, bool) or not isinstance(value, (int, float, str)):
+        return False
+    try:
+        # float() accepts every spelling: "nan", "inf", "Infinity", signed, padded.
+        return not math.isfinite(float(value))
+    except (ValueError, OverflowError):
+        # Non-numeric text, or an int too large for float: neither is NaN/Inf.
+        return False
+
+
+class NaNGuardCallback(TrainerCallback):
+    """Request a training stop after `consecutive_limit` consecutive NaN/Inf metric reports."""
+
+    def __init__(
+        self,
+        consecutive_limit: int = 5,
+        watch_keys: tuple[str, ...] = ("loss", "grad_norm", "eval_loss"),
+    ) -> None:
+        self.consecutive_limit = consecutive_limit
+        self.watch_keys = watch_keys
+        self._consecutive_hits = 0  # current streak of reports containing a non-finite watched metric
+
+    def _handle_metrics(
+        self,
+        *,
+        metrics: dict[str, object] | None,
+        state: Any,
+        control: Any,
+        source: str,
+    ) -> None:
+        if not metrics:  # None or empty report: leave the streak untouched
+            return
+
+        bad_values: dict[str, object] = {
+            key: value
+            for key, value in metrics.items()
+            if key in self.watch_keys and _is_non_finite(value)
+        }
+
+        if not bad_values:  # any non-empty report without a bad watched metric resets the streak
+            self._consecutive_hits = 0
+            return
+
+        self._consecutive_hits += 1
+        logger.warning(
+            "nan_guard_detected",
+            source=source,
+            step=state.global_step,
+            hits=self._consecutive_hits,
+            limit=self.consecutive_limit,
+            bad_metrics=bad_values,
+        )
+
+        if self._consecutive_hits >= self.consecutive_limit:
+            logger.error(
+                "nan_guard_stopping_training",
+                source=source,
+                step=state.global_step,
+                limit=self.consecutive_limit,
+            )
+            control.should_training_stop = True  # only sets the flag; the trainer is expected to act on it
+
+    def on_log(
+        self,
+        args: Any,
+        state: Any,
+        control: Any,
+        logs: dict[str, object] | None = None,
+        **kwargs: object,
+    ) -> None:
+        self._handle_metrics(metrics=logs, state=state, control=control, source="log")
+
+    def on_evaluate(  # NOTE(review): if the trainer also sends eval metrics to on_log, one NaN eval counts twice
+        self,
+        args: Any,
+        state: Any,
+        control: Any,
+        metrics: dict[str, object] | None = None,
+        **kwargs: object,
+    ) -> None:
+        self._handle_metrics(metrics=metrics, state=state, control=control, source="eval")
diff --git a/src/forge/training/trainer.py b/src/forge/training/trainer.py
index 3676da4..54d2039 100644
--- a/src/forge/training/trainer.py
+++ b/src/forge/training/trainer.py
@@ -8,12 +8,16 @@
 
 from datasets import load_dataset
 
+from forge.training.callbacks import EarlyStoppingOnPlateau, NaNGuardCallback
 from forge.utils.config import TrainingConfig
 from forge.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
 _TRUE_VALUES = {"1", "true", "yes", "on"}
+_EARLY_STOPPING_PATIENCE = 5
+_EARLY_STOPPING_MIN_DELTA = 0.001
+_NAN_GUARD_CONSECUTIVE_LIMIT = 5
 
 
 def _is_truthy(value: str | None) -> bool:
@@ -74,7 +78,7 @@ def _setup_unsloth(self) -> None:
     def _setup_peft(self) -> None:
         """Load model via standard PEFT (fallback when Unsloth unavailable)."""
         import torch
-        from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+        from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
         from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
         logger.info(
@@ -106,24 +110,35 @@ def _setup_peft(self) -> None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
         self.model = prepare_model_for_kbit_training(self.model)
-
-        lora_bias = self.config.lora.bias.lower()
-        valid_lora_bias = {"none", "all", "lora_only"}
-        if lora_bias not in valid_lora_bias:
-            raise ValueError(
-                f"Invalid LoRA bias '{self.config.lora.bias}'. "
-                "Expected one of: none, all, lora_only."
+        adapter_init_path = self.config.training.adapter_init_path
+        if adapter_init_path:
+            adapter_path = Path(adapter_init_path).expanduser()
+            if not adapter_path.exists():
+                raise FileNotFoundError(f"Adapter init path not found: {adapter_path}")
+            logger.info("loading_adapter_init", path=str(adapter_path))
+            self.model = PeftModel.from_pretrained(
+                self.model,
+                str(adapter_path),
+                is_trainable=True,
             )
+        else:
+            lora_bias = self.config.lora.bias.lower()
+            valid_lora_bias = {"none", "all", "lora_only"}
+            if lora_bias not in valid_lora_bias:
+                raise ValueError(
+                    f"Invalid LoRA bias '{self.config.lora.bias}'. "
+                    "Expected one of: none, all, lora_only."
+                )
 
-        lora_config = LoraConfig(
-            r=self.config.lora.r,
-            lora_alpha=self.config.lora.alpha,
-            lora_dropout=self.config.lora.dropout,
-            target_modules=self.config.lora.target_modules,
-            bias=cast(Literal["none", "all", "lora_only"], lora_bias),
-            task_type=self.config.lora.task_type,
-        )
-        self.model = get_peft_model(self.model, lora_config)
+            lora_config = LoraConfig(
+                r=self.config.lora.r,
+                lora_alpha=self.config.lora.alpha,
+                lora_dropout=self.config.lora.dropout,
+                target_modules=self.config.lora.target_modules,
+                bias=cast(Literal["none", "all", "lora_only"], lora_bias),
+                task_type=self.config.lora.task_type,
+            )
+            self.model = get_peft_model(self.model, lora_config)
 
         logger.info("model_loaded_peft", trainable_params=self._count_trainable_params())
 
@@ -247,6 +262,7 @@ def train(self, resume_from_checkpoint: str | None = None) -> Path:
             warmup_ratio=self.config.training.warmup_ratio,
             lr_scheduler_type=self.config.training.lr_scheduler_type,
             weight_decay=self.config.training.weight_decay,
+            max_grad_norm=self.config.training.max_grad_norm,
             seed=self.config.training.seed,
             max_steps=self.config.training.max_steps,
             report_to="wandb" if wandb_enabled else "none",
@@ -263,6 +279,23 @@ def train(self, resume_from_checkpoint: str | None = None) -> Path:
             args=training_args,
             formatting_func=self._format_prompt,
         )
+        trainer.add_callback(
+            EarlyStoppingOnPlateau(
+                patience=_EARLY_STOPPING_PATIENCE,
+                min_delta=_EARLY_STOPPING_MIN_DELTA,
+            )
+        )
+        trainer.add_callback(
+            NaNGuardCallback(
+                consecutive_limit=_NAN_GUARD_CONSECUTIVE_LIMIT,
+            )
+        )
+        logger.info(
+            "training_callbacks_enabled",
+            early_stopping_patience=_EARLY_STOPPING_PATIENCE,
+            early_stopping_min_delta=_EARLY_STOPPING_MIN_DELTA,
+            nan_guard_consecutive_limit=_NAN_GUARD_CONSECUTIVE_LIMIT,
+        )
 
         logger.info("training_started", output_dir=str(output_dir))
         if resume_from_checkpoint:
diff --git a/src/forge/utils/config.py b/src/forge/utils/config.py
index 759fda9..8fec97a 100644
--- a/src/forge/utils/config.py
+++ b/src/forge/utils/config.py
@@ -38,6 +38,7 @@ class TrainingParams(BaseModel):
     warmup_ratio: float = 0.1
     lr_scheduler_type: str = "cosine"
     weight_decay: float = 0.01
+    max_grad_norm: float = 1.0
     logging_steps: int = 10
     save_steps: int = 200
     save_total_limit: int = 3
@@ -45,6 +46,7 @@ class TrainingParams(BaseModel):
     fp16: bool = True   # always True for Volta arch
     bf16: bool = False  # NOT supported on V100
     max_steps: int = -1
+    adapter_init_path: str | None = None
     seed: int = 42
 
 

From d570910c7e91816c1526c4e3f514d18c2825efd2 Mon Sep 17 00:00:00 2001
From: Ogulcan Aydogan <ogulcanaydogan@hotmail.com>
Date: Fri, 6 Mar 2026 09:14:54 +0000
Subject: [PATCH 2/3] ops: add automated v8 stability gate fallback watcher

---
 scripts/watch_v8_stability_gate.sh | 130 +++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100755 scripts/watch_v8_stability_gate.sh

diff --git a/scripts/watch_v8_stability_gate.sh b/scripts/watch_v8_stability_gate.sh
new file mode 100755
index 0000000..e42d39b
--- /dev/null
+++ b/scripts/watch_v8_stability_gate.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+cd "$PROJECT_ROOT"
+
+ENV_FILE="${ENV_FILE:-$HOME/.config/forge/training.env}"
+STATUS_FILE="${STATUS_FILE:-artifacts/logs/training_monitor_status_a100.txt}"
+POLL_SECONDS="${POLL_SECONDS:-30}"
+GATE_STEPS="${GATE_STEPS:-300}"
+
+FALLBACK_CONFIG="${FALLBACK_CONFIG:-configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml}"
+FALLBACK_RUN_DIR="${FALLBACK_RUN_DIR:-artifacts/training/turkcell-7b-sft-v8b-a100-bf16-ultra-stable-fallback}"
+FALLBACK_LOG="${FALLBACK_LOG:-artifacts/logs/training_turkcell_7b_a100_v8b_ultra_stable_fallback.log}"
+
+SCRIPT_LOG="${SCRIPT_LOG:-artifacts/logs/v8_stability_gate.log}"
+mkdir -p "$(dirname "$SCRIPT_LOG")"
+touch "$SCRIPT_LOG"
+
+if [[ ! -f "$STATUS_FILE" ]]; then
+    echo "status_file_missing path=$STATUS_FILE" | tee -a "$SCRIPT_LOG"
+    exit 1
+fi
+
+current_step="$(awk -F '=' '$1=="step" {print $2}' "$STATUS_FILE" | tail -n 1 || true)"
+if [[ ! "$current_step" =~ ^[0-9]+$ ]]; then
+    current_step=0
+fi
+start_step="$current_step"
+target_step=$((start_step + GATE_STEPS))
+
+current_log="$(awk -F '=' '$1=="TRAIN_LOG" {print $2}' "$ENV_FILE" | tail -n 1 || true)"
+if [[ -z "$current_log" ]]; then
+    current_log="artifacts/logs/training_turkcell_7b_a100_v8_stable_reset.log"
+fi
+
+marker_line=1
+if [[ -f "$current_log" ]]; then
+    latest_marker="$(grep -a -n "forge-training-start" "$current_log" | tail -n 1 | cut -d ':' -f 1 || true)"
+    if [[ "$latest_marker" =~ ^[0-9]+$ ]] && [[ "$latest_marker" -gt 0 ]]; then
+        marker_line=$((latest_marker + 1))
+    fi
+fi
+
+echo "gate_start_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ) start_step=$start_step target_step=$target_step log=$current_log marker_line=$marker_line" | tee -a "$SCRIPT_LOG"
+
+apply_fallback() {
+    # Point the managed run at the fallback profile and restart the user services.
+    # $1 is the reason token written to SCRIPT_LOG; returns 1 if ENV_FILE is missing.
+    local ts
+    ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    echo "gate_fallback_triggered_utc=$ts reason=$1" | tee -a "$SCRIPT_LOG"
+    if [[ ! -f "$ENV_FILE" ]]; then
+        echo "env_file_missing path=$ENV_FILE" | tee -a "$SCRIPT_LOG"
+        return 1
+    fi
+
+    python3 - "$ENV_FILE" "$FALLBACK_CONFIG" "$FALLBACK_RUN_DIR" "$FALLBACK_LOG" <<'PY'
+import sys
+from pathlib import Path
+
+env_file = Path(sys.argv[1])
+updates = {
+    "TRAIN_CONFIG": sys.argv[2],
+    "TRAIN_RUN_DIR": sys.argv[3],
+    "TRAIN_LOG": sys.argv[4],
+    "ENABLE_RESUME": "0",
+    "SAVE_STEPS": "250",
+}
+
+
+def parse(line):
+    """Return (key, value) for a KEY=VALUE line, or None for anything else."""
+    if "=" not in line or line.lstrip().startswith("#"):
+        return None
+    key, value = line.split("=", 1)
+    return key.strip(), value.strip()
+
+
+lines = env_file.read_text(encoding="utf-8", errors="ignore").splitlines()
+# When a key is assigned more than once the last value wins; overrides go on top.
+values = dict(pair for pair in map(parse, lines) if pair)
+values.update(updates)
+
+# Re-emit in file order, keeping comments and blank lines verbatim so the
+# operator's notes survive; a repeated key is written once, at its first position.
+out = []
+seen = set()
+for line in lines:
+    pair = parse(line)
+    if pair is None:
+        out.append(line)
+    elif pair[0] not in seen:
+        seen.add(pair[0])
+        out.append(f"{pair[0]}={values[pair[0]]}")
+out.extend(f"{key}={value}" for key, value in updates.items() if key not in seen)
+env_file.write_text("".join(f"{entry}\n" for entry in out), encoding="utf-8")
+PY
+
+    systemctl --user daemon-reload
+    systemctl --user restart forge-training.service forge-training-monitor.service forge-training-watchdog.service
+    echo "fallback_service_restart_complete_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)" | tee -a "$SCRIPT_LOG"
+}
+
+while true; do
+    step="$(awk -F '=' '$1=="step" {print $2}' "$STATUS_FILE" | tail -n 1 || true)"
+    state="$(awk -F '=' '$1=="state" {print $2}' "$STATUS_FILE" | tail -n 1 || true)"
+    if [[ ! "$step" =~ ^[0-9]+$ ]]; then
+        step=0
+    fi
+
+    if [[ "$step" -ge "$target_step" ]]; then
+        echo "gate_pass_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ) step=$step target_step=$target_step" | tee -a "$SCRIPT_LOG"
+        exit 0
+    fi
+
+    if [[ -f "$current_log" ]]; then
+        if tail -n +"$marker_line" "$current_log" | grep -a -q "nan_guard_stopping_training"; then
+            apply_fallback "nan_guard_stopping_training_detected"
+            exit 0
+        fi
+    fi
+
+    if [[ "$state" == "stopped" ]]; then
+        apply_fallback "training_stopped_before_gate"
+        exit 0
+    fi
+
+    sleep "$POLL_SECONDS"
+done

From 758927770e1f80230be55315c1c5e2661994a6ae Mon Sep 17 00:00:00 2001
From: Ogulcan Aydogan <ogulcanaydogan@hotmail.com>
Date: Sat, 7 Mar 2026 22:27:22 +0000
Subject: [PATCH 3/3] feat: add v100 training configs, systemd services and
 operational scripts

- Add Turkcell 7B v100 model configs (v2 stable, v3 ultrastable, fallbacks)
- Add vLLM v100 merged serving config
- Add systemd training service, watchdog service and timer
- Add training monitoring, watchdog and completion scripts
---
 .../models/turkcell_7b_v100_v2_stable.yaml    |  37 +++
 .../models/turkcell_7b_v100_v2b_fallback.yaml |   9 +
 .../turkcell_7b_v100_v3_ultrastable.yaml      |  37 +++
 .../models/turkcell_7b_v100_v3b_fallback.yaml |   9 +
 configs/serving/vllm_v100_v3_merged.yaml      |  13 +
 deploy/systemd/forge-v100-training.service    |  21 ++
 deploy/systemd/forge-v100-watchdog.service    |   8 +
 deploy/systemd/forge-v100-watchdog.timer      |  12 +
 scripts/install_v100_watchdog.sh              |  52 ++++
 scripts/monitor_training.sh                   |   7 +
 scripts/monitor_v100_training.sh              | 105 +++++++
 scripts/run_v100_completion.sh                | 120 ++++++++
 scripts/start_or_resume_full_training.sh      |  57 ++++
 scripts/start_v100_training.sh                | 167 +++++++++++
 scripts/watchdog_training.sh                  | 271 ++++++++++++++++++
 15 files changed, 925 insertions(+)
 create mode 100644 configs/models/turkcell_7b_v100_v2_stable.yaml
 create mode 100644 configs/models/turkcell_7b_v100_v2b_fallback.yaml
 create mode 100644 configs/models/turkcell_7b_v100_v3_ultrastable.yaml
 create mode 100644 configs/models/turkcell_7b_v100_v3b_fallback.yaml
 create mode 100644 configs/serving/vllm_v100_v3_merged.yaml
 create mode 100644 deploy/systemd/forge-v100-training.service
 create mode 100644 deploy/systemd/forge-v100-watchdog.service
 create mode 100644 deploy/systemd/forge-v100-watchdog.timer
 create mode 100755 scripts/install_v100_watchdog.sh
 create mode 100755 scripts/monitor_training.sh
 create mode 100755 scripts/monitor_v100_training.sh
 create mode 100644 scripts/run_v100_completion.sh
 create mode 100755 scripts/start_or_resume_full_training.sh
 create mode 100755 scripts/start_v100_training.sh
 create mode 100644 scripts/watchdog_training.sh

diff --git a/configs/models/turkcell_7b_v100_v2_stable.yaml b/configs/models/turkcell_7b_v100_v2_stable.yaml
new file mode 100644
index 0000000..5a6f42b
--- /dev/null
+++ b/configs/models/turkcell_7b_v100_v2_stable.yaml
@@ -0,0 +1,37 @@
+# V100 stable recovery profile (clean restart)
+_base: "../base.yaml"
+
+model:
+  name: "TURKCELL/Turkcell-LLM-7b-v1"
+  max_seq_length: 2048
+  dtype: "float16"
+
+training:
+  num_epochs: 3
+  max_steps: 8601
+  per_device_train_batch_size: 2
+  gradient_accumulation_steps: 8
+  learning_rate: 5.0e-5
+  warmup_ratio: 0.10
+  lr_scheduler_type: "cosine"
+  weight_decay: 0.01
+  max_grad_norm: 0.3
+  logging_steps: 10
+  eval_steps: 500
+  save_steps: 250
+  save_total_limit: 20
+  fp16: true
+  bf16: false
+
+lora:
+  r: 32
+  alpha: 64
+
+data:
+  train_path: "data/processed/turkish_sft.jsonl"
+  eval_path: "data/processed/turkish_eval.jsonl"
+
+wandb:
+  project: "lowresource-llm-forge"
+  run_name: "turkcell-7b-sft-v100-v2-stable"
+  enabled: false
diff --git a/configs/models/turkcell_7b_v100_v2b_fallback.yaml b/configs/models/turkcell_7b_v100_v2b_fallback.yaml
new file mode 100644
index 0000000..4ba8869
--- /dev/null
+++ b/configs/models/turkcell_7b_v100_v2b_fallback.yaml
@@ -0,0 +1,9 @@
+# V100 fallback profile (single retry after NaN)
+_base: "turkcell_7b_v100_v2_stable.yaml"
+
+training:
+  learning_rate: 3.0e-5
+  max_grad_norm: 0.2
+
+wandb:
+  run_name: "turkcell-7b-sft-v100-v2b-fallback"
diff --git a/configs/models/turkcell_7b_v100_v3_ultrastable.yaml b/configs/models/turkcell_7b_v100_v3_ultrastable.yaml
new file mode 100644
index 0000000..042ff13
--- /dev/null
+++ b/configs/models/turkcell_7b_v100_v3_ultrastable.yaml
@@ -0,0 +1,37 @@
+# V100 ultra-stable profile (clean restart)
+_base: "../base.yaml"
+
+model:
+  name: "TURKCELL/Turkcell-LLM-7b-v1"
+  max_seq_length: 2048
+  dtype: "float16"
+
+training:
+  num_epochs: 3
+  max_steps: 8601
+  per_device_train_batch_size: 2
+  gradient_accumulation_steps: 8
+  learning_rate: 2.0e-5
+  warmup_ratio: 0.12
+  lr_scheduler_type: "cosine"
+  weight_decay: 0.01
+  max_grad_norm: 0.2
+  logging_steps: 10
+  eval_steps: 250
+  save_steps: 250
+  save_total_limit: 30
+  fp16: true
+  bf16: false
+
+lora:
+  r: 32
+  alpha: 64
+
+data:
+  train_path: "data/processed/turkish_sft.jsonl"
+  eval_path: "data/processed/turkish_eval.jsonl"
+
+wandb:
+  project: "lowresource-llm-forge"
+  run_name: "turkcell-7b-sft-v100-v3-ultrastable"
+  enabled: false
diff --git a/configs/models/turkcell_7b_v100_v3b_fallback.yaml b/configs/models/turkcell_7b_v100_v3b_fallback.yaml
new file mode 100644
index 0000000..f1763c6
--- /dev/null
+++ b/configs/models/turkcell_7b_v100_v3b_fallback.yaml
@@ -0,0 +1,9 @@
+# V100 fallback profile (single retry after NaN)
+_base: "turkcell_7b_v100_v3_ultrastable.yaml"
+
+training:
+  learning_rate: 1.0e-5
+  max_grad_norm: 0.1
+
+wandb:
+  run_name: "turkcell-7b-sft-v100-v3b-fallback"
diff --git a/configs/serving/vllm_v100_v3_merged.yaml b/configs/serving/vllm_v100_v3_merged.yaml
new file mode 100644
index 0000000..a87bf08
--- /dev/null
+++ b/configs/serving/vllm_v100_v3_merged.yaml
@@ -0,0 +1,13 @@
+# vLLM serving configuration for V100 merged v3 model
+
+model_path: "artifacts/merged/turkcell-7b-v100-v3-ultrastable"
+host: "0.0.0.0"
+port: 18040
+tensor_parallel_size: 1
+gpu_memory_utilization: 0.75
+max_model_len: 4096
+dtype: "float16"
+enable_prefix_caching: true
+trust_remote_code: false
+enforce_eager: false
+max_num_seqs: 48
diff --git a/deploy/systemd/forge-v100-training.service b/deploy/systemd/forge-v100-training.service
new file mode 100644
index 0000000..aaa3438
--- /dev/null
+++ b/deploy/systemd/forge-v100-training.service
@@ -0,0 +1,22 @@
+[Unit]
+Description=LowResource-LLM-Forge V100 Training
+After=network-online.target
+Wants=network-online.target
+# Start rate limiting belongs in [Unit]; StartLimitIntervalSec= is ignored under [Service].
+StartLimitIntervalSec=600
+StartLimitBurst=3
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+EnvironmentFile=-%h/.config/forge/v100_training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/start_v100_training.sh
+Restart=on-failure
+RestartSec=20
+KillMode=control-group
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/systemd_v100_training.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/systemd_v100_training.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-v100-watchdog.service b/deploy/systemd/forge-v100-watchdog.service
new file mode 100644
index 0000000..80613ec
--- /dev/null
+++ b/deploy/systemd/forge-v100-watchdog.service
@@ -0,0 +1,8 @@
+[Unit]
+Description=LowResource-LLM-Forge V100 Training Watchdog
+After=network-online.target
+
+[Service]
+Type=oneshot
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+ExecStart=/usr/bin/env bash %h/projects/LowResource-LLM-Forge/scripts/watchdog_training.sh
diff --git a/deploy/systemd/forge-v100-watchdog.timer b/deploy/systemd/forge-v100-watchdog.timer
new file mode 100644
index 0000000..537ea77
--- /dev/null
+++ b/deploy/systemd/forge-v100-watchdog.timer
@@ -0,0 +1,12 @@
+[Unit]
+Description=Run V100 training watchdog every 30 seconds
+
+[Timer]
+OnBootSec=1min
+OnUnitActiveSec=30s
+AccuracySec=5s
+Unit=forge-v100-watchdog.service
+Persistent=true
+
+[Install]
+WantedBy=timers.target
diff --git a/scripts/install_v100_watchdog.sh b/scripts/install_v100_watchdog.sh
new file mode 100755
index 0000000..dbe62f8
--- /dev/null
+++ b/scripts/install_v100_watchdog.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+SYSTEMD_USER_DIR="${SYSTEMD_USER_DIR:-$HOME/.config/systemd/user}"
+FORGE_ENV_DIR="${FORGE_ENV_DIR:-$HOME/.config/forge}"
+FORGE_ENV_FILE="${FORGE_ENV_FILE:-$FORGE_ENV_DIR/v100_training.env}"
+
+mkdir -p "$SYSTEMD_USER_DIR" "$FORGE_ENV_DIR" "$PROJECT_ROOT/artifacts/logs"
+
+if [[ ! -f "$FORGE_ENV_FILE" ]]; then
+    cat >"$FORGE_ENV_FILE" <<'ENVEOF'
+# V100 runtime contract
+TRAIN_CONFIG=configs/models/turkcell_7b_v100_v3_ultrastable.yaml
+FALLBACK_CONFIG=configs/models/turkcell_7b_v100_v3b_fallback.yaml
+TARGET_STEPS=8601
+SAVE_STEPS=250
+ENABLE_RESUME=0
+RESUME_AFTER_STEP=500
+REQUIRE_WANDB=0
+CUDA_VISIBLE_DEVICES=0
+ENVEOF
+    chmod 600 "$FORGE_ENV_FILE"
+fi
+
+install -m 0644 \
+    "$PROJECT_ROOT/deploy/systemd/forge-v100-training.service" \
+    "$SYSTEMD_USER_DIR/forge-v100-training.service"
+install -m 0644 \
+    "$PROJECT_ROOT/deploy/systemd/forge-v100-watchdog.service" \
+    "$SYSTEMD_USER_DIR/forge-v100-watchdog.service"
+install -m 0644 \
+    "$PROJECT_ROOT/deploy/systemd/forge-v100-watchdog.timer" \
+    "$SYSTEMD_USER_DIR/forge-v100-watchdog.timer"
+
+chmod +x \
+    "$PROJECT_ROOT/scripts/start_v100_training.sh" \
+    "$PROJECT_ROOT/scripts/start_or_resume_full_training.sh" \
+    "$PROJECT_ROOT/scripts/monitor_v100_training.sh" \
+    "$PROJECT_ROOT/scripts/monitor_training.sh" \
+    "$PROJECT_ROOT/scripts/watchdog_training.sh" \
+    "$PROJECT_ROOT/scripts/run_v100_completion.sh"
+
+systemctl --user daemon-reload
+systemctl --user enable forge-v100-training.service
+systemctl --user enable --now forge-v100-watchdog.timer
+systemctl --user restart forge-v100-watchdog.service
+systemctl --user --no-pager --lines=20 status forge-v100-training.service || true
+systemctl --user --no-pager --lines=20 status forge-v100-watchdog.timer || true
+systemctl --user --no-pager --lines=20 status forge-v100-watchdog.service || true
+
+echo "V100 watchdog installed. Env file: $FORGE_ENV_FILE"
diff --git a/scripts/monitor_training.sh b/scripts/monitor_training.sh
new file mode 100755
index 0000000..292b30f
--- /dev/null
+++ b/scripts/monitor_training.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+cd "$PROJECT_ROOT"
+
+bash scripts/monitor_v100_training.sh
diff --git a/scripts/monitor_v100_training.sh b/scripts/monitor_v100_training.sh
new file mode 100755
index 0000000..91a68f3
--- /dev/null
+++ b/scripts/monitor_v100_training.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}"
+ACTIVE_RUN_FILE="${ACTIVE_RUN_FILE:-$PROJECT_ROOT/artifacts/logs/v100_active_run.env}"
+STATUS_FILE="${STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_monitor_status.txt}"
+
+if [[ -f "$FORGE_ENV_FILE" ]]; then
+    # shellcheck disable=SC1090
+    source "$FORGE_ENV_FILE"
+fi
+
+if [[ -f "$ACTIVE_RUN_FILE" ]]; then
+    # shellcheck disable=SC1090
+    source "$ACTIVE_RUN_FILE"
+fi
+
+cd "$PROJECT_ROOT"
+
+TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}"
+TARGET_STEPS="${TARGET_STEPS:-8601}"
+TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_full.log}"
+RUN_ID="${RUN_ID:-unknown}"
+
+abs_path() {
+    local path="$1"
+    if [[ "$path" = /* ]]; then
+        echo "$path"
+    else
+        echo "$PROJECT_ROOT/$path"
+    fi
+}
+
+LOG_FILE="$(abs_path "$TRAIN_LOG")"
+
+running="no"
+if pgrep -f "run_training.py --config ${TRAIN_CONFIG}" >/dev/null 2>&1 || pgrep -f "scripts/run_training.py" >/dev/null 2>&1; then
+    running="yes"
+fi
+
+log_start_line=1
+if [[ -f "$LOG_FILE" ]]; then
+    if [[ "$RUN_ID" != "unknown" ]]; then
+        marker_line="$(grep -a -n "forge-training-start run_id=${RUN_ID}" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)"
+    else
+        marker_line="$(grep -a -n "forge-training-start" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)"
+    fi
+    if [[ "$marker_line" =~ ^[0-9]+$ ]] && [[ "$marker_line" -gt 0 ]]; then
+        log_start_line=$((marker_line + 1))
+    fi
+fi
+
+progress="none"
+if [[ -f "$LOG_FILE" ]]; then
+    progress="$(tail -n +"$log_start_line" "$LOG_FILE" | grep -a -oE "[0-9]+/${TARGET_STEPS}" | tail -n 1 || true)"
+    [[ -z "$progress" ]] && progress="none"
+fi
+
+step=0
+if [[ "$progress" != "none" ]]; then
+    step="${progress%%/*}"
+fi
+
+percent=0
+if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$TARGET_STEPS" -gt 0 ]]; then
+    percent=$((step * 100 / TARGET_STEPS))
+fi
+
+nan_hits=0
+if [[ -f "$LOG_FILE" ]]; then
+    metric_nan_count="$(tail -n +"$log_start_line" "$LOG_FILE" | grep -a -E -i -c "'(loss|grad_norm|eval_loss|entropy)':[[:space:]]*'?(nan|inf)'?" || true)"
+    guard_nan_count="$(tail -n +"$log_start_line" "$LOG_FILE" | grep -a -E -c "nan_guard_detected|nan_guard_stopping_training" || true)"
+    metric_nan_count="${metric_nan_count:-0}"
+    guard_nan_count="${guard_nan_count:-0}"
+    nan_hits=$((metric_nan_count + guard_nan_count))
+fi
+
+gpu_line="$(nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total --format=csv,noheader 2>/dev/null | sed -n '1p' || true)"
+if [[ -z "$gpu_line" ]]; then
+    gpu_line="unknown"
+fi
+
+state="stopped"
+if [[ "$running" == "yes" ]]; then
+    state="running"
+fi
+if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$step" -ge "$TARGET_STEPS" ]]; then
+    state="completed"
+fi
+
+{
+    echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    echo "running=$running"
+    echo "step=$step"
+    echo "target_steps=$TARGET_STEPS"
+    echo "progress=$progress"
+    echo "percent=$percent"
+    echo "nan_hits=$nan_hits"
+    echo "run_segment_id=$RUN_ID"
+    echo "train_config=$TRAIN_CONFIG"
+    echo "log_file=$TRAIN_LOG"
+    echo "gpu=$gpu_line"
+    echo "state=$state"
+} >"$STATUS_FILE"
diff --git a/scripts/run_v100_completion.sh b/scripts/run_v100_completion.sh
new file mode 100644
index 0000000..aa3a160
--- /dev/null
+++ b/scripts/run_v100_completion.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}"
+MONITOR_STATUS_FILE="${MONITOR_STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_monitor_status.txt}"
+WATCHDOG_STATUS_FILE="${WATCHDOG_STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_watchdog_status.txt}"
+SUMMARY_FILE="${SUMMARY_FILE:-$PROJECT_ROOT/artifacts/logs/v100_completion_summary.md}"
+LOCK_DIR="${LOCK_DIR:-$PROJECT_ROOT/artifacts/logs/v100_completion.lock.d}"
+DONE_FILE="${DONE_FILE:-$PROJECT_ROOT/artifacts/logs/v100_completion.done}"
+UV_BIN="${UV_BIN:-$HOME/.local/bin/uv}"
+
+cd "$PROJECT_ROOT"
+
+if [[ -f "$DONE_FILE" ]]; then
+    exit 0
+fi
+
+if ! mkdir "$LOCK_DIR" 2>/dev/null; then
+    exit 0
+fi
+trap 'rmdir "$LOCK_DIR" >/dev/null 2>&1 || true' EXIT
+
+if [[ ! -f "$MONITOR_STATUS_FILE" ]]; then
+    exit 0
+fi
+
+target_steps="$(grep -E '^target_steps=' "$MONITOR_STATUS_FILE" | tail -n 1 | cut -d '=' -f 2 || true)"
+step="$(grep -E '^step=' "$MONITOR_STATUS_FILE" | tail -n 1 | cut -d '=' -f 2 || true)"
+state="$(grep -E '^state=' "$MONITOR_STATUS_FILE" | tail -n 1 | cut -d '=' -f 2 || true)"
+
+if [[ ! "$target_steps" =~ ^[0-9]+$ ]]; then
+    target_steps=8601
+fi
+if [[ ! "$step" =~ ^[0-9]+$ ]]; then
+    step=0
+fi
+
+if [[ "$state" != "completed" ]] && [[ "$step" -lt "$target_steps" ]]; then
+    exit 0
+fi
+
+# Load operator settings, then the active-run record, so the paths below follow the run that actually finished.
+for env_source in "$FORGE_ENV_FILE" "${ACTIVE_RUN_FILE:-$PROJECT_ROOT/artifacts/logs/v100_active_run.env}"; do
+    # shellcheck disable=SC1090
+    if [[ -f "$env_source" ]]; then source "$env_source"; fi
+done
+
+TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}"
+RUN_DIR="${RUN_DIR:-${TRAIN_RUN_DIR:-artifacts/training/turkcell-7b-sft-v100-v3-ultrastable}}"
+TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_turkcell_7b_v100_v3_ultrastable.log}"
+ADAPTER_DIR="${ADAPTER_DIR:-$RUN_DIR/final}"
+BASE_MODEL="${BASE_MODEL:-TURKCELL/Turkcell-LLM-7b-v1}"
+MERGED_OUTPUT="${MERGED_OUTPUT:-artifacts/merged/turkcell-7b-v100-v3-ultrastable}"
+EVAL_OUTPUT_ROOT="${EVAL_OUTPUT_ROOT:-artifacts/eval/turkcell-7b-v100-v3-ultrastable}"
+SERVE_CONFIG="${SERVE_CONFIG:-configs/serving/vllm_v100_v3_merged.yaml}"
+BENCHMARK_OUTPUT_DIR="${BENCHMARK_OUTPUT_DIR:-artifacts/benchmarks/turkcell-7b-v100-v3-ultrastable}"
+
+export UV_BIN
+export TRAIN_CONFIG
+export RUN_DIR
+export TRAIN_LOG
+export ADAPTER_DIR
+export BASE_MODEL
+export MERGED_OUTPUT
+export EVAL_OUTPUT_ROOT
+export SERVE_CONFIG
+export BENCHMARK_OUTPUT_DIR
+export PUSH_TO_HUB=0
+export AUTO_START_SERVE=1
+export FORGE_EXECUTION_CONTEXT=remote
+
+pipeline_status="success"
+pipeline_error=""
+if ! bash "$PROJECT_ROOT/scripts/post_training_pipeline.sh"; then
+    pipeline_status="failed"
+    pipeline_error="post_training_pipeline_failed"
+fi
+
+timestamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+watchdog_snapshot=""
+if [[ -f "$WATCHDOG_STATUS_FILE" ]]; then
+    watchdog_snapshot="$(cat "$WATCHDOG_STATUS_FILE")"
+fi
+
+mkdir -p "$(dirname "$SUMMARY_FILE")"
+cat >"$SUMMARY_FILE" <<EOF
+# V100 Completion Summary
+
+- completed_at_utc: ${timestamp}
+- final_step: ${step}
+- target_steps: ${target_steps}
+- pipeline_status: ${pipeline_status}
+- train_config: ${TRAIN_CONFIG}
+- run_dir: ${RUN_DIR}
+- adapter_dir: ${ADAPTER_DIR}
+- merged_output: ${MERGED_OUTPUT}
+- eval_outputs:
+  - ${EVAL_OUTPUT_ROOT}/mmlu_tr
+  - ${EVAL_OUTPUT_ROOT}/perplexity
+  - ${EVAL_OUTPUT_ROOT}/generation
+- serve_config: ${SERVE_CONFIG}
+- benchmark_output_dir: ${BENCHMARK_OUTPUT_DIR}
+
+## Watchdog Snapshot
+\`\`\`
+${watchdog_snapshot}
+\`\`\`
+EOF
+
+if [[ "$pipeline_status" == "success" ]]; then
+    touch "$DONE_FILE"
+else
+    {
+        echo
+        echo "## Error"
+        echo "${pipeline_error}"
+    } >>"$SUMMARY_FILE"
+    exit 1
+fi
diff --git a/scripts/start_or_resume_full_training.sh b/scripts/start_or_resume_full_training.sh
new file mode 100755
index 0000000..6ed9bcc
--- /dev/null
+++ b/scripts/start_or_resume_full_training.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}"
+PID_FILE="${PID_FILE:-$PROJECT_ROOT/artifacts/logs/training_full.pid}"
+TRAINING_SERVICE_NAME="${TRAINING_SERVICE_NAME:-forge-v100-training.service}"
+FORCE_RESTART="${FORCE_RESTART:-0}"
+
+if [[ -f "$FORGE_ENV_FILE" ]]; then
+    # shellcheck disable=SC1090
+    source "$FORGE_ENV_FILE"
+fi
+
+cd "$PROJECT_ROOT"
+TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}"
+RUN_PATTERN="run_training.py --config ${TRAIN_CONFIG}"
+
+mkdir -p "$(dirname "$PID_FILE")"
+
+if [[ "$FORCE_RESTART" != "1" ]] && systemctl --user is-active --quiet "$TRAINING_SERVICE_NAME"; then
+    echo "training_service_active"
+    systemctl --user --no-pager --lines=0 status "$TRAINING_SERVICE_NAME" | head -n 1 || true
+    exit 0
+fi
+
+if [[ "$FORCE_RESTART" != "1" ]] && pgrep -f "$RUN_PATTERN" >/dev/null 2>&1; then
+    echo "training_already_running"
+    pgrep -af "$RUN_PATTERN" | head -n 4
+    exit 0
+fi
+
+RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}"
+systemctl --user set-environment RUN_ID="$RUN_ID"
+if [[ "$FORCE_RESTART" == "1" ]]; then
+    systemctl --user restart "$TRAINING_SERVICE_NAME"
+else
+    systemctl --user start "$TRAINING_SERVICE_NAME"
+fi
+
+sleep 2
+service_state="$(systemctl --user is-active "$TRAINING_SERVICE_NAME" || true)"
+if [[ "$service_state" != "active" ]] && [[ "$service_state" != "activating" ]]; then
+    echo "training_service_failed"
+    systemctl --user --no-pager --lines=50 status "$TRAINING_SERVICE_NAME" || true
+    exit 1
+fi
+
+if pgrep -f "scripts/run_training.py" >/dev/null 2>&1; then
+    pgrep -f "scripts/run_training.py" | head -n 1 > "$PID_FILE"
+fi
+
+echo "service=$TRAINING_SERVICE_NAME"
+echo "service_state=$service_state"
+echo "run_id=$RUN_ID"
+echo "train_config=$TRAIN_CONFIG"
+echo "pid_file=$PID_FILE"
diff --git a/scripts/start_v100_training.sh b/scripts/start_v100_training.sh
new file mode 100755
index 0000000..c15aecc
--- /dev/null
+++ b/scripts/start_v100_training.sh
@@ -0,0 +1,167 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}"
+
+if [[ -f "$FORGE_ENV_FILE" ]]; then
+    # shellcheck disable=SC1090
+    source "$FORGE_ENV_FILE"
+fi
+
+cd "$PROJECT_ROOT"
+
+TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}"
+FALLBACK_CONFIG="${FALLBACK_CONFIG:-configs/models/turkcell_7b_v100_v3b_fallback.yaml}"
+TARGET_STEPS="${TARGET_STEPS:-8601}"
+SAVE_STEPS="${SAVE_STEPS:-250}"
+ENABLE_RESUME="${ENABLE_RESUME:-0}"
+REQUIRE_WANDB="${REQUIRE_WANDB:-0}"
+RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}"
+CONFIG_BASENAME="$(basename "$TRAIN_CONFIG")"
+CONFIG_SLUG="${CONFIG_BASENAME%.*}"
+TRAIN_LOG_DIR="${TRAIN_LOG_DIR:-artifacts/logs}"
+ACTIVE_RUN_FILE="${ACTIVE_RUN_FILE:-artifacts/logs/v100_active_run.env}"
+STATUS_FILE="${STATUS_FILE:-artifacts/logs/training_monitor_status.txt}"
+PID_FILE="${PID_FILE:-artifacts/logs/training_full.pid}"
+UV_BIN="${UV_BIN:-$HOME/.local/bin/uv}"
+HF_HOME_DIR="${HF_HOME_DIR:-$PROJECT_ROOT/.hf_cache}"
+HF_DATASETS_CACHE_DIR="${HF_DATASETS_CACHE_DIR:-$HF_HOME_DIR/datasets}"
+HF_HUB_CACHE_DIR="${HF_HUB_CACHE_DIR:-$HF_HOME_DIR/hub}"
+CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
+
+if [[ -z "${TRAIN_RUN_DIR:-}" ]]; then
+    case "$TRAIN_CONFIG" in
+        *turkcell_7b_v100_v3b_fallback.yaml)
+            TRAIN_RUN_DIR="artifacts/training/turkcell-7b-sft-v100-v3b-fallback"
+            ;;
+        *turkcell_7b_v100_v3_ultrastable.yaml)
+            TRAIN_RUN_DIR="artifacts/training/turkcell-7b-sft-v100-v3-ultrastable"
+            ;;
+        *turkcell_7b_v100_v2b_fallback.yaml)
+            TRAIN_RUN_DIR="artifacts/training/turkcell-7b-sft-v100-v2b-fallback"
+            ;;
+        *)
+            TRAIN_RUN_DIR="artifacts/training/turkcell-7b-sft-v100-v2-stable"
+            ;;
+    esac
+fi
+
+TRAIN_LOG="${TRAIN_LOG:-$TRAIN_LOG_DIR/training_${CONFIG_SLUG}_${RUN_ID}.log}"
+
+abs_path() {
+    local path="$1"
+    if [[ "$path" = /* ]]; then
+        echo "$path"
+    else
+        echo "$PROJECT_ROOT/$path"
+    fi
+}
+
+TRAIN_RUN_DIR_ABS="$(abs_path "$TRAIN_RUN_DIR")"
+TRAIN_LOG_ABS="$(abs_path "$TRAIN_LOG")"
+ACTIVE_RUN_FILE_ABS="$(abs_path "$ACTIVE_RUN_FILE")"
+PID_FILE_ABS="$(abs_path "$PID_FILE")"
+
+mkdir -p \
+    "$(dirname "$TRAIN_RUN_DIR_ABS")" \
+    "$(dirname "$TRAIN_LOG_ABS")" \
+    "$(dirname "$ACTIVE_RUN_FILE_ABS")" \
+    "$(dirname "$PID_FILE_ABS")" \
+    "$HF_HOME_DIR" \
+    "$HF_DATASETS_CACHE_DIR" \
+    "$HF_HUB_CACHE_DIR"
+
+if [[ ! -x "$UV_BIN" ]]; then
+    echo "UV executable not found: $UV_BIN" >&2
+    exit 1
+fi
+
+if [[ "$REQUIRE_WANDB" == "1" ]] && [[ -z "${WANDB_API_KEY:-}" ]]; then
+    echo "WANDB_API_KEY is required for this run (REQUIRE_WANDB=1)." >&2
+    exit 1
+fi
+
+find_latest_checkpoint() {
+    if [[ ! -d "$TRAIN_RUN_DIR_ABS" ]]; then
+        return
+    fi
+
+    local current_step=""
+    if [[ -f "$STATUS_FILE" ]]; then
+        current_step="$(grep -E '^step=' "$STATUS_FILE" | tail -n 1 | cut -d '=' -f 2 || true)"
+    fi
+
+    if [[ "$current_step" =~ ^[0-9]+$ ]] && [[ "$current_step" -gt 0 ]]; then
+        local filtered_checkpoint
+        filtered_checkpoint="$(find "$TRAIN_RUN_DIR_ABS" -maxdepth 1 -type d -name 'checkpoint-*' 2>/dev/null \
+            | sed -E 's#(.*checkpoint-)([0-9]+)$#\2 \1\2#' \
+            | awk -v s="$current_step" '$1 <= s' \
+            | sort -n \
+            | tail -n 1 \
+            | cut -d ' ' -f 2- || true)"
+        if [[ -n "$filtered_checkpoint" ]]; then
+            echo "$filtered_checkpoint"
+            return
+        fi
+    fi
+
+    find "$TRAIN_RUN_DIR_ABS" -maxdepth 1 -type d -name 'checkpoint-*' | sort -V | tail -n 1
+}
+
+resume_from=""
+if [[ "$ENABLE_RESUME" == "1" ]]; then
+    latest_checkpoint="$(find_latest_checkpoint || true)"
+    if [[ -n "$latest_checkpoint" ]]; then
+        resume_from="$latest_checkpoint"
+    fi
+fi
+
+cmd=("$UV_BIN" "run" "python" "scripts/run_training.py" "--config" "$TRAIN_CONFIG")
+if [[ -n "$resume_from" ]]; then
+    cmd+=("--resume-from" "$resume_from")
+fi
+
+ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+{
+    echo "[$ts] forge-training-start run_id=$RUN_ID"
+    echo "run_id=$RUN_ID"
+    echo "train_config=$TRAIN_CONFIG"
+    echo "fallback_config=$FALLBACK_CONFIG"
+    echo "target_steps=$TARGET_STEPS"
+    echo "save_steps=$SAVE_STEPS"
+    echo "train_run_dir=$TRAIN_RUN_DIR"
+    echo "train_log=$TRAIN_LOG"
+    echo "resume_from=${resume_from:-none}"
+    echo "enable_resume=$ENABLE_RESUME"
+    echo "require_wandb=$REQUIRE_WANDB"
+    echo "command=${cmd[*]}"
+} >>"$TRAIN_LOG_ABS"
+
+{
+    echo "RUN_ID=$RUN_ID"
+    echo "TRAIN_CONFIG=$TRAIN_CONFIG"
+    echo "FALLBACK_CONFIG=$FALLBACK_CONFIG"
+    echo "TARGET_STEPS=$TARGET_STEPS"
+    echo "SAVE_STEPS=$SAVE_STEPS"
+    echo "TRAIN_RUN_DIR=$TRAIN_RUN_DIR"
+    echo "TRAIN_LOG=$TRAIN_LOG"
+    echo "RESUME_FROM=${resume_from:-}"
+    echo "ENABLE_RESUME=$ENABLE_RESUME"
+    echo "REQUIRE_WANDB=$REQUIRE_WANDB"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+} >"$ACTIVE_RUN_FILE_ABS"
+
+echo "$$" >"$PID_FILE_ABS"
+
+exec > >(tee -a "$TRAIN_LOG_ABS") 2>&1
+
+echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] forge-training-exec run_id=$RUN_ID"
+
+exec env \
+    FORGE_EXECUTION_CONTEXT=remote \
+    HF_HOME="$HF_HOME_DIR" \
+    HF_DATASETS_CACHE="$HF_DATASETS_CACHE_DIR" \
+    HUGGINGFACE_HUB_CACHE="$HF_HUB_CACHE_DIR" \
+    CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" \
+    "${cmd[@]}"
diff --git a/scripts/watchdog_training.sh b/scripts/watchdog_training.sh
new file mode 100644
index 0000000..2829fa0
--- /dev/null
+++ b/scripts/watchdog_training.sh
@@ -0,0 +1,271 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Watchdog locations and names; each default can be overridden from the environment.
+PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}"
+FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}"
+ACTIVE_RUN_FILE="${ACTIVE_RUN_FILE:-$PROJECT_ROOT/artifacts/logs/v100_active_run.env}"
+STATUS_FILE="${STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_watchdog_status.txt}"
+STATE_FILE="${STATE_FILE:-$PROJECT_ROOT/artifacts/logs/training_watchdog_state.env}"
+MONITOR_STATUS_FILE="${MONITOR_STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_monitor_status.txt}"
+TRAINING_SERVICE_NAME="${TRAINING_SERVICE_NAME:-forge-v100-training.service}"  # NOTE(review): v100 default in an a100 patch series — confirm
+COMPLETION_SCRIPT="${COMPLETION_SCRIPT:-$PROJECT_ROOT/scripts/run_v100_completion.sh}"
+# Load optional env files as shell code; the active-run file is sourced last, so its values win.
+if [[ -f "$FORGE_ENV_FILE" ]]; then
+    # shellcheck disable=SC1090
+    source "$FORGE_ENV_FILE"
+fi
+if [[ -f "$ACTIVE_RUN_FILE" ]]; then
+    # shellcheck disable=SC1090
+    source "$ACTIVE_RUN_FILE"
+fi
+# Work from the project root.
+cd "$PROJECT_ROOT"
+# Run settings: values from the environment or the sourced files take precedence over these defaults.
+TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}"
+FALLBACK_CONFIG="${FALLBACK_CONFIG:-configs/models/turkcell_7b_v100_v3b_fallback.yaml}"
+TARGET_STEPS="${TARGET_STEPS:-8601}"
+MAX_IDLE_SECONDS="${MAX_IDLE_SECONDS:-5400}"  # max seconds without step progress
+MAX_LOG_STALE_SECONDS="${MAX_LOG_STALE_SECONDS:-900}"  # max seconds since the log was modified
+RESUME_AFTER_STEP="${RESUME_AFTER_STEP:-500}"  # step at which resume gets enabled
+RUN_ID="${RUN_ID:-unknown}"
+TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_full.log}"
+TOPIC="${FORGE_NOTIFY_TOPIC:-weezboo-forge-training}"
+NTFY_URL="${FORGE_NOTIFY_URL:-https://ntfy.sh/${TOPIC}}"
+
+abs_path() {
+    local path="$1"
+    if [[ "$path" = /* ]]; then
+        echo "$path"
+    else
+        echo "$PROJECT_ROOT/$path"
+    fi
+}
+
+# upsert_env KEY VALUE FILE: replace the KEY= line in FILE, or append one; VALUE is written literally.
+upsert_env() {
+    local key="$1" value="$2" file="$3" escaped
+    mkdir -p "$(dirname "$file")"
+    if [[ -f "$file" ]] && grep -q "^${key}=" "$file"; then
+        escaped="$(printf '%s' "$value" | sed -e 's/[\\&#]/\\&/g')"  # protect \, & and the # delimiter
+        sed -i "s#^${key}=.*#${key}=${escaped}#" "$file"
+    else
+        echo "${key}=${value}" >>"$file"
+    fi
+}
+
+send_notify() {
+    local title="$1"
+    local body="$2"
+    local tags="${3:-warning}"
+    curl -fsS -m 20 \
+        -H "Title: ${title}" \
+        -H "Tags: ${tags}" \
+        -H "Priority: high" \
+        -d "$body" \
+        "$NTFY_URL" >/dev/null || true
+}
+# Resolve the configured paths to absolute form and make sure the status directory exists.
+LOG_FILE="$(abs_path "$TRAIN_LOG")"
+STATUS_FILE_ABS="$(abs_path "$STATUS_FILE")"
+STATE_FILE_ABS="$(abs_path "$STATE_FILE")"
+FORGE_ENV_FILE_ABS="$(abs_path "$FORGE_ENV_FILE")"
+MONITOR_STATUS_FILE_ABS="$(abs_path "$MONITOR_STATUS_FILE")"
+mkdir -p "$(dirname "$STATUS_FILE_ABS")"
+# Initial watchdog state; replaced below when a saved state file exists.
+fallback_applied=0
+fatal_stopped=0
+last_step=0
+last_step_ts=0
+nan_hits=0
+resume_armed=0
+active_profile="primary"
+last_nan_signature=""
+posttrain_triggered=0
+# Restore the state saved by the previous run; a failing source is ignored.
+if [[ -f "$STATE_FILE_ABS" ]]; then
+    # shellcheck disable=SC1090
+    source "$STATE_FILE_ABS" || true
+fi
+# Only log lines after the latest start marker (matched on RUN_ID when it is known) are inspected.
+segment_start_line=1
+if [[ -f "$LOG_FILE" ]]; then
+    if [[ "$RUN_ID" != "unknown" ]]; then
+        marker_line="$(grep -a -n "forge-training-start run_id=${RUN_ID}" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)"
+    else
+        marker_line="$(grep -a -n "forge-training-start" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)"
+    fi
+    if [[ "$marker_line" =~ ^[0-9]+$ ]] && [[ "$marker_line" -gt 0 ]]; then
+        segment_start_line=$((marker_line + 1))
+    fi
+fi
+# Last "<n>/<TARGET_STEPS>" token found in that segment, or "none" when there is no match.
+progress="none"
+if [[ -f "$LOG_FILE" ]]; then
+    progress="$(tail -n +"$segment_start_line" "$LOG_FILE" | grep -a -oE "[0-9]+/${TARGET_STEPS}" | tail -n 1 || true)"
+    [[ -z "$progress" ]] && progress="none"
+fi
+# Current step is the part of the token before the slash; 0 when no token was found.
+step=0
+if [[ "$progress" != "none" ]]; then
+    step="${progress%%/*}"
+fi
+# The unit counts as running when systemd reports it as active or activating.
+service_state="$(systemctl --user is-active "$TRAINING_SERVICE_NAME" 2>/dev/null || true)"
+running="no"
+if [[ "$service_state" == "active" ]] || [[ "$service_state" == "activating" ]]; then
+    running="yes"
+fi
+# Seconds since the log was last modified; the mtime is taken as 0 when the file is missing or stat fails.
+now_epoch="$(date +%s)"
+if [[ -f "$LOG_FILE" ]]; then
+    log_mtime_epoch="$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0)"
+else
+    log_mtime_epoch=0
+fi
+log_age="$((now_epoch - log_mtime_epoch))"
+# Remember when the step last advanced; seed the timestamp the first time through.
+if [[ "$step" -gt "$last_step" ]]; then
+    last_step="$step"
+    last_step_ts="$now_epoch"
+elif [[ "$last_step_ts" -eq 0 ]]; then
+    last_step_ts="$now_epoch"
+fi
+# Most recent line in the segment that reports a NaN/Inf metric or a nan_guard event.
+latest_bad_line=""
+if [[ -f "$LOG_FILE" ]]; then
+    latest_bad_line="$(tail -n +"$segment_start_line" "$LOG_FILE" | grep -a -E -i "'(loss|grad_norm|eval_loss|entropy)':[[:space:]]*'?(nan|inf)'?|nan_guard_detected|nan_guard_stopping_training" | tail -n 1 || true)"
+fi
+# Hash the line and compare with the saved signature so the same line is not counted again later.
+new_nan_event="no"
+if [[ -n "$latest_bad_line" ]]; then
+    signature="$(printf '%s' "$latest_bad_line" | sha256sum | awk '{print $1}')"
+    if [[ "$signature" != "$last_nan_signature" ]]; then
+        last_nan_signature="$signature"
+        nan_hits=$((nan_hits + 1))
+        new_nan_event="yes"
+    fi
+fi
+# Stop the training unit, pkill anything whose command line matches the trainer, then pause briefly.
+stop_training() {
+    systemctl --user stop "$TRAINING_SERVICE_NAME" &>/dev/null || true
+    pkill -f "scripts/run_training.py --config" &>/dev/null || true
+    sleep 2
+}
+
+# start_service_with_config CFG RESUME_FLAG: persist both to the env file, then start the unit with a new RUN_ID.
+start_service_with_config() {
+    local config_path="$1" resume_flag="$2" segment_id
+    # Written to the env file before the unit is started.
+    upsert_env "TRAIN_CONFIG" "$config_path" "$FORGE_ENV_FILE_ABS"
+    upsert_env "ENABLE_RESUME" "$resume_flag" "$FORGE_ENV_FILE_ABS"
+
+    # A UTC timestamp serves as the run id handed to the systemd user manager.
+    segment_id="$(date -u +%Y%m%dT%H%M%SZ)"
+    systemctl --user set-environment RUN_ID="$segment_id"
+    systemctl --user start "$TRAINING_SERVICE_NAME"
+
+    # Mirror the new values into this shell's globals.
+    RUN_ID="$segment_id"; TRAIN_CONFIG="$config_path"
+}
+
+action="none"
+message="ok"
+
+if [[ "$fatal_stopped" -eq 1 ]]; then
+    action="fatal_locked"
+    message="watchdog_fatal_stop_active"
+else
+    if [[ "$step" -ge "$TARGET_STEPS" ]]; then
+        action="completed"
+        message="target_step_reached"
+        if [[ "$posttrain_triggered" -eq 0 ]] && [[ -x "$COMPLETION_SCRIPT" ]]; then
+            if bash "$COMPLETION_SCRIPT"; then
+                posttrain_triggered=1
+                action="posttrain_triggered"
+                message="completion_pipeline_started"
+            else
+                action="posttrain_failed"
+                message="completion_pipeline_failed"
+                send_notify "LLM v100 completion failed" "Post-training pipeline failed after step completion." "warning,rotating_light"
+            fi
+        fi
+    elif [[ "$new_nan_event" == "yes" ]]; then
+        if [[ "$fallback_applied" -eq 0 ]] && [[ "$active_profile" != "fallback" ]]; then
+            stop_training
+            start_service_with_config "$FALLBACK_CONFIG" "0"
+            fallback_applied=1
+            active_profile="fallback"
+            nan_hits=0
+            action="fallback_restart"
+            message="nan detected on primary; fallback started"
+            send_notify "LLM v100 fallback restart" "NaN/Inf detected on primary profile. Restarted with fallback config." "warning,repeat"
+            last_step_ts="$now_epoch"
+        else
+            stop_training
+            fatal_stopped=1
+            action="fatal_stop_after_fallback"
+            message="nan detected on fallback; training stopped"
+            send_notify "LLM v100 stopped" "NaN/Inf detected on fallback. Fatal lock enabled." "rotating_light,warning"
+        fi
+    else
+        idle_seconds="$((now_epoch - last_step_ts))"
+        if [[ "$running" == "yes" ]] && [[ "$idle_seconds" -ge "$MAX_IDLE_SECONDS" ]] && [[ "$log_age" -ge "$MAX_LOG_STALE_SECONDS" ]]; then
+            stop_training
+            if [[ "$active_profile" == "fallback" ]]; then
+                start_service_with_config "$FALLBACK_CONFIG" "0"
+            else
+                start_service_with_config "$TRAIN_CONFIG" "0"
+            fi
+            action="restart_stalled"
+            message="stalled run restarted"
+            send_notify "LLM v100 stalled restart" "No progress for ${idle_seconds}s; service restarted." "warning,repeat"
+            last_step_ts="$now_epoch"
+        elif [[ "$running" == "no" ]] && [[ "$step" -lt "$TARGET_STEPS" ]]; then
+            if [[ "$active_profile" == "fallback" ]]; then
+                start_service_with_config "$FALLBACK_CONFIG" "0"
+            else
+                start_service_with_config "$TRAIN_CONFIG" "0"
+            fi
+            action="restart_down"
+            message="service down; restarted"
+            send_notify "LLM v100 restarted" "Training service was down and restarted." "warning,repeat"
+            last_step_ts="$now_epoch"
+        fi
+    fi
+fi
+# Arm resume once: when RESUME_AFTER_STEP is reached with zero NaN hits, set ENABLE_RESUME=1 in the env file.
+if [[ "$resume_armed" -eq 0 ]] && [[ "$step" -ge "$RESUME_AFTER_STEP" ]] && [[ "$nan_hits" -eq 0 ]]; then
+    upsert_env "ENABLE_RESUME" "1" "$FORGE_ENV_FILE_ABS"
+    resume_armed=1
+fi
+# Run the monitor script; its failure never fails the watchdog.
+bash "$PROJECT_ROOT/scripts/monitor_v100_training.sh" || true
+# Snapshot of this tick, overwritten on every run.
+{
+    echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    echo "running=$running"
+    echo "step=$step"
+    echo "target_steps=$TARGET_STEPS"
+    echo "run_segment_id=$RUN_ID"
+    echo "active_profile=$active_profile"
+    echo "nan_hits=$nan_hits"
+    echo "fallback_applied=$fallback_applied"
+    echo "fatal_stopped=$fatal_stopped"
+    echo "resume_armed=$resume_armed"
+    echo "posttrain_triggered=$posttrain_triggered"
+    echo "log_age_seconds=$log_age"
+    echo "action=$action"
+    echo "message=$message"
+} > "$STATUS_FILE_ABS"
+# State file that the next run sources at startup; values must stay valid shell assignments.
+{
+    echo "fallback_applied=$fallback_applied"
+    echo "fatal_stopped=$fatal_stopped"
+    echo "last_step=$last_step"
+    echo "last_step_ts=$last_step_ts"
+    echo "nan_hits=$nan_hits"
+    echo "resume_armed=$resume_armed"
+    echo "active_profile=$active_profile"
+    echo "last_nan_signature=$last_nan_signature"
+    echo "posttrain_triggered=$posttrain_triggered"
+} > "$STATE_FILE_ABS"