From 0f9b246d7fcfe9f1a85444e5be51d73ea765e2c6 Mon Sep 17 00:00:00 2001 From: Ogulcan Aydogan Date: Fri, 6 Mar 2026 09:08:24 +0000 Subject: [PATCH 1/3] a100-runtime-reconcile: add managed v8 stable-reset flow --- .../turkcell_7b_a100_v5_recovery_low_lr.yaml | 25 ++ ...urkcell_7b_a100_v6_recovery_reset_opt.yaml | 14 + .../turkcell_7b_a100_v7_balanced_stable.yaml | 12 + .../turkcell_7b_a100_v8_stable_reset.yaml | 17 + ...ell_7b_a100_v8b_ultra_stable_fallback.yaml | 10 + configs/serving/vllm_a100_v6_merged.yaml | 13 + configs/serving/vllm_a100_v8_merged.yaml | 13 + deploy/systemd/forge-posttrain.path | 11 + deploy/systemd/forge-posttrain.service | 16 + deploy/systemd/forge-training-monitor.service | 19 ++ .../systemd/forge-training-watchdog.service | 18 ++ deploy/systemd/forge-training.service | 18 ++ scripts/generate_training_manifest.py | 122 ++++++++ scripts/install_training_services.sh | 81 +++++ scripts/monitor_a100_training.sh | 230 ++++++++++++++ scripts/post_training_pipeline.sh | 174 ++++++++++ scripts/run_posttrain_if_complete.sh | 176 +++++++++++ scripts/start_a100_training.sh | 102 ++++++ scripts/training_watchdog.py | 296 ++++++++++++++++++ src/forge/training/callbacks.py | 96 +++++- src/forge/training/trainer.py | 67 +++- src/forge/utils/config.py | 2 + 22 files changed, 1514 insertions(+), 18 deletions(-) create mode 100644 configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml create mode 100644 configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml create mode 100644 configs/models/turkcell_7b_a100_v7_balanced_stable.yaml create mode 100644 configs/models/turkcell_7b_a100_v8_stable_reset.yaml create mode 100644 configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml create mode 100644 configs/serving/vllm_a100_v6_merged.yaml create mode 100644 configs/serving/vllm_a100_v8_merged.yaml create mode 100644 deploy/systemd/forge-posttrain.path create mode 100644 deploy/systemd/forge-posttrain.service create mode 100644 
deploy/systemd/forge-training-monitor.service create mode 100644 deploy/systemd/forge-training-watchdog.service create mode 100644 deploy/systemd/forge-training.service create mode 100755 scripts/generate_training_manifest.py create mode 100755 scripts/install_training_services.sh create mode 100755 scripts/monitor_a100_training.sh create mode 100644 scripts/post_training_pipeline.sh create mode 100755 scripts/run_posttrain_if_complete.sh create mode 100755 scripts/start_a100_training.sh create mode 100755 scripts/training_watchdog.py diff --git a/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml b/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml new file mode 100644 index 0000000..f650172 --- /dev/null +++ b/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml @@ -0,0 +1,25 @@ +# Turkcell-7B A100 recovery profile after NaN stop. +_base: "./turkcell_7b.yaml" + +model: + max_seq_length: 2048 + +data: + train_path: "data/processed/turkish_sft_v3_clean.jsonl" + eval_path: "data/processed/turkish_eval.jsonl" + +training: + num_epochs: 1 + learning_rate: 2.0e-5 + lr_scheduler_type: "cosine" + warmup_ratio: 0.05 + max_grad_norm: 1.0 + per_device_train_batch_size: 8 + gradient_accumulation_steps: 2 + eval_steps: 500 + save_steps: 500 + fp16: false + bf16: true + +wandb: + run_name: "turkcell-7b-sft-v5-a100-bf16-recovery-low-lr" diff --git a/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml b/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml new file mode 100644 index 0000000..0b57077 --- /dev/null +++ b/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml @@ -0,0 +1,14 @@ +# Turkcell-7B A100 recovery profile with optimizer reset. +# Use adapter warm-start from checkpoint-500 without resuming optimizer state. 
+_base: "./turkcell_7b_a100_v5_recovery_low_lr.yaml" + +training: + learning_rate: 1.0e-5 + warmup_ratio: 0.08 + max_grad_norm: 0.3 + eval_steps: 200 + save_steps: 100 + adapter_init_path: "artifacts/training/turkcell-7b-sft-v3-a100-bf16-stable/checkpoint-500" + +wandb: + run_name: "turkcell-7b-sft-v6-a100-bf16-recovery-reset-opt" diff --git a/configs/models/turkcell_7b_a100_v7_balanced_stable.yaml b/configs/models/turkcell_7b_a100_v7_balanced_stable.yaml new file mode 100644 index 0000000..16d2d40 --- /dev/null +++ b/configs/models/turkcell_7b_a100_v7_balanced_stable.yaml @@ -0,0 +1,12 @@ +# Turkcell-7B A100 balanced-stable profile. +# Goal: reduce NaN risk without excessive eval/checkpoint overhead. +_base: "./turkcell_7b_a100_v5_recovery_low_lr.yaml" + +training: + learning_rate: 1.0e-5 + max_grad_norm: 0.3 + eval_steps: 500 + save_steps: 250 + +wandb: + run_name: "turkcell-7b-sft-v7-a100-bf16-balanced-stable" diff --git a/configs/models/turkcell_7b_a100_v8_stable_reset.yaml b/configs/models/turkcell_7b_a100_v8_stable_reset.yaml new file mode 100644 index 0000000..a8c2623 --- /dev/null +++ b/configs/models/turkcell_7b_a100_v8_stable_reset.yaml @@ -0,0 +1,17 @@ +# Turkcell-7B A100 stable-reset profile. +# Resume from v7 checkpoint weights via adapter_init_path only. +# Do not resume optimizer/scheduler state. 
+_base: "./turkcell_7b_a100_v7_balanced_stable.yaml" + +training: + learning_rate: 5.0e-6 + max_grad_norm: 0.3 + warmup_ratio: 0.10 + eval_steps: 500 + save_steps: 250 + fp16: false + bf16: true + adapter_init_path: "artifacts/training/turkcell-7b-sft-v7-a100-bf16-balanced-stable/checkpoint-1000" + +wandb: + run_name: "turkcell-7b-sft-v8-a100-bf16-stable-reset" diff --git a/configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml b/configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml new file mode 100644 index 0000000..f7c0619 --- /dev/null +++ b/configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml @@ -0,0 +1,10 @@ +# Turkcell-7B A100 ultra-stable fallback profile. +# Use only when v8 fails the first 300-step stability gate. +_base: "./turkcell_7b_a100_v8_stable_reset.yaml" + +training: + learning_rate: 3.0e-6 + max_grad_norm: 0.2 + +wandb: + run_name: "turkcell-7b-sft-v8b-a100-bf16-ultra-stable-fallback" diff --git a/configs/serving/vllm_a100_v6_merged.yaml b/configs/serving/vllm_a100_v6_merged.yaml new file mode 100644 index 0000000..5a3b0bd --- /dev/null +++ b/configs/serving/vllm_a100_v6_merged.yaml @@ -0,0 +1,13 @@ +# vLLM serving configuration for A100 merged v6 model + +model_path: "artifacts/merged/turkcell-7b-a100-v6-recovery-reset-opt" +host: "0.0.0.0" +port: 18020 +tensor_parallel_size: 1 +gpu_memory_utilization: 0.85 +max_model_len: 4096 +dtype: "float16" +enable_prefix_caching: true +trust_remote_code: false +enforce_eager: false +max_num_seqs: 64 diff --git a/configs/serving/vllm_a100_v8_merged.yaml b/configs/serving/vllm_a100_v8_merged.yaml new file mode 100644 index 0000000..bcf4121 --- /dev/null +++ b/configs/serving/vllm_a100_v8_merged.yaml @@ -0,0 +1,13 @@ +# vLLM serving configuration for A100 merged v8 model + +model_path: "artifacts/merged/turkcell-7b-a100-v8-stable-reset" +host: "0.0.0.0" +port: 18030 +tensor_parallel_size: 1 +gpu_memory_utilization: 0.85 +max_model_len: 4096 +dtype: "float16" +enable_prefix_caching: 
true +trust_remote_code: false +enforce_eager: false +max_num_seqs: 64 diff --git a/deploy/systemd/forge-posttrain.path b/deploy/systemd/forge-posttrain.path new file mode 100644 index 0000000..cfcd1be --- /dev/null +++ b/deploy/systemd/forge-posttrain.path @@ -0,0 +1,11 @@ +[Unit] +Description=Watch training status changes and trigger post-training pipeline +After=forge-training-monitor.service +Wants=forge-training-monitor.service + +[Path] +PathModified=%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_status_a100.txt +Unit=forge-posttrain.service + +[Install] +WantedBy=default.target diff --git a/deploy/systemd/forge-posttrain.service b/deploy/systemd/forge-posttrain.service new file mode 100644 index 0000000..3bbcf0d --- /dev/null +++ b/deploy/systemd/forge-posttrain.service @@ -0,0 +1,16 @@ +[Unit] +Description=LowResource-LLM-Forge Post-Training Pipeline Trigger +After=forge-training-monitor.service forge-training.service +Wants=forge-training-monitor.service + +[Service] +Type=oneshot +WorkingDirectory=%h/projects/LowResource-LLM-Forge +Environment=PYTHONUNBUFFERED=1 +EnvironmentFile=-%h/.config/forge/training.env +ExecStart=%h/projects/LowResource-LLM-Forge/scripts/run_posttrain_if_complete.sh +StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/posttrain_v8.log +StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/posttrain_v8.log + +[Install] +WantedBy=default.target diff --git a/deploy/systemd/forge-training-monitor.service b/deploy/systemd/forge-training-monitor.service new file mode 100644 index 0000000..92d1a9b --- /dev/null +++ b/deploy/systemd/forge-training-monitor.service @@ -0,0 +1,19 @@ +[Unit] +Description=LowResource-LLM-Forge Training Progress Monitor +After=forge-training.service +Wants=forge-training.service +PartOf=forge-training.service + +[Service] +Type=simple +WorkingDirectory=%h/projects/LowResource-LLM-Forge +Environment=PYTHONUNBUFFERED=1 
+EnvironmentFile=-%h/.config/forge/training.env +ExecStart=%h/projects/LowResource-LLM-Forge/scripts/monitor_a100_training.sh +Restart=on-failure +RestartSec=20 +StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log +StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log + +[Install] +WantedBy=default.target diff --git a/deploy/systemd/forge-training-watchdog.service b/deploy/systemd/forge-training-watchdog.service new file mode 100644 index 0000000..4032595 --- /dev/null +++ b/deploy/systemd/forge-training-watchdog.service @@ -0,0 +1,18 @@ +[Unit] +Description=LowResource-LLM-Forge Training Watchdog +After=forge-training.service +Wants=forge-training.service + +[Service] +Type=simple +WorkingDirectory=%h/projects/LowResource-LLM-Forge +Environment=PYTHONUNBUFFERED=1 +EnvironmentFile=-%h/.config/forge/training.env +ExecStart=%h/projects/LowResource-LLM-Forge/scripts/training_watchdog.py --service forge-training.service +Restart=always +RestartSec=10 +StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log +StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log + +[Install] +WantedBy=default.target diff --git a/deploy/systemd/forge-training.service b/deploy/systemd/forge-training.service new file mode 100644 index 0000000..188040f --- /dev/null +++ b/deploy/systemd/forge-training.service @@ -0,0 +1,18 @@ +[Unit] +Description=LowResource-LLM-Forge A100 Training +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=%h/projects/LowResource-LLM-Forge +Environment=PYTHONUNBUFFERED=1 +EnvironmentFile=-%h/.config/forge/training.env +ExecStart=%h/projects/LowResource-LLM-Forge/scripts/start_a100_training.sh +Restart=on-failure +RestartSec=20 +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=default.target diff --git 
a/scripts/generate_training_manifest.py b/scripts/generate_training_manifest.py new file mode 100755 index 0000000..de8e620 --- /dev/null +++ b/scripts/generate_training_manifest.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +"""Generate a deterministic training manifest for a completed run.""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import re +import subprocess +from datetime import UTC, datetime +from pathlib import Path + +from forge.utils.config import load_training_config + +TIMESTAMP_RE = re.compile(r"^(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)") + + +def _utc_now() -> str: + return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def _sha256_file(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def _line_count(path: Path) -> int: + if not path.exists(): + return 0 + with path.open("rb") as handle: + return sum(1 for _ in handle) + + +def _git_commit() -> str: + proc = subprocess.run( + ["git", "rev-parse", "HEAD"], + check=False, + capture_output=True, + text=True, + ) + return proc.stdout.strip() if proc.returncode == 0 else "unknown" + + +def _extract_log_times(log_file: Path) -> tuple[str, str]: + if not log_file.exists(): + return "unknown", "unknown" + + start_ts = "unknown" + end_ts = "unknown" + with log_file.open(encoding="utf-8", errors="ignore") as handle: + for line in handle: + if "training_started" in line and start_ts == "unknown": + match = TIMESTAMP_RE.match(line.strip()) + if match: + start_ts = match.group(1) + if "training_complete" in line or "Training complete. 
Adapter saved to" in line: + match = TIMESTAMP_RE.match(line.strip()) + if match: + end_ts = match.group(1) + return start_ts, end_ts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate training manifest JSON.") + parser.add_argument("--config", required=True, help="Training config path.") + parser.add_argument("--run-dir", required=True, help="Training run directory.") + parser.add_argument("--log-file", required=True, help="Training log file path.") + parser.add_argument( + "--output", + default=None, + help="Output manifest path (defaults to /manifest.json).", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + config_path = Path(args.config).resolve() + run_dir = Path(args.run_dir).resolve() + log_file = Path(args.log_file).resolve() + output_path = Path(args.output).resolve() if args.output else run_dir / "manifest.json" + + cfg = load_training_config(config_path) + train_path = Path(cfg.train_data_path).resolve() + eval_path = Path(cfg.eval_data_path).resolve() + + final_dir = run_dir / "final" + checkpoints = sorted(p.name for p in run_dir.glob("checkpoint-*") if p.is_dir()) + start_ts, end_ts = _extract_log_times(log_file) + + manifest = { + "created_utc": _utc_now(), + "git_commit": _git_commit(), + "config_path": str(config_path), + "config_sha256": _sha256_file(config_path), + "run_dir": str(run_dir), + "log_file": str(log_file), + "run_start_utc": start_ts, + "run_end_utc": end_ts, + "model_name": cfg.model.name, + "run_name": cfg.wandb.run_name, + "train_data_path": str(train_path), + "eval_data_path": str(eval_path), + "train_records": _line_count(train_path), + "eval_records": _line_count(eval_path), + "final_dir_exists": final_dir.exists(), + "checkpoint_dirs": checkpoints, + } + + output_path.parent.mkdir(parents=True, exist_ok=True) + payload = json.dumps(manifest, indent=2, ensure_ascii=False) + "\n" + output_path.write_text(payload, encoding="utf-8") + 
print(f"manifest_written={output_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/install_training_services.sh b/scripts/install_training_services.sh new file mode 100755 index 0000000..a0d76d1 --- /dev/null +++ b/scripts/install_training_services.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +SYSTEMD_USER_DIR="${SYSTEMD_USER_DIR:-$HOME/.config/systemd/user}" +FORGE_ENV_DIR="${FORGE_ENV_DIR:-$HOME/.config/forge}" +FORGE_ENV_FILE="${FORGE_ENV_FILE:-$FORGE_ENV_DIR/training.env}" + +mkdir -p "$SYSTEMD_USER_DIR" "$PROJECT_ROOT/artifacts/logs" "$FORGE_ENV_DIR" + +if [[ ! -f "$FORGE_ENV_FILE" ]]; then + cat >"$FORGE_ENV_FILE" <<'EOF' +# Required for training with WandB. +# Set your real key before starting forge-training.service. +WANDB_API_KEY= + +# Optional overrides: +# TRAIN_CONFIG=configs/models/turkcell_7b_a100_v8_stable_reset.yaml +# TRAIN_RUN_DIR=artifacts/training/turkcell-7b-sft-v8-a100-bf16-stable-reset +# TRAIN_LOG=artifacts/logs/training_turkcell_7b_a100_v8_stable_reset.log +# TARGET_STEPS=8601 +# SAVE_STEPS=250 +# ENABLE_RESUME=0 +# REQUIRE_WANDB=0 +# BOOTSTRAP_CHECKPOINT= +EOF + chmod 600 "$FORGE_ENV_FILE" +fi + +install -m 0644 \ + "$PROJECT_ROOT/deploy/systemd/forge-training.service" \ + "$SYSTEMD_USER_DIR/forge-training.service" +install -m 0644 \ + "$PROJECT_ROOT/deploy/systemd/forge-training-watchdog.service" \ + "$SYSTEMD_USER_DIR/forge-training-watchdog.service" +install -m 0644 \ + "$PROJECT_ROOT/deploy/systemd/forge-training-monitor.service" \ + "$SYSTEMD_USER_DIR/forge-training-monitor.service" +install -m 0644 \ + "$PROJECT_ROOT/deploy/systemd/forge-posttrain.service" \ + "$SYSTEMD_USER_DIR/forge-posttrain.service" +install -m 0644 \ + "$PROJECT_ROOT/deploy/systemd/forge-posttrain.path" \ + "$SYSTEMD_USER_DIR/forge-posttrain.path" + +chmod +x \ + "$PROJECT_ROOT/scripts/start_a100_training.sh" \ + 
"$PROJECT_ROOT/scripts/monitor_a100_training.sh" \ + "$PROJECT_ROOT/scripts/training_watchdog.py" \ + "$PROJECT_ROOT/scripts/run_posttrain_if_complete.sh" + +systemctl --user daemon-reload +systemctl --user enable forge-training.service +systemctl --user enable forge-training-watchdog.service +systemctl --user enable forge-training-monitor.service +systemctl --user enable forge-posttrain.path + +require_wandb="$(grep -E '^REQUIRE_WANDB=' "$FORGE_ENV_FILE" | tail -n 1 | cut -d '=' -f2 | tr -d '[:space:]' || true)" +require_wandb="${require_wandb:-1}" + +if [[ "$require_wandb" == "0" ]] || grep -qE '^WANDB_API_KEY=.+$' "$FORGE_ENV_FILE"; then + systemctl --user restart forge-training.service + systemctl --user restart forge-training-watchdog.service + systemctl --user restart forge-training-monitor.service + systemctl --user restart forge-posttrain.path +else + systemctl --user stop forge-posttrain.path || true + systemctl --user stop forge-posttrain.service || true + systemctl --user stop forge-training-monitor.service || true + systemctl --user stop forge-training-watchdog.service || true + systemctl --user stop forge-training.service || true +fi + +systemctl --user --no-pager --lines=20 status forge-training.service || true +systemctl --user --no-pager --lines=20 status forge-training-watchdog.service || true +systemctl --user --no-pager --lines=20 status forge-training-monitor.service || true +systemctl --user --no-pager --lines=20 status forge-posttrain.path || true +systemctl --user --no-pager --lines=20 status forge-posttrain.service || true +echo +echo "Edit $FORGE_ENV_FILE and set WANDB_API_KEY before starting training." 
+echo "Or run: scripts/set_wandb_key.sh" diff --git a/scripts/monitor_a100_training.sh b/scripts/monitor_a100_training.sh new file mode 100755 index 0000000..f8b1e01 --- /dev/null +++ b/scripts/monitor_a100_training.sh @@ -0,0 +1,230 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" + +TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_a100_v8_stable_reset.yaml}" +CONFIG_BASENAME="$(basename "$TRAIN_CONFIG")" +CONFIG_SLUG="${CONFIG_BASENAME%.*}" +LOG_FILE="${LOG_FILE:-${TRAIN_LOG:-artifacts/logs/training_${CONFIG_SLUG}.log}}" +STATUS_FILE="${STATUS_FILE:-artifacts/logs/training_monitor_status_a100.txt}" +ETA_STATE_FILE="${ETA_STATE_FILE:-artifacts/logs/training_monitor_eta_state_${CONFIG_SLUG}.env}" +TARGET_STEPS="${TARGET_STEPS:-8601}" +PATTERN="${PATTERN:-run_training.py --config ${TRAIN_CONFIG}}" +SLEEP_SECS="${SLEEP_SECS:-60}" +SAVE_STEPS="${SAVE_STEPS:-250}" +CHECKPOINT_EVENT_FILE="${CHECKPOINT_EVENT_FILE:-artifacts/logs/training_checkpoint_events_${CONFIG_SLUG}.log}" +CHECKPOINT_STATE_FILE="${CHECKPOINT_STATE_FILE:-artifacts/logs/training_checkpoint_state_${CONFIG_SLUG}.env}" +TRAIN_RUN_DIR="${TRAIN_RUN_DIR:-}" + +mkdir -p artifacts/logs + +prev_ts=0 +prev_step=0 +ema_sps="" +speed_source="none" +last_announced_checkpoint_step=0 + +if [[ -z "$TRAIN_RUN_DIR" ]] && [[ -f "$LOG_FILE" ]]; then + run_dir_from_log="$(grep -a -oE "output_dir=artifacts/training/[^[:space:]]+" "$LOG_FILE" | tail -n 1 | cut -d '=' -f 2 || true)" + if [[ -n "$run_dir_from_log" ]]; then + TRAIN_RUN_DIR="$run_dir_from_log" + fi +fi + +if [[ -z "$TRAIN_RUN_DIR" ]]; then + TRAIN_RUN_DIR="artifacts/training/${CONFIG_SLUG}" +fi + +if [[ -f "$ETA_STATE_FILE" ]]; then + # shellcheck disable=SC1090 + source "$ETA_STATE_FILE" +fi + +if [[ -f "$CHECKPOINT_STATE_FILE" ]]; then + # shellcheck disable=SC1090 + source "$CHECKPOINT_STATE_FILE" +fi + +if [[ ! 
"$last_announced_checkpoint_step" =~ ^[0-9]+$ ]]; then + last_announced_checkpoint_step=0 +fi + +find_latest_checkpoint_step() { + local current_step="$1" + if [[ ! -d "$TRAIN_RUN_DIR" ]]; then + echo "0" + return + fi + + latest="$(find "$TRAIN_RUN_DIR" -maxdepth 1 -type d -name "checkpoint-*" 2>/dev/null \ + | sed -E 's#.*/checkpoint-##' \ + | grep -E '^[0-9]+$' \ + | awk -v s="$current_step" 's == "" || s !~ /^[0-9]+$/ || $1 <= s' \ + | sort -n \ + | tail -n 1 || true)" + + if [[ -z "$latest" ]]; then + echo "0" + else + echo "$latest" + fi +} + +while true; do + ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + now_epoch="$(date -u +%s)" + + running="no" + if pgrep -f "$PATTERN" >/dev/null 2>&1 || pgrep -f "scripts/run_training.py" >/dev/null 2>&1; then + running="yes" + fi + + progress="none" + log_start_line=1 + if [[ -f "$LOG_FILE" ]]; then + marker_line="$(grep -a -n "forge-training-start" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)" + if [[ "$marker_line" =~ ^[0-9]+$ ]] && [[ "$marker_line" -gt 0 ]]; then + log_start_line=$((marker_line + 1)) + fi + + progress="$(tail -n +"$log_start_line" "$LOG_FILE" | grep -a -oE "[0-9]+/${TARGET_STEPS}" | tail -n 1 || true)" + if [[ -z "$progress" ]]; then + progress="none" + fi + fi + + step="0" + if [[ "$progress" != "none" ]]; then + step="${progress%%/*}" + fi + + pct="0" + if [[ "$step" =~ ^[0-9]+$ ]] && [[ $TARGET_STEPS -gt 0 ]]; then + pct=$((step * 100 / TARGET_STEPS)) + fi + + nan_count="0" + if [[ -f "$LOG_FILE" ]]; then + # Count only real NaN/Inf metric values and explicit NaN guard events. + metric_nan_count="$(tail -n +"$log_start_line" "$LOG_FILE" 2>/dev/null | grep -a -E -i "'(loss|grad_norm|eval_loss)':[[:space:]]*'?(nan|inf)'?" 
| wc -l | tr -d '[:space:]' || true)" + guard_nan_count="$(tail -n +"$log_start_line" "$LOG_FILE" 2>/dev/null | grep -a -E -c "nan_guard_detected|nan_guard_stopping_training" || true)" + metric_nan_count="${metric_nan_count:-0}" + guard_nan_count="${guard_nan_count:-0}" + nan_count=$((metric_nan_count + guard_nan_count)) + fi + + gpu_line="$(nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total --format=csv,noheader 2>/dev/null | head -n1)" || gpu_line="unknown" + + steps_per_hour="unknown" + eta_seconds="unknown" + eta_utc="unknown" + remaining_steps="unknown" + latest_checkpoint_step="$(find_latest_checkpoint_step "$step")" + next_checkpoint_step="unknown" + steps_to_next_checkpoint="unknown" + checkpoint_eta_utc="unknown" + + if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$step" -lt "$TARGET_STEPS" ]]; then + remaining_steps=$((TARGET_STEPS - step)) + fi + + if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$SAVE_STEPS" =~ ^[0-9]+$ ]] && [[ "$SAVE_STEPS" -gt 0 ]]; then + next_checkpoint_step=$((((step / SAVE_STEPS) + 1) * SAVE_STEPS)) + if [[ "$next_checkpoint_step" -le "$TARGET_STEPS" ]]; then + steps_to_next_checkpoint=$((next_checkpoint_step - step)) + else + next_checkpoint_step="none" + steps_to_next_checkpoint="none" + fi + fi + + if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$prev_ts" =~ ^[0-9]+$ ]] && [[ "$prev_step" =~ ^[0-9]+$ ]]; then + if [[ $prev_ts -gt 0 ]] && [[ $now_epoch -gt $prev_ts ]] && [[ $step -gt $prev_step ]]; then + delta_steps=$((step - prev_step)) + delta_secs=$((now_epoch - prev_ts)) + # Skip the first large jump after resume (e.g. 0 -> 750) to avoid bogus ETA. 
+ if [[ -z "$ema_sps" ]] && [[ "$prev_step" -eq 0 ]] && [[ "$delta_steps" -gt 1 ]]; then + speed_source="bootstrap_skip" + elif [[ "$delta_secs" -gt 0 ]]; then + instant_sps="$(awk -v ds="$delta_steps" -v dt="$delta_secs" 'BEGIN { printf "%.8f", ds / dt }')" + + if [[ -n "$ema_sps" ]]; then + ema_sps="$(awk -v e="$ema_sps" -v i="$instant_sps" 'BEGIN { printf "%.8f", (0.7 * e) + (0.3 * i) }')" + speed_source="ema" + else + ema_sps="$instant_sps" + speed_source="instant" + fi + fi + fi + fi + + if [[ -n "$ema_sps" ]] && awk -v s="$ema_sps" 'BEGIN { exit !(s > 0) }'; then + steps_per_hour="$(awk -v s="$ema_sps" 'BEGIN { printf "%.1f", s * 3600 }')" + if [[ "$remaining_steps" =~ ^[0-9]+$ ]]; then + eta_seconds="$(awk -v rem="$remaining_steps" -v s="$ema_sps" 'BEGIN { printf "%.0f", rem / s }')" + if [[ "$eta_seconds" =~ ^[0-9]+$ ]]; then + eta_utc="$(date -u -d "@$((now_epoch + eta_seconds))" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo unknown)" + fi + fi + if [[ "$steps_to_next_checkpoint" =~ ^[0-9]+$ ]]; then + checkpoint_eta_seconds="$(awk -v rem="$steps_to_next_checkpoint" -v s="$ema_sps" 'BEGIN { printf "%.0f", rem / s }')" + if [[ "$checkpoint_eta_seconds" =~ ^[0-9]+$ ]]; then + checkpoint_eta_utc="$(date -u -d "@$((now_epoch + checkpoint_eta_seconds))" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo unknown)" + fi + fi + fi + + if [[ "$latest_checkpoint_step" =~ ^[0-9]+$ ]] && [[ "$latest_checkpoint_step" -gt "$last_announced_checkpoint_step" ]]; then + echo "${ts} checkpoint_saved step=${latest_checkpoint_step} run_dir=${TRAIN_RUN_DIR}" >>"$CHECKPOINT_EVENT_FILE" + last_announced_checkpoint_step="$latest_checkpoint_step" + fi + + { + echo "timestamp_utc=$ts" + echo "running=$running" + echo "step=$step" + echo "target_steps=$TARGET_STEPS" + echo "progress=$progress" + echo "percent=$pct" + echo "remaining_steps=$remaining_steps" + echo "steps_per_hour=$steps_per_hour" + echo "eta_seconds=$eta_seconds" + echo "eta_utc=$eta_utc" + echo "speed_source=$speed_source" + echo 
"nan_count=$nan_count" + echo "gpu=$gpu_line" + echo "save_steps=$SAVE_STEPS" + echo "latest_checkpoint_step=$latest_checkpoint_step" + echo "next_checkpoint_step=$next_checkpoint_step" + echo "steps_to_next_checkpoint=$steps_to_next_checkpoint" + echo "checkpoint_eta_utc=$checkpoint_eta_utc" + } >"$STATUS_FILE" + + prev_ts="$now_epoch" + prev_step="$step" + { + echo "prev_ts=$prev_ts" + echo "prev_step=$prev_step" + echo "ema_sps=$ema_sps" + echo "speed_source=$speed_source" + } >"$ETA_STATE_FILE" + { + echo "last_announced_checkpoint_step=$last_announced_checkpoint_step" + echo "train_run_dir=$TRAIN_RUN_DIR" + } >"$CHECKPOINT_STATE_FILE" + + if [[ "$running" == "no" ]]; then + echo "state=stopped" >>"$STATUS_FILE" + exit 0 + fi + + if [[ "$step" =~ ^[0-9]+$ ]] && [[ $step -ge $TARGET_STEPS ]]; then + echo "state=completed" >>"$STATUS_FILE" + exit 0 + fi + + echo "state=running" >>"$STATUS_FILE" + sleep "$SLEEP_SECS" +done diff --git a/scripts/post_training_pipeline.sh b/scripts/post_training_pipeline.sh new file mode 100644 index 0000000..1f93b34 --- /dev/null +++ b/scripts/post_training_pipeline.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +cd "$PROJECT_ROOT" + +UV_BIN="${UV_BIN:-$HOME/.local/bin/uv}" +TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_a100_v8_stable_reset.yaml}" +RUN_DIR="${RUN_DIR:-artifacts/training/turkcell-7b-sft-v8-a100-bf16-stable-reset}" +TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_turkcell_7b_a100_v8_stable_reset.log}" +ADAPTER_DIR="${ADAPTER_DIR:-$RUN_DIR/final}" + +BASE_MODEL="${BASE_MODEL:-TURKCELL/Turkcell-LLM-7b-v1}" +MERGED_OUTPUT="${MERGED_OUTPUT:-artifacts/merged/turkcell-7b-a100-v8-stable-reset}" +EVAL_OUTPUT_ROOT="${EVAL_OUTPUT_ROOT:-artifacts/eval/turkcell-7b-a100-v8-stable-reset}" + +PUSH_TO_HUB="${PUSH_TO_HUB:-0}" +HUB_REPO="${HUB_REPO:-}" + +SERVE_BASE_URL="${SERVE_BASE_URL:-}" +SERVE_API_KEY="${SERVE_API_KEY:-}" 
+SERVE_CONFIG="${SERVE_CONFIG:-configs/serving/vllm_a100_v8_merged.yaml}" +SERVE_TIMEOUT="${SERVE_TIMEOUT:-240}" +AUTO_START_SERVE="${AUTO_START_SERVE:-1}" +BENCHMARK_NUM_REQUESTS="${BENCHMARK_NUM_REQUESTS:-50}" +BENCHMARK_CONCURRENCY="${BENCHMARK_CONCURRENCY:-5}" +BENCHMARK_OUTPUT_DIR="${BENCHMARK_OUTPUT_DIR:-artifacts/benchmarks/turkcell-7b-a100-v8}" +BENCHMARK_OUTPUT="${BENCHMARK_OUTPUT:-$BENCHMARK_OUTPUT_DIR/benchmark_$(date -u +%Y%m%dT%H%M%SZ).json}" +SERVE_LOG="${SERVE_LOG:-artifacts/logs/posttrain_serve_v8.log}" + +if [[ ! -x "$UV_BIN" ]]; then + echo "UV executable not found: $UV_BIN" >&2 + exit 1 +fi + +if [[ ! -d "$ADAPTER_DIR" ]]; then + echo "Adapter directory not found: $ADAPTER_DIR" >&2 + exit 1 +fi + +if [[ "$PUSH_TO_HUB" == "1" ]] && [[ -z "$HUB_REPO" ]]; then + echo "HUB_REPO is required when PUSH_TO_HUB=1." >&2 + exit 1 +fi + +mkdir -p "$BENCHMARK_OUTPUT_DIR" artifacts/logs + +echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] post-training-pipeline-start" +echo "train_config=$TRAIN_CONFIG" +echo "run_dir=$RUN_DIR" +echo "adapter_dir=$ADAPTER_DIR" + +echo +echo "[1/4] Generate training manifest" +"$UV_BIN" run python scripts/generate_training_manifest.py \ + --config "$TRAIN_CONFIG" \ + --run-dir "$RUN_DIR" \ + --log-file "$TRAIN_LOG" + +echo +echo "[2/4] Run offline evaluations (mmlu_tr, perplexity, generation)" +for bench in mmlu_tr perplexity generation; do + out_dir="$EVAL_OUTPUT_ROOT/$bench" + mkdir -p "$out_dir" + echo " - benchmark=$bench output=$out_dir" + "$UV_BIN" run python scripts/run_eval.py \ + --model "$ADAPTER_DIR" \ + --benchmark "$bench" \ + --output-dir "$out_dir" +done + +echo +echo "[3/4] Merge adapters into base model" +merge_cmd=( + "$UV_BIN" run python scripts/merge_and_push.py + --base-model "$BASE_MODEL" + --adapter "$ADAPTER_DIR" + --output "$MERGED_OUTPUT" +) +if [[ "$PUSH_TO_HUB" == "1" ]]; then + merge_cmd+=(--push --hub-repo "$HUB_REPO") +fi +"${merge_cmd[@]}" + +echo +echo "[4/4] Optional serving smoke/benchmark" 
+run_endpoint_checks() { + local base_url="$1" + smoke_cmd=("$UV_BIN" run python scripts/smoke_serve.py --base-url "$base_url") + bench_cmd=( + "$UV_BIN" run python scripts/benchmark_openai_endpoint.py + --base-url "$base_url" + --num-requests "$BENCHMARK_NUM_REQUESTS" + --concurrency "$BENCHMARK_CONCURRENCY" + --output "$BENCHMARK_OUTPUT" + ) + if [[ -n "$SERVE_API_KEY" ]]; then + smoke_cmd+=(--api-key "$SERVE_API_KEY") + bench_cmd+=(--api-key "$SERVE_API_KEY") + fi + "${smoke_cmd[@]}" + "${bench_cmd[@]}" +} + +if [[ -n "$SERVE_BASE_URL" ]]; then + echo " - using external endpoint: $SERVE_BASE_URL" + run_endpoint_checks "$SERVE_BASE_URL" +elif [[ "$AUTO_START_SERVE" == "1" ]]; then + if [[ ! -f "$SERVE_CONFIG" ]]; then + echo "Serving config not found: $SERVE_CONFIG" >&2 + exit 1 + fi + + serve_host="$(grep -E '^host:' "$SERVE_CONFIG" | head -n 1 | cut -d ':' -f2- | tr -d ' "' || true)" + serve_port="$(grep -E '^port:' "$SERVE_CONFIG" | head -n 1 | cut -d ':' -f2- | tr -d ' ' || true)" + if [[ -z "$serve_host" ]]; then + serve_host="127.0.0.1" + fi + if [[ "$serve_host" == "0.0.0.0" ]] || [[ "$serve_host" == "::" ]]; then + serve_host="127.0.0.1" + fi + if [[ -z "$serve_port" ]]; then + serve_port="18020" + fi + serve_base_url="http://${serve_host}:${serve_port}" + + serve_cmd=( + "$UV_BIN" run python scripts/run_serve.py + --config "$SERVE_CONFIG" + --no-wait + --timeout "$SERVE_TIMEOUT" + ) + if [[ -n "$SERVE_API_KEY" ]]; then + serve_cmd+=(--api-key "$SERVE_API_KEY") + fi + + "${serve_cmd[@]}" >"$SERVE_LOG" 2>&1 & + serve_pid="$!" 
+ cleanup_serve() { + if [[ -n "${serve_pid:-}" ]] && kill -0 "$serve_pid" >/dev/null 2>&1; then + kill -INT "$serve_pid" >/dev/null 2>&1 || true + sleep 1 + if kill -0 "$serve_pid" >/dev/null 2>&1; then + kill "$serve_pid" >/dev/null 2>&1 || true + fi + wait "$serve_pid" >/dev/null 2>&1 || true + fi + } + trap cleanup_serve EXIT + + ready="0" + for _ in $(seq 1 "$SERVE_TIMEOUT"); do + if curl -fsS "${serve_base_url}/health" >/dev/null 2>&1; then + ready="1" + break + fi + sleep 1 + done + if [[ "$ready" != "1" ]]; then + echo "vLLM did not become healthy in ${SERVE_TIMEOUT}s (${serve_base_url})" >&2 + exit 1 + fi + + echo " - started local vLLM endpoint: $serve_base_url" + run_endpoint_checks "$serve_base_url" + cleanup_serve + trap - EXIT +else + echo " - SERVE_BASE_URL not set and AUTO_START_SERVE=0; skipping serve smoke + benchmark" +fi + +echo +echo "benchmark_output=$BENCHMARK_OUTPUT" +echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] post-training-pipeline-complete" diff --git a/scripts/run_posttrain_if_complete.sh b/scripts/run_posttrain_if_complete.sh new file mode 100755 index 0000000..ce54427 --- /dev/null +++ b/scripts/run_posttrain_if_complete.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +cd "$PROJECT_ROOT" + +STATUS_FILE="${STATUS_FILE:-artifacts/logs/training_monitor_status_a100.txt}" +LOCK_FILE="${LOCK_FILE:-artifacts/logs/posttrain_v8.lock}" +DONE_FILE="${DONE_FILE:-artifacts/logs/posttrain_v8.done}" +SUMMARY_FILE="${SUMMARY_FILE:-artifacts/logs/posttrain_v8_summary.md}" +POSTTRAIN_LOG="${POSTTRAIN_LOG:-artifacts/logs/posttrain_v8.log}" + +TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_a100_v8_stable_reset.yaml}" +RUN_DIR="${RUN_DIR:-${TRAIN_RUN_DIR:-artifacts/training/turkcell-7b-sft-v8-a100-bf16-stable-reset}}" +TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_turkcell_7b_a100_v8_stable_reset.log}" +ADAPTER_DIR="${ADAPTER_DIR:-$RUN_DIR/final}" 
# Print one markdown bullet per evaluation benchmark for the summary file.
#
# Globals read:
#   EVAL_OUTPUT_ROOT  Directory containing <bench>/results.json files.
# Output (stdout), one line per benchmark:
#   - <bench>: PASS|FAIL (<passed>/<total>)   results.json parsed successfully
#   - <bench>: UNREADABLE (<path>)            results.json exists but cannot be parsed
#   - <bench>: MISSING (<path>)               results.json does not exist
#
# The previous inline `python3 -c` program used \" inside an f-string
# replacement field, which is a SyntaxError in Python, so every existing
# results.json made the command substitution fail and (under `set -e`) aborted
# the script in the middle of writing the summary. The parser below avoids
# backslashes entirely and a parse failure is reported instead of being fatal.
collect_eval_status_lines() {
  local bench
  local results_json
  local verdict
  local summarize_py

  # PASS/FAIL reflects the first entry of "benchmarks"; the counts come from
  # the "summary" object. Missing or null sections fall back to empty values.
  summarize_py='import json
import sys

with open(sys.argv[1], encoding="utf-8") as handle:
    payload = json.load(handle)
summary = payload.get("summary") or {}
first = (payload.get("benchmarks") or [{}])[0]
status = "PASS" if first.get("passed") else "FAIL"
passed = int(summary.get("passed", 0))
total = int(summary.get("total", 0))
print("{} ({}/{})".format(status, passed, total))'

  for bench in mmlu_tr perplexity generation; do
    results_json="${EVAL_OUTPUT_ROOT}/${bench}/results.json"
    if [[ ! -f "$results_json" ]]; then
      echo "- ${bench}: MISSING (${results_json})"
      continue
    fi
    # Testing the assignment in `if` keeps a malformed file from tripping `set -e`.
    if verdict="$(python3 -c "$summarize_py" "$results_json")"; then
      echo "- ${bench}: ${verdict}"
    else
      echo "- ${bench}: UNREADABLE (${results_json})"
    fi
  done
}
-f "$STATUS_FILE" ]]; then + echo "posttrain_waiting status_file_missing=$STATUS_FILE" + exit 0 +fi + +state="$(awk -F '=' '$1=="state" {print $2}' "$STATUS_FILE" | tail -n 1 || true)" +step="$(awk -F '=' '$1=="step" {print $2}' "$STATUS_FILE" | tail -n 1 || true)" +target_steps="$(awk -F '=' '$1=="target_steps" {print $2}' "$STATUS_FILE" | tail -n 1 || true)" + +if [[ ! "$step" =~ ^[0-9]+$ ]]; then + step=0 +fi +if [[ ! "$target_steps" =~ ^[0-9]+$ ]] || [[ "$target_steps" -le 0 ]]; then + target_steps=8601 +fi + +if [[ "$state" != "completed" ]] && [[ "$step" -lt "$target_steps" ]]; then + echo "posttrain_waiting state=${state:-unknown} step=$step target_steps=$target_steps" + exit 0 +fi + +if [[ -f "$LOCK_FILE" ]]; then + locked_pid="$(awk -F '=' '$1=="pid" {print $2}' "$LOCK_FILE" | tail -n 1 || true)" + if [[ "$locked_pid" =~ ^[0-9]+$ ]] && kill -0 "$locked_pid" >/dev/null 2>&1; then + echo "posttrain_lock_active pid=$locked_pid file=$LOCK_FILE" + exit 0 + fi + rm -f "$LOCK_FILE" +fi + +{ + echo "pid=$$" + echo "started_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)" +} >"$LOCK_FILE" + +cleanup_lock() { + rm -f "$LOCK_FILE" +} +trap cleanup_lock EXIT + +if [[ -f "$DONE_FILE" ]]; then + echo "posttrain_already_done file=$DONE_FILE" + exit 0 +fi + +echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] posttrain_trigger_start" | tee -a "$POSTTRAIN_LOG" +echo "status_state=$state step=$step target_steps=$target_steps" | tee -a "$POSTTRAIN_LOG" +serve_endpoint="$(resolve_serve_endpoint)" + +set +e +( + export TRAIN_CONFIG + export RUN_DIR + export TRAIN_LOG + export ADAPTER_DIR + export MERGED_OUTPUT + export EVAL_OUTPUT_ROOT + export PUSH_TO_HUB=0 + export SERVE_CONFIG + export AUTO_START_SERVE=1 + export BENCHMARK_OUTPUT_DIR + export BENCHMARK_OUTPUT + bash scripts/post_training_pipeline.sh +) >>"$POSTTRAIN_LOG" 2>&1 +pipeline_rc=$? 
+set -e + +if [[ "$pipeline_rc" -ne 0 ]]; then + { + echo "# Post-Training v8 Summary" + echo + echo "- status: FAILED" + echo "- finished_utc: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "- pipeline_exit_code: $pipeline_rc" + echo "- train_config: $TRAIN_CONFIG" + echo "- run_dir: $RUN_DIR" + echo "- serve_endpoint: $serve_endpoint" + echo "- eval_results:" + collect_eval_status_lines + echo "- log: $POSTTRAIN_LOG" + } >"$SUMMARY_FILE" + echo "posttrain_failed rc=$pipeline_rc log=$POSTTRAIN_LOG summary=$SUMMARY_FILE" + exit "$pipeline_rc" +fi + +{ + echo "completed_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "train_config=$TRAIN_CONFIG" + echo "run_dir=$RUN_DIR" + echo "merged_output=$MERGED_OUTPUT" + echo "eval_output_root=$EVAL_OUTPUT_ROOT" + echo "benchmark_output=$BENCHMARK_OUTPUT" + echo "serve_endpoint=$serve_endpoint" + echo "posttrain_log=$POSTTRAIN_LOG" +} >"$DONE_FILE" + +{ + echo "# Post-Training v8 Summary" + echo + echo "- status: SUCCESS" + echo "- completed_utc: $(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "- train_config: $TRAIN_CONFIG" + echo "- run_dir: $RUN_DIR" + echo "- eval_results:" + collect_eval_status_lines + echo "- merged_model: $MERGED_OUTPUT" + echo "- serve_endpoint: $serve_endpoint" + echo "- eval_output_root: $EVAL_OUTPUT_ROOT" + echo "- benchmark_output: $BENCHMARK_OUTPUT" + echo "- posttrain_log: $POSTTRAIN_LOG" +} >"$SUMMARY_FILE" + +echo "posttrain_complete done_file=$DONE_FILE summary=$SUMMARY_FILE" diff --git a/scripts/start_a100_training.sh b/scripts/start_a100_training.sh new file mode 100755 index 0000000..8bca40a --- /dev/null +++ b/scripts/start_a100_training.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +cd "$PROJECT_ROOT" + +TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_a100_v8_stable_reset.yaml}" +CONFIG_BASENAME="$(basename "$TRAIN_CONFIG")" +CONFIG_SLUG="${CONFIG_BASENAME%.*}" 
# Print the checkpoint directory that training should resume from.
#
# Globals read:
#   TRAIN_RUN_DIR  Run directory that holds checkpoint-<step> sub-directories.
# Output (stdout):
#   Path of the selected checkpoint, or nothing when TRAIN_RUN_DIR is missing
#   or contains no checkpoint-* directory.
#
# Selection: when the monitor status file reports a positive step, prefer the
# highest-numbered checkpoint whose step is <= that value; otherwise (or when
# no checkpoint qualifies) fall back to the version-sorted last checkpoint-*.
find_latest_checkpoint() {
  # Function-local so the caller's namespace is not polluted.
  local monitor_status_file="artifacts/logs/training_monitor_status_a100.txt"
  local current_step=""
  local filtered_checkpoint=""

  if [[ ! -d "$TRAIN_RUN_DIR" ]]; then
    return
  fi

  if [[ -f "$monitor_status_file" ]]; then
    current_step="$(grep -E '^step=' "$monitor_status_file" | tail -n 1 | cut -d '=' -f 2 || true)"
  fi

  if [[ "$current_step" =~ ^[0-9]+$ ]] && [[ "$current_step" -gt 0 ]]; then
    # `sed -n ... p` emits "<step> <path>" only for directories that end in a
    # numeric step. Names such as checkpoint-tmp are dropped here instead of
    # flowing through unchanged and being compared as strings by awk.
    filtered_checkpoint="$(find "$TRAIN_RUN_DIR" -maxdepth 1 -type d -name "checkpoint-*" 2>/dev/null \
      | sed -nE 's#^(.*checkpoint-)([0-9]+)$#\2 \1\2#p' \
      | awk -v s="$current_step" '($1 + 0) <= (s + 0)' \
      | sort -n \
      | tail -n 1 \
      | cut -d ' ' -f 2- || true)"
    if [[ -n "$filtered_checkpoint" ]]; then
      echo "$filtered_checkpoint"
      return
    fi
  fi

  find "$TRAIN_RUN_DIR" -maxdepth 1 -type d -name "checkpoint-*" | sort -V | tail -n 1
}
b/scripts/training_watchdog.py new file mode 100755 index 0000000..992b3db --- /dev/null +++ b/scripts/training_watchdog.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +"""Watchdog for long-running training on remote GPU hosts. + +Restarts a user-level systemd training service when: +1) Too many consecutive metric lines contain NaN. +2) Training step does not advance for a configured stall timeout. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import re +import subprocess +import time +from dataclasses import asdict, dataclass +from pathlib import Path + + +@dataclass +class WatchdogState: + """Persisted state between watchdog loops.""" + + last_metric_hash: str = "" + nan_consecutive: int = 0 + last_step: int = 0 + last_step_change_ts: float = 0.0 + + +def _config_slug() -> str: + train_config = os.getenv("TRAIN_CONFIG", "configs/models/turkcell_7b_a100_v4_recovery.yaml") + return Path(train_config).stem + + +def _int_env(name: str, default: int) -> int: + value = os.getenv(name) + if value is None or value.strip() == "": + return default + try: + return int(value) + except ValueError: + return default + + +def _load_status_file(path: Path) -> dict[str, str]: + if not path.exists(): + return {} + + result: dict[str, str] = {} + for line in path.read_text(encoding="utf-8", errors="ignore").splitlines(): + if "=" not in line: + continue + key, value = line.split("=", 1) + result[key.strip()] = value.strip() + return result + + +def _status_completed(status: dict[str, str], target_steps: int) -> bool: + state = status.get("state", "") + try: + step = int(status.get("step", "0")) + except ValueError: + step = 0 + + if state == "completed": + return True + return step >= target_steps + + +def parse_args() -> argparse.Namespace: + slug = _config_slug() + default_log_file = os.getenv("TRAIN_LOG", f"artifacts/logs/training_{slug}.log") + default_state_file = os.getenv( + "TRAIN_WATCHDOG_STATE_FILE", + 
f"artifacts/logs/training_watchdog_state_{slug}.json", + ) + default_status_file = os.getenv( + "TRAIN_WATCHDOG_STATUS_FILE", + f"artifacts/logs/training_watchdog_status_{slug}.txt", + ) + default_monitor_status_file = os.getenv( + "TRAIN_MONITOR_STATUS_FILE", + "artifacts/logs/training_monitor_status_a100.txt", + ) + default_target_steps = _int_env("TARGET_STEPS", 8601) + default_poll_seconds = _int_env("WATCHDOG_POLL_SECONDS", 60) + default_stall_seconds = _int_env("WATCHDOG_STALL_SECONDS", 5400) + default_nan_limit = _int_env("WATCHDOG_NAN_CONSECUTIVE_LIMIT", 5) + + parser = argparse.ArgumentParser(description="Monitor training and auto-restart on failures.") + parser.add_argument("--service", default="forge-training.service") + parser.add_argument("--log-file", default=default_log_file) + parser.add_argument("--state-file", default=default_state_file) + parser.add_argument("--status-file", default=default_status_file) + parser.add_argument("--monitor-status-file", default=default_monitor_status_file) + parser.add_argument("--target-steps", type=int, default=default_target_steps) + parser.add_argument("--poll-seconds", type=int, default=default_poll_seconds) + parser.add_argument("--nan-consecutive-limit", type=int, default=default_nan_limit) + parser.add_argument("--stall-seconds", type=int, default=default_stall_seconds) + parser.add_argument("--max-read-bytes", type=int, default=2_000_000) + return parser.parse_args() + + +def _run_systemctl(*args: str) -> subprocess.CompletedProcess[str]: + return subprocess.run( + ["systemctl", "--user", *args], + check=False, + text=True, + capture_output=True, + ) + + +def is_service_active(service: str) -> bool: + proc = _run_systemctl("is-active", "--quiet", service) + return proc.returncode == 0 + + +def restart_service(service: str) -> bool: + proc = _run_systemctl("restart", service) + return proc.returncode == 0 + + +def start_service(service: str) -> bool: + proc = _run_systemctl("start", service) + return 
proc.returncode == 0 + + +def read_tail_text(path: Path, max_bytes: int) -> str: + if not path.exists(): + return "" + with path.open("rb") as handle: + handle.seek(0, os.SEEK_END) + size = handle.tell() + handle.seek(max(0, size - max_bytes), os.SEEK_SET) + return handle.read().decode("utf-8", errors="ignore") + + +def parse_training_tail(text: str, target_steps: int) -> tuple[int, str]: + if not text: + return 0, "" + + marker_idx = text.rfind("forge-training-start") + if marker_idx != -1: + text = text[marker_idx:] + + step_pattern = re.compile(rf"(\d+)/{target_steps}\b") + steps = [int(match.group(1)) for match in step_pattern.finditer(text)] + last_step = steps[-1] if steps else 0 + + metric_lines: list[str] = [] + for line in text.splitlines(): + if ("'loss':" in line and "'grad_norm':" in line) or "'eval_loss':" in line: + metric_lines.append(line) + last_metric = metric_lines[-1] if metric_lines else "" + return last_step, last_metric + + +def metric_hash(metric_line: str) -> str: + if not metric_line: + return "" + return hashlib.sha256(metric_line.encode("utf-8", errors="ignore")).hexdigest() + + +def load_state(path: Path) -> WatchdogState: + if not path.exists(): + return WatchdogState() + try: + payload = json.loads(path.read_text(encoding="utf-8")) + return WatchdogState( + last_metric_hash=str(payload.get("last_metric_hash", "")), + nan_consecutive=int(payload.get("nan_consecutive", 0)), + last_step=int(payload.get("last_step", 0)), + last_step_change_ts=float(payload.get("last_step_change_ts", 0.0)), + ) + except (json.JSONDecodeError, OSError, ValueError, TypeError): + return WatchdogState() + + +def save_state(path: Path, state: WatchdogState) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(asdict(state), indent=2), encoding="utf-8") + + +def write_status( + path: Path, + *, + service: str, + active: bool, + step: int, + target_steps: int, + metric_line: str, + state: WatchdogState, + action: str, +) -> None: 
+ pct = int((step * 100) / target_steps) if target_steps > 0 else 0 + timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) + status_lines = [ + f"timestamp_utc={timestamp}", + f"service={service}", + f"active={'yes' if active else 'no'}", + f"step={step}", + f"target_steps={target_steps}", + f"percent={pct}", + f"nan_consecutive={state.nan_consecutive}", + f"last_step_change_ts={int(state.last_step_change_ts)}", + f"last_metric_contains_nan={'yes' if 'nan' in metric_line.lower() else 'no'}", + f"action={action}", + ] + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(status_lines) + "\n", encoding="utf-8") + + +def main() -> None: + args = parse_args() + log_file = Path(args.log_file) + state_file = Path(args.state_file) + status_file = Path(args.status_file) + monitor_status_file = Path(args.monitor_status_file) + state = load_state(state_file) + + while True: + now = time.time() + action = "none" + active = is_service_active(args.service) + monitor_status = _load_status_file(monitor_status_file) + training_completed = _status_completed(monitor_status, args.target_steps) + + if not active: + if training_completed: + action = "completed_no_restart" + else: + started = start_service(args.service) + action = "start_service" if started else "start_failed" + active = is_service_active(args.service) + + tail_text = read_tail_text(log_file, args.max_read_bytes) + step, last_metric = parse_training_tail(tail_text, args.target_steps) + current_metric_hash = metric_hash(last_metric) + + if step < state.last_step: + # Service likely restarted and step counter reset. 
+ state.last_step = step + state.last_step_change_ts = now + state.nan_consecutive = 0 + state.last_metric_hash = current_metric_hash + elif step > state.last_step: + state.last_step = step + state.last_step_change_ts = now + elif state.last_step_change_ts == 0.0 and step > 0: + state.last_step_change_ts = now + + if current_metric_hash and current_metric_hash != state.last_metric_hash: + state.last_metric_hash = current_metric_hash + if "nan" in last_metric.lower(): + state.nan_consecutive += 1 + else: + state.nan_consecutive = 0 + + stalled = ( + active + and state.last_step_change_ts > 0 + and (now - state.last_step_change_ts) >= args.stall_seconds + ) + nan_limit_hit = state.nan_consecutive >= args.nan_consecutive_limit + + if (nan_limit_hit or stalled) and not training_completed: + restarted = restart_service(args.service) + if nan_limit_hit: + action = "restart_nan_limit_hit" if restarted else "restart_nan_failed" + else: + action = "restart_stall_timeout" if restarted else "restart_stall_failed" + state.nan_consecutive = 0 + state.last_metric_hash = "" + state.last_step_change_ts = now + active = is_service_active(args.service) + elif training_completed: + action = "completed_no_restart" + + save_state(state_file, state) + write_status( + status_file, + service=args.service, + active=active, + step=step, + target_steps=args.target_steps, + metric_line=last_metric, + state=state, + action=action, + ) + time.sleep(args.poll_seconds) + + +if __name__ == "__main__": + main() diff --git a/src/forge/training/callbacks.py b/src/forge/training/callbacks.py index 124481d..10bbc37 100644 --- a/src/forge/training/callbacks.py +++ b/src/forge/training/callbacks.py @@ -2,14 +2,23 @@ from __future__ import annotations +import math from typing import Any from forge.utils.logging import get_logger +try: + from transformers import TrainerCallback +except Exception: # pragma: no cover - fallback for non-training environments + class TrainerCallback: # type: ignore[no-redef] + 
"""Fallback base class when transformers is unavailable.""" + + pass + logger = get_logger(__name__) -class EarlyStoppingOnPlateau: +class EarlyStoppingOnPlateau(TrainerCallback): """Stop training when eval loss plateaus for `patience` eval steps. Compatible with the ``transformers.TrainerCallback`` protocol. @@ -57,3 +66,88 @@ def on_evaluate( logger.info("early_stopping", step=state.global_step) # HF Trainer checks this flag after eval control.should_training_stop = True + + +def _is_non_finite(value: object) -> bool: + """Return True when a metric value is NaN/Inf.""" + if isinstance(value, bool): + return False + if isinstance(value, (int, float)): + return not math.isfinite(float(value)) + if isinstance(value, str): + normalized = value.strip().lower() + return normalized in {"nan", "inf", "+inf", "-inf"} + return False + + +class NaNGuardCallback(TrainerCallback): + """Stop training when NaN/Inf metrics appear repeatedly.""" + + def __init__( + self, + consecutive_limit: int = 5, + watch_keys: tuple[str, ...] 
= ("loss", "grad_norm", "eval_loss"), + ) -> None: + self.consecutive_limit = consecutive_limit + self.watch_keys = watch_keys + self._consecutive_hits = 0 + + def _handle_metrics( + self, + *, + metrics: dict[str, object] | None, + state: Any, + control: Any, + source: str, + ) -> None: + if not metrics: + return + + bad_values: dict[str, object] = { + key: value + for key, value in metrics.items() + if key in self.watch_keys and _is_non_finite(value) + } + + if not bad_values: + self._consecutive_hits = 0 + return + + self._consecutive_hits += 1 + logger.warning( + "nan_guard_detected", + source=source, + step=state.global_step, + hits=self._consecutive_hits, + limit=self.consecutive_limit, + bad_metrics=bad_values, + ) + + if self._consecutive_hits >= self.consecutive_limit: + logger.error( + "nan_guard_stopping_training", + source=source, + step=state.global_step, + limit=self.consecutive_limit, + ) + control.should_training_stop = True + + def on_log( + self, + args: Any, + state: Any, + control: Any, + logs: dict[str, object] | None = None, + **kwargs: object, + ) -> None: + self._handle_metrics(metrics=logs, state=state, control=control, source="log") + + def on_evaluate( + self, + args: Any, + state: Any, + control: Any, + metrics: dict[str, object] | None = None, + **kwargs: object, + ) -> None: + self._handle_metrics(metrics=metrics, state=state, control=control, source="eval") diff --git a/src/forge/training/trainer.py b/src/forge/training/trainer.py index 3676da4..54d2039 100644 --- a/src/forge/training/trainer.py +++ b/src/forge/training/trainer.py @@ -8,12 +8,16 @@ from datasets import load_dataset +from forge.training.callbacks import EarlyStoppingOnPlateau, NaNGuardCallback from forge.utils.config import TrainingConfig from forge.utils.logging import get_logger logger = get_logger(__name__) _TRUE_VALUES = {"1", "true", "yes", "on"} +_EARLY_STOPPING_PATIENCE = 5 +_EARLY_STOPPING_MIN_DELTA = 0.001 +_NAN_GUARD_CONSECUTIVE_LIMIT = 5 def 
_is_truthy(value: str | None) -> bool: @@ -74,7 +78,7 @@ def _setup_unsloth(self) -> None: def _setup_peft(self) -> None: """Load model via standard PEFT (fallback when Unsloth unavailable).""" import torch - from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training + from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig logger.info( @@ -106,24 +110,35 @@ def _setup_peft(self) -> None: self.tokenizer.pad_token = self.tokenizer.eos_token self.model = prepare_model_for_kbit_training(self.model) - - lora_bias = self.config.lora.bias.lower() - valid_lora_bias = {"none", "all", "lora_only"} - if lora_bias not in valid_lora_bias: - raise ValueError( - f"Invalid LoRA bias '{self.config.lora.bias}'. " - "Expected one of: none, all, lora_only." + adapter_init_path = self.config.training.adapter_init_path + if adapter_init_path: + adapter_path = Path(adapter_init_path).expanduser() + if not adapter_path.exists(): + raise FileNotFoundError(f"Adapter init path not found: {adapter_path}") + logger.info("loading_adapter_init", path=str(adapter_path)) + self.model = PeftModel.from_pretrained( + self.model, + str(adapter_path), + is_trainable=True, ) + else: + lora_bias = self.config.lora.bias.lower() + valid_lora_bias = {"none", "all", "lora_only"} + if lora_bias not in valid_lora_bias: + raise ValueError( + f"Invalid LoRA bias '{self.config.lora.bias}'. " + "Expected one of: none, all, lora_only." 
+ ) - lora_config = LoraConfig( - r=self.config.lora.r, - lora_alpha=self.config.lora.alpha, - lora_dropout=self.config.lora.dropout, - target_modules=self.config.lora.target_modules, - bias=cast(Literal["none", "all", "lora_only"], lora_bias), - task_type=self.config.lora.task_type, - ) - self.model = get_peft_model(self.model, lora_config) + lora_config = LoraConfig( + r=self.config.lora.r, + lora_alpha=self.config.lora.alpha, + lora_dropout=self.config.lora.dropout, + target_modules=self.config.lora.target_modules, + bias=cast(Literal["none", "all", "lora_only"], lora_bias), + task_type=self.config.lora.task_type, + ) + self.model = get_peft_model(self.model, lora_config) logger.info("model_loaded_peft", trainable_params=self._count_trainable_params()) @@ -247,6 +262,7 @@ def train(self, resume_from_checkpoint: str | None = None) -> Path: warmup_ratio=self.config.training.warmup_ratio, lr_scheduler_type=self.config.training.lr_scheduler_type, weight_decay=self.config.training.weight_decay, + max_grad_norm=self.config.training.max_grad_norm, seed=self.config.training.seed, max_steps=self.config.training.max_steps, report_to="wandb" if wandb_enabled else "none", @@ -263,6 +279,23 @@ def train(self, resume_from_checkpoint: str | None = None) -> Path: args=training_args, formatting_func=self._format_prompt, ) + trainer.add_callback( + EarlyStoppingOnPlateau( + patience=_EARLY_STOPPING_PATIENCE, + min_delta=_EARLY_STOPPING_MIN_DELTA, + ) + ) + trainer.add_callback( + NaNGuardCallback( + consecutive_limit=_NAN_GUARD_CONSECUTIVE_LIMIT, + ) + ) + logger.info( + "training_callbacks_enabled", + early_stopping_patience=_EARLY_STOPPING_PATIENCE, + early_stopping_min_delta=_EARLY_STOPPING_MIN_DELTA, + nan_guard_consecutive_limit=_NAN_GUARD_CONSECUTIVE_LIMIT, + ) logger.info("training_started", output_dir=str(output_dir)) if resume_from_checkpoint: diff --git a/src/forge/utils/config.py b/src/forge/utils/config.py index 759fda9..8fec97a 100644 --- a/src/forge/utils/config.py 
+++ b/src/forge/utils/config.py @@ -38,6 +38,7 @@ class TrainingParams(BaseModel): warmup_ratio: float = 0.1 lr_scheduler_type: str = "cosine" weight_decay: float = 0.01 + max_grad_norm: float = 1.0 logging_steps: int = 10 save_steps: int = 200 save_total_limit: int = 3 @@ -45,6 +46,7 @@ class TrainingParams(BaseModel): fp16: bool = True # always True for Volta arch bf16: bool = False # NOT supported on V100 max_steps: int = -1 + adapter_init_path: str | None = None seed: int = 42 From d570910c7e91816c1526c4e3f514d18c2825efd2 Mon Sep 17 00:00:00 2001 From: Ogulcan Aydogan Date: Fri, 6 Mar 2026 09:14:54 +0000 Subject: [PATCH 2/3] ops: add automated v8 stability gate fallback watcher --- scripts/watch_v8_stability_gate.sh | 130 +++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100755 scripts/watch_v8_stability_gate.sh diff --git a/scripts/watch_v8_stability_gate.sh b/scripts/watch_v8_stability_gate.sh new file mode 100755 index 0000000..e42d39b --- /dev/null +++ b/scripts/watch_v8_stability_gate.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +cd "$PROJECT_ROOT" + +ENV_FILE="${ENV_FILE:-$HOME/.config/forge/training.env}" +STATUS_FILE="${STATUS_FILE:-artifacts/logs/training_monitor_status_a100.txt}" +POLL_SECONDS="${POLL_SECONDS:-30}" +GATE_STEPS="${GATE_STEPS:-300}" + +FALLBACK_CONFIG="${FALLBACK_CONFIG:-configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml}" +FALLBACK_RUN_DIR="${FALLBACK_RUN_DIR:-artifacts/training/turkcell-7b-sft-v8b-a100-bf16-ultra-stable-fallback}" +FALLBACK_LOG="${FALLBACK_LOG:-artifacts/logs/training_turkcell_7b_a100_v8b_ultra_stable_fallback.log}" + +SCRIPT_LOG="${SCRIPT_LOG:-artifacts/logs/v8_stability_gate.log}" +mkdir -p "$(dirname "$SCRIPT_LOG")" +touch "$SCRIPT_LOG" + +if [[ ! 
# Switch the managed training services over to the fallback profile.
#
# Arguments:
#   $1  Short reason string recorded in the gate log.
# Globals read:
#   ENV_FILE, FALLBACK_CONFIG, FALLBACK_RUN_DIR, FALLBACK_LOG, SCRIPT_LOG
# Effects:
#   Rewrites ENV_FILE so TRAIN_CONFIG / TRAIN_RUN_DIR / TRAIN_LOG point at the
#   fallback profile, forces ENABLE_RESUME=0 and SAVE_STEPS=250, then reloads
#   the user systemd manager and restarts the training, monitor and watchdog
#   units.
# Returns:
#   1 when ENV_FILE does not exist (nothing is modified or restarted).
apply_fallback() {
  local reason="${1:-unspecified}"
  local ts
  ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "gate_fallback_triggered_utc=$ts reason=$reason" | tee -a "$SCRIPT_LOG"
  if [[ ! -f "$ENV_FILE" ]]; then
    echo "env_file_missing path=$ENV_FILE" | tee -a "$SCRIPT_LOG"
    return 1
  fi

  python3 - "$ENV_FILE" "$FALLBACK_CONFIG" "$FALLBACK_RUN_DIR" "$FALLBACK_LOG" <<'PY'
"""Point the training env file at the fallback profile.

Existing lines are kept in place: comments, blank lines and unrelated
assignments are written back verbatim, assignments to an overridden key get
the new value, and overridden keys that were absent are appended.
"""
import sys
from pathlib import Path

env_file = Path(sys.argv[1])
updates = {
    "TRAIN_CONFIG": sys.argv[2],
    "TRAIN_RUN_DIR": sys.argv[3],
    "TRAIN_LOG": sys.argv[4],
    "ENABLE_RESUME": "0",
    "SAVE_STEPS": "250",
}

rewritten = []
applied = set()
for line in env_file.read_text(encoding="utf-8", errors="ignore").splitlines():
    is_assignment = "=" in line and not line.lstrip().startswith("#")
    key = line.split("=", 1)[0].strip() if is_assignment else None
    if key in updates:
        # Rewrite every occurrence so a later duplicate cannot win.
        rewritten.append(f"{key}={updates[key]}")
        applied.add(key)
    else:
        rewritten.append(line)

rewritten.extend(f"{key}={value}" for key, value in updates.items() if key not in applied)

# Written in place (not via a temp file) so the file keeps its existing
# permissions; it may hold credentials.
env_file.write_text("\n".join(rewritten) + "\n", encoding="utf-8")
PY

  systemctl --user daemon-reload
  systemctl --user restart forge-training.service forge-training-monitor.service forge-training-watchdog.service
  echo "fallback_service_restart_complete_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)" | tee -a "$SCRIPT_LOG"
}
"$step" =~ ^[0-9]+$ ]]; then + step=0 + fi + + if [[ "$step" -ge "$target_step" ]]; then + echo "gate_pass_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ) step=$step target_step=$target_step" | tee -a "$SCRIPT_LOG" + exit 0 + fi + + if [[ -f "$current_log" ]]; then + if tail -n +"$marker_line" "$current_log" | grep -a -q "nan_guard_stopping_training"; then + apply_fallback "nan_guard_stopping_training_detected" + exit 0 + fi + fi + + if [[ "$state" == "stopped" ]]; then + apply_fallback "training_stopped_before_gate" + exit 0 + fi + + sleep "$POLL_SECONDS" +done From 758927770e1f80230be55315c1c5e2661994a6ae Mon Sep 17 00:00:00 2001 From: Ogulcan Aydogan Date: Sat, 7 Mar 2026 22:27:22 +0000 Subject: [PATCH 3/3] feat: add v100 training configs, systemd services and operational scripts - Add Turkcell 7B v100 model configs (v2 stable, v3 ultrastable, fallbacks) - Add vLLM v100 merged serving config - Add systemd training service, watchdog service and timer - Add training monitoring, watchdog and completion scripts --- .../models/turkcell_7b_v100_v2_stable.yaml | 37 +++ .../models/turkcell_7b_v100_v2b_fallback.yaml | 9 + .../turkcell_7b_v100_v3_ultrastable.yaml | 37 +++ .../models/turkcell_7b_v100_v3b_fallback.yaml | 9 + configs/serving/vllm_v100_v3_merged.yaml | 13 + deploy/systemd/forge-v100-training.service | 21 ++ deploy/systemd/forge-v100-watchdog.service | 8 + deploy/systemd/forge-v100-watchdog.timer | 12 + scripts/install_v100_watchdog.sh | 52 ++++ scripts/monitor_training.sh | 7 + scripts/monitor_v100_training.sh | 105 +++++++ scripts/run_v100_completion.sh | 120 ++++++++ scripts/start_or_resume_full_training.sh | 57 ++++ scripts/start_v100_training.sh | 167 +++++++++++ scripts/watchdog_training.sh | 271 ++++++++++++++++++ 15 files changed, 925 insertions(+) create mode 100644 configs/models/turkcell_7b_v100_v2_stable.yaml create mode 100644 configs/models/turkcell_7b_v100_v2b_fallback.yaml create mode 100644 configs/models/turkcell_7b_v100_v3_ultrastable.yaml create mode 
100644 configs/models/turkcell_7b_v100_v3b_fallback.yaml create mode 100644 configs/serving/vllm_v100_v3_merged.yaml create mode 100644 deploy/systemd/forge-v100-training.service create mode 100644 deploy/systemd/forge-v100-watchdog.service create mode 100644 deploy/systemd/forge-v100-watchdog.timer create mode 100755 scripts/install_v100_watchdog.sh create mode 100755 scripts/monitor_training.sh create mode 100755 scripts/monitor_v100_training.sh create mode 100644 scripts/run_v100_completion.sh create mode 100755 scripts/start_or_resume_full_training.sh create mode 100755 scripts/start_v100_training.sh create mode 100644 scripts/watchdog_training.sh diff --git a/configs/models/turkcell_7b_v100_v2_stable.yaml b/configs/models/turkcell_7b_v100_v2_stable.yaml new file mode 100644 index 0000000..5a6f42b --- /dev/null +++ b/configs/models/turkcell_7b_v100_v2_stable.yaml @@ -0,0 +1,37 @@ +# V100 stable recovery profile (clean restart) +_base: "../base.yaml" + +model: + name: "TURKCELL/Turkcell-LLM-7b-v1" + max_seq_length: 2048 + dtype: "float16" + +training: + num_epochs: 3 + max_steps: 8601 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 8 + learning_rate: 5.0e-5 + warmup_ratio: 0.10 + lr_scheduler_type: "cosine" + weight_decay: 0.01 + max_grad_norm: 0.3 + logging_steps: 10 + eval_steps: 500 + save_steps: 250 + save_total_limit: 20 + fp16: true + bf16: false + +lora: + r: 32 + alpha: 64 + +data: + train_path: "data/processed/turkish_sft.jsonl" + eval_path: "data/processed/turkish_eval.jsonl" + +wandb: + project: "lowresource-llm-forge" + run_name: "turkcell-7b-sft-v100-v2-stable" + enabled: false diff --git a/configs/models/turkcell_7b_v100_v2b_fallback.yaml b/configs/models/turkcell_7b_v100_v2b_fallback.yaml new file mode 100644 index 0000000..4ba8869 --- /dev/null +++ b/configs/models/turkcell_7b_v100_v2b_fallback.yaml @@ -0,0 +1,9 @@ +# V100 fallback profile (single retry after NaN) +_base: "turkcell_7b_v100_v2_stable.yaml" + +training: + 
learning_rate: 3.0e-5 + max_grad_norm: 0.2 + +wandb: + run_name: "turkcell-7b-sft-v100-v2b-fallback" diff --git a/configs/models/turkcell_7b_v100_v3_ultrastable.yaml b/configs/models/turkcell_7b_v100_v3_ultrastable.yaml new file mode 100644 index 0000000..042ff13 --- /dev/null +++ b/configs/models/turkcell_7b_v100_v3_ultrastable.yaml @@ -0,0 +1,37 @@ +# V100 ultra-stable profile (clean restart) +_base: "../base.yaml" + +model: + name: "TURKCELL/Turkcell-LLM-7b-v1" + max_seq_length: 2048 + dtype: "float16" + +training: + num_epochs: 3 + max_steps: 8601 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 8 + learning_rate: 2.0e-5 + warmup_ratio: 0.12 + lr_scheduler_type: "cosine" + weight_decay: 0.01 + max_grad_norm: 0.2 + logging_steps: 10 + eval_steps: 250 + save_steps: 250 + save_total_limit: 30 + fp16: true + bf16: false + +lora: + r: 32 + alpha: 64 + +data: + train_path: "data/processed/turkish_sft.jsonl" + eval_path: "data/processed/turkish_eval.jsonl" + +wandb: + project: "lowresource-llm-forge" + run_name: "turkcell-7b-sft-v100-v3-ultrastable" + enabled: false diff --git a/configs/models/turkcell_7b_v100_v3b_fallback.yaml b/configs/models/turkcell_7b_v100_v3b_fallback.yaml new file mode 100644 index 0000000..f1763c6 --- /dev/null +++ b/configs/models/turkcell_7b_v100_v3b_fallback.yaml @@ -0,0 +1,9 @@ +# V100 fallback profile (single retry after NaN) +_base: "turkcell_7b_v100_v3_ultrastable.yaml" + +training: + learning_rate: 1.0e-5 + max_grad_norm: 0.1 + +wandb: + run_name: "turkcell-7b-sft-v100-v3b-fallback" diff --git a/configs/serving/vllm_v100_v3_merged.yaml b/configs/serving/vllm_v100_v3_merged.yaml new file mode 100644 index 0000000..a87bf08 --- /dev/null +++ b/configs/serving/vllm_v100_v3_merged.yaml @@ -0,0 +1,13 @@ +# vLLM serving configuration for V100 merged v3 model + +model_path: "artifacts/merged/turkcell-7b-v100-v3-ultrastable" +host: "0.0.0.0" +port: 18040 +tensor_parallel_size: 1 +gpu_memory_utilization: 0.75 +max_model_len: 4096 
+dtype: "float16" +enable_prefix_caching: true +trust_remote_code: false +enforce_eager: false +max_num_seqs: 48 diff --git a/deploy/systemd/forge-v100-training.service b/deploy/systemd/forge-v100-training.service new file mode 100644 index 0000000..aaa3438 --- /dev/null +++ b/deploy/systemd/forge-v100-training.service @@ -0,0 +1,21 @@ +[Unit] +Description=LowResource-LLM-Forge V100 Training +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=%h/projects/LowResource-LLM-Forge +Environment=PYTHONUNBUFFERED=1 +EnvironmentFile=-%h/.config/forge/v100_training.env +ExecStart=%h/projects/LowResource-LLM-Forge/scripts/start_v100_training.sh +Restart=on-failure +RestartSec=20 +StartLimitIntervalSec=600 +StartLimitBurst=3 +KillMode=control-group +StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/systemd_v100_training.log +StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/systemd_v100_training.log + +[Install] +WantedBy=default.target diff --git a/deploy/systemd/forge-v100-watchdog.service b/deploy/systemd/forge-v100-watchdog.service new file mode 100644 index 0000000..80613ec --- /dev/null +++ b/deploy/systemd/forge-v100-watchdog.service @@ -0,0 +1,8 @@ +[Unit] +Description=LowResource-LLM-Forge V100 Training Watchdog +After=network-online.target + +[Service] +Type=oneshot +WorkingDirectory=%h/projects/LowResource-LLM-Forge +ExecStart=/usr/bin/env bash %h/projects/LowResource-LLM-Forge/scripts/watchdog_training.sh diff --git a/deploy/systemd/forge-v100-watchdog.timer b/deploy/systemd/forge-v100-watchdog.timer new file mode 100644 index 0000000..537ea77 --- /dev/null +++ b/deploy/systemd/forge-v100-watchdog.timer @@ -0,0 +1,12 @@ +[Unit] +Description=Run V100 training watchdog every 30 seconds + +[Timer] +OnBootSec=1min +OnUnitActiveSec=30s +AccuracySec=5s +Unit=forge-v100-watchdog.service +Persistent=true + +[Install] +WantedBy=timers.target diff --git 
a/scripts/install_v100_watchdog.sh b/scripts/install_v100_watchdog.sh new file mode 100755 index 0000000..dbe62f8 --- /dev/null +++ b/scripts/install_v100_watchdog.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +SYSTEMD_USER_DIR="${SYSTEMD_USER_DIR:-$HOME/.config/systemd/user}" +FORGE_ENV_DIR="${FORGE_ENV_DIR:-$HOME/.config/forge}" +FORGE_ENV_FILE="${FORGE_ENV_FILE:-$FORGE_ENV_DIR/v100_training.env}" + +mkdir -p "$SYSTEMD_USER_DIR" "$FORGE_ENV_DIR" "$PROJECT_ROOT/artifacts/logs" + +if [[ ! -f "$FORGE_ENV_FILE" ]]; then + cat >"$FORGE_ENV_FILE" <<'ENVEOF' +# V100 runtime contract +TRAIN_CONFIG=configs/models/turkcell_7b_v100_v3_ultrastable.yaml +FALLBACK_CONFIG=configs/models/turkcell_7b_v100_v3b_fallback.yaml +TARGET_STEPS=8601 +SAVE_STEPS=250 +ENABLE_RESUME=0 +RESUME_AFTER_STEP=500 +REQUIRE_WANDB=0 +CUDA_VISIBLE_DEVICES=0 +ENVEOF + chmod 600 "$FORGE_ENV_FILE" +fi + +install -m 0644 \ + "$PROJECT_ROOT/deploy/systemd/forge-v100-training.service" \ + "$SYSTEMD_USER_DIR/forge-v100-training.service" +install -m 0644 \ + "$PROJECT_ROOT/deploy/systemd/forge-v100-watchdog.service" \ + "$SYSTEMD_USER_DIR/forge-v100-watchdog.service" +install -m 0644 \ + "$PROJECT_ROOT/deploy/systemd/forge-v100-watchdog.timer" \ + "$SYSTEMD_USER_DIR/forge-v100-watchdog.timer" + +chmod +x \ + "$PROJECT_ROOT/scripts/start_v100_training.sh" \ + "$PROJECT_ROOT/scripts/start_or_resume_full_training.sh" \ + "$PROJECT_ROOT/scripts/monitor_v100_training.sh" \ + "$PROJECT_ROOT/scripts/monitor_training.sh" \ + "$PROJECT_ROOT/scripts/watchdog_training.sh" \ + "$PROJECT_ROOT/scripts/run_v100_completion.sh" + +systemctl --user daemon-reload +systemctl --user enable forge-v100-training.service +systemctl --user enable --now forge-v100-watchdog.timer +systemctl --user restart forge-v100-watchdog.service +systemctl --user --no-pager --lines=20 status forge-v100-training.service || true +systemctl --user --no-pager 
--lines=20 status forge-v100-watchdog.timer || true +systemctl --user --no-pager --lines=20 status forge-v100-watchdog.service || true + +echo "V100 watchdog installed. Env file: $FORGE_ENV_FILE" diff --git a/scripts/monitor_training.sh b/scripts/monitor_training.sh new file mode 100755 index 0000000..292b30f --- /dev/null +++ b/scripts/monitor_training.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +cd "$PROJECT_ROOT" + +bash scripts/monitor_v100_training.sh diff --git a/scripts/monitor_v100_training.sh b/scripts/monitor_v100_training.sh new file mode 100755 index 0000000..91a68f3 --- /dev/null +++ b/scripts/monitor_v100_training.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}" +ACTIVE_RUN_FILE="${ACTIVE_RUN_FILE:-$PROJECT_ROOT/artifacts/logs/v100_active_run.env}" +STATUS_FILE="${STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_monitor_status.txt}" + +if [[ -f "$FORGE_ENV_FILE" ]]; then + # shellcheck disable=SC1090 + source "$FORGE_ENV_FILE" +fi + +if [[ -f "$ACTIVE_RUN_FILE" ]]; then + # shellcheck disable=SC1090 + source "$ACTIVE_RUN_FILE" +fi + +cd "$PROJECT_ROOT" + +TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}" +TARGET_STEPS="${TARGET_STEPS:-8601}" +TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_full.log}" +RUN_ID="${RUN_ID:-unknown}" + +abs_path() { + local path="$1" + if [[ "$path" = /* ]]; then + echo "$path" + else + echo "$PROJECT_ROOT/$path" + fi +} + +LOG_FILE="$(abs_path "$TRAIN_LOG")" + +running="no" +if pgrep -f "run_training.py --config ${TRAIN_CONFIG}" >/dev/null 2>&1 || pgrep -f "scripts/run_training.py" >/dev/null 2>&1; then + running="yes" +fi + +log_start_line=1 +if [[ -f "$LOG_FILE" ]]; then + if [[ "$RUN_ID" != "unknown" ]]; then + marker_line="$(grep -a -n 
"forge-training-start run_id=${RUN_ID}" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)" + else + marker_line="$(grep -a -n "forge-training-start" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)" + fi + if [[ "$marker_line" =~ ^[0-9]+$ ]] && [[ "$marker_line" -gt 0 ]]; then + log_start_line=$((marker_line + 1)) + fi +fi + +progress="none" +if [[ -f "$LOG_FILE" ]]; then + progress="$(tail -n +"$log_start_line" "$LOG_FILE" | grep -a -oE "[0-9]+/${TARGET_STEPS}" | tail -n 1 || true)" + [[ -z "$progress" ]] && progress="none" +fi + +step=0 +if [[ "$progress" != "none" ]]; then + step="${progress%%/*}" +fi + +percent=0 +if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$TARGET_STEPS" -gt 0 ]]; then + percent=$((step * 100 / TARGET_STEPS)) +fi + +nan_hits=0 +if [[ -f "$LOG_FILE" ]]; then + metric_nan_count="$(tail -n +"$log_start_line" "$LOG_FILE" | grep -a -E -i -c "'(loss|grad_norm|eval_loss|entropy)':[[:space:]]*'?(nan|inf)'?" || true)" + guard_nan_count="$(tail -n +"$log_start_line" "$LOG_FILE" | grep -a -E -c "nan_guard_detected|nan_guard_stopping_training" || true)" + metric_nan_count="${metric_nan_count:-0}" + guard_nan_count="${guard_nan_count:-0}" + nan_hits=$((metric_nan_count + guard_nan_count)) +fi + +gpu_line="$(nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total --format=csv,noheader 2>/dev/null | sed -n '1p' || true)" +if [[ -z "$gpu_line" ]]; then + gpu_line="unknown" +fi + +state="stopped" +if [[ "$running" == "yes" ]]; then + state="running" +fi +if [[ "$step" =~ ^[0-9]+$ ]] && [[ "$step" -ge "$TARGET_STEPS" ]]; then + state="completed" +fi + +{ + echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "running=$running" + echo "step=$step" + echo "target_steps=$TARGET_STEPS" + echo "progress=$progress" + echo "percent=$percent" + echo "nan_hits=$nan_hits" + echo "run_segment_id=$RUN_ID" + echo "train_config=$TRAIN_CONFIG" + echo "log_file=$TRAIN_LOG" + echo "gpu=$gpu_line" + echo "state=$state" +} >"$STATUS_FILE" diff --git 
a/scripts/run_v100_completion.sh b/scripts/run_v100_completion.sh new file mode 100644 index 0000000..aa3a160 --- /dev/null +++ b/scripts/run_v100_completion.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}" +MONITOR_STATUS_FILE="${MONITOR_STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_monitor_status.txt}" +WATCHDOG_STATUS_FILE="${WATCHDOG_STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_watchdog_status.txt}" +SUMMARY_FILE="${SUMMARY_FILE:-$PROJECT_ROOT/artifacts/logs/v100_completion_summary.md}" +LOCK_DIR="${LOCK_DIR:-$PROJECT_ROOT/artifacts/logs/v100_completion.lock.d}" +DONE_FILE="${DONE_FILE:-$PROJECT_ROOT/artifacts/logs/v100_completion.done}" +UV_BIN="${UV_BIN:-$HOME/.local/bin/uv}" + +cd "$PROJECT_ROOT" + +if [[ -f "$DONE_FILE" ]]; then + exit 0 +fi + +if ! mkdir "$LOCK_DIR" 2>/dev/null; then + exit 0 +fi +trap 'rmdir "$LOCK_DIR" >/dev/null 2>&1 || true' EXIT + +if [[ ! -f "$MONITOR_STATUS_FILE" ]]; then + exit 0 +fi + +target_steps="$(grep -E '^target_steps=' "$MONITOR_STATUS_FILE" | tail -n 1 | cut -d '=' -f 2 || true)" +step="$(grep -E '^step=' "$MONITOR_STATUS_FILE" | tail -n 1 | cut -d '=' -f 2 || true)" +state="$(grep -E '^state=' "$MONITOR_STATUS_FILE" | tail -n 1 | cut -d '=' -f 2 || true)" + +if [[ ! "$target_steps" =~ ^[0-9]+$ ]]; then + target_steps=8601 +fi +if [[ ! 
"$step" =~ ^[0-9]+$ ]]; then + step=0 +fi + +if [[ "$state" != "completed" ]] && [[ "$step" -lt "$target_steps" ]]; then + exit 0 +fi + +if [[ -f "$FORGE_ENV_FILE" ]]; then + # shellcheck disable=SC1090 + source "$FORGE_ENV_FILE" +fi + +TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}" +RUN_DIR="${RUN_DIR:-artifacts/training/turkcell-7b-sft-v100-v3-ultrastable}" +TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_turkcell_7b_v100_v3_ultrastable.log}" +ADAPTER_DIR="${ADAPTER_DIR:-$RUN_DIR/final}" + +BASE_MODEL="${BASE_MODEL:-TURKCELL/Turkcell-LLM-7b-v1}" +MERGED_OUTPUT="${MERGED_OUTPUT:-artifacts/merged/turkcell-7b-v100-v3-ultrastable}" +EVAL_OUTPUT_ROOT="${EVAL_OUTPUT_ROOT:-artifacts/eval/turkcell-7b-v100-v3-ultrastable}" +SERVE_CONFIG="${SERVE_CONFIG:-configs/serving/vllm_v100_v3_merged.yaml}" +BENCHMARK_OUTPUT_DIR="${BENCHMARK_OUTPUT_DIR:-artifacts/benchmarks/turkcell-7b-v100-v3-ultrastable}" + +export UV_BIN +export TRAIN_CONFIG +export RUN_DIR +export TRAIN_LOG +export ADAPTER_DIR +export BASE_MODEL +export MERGED_OUTPUT +export EVAL_OUTPUT_ROOT +export SERVE_CONFIG +export BENCHMARK_OUTPUT_DIR +export PUSH_TO_HUB=0 +export AUTO_START_SERVE=1 +export FORGE_EXECUTION_CONTEXT=remote + +pipeline_status="success" +pipeline_error="" +if ! 
bash "$PROJECT_ROOT/scripts/post_training_pipeline.sh"; then + pipeline_status="failed" + pipeline_error="post_training_pipeline_failed" +fi + +timestamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)" +watchdog_snapshot="" +if [[ -f "$WATCHDOG_STATUS_FILE" ]]; then + watchdog_snapshot="$(cat "$WATCHDOG_STATUS_FILE")" +fi + +mkdir -p "$(dirname "$SUMMARY_FILE")" +cat >"$SUMMARY_FILE" <>"$SUMMARY_FILE" + exit 1 +fi diff --git a/scripts/start_or_resume_full_training.sh b/scripts/start_or_resume_full_training.sh new file mode 100755 index 0000000..6ed9bcc --- /dev/null +++ b/scripts/start_or_resume_full_training.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}" +PID_FILE="${PID_FILE:-$PROJECT_ROOT/artifacts/logs/training_full.pid}" +TRAINING_SERVICE_NAME="${TRAINING_SERVICE_NAME:-forge-v100-training.service}" +FORCE_RESTART="${FORCE_RESTART:-0}" + +if [[ -f "$FORGE_ENV_FILE" ]]; then + # shellcheck disable=SC1090 + source "$FORGE_ENV_FILE" +fi + +cd "$PROJECT_ROOT" +TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}" +RUN_PATTERN="run_training.py --config ${TRAIN_CONFIG}" + +mkdir -p "$(dirname "$PID_FILE")" + +if [[ "$FORCE_RESTART" != "1" ]] && systemctl --user is-active --quiet "$TRAINING_SERVICE_NAME"; then + echo "training_service_active" + systemctl --user --no-pager --lines=0 status "$TRAINING_SERVICE_NAME" | head -n 1 || true + exit 0 +fi + +if [[ "$FORCE_RESTART" != "1" ]] && pgrep -f "$RUN_PATTERN" >/dev/null 2>&1; then + echo "training_already_running" + pgrep -af "$RUN_PATTERN" | head -n 4 + exit 0 +fi + +RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}" +systemctl --user set-environment RUN_ID="$RUN_ID" +if [[ "$FORCE_RESTART" == "1" ]]; then + systemctl --user restart "$TRAINING_SERVICE_NAME" +else + systemctl --user start "$TRAINING_SERVICE_NAME" +fi + +sleep 2 
+service_state="$(systemctl --user is-active "$TRAINING_SERVICE_NAME" || true)" +if [[ "$service_state" != "active" ]] && [[ "$service_state" != "activating" ]]; then + echo "training_service_failed" + systemctl --user --no-pager --lines=50 status "$TRAINING_SERVICE_NAME" || true + exit 1 +fi + +if pgrep -f "scripts/run_training.py" >/dev/null 2>&1; then + pgrep -f "scripts/run_training.py" | head -n 1 > "$PID_FILE" +fi + +echo "service=$TRAINING_SERVICE_NAME" +echo "service_state=$service_state" +echo "run_id=$RUN_ID" +echo "train_config=$TRAIN_CONFIG" +echo "pid_file=$PID_FILE" diff --git a/scripts/start_v100_training.sh b/scripts/start_v100_training.sh new file mode 100755 index 0000000..c15aecc --- /dev/null +++ b/scripts/start_v100_training.sh @@ -0,0 +1,167 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}" + +if [[ -f "$FORGE_ENV_FILE" ]]; then + # shellcheck disable=SC1090 + source "$FORGE_ENV_FILE" +fi + +cd "$PROJECT_ROOT" + +TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}" +FALLBACK_CONFIG="${FALLBACK_CONFIG:-configs/models/turkcell_7b_v100_v3b_fallback.yaml}" +TARGET_STEPS="${TARGET_STEPS:-8601}" +SAVE_STEPS="${SAVE_STEPS:-250}" +ENABLE_RESUME="${ENABLE_RESUME:-0}" +REQUIRE_WANDB="${REQUIRE_WANDB:-0}" +RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}" +CONFIG_BASENAME="$(basename "$TRAIN_CONFIG")" +CONFIG_SLUG="${CONFIG_BASENAME%.*}" +TRAIN_LOG_DIR="${TRAIN_LOG_DIR:-artifacts/logs}" +ACTIVE_RUN_FILE="${ACTIVE_RUN_FILE:-artifacts/logs/v100_active_run.env}" +STATUS_FILE="${STATUS_FILE:-artifacts/logs/training_monitor_status.txt}" +PID_FILE="${PID_FILE:-artifacts/logs/training_full.pid}" +UV_BIN="${UV_BIN:-$HOME/.local/bin/uv}" +HF_HOME_DIR="${HF_HOME_DIR:-$PROJECT_ROOT/.hf_cache}" +HF_DATASETS_CACHE_DIR="${HF_DATASETS_CACHE_DIR:-$HF_HOME_DIR/datasets}" 
+HF_HUB_CACHE_DIR="${HF_HUB_CACHE_DIR:-$HF_HOME_DIR/hub}" +CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" + +if [[ -z "${TRAIN_RUN_DIR:-}" ]]; then + case "$TRAIN_CONFIG" in + *turkcell_7b_v100_v3b_fallback.yaml) + TRAIN_RUN_DIR="artifacts/training/turkcell-7b-sft-v100-v3b-fallback" + ;; + *turkcell_7b_v100_v3_ultrastable.yaml) + TRAIN_RUN_DIR="artifacts/training/turkcell-7b-sft-v100-v3-ultrastable" + ;; + *turkcell_7b_v100_v2b_fallback.yaml) + TRAIN_RUN_DIR="artifacts/training/turkcell-7b-sft-v100-v2b-fallback" + ;; + *) + TRAIN_RUN_DIR="artifacts/training/turkcell-7b-sft-v100-v2-stable" + ;; + esac +fi + +TRAIN_LOG="${TRAIN_LOG:-$TRAIN_LOG_DIR/training_${CONFIG_SLUG}_${RUN_ID}.log}" + +abs_path() { + local path="$1" + if [[ "$path" = /* ]]; then + echo "$path" + else + echo "$PROJECT_ROOT/$path" + fi +} + +TRAIN_RUN_DIR_ABS="$(abs_path "$TRAIN_RUN_DIR")" +TRAIN_LOG_ABS="$(abs_path "$TRAIN_LOG")" +ACTIVE_RUN_FILE_ABS="$(abs_path "$ACTIVE_RUN_FILE")" +PID_FILE_ABS="$(abs_path "$PID_FILE")" + +mkdir -p \ + "$(dirname "$TRAIN_RUN_DIR_ABS")" \ + "$(dirname "$TRAIN_LOG_ABS")" \ + "$(dirname "$ACTIVE_RUN_FILE_ABS")" \ + "$(dirname "$PID_FILE_ABS")" \ + "$HF_HOME_DIR" \ + "$HF_DATASETS_CACHE_DIR" \ + "$HF_HUB_CACHE_DIR" + +if [[ ! -x "$UV_BIN" ]]; then + echo "UV executable not found: $UV_BIN" >&2 + exit 1 +fi + +if [[ "$REQUIRE_WANDB" == "1" ]] && [[ -z "${WANDB_API_KEY:-}" ]]; then + echo "WANDB_API_KEY is required for this run (REQUIRE_WANDB=1)." >&2 + exit 1 +fi + +find_latest_checkpoint() { + if [[ ! 
-d "$TRAIN_RUN_DIR_ABS" ]]; then + return + fi + + local current_step="" + if [[ -f "$STATUS_FILE" ]]; then + current_step="$(grep -E '^step=' "$STATUS_FILE" | tail -n 1 | cut -d '=' -f 2 || true)" + fi + + if [[ "$current_step" =~ ^[0-9]+$ ]] && [[ "$current_step" -gt 0 ]]; then + local filtered_checkpoint + filtered_checkpoint="$(find "$TRAIN_RUN_DIR_ABS" -maxdepth 1 -type d -name 'checkpoint-*' 2>/dev/null \ + | sed -E 's#(.*checkpoint-)([0-9]+)$#\2 \1\2#' \ + | awk -v s="$current_step" '$1 <= s' \ + | sort -n \ + | tail -n 1 \ + | cut -d ' ' -f 2- || true)" + if [[ -n "$filtered_checkpoint" ]]; then + echo "$filtered_checkpoint" + return + fi + fi + + find "$TRAIN_RUN_DIR_ABS" -maxdepth 1 -type d -name 'checkpoint-*' | sort -V | tail -n 1 +} + +resume_from="" +if [[ "$ENABLE_RESUME" == "1" ]]; then + latest_checkpoint="$(find_latest_checkpoint || true)" + if [[ -n "$latest_checkpoint" ]]; then + resume_from="$latest_checkpoint" + fi +fi + +cmd=("$UV_BIN" "run" "python" "scripts/run_training.py" "--config" "$TRAIN_CONFIG") +if [[ -n "$resume_from" ]]; then + cmd+=("--resume-from" "$resume_from") +fi + +ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)" +{ + echo "[$ts] forge-training-start run_id=$RUN_ID" + echo "run_id=$RUN_ID" + echo "train_config=$TRAIN_CONFIG" + echo "fallback_config=$FALLBACK_CONFIG" + echo "target_steps=$TARGET_STEPS" + echo "save_steps=$SAVE_STEPS" + echo "train_run_dir=$TRAIN_RUN_DIR" + echo "train_log=$TRAIN_LOG" + echo "resume_from=${resume_from:-none}" + echo "enable_resume=$ENABLE_RESUME" + echo "require_wandb=$REQUIRE_WANDB" + echo "command=${cmd[*]}" +} >>"$TRAIN_LOG_ABS" + +{ + echo "RUN_ID=$RUN_ID" + echo "TRAIN_CONFIG=$TRAIN_CONFIG" + echo "FALLBACK_CONFIG=$FALLBACK_CONFIG" + echo "TARGET_STEPS=$TARGET_STEPS" + echo "SAVE_STEPS=$SAVE_STEPS" + echo "TRAIN_RUN_DIR=$TRAIN_RUN_DIR" + echo "TRAIN_LOG=$TRAIN_LOG" + echo "RESUME_FROM=${resume_from:-}" + echo "ENABLE_RESUME=$ENABLE_RESUME" + echo "REQUIRE_WANDB=$REQUIRE_WANDB" + echo 
"CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" +} >"$ACTIVE_RUN_FILE_ABS" + +echo "$$" >"$PID_FILE_ABS" + +exec > >(tee -a "$TRAIN_LOG_ABS") 2>&1 + +echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] forge-training-exec run_id=$RUN_ID" + +exec env \ + FORGE_EXECUTION_CONTEXT=remote \ + HF_HOME="$HF_HOME_DIR" \ + HF_DATASETS_CACHE="$HF_DATASETS_CACHE_DIR" \ + HUGGINGFACE_HUB_CACHE="$HF_HUB_CACHE_DIR" \ + CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" \ + "${cmd[@]}" diff --git a/scripts/watchdog_training.sh b/scripts/watchdog_training.sh new file mode 100644 index 0000000..2829fa0 --- /dev/null +++ b/scripts/watchdog_training.sh @@ -0,0 +1,271 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT="${PROJECT_ROOT:-$HOME/projects/LowResource-LLM-Forge}" +FORGE_ENV_FILE="${FORGE_ENV_FILE:-$HOME/.config/forge/v100_training.env}" +ACTIVE_RUN_FILE="${ACTIVE_RUN_FILE:-$PROJECT_ROOT/artifacts/logs/v100_active_run.env}" +STATUS_FILE="${STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_watchdog_status.txt}" +STATE_FILE="${STATE_FILE:-$PROJECT_ROOT/artifacts/logs/training_watchdog_state.env}" +MONITOR_STATUS_FILE="${MONITOR_STATUS_FILE:-$PROJECT_ROOT/artifacts/logs/training_monitor_status.txt}" +TRAINING_SERVICE_NAME="${TRAINING_SERVICE_NAME:-forge-v100-training.service}" +COMPLETION_SCRIPT="${COMPLETION_SCRIPT:-$PROJECT_ROOT/scripts/run_v100_completion.sh}" + +if [[ -f "$FORGE_ENV_FILE" ]]; then + # shellcheck disable=SC1090 + source "$FORGE_ENV_FILE" +fi +if [[ -f "$ACTIVE_RUN_FILE" ]]; then + # shellcheck disable=SC1090 + source "$ACTIVE_RUN_FILE" +fi + +cd "$PROJECT_ROOT" + +TRAIN_CONFIG="${TRAIN_CONFIG:-configs/models/turkcell_7b_v100_v3_ultrastable.yaml}" +FALLBACK_CONFIG="${FALLBACK_CONFIG:-configs/models/turkcell_7b_v100_v3b_fallback.yaml}" +TARGET_STEPS="${TARGET_STEPS:-8601}" +MAX_IDLE_SECONDS="${MAX_IDLE_SECONDS:-5400}" +MAX_LOG_STALE_SECONDS="${MAX_LOG_STALE_SECONDS:-900}" +RESUME_AFTER_STEP="${RESUME_AFTER_STEP:-500}" +RUN_ID="${RUN_ID:-unknown}" 
+TRAIN_LOG="${TRAIN_LOG:-artifacts/logs/training_full.log}" +TOPIC="${FORGE_NOTIFY_TOPIC:-weezboo-forge-training}" +NTFY_URL="${FORGE_NOTIFY_URL:-https://ntfy.sh/${TOPIC}}" + +abs_path() { + local path="$1" + if [[ "$path" = /* ]]; then + echo "$path" + else + echo "$PROJECT_ROOT/$path" + fi +} + +upsert_env() { + local key="$1" + local value="$2" + local file="$3" + mkdir -p "$(dirname "$file")" + if [[ -f "$file" ]] && grep -q "^${key}=" "$file"; then + sed -i "s#^${key}=.*#${key}=${value}#" "$file" + else + echo "${key}=${value}" >>"$file" + fi +} + +send_notify() { + local title="$1" + local body="$2" + local tags="${3:-warning}" + curl -fsS -m 20 \ + -H "Title: ${title}" \ + -H "Tags: ${tags}" \ + -H "Priority: high" \ + -d "$body" \ + "$NTFY_URL" >/dev/null || true +} + +LOG_FILE="$(abs_path "$TRAIN_LOG")" +STATUS_FILE_ABS="$(abs_path "$STATUS_FILE")" +STATE_FILE_ABS="$(abs_path "$STATE_FILE")" +FORGE_ENV_FILE_ABS="$(abs_path "$FORGE_ENV_FILE")" +MONITOR_STATUS_FILE_ABS="$(abs_path "$MONITOR_STATUS_FILE")" +mkdir -p "$(dirname "$STATUS_FILE_ABS")" + +fallback_applied=0 +fatal_stopped=0 +last_step=0 +last_step_ts=0 +nan_hits=0 +resume_armed=0 +active_profile="primary" +last_nan_signature="" +posttrain_triggered=0 + +if [[ -f "$STATE_FILE_ABS" ]]; then + # shellcheck disable=SC1090 + source "$STATE_FILE_ABS" || true +fi + +segment_start_line=1 +if [[ -f "$LOG_FILE" ]]; then + if [[ "$RUN_ID" != "unknown" ]]; then + marker_line="$(grep -a -n "forge-training-start run_id=${RUN_ID}" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)" + else + marker_line="$(grep -a -n "forge-training-start" "$LOG_FILE" | tail -n 1 | cut -d ':' -f 1 || true)" + fi + if [[ "$marker_line" =~ ^[0-9]+$ ]] && [[ "$marker_line" -gt 0 ]]; then + segment_start_line=$((marker_line + 1)) + fi +fi + +progress="none" +if [[ -f "$LOG_FILE" ]]; then + progress="$(tail -n +"$segment_start_line" "$LOG_FILE" | grep -a -oE "[0-9]+/${TARGET_STEPS}" | tail -n 1 || true)" + [[ -z "$progress" ]] && 
progress="none" +fi + +step=0 +if [[ "$progress" != "none" ]]; then + step="${progress%%/*}" +fi + +service_state="$(systemctl --user is-active "$TRAINING_SERVICE_NAME" 2>/dev/null || true)" +running="no" +if [[ "$service_state" == "active" ]] || [[ "$service_state" == "activating" ]]; then + running="yes" +fi + +now_epoch="$(date +%s)" +if [[ -f "$LOG_FILE" ]]; then + log_mtime_epoch="$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0)" +else + log_mtime_epoch=0 +fi +log_age="$((now_epoch - log_mtime_epoch))" + +if [[ "$step" -gt "$last_step" ]]; then + last_step="$step" + last_step_ts="$now_epoch" +elif [[ "$last_step_ts" -eq 0 ]]; then + last_step_ts="$now_epoch" +fi + +latest_bad_line="" +if [[ -f "$LOG_FILE" ]]; then + latest_bad_line="$(tail -n +"$segment_start_line" "$LOG_FILE" | grep -a -E -i "'(loss|grad_norm|eval_loss|entropy)':[[:space:]]*'?(nan|inf)'?|nan_guard_detected|nan_guard_stopping_training" | tail -n 1 || true)" +fi + +new_nan_event="no" +if [[ -n "$latest_bad_line" ]]; then + signature="$(printf '%s' "$latest_bad_line" | sha256sum | awk '{print $1}')" + if [[ "$signature" != "$last_nan_signature" ]]; then + last_nan_signature="$signature" + nan_hits=$((nan_hits + 1)) + new_nan_event="yes" + fi +fi + +stop_training() { + systemctl --user stop "$TRAINING_SERVICE_NAME" >/dev/null 2>&1 || true + pkill -f "scripts/run_training.py --config" >/dev/null 2>&1 || true + sleep 2 +} + +start_service_with_config() { + local cfg="$1" + local enable_resume="$2" + local run_id + + upsert_env "TRAIN_CONFIG" "$cfg" "$FORGE_ENV_FILE_ABS" + upsert_env "ENABLE_RESUME" "$enable_resume" "$FORGE_ENV_FILE_ABS" + + run_id="$(date -u +%Y%m%dT%H%M%SZ)" + systemctl --user set-environment RUN_ID="$run_id" + systemctl --user start "$TRAINING_SERVICE_NAME" + + RUN_ID="$run_id" + TRAIN_CONFIG="$cfg" +} + +action="none" +message="ok" + +if [[ "$fatal_stopped" -eq 1 ]]; then + action="fatal_locked" + message="watchdog_fatal_stop_active" +else + if [[ "$step" -ge "$TARGET_STEPS" ]]; 
then + action="completed" + message="target_step_reached" + if [[ "$posttrain_triggered" -eq 0 ]] && [[ -x "$COMPLETION_SCRIPT" ]]; then + if bash "$COMPLETION_SCRIPT"; then + posttrain_triggered=1 + action="posttrain_triggered" + message="completion_pipeline_started" + else + action="posttrain_failed" + message="completion_pipeline_failed" + send_notify "LLM v100 completion failed" "Post-training pipeline failed after step completion." "warning,rotating_light" + fi + fi + elif [[ "$new_nan_event" == "yes" ]]; then + if [[ "$fallback_applied" -eq 0 ]] && [[ "$active_profile" != "fallback" ]]; then + stop_training + start_service_with_config "$FALLBACK_CONFIG" "0" + fallback_applied=1 + active_profile="fallback" + nan_hits=0 + action="fallback_restart" + message="nan detected on primary; fallback started" + send_notify "LLM v100 fallback restart" "NaN/Inf detected on primary profile. Restarted with fallback config." "warning,repeat" + last_step_ts="$now_epoch" + else + stop_training + fatal_stopped=1 + action="fatal_stop_after_fallback" + message="nan detected on fallback; training stopped" + send_notify "LLM v100 stopped" "NaN/Inf detected on fallback. Fatal lock enabled." "rotating_light,warning" + fi + else + idle_seconds="$((now_epoch - last_step_ts))" + if [[ "$running" == "yes" ]] && [[ "$idle_seconds" -ge "$MAX_IDLE_SECONDS" ]] && [[ "$log_age" -ge "$MAX_LOG_STALE_SECONDS" ]]; then + stop_training + if [[ "$active_profile" == "fallback" ]]; then + start_service_with_config "$FALLBACK_CONFIG" "0" + else + start_service_with_config "$TRAIN_CONFIG" "0" + fi + action="restart_stalled" + message="stalled run restarted" + send_notify "LLM v100 stalled restart" "No progress for ${idle_seconds}s; service restarted." 
"warning,repeat" + last_step_ts="$now_epoch" + elif [[ "$running" == "no" ]] && [[ "$step" -lt "$TARGET_STEPS" ]]; then + if [[ "$active_profile" == "fallback" ]]; then + start_service_with_config "$FALLBACK_CONFIG" "0" + else + start_service_with_config "$TRAIN_CONFIG" "0" + fi + action="restart_down" + message="service down; restarted" + send_notify "LLM v100 restarted" "Training service was down and restarted." "warning,repeat" + last_step_ts="$now_epoch" + fi + fi +fi + +if [[ "$resume_armed" -eq 0 ]] && [[ "$step" -ge "$RESUME_AFTER_STEP" ]] && [[ "$nan_hits" -eq 0 ]]; then + upsert_env "ENABLE_RESUME" "1" "$FORGE_ENV_FILE_ABS" + resume_armed=1 +fi + +bash "$PROJECT_ROOT/scripts/monitor_v100_training.sh" || true + +{ + echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + echo "running=$running" + echo "step=$step" + echo "target_steps=$TARGET_STEPS" + echo "run_segment_id=$RUN_ID" + echo "active_profile=$active_profile" + echo "nan_hits=$nan_hits" + echo "fallback_applied=$fallback_applied" + echo "fatal_stopped=$fatal_stopped" + echo "resume_armed=$resume_armed" + echo "posttrain_triggered=$posttrain_triggered" + echo "log_age_seconds=$log_age" + echo "action=$action" + echo "message=$message" +} > "$STATUS_FILE_ABS" + +{ + echo "fallback_applied=$fallback_applied" + echo "fatal_stopped=$fatal_stopped" + echo "last_step=$last_step" + echo "last_step_ts=$last_step_ts" + echo "nan_hits=$nan_hits" + echo "resume_armed=$resume_armed" + echo "active_profile=$active_profile" + echo "last_nan_signature=$last_nan_signature" + echo "posttrain_triggered=$posttrain_triggered" +} > "$STATE_FILE_ABS"