ogulcanaydogan · ogulcanaydogan · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: help dev test lint typecheck qa train eval serve smoke-serve download-data transcribe publish benchmark ops-dashboard
+.PHONY: help dev test lint typecheck qa train eval serve smoke-serve download-data transcribe publish benchmark manifest post-train set-wandb ops-dashboard
 
 .DEFAULT_GOAL := help
 
@@ -82,5 +82,21 @@ benchmark: ## Benchmark endpoint (BASE_URL=<url> [API_KEY=<key>])
 	if [ -n "$(CONCURRENCY)" ]; then cmd="$$cmd --concurrency $(CONCURRENCY)"; fi; \
 	eval "$$cmd"
 
+manifest: ## Generate training manifest (TRAIN_CONFIG/RUN_DIR/LOG_FILE)
+	@if [ -z "$(TRAIN_CONFIG)" ] || [ -z "$(RUN_DIR)" ] || [ -z "$(LOG_FILE)" ]; then \
+		echo "Usage: make manifest TRAIN_CONFIG=<yaml> RUN_DIR=<run_dir> LOG_FILE=<train_log>"; \
+		exit 1; \
+	fi
+	uv run python scripts/generate_training_manifest.py \
+		--config "$(TRAIN_CONFIG)" \
+		--run-dir "$(RUN_DIR)" \
+		--log-file "$(LOG_FILE)"
+
+post-train: ## Run post-training pipeline (eval -> merge -> optional smoke)
+	bash scripts/post_training_pipeline.sh
+
+set-wandb: ## Set WANDB_API_KEY for training services
+	bash scripts/set_wandb_key.sh
+
 ops-dashboard: ## Launch runtime ops dashboard
 	bash scripts/runtime_dashboard.sh
diff --git a/README.md b/README.md
@@ -315,6 +315,22 @@ All commands support `--help` for full option documentation. Run `make help` to
 
 ---
 
+## Post-Completion Roadmap
+
+Once the current priority training run is complete, follow-up improvement work is tracked in:
+
+- `docs/ROADMAP.md`
+
+Roadmap phases:
+
+1. Stability hardening (NaN guards, fail-fast, auto-resume)
+2. Turkish data expansion and quality filtering
+3. A100 training recipe optimization
+4. Serving throughput and latency optimization
+5. Evaluation depth and release governance
+
+---
+
 ## Notebooks
 
 Interactive Jupyter notebooks for exploration and analysis:

diff --git a/configs/models/turkcell_7b_a100_resume_bf16_clean.yaml b/configs/models/turkcell_7b_a100_resume_bf16_clean.yaml
@@ -0,0 +1,15 @@
+# A100 80GB optimized bf16 resume profile from checkpoint-800.
+# Uses a dedicated run name and save/eval interval for long remote runs.
+
+_base: "./turkcell_7b.yaml"  # parent profile; presumably the keys below override it - confirm loader merge semantics
+
+training:  # NOTE(review): no checkpoint path is set in this file; the checkpoint-800 resume point is presumably supplied by the launcher - confirm
+  per_device_train_batch_size: 8
+  gradient_accumulation_steps: 2  # with batch size 8 this gives 16 samples per device per optimizer step (assumes standard accumulation semantics)
+  eval_steps: 1000  # evaluation and checkpointing use the same 1000-step interval
+  save_steps: 1000
+  fp16: false  # precision flags are set as a pair: fp16 off, bf16 on
+  bf16: true
+
+wandb:
+  run_name: "turkcell-7b-sft-v1-a100-bf16-r2"  # dedicated run name for this resume profile
diff --git a/configs/models/turkcell_7b_a100_v3_clean.yaml b/configs/models/turkcell_7b_a100_v3_clean.yaml
@@ -0,0 +1,25 @@
+# Turkcell-7B A100 stable profile (post-NaN recovery).
+_base: "./turkcell_7b.yaml"  # parent profile; presumably the keys below override it - confirm loader merge semantics
+
+model:
+  max_seq_length: 2048
+
+data:
+  train_path: "data/processed/turkish_sft_v3_clean.jsonl"
+  eval_path: "data/processed/turkish_eval.jsonl"
+
+training:
+  num_epochs: 1
+  learning_rate: 5.0e-5  # the v4/v5 recovery profiles keep this file's settings but lower this value
+  lr_scheduler_type: "cosine"
+  warmup_ratio: 0.05
+  max_grad_norm: 1.0  # presumably the gradient-norm clipping threshold - confirm against the trainer
+  per_device_train_batch_size: 8
+  gradient_accumulation_steps: 2  # with batch size 8 this gives 16 samples per device per optimizer step (assumes standard accumulation semantics)
+  eval_steps: 500  # evaluation and checkpointing use the same 500-step interval
+  save_steps: 500
+  fp16: false  # precision flags are set as a pair: fp16 off, bf16 on
+  bf16: true
+
+wandb:
+  run_name: "turkcell-7b-sft-v3-a100-bf16-stable"  # also the directory name used by the v6 profile's adapter_init_path
diff --git a/configs/models/turkcell_7b_a100_v4_recovery.yaml b/configs/models/turkcell_7b_a100_v4_recovery.yaml
@@ -0,0 +1,25 @@
+# Turkcell-7B A100 recovery profile after NaN stop at step 800.
+_base: "./turkcell_7b.yaml"  # parent profile; presumably the keys below override it - confirm loader merge semantics
+
+model:
+  max_seq_length: 2048
+
+data:
+  train_path: "data/processed/turkish_sft_v3_clean.jsonl"
+  eval_path: "data/processed/turkish_eval.jsonl"
+
+training:
+  num_epochs: 1
+  learning_rate: 3.0e-5  # lowered from 5.0e-5 in turkcell_7b_a100_v3_clean.yaml; besides run_name this is the only value that differs
+  lr_scheduler_type: "cosine"
+  warmup_ratio: 0.05
+  max_grad_norm: 1.0  # presumably the gradient-norm clipping threshold - confirm against the trainer
+  per_device_train_batch_size: 8
+  gradient_accumulation_steps: 2  # with batch size 8 this gives 16 samples per device per optimizer step (assumes standard accumulation semantics)
+  eval_steps: 500  # evaluation and checkpointing use the same 500-step interval
+  save_steps: 500
+  fp16: false  # precision flags are set as a pair: fp16 off, bf16 on
+  bf16: true
+
+wandb:
+  run_name: "turkcell-7b-sft-v4-a100-bf16-recovery"
diff --git a/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml b/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml
@@ -0,0 +1,25 @@
+# Turkcell-7B A100 recovery profile after NaN stop, with learning_rate lowered to 2.0e-5 (v4 used 3.0e-5).
+_base: "./turkcell_7b.yaml"  # parent profile; presumably the keys below override it - confirm loader merge semantics
+
+model:
+  max_seq_length: 2048
+
+data:
+  train_path: "data/processed/turkish_sft_v3_clean.jsonl"
+  eval_path: "data/processed/turkish_eval.jsonl"
+
+training:
+  num_epochs: 1
+  learning_rate: 2.0e-5  # besides run_name, the only value that differs from turkcell_7b_a100_v4_recovery.yaml
+  lr_scheduler_type: "cosine"
+  warmup_ratio: 0.05
+  max_grad_norm: 1.0  # presumably the gradient-norm clipping threshold - confirm against the trainer
+  per_device_train_batch_size: 8
+  gradient_accumulation_steps: 2  # with batch size 8 this gives 16 samples per device per optimizer step (assumes standard accumulation semantics)
+  eval_steps: 500  # evaluation and checkpointing use the same 500-step interval
+  save_steps: 500
+  fp16: false  # precision flags are set as a pair: fp16 off, bf16 on
+  bf16: true
+
+wandb:
+  run_name: "turkcell-7b-sft-v5-a100-bf16-recovery-low-lr"  # turkcell_7b_a100_v6_recovery_reset_opt.yaml uses this file as its _base
diff --git a/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml b/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml
@@ -0,0 +1,14 @@
+# Turkcell-7B A100 recovery profile with optimizer reset.
+# Warm-starts the adapter from checkpoint-500 of the v3 stable run without resuming optimizer state.
+_base: "./turkcell_7b_a100_v5_recovery_low_lr.yaml"  # builds on the v5 profile; presumably only the keys below are overridden - confirm loader merge semantics
+
+training:
+  learning_rate: 3.0e-5  # v5 base sets 2.0e-5
+  warmup_ratio: 0.08  # v5 base sets 0.05
+  max_grad_norm: 0.5  # v5 base sets 1.0
+  eval_steps: 250  # v5 base evaluates and saves every 500 steps
+  save_steps: 250
+  adapter_init_path: "artifacts/training/turkcell-7b-sft-v3-a100-bf16-stable/checkpoint-500"  # directory name matches the v3 profile's wandb run_name
+
+wandb:
+  run_name: "turkcell-7b-sft-v6-a100-bf16-recovery-reset-opt"
diff --git a/deploy/systemd/forge-training-monitor.service b/deploy/systemd/forge-training-monitor.service
@@ -0,0 +1,19 @@
+[Unit]
+Description=LowResource-LLM-Forge Training Progress Monitor
+After=forge-training.service
+Wants=forge-training.service
+PartOf=forge-training.service
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/monitor_a100_training.sh
+Restart=on-failure
+RestartSec=20
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-training-watchdog.service b/deploy/systemd/forge-training-watchdog.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=LowResource-LLM-Forge Training Watchdog
+After=forge-training.service
+Wants=forge-training.service
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/training_watchdog.py --service forge-training.service --nan-consecutive-limit 3
+Restart=always
+RestartSec=10
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-training.service b/deploy/systemd/forge-training.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=LowResource-LLM-Forge A100 Training
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/start_a100_training.sh
+Restart=on-failure
+RestartSec=20
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=default.target
diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md
@@ -0,0 +1,98 @@
+# Project Roadmap
+
+This roadmap starts after the current priority training run on A100 is completed and evaluated.
+
+## Current Run Definition of Done
+
+Before moving to improvement work:
+
+1. Complete the active training run (`target_steps=25845`) or end by a valid early-stop condition.
+2. Merge adapter into base model and produce a merged checkpoint.
+3. Run full evaluation (`perplexity`, `generation`, optional `mmlu_tr`) and save report artifacts.
+4. Publish a versioned release candidate with reproducible config references.
+
+## Post-Completion Improvement Plan
+
+### Phase 1: Stability Hardening (Priority P0)
+
+Goal: prevent silent training failure and auto-recover quickly.
+
+- Add NaN/Inf guard callbacks for `loss`, `grad_norm`, and `eval_loss`.
+- Fail fast on unstable metrics and auto-resume from last healthy checkpoint.
+- Keep `systemd --user` + watchdog as the default runtime path on remote hosts.
+- Persist heartbeat and key metrics to machine-readable status files for monitoring.
+
+Exit criteria:
+
+- No silent NaN progression in new runs.
+- Automatic recovery from interruption in under 10 minutes.
+- Stable checkpoints produced on schedule.
+
+### Phase 2: Turkish Data Expansion and Quality (Priority P0)
+
+Goal: improve model quality using larger, cleaner, better-balanced Turkish corpora.
+
+- Expand corpus with open Turkish sources (for example mC4, OSCAR, Wiki-derived text, curated Turkish instruction datasets).
+- Improve deduplication and language filtering thresholds.
+- Add quality scoring filters (length, script ratio, repetition, malformed text checks).
+- Build a versioned dataset mixture and track it in a changelog.
+
+Suggested starting mixture:
+
+- 60% high-quality instruction data
+- 25% domain text relevant to target use-cases
+- 15% synthetic/translated augmentation with strict filtering
+
+Exit criteria:
+
+- At least 2x unique Turkish token coverage vs current baseline.
+- Low-quality sample ratio below 5% after filtering.
+
+### Phase 3: Training Recipe Optimization on A100 (Priority P0)
+
+Goal: increase quality while preserving training stability.
+
+- Run controlled sweeps for learning rate, warmup ratio, LoRA rank/alpha, and effective batch size.
+- Keep bf16 enabled on A100 and tune gradient accumulation for throughput.
+- Tune evaluation cadence (`eval_steps=1000`) and checkpoint cadence (`save_steps=1000`).
+- Promote only runs with finite metrics and consistent convergence.
+
+Exit criteria:
+
+- Perplexity improves by at least 10% from baseline.
+- Generation quality score improves by at least 0.4.
+- No regression in safety/format adherence prompts.
+
+### Phase 4: Inference Throughput and Latency (Priority P1)
+
+Goal: deliver a responsive serving experience (low time-to-first-token and fluent, steady decoding).
+
+- Tune vLLM serving args (`max_num_batched_tokens`, `max_num_seqs`, `gpu_memory_utilization`, tensor parallelism).
+- Benchmark p50/p95 latency and tokens/sec under concurrent load.
+- Add configuration profiles for low-latency and high-throughput modes.
+- Evaluate TensorRT-LLM/NIM path only after vLLM baseline is saturated.
+
+Exit criteria:
+
+- At least 30% tokens/sec gain at target concurrency.
+- p95 time-to-first-token under the defined SLO.
+
+### Phase 5: Evaluation Depth and Release Governance (Priority P1)
+
+Goal: make releases trustworthy and repeatable.
+
+- Expand held-out Turkish eval set by domain.
+- Add lightweight human review rubrics for fluency, factuality, and instruction-following.
+- Track every release with dataset version, config hash, and benchmark deltas.
+- Gate promotion on quality thresholds and regression checks.
+
+Exit criteria:
+
+- Every release has reproducible lineage.
+- Promotion decisions are benchmark-backed and auditable.
+
+## Immediate Next Actions After Current Run
+
+1. Generate baseline report from the active A100 run.
+2. Launch Phase 1 stability patch set before the next long training job.
+3. Build `turkish-v2` dataset mixture and run a short smoke training cycle.
diff --git a/docs/TRAINING_GUIDE.md b/docs/TRAINING_GUIDE.md
@@ -133,3 +133,9 @@ For interactive training analysis:
 **Unsloth not found**: Install training extras with `uv sync --extra train`. Pipeline falls back to standard PEFT automatically when Unsloth is unavailable.
 
 **Poor Turkish output**: Check tokenizer coverage — models not trained on Turkish may tokenize inefficiently, reducing effective context length.
+
+## After Current Run Completes
+
+The post-completion improvement backlog (stability, data expansion, A100 recipe tuning, serving performance, and evaluation/release governance) is tracked in:
+
+- `docs/ROADMAP.md`