ogulcanaydogan · ogulcanaydogan · Mar 6, 2026 · Mar 6, 2026 · Mar 7, 2026
diff --git a/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml b/configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml
@@ -0,0 +1,25 @@
+# Turkcell-7B A100 recovery profile after NaN stop.
+_base: "./turkcell_7b.yaml"  # parent profile (not part of this diff)
+
+model:
+  max_seq_length: 2048
+
+data:
+  train_path: "data/processed/turkish_sft_v3_clean.jsonl"
+  eval_path: "data/processed/turkish_eval.jsonl"
+
+training:
+  num_epochs: 1
+  learning_rate: 2.0e-5  # the v6 and v7 profiles lower this to 1.0e-5
+  lr_scheduler_type: "cosine"
+  warmup_ratio: 0.05
+  max_grad_norm: 1.0  # the v6 and v7 profiles tighten this to 0.3
+  per_device_train_batch_size: 8
+  gradient_accumulation_steps: 2  # presumably 8 x 2 = 16 sequences per optimizer step per device
+  eval_steps: 500
+  save_steps: 500
+  fp16: false
+  bf16: true  # bf16 is the only mixed-precision mode enabled here
+
+wandb:
+  run_name: "turkcell-7b-sft-v5-a100-bf16-recovery-low-lr"
diff --git a/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml b/configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml
@@ -0,0 +1,14 @@
+# Turkcell-7B A100 recovery profile with optimizer reset.
+# Use adapter warm-start from checkpoint-500 without resuming optimizer state.
+_base: "./turkcell_7b_a100_v5_recovery_low_lr.yaml"
+
+training:
+  learning_rate: 1.0e-5  # v5 base: 2.0e-5
+  warmup_ratio: 0.08  # v5 base: 0.05
+  max_grad_norm: 0.3  # v5 base: 1.0
+  eval_steps: 200  # v5 base: 500
+  save_steps: 100  # v5 base: 500
+  adapter_init_path: "artifacts/training/turkcell-7b-sft-v3-a100-bf16-stable/checkpoint-500"
+
+wandb:
+  run_name: "turkcell-7b-sft-v6-a100-bf16-recovery-reset-opt"
diff --git a/configs/models/turkcell_7b_a100_v7_balanced_stable.yaml b/configs/models/turkcell_7b_a100_v7_balanced_stable.yaml
@@ -0,0 +1,12 @@
+# Turkcell-7B A100 balanced-stable profile.
+# Goal: reduce NaN risk without excessive eval/checkpoint overhead.
+_base: "./turkcell_7b_a100_v5_recovery_low_lr.yaml"
+
+training:
+  learning_rate: 1.0e-5  # v5 base: 2.0e-5
+  max_grad_norm: 0.3  # v5 base: 1.0
+  eval_steps: 500  # same value as the v5 base, restated explicitly
+  save_steps: 250  # v5 base: 500
+
+wandb:
+  run_name: "turkcell-7b-sft-v7-a100-bf16-balanced-stable"
diff --git a/configs/models/turkcell_7b_a100_v8_stable_reset.yaml b/configs/models/turkcell_7b_a100_v8_stable_reset.yaml
@@ -0,0 +1,17 @@
+# Turkcell-7B A100 stable-reset profile.
+# Resume from v7 checkpoint weights via adapter_init_path only.
+# Do not resume optimizer/scheduler state.
+_base: "./turkcell_7b_a100_v7_balanced_stable.yaml"
+
+training:
+  learning_rate: 5.0e-6  # v7 base: 1.0e-5
+  max_grad_norm: 0.3  # same value as the v7 base, restated explicitly
+  warmup_ratio: 0.10  # v7 does not set this; the v5 profile uses 0.05
+  eval_steps: 500  # same value as the v7 base
+  save_steps: 250  # same value as the v7 base
+  fp16: false  # same value as the v5 profile
+  bf16: true  # same value as the v5 profile
+  adapter_init_path: "artifacts/training/turkcell-7b-sft-v7-a100-bf16-balanced-stable/checkpoint-1000"
+
+wandb:
+  run_name: "turkcell-7b-sft-v8-a100-bf16-stable-reset"
diff --git a/configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml b/configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml
@@ -0,0 +1,10 @@
+# Turkcell-7B A100 ultra-stable fallback profile.
+# Use only when v8 fails the first 300-step stability gate.
+_base: "./turkcell_7b_a100_v8_stable_reset.yaml"
+
+training:
+  learning_rate: 3.0e-6  # v8 base: 5.0e-6
+  max_grad_norm: 0.2  # v8 base: 0.3
+
+wandb:
+  run_name: "turkcell-7b-sft-v8b-a100-bf16-ultra-stable-fallback"
diff --git a/configs/models/turkcell_7b_v100_v2_stable.yaml b/configs/models/turkcell_7b_v100_v2_stable.yaml
@@ -0,0 +1,37 @@
+# V100 stable recovery profile (clean restart)
+_base: "../base.yaml"
+
+model:
+  name: "TURKCELL/Turkcell-LLM-7b-v1"
+  max_seq_length: 2048
+  dtype: "float16"
+
+training:
+  num_epochs: 3
+  max_steps: 8601  # NOTE(review): set together with num_epochs - confirm which one the trainer honours
+  per_device_train_batch_size: 2
+  gradient_accumulation_steps: 8  # presumably 2 x 8 = 16 sequences per optimizer step per device
+  learning_rate: 5.0e-5  # the v2b fallback lowers this to 3.0e-5
+  warmup_ratio: 0.10
+  lr_scheduler_type: "cosine"
+  weight_decay: 0.01
+  max_grad_norm: 0.3  # the v2b fallback tightens this to 0.2
+  logging_steps: 10
+  eval_steps: 500
+  save_steps: 250  # two checkpoints per evaluation interval
+  save_total_limit: 20
+  fp16: true  # fp16 mixed precision; bf16 stays off in this profile
+  bf16: false
+
+lora:
+  r: 32
+  alpha: 64  # alpha is 2 x r
+
+data:
+  train_path: "data/processed/turkish_sft.jsonl"
+  eval_path: "data/processed/turkish_eval.jsonl"
+
+wandb:
+  project: "lowresource-llm-forge"
+  run_name: "turkcell-7b-sft-v100-v2-stable"
+  enabled: false
diff --git a/configs/models/turkcell_7b_v100_v2b_fallback.yaml b/configs/models/turkcell_7b_v100_v2b_fallback.yaml
@@ -0,0 +1,9 @@
+# V100 fallback profile (single retry after NaN)
+_base: "./turkcell_7b_v100_v2_stable.yaml"  # explicit "./", as in the A100 profiles
+
+training:
+  learning_rate: 3.0e-5  # v2 stable base: 5.0e-5
+  max_grad_norm: 0.2  # v2 stable base: 0.3
+
+wandb:
+  run_name: "turkcell-7b-sft-v100-v2b-fallback"
diff --git a/configs/models/turkcell_7b_v100_v3_ultrastable.yaml b/configs/models/turkcell_7b_v100_v3_ultrastable.yaml
@@ -0,0 +1,37 @@
+# V100 ultra-stable profile (clean restart)
+_base: "../base.yaml"
+
+model:
+  name: "TURKCELL/Turkcell-LLM-7b-v1"
+  max_seq_length: 2048
+  dtype: "float16"
+
+training:
+  num_epochs: 3
+  max_steps: 8601  # NOTE(review): set together with num_epochs - confirm which one the trainer honours
+  per_device_train_batch_size: 2
+  gradient_accumulation_steps: 8  # presumably 2 x 8 = 16 sequences per optimizer step per device
+  learning_rate: 2.0e-5  # V100 v2 stable profile: 5.0e-5
+  warmup_ratio: 0.12  # V100 v2 stable profile: 0.10
+  lr_scheduler_type: "cosine"
+  weight_decay: 0.01
+  max_grad_norm: 0.2  # V100 v2 stable profile: 0.3
+  logging_steps: 10
+  eval_steps: 250  # V100 v2 stable profile: 500
+  save_steps: 250
+  save_total_limit: 30  # V100 v2 stable profile: 20
+  fp16: true  # fp16 mixed precision; bf16 stays off in this profile
+  bf16: false
+
+lora:
+  r: 32
+  alpha: 64  # alpha is 2 x r
+
+data:
+  train_path: "data/processed/turkish_sft.jsonl"
+  eval_path: "data/processed/turkish_eval.jsonl"
+
+wandb:
+  project: "lowresource-llm-forge"
+  run_name: "turkcell-7b-sft-v100-v3-ultrastable"
+  enabled: false
diff --git a/configs/models/turkcell_7b_v100_v3b_fallback.yaml b/configs/models/turkcell_7b_v100_v3b_fallback.yaml
@@ -0,0 +1,9 @@
+# V100 fallback profile (single retry after NaN)
+_base: "./turkcell_7b_v100_v3_ultrastable.yaml"  # explicit "./", as in the A100 profiles
+
+training:
+  learning_rate: 1.0e-5  # v3 ultrastable base: 2.0e-5
+  max_grad_norm: 0.1  # v3 ultrastable base: 0.2
+
+wandb:
+  run_name: "turkcell-7b-sft-v100-v3b-fallback"
diff --git a/configs/serving/vllm_a100_v6_merged.yaml b/configs/serving/vllm_a100_v6_merged.yaml
@@ -0,0 +1,13 @@
+# vLLM serving configuration for A100 merged v6 model
+
+model_path: "artifacts/merged/turkcell-7b-a100-v6-recovery-reset-opt"
+host: "0.0.0.0"  # listens on all network interfaces
+port: 18020  # the v8 config uses 18030 and the V100 v3 config 18040
+tensor_parallel_size: 1
+gpu_memory_utilization: 0.85
+max_model_len: 4096  # NOTE(review): the A100 training profiles set max_seq_length 2048 - confirm
+dtype: "float16"  # NOTE(review): the A100 training profiles set bf16: true - confirm float16 is intended
+enable_prefix_caching: true
+trust_remote_code: false
+enforce_eager: false
+max_num_seqs: 64
diff --git a/configs/serving/vllm_a100_v8_merged.yaml b/configs/serving/vllm_a100_v8_merged.yaml
@@ -0,0 +1,13 @@
+# vLLM serving configuration for A100 merged v8 model
+
+model_path: "artifacts/merged/turkcell-7b-a100-v8-stable-reset"
+host: "0.0.0.0"  # listens on all network interfaces
+port: 18030  # the v6 config uses 18020 and the V100 v3 config 18040
+tensor_parallel_size: 1
+gpu_memory_utilization: 0.85
+max_model_len: 4096  # NOTE(review): the A100 training profiles set max_seq_length 2048 - confirm
+dtype: "float16"  # NOTE(review): the v8 training profile sets bf16: true - confirm float16 is intended
+enable_prefix_caching: true
+trust_remote_code: false
+enforce_eager: false
+max_num_seqs: 64
diff --git a/configs/serving/vllm_v100_v3_merged.yaml b/configs/serving/vllm_v100_v3_merged.yaml
@@ -0,0 +1,13 @@
+# vLLM serving configuration for V100 merged v3 model
+
+model_path: "artifacts/merged/turkcell-7b-v100-v3-ultrastable"
+host: "0.0.0.0"  # listens on all network interfaces
+port: 18040  # the A100 v6 config uses 18020 and the A100 v8 config 18030
+tensor_parallel_size: 1
+gpu_memory_utilization: 0.75  # the A100 serving configs use 0.85
+max_model_len: 4096  # NOTE(review): the V100 v3 training profile sets max_seq_length 2048 - confirm
+dtype: "float16"  # matches fp16: true in the V100 v3 training profile
+enable_prefix_caching: true
+trust_remote_code: false
+enforce_eager: false
+max_num_seqs: 48  # the A100 serving configs use 64
diff --git a/deploy/systemd/forge-posttrain.path b/deploy/systemd/forge-posttrain.path
@@ -0,0 +1,14 @@
+# Path unit: activates forge-posttrain.service when the A100 training
+# monitor status file is modified.
+[Unit]
+Description=Watch training status changes and trigger post-training pipeline
+Wants=forge-training-monitor.service
+After=forge-training-monitor.service
+
+[Path]
+# Unit to activate, followed by the file being watched.
+Unit=forge-posttrain.service
+PathModified=%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_status_a100.txt
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-posttrain.service b/deploy/systemd/forge-posttrain.service
@@ -0,0 +1,21 @@
+# One-shot job that runs scripts/run_posttrain_if_complete.sh; the
+# forge-posttrain.path unit names this service as its Unit=.
+[Unit]
+Description=LowResource-LLM-Forge Post-Training Pipeline Trigger
+Wants=forge-training-monitor.service
+After=forge-training-monitor.service
+After=forge-training.service
+
+[Service]
+Type=oneshot
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+# The leading "-" makes the environment file optional.
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/run_posttrain_if_complete.sh
+# stdout and stderr are appended to the same log file.
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/posttrain_v8.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/posttrain_v8.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-training-monitor.service b/deploy/systemd/forge-training-monitor.service
@@ -0,0 +1,23 @@
+# Long-running progress monitor for the A100 training service.
+[Unit]
+Description=LowResource-LLM-Forge Training Progress Monitor
+# PartOf= propagates stop/restart of forge-training.service to this unit.
+PartOf=forge-training.service
+Wants=forge-training.service
+After=forge-training.service
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+# The leading "-" makes the environment file optional.
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/monitor_a100_training.sh
+# Restart on failure, waiting 20 s between attempts.
+Restart=on-failure
+RestartSec=20
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-training-watchdog.service b/deploy/systemd/forge-training-watchdog.service
@@ -0,0 +1,21 @@
+# Watchdog process for forge-training.service (passed via --service).
+[Unit]
+Description=LowResource-LLM-Forge Training Watchdog
+Wants=forge-training.service
+After=forge-training.service
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+# The leading "-" makes the environment file optional.
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/training_watchdog.py --service forge-training.service
+# Restart=always: restarted 10 s after any exit, clean or not.
+Restart=always
+RestartSec=10
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-training.service b/deploy/systemd/forge-training.service
@@ -0,0 +1,21 @@
+# Main A100 training job; launches scripts/start_a100_training.sh.
+[Unit]
+Description=LowResource-LLM-Forge A100 Training
+Wants=network-online.target
+After=network-online.target
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+# The leading "-" makes the environment file optional.
+EnvironmentFile=-%h/.config/forge/training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/start_a100_training.sh
+# Restart on failure, waiting 20 s between attempts.
+Restart=on-failure
+RestartSec=20
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-v100-training.service b/deploy/systemd/forge-v100-training.service
@@ -0,0 +1,27 @@
+# V100 training job; launches scripts/start_v100_training.sh.
+[Unit]
+Description=LowResource-LLM-Forge V100 Training
+After=network-online.target
+Wants=network-online.target
+# Start rate limiting belongs in [Unit]: StartLimitIntervalSec= is not a
+# [Service] key and is ignored there, which would leave the default
+# interval in effect. Allow at most 3 starts per 600 s.
+StartLimitIntervalSec=600
+StartLimitBurst=3
+
+[Service]
+Type=simple
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Environment=PYTHONUNBUFFERED=1
+# The leading "-" makes the environment file optional.
+EnvironmentFile=-%h/.config/forge/v100_training.env
+ExecStart=%h/projects/LowResource-LLM-Forge/scripts/start_v100_training.sh
+# Restart on failure, waiting 20 s between attempts.
+Restart=on-failure
+RestartSec=20
+KillMode=control-group
+StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/systemd_v100_training.log
+StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/systemd_v100_training.log
+
+[Install]
+WantedBy=default.target
diff --git a/deploy/systemd/forge-v100-watchdog.service b/deploy/systemd/forge-v100-watchdog.service
@@ -0,0 +1,11 @@
+# One-shot watchdog check. There is no [Install] section, so it runs only
+# when another unit activates it (forge-v100-watchdog.timer names it).
+[Unit]
+Description=LowResource-LLM-Forge V100 Training Watchdog
+After=network-online.target
+
+[Service]
+# Each activation runs scripts/watchdog_training.sh once under bash.
+WorkingDirectory=%h/projects/LowResource-LLM-Forge
+Type=oneshot
+ExecStart=/usr/bin/env bash %h/projects/LowResource-LLM-Forge/scripts/watchdog_training.sh
diff --git a/deploy/systemd/forge-v100-watchdog.timer b/deploy/systemd/forge-v100-watchdog.timer
@@ -0,0 +1,15 @@
+# Periodic trigger for forge-v100-watchdog.service.
+[Unit]
+Description=Run V100 training watchdog every 30 seconds
+
+[Timer]
+Unit=forge-v100-watchdog.service
+# First run 1 min after boot, then 30 s after each activation of the service.
+OnBootSec=1min
+OnUnitActiveSec=30s
+AccuracySec=5s
+# Persistent= is intentionally omitted: it only affects OnCalendar= timers,
+# and this timer uses monotonic triggers only.
+
+[Install]
+WantedBy=timers.target