Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions configs/models/turkcell_7b_a100_v5_recovery_low_lr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Turkcell-7B A100 recovery profile after NaN stop.
# Inherits from the file named in `_base`; the sections below carry the
# values specific to this recovery run.
_base: "./turkcell_7b.yaml"

model:
  max_seq_length: 2048

data:
  train_path: "data/processed/turkish_sft_v3_clean.jsonl"
  eval_path: "data/processed/turkish_eval.jsonl"

training:
  num_epochs: 1

  # Precision: bf16 on, fp16 explicitly off.
  bf16: true
  fp16: false

  # Optimisation schedule for the low-LR recovery.
  learning_rate: 2.0e-5
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.05
  max_grad_norm: 1.0

  # Batch shape: 8 per device x 2 accumulation steps
  # (presumably 16 sequences per device per optimizer step -- confirm
  # against the trainer).
  per_device_train_batch_size: 8
  gradient_accumulation_steps: 2

  # Evaluation and checkpointing share one 500-step cadence.
  eval_steps: 500
  save_steps: 500

wandb:
  run_name: "turkcell-7b-sft-v5-a100-bf16-recovery-low-lr"
14 changes: 14 additions & 0 deletions configs/models/turkcell_7b_a100_v6_recovery_reset_opt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Turkcell-7B A100 recovery profile with optimizer reset.
# Use adapter warm-start from checkpoint-500 without resuming optimizer state.
_base: "./turkcell_7b_a100_v5_recovery_low_lr.yaml"

training:
  # Warm-start source (adapter weights from the v3 run's checkpoint-500).
  adapter_init_path: "artifacts/training/turkcell-7b-sft-v3-a100-bf16-stable/checkpoint-500"

  # Tighter optimisation settings than the v5 base
  # (v5 uses 2.0e-5 / 0.05 / 1.0 for these three keys).
  learning_rate: 1.0e-5
  warmup_ratio: 0.08
  max_grad_norm: 0.3

  # Evaluate and checkpoint more often than the v5 base (500 / 500).
  eval_steps: 200
  save_steps: 100

wandb:
  run_name: "turkcell-7b-sft-v6-a100-bf16-recovery-reset-opt"
12 changes: 12 additions & 0 deletions configs/models/turkcell_7b_a100_v7_balanced_stable.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Turkcell-7B A100 balanced-stable profile.
# Goal: reduce NaN risk without excessive eval/checkpoint overhead.
_base: "./turkcell_7b_a100_v5_recovery_low_lr.yaml"

training:
  # Halved learning rate and tighter gradient clipping than the v5 base
  # (v5: 2.0e-5 and 1.0).
  learning_rate: 1.0e-5
  max_grad_norm: 0.3

  # Same eval cadence as the v5 base; checkpoints twice as often
  # (v5 saves every 500 steps).
  eval_steps: 500
  save_steps: 250

wandb:
  run_name: "turkcell-7b-sft-v7-a100-bf16-balanced-stable"
17 changes: 17 additions & 0 deletions configs/models/turkcell_7b_a100_v8_stable_reset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Turkcell-7B A100 stable-reset profile.
# Resume from v7 checkpoint weights via adapter_init_path only.
# Do not resume optimizer/scheduler state.
_base: "./turkcell_7b_a100_v7_balanced_stable.yaml"

training:
  # Weights to warm-start from: checkpoint-1000 of the v7 run.
  adapter_init_path: "artifacts/training/turkcell-7b-sft-v7-a100-bf16-balanced-stable/checkpoint-1000"

  # New values for this profile.
  learning_rate: 5.0e-6
  warmup_ratio: 0.10

  # The keys below restate values already set by the v7 / v5 base files
  # (0.3, 500, 250, fp16 off, bf16 on); they are pinned here explicitly.
  max_grad_norm: 0.3
  eval_steps: 500
  save_steps: 250
  fp16: false
  bf16: true

wandb:
  run_name: "turkcell-7b-sft-v8-a100-bf16-stable-reset"
10 changes: 10 additions & 0 deletions configs/models/turkcell_7b_a100_v8b_ultra_stable_fallback.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Turkcell-7B A100 ultra-stable fallback profile.
# Use only when v8 fails the first 300-step stability gate.
_base: "./turkcell_7b_a100_v8_stable_reset.yaml"

training:
  # Only two knobs differ from v8: a lower learning rate (v8: 5.0e-6)
  # and tighter gradient clipping (v8: 0.3).
  learning_rate: 3.0e-6
  max_grad_norm: 0.2

wandb:
  run_name: "turkcell-7b-sft-v8b-a100-bf16-ultra-stable-fallback"
37 changes: 37 additions & 0 deletions configs/models/turkcell_7b_v100_v2_stable.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# V100 stable recovery profile (clean restart)
# Builds directly on the shared base config one directory up.
_base: "../base.yaml"

model:
  name: "TURKCELL/Turkcell-LLM-7b-v1"
  max_seq_length: 2048
  dtype: "float16"

training:
  # Run length. NOTE(review): both an epoch count and a step cap are set;
  # which one ends the run depends on the trainer -- confirm.
  num_epochs: 3
  max_steps: 8601

  # Precision: fp16 on, bf16 off (matches model.dtype above).
  fp16: true
  bf16: false

  # Batch shape: 2 per device x 8 accumulation steps.
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 8

  # Optimisation schedule.
  learning_rate: 5.0e-5
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.10
  weight_decay: 0.01
  max_grad_norm: 0.3

  # Logging, evaluation and checkpoint cadence.
  logging_steps: 10
  eval_steps: 500
  save_steps: 250
  save_total_limit: 20

lora:
  r: 32
  alpha: 64

data:
  train_path: "data/processed/turkish_sft.jsonl"
  eval_path: "data/processed/turkish_eval.jsonl"

wandb:
  project: "lowresource-llm-forge"
  run_name: "turkcell-7b-sft-v100-v2-stable"
  enabled: false
9 changes: 9 additions & 0 deletions configs/models/turkcell_7b_v100_v2b_fallback.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# V100 fallback profile (single retry after NaN)
# Inherits everything from the v2 stable profile in this directory; the
# explicit "./" prefix matches how the A100 profiles reference their
# same-directory bases.
_base: "./turkcell_7b_v100_v2_stable.yaml"

training:
  # Lower learning rate and tighter gradient clipping than v2 stable
  # (v2 stable: 5.0e-5 and 0.3).
  learning_rate: 3.0e-5
  max_grad_norm: 0.2

wandb:
  run_name: "turkcell-7b-sft-v100-v2b-fallback"
37 changes: 37 additions & 0 deletions configs/models/turkcell_7b_v100_v3_ultrastable.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# V100 ultra-stable profile (clean restart)
# Builds directly on the shared base config one directory up.
_base: "../base.yaml"

model:
  name: "TURKCELL/Turkcell-LLM-7b-v1"
  max_seq_length: 2048
  dtype: "float16"

training:
  # Run length. NOTE(review): both an epoch count and a step cap are set;
  # which one ends the run depends on the trainer -- confirm.
  num_epochs: 3
  max_steps: 8601

  # Precision: fp16 on, bf16 off (matches model.dtype above).
  fp16: true
  bf16: false

  # Batch shape: 2 per device x 8 accumulation steps.
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 8

  # Conservative optimisation schedule.
  learning_rate: 2.0e-5
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.12
  weight_decay: 0.01
  max_grad_norm: 0.2

  # Evaluation and checkpointing share one 250-step cadence.
  logging_steps: 10
  eval_steps: 250
  save_steps: 250
  save_total_limit: 30

lora:
  r: 32
  alpha: 64

data:
  train_path: "data/processed/turkish_sft.jsonl"
  eval_path: "data/processed/turkish_eval.jsonl"

wandb:
  project: "lowresource-llm-forge"
  run_name: "turkcell-7b-sft-v100-v3-ultrastable"
  enabled: false
9 changes: 9 additions & 0 deletions configs/models/turkcell_7b_v100_v3b_fallback.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# V100 fallback profile (single retry after NaN)
# Inherits everything from the v3 ultra-stable profile in this directory;
# the explicit "./" prefix matches how the A100 profiles reference their
# same-directory bases.
_base: "./turkcell_7b_v100_v3_ultrastable.yaml"

training:
  # Lower learning rate and tighter gradient clipping than v3 ultra-stable
  # (v3: 2.0e-5 and 0.2).
  learning_rate: 1.0e-5
  max_grad_norm: 0.1

wandb:
  run_name: "turkcell-7b-sft-v100-v3b-fallback"
13 changes: 13 additions & 0 deletions configs/serving/vllm_a100_v6_merged.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# vLLM serving configuration for A100 merged v6 model

# Model artifact to serve.
model_path: "artifacts/merged/turkcell-7b-a100-v6-recovery-reset-opt"
trust_remote_code: false
# NOTE(review): the A100 training profiles run with bf16 enabled while this
# serves in float16 -- confirm the merged weights are meant to be served in
# this dtype.
dtype: "float16"

# Network endpoint.
# NOTE(review): "0.0.0.0" presumably binds every interface -- confirm the
# port is not reachable from untrusted networks.
host: "0.0.0.0"
port: 18020

# Engine capacity and memory settings.
tensor_parallel_size: 1
gpu_memory_utilization: 0.85
max_model_len: 4096
max_num_seqs: 64
enable_prefix_caching: true
enforce_eager: false
13 changes: 13 additions & 0 deletions configs/serving/vllm_a100_v8_merged.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# vLLM serving configuration for A100 merged v8 model

# Model artifact to serve.
model_path: "artifacts/merged/turkcell-7b-a100-v8-stable-reset"
trust_remote_code: false
# NOTE(review): the A100 training profiles run with bf16 enabled while this
# serves in float16 -- confirm the merged weights are meant to be served in
# this dtype.
dtype: "float16"

# Network endpoint.
# NOTE(review): "0.0.0.0" presumably binds every interface -- confirm the
# port is not reachable from untrusted networks.
host: "0.0.0.0"
port: 18030

# Engine capacity and memory settings.
tensor_parallel_size: 1
gpu_memory_utilization: 0.85
max_model_len: 4096
max_num_seqs: 64
enable_prefix_caching: true
enforce_eager: false
13 changes: 13 additions & 0 deletions configs/serving/vllm_v100_v3_merged.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# vLLM serving configuration for V100 merged v3 model

# Model artifact to serve (float16, as in the V100 training profiles).
model_path: "artifacts/merged/turkcell-7b-v100-v3-ultrastable"
trust_remote_code: false
dtype: "float16"

# Network endpoint.
# NOTE(review): "0.0.0.0" presumably binds every interface -- confirm the
# port is not reachable from untrusted networks.
host: "0.0.0.0"
port: 18040

# Engine capacity and memory settings.
tensor_parallel_size: 1
gpu_memory_utilization: 0.75
max_model_len: 4096
max_num_seqs: 48
enable_prefix_caching: true
enforce_eager: false
11 changes: 11 additions & 0 deletions deploy/systemd/forge-posttrain.path
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[Unit]
Description=Watch training status changes and trigger post-training pipeline
# Pull in the training monitor and order this path unit after it.
Wants=forge-training-monitor.service
After=forge-training-monitor.service

[Path]
# Activate the post-training service whenever the A100 status file is
# modified. %h expands to the home directory of the user running the unit.
Unit=forge-posttrain.service
PathModified=%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_status_a100.txt

[Install]
WantedBy=default.target
16 changes: 16 additions & 0 deletions deploy/systemd/forge-posttrain.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[Unit]
Description=LowResource-LLM-Forge Post-Training Pipeline Trigger
# Pull in the monitor; order after both the monitor and the training service.
Wants=forge-training-monitor.service
After=forge-training-monitor.service forge-training.service

[Service]
# One-shot job: runs the trigger script to completion on each activation.
Type=oneshot
WorkingDirectory=%h/projects/LowResource-LLM-Forge
ExecStart=%h/projects/LowResource-LLM-Forge/scripts/run_posttrain_if_complete.sh
# Environment: unbuffered Python output, plus optional overrides from the
# env file (the leading "-" makes a missing file non-fatal).
Environment=PYTHONUNBUFFERED=1
EnvironmentFile=-%h/.config/forge/training.env
# stdout and stderr are appended to the same log file.
StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/posttrain_v8.log
StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/posttrain_v8.log

[Install]
WantedBy=default.target
19 changes: 19 additions & 0 deletions deploy/systemd/forge-training-monitor.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[Unit]
Description=LowResource-LLM-Forge Training Progress Monitor
# Pull in the training service and start after it; PartOf= additionally
# propagates stop/restart of the training service to this monitor.
Wants=forge-training.service
After=forge-training.service
PartOf=forge-training.service

[Service]
Type=simple
WorkingDirectory=%h/projects/LowResource-LLM-Forge
ExecStart=%h/projects/LowResource-LLM-Forge/scripts/monitor_a100_training.sh
# Environment: unbuffered Python output, plus optional overrides from the
# env file (the leading "-" makes a missing file non-fatal).
Environment=PYTHONUNBUFFERED=1
EnvironmentFile=-%h/.config/forge/training.env
# Restart only after a failure, waiting 20 seconds between attempts.
Restart=on-failure
RestartSec=20
# stdout and stderr are appended to the same log file.
StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log
StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_monitor_a100.log

[Install]
WantedBy=default.target
18 changes: 18 additions & 0 deletions deploy/systemd/forge-training-watchdog.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[Unit]
Description=LowResource-LLM-Forge Training Watchdog
# Pull in the training service and start after it.
Wants=forge-training.service
After=forge-training.service

[Service]
Type=simple
WorkingDirectory=%h/projects/LowResource-LLM-Forge
# The watchdog script is told which unit to watch via --service.
ExecStart=%h/projects/LowResource-LLM-Forge/scripts/training_watchdog.py --service forge-training.service
# Environment: unbuffered Python output, plus optional overrides from the
# env file (the leading "-" makes a missing file non-fatal).
Environment=PYTHONUNBUFFERED=1
EnvironmentFile=-%h/.config/forge/training.env
# Always restart the watchdog, 10 seconds after it exits.
Restart=always
RestartSec=10
# stdout and stderr are appended to the same log file.
StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log
StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/training_watchdog.log

[Install]
WantedBy=default.target
18 changes: 18 additions & 0 deletions deploy/systemd/forge-training.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[Unit]
Description=LowResource-LLM-Forge A100 Training
# Request network-online.target and start after it.
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
WorkingDirectory=%h/projects/LowResource-LLM-Forge
ExecStart=%h/projects/LowResource-LLM-Forge/scripts/start_a100_training.sh
# Environment: unbuffered Python output, plus optional overrides from the
# env file (the leading "-" makes a missing file non-fatal).
Environment=PYTHONUNBUFFERED=1
EnvironmentFile=-%h/.config/forge/training.env
# Restart only after a failure, waiting 20 seconds between attempts.
Restart=on-failure
RestartSec=20
# Output goes to the journal rather than a log file.
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=default.target
21 changes: 21 additions & 0 deletions deploy/systemd/forge-v100-training.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[Unit]
Description=LowResource-LLM-Forge V100 Training
After=network-online.target
Wants=network-online.target
# Start rate limiting belongs in [Unit]: StartLimitIntervalSec= is not a
# [Service] setting and is ignored there. Allow at most 3 starts within
# 10 minutes; after that the unit is left failed instead of restarting
# indefinitely.
StartLimitIntervalSec=600
StartLimitBurst=3

[Service]
Type=simple
WorkingDirectory=%h/projects/LowResource-LLM-Forge
# Environment: unbuffered Python output, plus optional overrides from the
# V100 env file (the leading "-" makes a missing file non-fatal).
Environment=PYTHONUNBUFFERED=1
EnvironmentFile=-%h/.config/forge/v100_training.env
ExecStart=%h/projects/LowResource-LLM-Forge/scripts/start_v100_training.sh
# Restart only after a failure, waiting 20 seconds between attempts
# (subject to the start limit configured in [Unit]).
Restart=on-failure
RestartSec=20
# On stop, kill every process remaining in the unit's control group.
KillMode=control-group
# stdout and stderr are appended to the same log file.
StandardOutput=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/systemd_v100_training.log
StandardError=append:%h/projects/LowResource-LLM-Forge/artifacts/logs/systemd_v100_training.log

[Install]
WantedBy=default.target
8 changes: 8 additions & 0 deletions deploy/systemd/forge-v100-watchdog.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[Unit]
Description=LowResource-LLM-Forge V100 Training Watchdog
After=network-online.target

[Service]
# One-shot job: each activation runs the watchdog script once through bash.
# No [Install] section; forge-v100-watchdog.timer names this unit in Unit=.
Type=oneshot
WorkingDirectory=%h/projects/LowResource-LLM-Forge
ExecStart=/usr/bin/env bash %h/projects/LowResource-LLM-Forge/scripts/watchdog_training.sh
12 changes: 12 additions & 0 deletions deploy/systemd/forge-v100-watchdog.timer
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[Unit]
Description=Run V100 training watchdog every 30 seconds

[Timer]
# First run one minute after boot, then 30 seconds after each activation
# of the watchdog service.
OnBootSec=1min
OnUnitActiveSec=30s
# Allow at most 5 seconds of scheduling slack.
AccuracySec=5s
Unit=forge-v100-watchdog.service
# Persistent= is intentionally not set: it only affects OnCalendar= timers
# and has no effect on the monotonic triggers used here.

[Install]
WantedBy=timers.target
Loading