Skip to content

Commit 9cb35bf

Browse files
committed
feat: refactor main_ds.py
Introduce a new design for key components of main_ds.py. Namely, splitting Model initialization, Accelerator initialization, Optimizer initialization, and Checkpoint saving initialization into classes: Model, Accelerator, Checkpointer. The Model class wraps the various AutoModel classes we support -- and aims to be a lightweight wrapper to help with usability of the library with different model types. setup_optimizer resides within the model class and returns one of the optimizer types we support. The Accelerator class aims to both store commonly accessed variables associated with the accelerated model and abstract model/optimizer mutation away from the user, who should only access our Model and Optimizer classes. The Checkpointer class introduces a unified approach to our various checkpointing techniques. A user can pass in their checkpointing style (full_state or hf_format), and the checkpointer, via checkpointer.checkpoint, will save the model using the selected method and other techniques (LoRA). These classes are one of a few steps needed to "SDK-ify" the training library. Additionally, add test_train and test_model unit tests and an e2e test: add test_train.py, which mocks `train` and `train_epoch`, and add test_model.py, which tests instantiation of all new classes: Model, Accelerator, Optimizer, Checkpointer. Signed-off-by: Charlie Doern <[email protected]>
1 parent e8eb284 commit 9cb35bf

14 files changed

+3898
-982
lines changed

.github/workflows/e2e-nvidia-l40s-x4-sdk.yml

Lines changed: 305 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,317 @@
33
name: E2E (NVIDIA L40S x4) SDK Test
44

55
on:
6+
pull_request:
7+
branches:
8+
- "main"
9+
schedule:
10+
- cron: '0 16 * * *' # Runs at 4PM UTC every day
611
workflow_dispatch:
712
inputs:
813
pr_or_branch:
914
description: 'pull request number or branch name'
1015
required: true
1116
default: 'main'
17+
concurrency:
18+
group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
19+
cancel-in-progress: true
20+
21+
env:
22+
TMPDIR: /home/tmp
23+
1224
jobs:
13-
noop:
25+
start-large-ec2-runner:
26+
runs-on: ubuntu-latest
27+
outputs:
28+
label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
29+
ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
30+
ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
31+
steps:
32+
- name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
33+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
34+
with:
35+
repository: instructlab/ci-actions
36+
# clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
37+
path: ci-actions
38+
ref: release-v0.1
39+
sparse-checkout: |
40+
actions/launch-ec2-runner-with-fallback
41+
42+
- name: Launch EC2 Runner with Fallback
43+
id: launch-ec2-instance-with-fallback
44+
uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
45+
env:
46+
TMPDIR: "/tmp"
47+
with:
48+
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
49+
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
50+
github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
51+
regions_config: >
52+
[
53+
{
54+
"region": "us-east-2",
55+
"subnets": {
56+
"us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
57+
"us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
58+
"us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
59+
},
60+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
61+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
62+
},
63+
{
64+
"region": "us-east-1",
65+
"subnets": {
66+
"us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
67+
"us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
68+
"us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
69+
"us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
70+
"us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
71+
"us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
72+
},
73+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
74+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
75+
}
76+
]
77+
try_spot_instance_first: false
78+
ec2_instance_type: g6e.12xlarge
79+
aws_resource_tags: >
80+
[
81+
{"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
82+
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
83+
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
84+
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
85+
]
86+
87+
e2e-large-test:
88+
needs:
89+
- start-large-ec2-runner
90+
runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
91+
92+
permissions:
93+
pull-requests: write
94+
95+
steps:
96+
- name: "Harden Runner"
97+
# v2.10.1
98+
uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
99+
with:
100+
egress-policy: audit
101+
- name: Install Packages
102+
run: |
103+
cat /etc/os-release
104+
mkdir -p "${TMPDIR}"
105+
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
106+
107+
- name: Checkout
108+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
109+
with:
110+
# https://github.com/actions/checkout/issues/249
111+
fetch-depth: 0
112+
113+
- name: Install dependent PRs if needed
114+
uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
115+
with:
116+
token: ${{ secrets.GITHUB_TOKEN }}
117+
118+
- name: Fetch and checkout PR
119+
if: ${{ github.event_name == 'pull_request_target' }}
120+
run: |
121+
git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
122+
git checkout pr-${{ github.event.number }}
123+
124+
- name: Update instructlab-training library
125+
run: |
126+
export CUDA_HOME="/usr/local/cuda"
127+
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
128+
export PATH="$PATH:$CUDA_HOME/bin"
129+
nvidia-smi
130+
python3.11 -m venv --upgrade-deps venv
131+
. venv/bin/activate
132+
pip install instructlab
133+
pip install instructlab[cuda]
134+
pip install vllm
135+
python3.11 -m pip install packaging wheel setuptools-scm
136+
pip install .
137+
pip install .[cuda]
138+
python3.11 -m pip uninstall -y flash-attn
139+
python3.11 -m pip cache purge
140+
python3.11 -m pip install ninja
141+
MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation
142+
143+
- name: Check disk before tests
144+
run: |
145+
df -h
146+
147+
# TODO: switch to downloading a ds rather than generating one
148+
# - name: Download SDG Dataset
149+
# working-directory: ./training
150+
# uses: actions/download-artifact@v4
151+
# with:
152+
# name: sdg-dataset.jsonl
153+
# path: dataset
154+
155+
- name: Run e2e test
156+
env:
157+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
158+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
159+
run: |
160+
. venv/bin/activate
161+
ls scripts
162+
ls ./
163+
./scripts/test-sdk.sh
164+
165+
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
166+
# and we know that it will be written into a directory created by `mktemp -d`.
167+
# Given this information, we can use the following command to find the file:
168+
log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
169+
phase_num=1;
170+
for log_file in $log_files; do
171+
mv "${log_file}" phase-${phase_num}-training-log.jsonl
172+
((phase_num++))
173+
done
174+
175+
- name: Check disk after tests
176+
run: |
177+
df -h
178+
179+
- name: Upload training logs Phase 1
180+
uses: actions/upload-artifact@v4
181+
with:
182+
name: phase-1-training-log.jsonl
183+
path: ./phase-1-training-log.jsonl
184+
retention-days: 1
185+
overwrite: true
186+
187+
- name: Upload training logs Phase 2
188+
uses: actions/upload-artifact@v4
189+
with:
190+
name: phase-2-training-log.jsonl
191+
path: ./phase-2-training-log.jsonl
192+
retention-days: 1
193+
overwrite: true
194+
195+
stop-large-ec2-runner:
196+
needs:
197+
- start-large-ec2-runner
198+
- e2e-large-test
14199
runs-on: ubuntu-latest
200+
if: ${{ always() }}
15201
steps:
16-
- name: No-op
17-
run: 'true'
202+
- name: "Harden Runner"
203+
# v2.10.1
204+
uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
205+
with:
206+
egress-policy: audit
207+
208+
- name: Configure AWS credentials
209+
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
210+
with:
211+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
212+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
213+
aws-region: ${{ vars.AWS_REGION }}
214+
215+
- name: Stop EC2 runner
216+
uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
217+
with:
218+
mode: stop
219+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
220+
label: ${{ needs.start-large-ec2-runner.outputs.label }}
221+
ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
222+
223+
loss-graphs:
224+
needs:
225+
- stop-large-ec2-runner
226+
runs-on: ubuntu-latest
227+
if: ${{ always() }}
228+
steps:
229+
- name: "Harden Runner"
230+
# v2.10.1
231+
uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
232+
with:
233+
egress-policy: audit
234+
235+
- name: Configure AWS credentials
236+
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
237+
with:
238+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
239+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
240+
aws-region: ${{ vars.AWS_REGION }}
241+
242+
- name: Checkout
243+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
244+
with:
245+
# https://github.com/actions/checkout/issues/249
246+
fetch-depth: 0
247+
248+
- name: Install dependencies
249+
run: |
250+
python -m pip install --upgrade pip
251+
pip install -r requirements-dev.txt
252+
253+
- name: Download loss data Phase 1
254+
id: phase-1-download-logs
255+
uses: actions/download-artifact@v4
256+
with:
257+
name: phase-1-training-log.jsonl
258+
path: downloaded-data
259+
260+
- name: Download loss data Phase 2
261+
id: phase-2-download-logs
262+
uses: actions/download-artifact@v4
263+
with:
264+
name: phase-2-training-log.jsonl
265+
path: downloaded-data
266+
267+
- name: Try to upload Phase 1 to s3
268+
id: phase-1-upload-s3
269+
continue-on-error: true
270+
run: |
271+
python ./scripts/create-loss-graph.py \
272+
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
273+
--output-file "./phase-1-test.md" \
274+
--phase "1" \
275+
--aws-region "${{ vars.AWS_REGION }}" \
276+
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
277+
--base-branch "${GITHUB_REF##*/}" \
278+
--head-sha "${{ github.sha }}" \
279+
--pr-number "${{ github.event.number }}" \
280+
--origin-repository "${{ github.repository }}"
281+
282+
- name: Try to upload Phase 2 to s3
283+
id: phase-2-upload-s3
284+
continue-on-error: true
285+
run: |
286+
python ./scripts/create-loss-graph.py \
287+
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
288+
--output-file "./phase-2-test.md" \
289+
--phase "2" \
290+
--aws-region "${{ vars.AWS_REGION }}" \
291+
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
292+
--base-branch "${GITHUB_REF##*/}" \
293+
--head-sha "${{ github.sha }}" \
294+
--pr-number "${{ github.event.number }}" \
295+
--origin-repository "${{ github.repository }}"
296+
297+
- name: Check Phase 1 S3 upload status for success
298+
if: steps.phase-1-upload-s3.outcome == 'success'
299+
run: |
300+
echo "Uploaded Phase 1 loss graph to S3."
301+
cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
302+
303+
- name: Check Phase 2 S3 upload status for success
304+
if: steps.phase-2-upload-s3.outcome == 'success'
305+
run: |
306+
echo "Uploaded Phase 2 loss graph to S3."
307+
cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
308+
309+
- name: Check Phase 1 S3 upload status for failure
310+
if: steps.phase-1-upload-s3.outcome == 'failure'
311+
run: |
312+
echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
313+
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
314+
315+
- name: Check Phase 2 S3 upload status for failure
316+
if: steps.phase-2-upload-s3.outcome == 'failure'
317+
run: |
318+
echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
319+
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

scripts/ibm_legacy_tmpl.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
# First Party
4+
from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo
5+
6+
SPECIAL_TOKENS = SpecialTokens(
7+
system=TokenInfo("<|system|>", add_to_tokenizer=True),
8+
user=TokenInfo("<|user|>", add_to_tokenizer=True),
9+
assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True),
10+
eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True),
11+
pad=TokenInfo("<|pad|>", add_to_tokenizer=True),
12+
bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True),
13+
)
14+
15+
CHAT_TEMPLATE = (
16+
"{% for message in messages %}"
17+
"{% if message['role'] == 'pretraining' %}"
18+
"{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}"
19+
"{% elif message['role'] == 'system' %}"
20+
"{{'<|system|>'+ '\n' + message['content'] + '\n'}}"
21+
"{% elif message['role'] == 'user' %}"
22+
"{{'<|user|>' + '\n' + message['content'] + '\n'}}"
23+
"{% elif message['role'] == 'assistant' %}"
24+
"{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}"
25+
"{% endif %}"
26+
"{% if loop.last and add_generation_prompt %}"
27+
"{{ '<|assistant|>' + '\n' }}"
28+
"{% endif %}"
29+
"{% endfor %}"
30+
)

0 commit comments

Comments
 (0)