Skip to content

Commit 435e557

Browse files
committed
feat: test_train and test_model unit tests, e2e test
Add test_train.py, which mocks `train` and `train_epoch`. Additionally, add test_model.py, which tests instantiation of all new classes: Model, Accelerator, Optimizer, and Checkpointer. Signed-off-by: Charlie Doern <[email protected]>
1 parent a75ff0a commit 435e557

File tree

9 files changed

+2029
-2
lines changed

9 files changed

+2029
-2
lines changed

.github/workflows/e2e-nvidia-l40s-x4-sdk.yml

Lines changed: 308 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,317 @@
33
name: E2E (NVIDIA L40S x4) SDK Test

# Triggers: pull requests targeting main, a daily schedule, and manual dispatch.
on:
  pull_request:
    branches:
      - "main"
  schedule:
    - cron: '0 16 * * *'  # Runs at 4PM UTC every day
  workflow_dispatch:
    inputs:
      # NOTE(review): no step visible in this workflow reads this input;
      # confirm whether it is still needed or should drive the checkout ref.
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'

# One active run per PR number (or per ref for non-PR events); starting a new
# run cancels the one already in progress for the same group.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

# Workflow-wide default; individual steps may override it.
env:
  TMPDIR: /home/tmp
23+
24+
jobs:
25+
# Launches the self-hosted EC2 GPU runner and exposes its label, instance id
# and region to the jobs that depend on it.
start-large-ec2-runner:
  runs-on: ubuntu-latest
  outputs:
    label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
    ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
    ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
  steps:
    - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        repository: instructlab/ci-actions
        # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
        path: ci-actions
        ref: release-v0.1
        # only the launcher action is needed from that repository
        sparse-checkout: |
          actions/launch-ec2-runner-with-fallback

    - name: Launch EC2 Runner with Fallback
      id: launch-ec2-instance-with-fallback
      uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
      # this step uses /tmp rather than the workflow-level TMPDIR
      env:
        TMPDIR: "/tmp"
      with:
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        # JSON list of candidate regions with their subnets, AMI and security group
        regions_config: >
          [
            {
              "region": "us-east-2",
              "subnets": {
                "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
            },
            {
              "region": "us-east-1",
              "subnets": {
                "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
            }
          ]
        try_spot_instance_first: false
        ec2_instance_type: g6e.12xlarge
        # tags applied to the AWS resources created for this run
        aws_resource_tags: >
          [
            {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
            {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
            {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
          ]
86+
87+
# Runs the SDK end-to-end test on the EC2 runner started by
# start-large-ec2-runner and uploads one training log per phase as an artifact.
e2e-large-test:
  needs:
    - start-large-ec2-runner
  runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

  permissions:
    pull-requests: write

  steps:
    - name: "Harden Runner"
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit
    - name: Install Packages
      run: |
        cat /etc/os-release
        mkdir -p "${TMPDIR}"
        sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

    - name: Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    - name: Install dependent PRs if needed
      uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
      with:
        token: ${{ secrets.GITHUB_TOKEN }}

    # NOTE(review): the triggers listed under `on:` are pull_request, schedule
    # and workflow_dispatch, so this condition is never true and the step is
    # always skipped. Left unchanged on purpose; confirm whether the step
    # should be removed or the trigger/condition aligned.
    - name: Fetch and checkout PR
      if: ${{ github.event_name == 'pull_request_target' }}
      run: |
        git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
        git checkout pr-${{ github.event.number }}

    - name: Update instructlab-training library
      run: |
        export CUDA_HOME="/usr/local/cuda"
        export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
        export PATH="$PATH:$CUDA_HOME/bin"
        nvidia-smi
        python3.11 -m venv --upgrade-deps venv
        . venv/bin/activate
        pip install instructlab
        # extras are quoted so the shell never treats [cuda] as a glob pattern
        pip install 'instructlab[cuda]'
        pip install vllm
        python3.11 -m pip install packaging wheel setuptools-scm
        pip install .
        pip install '.[cuda]'
        python3.11 -m pip uninstall -y flash-attn
        python3.11 -m pip cache purge
        python3.11 -m pip install ninja
        MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

    - name: Check disk before tests
      run: |
        df -h

    # TODO: switch to downloading a ds rather than generating one
    # - name: Download SDG Dataset
    #   working-directory: ./training
    #   uses: actions/download-artifact@v4
    #   with:
    #     name: sdg-dataset.jsonl
    #     path: dataset

    - name: Run e2e test
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
        . venv/bin/activate
        ls scripts
        ls ./
        ./scripts/test-sdk.sh

        # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
        # and we know that it will be written into a directory created by `mktemp -d`.
        # `mktemp -d` honours TMPDIR, which this workflow sets, so search that
        # directory as well as /tmp. Logs are listed oldest-first so the phase
        # numbering does not depend on directory traversal order, and repeated
        # paths are dropped in case the two search roots overlap.
        log_files=$(find /tmp/ "${TMPDIR:-/tmp}/" -name "training_params_and_metrics_global0.jsonl" -printf '%T@ %p\n' | sort -n | cut -d' ' -f2- | awk '!seen[$0]++')
        phase_num=1
        for log_file in $log_files; do
          mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
          # plain arithmetic assignment: always exits 0 under `bash -e`
          phase_num=$((phase_num + 1))
        done

    - name: Check disk after tests
      run: |
        df -h

    - name: Upload training logs Phase 1
      uses: actions/upload-artifact@v4
      with:
        name: phase-1-training-log.jsonl
        path: ./phase-1-training-log.jsonl
        retention-days: 1
        overwrite: true

    - name: Upload training logs Phase 2
      uses: actions/upload-artifact@v4
      with:
        name: phase-2-training-log.jsonl
        path: ./phase-2-training-log.jsonl
        retention-days: 1
        overwrite: true
194+
195+
# Always runs after the test job (pass or fail) to stop the EC2 runner that
# start-large-ec2-runner launched.
stop-large-ec2-runner:
  needs:
    - start-large-ec2-runner
    - e2e-large-test
  runs-on: ubuntu-latest
  if: ${{ always() }}
  steps:
    - name: "Harden Runner"
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        # The launcher is configured with more than one candidate region, so
        # use the region it reported for the instance; fall back to the
        # repository default when that output is empty.
        aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region || vars.AWS_REGION }}

    - name: Stop EC2 runner
      uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
      with:
        mode: stop
        github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        label: ${{ needs.start-large-ec2-runner.outputs.label }}
        ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
222+
223+
# Downloads the per-phase training logs, renders a loss graph for each with
# scripts/create-loss-graph.py, and appends the result to the job summary.
# The two render steps are best-effort: a failure there only emits a warning.
loss-graphs:
  needs:
    - stop-large-ec2-runner
  runs-on: ubuntu-latest
  if: ${{ always() }}
  steps:
    - name: "Harden Runner"
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws-region: ${{ vars.AWS_REGION }}

    # Both artifacts land in the same local directory.
    - name: Download loss data Phase 1
      id: phase-1-download-logs
      uses: actions/download-artifact@v4
      with:
        name: phase-1-training-log.jsonl
        path: downloaded-data

    - name: Download loss data Phase 2
      id: phase-2-download-logs
      uses: actions/download-artifact@v4
      with:
        name: phase-2-training-log.jsonl
        path: downloaded-data

    - name: Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements-dev.txt

    # NOTE(review): on pull_request events GITHUB_REF has the form
    # refs/pull/<number>/merge, so "${GITHUB_REF##*/}" expands to "merge"
    # rather than a branch name; confirm that is what --base-branch expects.
    - name: Try to upload Phase 1 to s3
      id: phase-1-upload-s3
      continue-on-error: true
      run: |
        python ./scripts/create-loss-graph.py \
          --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
          --output-file "./phase-1-test.md" \
          --phase "1" \
          --aws-region "${{ vars.AWS_REGION }}" \
          --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
          --base-branch "${GITHUB_REF##*/}" \
          --head-sha "${{ github.sha }}" \
          --pr-number "${{ github.event.number }}" \
          --origin-repository "${{ github.repository }}"

    - name: Try to upload Phase 2 to s3
      id: phase-2-upload-s3
      continue-on-error: true
      run: |
        python ./scripts/create-loss-graph.py \
          --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
          --output-file "./phase-2-test.md" \
          --phase "2" \
          --aws-region "${{ vars.AWS_REGION }}" \
          --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
          --base-branch "${GITHUB_REF##*/}" \
          --head-sha "${{ github.sha }}" \
          --pr-number "${{ github.event.number }}" \
          --origin-repository "${{ github.repository }}"

    # `outcome` is the step result before continue-on-error is applied, so it
    # still distinguishes success from failure here.
    - name: Check Phase 1 S3 upload status for success
      if: steps.phase-1-upload-s3.outcome == 'success'
      run: |
        echo "Uploaded Phase 1 loss graph to S3."
        cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 2 S3 upload status for success
      if: steps.phase-2-upload-s3.outcome == 'success'
      run: |
        echo "Uploaded Phase 2 loss graph to S3."
        cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 1 S3 upload status for failure
      if: steps.phase-1-upload-s3.outcome == 'failure'
      run: |
        echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
        echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 2 S3 upload status for failure
      if: steps.phase-2-upload-s3.outcome == 'failure'
      run: |
        echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
        echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

scripts/ibm_legacy_tmpl.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
# First Party
4+
from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo
5+
6+
# Special tokens for this chat template. Every entry is created with
# add_to_tokenizer=True.
SPECIAL_TOKENS = SpecialTokens(
    # role markers
    system=TokenInfo("<|system|>", add_to_tokenizer=True),
    user=TokenInfo("<|user|>", add_to_tokenizer=True),
    assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True),
    # sequence-control tokens
    eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True),
    pad=TokenInfo("<|pad|>", add_to_tokenizer=True),
    # NOTE(review): "beggining" is misspelled, but this literal is the token
    # text itself; changing it changes the token. Confirm before touching it.
    bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True),
)
14+
15+
# Jinja chat template, assembled from adjacent string literals into a single
# string. It loops over `messages` and emits one segment per message,
# depending on the message's role.
CHAT_TEMPLATE = (
    "{% for message in messages %}"
    # pretraining: content followed by <|endoftext|>, wrapped in pretrain tags
    "{% if message['role'] == 'pretraining' %}"
    "{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}"
    # system / user: role tag, newline, content, newline
    "{% elif message['role'] == 'system' %}"
    "{{'<|system|>'+ '\n' + message['content'] + '\n'}}"
    "{% elif message['role'] == 'user' %}"
    "{{'<|user|>' + '\n' + message['content'] + '\n'}}"
    # assistant: content ends with <|endoftext|>; the trailing newline is
    # omitted on the last message
    "{% elif message['role'] == 'assistant' %}"
    "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}"
    "{% endif %}"
    # after the last message, optionally open an assistant turn
    "{% if loop.last and add_generation_prompt %}"
    "{{ '<|assistant|>' + '\n' }}"
    "{% endif %}"
    "{% endfor %}"
)

0 commit comments

Comments
 (0)