Skip to content

Commit 746f76a

Browse files
committed
feat: test_train and test_model unit tests, e2e test
Add test_train.py, which mocks `train` and `train_epoch`. Additionally, add test_model.py, which tests instantiation of all new classes: Model, Accelerator, Optimizer, and Checkpointer. Signed-off-by: Charlie Doern <[email protected]>
1 parent a75ff0a commit 746f76a

File tree

9 files changed

+2074
-2
lines changed

9 files changed

+2074
-2
lines changed

.github/workflows/e2e-nvidia-l40s-x4-sdk.yml

Lines changed: 365 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,374 @@
33
# End-to-end SDK test workflow.
name: E2E (NVIDIA L40S x4) SDK Test

on:
  # Every pull request that targets main.
  pull_request:
    branches:
      - main
  # Daily at 16:00 UTC.
  schedule:
    - cron: "0 16 * * *"
  # Manual run against a chosen pull request number or branch name.
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: "pull request number or branch name"
        required: true
        default: "main"
12-
jobs:
17+
# Runs are grouped by workflow name plus PR number (or ref when the event has
# no PR number); starting a new run cancels the one in progress in that group.
concurrency:
  group: "${{ github.workflow }}-${{ github.event.number || github.ref }}"
  cancel-in-progress: true

# Workflow-wide environment; individual steps may override it.
env:
  TMPDIR: "/home/tmp"
23+
24+
jobs:
25+
# Launches the EC2 runner and publishes its label, instance id and region as
# job outputs for the jobs that follow.
start-large-ec2-runner:
  runs-on: ubuntu-latest
  outputs:
    label: "${{ steps.launch-ec2-instance-with-fallback.outputs.label }}"
    ec2-instance-id: "${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}"
    ec2-instance-region: "${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}"
  steps:
    - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        repository: instructlab/ci-actions
        ref: release-v0.1
        # Clone the "ci-actions" repo into a local "ci-actions" directory
        # instead of overwriting the current WORKDIR contents.
        path: ci-actions
        # Only the launch action is needed from that repository.
        sparse-checkout: |
          actions/launch-ec2-runner-with-fallback

    - name: Launch EC2 Runner with Fallback
      id: launch-ec2-instance-with-fallback
      uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
      env:
        # This step overrides the workflow-level TMPDIR.
        TMPDIR: "/tmp"
      with:
        aws_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID }}"
        aws_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
        github_token: "${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}"
        ec2_instance_type: g6e.12xlarge
        try_spot_instance_first: false
        # JSON list of candidate regions with their subnets, AMI and
        # security group.
        regions_config: >
          [
            {
              "region": "us-east-2",
              "subnets": {
                "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
            },
            {
              "region": "us-east-1",
              "subnets": {
                "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
              },
              "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
              "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
            }
          ]
        # JSON list of tags applied to the AWS resources that are created.
        aws_resource_tags: >
          [
            {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
            {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
            {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
          ]
86+
87+
# Runs ./scripts/test-sdk.sh on the runner started by start-large-ec2-runner,
# uploads the training logs it finds as artifacts, and comments on the pull
# request when a PR number was supplied.
e2e-large-test:
  needs:
    - start-large-ec2-runner
  runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

  permissions:
    pull-requests: write

  steps:
    - name: "Harden Runner"
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit
    - name: Install Packages
      run: |
        cat /etc/os-release
        mkdir -p "${TMPDIR}"
        sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

    - name: Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    # NOTE(review): pull_request and schedule events carry no pr_or_branch
    # input, so this falls back to 'main' and the "Checkout branch" step below
    # checks out main. Confirm that is intended for the pull_request trigger.
    - name: Determine if pr_or_branch is a PR number
      id: check_pr
      env:
        # The user-supplied input is handed over through the environment
        # rather than being expanded into the script text, so its content
        # cannot be interpreted as shell code. Defaults to 'main' if not set.
        PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
      run: |
        if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
          echo "is_pr=true" >> "$GITHUB_OUTPUT"
        else
          echo "is_pr=false" >> "$GITHUB_OUTPUT"
        fi
        echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"

    - name: Check if gh cli is installed
      id: gh_cli
      run: |
        if command -v gh &> /dev/null ; then
          echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
        else
          echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
        fi

    - name: Install gh CLI
      if: steps.gh_cli.outputs.gh_cli_installed == 'false'
      run: |
        sudo dnf install 'dnf-command(config-manager)' -y
        sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
        sudo dnf install gh --repo gh-cli -y

    - name: test gh CLI
      run: |
        gh --version

    - name: set default repo
      run: |
        gh repo set-default ${{ github.server_url }}/${{ github.repository }}
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    # pr_or_branch is all digits whenever is_pr is 'true' (see check_pr), so
    # expanding it into the commands of the PR-only steps is safe.
    - name: Add comment to PR
      if: steps.check_pr.outputs.is_pr == 'true'
      run: |
        gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Fetch and checkout PR
      if: steps.check_pr.outputs.is_pr == 'true'
      run: |
        gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Checkout branch
      if: steps.check_pr.outputs.is_pr == 'false'
      env:
        # An arbitrary branch name: pass it via the environment and quote it so
        # it is never parsed as shell syntax.
        PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
      run: |
        git checkout "${PR_OR_BRANCH}"

    - name: Update instructlab-training library
      run: |
        export CUDA_HOME="/usr/local/cuda"
        export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
        export PATH="$PATH:$CUDA_HOME/bin"
        nvidia-smi
        python3.11 -m venv --upgrade-deps venv
        . venv/bin/activate
        pip install instructlab
        pip install instructlab[cuda]
        python3.11 -m pip install packaging wheel setuptools-scm
        pip install .
        pip install .[cuda]
        python3.11 -m pip uninstall -y flash-attn
        python3.11 -m pip cache purge
        python3.11 -m pip install ninja
        MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

    - name: Check disk before tests
      run: |
        df -h

    # TODO: switch to downloading a ds rather than generating one
    # - name: Download SDG Dataset
    #   working-directory: ./training
    #   uses: actions/download-artifact@v4
    #   with:
    #     name: sdg-dataset.jsonl
    #     path: dataset

    - name: Run e2e test
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      run: |
        . venv/bin/activate

        ./scripts/test-sdk.sh

        # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
        # and we know that it will be written into a directory created by `mktemp -d`.
        # `mktemp -d` honors TMPDIR, which this workflow sets to a directory
        # other than /tmp, so search that directory in addition to /tmp.
        search_dirs=(/tmp/)
        if [[ -n "${TMPDIR:-}" && -d "${TMPDIR}" && "${TMPDIR%/}" != "/tmp" ]]; then
          search_dirs+=("${TMPDIR}")
        fi
        log_files=$(find "${search_dirs[@]}" -name "training_params_and_metrics_global0.jsonl")
        phase_num=1;
        for log_file in $log_files; do
          mv "${log_file}" phase-${phase_num}-training-log.jsonl
          # Plain assignment: unlike ((phase_num++)) its exit status never
          # depends on the counter value, so it cannot trip `bash -e`.
          phase_num=$((phase_num + 1))
        done

    - name: Check disk after tests
      run: |
        df -h

    - name: Upload training logs Phase 1
      uses: actions/upload-artifact@v4
      with:
        name: phase-1-training-log.jsonl
        path: ./phase-1-training-log.jsonl
        retention-days: 1
        overwrite: true

    - name: Upload training logs Phase 2
      uses: actions/upload-artifact@v4
      with:
        name: phase-2-training-log.jsonl
        path: ./phase-2-training-log.jsonl
        retention-days: 1
        overwrite: true

    - name: Add comment to PR if the workflow failed
      if: failure() && steps.check_pr.outputs.is_pr == 'true'
      run: |
        gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Add comment to PR if the workflow succeeded
      if: success() && steps.check_pr.outputs.is_pr == 'true'
      run: |
        gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
251+
252+
# Stops the EC2 runner created by start-large-ec2-runner. `always()` makes the
# job run even when the test job failed or was cancelled.
stop-large-ec2-runner:
  needs:
    - start-large-ec2-runner
    - e2e-large-test
  runs-on: ubuntu-latest
  if: ${{ always() }}
  steps:
    - name: "Harden Runner"
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        # The launch job is configured with more than one candidate region and
        # reports the one it used, so target that region. Fall back to the
        # repository-wide default only when no region was reported.
        aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region || vars.AWS_REGION }}

    - name: Stop EC2 runner
      uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
      with:
        mode: stop
        github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        label: ${{ needs.start-large-ec2-runner.outputs.label }}
        ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
279+
280+
# Downloads the two training-log artifacts, runs scripts/create-loss-graph.py
# on each, and appends the resulting markdown (or a failure note) to the run's
# step summary. Runs regardless of how the preceding job ended.
loss-graphs:
  needs: [stop-large-ec2-runner]
  runs-on: ubuntu-latest
  if: always()
  steps:
    - name: Harden Runner
      # v2.10.1
      uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
      with:
        aws-region: "${{ vars.AWS_REGION }}"
        aws-access-key-id: "${{ secrets.AWS_ACCESS_KEY_ID }}"
        aws-secret-access-key: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"

    # Both artifacts are placed in the same local directory.
    - name: Download loss data Phase 1
      id: phase-1-download-logs
      uses: actions/download-artifact@v4
      with:
        name: phase-1-training-log.jsonl
        path: downloaded-data

    - name: Download loss data Phase 2
      id: phase-2-download-logs
      uses: actions/download-artifact@v4
      with:
        name: phase-2-training-log.jsonl
        path: downloaded-data

    - name: Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements-dev.txt

    # continue-on-error keeps a failed upload from failing the job; the
    # outcome is inspected by the status steps further down.
    - name: Try to upload Phase 1 to s3
      id: phase-1-upload-s3
      continue-on-error: true
      run: |
        python ./scripts/create-loss-graph.py \
          --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
          --output-file "./phase-1-test.md" \
          --phase "1" \
          --aws-region "${{ vars.AWS_REGION }}" \
          --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
          --base-branch "${GITHUB_REF##*/}" \
          --head-sha "${{ github.sha }}" \
          --pr-number "${{ github.event.number }}" \
          --origin-repository "${{ github.repository }}"

    - name: Try to upload Phase 2 to s3
      id: phase-2-upload-s3
      continue-on-error: true
      run: |
        python ./scripts/create-loss-graph.py \
          --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
          --output-file "./phase-2-test.md" \
          --phase "2" \
          --aws-region "${{ vars.AWS_REGION }}" \
          --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
          --base-branch "${GITHUB_REF##*/}" \
          --head-sha "${{ github.sha }}" \
          --pr-number "${{ github.event.number }}" \
          --origin-repository "${{ github.repository }}"

    # On success, publish the generated markdown in the step summary.
    - name: Check Phase 1 S3 upload status for success
      if: steps.phase-1-upload-s3.outcome == 'success'
      run: |
        echo "Uploaded Phase 1 loss graph to S3."
        cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 2 S3 upload status for success
      if: steps.phase-2-upload-s3.outcome == 'success'
      run: |
        echo "Uploaded Phase 2 loss graph to S3."
        cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

    # On failure, emit a warning annotation and note it in the step summary.
    - name: Check Phase 1 S3 upload status for failure
      if: steps.phase-1-upload-s3.outcome == 'failure'
      run: |
        echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
        echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

    - name: Check Phase 2 S3 upload status for failure
      if: steps.phase-2-upload-s3.outcome == 'failure'
      run: |
        echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
        echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

0 commit comments

Comments
 (0)