---
# E2E SDK test on a 4x NVIDIA L40S EC2 instance (g6e.12xlarge).
name: E2E (NVIDIA L40S x4) SDK Test

on:
  pull_request:
    branches:
      - "main"
  schedule:
    - cron: '0 16 * * *' # Runs at 4PM UTC every day
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'
# Cancel in-flight runs for the same PR/branch when a new run starts.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

env:
  # Redirect temp files to the large local volume on the self-hosted runner.
  TMPDIR: /home/tmp
jobs:
  # Launch the self-hosted GPU runner; outputs feed e2e-large-test and
  # stop-large-ec2-runner below.
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: instructlab/ci-actions
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
          path: ci-actions
          ref: release-v0.1
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - name: Launch EC2 Runner with Fallback
        id: launch-ec2-instance-with-fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          TMPDIR: "/tmp"
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # Preferred region first; us-east-1 is the fallback.
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          try_spot_instance_first: false
          ec2_instance_type: g6e.12xlarge
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]
+ e2e-large-test :
88
+ needs :
89
+ - start-large-ec2-runner
90
+ runs-on : ${{ needs.start-large-ec2-runner.outputs.label }}
91
+
92
+ permissions :
93
+ pull-requests : write
94
+
95
+ steps :
96
+ - name : " Harden Runner"
97
+ # v2.10.1
98
+ uses : step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
99
+ with :
100
+ egress-policy : audit
101
+ - name : Install Packages
102
+ run : |
103
+ cat /etc/os-release
104
+ mkdir -p "${TMPDIR}"
105
+ sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
106
+
107
+ - name : Checkout
108
+ uses : actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
109
+ with :
110
+ # https://github.com/actions/checkout/issues/249
111
+ fetch-depth : 0
112
+
113
+ - name : Determine if pr_or_branch is a PR number
114
+ id : check_pr
115
+ run : |
116
+ PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set
117
+ if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
118
+ echo "is_pr=true" >> "$GITHUB_OUTPUT"
119
+ else
120
+ echo "is_pr=false" >> "$GITHUB_OUTPUT"
121
+ fi
122
+ echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
123
+
124
+ - name : Check if gh cli is installed
125
+ id : gh_cli
126
+ run : |
127
+ if command -v gh &> /dev/null ; then
128
+ echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
129
+ else
130
+ echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
131
+ fi
132
+
133
+ - name : Install gh CLI
134
+ if : steps.gh_cli.outputs.gh_cli_installed == 'false'
135
+ run : |
136
+ sudo dnf install 'dnf-command(config-manager)' -y
137
+ sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
138
+ sudo dnf install gh --repo gh-cli -y
139
+
140
+ - name : test gh CLI
141
+ run : |
142
+ gh --version
143
+
144
+ - name : set default repo
145
+ run : |
146
+ gh repo set-default ${{ github.server_url }}/${{ github.repository }}
147
+ env :
148
+ GH_TOKEN : ${{ secrets.GITHUB_TOKEN }}
149
+
150
+ - name : Add comment to PR
151
+ if : steps.check_pr.outputs.is_pr == 'true'
152
+ run : |
153
+ gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
154
+ env :
155
+ GH_TOKEN : ${{ secrets.GITHUB_TOKEN }}
156
+
157
+ - name : Fetch and checkout PR
158
+ if : steps.check_pr.outputs.is_pr == 'true'
159
+ run : |
160
+ gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
161
+ env :
162
+ GH_TOKEN : ${{ secrets.GITHUB_TOKEN }}
163
+
164
+ - name : Checkout branch
165
+ if : steps.check_pr.outputs.is_pr == 'false'
166
+ run : |
167
+ git checkout ${{ steps.check_pr.outputs.pr_or_branch }}
168
+
169
+ - name : Update instructlab-training library
170
+ run : |
171
+ export CUDA_HOME="/usr/local/cuda"
172
+ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
173
+ export PATH="$PATH:$CUDA_HOME/bin"
174
+ nvidia-smi
175
+ python3.11 -m venv --upgrade-deps venv
176
+ . venv/bin/activate
177
+ pip install instructlab
178
+ pip install instructlab[cuda]
179
+ python3.11 -m pip install packaging wheel setuptools-scm
180
+ pip install .
181
+ pip install .[cuda]
182
+ python3.11 -m pip uninstall -y flash-attn
183
+ python3.11 -m pip cache purge
184
+ python3.11 -m pip install ninja
185
+ MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation
186
+
187
+ - name : Check disk before tests
188
+ run : |
189
+ df -h
190
+
191
+ # TODO: switch to downloading a ds rather than generating one
192
+ # - name: Download SDG Dataset
193
+ # working-directory: ./training
194
+ # uses: actions/download-artifact@v4
195
+ # with:
196
+ # name: sdg-dataset.jsonl
197
+ # path: dataset
198
+
199
+ - name : Run e2e test
200
+ env :
201
+ HF_TOKEN : ${{ secrets.HF_TOKEN }}
202
+ OPENAI_API_KEY : ${{ secrets.OPENAI_API_KEY }}
203
+ run : |
204
+ . venv/bin/activate
205
+
206
+ ./scripts/test-sdk.sh
207
+
208
+ # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
209
+ # and we know that it will be written into a directory created by `mktemp -d`.
210
+ # Given this information, we can use the following command to find the file:
211
+ log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
212
+ phase_num=1;
213
+ for log_file in $log_files; do
214
+ mv "${log_file}" phase-${phase_num}-training-log.jsonl
215
+ ((phase_num++))
216
+ done
217
+
218
+ - name : Check disk after tests
219
+ run : |
220
+ df -h
221
+
222
+ - name : Upload training logs Phase 1
223
+ uses : actions/upload-artifact@v4
224
+ with :
225
+ name : phase-1-training-log.jsonl
226
+ path : ./phase-1-training-log.jsonl
227
+ retention-days : 1
228
+ overwrite : true
229
+
230
+ - name : Upload training logs Phase 2
231
+ uses : actions/upload-artifact@v4
232
+ with :
233
+ name : phase-2-training-log.jsonl
234
+ path : ./phase-2-training-log.jsonl
235
+ retention-days : 1
236
+ overwrite : true
237
+
238
+ - name : Add comment to PR if the workflow failed
239
+ if : failure() && steps.check_pr.outputs.is_pr == 'true'
240
+ run : |
241
+ gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
242
+ env :
243
+ GH_TOKEN : ${{ secrets.GITHUB_TOKEN }}
244
+
245
+ - name : Add comment to PR if the workflow succeeded
246
+ if : success() && steps.check_pr.outputs.is_pr == 'true'
247
+ run : |
248
+ gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
249
+ env :
250
+ GH_TOKEN : ${{ secrets.GITHUB_TOKEN }}
251
+
252
+ stop-large-ec2-runner :
253
+ needs :
254
+ - start-large-ec2-runner
255
+ - e2e-large-test
256
+ runs-on : ubuntu-latest
257
+ if : ${{ always() }}
258
+ steps :
259
+ - name : " Harden Runner"
260
+ # v2.10.1
261
+ uses : step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
262
+ with :
263
+ egress-policy : audit
264
+
265
+ - name : Configure AWS credentials
266
+ uses : aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
267
+ with :
268
+ aws-access-key-id : ${{ secrets.AWS_ACCESS_KEY_ID }}
269
+ aws-secret-access-key : ${{ secrets.AWS_SECRET_ACCESS_KEY }}
270
+ aws-region : ${{ vars.AWS_REGION }}
271
+
272
+ - name : Stop EC2 runner
273
+ uses : machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
274
+ with :
275
+ mode : stop
276
+ github-token : ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
277
+ label : ${{ needs.start-large-ec2-runner.outputs.label }}
278
+ ec2-instance-id : ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
279
+
280
+ loss-graphs :
281
+ needs :
282
+ - stop-large-ec2-runner
283
+ runs-on : ubuntu-latest
284
+ if : ${{ always() }}
285
+ steps :
286
+ - name : " Harden Runner"
287
+ # v2.10.1
288
+ uses : step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
289
+ with :
290
+ egress-policy : audit
291
+
292
+ - name : Configure AWS credentials
293
+ uses : aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
294
+ with :
295
+ aws-access-key-id : ${{ secrets.AWS_ACCESS_KEY_ID }}
296
+ aws-secret-access-key : ${{ secrets.AWS_SECRET_ACCESS_KEY }}
297
+ aws-region : ${{ vars.AWS_REGION }}
298
+
299
+ - name : Download loss data Phase 1
300
+ id : phase-1-download-logs
301
+ uses : actions/download-artifact@v4
302
+ with :
303
+ name : phase-1-training-log.jsonl
304
+ path : downloaded-data
305
+
306
+ - name : Download loss data Phase 2
307
+ id : phase-2-download-logs
308
+ uses : actions/download-artifact@v4
309
+ with :
310
+ name : phase-2-training-log.jsonl
311
+ path : downloaded-data
312
+
313
+ - name : Checkout
314
+ uses : actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
315
+ with :
316
+ # https://github.com/actions/checkout/issues/249
317
+ fetch-depth : 0
318
+
319
+ - name : Install dependencies
320
+ run : |
321
+ python -m pip install --upgrade pip
322
+ pip install -r requirements-dev.txt
323
+
324
+ - name : Try to upload Phase 1 to s3
325
+ id : phase-1-upload-s3
326
+ continue-on-error : true
327
+ run : |
328
+ python ./scripts/create-loss-graph.py \
329
+ --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
330
+ --output-file "./phase-1-test.md" \
331
+ --phase "1" \
332
+ --aws-region "${{ vars.AWS_REGION }}" \
333
+ --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
334
+ --base-branch "${GITHUB_REF##*/}" \
335
+ --head-sha "${{ github.sha }}" \
336
+ --pr-number "${{ github.event.number }}" \
337
+ --origin-repository "${{ github.repository }}"
338
+
339
+ - name : Try to upload Phase 2 to s3
340
+ id : phase-2-upload-s3
341
+ continue-on-error : true
342
+ run : |
343
+ python ./scripts/create-loss-graph.py \
344
+ --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
345
+ --output-file "./phase-2-test.md" \
346
+ --phase "2" \
347
+ --aws-region "${{ vars.AWS_REGION }}" \
348
+ --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
349
+ --base-branch "${GITHUB_REF##*/}" \
350
+ --head-sha "${{ github.sha }}" \
351
+ --pr-number "${{ github.event.number }}" \
352
+ --origin-repository "${{ github.repository }}"
353
+
354
+ - name : Check Phase 1 S3 upload status for success
355
+ if : steps.phase-1-upload-s3.outcome == 'success'
356
+ run : |
357
+ echo "Uploaded Phase 1 loss graph to S3."
358
+ cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
359
+
360
+ - name : Check Phase 2 S3 upload status for success
361
+ if : steps.phase-2-upload-s3.outcome == 'success'
362
+ run : |
363
+ echo "Uploaded Phase 2 loss graph to S3."
364
+ cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
365
+
366
+ - name : Check Phase 1 S3 upload status for failure
367
+ if : steps.phase-1-upload-s3.outcome == 'failure'
368
+ run : |
369
+ echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
370
+ echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
371
+
372
+ - name : Check Phase 2 S3 upload status for failure
373
+ if : steps.phase-2-upload-s3.outcome == 'failure'
374
+ run : |
375
+ echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
376
+ echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
0 commit comments