name: E2E (NVIDIA L40S x4) SDK Test

on:
  pull_request:
    branches:
      - "main"
  schedule:
    - cron: '0 16 * * *' # Runs at 4PM UTC every day
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'
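
# Example of a manual run via the GitHub CLI (the PR number is illustrative):
#   gh workflow run 'E2E (NVIDIA L40S x4) SDK Test' -f pr_or_branch=1234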

concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true
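# A new run in the same group (same PR number, or same ref for scheduled and manual
# runs) cancels any run of this workflow that is still in progress.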

env:
  TMPDIR: /home/tmp
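# Assumption: TMPDIR points at /home/tmp because the default /tmp on the GPU instance
# is too small for the temporary build artifacts and model downloads used below.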

jobs:
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: instructlab/ci-actions
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
          path: ci-actions
          ref: release-v0.1
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - name: Launch EC2 Runner with Fallback
        id: launch-ec2-instance-with-fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          TMPDIR: "/tmp"
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          try_spot_instance_first: false
          ec2_instance_type: g6e.12xlarge
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]
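  # Assumption: the in-house action above tries each region/subnet in regions_config
  # in order and falls back to the next entry when capacity is unavailable; with
  # try_spot_instance_first disabled it requests on-demand capacity directly.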

  e2e-large-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit
      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependent PRs if needed
        uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Fetch and checkout PR
        if: ${{ github.event_name == 'pull_request_target' }}
        run: |
          git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
          git checkout pr-${{ github.event.number }}

      - name: Update instructlab-training library
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          pip install instructlab[cuda]
          pip install vllm
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install .[cuda]
          python3.11 -m pip uninstall -y flash-attn
          python3.11 -m pip cache purge
          python3.11 -m pip install ninja
          MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation
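          # flash-attn is uninstalled and rebuilt from source above (ninja, MAX_JOBS=8,
          # --no-build-isolation), presumably so the extension is compiled against the
          # torch/CUDA version already installed in the venv rather than pulled in as a
          # prebuilt wheel.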

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      - name: Run e2e test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate
          ls scripts
          ls ./
          ./scripts/test-sdk.sh

          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # Given this information, we can use the following command to find the file:
          log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
          phase_num=1;
          for log_file in $log_files; do
              mv "${log_file}" phase-${phase_num}-training-log.jsonl
              ((phase_num++))
          done
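          # Note: each training phase writes its own log file, so the loop above
          # assumes find returns the phase-1 log before the phase-2 log; the renamed
          # files are what the upload steps below publish as artifacts.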

      - name: Check disk after tests
        run: |
          df -h

      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    if: ${{ always() }}
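    # always() keeps this job running even when e2e-large-test fails or is cancelled,
    # so the EC2 instance is always terminated.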
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

  loss-graphs:
    needs:
      - stop-large-ec2-runner
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Download loss data Phase 1
        id: phase-1-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: downloaded-data

      - name: Download loss data Phase 2
        id: phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: downloaded-data

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      - name: Try to upload Phase 1 to s3
        id: phase-1-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Try to upload Phase 2 to s3
        id: phase-2-upload-s3
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"
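      # Both S3 uploads above are best-effort (continue-on-error: true); the steps
      # below only report the outcome in the job summary instead of failing the run.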

      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"