# E2E SDK test on a 4x NVIDIA L40S (g6e.12xlarge) self-hosted EC2 runner.
# Flow: launch runner -> run e2e test + collect training logs -> stop runner
# -> render/upload loss graphs. The stop and loss-graphs jobs run with
# `if: always()` so the instance is terminated even when tests fail.
name: E2E (NVIDIA L40S x4) SDK Test

on:
  pull_request:
    branches:
      - "main"
  schedule:
    - cron: '0 16 * * *' # Runs at 4PM UTC every day
  workflow_dispatch:
    inputs:
      # NOTE(review): this input is declared but not referenced anywhere in
      # this workflow — confirm whether a checkout step should consume it.
      pr_or_branch:
        description: 'pull request number or branch name'
        required: true
        default: 'main'

# Cancel in-flight runs for the same PR (or ref, for schedule/dispatch runs).
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
  cancel-in-progress: true

env:
  # Large scratch space on the EC2 runner's root volume.
  TMPDIR: /home/tmp

jobs:
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          repository: instructlab/ci-actions
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
          path: ci-actions
          ref: release-v0.1
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - name: Launch EC2 Runner with Fallback
        id: launch-ec2-instance-with-fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          TMPDIR: "/tmp"
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # Ordered region/subnet candidates: us-east-2 first, us-east-1 fallback.
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          try_spot_instance_first: false
          ec2_instance_type: g6e.12xlarge
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  e2e-large-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

    permissions:
      pull-requests: write

    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit
      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p "${TMPDIR}"
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependent PRs if needed
        uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Fetch and checkout PR
        # NOTE(review): this workflow's triggers are pull_request / schedule /
        # workflow_dispatch, never pull_request_target, so this step is
        # currently dead — confirm whether the condition (or the trigger)
        # is what was intended.
        if: ${{ github.event_name == 'pull_request_target' }}
        run: |
          git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
          git checkout pr-${{ github.event.number }}

      - name: Update instructlab-training library
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
          export PATH="$PATH:$CUDA_HOME/bin"
          nvidia-smi
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install instructlab
          pip install instructlab[cuda]
          pip install vllm
          python3.11 -m pip install packaging wheel setuptools-scm
          pip install .
          pip install .[cuda]
          python3.11 -m pip uninstall -y flash-attn
          python3.11 -m pip cache purge
          python3.11 -m pip install ninja
          MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

      - name: Check disk before tests
        run: |
          df -h

      # TODO: switch to downloading a ds rather than generating one
      # - name: Download SDG Dataset
      #   working-directory: ./training
      #   uses: actions/download-artifact@v4
      #   with:
      #     name: sdg-dataset.jsonl
      #     path: dataset

      - name: Run e2e test
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate
          ls scripts
          ls ./
          ./scripts/test-sdk.sh

          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
          # and we know that it will be written into a directory created by `mktemp -d`.
          # Given this information, we can use the following command to find the file:
          log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
          phase_num=1;
          for log_file in $log_files; do
              mv "${log_file}" phase-${phase_num}-training-log.jsonl
              ((phase_num++))
          done

      - name: Check disk after tests
        run: |
          df -h

      # NOTE(review): upload-artifact/download-artifact below are tag-pinned
      # (@v4) while every other action in this file is SHA-pinned — consider
      # pinning these to a commit SHA as well for consistency.
      - name: Upload training logs Phase 1
        uses: actions/upload-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: ./phase-1-training-log.jsonl
          retention-days: 1
          overwrite: true

      - name: Upload training logs Phase 2
        uses: actions/upload-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: ./phase-2-training-log.jsonl
          retention-days: 1
          overwrite: true

  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    # Always terminate the EC2 instance, even if the test job failed.
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

  loss-graphs:
    needs:
      - stop-large-ec2-runner
    runs-on: ubuntu-latest
    # Runs even on failure; the download steps below will fail gracefully if
    # the log artifacts were never uploaded.
    if: ${{ always() }}
    steps:
      - name: "Harden Runner"
        # v2.10.1
        uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
        with:
          egress-policy: audit

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt

      - name: Download loss data Phase 1
        id: phase-1-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-1-training-log.jsonl
          path: downloaded-data

      - name: Download loss data Phase 2
        id: phase-2-download-logs
        uses: actions/download-artifact@v4
        with:
          name: phase-2-training-log.jsonl
          path: downloaded-data

      - name: Try to upload Phase 1 to s3
        id: phase-1-upload-s3
        # Best-effort: S3 upload failure must not fail the workflow.
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py  \
            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
            --output-file "./phase-1-test.md" \
            --phase "1" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Try to upload Phase 2 to s3
        id: phase-2-upload-s3
        # Best-effort: S3 upload failure must not fail the workflow.
        continue-on-error: true
        run: |
          python ./scripts/create-loss-graph.py  \
            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
            --output-file "./phase-2-test.md" \
            --phase "2" \
            --aws-region "${{ vars.AWS_REGION }}" \
            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
            --base-branch "${GITHUB_REF##*/}" \
            --head-sha "${{ github.sha }}" \
            --pr-number "${{ github.event.number }}" \
            --origin-repository "${{ github.repository }}"

      - name: Check Phase 1 S3 upload status for success
        if: steps.phase-1-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 1 loss graph to S3."
          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for success
        if: steps.phase-2-upload-s3.outcome == 'success'
        run: |
          echo "Uploaded Phase 2 loss graph to S3."
          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 1 S3 upload status for failure
        if: steps.phase-1-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

      - name: Check Phase 2 S3 upload status for failure
        if: steps.phase-2-upload-s3.outcome == 'failure'
        run: |
          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"