Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
308 changes: 305 additions & 3 deletions .github/workflows/e2e-nvidia-l40s-x4-sdk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,317 @@
name: E2E (NVIDIA L40S x4) SDK Test

on:
pull_request:
branches:
- "main"
schedule:
- cron: '0 16 * * *' # Runs at 4PM UTC every day
workflow_dispatch:
inputs:
pr_or_branch:
description: 'pull request number or branch name'
required: true
default: 'main'
concurrency:
group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
cancel-in-progress: true

env:
TMPDIR: /home/tmp

jobs:
noop:
start-large-ec2-runner:
runs-on: ubuntu-latest
outputs:
label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
steps:
- name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: instructlab/ci-actions
# clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
path: ci-actions
ref: release-v0.1
sparse-checkout: |
actions/launch-ec2-runner-with-fallback

- name: Launch EC2 Runner with Fallback
id: launch-ec2-instance-with-fallback
uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
env:
TMPDIR: "/tmp"
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
regions_config: >
[
{
"region": "us-east-2",
"subnets": {
"us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
"us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
"us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
},
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
},
{
"region": "us-east-1",
"subnets": {
"us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
"us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
"us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
"us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
"us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
"us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
},
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
}
]
try_spot_instance_first: false
ec2_instance_type: g6e.12xlarge
aws_resource_tags: >
[
{"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
]

e2e-large-test:
needs:
- start-large-ec2-runner
runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

permissions:
pull-requests: write

steps:
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
with:
egress-policy: audit
- name: Install Packages
run: |
cat /etc/os-release
mkdir -p "${TMPDIR}"
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
# https://github.com/actions/checkout/issues/249
fetch-depth: 0

- name: Install dependent PRs if needed
uses: depends-on/depends-on-action@61cb3f4a0e2c8ae4b90c9448dc57c7ba9ca24c35 # main
with:
token: ${{ secrets.GITHUB_TOKEN }}

- name: Fetch and checkout PR
if: ${{ github.event_name == 'pull_request_target' }}
run: |
git fetch origin pull/${{ github.event.number }}/head:pr-${{ github.event.number }}
git checkout pr-${{ github.event.number }}

- name: Update instructlab-training library
run: |
export CUDA_HOME="/usr/local/cuda"
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
export PATH="$PATH:$CUDA_HOME/bin"
nvidia-smi
python3.11 -m venv --upgrade-deps venv
. venv/bin/activate
pip install instructlab
pip install instructlab[cuda]
pip install vllm
python3.11 -m pip install packaging wheel setuptools-scm
pip install .
pip install .[cuda]
python3.11 -m pip uninstall -y flash-attn
python3.11 -m pip cache purge
python3.11 -m pip install ninja
MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation

- name: Check disk before tests
run: |
df -h

# TODO: switch to downloading a ds rather than generating one
# - name: Download SDG Dataset
# working-directory: ./training
# uses: actions/download-artifact@v4
# with:
# name: sdg-dataset.jsonl
# path: dataset

- name: Run e2e test
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
. venv/bin/activate
ls scripts
ls ./
./scripts/test-sdk.sh

# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
# and we know that it will be written into a directory created by `mktemp -d`.
# Given this information, we can use the following command to find the file:
log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
phase_num=1;
for log_file in $log_files; do
mv "${log_file}" phase-${phase_num}-training-log.jsonl
((phase_num++))
done

- name: Check disk after tests
run: |
df -h

- name: Upload training logs Phase 1
uses: actions/upload-artifact@v4
with:
name: phase-1-training-log.jsonl
path: ./phase-1-training-log.jsonl
retention-days: 1
overwrite: true

- name: Upload training logs Phase 2
uses: actions/upload-artifact@v4
with:
name: phase-2-training-log.jsonl
path: ./phase-2-training-log.jsonl
retention-days: 1
overwrite: true

stop-large-ec2-runner:
needs:
- start-large-ec2-runner
- e2e-large-test
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
- name: No-op
run: 'true'
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
with:
egress-policy: audit

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ vars.AWS_REGION }}

- name: Stop EC2 runner
uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-large-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

loss-graphs:
needs:
- stop-large-ec2-runner
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@c6295a65d1254861815972266d5933fd6e532bdf
with:
egress-policy: audit

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ vars.AWS_REGION }}

- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
# https://github.com/actions/checkout/issues/249
fetch-depth: 0

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-dev.txt

- name: Download loss data Phase 1
id: phase-1-download-logs
uses: actions/download-artifact@v4
with:
name: phase-1-training-log.jsonl
path: downloaded-data

- name: Download loss data Phase 2
id: phase-2-download-logs
uses: actions/download-artifact@v4
with:
name: phase-2-training-log.jsonl
path: downloaded-data

- name: Try to upload Phase 1 to s3
id: phase-1-upload-s3
continue-on-error: true
run: |
python ./scripts/create-loss-graph.py \
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
--output-file "./phase-1-test.md" \
--phase "1" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
--base-branch "${GITHUB_REF##*/}" \
--head-sha "${{ github.sha }}" \
--pr-number "${{ github.event.number }}" \
--origin-repository "${{ github.repository }}"

- name: Try to upload Phase 2 to s3
id: phase-2-upload-s3
continue-on-error: true
run: |
python ./scripts/create-loss-graph.py \
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
--output-file "./phase-2-test.md" \
--phase "2" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
--base-branch "${GITHUB_REF##*/}" \
--head-sha "${{ github.sha }}" \
--pr-number "${{ github.event.number }}" \
--origin-repository "${{ github.repository }}"

- name: Check Phase 1 S3 upload status for success
if: steps.phase-1-upload-s3.outcome == 'success'
run: |
echo "Uploaded Phase 1 loss graph to S3."
cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"

- name: Check Phase 2 S3 upload status for success
if: steps.phase-2-upload-s3.outcome == 'success'
run: |
echo "Uploaded Phase 2 loss graph to S3."
cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"

- name: Check Phase 1 S3 upload status for failure
if: steps.phase-1-upload-s3.outcome == 'failure'
run: |
echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

- name: Check Phase 2 S3 upload status for failure
if: steps.phase-2-upload-s3.outcome == 'failure'
run: |
echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
30 changes: 30 additions & 0 deletions scripts/ibm_legacy_tmpl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# SPDX-License-Identifier: Apache-2.0

# First Party
from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo

SPECIAL_TOKENS = SpecialTokens(
system=TokenInfo("<|system|>", add_to_tokenizer=True),
user=TokenInfo("<|user|>", add_to_tokenizer=True),
assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True),
eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True),
pad=TokenInfo("<|pad|>", add_to_tokenizer=True),
bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True),
)

CHAT_TEMPLATE = (
"{% for message in messages %}"
"{% if message['role'] == 'pretraining' %}"
"{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}"
"{% elif message['role'] == 'system' %}"
"{{'<|system|>'+ '\n' + message['content'] + '\n'}}"
"{% elif message['role'] == 'user' %}"
"{{'<|user|>' + '\n' + message['content'] + '\n'}}"
"{% elif message['role'] == 'assistant' %}"
"{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}"
"{% endif %}"
"{% if loop.last and add_generation_prompt %}"
"{{ '<|assistant|>' + '\n' }}"
"{% endif %}"
"{% endfor %}"
)
Loading
Loading