From 2bb68e52c39074795d01ee1549a760fba41a5b55 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 13 Feb 2023 06:12:01 +0100 Subject: [PATCH 1/4] ci: adding HPU agents --- .azure/ci-testig-parameterized.yml | 8 +- ...testing-template.yml => cuda-template.yml} | 7 +- .azure/habana-template.yml | 106 ++++++++++++++++++ 3 files changed, 117 insertions(+), 4 deletions(-) rename .azure/{testing-template.yml => cuda-template.yml} (97%) create mode 100644 .azure/habana-template.yml diff --git a/.azure/ci-testig-parameterized.yml b/.azure/ci-testig-parameterized.yml index eb6ad1e1..33f99c58 100644 --- a/.azure/ci-testig-parameterized.yml +++ b/.azure/ci-testig-parameterized.yml @@ -14,7 +14,7 @@ schedules: include: ["main"] jobs: -- template: testing-template.yml +- template: cuda-template.yml parameters: configs: - "Lightning-AI/metrics_pl-develop.yaml" @@ -24,3 +24,9 @@ jobs: - "microsoft/deepspeed-release.yaml" - "neptune-ai/lightning_integration.yaml" - "manujosephv/pytorch-tabular_lit-release.yaml" + +- template: habana-template.yml + parameters: + configs: + - "Lightning-AI/metrics_pl-develop.yaml" + - "Lightning-AI/metrics_pl-release.yaml" diff --git a/.azure/testing-template.yml b/.azure/cuda-template.yml similarity index 97% rename from .azure/testing-template.yml rename to .azure/cuda-template.yml index d8b15ac2..e265c786 100644 --- a/.azure/testing-template.yml +++ b/.azure/cuda-template.yml @@ -36,8 +36,6 @@ jobs: timeoutInMinutes: 75 # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: 2 - workspace: - clean: all pool: 'lit-rtx-3090' # this need to have installed docker in the base image... @@ -47,6 +45,9 @@ jobs: # image: "nvcr.io/nvidia/pytorch:21.11-py3" image: "pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime" options: "--gpus=all --shm-size=8g -v /usr/bin/docker:/tmp/docker:ro" + workspace: + clean: all + steps: - bash: | @@ -70,7 +71,7 @@ jobs: - bash: | sudo apt-get update -q --fix-missing - sudo apt-get install -q -y build-essential gcc g++ cmake git unzip tree --no-install-recommends + sudo apt-get install -q -y --no-install-recommends build-essential gcc g++ cmake git unzip tree # Python's dependencies pip --version pip install -r requirements.txt diff --git a/.azure/habana-template.yml b/.azure/habana-template.yml new file mode 100644 index 00000000..95f46ce8 --- /dev/null +++ b/.azure/habana-template.yml @@ -0,0 +1,106 @@ +jobs: + +- job: check_diff + pool: + vmImage: 'Ubuntu-20.04' + steps: + - bash: | + pip --version + pip install -q -r requirements.txt + pip list + displayName: 'Install dependencies' + + - script: | + echo $PR_NUMBER + CONFIGS=$(python _actions/assistant.py changed_configs $PR_NUMBER --as_list=False 2>&1) + printf "Changed configs: $CONFIGS\n" + echo "##vso[task.setvariable variable=diff;isOutput=true]$CONFIGS" + name: files + env: + PR_NUMBER: "$(System.PullRequest.PullRequestNumber)" + displayName: 'Config diff' + + +- ${{ each config in parameters.configs }}: + - job: + displayName: ${{config}} + dependsOn: check_diff + variables: + # map the output variable from A into this job + configs: $[ dependencies.check_diff.outputs['files.diff'] ] + config: "${{ config }}" + + condition: or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), contains(variables['configs'], variables['config'])) + # how long to run the job before automatically cancelling + timeoutInMinutes: 75 + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: 2 + + pool: 'intel-hpus' + # this need to have installed docker in the base image... + container: + image: "vault.habana.ai/gaudi-docker/1.8.0/ubuntu20.04/habanalabs/pytorch-installer-1.13.1:latest" + options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g -v /usr/bin/docker:/tmp/docker:ro" + workspace: + clean: all + + steps: + + - script: | + container_id=$(head -1 /proc/self/cgroup|cut -d/ -f3) + /tmp/docker exec -t -u 0 $container_id \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confold" -y install sudo" + echo "##vso[task.setvariable variable=CONTAINER_ID]$container_id" + displayName: 'Install Sudo in container (thanks Microsoft!)' + + - bash: | + whoami && id + sudo apt-get install -q -y hwinfo + hwinfo --short + python --version + python --version + pip --version + pip list + displayName: 'Image info & HW' + + - bash: | + sudo apt-get update -q --fix-missing + sudo apt-get install -q -y --no-install-recommends build-essential gcc g++ cmake git unzip tree + # Python's dependencies + pip --version + pip install -r requirements.txt + pip list + displayName: 'Install dependencies' + + #- bash: | + # echo $CONTAINER_ID + # displayName: 'Sanity check' + + - bash: | + python _actions/assistant.py prepare_env --config_file=${{config}} > prepare_env.sh + cat prepare_env.sh + displayName: 'Create scripts' + + - bash: | + bash prepare_env.sh + # pip list + tree . + displayName: 'Prepare env.' + + - script: | + ENVS=$(python _actions/assistant.py list_env --config_file=${{config}} --export 2>&1) + printf "PyTest env. variables: $ENVS\n" + echo "##vso[task.setvariable variable=envs;isOutput=true]$ENVS" + ARGS=$(python _actions/assistant.py specify_tests --config_file=${{config}} 2>&1) + printf "PyTest arguments: $ARGS\n" + echo "##vso[task.setvariable variable=args;isOutput=true]$ARGS" + name: testing + displayName: 'testing specs' + + - bash: | + $(testing.envs) + python -m pytest $(testing.args) -v + workingDirectory: _integrations + displayName: 'Integration tests' + + # ToDo: add Slack notification From 116b21f340e4380d26b5e23df53059030a94a6f8 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 13 Feb 2023 06:13:36 +0100 Subject: [PATCH 2/4] tm trigger --- configs/Lightning-AI/metrics_pl-release.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/Lightning-AI/metrics_pl-release.yaml b/configs/Lightning-AI/metrics_pl-release.yaml index 4d66d872..48a1f007 100644 --- a/configs/Lightning-AI/metrics_pl-release.yaml +++ b/configs/Lightning-AI/metrics_pl-release.yaml @@ -24,6 +24,6 @@ dependencies: # install_extras: all runtimes: - - {os: "ubuntu-20.04", python: "3.8"} - - {os: "macOS-11", python: "3.8"} + - {os: "ubuntu-22.04", python: "3.10"} + - {os: "macOS-12", python: "3.9"} - {os: "windows-2022", python: "3.8"} From 1ad49b79edd64a199548672d3d4f2849cbd4dad2 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 13 Feb 2023 06:44:42 +0100 Subject: [PATCH 3/4] deps --- .azure/cuda-template.yml | 6 +++--- .azure/habana-template.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.azure/cuda-template.yml b/.azure/cuda-template.yml index e265c786..74823e05 100644 --- a/.azure/cuda-template.yml +++ b/.azure/cuda-template.yml @@ -1,6 +1,6 @@ jobs: -- job: check_diff +- job: check_cuda_diff pool: vmImage: 'Ubuntu-20.04' steps: @@ -24,10 +24,10 @@ jobs: - ${{ each config in parameters.configs }}: - job: displayName: ${{config}} - dependsOn: check_diff + dependsOn: check_cuda_diff variables: # map the output variable from A into this job - configs: $[ dependencies.check_diff.outputs['files.diff'] ] + configs: $[ dependencies.check_cuda_diff.outputs['files.diff'] ] config: "${{ config }}" DEVICES: $( python -c 'name = "$(Agent.Name)" ; gpus = name.split("_")[-1] if "_" in name else "0"; print(gpus)' ) diff --git a/.azure/habana-template.yml b/.azure/habana-template.yml index 95f46ce8..609093ff 100644 --- a/.azure/habana-template.yml +++ b/.azure/habana-template.yml @@ -1,6 +1,6 @@ jobs: -- job: check_diff +- job: check_habana_diff pool: vmImage: 'Ubuntu-20.04' steps: @@ -24,10 +24,10 @@ jobs: - ${{ each config in parameters.configs }}: - job: displayName: ${{config}} - dependsOn: check_diff + dependsOn: check_habana_diff variables: # map the output variable from A into this job - configs: $[ dependencies.check_diff.outputs['files.diff'] ] + configs: $[ dependencies.check_habana_diff.outputs['files.diff'] ] config: "${{ config }}" condition: or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), contains(variables['configs'], variables['config'])) From eede2c0588c51d06b6bc54bd37afaf76c8a1545c Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 13 Feb 2023 07:07:10 +0100 Subject: [PATCH 4/4] status --- .azure/habana-template.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.azure/habana-template.yml b/.azure/habana-template.yml index 609093ff..76b484bf 100644 --- a/.azure/habana-template.yml +++ b/.azure/habana-template.yml @@ -57,11 +57,12 @@ jobs: whoami && id sudo apt-get install -q -y hwinfo hwinfo --short - python --version + hl-smi -L + lsmod | grep habanalabs python --version pip --version pip list - displayName: 'Image info & HW' + displayName: 'Image info & HW status' - bash: | sudo apt-get update -q --fix-missing