Skip to content

Reduce size of ParallelClusterComponent #6906

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 2 additions & 19 deletions cli/src/pcluster/resources/imagebuilder/parallelcluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ phases:
commands:
- |
set -v
# Prefer the custom cookbook URL when one is supplied; fall back to the official S3 archive only when it is empty.
COOKBOOK_URL="${CfnParamChefCookbook}"
[ -z "${!COOKBOOK_URL}" ] && COOKBOOK_URL="https://${AWS::Region}-aws-parallelcluster.s3.${AWS::Region}.${AWS::URLSuffix}/parallelcluster/${CfnParamCookbookVersion}/cookbooks/{{ build.PClusterCookbookVersionName.outputs.stdout }}.tgz"
echo "${!COOKBOOK_URL}"

# Get Cinc Url
Expand Down Expand Up @@ -81,7 +81,6 @@ phases:
- |
set -v
RELEASE='{{ build.OperatingSystemRelease.outputs.stdout }}'

if [ `echo "${!RELEASE}" | grep -w '^amzn\.2'` ]; then
OS='alinux2'
elif [ `echo "${!RELEASE}" | grep -w '^amzn\.2023'` ]; then
Expand All @@ -104,7 +103,6 @@ phases:
echo "Operating System '${!RELEASE}' is not supported. Failing build."
exit {{ FailExitCode }}
fi

echo ${!OS}

- name: OperatingSystemVersion
Expand All @@ -130,13 +128,11 @@ phases:
- |
set -v
OS='{{ build.OperatingSystemName.outputs.stdout }}'
# Map the OS name to its platform family; PLATFORM stays unset (empty output) for any other name.
# grep -q is used so the branch depends on the match status, not on word-splitting grep's output.
if echo "${!OS}" | grep -qE '^(alinux|rhel|rocky)'; then
  PLATFORM='RHEL'
elif echo "${!OS}" | grep -qE '^ubuntu'; then
  PLATFORM='DEBIAN'
fi
echo "${!PLATFORM}"

# Get input base AMI Architecture
Expand Down Expand Up @@ -173,7 +169,6 @@ phases:
echo "This component does not support '${!RELEASE}'. Failing build."
exit {{ FailExitCode }}
fi

# This component only supports aarch64 CPUs on Amazon Linux 2, Ubuntu2004, Ubuntu2204, RHEL8, Rocky8, RHEL9 and Rocky9
ARCH=$(uname -m)
if [[ `echo ${!ARCH}` == 'aarch64' ]]; then
Expand Down Expand Up @@ -206,7 +201,6 @@ phases:
if [[ ${!OS} != "rocky8" ]] && [[ ${!OS} != "rhel8" ]]; then
PACKAGE_LIST+=" kernel-devel-matched-$(uname -r)"
fi

if [[ ${!OS} == "rocky8" ]] || [[ ${!OS} == "rocky9" ]] ; then
for PACKAGE in ${!PACKAGE_LIST}
do
Expand All @@ -224,7 +218,6 @@ phases:
yum -y install ${!PACKAGE}
done
fi

yum install -y yum-plugin-versionlock
# listing all the packages because wildcard does not work as expected
yum versionlock kernel kernel-core kernel-modules
Expand Down Expand Up @@ -263,7 +256,6 @@ phases:
set -v
OS='{{ build.OperatingSystemName.outputs.stdout }}'
PLATFORM='{{ build.PlatformName.outputs.stdout }}'

if [[ ${!PLATFORM} == RHEL ]]; then
yum -y update krb5-libs
yum -y groupinstall development && sudo yum -y install wget jq
Expand Down Expand Up @@ -299,25 +291,20 @@ phases:
- |
set -v
PLATFORM='{{ build.PlatformName.outputs.stdout }}'

if [[ ${!PLATFORM} == RHEL ]]; then
CA_CERTS_FILE=/etc/ssl/certs/ca-bundle.crt
yum -y upgrade ca-certificates
elif [[ ${!PLATFORM} == DEBIAN ]]; then
CA_CERTS_FILE=/etc/ssl/certs/ca-certificates.crt
apt-get -y --only-upgrade install ca-certificates
fi

curl --retry 3 -L {{ build.CincUrl.outputs.stdout }} | bash -s -- -v {{ ChefVersion }}

if [[ -e ${!CA_CERTS_FILE} ]]; then
mkdir -p /opt/cinc/embedded/ssl/certs
ln -sf ${!CA_CERTS_FILE} /opt/cinc/embedded/ssl/certs/cacert.pem
fi

curl --retry 3 -L -o gems.tgz https://${AWS::Region}-aws-parallelcluster.s3.${AWS::Region}.${AWS::URLSuffix}/archives/dependencies/ruby/gems.tgz
tar -xf gems.tgz

cd vendor/cache
/opt/cinc/embedded/bin/gem install --no-document minitar:0.9
/opt/cinc/embedded/bin/gem install --local --no-document berkshelf:{{ BerkshelfVersion }}
Expand All @@ -330,13 +317,10 @@ phases:
- |
set -v
mkdir -p /etc/chef && sudo chown -R root:root /etc/chef

curl --retry 3 -L -o /etc/chef/aws-parallelcluster-cookbook.tgz "{{ build.CookbookUrl.outputs.stdout }}"

mkdir -p /tmp/cookbooks
cd /tmp/cookbooks
tar -xzf /etc/chef/aws-parallelcluster-cookbook.tgz

export HOME="/tmp"
for dir in $(ls /tmp/cookbooks); do
cd /tmp/cookbooks/${!dir}
Expand Down Expand Up @@ -388,7 +372,6 @@ phases:
# Remove kernel version lock
if [[ ${!PLATFORM} == RHEL ]]; then
yum versionlock delete kernel kernel-core kernel-modules

if [[ ${!OS} == "rocky8" ]] || [[ ${!OS} == "rocky9" ]] ; then
yum versionlock delete rocky-release rocky-repos
elif [[ ${!OS} == "rhel8" ]] || [[ ${!OS} == "rhel9" ]] ; then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ phases:
- |
set -v
RELEASE='{{ test.OSRelease.outputs.stdout }}'

if [ `echo "${RELEASE}" | grep -w '^amzn\.2'` ]; then
OS='alinux2'
elif [ `echo "${RELEASE}" | grep -w '^amzn\.2023'` ]; then
Expand All @@ -53,7 +52,6 @@ phases:
else
echo "Operating System '${RELEASE}' is not supported. Failing build." && exit 1
fi

echo ${OS}

- name: OSArchitecture
Expand Down Expand Up @@ -82,13 +80,11 @@ phases:
- |
set -v
OS='{{ test.OSName.outputs.stdout }}'
# Map the OS name to its platform family; PLATFORM stays unset (empty output) for any other name.
# grep -q is used so the branch depends on the match status, not on word-splitting grep's output.
if echo "${OS}" | grep -qE '^(alinux|centos|rhel|rocky)'; then
  PLATFORM='RHEL'
elif echo "${OS}" | grep -qE '^ubuntu'; then
  PLATFORM='DEBIAN'
fi
echo "${PLATFORM}"

- name: IntelMPISupported
Expand Down Expand Up @@ -212,13 +208,11 @@ phases:
- |
set -vx
PLATFORM='{{ test.PlatformName.outputs.stdout }}'

if [ {{ test.IntelMPISupported.outputs.stdout }} == true ]; then
echo "Checking efa packages installed..."
if [ ${PLATFORM} == RHEL ]; then
rpm -qa | grep libfabric && rpm -qa | grep efa-
[[ $? -ne 0 ]] && echo "Check efa rpm failed" && exit 1

echo "Checking Intel MPI 20xx installed and module available..."
unset MODULEPATH
source /etc/profile.d/modules.sh
Expand All @@ -238,22 +232,18 @@ phases:
- |
set -vx
PLATFORM='{{ test.PlatformName.outputs.stdout }}'

if [[ {{ test.NvidiaEnabled.outputs.stdout }} == 'no' ]]; then
echo "Nvidia recipe not enabled, skipping." && exit 0
fi
if [ {{ test.HasGPU.outputs.stdout }} == "false" ]; then
echo "No GPU detected, skipping." && exit 0
fi

driver_ver="{{ test.NvidiaVersion.outputs.stdout }}"
export PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin"

echo "Testing Nvidia driver version"
# Pull the "Driver Version: <version>" token out of the nvidia-smi output.
driver_output=$(nvidia-smi | grep -E -o "Driver Version: [0-9.]+")
# Report the mismatch and fail the step; without the echo the message would be run as a command
# and the exit would never be reached.
if [[ "${driver_output}" != "Driver Version: ${driver_ver}" ]]; then
  echo "ERROR Installed version ${driver_output} but expected ${driver_ver}"
  exit 1
fi
echo "Correctly installed Nvidia ${driver_output}"

if [ {{ test.FabricManagerSupported.outputs.stdout }} == "true" ]; then
echo "Testing Nvidia Fabric Manager version"
nvidia_driver_version=$(modinfo -F version nvidia)
Expand All @@ -266,15 +256,13 @@ phases:
fi
echo "Fabric Manager match Nvidia driver and version is locked"
fi

echo "Testing CUDA installation with nvcc"
cuda_ver="{{ test.CudaVersion.outputs.stdout }}"
export PATH=/usr/local/cuda-${cuda_ver}/bin:${PATH}
export LD_LIBRARY_PATH=/usr/local/cuda-${cuda_ver}/lib64:${LD_LIBRARY_PATH}
cuda_output=$(nvcc -V | grep -E -o "release [0-9]+.[0-9]+")
[[ "${cuda_output}" != "release ${cuda_ver}" ]] && echo "ERROR Installed version ${cuda_output} but expected ${cuda_ver}" && exit 1
echo "Correctly installed CUDA ${cuda_output}"

echo "Testing CUDA with deviceQuery..."
if [ {{ test.OSArchitecture.outputs.stdout }} != 'arm64' ]; then
/usr/local/cuda-${cuda_ver}/extras/demo_suite/deviceQuery | grep -o "Result = PASS"
Expand Down Expand Up @@ -322,7 +310,6 @@ phases:
- |
set -vx
OS='{{ test.OSName.outputs.stdout }}'

# NOTE(review): no command runs between the OS assignment above and this check, so $? holds
# the status of that plain assignment, which is 0. As written the failure branch cannot fire
# and the step always reports success - confirm whether a Lustre client query belongs here.
[[ $? -ne 0 ]] && echo "Check for Lustre client failed" && exit 1
echo "FSx Lustre test passed"

Expand Down
68 changes: 68 additions & 0 deletions cli/tests/pcluster/templates/test_imagebuilder_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2203,6 +2203,63 @@ def test_imagebuilder_lambda_execution_role(
{"ComponentArn": {"Ref": "ParallelClusterTestComponent"}},
],
),
(
{
"imagebuilder": {
"build": {
"parent_image": "ami-0185634c5a8a37250",
"installation": {"nvidia_software": {"enabled": True}, "lustre_client": {"enabled": True}},
"imds": {"imds_support": "v2.0"},
"subnet_id": "subnet-0292c5356eadc531f",
"iam": {
"instance_role": "arn:aws:iam::123456789012:role/pcluster",
"instance_profile": "arn:aws:iam::123456789012:instance-profile/pcluster",
"cleanup_lambda_role": "arn:aws:iam::123456789012:role/pcluster",
"additional_iam_policies": [{"policy": "arn:aws:iam::aws:policy/AmazonEC2ReadOnlyAccess"}]
},
"instance_type": "c5.xlarge",
"security_group_ids": ["sg-b0bbeacc", "sg-0fc70b22048995b07"],
"components": [
{
"type": "arn",
"value": "arn:aws:imagebuilder:us-east-1:aws:component/apache-tomcat-9-linux/1.0.0",
},
{
"type": "arn",
"value": "arn:aws:imagebuilder:us-east-1:"
"aws:component/amazon-cloudwatch-agent-linux/1.0.0",
},
],
"update_os_packages": {"enabled": True},
},
"dev_settings": {
"cookbook": {
"chef_cookbook": "https://tests/aws-parallelcluster-cookbook-3.0.tgz",
"extra_chef_attributes": '{"cluster": {"test_cluster_attribute": "test_cluster_attribute_values"}}',
},
"node_package": "https://tests/aws-parallelcluster-node-3.0.tgz",
},
}
},
{
"Architecture": "x86_64",
"BlockDeviceMappings": [
{
"DeviceName": "/dev/xvda",
"Ebs": {
"VolumeSize": 50,
},
}
],
},
[
{"ComponentArn": {"Ref": "UpdateOSComponent"}},
{"ComponentArn": {"Ref": "ParallelClusterComponent"}},
{"ComponentArn": {"Ref": "ParallelClusterTagComponent"}},
{"ComponentArn": "arn:aws:imagebuilder:us-east-1:aws:component/apache-tomcat-9-linux/1.0.0"},
{"ComponentArn": "arn:aws:imagebuilder:us-east-1:aws:component/amazon-cloudwatch-agent-linux/1.0.0"},
],
),
],
)
def test_imagebuilder_components(mocker, resource, response, expected_components):
Expand All @@ -2222,6 +2279,17 @@ def test_imagebuilder_components(mocker, resource, response, expected_components
assert_that(generated_template.get("Resources").get("ImageRecipe").get("Properties").get("Components")).is_equal_to(
expected_components
)
# Check the size limit of ImageBuilder components: the stringified Data of every
# AWS::ImageBuilder::Component resource in the generated template must stay below 16000 characters.
imagebuilder_resources = generated_template.get("Resources")
for component_name, component_content in imagebuilder_resources.items():
    if component_content and component_content.get("Type") == "AWS::ImageBuilder::Component":
        # Measure once and reuse the value for both the diagnostic output and the assertion.
        component_size = len(str(component_content.get("Properties").get("Data")))
        print("Component {} has size {}".format(component_name, component_size))
        assert_that(component_size).is_less_than(16000)


@pytest.mark.parametrize(
Expand Down
Loading