diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py
new file mode 100644
index 0000000000..858120eb3c
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_topology_generator.py
@@ -0,0 +1,170 @@
+# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+# FIXME: Fix Code Duplication
+# pylint: disable=R0801
+
+import argparse
+import logging
+import os
+import traceback
+
+import yaml
+
+log = logging.getLogger()
+
+
+CAPACITY_TYPE_MAP = {
+    "ONDEMAND": "on-demand",
+    "SPOT": "spot",
+    "CAPACITY_BLOCK": "capacity-block",
+}
+CONFIG_HEADER = "# This file is automatically generated by pcluster\n"
+
+
+class CriticalError(Exception):
+    """Critical error for the script."""
+
+    pass
+
+
+class ConfigurationFieldNotFoundError(Exception):
+    """Field not found in configuration."""
+
+    pass
+
+
+def _load_cluster_config(input_file_path):
+    """Load cluster config file."""
+    with open(input_file_path, encoding="utf-8") as input_file:
+        return yaml.load(input_file, Loader=yaml.SafeLoader)
+
+
+def generate_topology_config_file(output_file: str, input_file: str, block_sizes: str):  # noqa: C901
+    """
+    Generate Topology configuration file.
+
+    Generate topology.conf, e.g.:
+
+    # This file is automatically generated by pcluster
+    BlockName=block1 Nodes=queue-1-st-compute-resource-0-[1-9]  #### 9 nodes
+    BlockName=block2 Nodes=queue-1-st-compute-resource-0-[1-18]  #### 18 nodes
+    BlockSizes=9,18
+    """
+    if not block_sizes:
+        # Nothing to generate when no block sizes are configured (the update flow uses --cleanup instead)
+        log.info("No block sizes provided, skipping generation of topology configuration")
+        return
+    block_size_values = list(map(int, block_sizes.split(",")))
+    min_block_size = min(block_size_values)
+    max_block_size = max(block_size_values)
+
+    cluster_config = _load_cluster_config(input_file)
+    queue_name, compute_resource_name = None, None
+    try:
+        topology_config = CONFIG_HEADER + "\n"
+        block_count = 0
+        for queue_config in cluster_config["Scheduling"]["SlurmQueues"]:
+            queue_name = queue_config["Name"]
+
+            # Only CAPACITY_BLOCK queues get a topology entry; skip every other capacity type
+            queue_capacity_type = CAPACITY_TYPE_MAP.get(queue_config.get("CapacityType", "ONDEMAND"))
+            if queue_capacity_type != CAPACITY_TYPE_MAP.get("CAPACITY_BLOCK"):
+                log.info("ParallelCluster does not create topology for %s", queue_capacity_type)
+                continue
+
+            for compute_resource_config in queue_config["ComputeResources"]:
+                compute_resource_name = compute_resource_config["Name"]
+                compute_min_count = compute_resource_config["MinCount"]
+                compute_max_count = compute_resource_config["MaxCount"]
+                if compute_min_count == compute_max_count:
+                    node_type = "st"
+                else:
+                    continue
+
+                # Check whether the reservation is for NVLink (p6e-gb200) and its size matches a configured block size
+                if compute_resource_config.get("InstanceType") == "p6e-gb200.36xlarge":
+                    if min_block_size == compute_min_count or max_block_size == compute_max_count:
+                        block_count += 1
+                        # Each Capacity Reservation ID is a Capacity Block;
+                        # we associate each Slurm block with a single Capacity Block
+                        topology_config += (
+                            f"BlockName=Block{block_count}"
+                            f" Nodes={queue_name}-{node_type}-{compute_resource_name}-[1-{compute_max_count}]\n"
+                        )
+
+        topology_config += f"BlockSizes={block_sizes}\n"
+    except (KeyError, AttributeError) as e:
+        if isinstance(e, KeyError):
+            message = f"Unable to find key {e} in the configuration file."
+        else:
+            message = f"Error parsing configuration file. {e}. {traceback.format_exc()}."
+        message += f" Queue: {queue_name}" if queue_name else ""
+        log.error(message)
+        raise CriticalError(message)
+
+    log.info("Writing generated topology configuration %s", topology_config)
+    log.info("Generating %s", output_file)
+    with open(output_file, "w", encoding="utf-8") as output:
+        output.write(topology_config)
+
+    log.info("Finished.")
+
+
+def cleanup_topology_config_file(file_path):
+    """Cleanup topology.conf file."""
+    try:
+        if os.path.exists(file_path):
+            log.info("Cleaning up %s", file_path)
+            os.remove(file_path)
+    except Exception as err:
+        log.warning("Unable to delete %s due to %s", file_path, err)
+
+
+def main():
+    try:
+        logging.basicConfig(
+            level=logging.INFO, format="%(asctime)s - [%(name)s:%(funcName)s] - %(levelname)s - %(message)s"
+        )
+        log.info("Running ParallelCluster Topology Config Generator")
+        parser = argparse.ArgumentParser(description="Take in Topology configuration generator related parameters")
+        cleanup_or_generate_exclusive_group = parser.add_mutually_exclusive_group(required=True)
+        parser.add_argument("--output-file", help="The output file for generated topology.conf", required=True)
+        parser.add_argument(
+            "--input-file",
+            help="Yaml file containing pcluster CLI configuration file with default values",
+            required=True,
+        )
+        cleanup_or_generate_exclusive_group.add_argument("--block-sizes", help="Block Sizes of topology.conf")
+        cleanup_or_generate_exclusive_group.add_argument(
+            "--cleanup",
+            action="store_true",
+            help="Cleanup topology.conf",
+        )
+        args = parser.parse_args()
+        if args.cleanup:
+            cleanup_topology_config_file(args.output_file)
+        else:
+            generate_topology_config_file(args.output_file, args.input_file, args.block_sizes)
+        log.info("Completed Execution of ParallelCluster Topology Config Generator")
+    except Exception as e:
+        log.exception("Failed to generate Topology.conf, exception: %s", e)
+        raise
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
index 419c147484..99000e5676 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/config/config_head_node.rb
@@ -45,6 +45,7 @@
   owner 'root'
   group 'root'
   mode '0644'
+  variables(is_block_topology_plugin_unsupported: platform?('amazon') && node['platform_version'] == "2")
 end
 
 template "#{node['cluster']['slurm']['install_dir']}/etc/gres.conf" do
@@ -54,6 +55,10 @@
   mode '0644'
 end
 
+block_topology 'Add Block Topology configuration' do
+  action :configure
+end
+
 unless on_docker?
   # Generate pcluster specific configs
   no_gpu = nvidia_installed? ? "" : "--no-gpu"
diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
index 7a131f706a..9b63a4a4b6 100644
--- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
+++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb
@@ -151,6 +151,10 @@ def update_nodes_in_queue(strategy, queues)
   end
 end
 
+block_topology 'Update or Cleanup Slurm Topology' do
+  action :update
+end
+
 execute "generate_pcluster_slurm_configs" do
   command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
           " --output-directory #{node['cluster']['slurm']['install_dir']}/etc/" \
diff --git a/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_alinux2023.rb b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_alinux2023.rb
new file mode 100644
index 0000000000..a06e2175dc
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_alinux2023.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+provides :block_topology, platform: 'amazon' do |node|
+  node['platform_version'].to_i == 2023
+end
+
+use 'partial/_block_topology_common.rb'
diff --git a/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_amazon2.rb b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_amazon2.rb
new file mode 100644
index 0000000000..f1d9625b31
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_amazon2.rb
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+provides :block_topology, platform: 'amazon', platform_version: '2'
+
+use 'partial/_block_topology_common.rb'
+
+def is_block_topology_supported?
+  # We do not support Block Topology on Alinux2 as we do not support GB200 on this OS
+  false
+end
diff --git a/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_redhat8.rb b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_redhat8.rb
new file mode 100644
index 0000000000..a220a67240
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_redhat8.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates.
All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+provides :block_topology, platform: 'redhat' do |node|
+  node['platform_version'].to_i >= 8
+end
+
+use 'partial/_block_topology_common.rb'
diff --git a/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_rocky8.rb b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_rocky8.rb
new file mode 100644
index 0000000000..0e6acd3712
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_rocky8.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+provides :block_topology, platform: 'rocky' do |node|
+  node['platform_version'].to_i >= 8
+end
+
+use 'partial/_block_topology_common.rb'
diff --git a/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_ubuntu22+.rb b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_ubuntu22+.rb
new file mode 100644
index 0000000000..5a9a2f6485
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/resources/block_topology/block_topology_ubuntu22+.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+# Copyright:: 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+provides :block_topology, platform: 'ubuntu' do |node|
+  node['platform_version'].to_i >= 22
+end
+
+use 'partial/_block_topology_common.rb'
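Reviewer note: these one-file-per-platform shims rely on Chef's custom-resource dispatch. The `provides :block_topology` guards pick the variant matching the node's platform and version, and each variant pulls the shared actions from `partial/_block_topology_common.rb` below. Only the Amazon Linux 2 variant overrides `is_block_topology_supported?` to return false, which turns both actions of the common partial into no-ops on that OS.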
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+unified_mode true
+default_action :configure
+
+action :configure do
+  return unless is_block_topology_supported?
+  # Use slurm_parallelcluster_topology to add Block Topology plugin
+  template "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_topology.conf" do
+    source 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb'
+    owner 'root'
+    group 'root'
+    mode '0644'
+  end
+  # Generate Slurm topology.conf file
+  execute "generate_topology_config" do
+    command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
+            " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
+            " --block-sizes #{node['cluster']['p6egb200_block_sizes']}"\
+            " --input-file #{node['cluster']['cluster_config_path']}"
+    not_if { node['cluster']['p6egb200_block_sizes'].nil? }
+  end
+end
+
+action :update do
+  return unless is_block_topology_supported?
+  # Update slurm_parallelcluster_topology to add/remove Block Topology plugin
+  template "#{node['cluster']['slurm']['install_dir']}/etc/slurm_parallelcluster_topology.conf" do
+    source 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb'
+    owner 'root'
+    group 'root'
+    mode '0644'
+  end
+  # Update Slurm topology.conf file
+  execute "update or cleanup topology.conf" do
+    command "#{cookbook_virtualenv_path}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_topology_generator.py"\
+            " --output-file #{node['cluster']['slurm']['install_dir']}/etc/topology.conf"\
+            " --input-file #{node['cluster']['cluster_config_path']}"\
+            "#{topology_generator_command_args}"
+    not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && topology_generator_command_args.nil? }
+  end
+end
+
+def is_block_topology_supported?
+  true
+end
+
+def topology_generator_command_args
+  if node['cluster']['p6egb200_block_sizes'].nil? && are_queues_updated? && ::File.exist?("#{node['cluster']['slurm']['install_dir']}/etc/topology.conf")
+    # topology.conf exists but the Capacity Block was removed, so we clean it up
+    " --cleanup"
+  elsif node['cluster']['p6egb200_block_sizes'].nil? && !are_queues_updated?
+    # Do nothing: p6e-gb200 is not used and the queues were not updated
+    nil
+  else
+    " --block-sizes #{node['cluster']['p6egb200_block_sizes']}"
+  end
+end
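Reviewer note: `topology_generator_command_args` drives the update flow — ` --cleanup` when GB200 capacity was removed but a topology.conf is still on disk, `nil` (the execute is then skipped via its `not_if`) when there is no GB200 capacity and queues were untouched, and ` --block-sizes …` otherwise. The resulting generator invocations look roughly like the sketch below (paths are illustrative placeholders; the execute blocks above interpolate the real install dir and cluster config path):

```sh
# Generate or regenerate topology.conf for Capacity Blocks of 9 and 18 instances
python pcluster_topology_generator.py \
    --output-file /opt/slurm/etc/topology.conf \
    --input-file /opt/parallelcluster/shared/cluster-config.yaml \
    --block-sizes 9,18

# Remove a previously generated topology.conf (--cleanup and --block-sizes are mutually exclusive)
python pcluster_topology_generator.py \
    --output-file /opt/slurm/etc/topology.conf \
    --input-file /opt/parallelcluster/shared/cluster-config.yaml \
    --cleanup
```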
diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb
new file mode 100644
index 0000000000..a943918951
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb
@@ -0,0 +1,159 @@
+require 'spec_helper'
+
+class ConvergeBlockTopology
+  def self.configure(chef_run)
+    chef_run.converge_dsl('aws-parallelcluster-slurm') do
+      block_topology 'configure' do
+        action :configure
+      end
+    end
+  end
+
+  def self.update(chef_run)
+    chef_run.converge_dsl('aws-parallelcluster-slurm') do
+      block_topology 'update' do
+        action :update
+      end
+    end
+  end
+end
+
+script_dir = 'SCRIPT_DIR'
+slurm_install_dir = 'SLURM_INSTALL_DIR'
+block_sizes = '9,18'
+cluster_config = 'CONFIG_YAML'
+cookbook_env = 'FAKE_COOKBOOK_PATH'
+
+describe 'block_topology:configure' do
+  for_all_oses do |platform, version|
+    context "on #{platform}#{version}" do
+      cached(:chef_run) do
+        runner = ChefSpec::SoloRunner.new(
+          platform: platform,
+          version: version,
+          step_into: ['block_topology']
+        ) do |node|
+          node.override['cluster']['node_type'] = 'HeadNode'
+          node.override['cluster']['scripts_dir'] = script_dir
+          node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
+          node.override['cluster']['p6egb200_block_sizes'] = block_sizes
+          node.override['cluster']['cluster_config_path'] = cluster_config
+        end
+        allow_any_instance_of(Object).to receive(:is_block_topology_supported?).and_return(true)
+        allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
+        ConvergeBlockTopology.configure(runner)
+        runner
+      end
+
+      if platform == 'amazon' && version == '2'
+        it 'does not configure block_topology' do
+          expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
+          expect(chef_run).not_to run_execute('generate_topology_config')
+        end
+      else
+        it 'creates the topology configuration template' do
+          expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
+            .with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
+            .with(user: 'root')
+            .with(group: 'root')
+            .with(mode: '0644')
+        end
+
+        it 'generates topology config when block sizes are present' do
+          expect(chef_run).to run_execute('generate_topology_config')
+            .with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
+                           " --output-file #{slurm_install_dir}/etc/topology.conf" \
+                           " --block-sizes #{block_sizes}" \
+                           " --input-file #{cluster_config}")
+        end
+      end
+    end
+  end
+end
+
+describe 'block_topology:update' do
+  for_all_oses do |platform, version|
+    [' --cleanup', nil, " --block-sizes #{block_sizes}"].each do |topo_command_args|
+      context "on #{platform}#{version}" do
+        cached(:chef_run) do
+          runner = ChefSpec::SoloRunner.new(
+            platform: platform,
+            version: version,
+            step_into: ['block_topology']
+          ) do |node|
+            node.override['cluster']['node_type'] = 'HeadNode'
+            node.override['cluster']['scripts_dir'] = script_dir
+            node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
+            node.override['cluster']['p6egb200_block_sizes'] = block_sizes
+            node.override['cluster']['cluster_config_path'] = cluster_config
+          end
+          allow_any_instance_of(Object).to receive(:is_block_topology_supported?).and_return(true)
+          allow_any_instance_of(Object).to receive(:topology_generator_command_args).and_return(topo_command_args)
+          allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_env)
+          ConvergeBlockTopology.update(runner)
+          runner
+        end
+
+        if platform == 'amazon' && version == '2'
+          it 'does not configure block_topology' do
+            expect(chef_run).not_to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
+            expect(chef_run).not_to run_execute('update or cleanup topology.conf')
+          end
+        else
+          it 'creates the topology configuration template' do
+            expect(chef_run).to create_template("#{slurm_install_dir}/etc/slurm_parallelcluster_topology.conf")
+              .with(source: 'slurm/block_topology/slurm_parallelcluster_topology.conf.erb')
+              .with(user: 'root')
+              .with(group: 'root')
+              .with(mode: '0644')
+          end
+
+          it 'updates or cleans up topology.conf when block sizes are present' do
+            expect(chef_run).to run_execute('update or cleanup topology.conf')
+              .with(command: "#{cookbook_env}/bin/python #{script_dir}/slurm/pcluster_topology_generator.py" \
+                             " --output-file #{slurm_install_dir}/etc/topology.conf" \
+                             " --input-file #{cluster_config}" \
+                             "#{topo_command_args}")
+          end
+        end
+      end
+    end
+  end
+end
+
+describe 'block_topology:topology_generator_command_args' do
+  for_all_oses do |platform, version|
+    context "on #{platform}#{version}" do
+      cached(:chef_run) do
+        runner(platform: platform, version: version, step_into: ['block_topology']) do |node|
+          node.override['cluster']['p6egb200_block_sizes'] = nil
+        end
+      end
+      cached(:resource) do
+        ConvergeBlockTopology.update(chef_run)
+        chef_run.find_resource('block_topology', 'update')
+      end
+
+      context "when queues are not updated and topology.conf does not exist" do
+        before do
+          allow_any_instance_of(Object).to receive(:are_queues_updated?).and_return(false)
+          allow(File).to receive(:exist?).with("#{slurm_install_dir}/etc/topology.conf").and_return(false)
+        end
+
+        it 'returns nil' do
+          expect(resource.topology_generator_command_args).to eq(nil)
+        end
+      end
+
+      context "when block sizes is not nil" do
+        before do
+          chef_run.node.override['cluster']['p6egb200_block_sizes'] = block_sizes
+        end
+
+        it 'returns the block-sizes argument' do
+          expect(resource.topology_generator_command_args).to eq(" --block-sizes #{block_sizes}")
+        end
+      end
+    end
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/block_topology/slurm_parallelcluster_topology.conf.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/block_topology/slurm_parallelcluster_topology.conf.erb
new file mode 100644
index 0000000000..e521a232c7
--- /dev/null
+++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/block_topology/slurm_parallelcluster_topology.conf.erb
@@ -0,0 +1,8 @@
+# slurm_parallelcluster_topology.conf is managed by the pcluster processes.
+# Do not modify.
+# Please use CustomSlurmSettings in the ParallelCluster configuration file to add user-specific slurm configuration
+# options
+# TOPOLOGY Plugin
+<% unless node['cluster']['p6egb200_block_sizes'].nil? -%>
+TopologyPlugin=topology/block
+<% end -%>
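Reviewer note: on a cluster where `p6egb200_block_sizes` is set, this template renders to the file below; when the attribute is nil, the `TopologyPlugin` line is omitted and only the comment header remains.

```
# slurm_parallelcluster_topology.conf is managed by the pcluster processes.
# Do not modify.
# Please use CustomSlurmSettings in the ParallelCluster configuration file to add user-specific slurm configuration
# options
# TOPOLOGY Plugin
TopologyPlugin=topology/block
```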
diff --git a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/slurm.conf.erb b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/slurm.conf.erb
index b52a1b6b5d..97d299399b 100644
--- a/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/slurm.conf.erb
+++ b/cookbooks/aws-parallelcluster-slurm/templates/default/slurm/slurm.conf.erb
@@ -84,3 +84,9 @@ include slurm_parallelcluster.conf
 # WARNING!!! The custom_slurm_settings_include_file_slurm.conf file included below can be updated by the pcluster process.
 # Please do not edit it.
 include pcluster/custom_slurm_settings_include_file_slurm.conf
+<% unless @is_block_topology_plugin_unsupported -%>
+#
+# WARNING!!! The slurm_parallelcluster_topology.conf file included below can be updated by the pcluster process.
+# Please do not edit it.
+include slurm_parallelcluster_topology.conf
+<% end -%>
diff --git a/test/unit/slurm/test_topology_generator.py b/test/unit/slurm/test_topology_generator.py
new file mode 100644
index 0000000000..57a12780c4
--- /dev/null
+++ b/test/unit/slurm/test_topology_generator.py
@@ -0,0 +1,50 @@
+# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with
+# the License. A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import pytest
+from assertpy import assert_that
+from pcluster_topology_generator import (
+    cleanup_topology_config_file,
+    generate_topology_config_file,
+)
+
+
+def _assert_files_are_equal(file, expected_file):
+    with open(file, "r", encoding="utf-8") as f, open(expected_file, "r", encoding="utf-8") as exp_f:
+        assert_that(f.read()).is_equal_to(exp_f.read())
+
+
+@pytest.mark.parametrize("file_name_suffix", ["with_capacity_block", "no_capacity_block"])
+def test_generate_topology_config(test_datadir, tmpdir, file_name_suffix):
+    block_sizes = "9,18" if "no" not in file_name_suffix else None
+    file_name = "sample_" + file_name_suffix + ".yaml"
+    input_file_path = str(test_datadir / file_name)
+    output_file_name = "topology_" + file_name_suffix + ".conf"
+    output_file_path = f"{tmpdir}/{output_file_name}"
+    generate_topology_config_file(output_file_path, input_file_path, block_sizes)
+    if "no" in file_name_suffix:
+        assert_that(os.path.isfile(output_file_path)).is_equal_to(False)
+    else:
+        _assert_files_are_equal(output_file_path, test_datadir / "expected_outputs" / output_file_name)
+
+
+@pytest.mark.parametrize("file_exists", [True, False])
+def test_cleanup_topology_config_file(mocker, tmpdir, file_exists):
+    topology_file_path = tmpdir / "topology.conf"
+    mocker.patch("os.path.exists", return_value=file_exists)
+    mock_remove = mocker.patch("os.remove")
+    cleanup_topology_config_file(str(topology_file_path))
+    if file_exists:
+        mock_remove.assert_called_once_with(str(topology_file_path))
+    else:
+        mock_remove.assert_not_called()
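Reviewer note: assuming the repo's existing unit-test tooling (pytest with a conftest that puts the cookbook's slurm files directory on the import path and provides the `test_datadir` fixture used above), the new tests should be runnable with something like:

```sh
pytest test/unit/slurm/test_topology_generator.py
```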
diff --git a/test/unit/slurm/test_topology_generator/test_generate_topology_config/expected_outputs/topology_with_capacity_block.conf b/test/unit/slurm/test_topology_generator/test_generate_topology_config/expected_outputs/topology_with_capacity_block.conf
new file mode 100644
index 0000000000..8a14b308be
--- /dev/null
+++ b/test/unit/slurm/test_topology_generator/test_generate_topology_config/expected_outputs/topology_with_capacity_block.conf
@@ -0,0 +1,6 @@
+# This file is automatically generated by pcluster
+
+BlockName=Block1 Nodes=capacity-block-queue1-st-cb-gb200-1-[1-9]
+BlockName=Block2 Nodes=capacity-block-queue2-st-cb-gb200-2-[1-18]
+BlockName=Block3 Nodes=capacity-block-queue2-st-cb-gb200-3-[1-9]
+BlockSizes=9,18
diff --git a/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_no_capacity_block.yaml b/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_no_capacity_block.yaml
new file mode 100644
index 0000000000..9750108547
--- /dev/null
+++ b/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_no_capacity_block.yaml
@@ -0,0 +1,89 @@
+Scheduling:
+  SlurmQueues:
+    - CapacityType: SPOT
+      ComputeResources:
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: false
+            GdrSupport: false
+          InstanceType: c4.xlarge
+          MaxCount: 10
+          MinCount: 5
+          Name: multiplespot-1
+          SpotPrice: null
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: false
+            GdrSupport: false
+          InstanceType: c5.2xlarge
+          MaxCount: 5
+          MinCount: 5
+          Name: multiplespot-2
+          SpotPrice: 1.5
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+      ComputeSettings: null
+      CustomActions: null
+      Iam:
+        AdditionalIamPolicies: []
+        InstanceRole: null
+        S3Access: null
+      Name: multiple_spot
+    - CapacityType: ONDEMAND
+      ComputeResources:
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: true
+            GdrSupport: false
+          InstanceType: c5n.18xlarge
+          MaxCount: 5
+          MinCount: 1
+          Name: efa-c5n
+          SpotPrice: null
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+      ComputeSettings: null
+      CustomActions: null
+      Iam:
+        AdditionalIamPolicies: []
+        InstanceRole: null
+        S3Access: null
+      Name: efa
+    - CapacityType: ONDEMAND
+      ComputeResources:
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: false
+            GdrSupport: false
+          InstanceType: g3.8xlarge
+          MaxCount: 5
+          MinCount: 1
+          Name: gpu-g38xlarge
+          SpotPrice: null
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: false
+            GdrSupport: false
+          InstanceType: p3dn.24xlarge
+          MaxCount: 10
+          MinCount: 10
+          Name: gpu-p3dn24xlarge
+          SpotPrice: null
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+      ComputeSettings: null
+      CustomActions: null
+      Iam:
+        AdditionalIamPolicies: []
+        InstanceRole: null
+        S3Access: null
+      Name: gpu
+  Scheduler: slurm
+  SlurmSettings:
+    ScaledownIdletime: 10
+    Database: null
+    ExternalSlurmdbd: null
diff --git a/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_with_capacity_block.yaml b/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_with_capacity_block.yaml
new file mode 100644
index 0000000000..14ca792f32
--- /dev/null
+++ b/test/unit/slurm/test_topology_generator/test_generate_topology_config/sample_with_capacity_block.yaml
@@ -0,0 +1,199 @@
+Scheduling:
+  SlurmQueues:
+    - CapacityType: SPOT
+      ComputeResources:
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: false
+            GdrSupport: false
+          InstanceType: c4.xlarge
+          MaxCount: 10
+          MinCount: 5
+          Name: multiplespot-1
+          SpotPrice: null
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: false
+            GdrSupport: false
+          InstanceType: c5.2xlarge
+          MaxCount: 5
+          MinCount: 5
+          Name: multiplespot-2
+          SpotPrice: 1.5
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+      ComputeSettings: null
+      CustomActions: null
+      Iam:
+        AdditionalIamPolicies: []
+        InstanceRole: null
+        S3Access: null
+      Name: multiple_spot
+    - CapacityType: ONDEMAND
+      ComputeResources:
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: true
+            GdrSupport: false
+          InstanceType: c5n.18xlarge
+          MaxCount: 5
+          MinCount: 1
+          Name: efa-c5n
+          SpotPrice: null
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+      ComputeSettings: null
+      CustomActions: null
+      Iam:
+        AdditionalIamPolicies: []
+        InstanceRole: null
+        S3Access: null
+      Name: efa
+    - CapacityType: ONDEMAND
+      ComputeResources:
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: false
+            GdrSupport: false
+          InstanceType: g3.8xlarge
+          MaxCount: 5
+          MinCount: 1
+          Name: gpu-g38xlarge
+          SpotPrice: null
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+        - DisableSimultaneousMultithreading: true
+          Efa:
+            Enabled: false
+            GdrSupport: false
+          InstanceType: p3dn.24xlarge
+          MaxCount: 10
+          MinCount: 10
+          Name: gpu-p3dn24xlarge
+          SpotPrice: null
+          StaticNodePriority: 1
+          DynamicNodePriority: 1000
+      ComputeSettings: null
+      CustomActions: null
+      Iam:
+        AdditionalIamPolicies: []
+        InstanceRole: null
+        S3Access: null
+      Name: gpu
+    - CapacityType: CAPACITY_BLOCK
+      CapacityReservationTarget:
+        CapacityReservationId: cr-987654
+      ComputeResources:
+        # compute resource with different reservation id, single instance type
+        - CapacityReservationTarget:
+            CapacityReservationId: cr-876543
+          DisableSimultaneousMultithreading: false
+          Efa:
+            Enabled: true
+            GdrSupport: false
+          InstanceType: p6e-gb200.36xlarge
+          MaxCount: 9
+          MinCount: 9
+          Name: cb-gb200-1
+        # compute resource with multiple instance types
+        - CapacityReservationTarget: null
+          DisableSimultaneousMultithreading: false
+          Efa:
+            Enabled: true
+            GdrSupport: false
+          Instances:
+            - InstanceType: c5n.4xlarge
+            - InstanceType: r5.4xlarge
+          MaxCount: 10
+          MinCount: 10
+          Name: fleet-no-res
+          SchedulableMemory: null
+          SpotPrice: null
+      ComputeSettings:
+        LocalStorage:
+          EphemeralVolume: null
+          RootVolume:
+            Encrypted: true
+            Iops: 3000
+            Size: null
+            Throughput: 125
+            VolumeType: gp3
+      CustomActions: null
+      Iam:
+        AdditionalIamPolicies: [ ]
+        InstanceProfile: null
+        InstanceRole: null
+        S3Access: null
+      Image: null
+      Name: capacity-block-queue1
+      Networking:
+        AdditionalSecurityGroups: null
+        AssignPublicIp: null
+        PlacementGroup:
+          Enabled: true
+          Id: null
+        Proxy: null
+        SecurityGroups: null
+        SubnetIds:
+          - subnet-0230367ab0e5123a4
+    - CapacityType: CAPACITY_BLOCK
+      ComputeResources:
+        - CapacityReservationTarget:
+            CapacityReservationId: cr-876543
+          DisableSimultaneousMultithreading: false
+          Efa:
+            Enabled: true
+            GdrSupport: false
+          InstanceType: p6e-gb200.36xlarge
+          MaxCount: 18
+          MinCount: 18
+          Name: cb-gb200-2
+          SchedulableMemory: null
+          SpotPrice: null
+        # second compute resource with the same reservation id, single instance type
+        - CapacityReservationTarget:
+            CapacityReservationId: cr-876543
+          DisableSimultaneousMultithreading: false
+          Efa:
+            Enabled: true
+            GdrSupport: false
+          InstanceType: p6e-gb200.36xlarge
+          MaxCount: 9
+          MinCount: 9
+          Name: cb-gb200-3
+          SchedulableMemory: null
+          SpotPrice: null
+      ComputeSettings:
+        LocalStorage:
+          EphemeralVolume: null
+          RootVolume:
+            Encrypted: true
+            Iops: 3000
+            Size: null
+            Throughput: 125
+            VolumeType: gp3
+      CustomActions: null
+      Iam:
+        AdditionalIamPolicies: [ ]
+        InstanceProfile: null
+        InstanceRole: null
+        S3Access: null
+      Image: null
+      Name: capacity-block-queue2
+      Networking:
+        AdditionalSecurityGroups: null
+        AssignPublicIp: null
+        PlacementGroup:
+          Enabled: true
+          Id: null
+        Proxy: null
+        SecurityGroups: null
+        SubnetIds:
+          - subnet-0230367ab0e5123a4
+  Scheduler: slurm
+  SlurmSettings:
+    ScaledownIdletime: 10
+    Database: null
+    ExternalSlurmdbd: null
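Reviewer note: cross-checking this fixture against expected_outputs/topology_with_capacity_block.conf above — the three static p6e-gb200.36xlarge compute resources (cb-gb200-1 with MinCount=MaxCount=9, cb-gb200-2 with 18, cb-gb200-3 with 9) match the 9,18 block sizes and become Block1–Block3, while fleet-no-res (an Instances fleet with no single p6e-gb200 InstanceType) and all non-CAPACITY_BLOCK queues are skipped by the generator.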