Skip to content

Commit a2651f4

Browse files
himani2411 and Himani Anil Deshpande
authored
[SlurmTopo] Solve the update failures by defining when to update topology.conf (#3009)
Co-authored-by: Himani Anil Deshpande <[email protected]>
1 parent 2d4cf30 commit a2651f4

File tree

3 files changed

+28
-4
lines changed

3 files changed

+28
-4
lines changed

cookbooks/aws-parallelcluster-slurm/attributes/slurm_attributes.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,4 @@
2828

2929
# Block Topology Plugin
3030
default['cluster']['slurm']['block_topology']['force_configuration'] = false
31+
default['cluster']['p6egb200_block_sizes'] = nil

cookbooks/aws-parallelcluster-slurm/resources/block_topology/partial/_block_topology_common.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,8 @@ def topology_generator_command_args
6363
if node['cluster']['p6egb200_block_sizes'].nil? && are_queues_updated? && ::File.exist?("#{node['cluster']['slurm']['install_dir']}/etc/topology.conf")
6464
# If topology.conf exists and the Capacity Block has been removed, clean it up
6565
" --cleanup"
66-
elsif node['cluster']['p6egb200_block_sizes'].nil? && !are_queues_updated?
67-
# We do nothing if p6e-gb200 is not used and queues are not updated
68-
nil
69-
else
66+
elsif !node['cluster']['p6egb200_block_sizes'].nil?
67+
# Add or update topology.conf whenever p6egb200_block_sizes is not nil
7068
" --block-sizes #{node['cluster']['p6egb200_block_sizes']}"
7169
end
7270
end

cookbooks/aws-parallelcluster-slurm/spec/unit/resources/block_topology_spec.rb

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,13 +154,26 @@ def self.update(chef_run)
154154
cached(:chef_run) do
155155
runner(platform: platform, version: version, step_into: ['block_topology']) do |node|
156156
node.override['cluster']['p6egb200_block_sizes'] = nil
157+
node.override['cluster']['slurm']['install_dir'] = slurm_install_dir
157158
end
158159
end
159160
cached(:resource) do
160161
ConvergeBlockTopology.update(chef_run)
161162
chef_run.find_resource('block_topology', 'update')
162163
end
163164

165+
context "when queues are updated and topolog.conf does exists" do
166+
before do
167+
allow_any_instance_of(Object).to receive(:are_queues_updated?).and_return(true)
168+
allow(File).to receive(:exist?).with("#{slurm_install_dir}/etc/topology.conf").and_return(true)
169+
chef_run.node.override['cluster']['p6egb200_block_sizes'] = nil
170+
end
171+
172+
it 'returns cleanup' do
173+
expect(resource.topology_generator_command_args).to eq(" --cleanup")
174+
end
175+
end
176+
164177
context "when queues are not updated and topolog.conf does not exists" do
165178
before do
166179
allow_any_instance_of(Object).to receive(:are_queues_updated?).and_return(false)
@@ -172,6 +185,18 @@ def self.update(chef_run)
172185
end
173186
end
174187

188+
context "when queues are updated and topolog.conf does not exists" do
189+
before do
190+
allow_any_instance_of(Object).to receive(:are_queues_updated?).and_return(true)
191+
allow(File).to receive(:exist?).with("#{slurm_install_dir}/etc/topology.conf").and_return(false)
192+
chef_run.node.override['cluster']['p6egb200_block_sizes'] = block_sizes
193+
end
194+
195+
it 'returns block-sizes argument' do
196+
expect(resource.topology_generator_command_args).to eq(" --block-sizes #{block_sizes}")
197+
end
198+
end
199+
175200
context "when block sizes is not nil" do
176201
before do
177202
chef_run.node.override['cluster']['p6egb200_block_sizes'] = block_sizes

0 commit comments

Comments
 (0)