Skip to content

Commit ca27a9c

Browse files
Avoid using sinfo -N to support cluster creation with a large number of compute nodes
Prior to this commit, cluster creation was not supported if the total number of dynamic compute nodes was larger than 130k. After this commit, that limit becomes 180k. Signed-off-by: Hanwen <[email protected]>
1 parent d39131d commit ca27a9c

File tree

3 files changed

+18
-12
lines changed

3 files changed

+18
-12
lines changed

cookbooks/aws-parallelcluster-computefleet/recipes/config/fleet_status.rb

+7
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,13 @@
3838
mode '0755'
3939
end
4040

41+
template "/usr/local/bin/is_fleet_ready.sh" do
42+
source 'compute_fleet_status/is_fleet_ready.erb'
43+
owner 'root'
44+
group 'root'
45+
mode '0755'
46+
end
47+
4148
template "#{node['cluster']['etc_dir']}/clusterstatusmgtd.conf" do
4249
source 'clusterstatusmgtd/clusterstatusmgtd.conf.erb'
4350
owner 'root'
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash
2+
3+
sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | grep -v -E '(idle|alloc|mix|maint)$')
4+
while IFS= read -r line; do
5+
nodelist=$(echo "$line" | awk '{print $1}')
6+
<%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; }
7+
done <<< "$sinfo_output"

cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb

+4-12
Original file line numberDiff line numberDiff line change
@@ -204,18 +204,10 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
204204
"/usr/local/bin/get-compute-fleet-status.sh"
205205
)
206206
# Example output for sinfo
207-
# $ /opt/slurm/bin/sinfo -N -h -o '%N %t'
208-
# ondemand-dy-c52xlarge-1 idle~
209-
# ondemand-dy-c52xlarge-2 idle~
210-
# spot-dy-c5xlarge-1 idle~
211-
# spot-st-t2large-1 down
212-
# spot-st-t2large-2 idle
213-
# capacity-block-st-t2micro-1 maint
214-
# capacity-block-dy-t2micro-1 maint
215-
is_fleet_ready_command = Shellwords.escape(
216-
"set -o pipefail && #{node['cluster']['slurm']['install_dir']}/bin/sinfo -N -h -o '%N %t' | { grep -E '^[a-z0-9\\-]+\\-st\\-[a-z0-9\\-]+\\-[0-9]+ .*' || true; } | { grep -v -E '(idle|alloc|mix|maint)$' || true; }"
217-
)
218-
until shell_out!("/bin/bash -c #{is_fleet_ready_command}").stdout.strip.empty?
207+
# sinfo -h -o '%N %t'
208+
# queue-0-dy-compute-resource-g4dn-0-[1-10],queue-1-dy-compute-resource-g4dn-1-[1-10] idle~
209+
# queue-2-dy-compute-resource-g4dn-2-[1-10],queue-3-dy-compute-resource-g4dn-3-[1-10] idle
210+
until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty?
219211
check_for_protected_mode(fleet_status_command)
220212

221213
Chef::Log.info("Waiting for static fleet capacity provisioning")

0 commit comments

Comments
 (0)