Skip to content

Commit 2aa9aec

Browse files
Avoid using sinfo -N to support cluster creation with a large number of compute nodes
Prior to this commit, cluster creation was not supported if the total number of dynamic compute nodes was larger than 130k. After this commit, that number becomes 180k. Signed-off-by: Hanwen <[email protected]>
1 parent d39131d commit 2aa9aec

File tree

3 files changed

+15
-4
lines changed

3 files changed

+15
-4
lines changed

cookbooks/aws-parallelcluster-computefleet/recipes/config/fleet_status.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,13 @@
3838
mode '0755'
3939
end
4040

41+
# Install the is_fleet_ready.sh helper script under /usr/local/bin, rendered
# from the compute_fleet_status/is_fleet_ready.erb template. The file is owned
# by root:root and is executable (mode 0755).
template "/usr/local/bin/is_fleet_ready.sh" do
42+
source 'compute_fleet_status/is_fleet_ready.erb'
43+
owner 'root'
44+
group 'root'
45+
mode '0755'
46+
end
47+
4148
template "#{node['cluster']['etc_dir']}/clusterstatusmgtd.conf" do
4249
source 'clusterstatusmgtd/clusterstatusmgtd.conf.erb'
4350
owner 'root'
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash
# Prints to stdout the host names that match the "-st-" node naming pattern
# and whose sinfo state does not end in idle, alloc, mix or maint.
# No output means that no such node was found.
2+
3+
# Collect "<node list> <state>" lines from sinfo (no header) and drop every
# line whose state ends in idle, alloc, mix or maint.
# presumably sinfo (run without -N) groups nodes into compressed node-list
# expressions, one line per group - confirm against the sinfo documentation.
sinfo_output=$(<%= node['cluster']['slurm']['install_dir'] %>/bin/sinfo -h -o '%N %t' | grep -v -E '(idle|alloc|mix|maint)$')
4+
# Process the remaining sinfo lines one at a time.
# NOTE(review): when sinfo_output is empty the here-string still supplies one
# empty line, so scontrol is invoked once with an empty node list - confirm
# that this is harmless.
while IFS= read -r line; do
5+
# The first whitespace-separated field of the line is the node list.
nodelist=$(echo "$line" | awk '{print $1}')
6+
# Expand the node list into individual host names and keep only the names
# matching the "-st-" pattern; "|| true" keeps the group from returning a
# non-zero status when grep finds no match.
<%= node['cluster']['slurm']['install_dir'] %>/bin/scontrol show hostnames "$nodelist" | { grep -E '^[a-z0-9\-]+\-st\-[a-z0-9\-]+\-[0-9]+.*' || true; }
7+
done <<< "$sinfo_output"

cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,7 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
212212
# spot-st-t2large-2 idle
213213
# capacity-block-st-t2micro-1 maint
214214
# capacity-block-dy-t2micro-1 maint
215-
is_fleet_ready_command = Shellwords.escape(
216-
"set -o pipefail && #{node['cluster']['slurm']['install_dir']}/bin/sinfo -N -h -o '%N %t' | { grep -E '^[a-z0-9\\-]+\\-st\\-[a-z0-9\\-]+\\-[0-9]+ .*' || true; } | { grep -v -E '(idle|alloc|mix|maint)$' || true; }"
217-
)
218-
until shell_out!("/bin/bash -c #{is_fleet_ready_command}").stdout.strip.empty?
215+
until shell_out!("/bin/bash -c /usr/local/bin/is_fleet_ready.sh").stdout.strip.empty?
219216
check_for_protected_mode(fleet_status_command)
220217

221218
Chef::Log.info("Waiting for static fleet capacity provisioning")

0 commit comments

Comments
 (0)