Skip to content

Commit 3b5e370

Browse files
[LoginNodes] Adapt check login nodes stopped script for multiple pools
Prior to this commit, the script only checked the first pool. Signed-off-by: Hanwen <[email protected]>
1 parent 3e46216 commit 3b5e370

File tree

2 files changed

+40
-35
lines changed

2 files changed

+40
-35
lines changed

cookbooks/aws-parallelcluster-slurm/recipes/config/config_check_login_stopped_script.rb

+8-5
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,14 @@
1818
group 'root'
1919
mode '0700'
2020
variables(
21-
target_group_name: lazy do
22-
get_target_group_name(
23-
node['cluster']['cluster_name'] || node['cluster']['stack_name'],
24-
node['cluster']['config'].dig(:LoginNodes, :Pools, 0, :Name)
25-
)
21+
target_group_names: lazy do
22+
# Generate a string that is a bash array literal with space-separated elements (e.g. "(item1 item2 item3)")
23+
"(#{node['cluster']['config'].dig(:LoginNodes, :Pools).map do |pool_config|
24+
get_target_group_name(
25+
node['cluster']['cluster_name'] || node['cluster']['stack_name'],
26+
pool_config['Name']
27+
)
28+
end.join(' ')})"
2629
end,
2730
region: node['cluster']['region']
2831
)

cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/check_login_nodes_stopped.sh.erb

+32-30
Original file line numberDiff line numberDiff line change
@@ -8,38 +8,40 @@
88

99
set -e
1010

11-
TARGET_GROUP_NAME="<%= @target_group_name %>"
11+
TARGET_GROUP_NAMES=<%= @target_group_names %>
1212
REGION="<%= @region %>"
1313

14-
# Get Target Group ARN
15-
target_group_arn=$(aws elbv2 describe-target-groups \
16-
--names ${TARGET_GROUP_NAME} \
17-
--query "TargetGroups[0].TargetGroupArn" \
18-
--output text \
19-
--region ${REGION})
20-
21-
# Exit if Target Group is not found
22-
if [[ -n "${target_group_arn}" ]]; then
23-
echo "TargetGroup ARN found: ${target_group_arn}"
24-
else
25-
echo "No Target Group found for the specified Load Balancer ${load_balancer_arn}."
26-
exit 1
27-
fi
28-
29-
# Get the number of healthy and unhealthy targets
30-
target_healths=$(aws elbv2 describe-target-health \
31-
--target-group-arn $target_group_arn \
32-
--region ${REGION})
33-
34-
healthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State == "healthy") | .Target.Id' | wc -l)
35-
unhealthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State != "healthy") | .Target.Id' | wc -l)
36-
37-
# Check if there are running login nodes
38-
total_nodes=$((healthy_count + unhealthy_count))
39-
if [[ $total_nodes -gt 0 ]]; then
40-
echo "Login nodes are running. Please stop them before updating the munge key."
41-
exit 1
42-
fi
14+
for TARGET_GROUP_NAME in ${TARGET_GROUP_NAMES[@]}; do
15+
# Get Target Group ARN
16+
target_group_arn=$(aws elbv2 describe-target-groups \
17+
--names ${TARGET_GROUP_NAME} \
18+
--query "TargetGroups[0].TargetGroupArn" \
19+
--output text \
20+
--region ${REGION})
21+
22+
# Exit if Target Group is not found
23+
if [[ -n "${target_group_arn}" ]]; then
24+
echo "TargetGroup ARN found: ${target_group_arn}"
25+
else
26+
echo "No Target Group found for the specified Load Balancer ${load_balancer_arn}."
27+
exit 1
28+
fi
29+
30+
# Get the number of healthy and unhealthy targets
31+
target_healths=$(aws elbv2 describe-target-health \
32+
--target-group-arn $target_group_arn \
33+
--region ${REGION})
34+
35+
healthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State == "healthy") | .Target.Id' | wc -l)
36+
unhealthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State != "healthy") | .Target.Id' | wc -l)
37+
38+
# Check if there are running login nodes
39+
total_nodes=$((healthy_count + unhealthy_count))
40+
if [[ $total_nodes -gt 0 ]]; then
41+
echo "Login nodes are running. Please stop them before updating the munge key."
42+
exit 1
43+
fi
44+
done
4345

4446
echo "Login nodes are stopped."
4547
exit 0

0 commit comments

Comments
 (0)