
Commit 26d121d

login nodes
Signed-off-by: Hanwen <[email protected]>
1 parent ca27a9c commit 26d121d

10 files changed (+55 −46 lines)

cookbooks/aws-parallelcluster-platform/recipes/config/loginmgtd.rb

+4 −4

@@ -18,26 +18,26 @@
 load_cluster_config(node['cluster']['login_cluster_config_path'])
 
 # Create the configuration file for loginmgtd
-template "#{node['cluster']['shared_dir_login_nodes']}/loginmgtd_config.json" do
+template "#{node['cluster']['etc_dir']}/loginmgtd_config.json" do
   source 'loginmgtd/loginmgtd_config.json.erb'
   owner node['cluster']['cluster_admin_user']
   group node['cluster']['cluster_admin_user']
   mode '0644'
   variables(
-    gracetime_period: lazy { node['cluster']['config'].dig(:LoginNodes, :Pools, 0, :GracetimePeriod) }
+    gracetime_period: lazy { get_login_node_pool_config(node['cluster']['config'], node['cluster']['pool_name'])['GracetimePeriod'] }
   )
 end
 
 # Create the termination hook for loginmgtd
-template "#{node['cluster']['shared_dir_login_nodes']}/loginmgtd_on_termination.sh" do
+template "#{node['cluster']['etc_dir']}/loginmgtd_on_termination.sh" do
   source 'loginmgtd/loginmgtd_on_termination.sh.erb'
   owner node['cluster']['cluster_admin_user']
   group node['cluster']['cluster_admin_user']
   mode '0744'
 end
 
 # Create the script to run loginmgtd
-template "#{node['cluster']['shared_dir_login_nodes']}/loginmgtd.sh" do
+template "#{node['cluster']['etc_dir']}/loginmgtd.sh" do
   source 'loginmgtd/loginmgtd.sh.erb'
   owner node['cluster']['cluster_admin_user']
   group node['cluster']['cluster_admin_user']
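For context on the new gracetime_period lookup, here is a minimal sketch of how the per-pool value resolves. The pool_name attribute value and the config hash below are illustrative only, not taken from the commit; get_login_node_pool_config is the helper added in cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb further down.

# Illustrative node attributes and cluster config (hypothetical values).
pool_name = 'login-pool-1'
config = {
  'LoginNodes' => {
    'Pools' => [
      { 'Name' => 'login-pool-1', 'GracetimePeriod' => 10 },
      { 'Name' => 'login-pool-2', 'GracetimePeriod' => 60 },
    ],
  },
}

# Same body as the helper introduced in this commit.
def get_login_node_pool_config(config, pool_name)
  config['LoginNodes']['Pools'].select { |pool| pool['Name'] == pool_name }.first
end

get_login_node_pool_config(config, pool_name)['GracetimePeriod'] # => 10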

cookbooks/aws-parallelcluster-platform/recipes/update.rb

+1

@@ -20,3 +20,4 @@
 end
 
 sudo_access "Update Sudo Access" if node['cluster']['scheduler'] == 'slurm'
+include_recipe 'aws-parallelcluster-platform::config_login' if node['cluster']['node_type'] == 'LoginNode'

cookbooks/aws-parallelcluster-platform/spec/unit/recipes/loginmgtd_spec.rb

+3 −3

@@ -8,7 +8,7 @@
   end
 
   it 'creates the loginmgtd configuration with the correct attributes' do
-    is_expected.to create_template('/opt/parallelcluster/shared_login_nodes/loginmgtd_config.json').with(
+    is_expected.to create_template('/etc/parallelcluster/loginmgtd_config.json').with(
       source: 'loginmgtd/loginmgtd_config.json.erb',
       owner: 'pcluster-admin',
       group: 'pcluster-admin',
@@ -17,7 +17,7 @@
   end
 
   it 'creates the loginmgtd script with the correct attributes' do
-    is_expected.to create_template('/opt/parallelcluster/shared_login_nodes/loginmgtd.sh').with(
+    is_expected.to create_template('/etc/parallelcluster/loginmgtd.sh').with(
       source: 'loginmgtd/loginmgtd.sh.erb',
       owner: 'pcluster-admin',
       group: 'pcluster-admin',
@@ -26,7 +26,7 @@
   end
 
   it 'creates the loginmgtd termination hook script with the correct attributes' do
-    is_expected.to create_template('/opt/parallelcluster/shared_login_nodes/loginmgtd_on_termination.sh').with(
+    is_expected.to create_template('/etc/parallelcluster/loginmgtd_on_termination.sh').with(
       source: 'loginmgtd/loginmgtd_on_termination.sh.erb',
       owner: 'pcluster-admin',
       group: 'pcluster-admin',

@@ -1,3 +1,3 @@
-Cmnd_Alias LOGINMGTD_HOOKS_COMMANDS = <%= node['cluster']['shared_dir_login_nodes'] %>/loginmgtd_on_termination.sh
+Cmnd_Alias LOGINMGTD_HOOKS_COMMANDS = <%= node['cluster']['etc_dir'] %>/loginmgtd_on_termination.sh
 
 <%= node['cluster']['cluster_admin_user'] %> ALL = (root) NOPASSWD: LOGINMGTD_HOOKS_COMMANDS

cookbooks/aws-parallelcluster-platform/templates/loginmgtd/loginmgtd.sh.erb

+1 −1

@@ -11,7 +11,7 @@ set -ex
 # This script must be executed as <%= node['cluster']['cluster_admin_user'] %> because
 # because only this user has sudoers privileges to execute the termination hook.
 
-CONFIG_PATH="<%= node['cluster']['shared_dir_login_nodes'] %>/loginmgtd_config.json"
+CONFIG_PATH="<%= node['cluster']['etc_dir'] %>/loginmgtd_config.json"
 CONFIG_JSON=$(cat $CONFIG_PATH)
 TERMINATION_SCRIPT_PATH=$(echo $CONFIG_JSON | jq -r .termination_script_path)
 TERMINATION_MESSAGE=$(echo $CONFIG_JSON | jq -r .termination_message)

@@ -1,5 +1,5 @@
 {
-  "termination_script_path": "<%= node['cluster']['shared_dir_login_nodes'] %>/loginmgtd_on_termination.sh",
+  "termination_script_path": "<%= node['cluster']['etc_dir'] %>/loginmgtd_on_termination.sh",
   "termination_message": "The system will be terminated within <%= @gracetime_period %> minutes.",
   "gracetime_period": "<%= @gracetime_period %>"
 }
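For orientation, a rough sketch of this configuration after rendering and how a consumer reads it. The path assumes node['cluster']['etc_dir'] resolves to /etc/parallelcluster (as the spec expectations above assume) and the gracetime value of 10 is purely illustrative; this mirrors how loginmgtd.sh extracts termination_script_path and termination_message with jq.

require 'json'

# Hypothetical rendered loginmgtd_config.json (values are examples only).
rendered = <<~JSON
  {
    "termination_script_path": "/etc/parallelcluster/loginmgtd_on_termination.sh",
    "termination_message": "The system will be terminated within 10 minutes.",
    "gracetime_period": "10"
  }
JSON

config = JSON.parse(rendered)
config['termination_script_path'] # => "/etc/parallelcluster/loginmgtd_on_termination.sh"
config['gracetime_period']        # => "10" (a string, exactly as the template renders it)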

cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb

+1 −1

@@ -60,7 +60,7 @@ stdout_logfile_maxbytes = 0
 <%# LoginNode -%>
 <% when 'LoginNode' -%>
 [program:loginmgtd]
-command = <%= node['cluster']['shared_dir_login_nodes'] %>/loginmgtd.sh
+command = <%= node['cluster']['etc_dir'] %>/loginmgtd.sh
 user = <%= node['cluster']['cluster_admin_user'] %>
 environment = HOME="/home/<%= node['cluster']['cluster_admin_user'] %>",USER="<%= node['cluster']['cluster_admin_user'] %>"
 autorestart = unexpected

cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb

+4

@@ -217,3 +217,7 @@ def check_for_protected_mode(fleet_status_command) # rubocop:disable Lint/Nested
     end
   end
 end
+
+def get_login_node_pool_config(config, pool_name)
+  config['LoginNodes']['Pools'].select { |pool| pool['Name'] == pool_name }.first
+end
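A quick note on the new helper's behavior: select(...).first returns the configuration hash of the first pool whose Name matches, and nil when nothing matches, so a wrong pool_name surfaces as a NoMethodError at the call site. A minimal sketch with made-up pool names:

pools_config = { 'LoginNodes' => { 'Pools' => [{ 'Name' => 'pool-a', 'GracetimePeriod' => 10 }] } }

get_login_node_pool_config(pools_config, 'pool-a')  # => { 'Name' => 'pool-a', 'GracetimePeriod' => 10 }
get_login_node_pool_config(pools_config, 'missing') # => nil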

cookbooks/aws-parallelcluster-slurm/recipes/config/config_check_login_stopped_script.rb

+7 −5

@@ -18,11 +18,13 @@
   group 'root'
   mode '0700'
   variables(
-    target_group_name: lazy do
-      get_target_group_name(
-        node['cluster']['cluster_name'] || node['cluster']['stack_name'],
-        node['cluster']['config'].dig(:LoginNodes, :Pools, 0, :Name)
-      )
+    target_group_names: lazy do
+      "(#{node['cluster']['config'].dig(:LoginNodes, :Pools).map do |pool_config|
+        get_target_group_name(
+          node['cluster']['cluster_name'] || node['cluster']['stack_name'],
+          pool_config['Name']
+        )
+      end.join(' ')})"
     end,
     region: node['cluster']['region']
   )
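A sketch of what the new target_group_names variable evaluates to, assuming two pools and stand-in outputs for get_target_group_name (its actual naming scheme is not part of this diff):

# Hypothetical pool list, mirroring LoginNodes/Pools.
pools = [{ 'Name' => 'pool-a' }, { 'Name' => 'pool-b' }]

# Stand-in for get_target_group_name; real names depend on the cluster/stack name.
names = pools.map { |pool_config| "tg-#{pool_config['Name']}" }

target_group_names = "(#{names.join(' ')})" # => "(tg-pool-a tg-pool-b)"

The parentheses and space-separated items make the rendered value a Bash array assignment, which the updated check_login_nodes_stopped.sh template below iterates over with ${TARGET_GROUP_NAMES[@]}.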

cookbooks/aws-parallelcluster-slurm/templates/default/slurm/head_node/check_login_nodes_stopped.sh.erb

+32 −30

@@ -8,38 +8,40 @@
 
 set -e
 
-TARGET_GROUP_NAME="<%= @target_group_name %>"
+TARGET_GROUP_NAMES=<%= @target_group_names %>
 REGION="<%= @region %>"
 
-# Get Target Group ARN
-target_group_arn=$(aws elbv2 describe-target-groups \
-  --names ${TARGET_GROUP_NAME} \
-  --query "TargetGroups[0].TargetGroupArn" \
-  --output text \
-  --region ${REGION})
-
-# Exit if Target Group is not found
-if [[ -n "${target_group_arn}" ]]; then
-  echo "TargetGroup ARN found: ${target_group_arn}"
-else
-  echo "No Target Group found for the specified Load Balancer ${load_balancer_arn}."
-  exit 1
-fi
-
-# Get the number of healthy and unhealthy targets
-target_healths=$(aws elbv2 describe-target-health \
-  --target-group-arn $target_group_arn \
-  --region ${REGION})
-
-healthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State == "healthy") | .Target.Id' | wc -l)
-unhealthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State != "healthy") | .Target.Id' | wc -l)
-
-# Check if there are running login nodes
-total_nodes=$((healthy_count + unhealthy_count))
-if [[ $total_nodes -gt 0 ]]; then
-  echo "Login nodes are running. Please stop them before updating the munge key."
-  exit 1
-fi
+for TARGET_GROUP_NAME in ${TARGET_GROUP_NAMES[@]}; do
+  # Get Target Group ARN
+  target_group_arn=$(aws elbv2 describe-target-groups \
+    --names ${TARGET_GROUP_NAME} \
+    --query "TargetGroups[0].TargetGroupArn" \
+    --output text \
+    --region ${REGION})
+
+  # Exit if Target Group is not found
+  if [[ -n "${target_group_arn}" ]]; then
+    echo "TargetGroup ARN found: ${target_group_arn}"
+  else
+    echo "No Target Group found for the specified Load Balancer ${load_balancer_arn}."
+    exit 1
+  fi
+
+  # Get the number of healthy and unhealthy targets
+  target_healths=$(aws elbv2 describe-target-health \
+    --target-group-arn $target_group_arn \
+    --region ${REGION})
+
+  healthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State == "healthy") | .Target.Id' | wc -l)
+  unhealthy_count=$(echo $target_healths | jq -r '.TargetHealthDescriptions[] | select(.TargetHealth.State != "healthy") | .Target.Id' | wc -l)
+
+  # Check if there are running login nodes
+  total_nodes=$((healthy_count + unhealthy_count))
+  if [[ $total_nodes -gt 0 ]]; then
+    echo "Login nodes are running. Please stop them before updating the munge key."
+    exit 1
+  fi
+done
 
 echo "Login nodes are stopped."
 exit 0
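To clarify what the new loop does, here is the same check expressed in Ruby with the AWS SDK for Ruby. This is only an illustration, not part of the commit; error handling is simplified, and the target group names and region are hypothetical.

require 'aws-sdk-elasticloadbalancingv2'

# Hypothetical inputs; the real values come from get_target_group_name and the cluster region.
target_group_names = %w[tg-pool-a tg-pool-b]
elbv2 = Aws::ElasticLoadBalancingV2::Client.new(region: 'us-east-1')

target_group_names.each do |name|
  # Resolve the target group ARN, like `aws elbv2 describe-target-groups` in the script.
  arn = elbv2.describe_target_groups(names: [name]).target_groups.first&.target_group_arn
  raise "No Target Group found for #{name}" unless arn

  # Any registered target, healthy or not, means a login node is still running.
  targets = elbv2.describe_target_health(target_group_arn: arn).target_health_descriptions
  raise 'Login nodes are running. Please stop them before updating the munge key.' unless targets.empty?
end

puts 'Login nodes are stopped.'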
