-
Notifications
You must be signed in to change notification settings - Fork 4
/
set-pve-watchdog
executable file
·140 lines (118 loc) · 4.22 KB
/
set-pve-watchdog
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/bin/bash
#
# Switch the watchdog multiplexer used by PVE HA on this node.
# Takes a single argument (storpool|pve); see the usage text below.

# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Name the script was invoked as; used in the usage text and as the log tag.
pname="$(basename "$0")"

# Help text shown when the argument is missing or not recognised.
usage="
Usage: ${pname} <storpool|pve>
Set WD_FORCE=1 to disregard checks for the currently enabled watchdog.
Sets the active PVE HA watchdog - StorPool's sp-watchdog-mux replacement or the default watchdog-mux
This script will put the node in maintenance mode, wait for resources to be migrated,
enable the desired watchdog while disabling the other one, and turn off maintenance mode.
The script is meant to be run on one node at a time in order to allow HA resources to migrate
gracefully between machines.
"
# Send a message to the system log via logger(1), tagged with the script name
# and this shell's PID; --stderr makes logger copy it to standard error too.
# Arguments: forwarded to logger unchanged (normally the message text).
plog() {
  logger \
    --id="$$" \
    --tag "${pname}" \
    --stderr \
    "$@"
}
# WD_FORCE (environment, default 0): when set, redo the switch even if the
# requested watchdog already looks active. The value is normalised to 0/1
# because the -eq test below evaluates its operands arithmetically: a word
# such as "yes" would be treated as a variable name and abort the script
# under `set -u`. Anything other than an explicit "off" spelling means forced.
case "${WD_FORCE:-0}" in
  0 | false | no | off) WD_FORCE=0 ;;
  *) WD_FORCE=1 ;;
esac

# First positional argument selects the watchdog flavour and therefore which
# systemd unit gets enabled and which one gets disabled. Anything other than
# "storpool" or "pve" (including no argument) prints the usage text and
# exits with status 1.
watchdog="${1:-none}"
case "$watchdog" in
  storpool)
    enable_svc="sp-watchdog-mux.service"
    disable_svc="watchdog-mux.service"
    ;;
  pve)
    enable_svc="watchdog-mux.service"
    disable_svc="sp-watchdog-mux.service"
    ;;
  *)
    echo "$usage"
    exit 1
    ;;
esac

# Nothing to do when the wanted unit is already enabled and the other one is
# not, unless the caller forces the switch.
if [[ $WD_FORCE -eq 0 ]] \
  && systemctl is-enabled "$enable_svc" &>/dev/null \
  && ! systemctl is-enabled "$disable_svc" &>/dev/null; then
  plog "Watchdog $watchdog ($enable_svc) already enabled, $disable_svc disabled"
  exit 0
fi
plog "Will enable $watchdog watchdog ($enable_svc) and disable $disable_svc (forced=$WD_FORCE)"

# Request HA maintenance mode for this node, then poll the HA manager status
# every 10 seconds until the node reports "maintenance" and no HA service is
# listed on it any more.
plog "Enabling maintenance mode on node $HOSTNAME"
ha-manager crm-command node-maintenance enable "$HOSTNAME"

# skip_maintenance:   1 when the cluster has no HA services at all; read again
#                     at the end of the script to skip the wait for "online".
# node_maintenance:   1 once the node status is "maintenance" (latched).
# resources_migrated: 1 once no HA service is reported on this node (latched).
skip_maintenance=0
node_maintenance=0
resources_migrated=0

while true; do
  # One status snapshot per iteration; every check below reads the same JSON.
  ha_status=$(pvesh get cluster/ha/status/manager_status -output-format=json)

  # Number of HA services in the whole cluster. `try` turns a missing
  # service_status into an empty list, i.e. a count of 0.
  ha_services=$(jq -rc '[(try .manager_status.service_status[])] | length // 0' <<<"$ha_status")
  if [[ $ha_services -eq 0 ]]; then
    plog "No HA services in cluster, skipping maintenance"
    skip_maintenance=1
    node_maintenance=1
    # Placeholder status, only used in the log line below.
    node_status="sp-no-ha-services"
    resources=0
    resources_migrated=1
  fi

  if [[ $node_maintenance -ne 1 ]]; then
    # jq prints "null" when the node has no entry in node_status.
    node_status=$(
      jq -rc --arg node "$HOSTNAME" '.manager_status.node_status[$node]' <<<"$ha_status"
    )
    if [[ $node_status == "maintenance" ]]; then
      node_maintenance=1
      plog "Node $HOSTNAME reached maintenance status"
    fi
  fi

  if [[ $resources_migrated -ne 1 ]]; then
    # Count the HA services whose current node is this host.
    resources=$(
      jq --arg node "$HOSTNAME" \
        '[(try .manager_status.service_status[] | select(.node == $node))] | length // 0' \
        <<<"$ha_status"
    )
    if [[ $resources -eq 0 ]]; then
      plog "All HA resources migrated"
      resources_migrated=1
    fi
  fi

  if [[ $node_maintenance -eq 1 && $resources_migrated -eq 1 ]]; then
    # Name the unit actually being enabled; the target is not always
    # sp-watchdog-mux.
    plog "Node $HOSTNAME ready for $enable_svc (${resources} HA resource(s), status ${node_status})"
    break
  else
    plog "Node $HOSTNAME not ready yet (${resources} HA resource(s), status ${node_status} )"
    sleep 10
  fi
done
# Swap the watchdog multiplexer units while the PVE HA daemons are stopped,
# then bring the daemons back once the watchdog socket exists.
ha_units=(pve-ha-crm pve-ha-lrm)
wd_socket=/run/watchdog-mux.sock

plog "Enabling watchdog"
plog "Stopping PVE HA services"
systemctl stop "${ha_units[@]}"

# Mask (and stop) the unwanted unit only when is-enabled reports success for
# it; the state that is-enabled prints is left visible on stdout.
if systemctl is-enabled "$disable_svc"; then
  plog "Masking $disable_svc"
  systemctl mask --now "$disable_svc"
fi

plog "Enabling and starting $enable_svc"
systemctl unmask "$enable_svc"
# watchdog-mux.service is only unmasked; every other unit is also enabled and
# started here.
# NOTE(review): for watchdog-mux.service nothing in this section starts the
# unit before the socket wait below - presumably it is started elsewhere or
# the socket file is still present; confirm.
case "$enable_svc" in
  watchdog-mux.service) ;;
  *) systemctl enable --now "$enable_svc" ;;
esac

# Poll every 10 seconds, without a timeout, until the socket file exists.
plog "Waiting for watchdog socket to appear"
until [[ -S $wd_socket ]]; do
  echo "Watchdog socket still missing"
  sleep 10
done
plog "Watchdog socket appeared"

# Short pause before restarting the HA daemons - presumably to let the
# multiplexer settle; TODO confirm.
sleep 5

plog "Starting PVE HA services"
systemctl start "${ha_units[@]}"
# Take the node out of HA maintenance mode and, unless maintenance was skipped
# because the cluster has no HA services, poll every 10 seconds until the HA
# manager reports the node as "online".
plog "Disabling maintenance mode on node $HOSTNAME"
ha-manager crm-command node-maintenance disable "$HOSTNAME"

if [[ $skip_maintenance -eq 1 ]]; then
  plog "Node $HOSTNAME skipped maintenance, not waiting for online status"
else
  while true; do
    ha_status=$(pvesh get cluster/ha/status/manager_status -output-format=json)
    # jq prints "null" when the node has no entry in node_status.
    node_status=$(
      jq -rc --arg node "$HOSTNAME" '.manager_status.node_status[$node]' <<<"$ha_status"
    )
    if [[ $node_status == "online" ]]; then
      plog "Node $HOSTNAME reached online status"
      break
    fi
    plog "Node $HOSTNAME status $node_status"
    sleep 10
  done
fi

# Report the unit that was actually enabled; the target is not always
# sp-watchdog-mux.
plog "$enable_svc setup done on $HOSTNAME"