Skip to content

Commit e440280

Browse files
author
Al Tobey
committed
Add the rolling reboot tool.
I threw this together today to do a large cluster reboot. There's a lot more possible with this, such as adding an option to pass a script that determines host readiness after boot (e.g. don't just wait for the host to come up, make sure the database has started and is ready too before moving on to the next host). Being used in production now.
1 parent 71a52cb commit e440280

File tree

1 file changed

+249
-0
lines changed

1 file changed

+249
-0
lines changed

cl-rolling-reboot.pl

+249
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
#!/usr/bin/env perl
2+
$|++;
3+
4+
###########################################################################
5+
# #
6+
# Cluster Tools: cl-rolling-reboot.pl #
7+
# Copyright 2013, Albert P. Tobey <[email protected]> #
8+
# #
9+
###########################################################################
10+
11+
=head1 NAME
12+
13+
cl-rolling-reboot.pl - reboot a cluster as safely as possible
14+
15+
=head1 SYNOPSIS
16+
17+
This script attempts to reboot a cluster safely. It steps through the host
18+
list serially, rebooting one node at a time and only progresses to the next
19+
node if the previous node comes back online. It is quite verbose on purpose,
20+
with the intent of being run in a screen session and left alone for many hours
21+
or days to do its thing.
22+
23+
ICMP is used to determine basic network availability. No node is considered
24+
actually available unless a command can be run over ssh.
25+
26+
Most failures are fatal. Large clusters will typically have a few nodes down
27+
at any given time, so those nodes are skipped if they fail an ICMP test.
28+
29+
When a run fails, a new list will be written to your ~/.dsh that only contains
30+
the incomplete list, allowing you to resume easily. The name of the file and
31+
the correct command for resuming will be printed.
32+
33+
cl-rolling-reboot.pl --list foo [--timeout 1800] [--wait 60]
34+
--timeout: number of seconds before giving up on a host
35+
--wait: number of seconds to wait between reboots
36+
37+
=cut
38+
39+
use Pod::Usage;
40+
use File::Temp qw/tempfile/;
41+
use Getopt::Long;
42+
use IPC::Open3;
43+
use strict;
44+
use warnings;
45+
46+
use FindBin qw($Bin);
47+
use lib $Bin;
48+
use DshPerlHostLoop;
49+
50+
# Command-line options and their defaults.
our $opt_help    = undef;
our $opt_timeout = 1800;    # seconds to wait for a host to come back (1/2 hour)
our $opt_wait    = 60;      # seconds to pause between hosts (one minute)

# "=i" makes the integer value mandatory. The previous ":i" form treated the
# value as optional, so a bare "--timeout" silently became a timeout of 0 and
# the first failed ping after a reboot aborted the whole run.
GetOptions(
    "timeout=i" => \$opt_timeout,
    "wait=i"    => \$opt_wait,
    "help|h"    => \$opt_help,
);

# --help / -h: print the usage text from the POD; pod2usage() exits.
if ($opt_help) {
    pod2usage();
}
63+
64+
=item ping()
65+
66+
Ping the host once, waiting 3 seconds for a response. Returns
67+
1 (true) on success and undef (false) on failure.
68+
69+
This function should move to DshPerlHostLoop at some point.
70+
71+
ping($hostname);
72+
73+
=cut
74+
75+
# Send a single ICMP echo request to $hostname via /bin/ping and report
# whether the command succeeded.
#
# Returns 1 when ping exits with status 0, undef otherwise.
sub ping {
    my ($hostname) = @_;

    # -c 1: one echo request; -W 3: wait at most three seconds for the reply
    my @command = ('/bin/ping', '-c', '1', '-W', '3', $hostname);

    my ($child_in, $child_out, $child_err);
    my $child_pid = open3($child_in, $child_out, $child_err, @command);

    # reap the child so that $? holds its exit status
    waitpid($child_pid, 0);

    return $? == 0 ? 1 : undef;
}
88+
89+
=item reboot()
90+
91+
SSHes in to the host and issues 'sudo reboot'. The return value is
92+
any text printed by the reboot command, but this should not be used
93+
to determine if it was successful.
94+
95+
reboot($hostname);
96+
97+
=cut
98+
99+
# Run 'sudo reboot' on $host over ssh, logging in as $remote_user.
#
# Returns whatever ssh() handed back for the command. Do not use the
# return value to decide whether the reboot actually happened.
sub reboot {
    my ($host) = @_;

    my $login  = $remote_user . '@' . $host;
    my @output = ssh($login, 'sudo reboot');

    return @output;
}
104+
105+
=item fail()
106+
107+
Writes out the incomplete hosts to a new host list, prints some information,
108+
then exits immediately with a return code of 1.
109+
110+
fail(\@hostlist, $index); # will exit
111+
112+
=cut
113+
114+
# Record the hosts that still need a reboot, tell the operator how to
# resume, and terminate the program.
#
#   $hostlist - array reference holding the complete host list
#   $i        - index of the first host to carry over; that host and every
#               host after it are written to the resume list
#
# The list is written to $HOME/.dsh/machines.reboot-failed-<epoch>. If that
# file cannot be written, the remaining hosts are printed on STDERR instead
# so the operator can still resume by hand.
#
# Never returns: always exits with status 1.
sub fail {
    my ($hostlist, $i) = @_;
    my $now       = time;
    my $list_file = "$ENV{HOME}/.dsh/machines.reboot-failed-$now";

    # the host that failed plus everything that was never attempted
    my @remaining = @{$hostlist}[ $i .. $#{$hostlist} ];

    # three-argument open, and every step checked: a resume hint that points
    # at a file which was never written is worse than no hint at all
    my $error;
    if (open(my $fh, '>', $list_file)) {
        print {$fh} map { "$_\n" } @remaining;
        close($fh) or $error = "$!";
    }
    else {
        $error = "$!";
    }

    if (defined $error) {
        print STDERR "\nCould not write $list_file: $error\n";
        print STDERR "Hosts that still need to be rebooted:\n";
        print STDERR "$_\n" for @remaining;
        exit 1;
    }

    print "\nA machine list containing only the un-rebooted nodes has been written to:\n";
    print "$list_file\n";
    print "To resume:\n";
    print "cl-rolling-reboot.pl --list reboot-failed-$now\n\n";
    exit 1;
}
132+
133+
=item main()
134+
135+
Try really hard to reboot machines without accidentally taking down more than one
136+
node at a time.
137+
138+
=cut
139+
140+
# Main loop: walk the host list serially. For each host that answers a ping:
# send the reboot, wait for it to drop off the network, wait for it to answer
# pings again, then confirm that a command can be run over ssh. Any failure
# along the way calls fail(), which writes a resume list and exits.
my @hosts = hostlist();

for my $i (0 .. $#hosts) {
    my $host = $hosts[$i];

    # skip hosts that are down
    next unless ping($host);

    # failsafe: break and fail if work hangs somewhere
    $SIG{'ALRM'} = sub {
        print "Timeout. Something hung and SIGALRM has fired. Exiting now.\n";
        fail(\@hosts, $i);
    };
    alarm($opt_timeout + $opt_wait + 600);

    my $rebooted_at = time;
    reboot($host);
    print "$host: sent reboot command ...\n";

    print "Waiting up to five minutes for the host to go offline ...\n";
    my $count = 0;
    while (1) {
        sleep 1;

        # the first failed ping means the host has gone down
        if (not ping($host)) {
            print "$host is offline. Going to sleep for two minutes ...\n";
            last;
        }

        $count++;
        if ($count % 10 == 0) {
            print "$host has not gone offline after $count seconds. Retrying in 10 seconds ...\n";
        }
        if ($count > 300) {
            print "$host has not gone offline after $count seconds.\n";
            fail(\@hosts, $i);
        }
    }

    # wait two minutes before even trying to ping the box
    sleep 120;

    print "Host has been down for at least two minutes. Will start pinging now.\n";
    $count = 0;
    my $upcount = 0;
    while (1) {
        # Pace the probes the same way the offline-wait loop does. Without
        # this, successful pings return almost instantly and the run of
        # "consecutive" successes would all land within a few milliseconds.
        sleep 1;

        my $status  = ping($host);
        my $elapsed = time - $rebooted_at;

        if ($status) {
            $upcount++;
            print "$host network has responded to $upcount pings.\n";

            # require 5 consecutive successes before moving on
            last if $upcount >= 5;
            next;
        }

        # reset the counter if even a single ping fails
        $upcount = 0;

        # Report only every tenth failed ping. The old test,
        # "not $status && $count % 10 == 0", parsed as
        # "not ($status && ...)" and was therefore always true here.
        $count++;
        if ($count % 10 == 0) {
            print "$host has been down for $elapsed seconds.\n";
        }

        # wait up to $opt_timeout seconds (measured from the reboot command)
        # for the host to come back, if it doesn't, stop trying and wait for
        # the operator to clean up
        if ($elapsed > $opt_timeout) {
            print "Reboot of $host failed, it is still down after $elapsed seconds.\n";
            fail(\@hosts, $i);
        }
    }

    print "$host network is responding. Checking SSH in 5 minutes...\n";
    sleep 300;

    # TODO: retries?
    # the host only counts as available once a command runs over ssh
    my @out = ssh("$remote_user\@$host", "uptime");
    chomp(@out);
    my $flat = join(' ', @out);

    if ($flat =~ / up /) {
        print "\n-----------------------------------------------------------------------\n";
        print "$host is back online! Moving on.\n";
        print "$host $flat\n";
        print "-----------------------------------------------------------------------\n\n";
    }
    else {
        print "$host: could not run the uptime command.\n";
        fail(\@hosts, $i);
    }

    print "Sleeping $opt_wait seconds before moving on to the next host.\n";
    sleep $opt_wait;

    # This host is done: cancel its watchdog so a stale alarm cannot fire
    # while later hosts are being pinged and skipped.
    alarm(0);
}
235+
236+
# vim: et ts=4 sw=4 ai smarttab
237+
238+
__END__
239+
240+
=head1 COPYRIGHT AND LICENSE
241+
242+
This software is copyright (c) 2013 by Al Tobey.
243+
244+
This is free software; you can redistribute it and/or modify it under the terms
245+
of the Artistic License 2.0. (Note that, unlike the Artistic License 1.0,
246+
version 2.0 is GPL compatible by itself, hence there is no benefit to having an
247+
Artistic 2.0 / GPL disjunction.) See the file LICENSE for details.
248+
249+
=cut

0 commit comments

Comments
 (0)