Skip to content

Commit 5908996

Browse files
Merge pull request #16084 from vrothberg/health-check-fix
health checks: make on-failure action retry aware
2 parents 2062ab9 + 0204008 commit 5908996

File tree

3 files changed

+30
-23
lines changed

3 files changed

+30
-23
lines changed

libpod/healthcheck.go

+24-19
Original file line numberDiff line numberDiff line change
@@ -32,30 +32,31 @@ func (r *Runtime) HealthCheck(name string) (define.HealthCheckStatus, error) {
3232
}
3333

3434
hcStatus, err := checkHealthCheckCanBeRun(container)
35-
if err == nil {
36-
hcStatus, err := container.runHealthCheck()
37-
if err := container.processHealthCheckStatus(hcStatus); err != nil {
38-
return hcStatus, err
39-
}
35+
if err != nil {
36+
return hcStatus, err
37+
}
38+
39+
hcStatus, logStatus, err := container.runHealthCheck()
40+
if err := container.processHealthCheckStatus(logStatus); err != nil {
4041
return hcStatus, err
4142
}
4243
return hcStatus, err
4344
}
4445

4546
// runHealthCheck runs the health check as defined by the container
46-
func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
47+
func (c *Container) runHealthCheck() (define.HealthCheckStatus, string, error) {
4748
var (
4849
newCommand []string
4950
returnCode int
5051
inStartPeriod bool
5152
)
5253
hcCommand := c.HealthCheckConfig().Test
5354
if len(hcCommand) < 1 {
54-
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID())
55+
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
5556
}
5657
switch hcCommand[0] {
5758
case "", define.HealthConfigTestNone:
58-
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID())
59+
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
5960
case define.HealthConfigTestCmd:
6061
newCommand = hcCommand[1:]
6162
case define.HealthConfigTestCmdShell:
@@ -66,11 +67,11 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
6667
newCommand = hcCommand
6768
}
6869
if len(newCommand) < 1 || newCommand[0] == "" {
69-
return define.HealthCheckNotDefined, fmt.Errorf("container %s has no defined healthcheck", c.ID())
70+
return define.HealthCheckNotDefined, "", fmt.Errorf("container %s has no defined healthcheck", c.ID())
7071
}
7172
rPipe, wPipe, err := os.Pipe()
7273
if err != nil {
73-
return define.HealthCheckInternalError, fmt.Errorf("unable to create pipe for healthcheck session: %w", err)
74+
return define.HealthCheckInternalError, "", fmt.Errorf("unable to create pipe for healthcheck session: %w", err)
7475
}
7576
defer wPipe.Close()
7677
defer rPipe.Close()
@@ -135,15 +136,16 @@ func (c *Container) runHealthCheck() (define.HealthCheckStatus, error) {
135136
}
136137

137138
hcl := newHealthCheckLog(timeStart, timeEnd, returnCode, eventLog)
138-
if err := c.updateHealthCheckLog(hcl, inStartPeriod); err != nil {
139-
return hcResult, fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err)
139+
logStatus, err := c.updateHealthCheckLog(hcl, inStartPeriod)
140+
if err != nil {
141+
return hcResult, "", fmt.Errorf("unable to update health check log %s for %s: %w", c.healthCheckLogPath(), c.ID(), err)
140142
}
141143

142-
return hcResult, hcErr
144+
return hcResult, logStatus, hcErr
143145
}
144146

145-
func (c *Container) processHealthCheckStatus(status define.HealthCheckStatus) error {
146-
if status == define.HealthCheckSuccess {
147+
func (c *Container) processHealthCheckStatus(status string) error {
148+
if status != define.HealthCheckUnhealthy {
147149
return nil
148150
}
149151

@@ -211,10 +213,13 @@ func (c *Container) updateHealthStatus(status string) error {
211213
}
212214

213215
// UpdateHealthCheckLog parses the health check results and writes the log
214-
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) error {
216+
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPeriod bool) (string, error) {
217+
c.lock.Lock()
218+
defer c.lock.Unlock()
219+
215220
healthCheck, err := c.getHealthCheckLog()
216221
if err != nil {
217-
return err
222+
return "", err
218223
}
219224
if hcl.ExitCode == 0 {
220225
// set status to healthy, reset failing state to 0
@@ -239,9 +244,9 @@ func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, inStartPerio
239244
}
240245
newResults, err := json.Marshal(healthCheck)
241246
if err != nil {
242-
return fmt.Errorf("unable to marshall healthchecks for writing: %w", err)
247+
return "", fmt.Errorf("unable to marshall healthchecks for writing: %w", err)
243248
}
244-
return os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
249+
return healthCheck.Status, os.WriteFile(c.healthCheckLogPath(), newResults, 0700)
245250
}
246251

247252
// HealthCheckLogPath returns the path for where the health check log is

test/system/220-healthcheck.bats

+5-4
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,11 @@ function _check_health {
2828
--health-cmd /healthcheck \
2929
--health-interval 1s \
3030
--health-retries 3 \
31+
--health-on-failure=kill \
3132
healthcheck_i
3233

3334
run_podman inspect healthcheck_c --format "{{.Config.HealthcheckOnFailureAction}}"
34-
is "$output" "none" "default on-failure action is none"
35+
is "$output" "kill" "on-failure action is set to kill"
3536

3637
# We can't check for 'starting' because a 1-second interval is too
3738
# short; it could run healthcheck before we get to our first check.
@@ -67,9 +68,8 @@ Log[-1].ExitCode | 1
6768
Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
6869
"
6970

70-
# healthcheck should now fail, with exit status 1 and 'unhealthy' output
71-
run_podman 1 healthcheck run healthcheck_c
72-
is "$output" "unhealthy" "output from 'podman healthcheck run'"
71+
# now the on-failure should kick in and kill the container
72+
podman wait healthcheck_c
7373

7474
# Clean up
7575
run_podman rm -t 0 -f healthcheck_c
@@ -95,6 +95,7 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\"
9595
# Run that healthcheck image.
9696
run_podman run -d --name $ctr \
9797
--health-cmd /healthcheck \
98+
--health-retries=1 \
9899
--health-on-failure=$policy \
99100
$img
100101

test/system/250-systemd.bats

+1
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,7 @@ LISTEN_FDNAMES=listen_fdnames" | sort)
318318
run_podman create --name $cname \
319319
--health-cmd /healthcheck \
320320
--health-on-failure=kill \
321+
--health-retries=1 \
321322
--restart=on-failure \
322323
$img
323324

0 commit comments

Comments
 (0)