package daemon
import (
"bytes"
"fmt"
"runtime"
"strings" |
c3319445 |
"sync" |
b6c7becb |
"time"
"golang.org/x/net/context"
|
91e197d6 |
"github.com/docker/docker/api/types" |
5f81cf11 |
containertypes "github.com/docker/docker/api/types/container" |
91e197d6 |
"github.com/docker/docker/api/types/strslice" |
b6c7becb |
"github.com/docker/docker/container"
"github.com/docker/docker/daemon/exec" |
1009e6a4 |
"github.com/sirupsen/logrus" |
b6c7becb |
)
const (
// Longest healthcheck probe output message to store. Longer messages will be truncated.
maxOutputLen = 4096
// Default interval between probe runs (from the end of the first to the start of the second).
// Also the time before the first probe.
defaultProbeInterval = 30 * time.Second
// The maximum length of time a single probe run should take. If the probe takes longer
// than this, the check is considered to have failed.
defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record
	maxLogEntries = 5
)
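
// For reference, these defaults correspond to a Dockerfile healthcheck such as:
//
//	HEALTHCHECK --interval=30s --timeout=30s --start-period=0s --retries=3 \
//		CMD curl -f http://localhost/ || exit 1
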
const (
// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)
// probe implementations know how to run a particular type of probe.
type probe interface {
// Perform one run of the check. Returns the exit code and an optional
// short diagnostic string.
run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}
// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
// Run the command with the system's default shell instead of execing it directly.
shell bool
}
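
// For example, a Test of ["CMD", "curl", "-f", "http://localhost/"] is exec'd
// directly, while ["CMD-SHELL", "curl -f http://localhost/"] is wrapped in the
// shell returned by getShell (e.g. ["/bin/sh", "-c"] on Linux).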
// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr.Config), cmdSlice...)
}
entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
execConfig := exec.NewConfig()
execConfig.OpenStdin = false
execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
execConfig.DetachKeys = []byte{}
execConfig.Entrypoint = entrypoint
execConfig.Args = args
execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir
linkedEnv, err := d.setupLinkedContainers(cntr)
if err != nil {
return nil, err
}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
attributes := map[string]string{
"execID": execConfig.ID,
}
	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)

	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
if err != nil {
return nil, err
}
info, err := d.getExecConfig(execConfig.ID)
if err != nil {
return nil, err
}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
}
// Note: Go's json package will handle invalid UTF-8 for us
out := output.String()
return &types.HealthcheckResult{
End: time.Now(),
ExitCode: *info.ExitCode,
Output: out,
}, nil
}
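
// Note that a failing probe is reported through HealthcheckResult.ExitCode; an
// error return is reserved for cases where the probe could not be run at all.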
// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
c.Lock()
defer c.Unlock()

	// The probe may have been cancelled while we were waiting on the lock.
	// Ignore the result in that case.
select {
case <-done:
return
default:
}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
}
	h := c.State.Health
	oldStatus := h.Status()
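	// Append the new result, keeping at most maxLogEntries entries in the log.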
if len(h.Log) >= maxLogEntries {
h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
} else {
h.Log = append(h.Log, result)
}
if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
} else { // Failure (including invalid exit code)
shouldIncrementStreak := true
// If the container is starting (i.e. we never had a successful health check)
// then we check if we are within the start period of the container in which
		// case we do not increment the failure streak.
		if h.Status() == types.Starting {
startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
timeSinceStart := result.Start.Sub(c.State.StartedAt)
// If still within the start period, then don't increment failing streak.
if timeSinceStart < startPeriod {
shouldIncrementStreak = false
}
}
if shouldIncrementStreak {
h.FailingStreak++
			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
}
// Else we're starting or healthy. Stay in that state.
}

	// replicate Health status changes
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// queries will be inconsistent until the next probe runs or other
		// state mutations checkpoint the container
logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
}
}
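
// In summary, handleProbeResult marks the container healthy after any successful
// probe and unhealthy once FailingStreak reaches the retry limit; failures while
// a starting container is within its start period do not count against the streak.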
// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
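
	// Note: the first probe fires one probeInterval after monitoring starts;
	// each subsequent probe starts probeInterval after the previous run is handled.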
for {
select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
return
		case <-time.After(probeInterval):
			logrus.Debugf("Running health check for container %s ...", c.ID)
startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
results <- &types.HealthcheckResult{
ExitCode: -1,
Output: err.Error(),
Start: startTime,
End: time.Now(),
}
} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
results <- result
}
close(results)
}()
select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
// Wait for probe to exit (it might take a while to respond to the TERM
// signal and we don't want dying probes to pile up).
				<-results
return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop the probe timeout
cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check for container %s taking too long", c.ID)
handleProbeResult(d, c, &types.HealthcheckResult{
ExitCode: -1,
Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
Start: startTime,
					End: time.Now(),
				}, stop)
cancelProbe()
// Wait for probe to exit (it might take a while to respond to the TERM
// signal and we don't want dying probes to pile up).
<-results
}
}
}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
config := c.Config.Healthcheck
if config == nil || len(config.Test) == 0 {
return nil
}
switch config.Test[0] {
case "CMD":
return &cmdProbe{shell: false}
case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
return nil
}
}
// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (d *Daemon) updateHealthMonitor(c *container.Container) {
h := c.State.Health
if h == nil {
return // No healthcheck configured
}
probe := getProbe(c)
wantRunning := c.Running && !c.Paused && probe != nil
if wantRunning {
if stop := h.OpenMonitorChannel(); stop != nil {
go monitor(d, c, stop, probe)
}
} else {
h.CloseMonitorChannel()
}
}
// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (d *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor
	if getProbe(c) == nil {
return
}
// This is needed in case we're auto-restarting
d.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
c.State.Health = h
}
d.updateHealthMonitor(c)
}
// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (d *Daemon) stopHealthchecks(c *container.Container) {
h := c.State.Health
if h != nil {
h.CloseMonitorChannel()
}
}
// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}
// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

bufLen := b.buf.Len()
dataLen := len(data)
keep := min(maxOutputLen-bufLen, dataLen)
if keep > 0 {
b.buf.Write(data[:keep])
}
if keep < dataLen {
b.truncated = true
}
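	// Report the full input length so callers treat the write as successful
	// even when data beyond maxOutputLen has been discarded.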
return dataLen, nil
}
// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

out := b.buf.String()
if b.truncated {
out = out + "..."
}
return out
}
// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
if configuredValue == 0 {
return defaultValue
}
return configuredValue
}
func min(x, y int) int {
if x < y {
return x
}
return y
}

func getShell(config *containertypes.Config) []string {
if len(config.Shell) != 0 {
return config.Shell
}
if runtime.GOOS != "windows" {
return []string{"/bin/sh", "-c"}
}
return []string{"cmd", "/S", "/C"}
}