Add support for user-defined healthchecks

This PR adds support for user-defined health-check probes for Docker
containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax plus
corresponding `docker run` options. It can be combined with a restart policy
to automatically restart a container if the check fails.
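
For example (a sketch; `my-web-image` is a hypothetical image name), a failing
check makes the container exit, and the restart policy brings it back up:

    $ docker run -d --restart=on-failure \
        --health-cmd='curl -f http://localhost/ || exit 1' \
        --health-interval=30s \
        my-web-image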

The `HEALTHCHECK` instruction has two forms:

* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)

The `HEALTHCHECK` instruction tells Docker how to test a container to check that
it is still working. This can detect cases such as a web server that is stuck in
an infinite loop and unable to handle new connections, even though the server
process is still running.

When a container has a healthcheck specified, it has a _health status_ in
addition to its normal status. This status is initially `starting`. Whenever a
health check passes, it becomes `healthy` (whatever state it was previously in).
After a certain number of consecutive failures, it becomes `unhealthy`.

The options that can appear before `CMD` are:

* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)

The health check will first run **interval** seconds after the container is
started, and then again **interval** seconds after each previous check completes.

If a single run of the check takes longer than **timeout** seconds then the check
is considered to have failed.

It takes **retries** consecutive failures of the health check for the container
to be considered `unhealthy`.
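
As a worked example: with `--interval=5s` and `--retries=3`, three consecutive
failures are needed, spaced one interval apart, so a container is marked
`unhealthy` roughly 2 × 5s = 10s after its first failed probe completes (plus
the time each probe run takes).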

There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
more than one then only the last `HEALTHCHECK` will take effect.

The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
see e.g. `ENTRYPOINT` for details).
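
For illustration, the two forms side by side (using the same hypothetical
`/bin/check-running` probe; remember that only the last `HEALTHCHECK` in a
Dockerfile takes effect, so a real Dockerfile would keep just one):

    HEALTHCHECK CMD /bin/check-running --quiet
    HEALTHCHECK CMD ["/bin/check-running", "--quiet"]

The shell form is run with the system's default shell (`/bin/sh -c` on Linux),
while the exec form runs the binary directly.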

The command's exit status indicates the health status of the container.
The possible values are:

- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working correctly

If the probe returns 2 ("starting") when the container has already moved out of the
"starting" state then it is treated as "unhealthy" instead.

For example, to check every five minutes or so that a web-server is able to
serve the site's main page within three seconds:

    HEALTHCHECK --interval=5m --timeout=3s \
      CMD curl -f http://localhost/ || exit 1

To help debug failing probes, any output text (UTF-8 encoded) that the command writes
on stdout or stderr will be stored in the health status and can be queried with
`docker inspect`. Such output should be kept short (only the first 4096 bytes
are stored currently).
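
For instance, the stored log of recent probe results, including their output,
can be read back with:

    $ docker inspect --format='{{json .State.Health.Log}}' <container>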

When the health status of a container changes, a `health_status` event is
generated with the new status. The health status is also displayed in the
`docker ps` output.
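
For example, once a container is marked unhealthy, `docker ps` shows it in the
STATUS column (output abridged; the ID and image name are illustrative):

    $ docker ps
    CONTAINER ID   IMAGE    ...   STATUS
    ab12cd34ef56   my-web   ...   Up 2 minutes (unhealthy)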

Signed-off-by: Thomas Leonard <thomas.leonard@docker.com>
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>

Thomas Leonard authored on 2016/04/18 18:48:13
Showing 27 changed files
... ...
@@ -17,7 +17,7 @@ type execBackend interface {
 	ContainerExecCreate(name string, config *types.ExecConfig) (string, error)
 	ContainerExecInspect(id string) (*backend.ExecInspect, error)
 	ContainerExecResize(name string, height, width int) error
-	ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
+	ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
 	ExecExists(name string) (bool, error)
 }
 
... ...
@@ -106,7 +106,8 @@ func (s *containerRouter) postContainerExecStart(ctx context.Context, w http.Res
 	}
 
 	// Now run the user process in container.
-	if err := s.backend.ContainerExecStart(execName, stdin, stdout, stderr); err != nil {
+	// Maybe we should pass ctx here if we're not detaching?
+	if err := s.backend.ContainerExecStart(context.Background(), execName, stdin, stdout, stderr); err != nil {
 		if execStartCheck.Detach {
 			return err
 		}
... ...
@@ -22,15 +22,16 @@ import (
 )
 
 var validCommitCommands = map[string]bool{
-	"cmd":        true,
-	"entrypoint": true,
-	"env":        true,
-	"expose":     true,
-	"label":      true,
-	"onbuild":    true,
-	"user":       true,
-	"volume":     true,
-	"workdir":    true,
+	"cmd":         true,
+	"entrypoint":  true,
+	"healthcheck": true,
+	"env":         true,
+	"expose":      true,
+	"label":       true,
+	"onbuild":     true,
+	"user":        true,
+	"volume":      true,
+	"workdir":     true,
 }
 
 // BuiltinAllowedBuildArgs is list of built-in allowed build args
... ...
@@ -3,40 +3,42 @@ package command
 
 // Define constants for the command strings
 const (
-	Env        = "env"
-	Label      = "label"
-	Maintainer = "maintainer"
-	Add        = "add"
-	Copy       = "copy"
-	From       = "from"
-	Onbuild    = "onbuild"
-	Workdir    = "workdir"
-	Run        = "run"
-	Cmd        = "cmd"
-	Entrypoint = "entrypoint"
-	Expose     = "expose"
-	Volume     = "volume"
-	User       = "user"
-	StopSignal = "stopsignal"
-	Arg        = "arg"
+	Env         = "env"
+	Label       = "label"
+	Maintainer  = "maintainer"
+	Add         = "add"
+	Copy        = "copy"
+	From        = "from"
+	Onbuild     = "onbuild"
+	Workdir     = "workdir"
+	Run         = "run"
+	Cmd         = "cmd"
+	Entrypoint  = "entrypoint"
+	Expose      = "expose"
+	Volume      = "volume"
+	User        = "user"
+	StopSignal  = "stopsignal"
+	Arg         = "arg"
+	Healthcheck = "healthcheck"
 )
 
 // Commands is list of all Dockerfile commands
 var Commands = map[string]struct{}{
-	Env:        {},
-	Label:      {},
-	Maintainer: {},
-	Add:        {},
-	Copy:       {},
-	From:       {},
-	Onbuild:    {},
-	Workdir:    {},
-	Run:        {},
-	Cmd:        {},
-	Entrypoint: {},
-	Expose:     {},
-	Volume:     {},
-	User:       {},
-	StopSignal: {},
-	Arg:        {},
+	Env:         {},
+	Label:       {},
+	Maintainer:  {},
+	Add:         {},
+	Copy:        {},
+	From:        {},
+	Onbuild:     {},
+	Workdir:     {},
+	Run:         {},
+	Cmd:         {},
+	Entrypoint:  {},
+	Expose:      {},
+	Volume:      {},
+	User:        {},
+	StopSignal:  {},
+	Arg:         {},
+	Healthcheck: {},
 }
... ...
@@ -12,7 +12,9 @@ import (
 	"regexp"
 	"runtime"
 	"sort"
+	"strconv"
 	"strings"
+	"time"
 
 	"github.com/Sirupsen/logrus"
 	"github.com/docker/docker/api"
... ...
@@ -426,6 +428,111 @@ func cmd(b *Builder, args []string, attributes map[string]bool, original string)
 	return nil
 }
 
+// parseOptInterval(flag) is the duration of flag.Value, or 0 if
+// empty. An error is reported if the value is given and is not positive.
+func parseOptInterval(f *Flag) (time.Duration, error) {
+	s := f.Value
+	if s == "" {
+		return 0, nil
+	}
+	d, err := time.ParseDuration(s)
+	if err != nil {
+		return 0, err
+	}
+	if d <= 0 {
+		return 0, fmt.Errorf("Interval %#v must be positive", f.name)
+	}
+	return d, nil
+}
+
+// HEALTHCHECK foo
+//
+// Set the default healthcheck command to run in the container (which may be empty).
+// Argument handling is the same as RUN.
+//
+func healthcheck(b *Builder, args []string, attributes map[string]bool, original string) error {
+	if len(args) == 0 {
+		return fmt.Errorf("HEALTHCHECK requires an argument")
+	}
+	typ := strings.ToUpper(args[0])
+	args = args[1:]
+	if typ == "NONE" {
+		if len(args) != 0 {
+			return fmt.Errorf("HEALTHCHECK NONE takes no arguments")
+		}
+		test := strslice.StrSlice{typ}
+		b.runConfig.Healthcheck = &container.HealthConfig{
+			Test: test,
+		}
+	} else {
+		if b.runConfig.Healthcheck != nil {
+			oldCmd := b.runConfig.Healthcheck.Test
+			if len(oldCmd) > 0 && oldCmd[0] != "NONE" {
+				fmt.Fprintf(b.Stdout, "Note: overriding previous HEALTHCHECK: %v\n", oldCmd)
+			}
+		}
+
+		healthcheck := container.HealthConfig{}
+
+		flInterval := b.flags.AddString("interval", "")
+		flTimeout := b.flags.AddString("timeout", "")
+		flRetries := b.flags.AddString("retries", "")
+
+		if err := b.flags.Parse(); err != nil {
+			return err
+		}
+
+		switch typ {
+		case "CMD":
+			cmdSlice := handleJSONArgs(args, attributes)
+			if len(cmdSlice) == 0 {
+				return fmt.Errorf("Missing command after HEALTHCHECK CMD")
+			}
+
+			if !attributes["json"] {
+				typ = "CMD-SHELL"
+			}
+
+			healthcheck.Test = strslice.StrSlice(append([]string{typ}, cmdSlice...))
+		default:
+			return fmt.Errorf("Unknown type %#v in HEALTHCHECK (try CMD)", typ)
+		}
+
+		interval, err := parseOptInterval(flInterval)
+		if err != nil {
+			return err
+		}
+		healthcheck.Interval = interval
+
+		timeout, err := parseOptInterval(flTimeout)
+		if err != nil {
+			return err
+		}
+		healthcheck.Timeout = timeout
+
+		if flRetries.Value != "" {
+			retries, err := strconv.ParseInt(flRetries.Value, 10, 32)
+			if err != nil {
+				return err
+			}
+			if retries < 1 {
+				return fmt.Errorf("--retries must be at least 1 (not %d)", retries)
+			}
+			healthcheck.Retries = int(retries)
+		} else {
+			healthcheck.Retries = 0
+		}
+
+		b.runConfig.Healthcheck = &healthcheck
+	}
+
+	if err := b.commit("", b.runConfig.Cmd, fmt.Sprintf("HEALTHCHECK %q", b.runConfig.Healthcheck)); err != nil {
+		return err
+	}
+
+	return nil
+}
+
 // ENTRYPOINT /usr/sbin/nginx
 //
 // Set the entrypoint (which defaults to sh -c on linux, or cmd /S /C on Windows) to
... ...
@@ -58,22 +58,23 @@ var evaluateTable map[string]func(*Builder, []string, map[string]bool, string) e
 
 func init() {
 	evaluateTable = map[string]func(*Builder, []string, map[string]bool, string) error{
-		command.Env:        env,
-		command.Label:      label,
-		command.Maintainer: maintainer,
-		command.Add:        add,
-		command.Copy:       dispatchCopy, // copy() is a go builtin
-		command.From:       from,
-		command.Onbuild:    onbuild,
-		command.Workdir:    workdir,
-		command.Run:        run,
-		command.Cmd:        cmd,
-		command.Entrypoint: entrypoint,
-		command.Expose:     expose,
-		command.Volume:     volume,
-		command.User:       user,
-		command.StopSignal: stopSignal,
-		command.Arg:        arg,
+		command.Env:         env,
+		command.Label:       label,
+		command.Maintainer:  maintainer,
+		command.Add:         add,
+		command.Copy:        dispatchCopy, // copy() is a go builtin
+		command.From:        from,
+		command.Onbuild:     onbuild,
+		command.Workdir:     workdir,
+		command.Run:         run,
+		command.Cmd:         cmd,
+		command.Entrypoint:  entrypoint,
+		command.Expose:      expose,
+		command.Volume:      volume,
+		command.User:        user,
+		command.StopSignal:  stopSignal,
+		command.Arg:         arg,
+		command.Healthcheck: healthcheck,
 	}
 }
 
... ...
@@ -329,3 +329,32 @@ func parseMaybeJSONToList(rest string) (*Node, map[string]bool, error) {
 
 	return parseStringsWhitespaceDelimited(rest)
 }
+
+// The HEALTHCHECK command is like parseMaybeJSON, but has an extra type argument.
+func parseHealthConfig(rest string) (*Node, map[string]bool, error) {
+	// Find end of first argument
+	var sep int
+	for ; sep < len(rest); sep++ {
+		if unicode.IsSpace(rune(rest[sep])) {
+			break
+		}
+	}
+	next := sep
+	for ; next < len(rest); next++ {
+		if !unicode.IsSpace(rune(rest[next])) {
+			break
+		}
+	}
+
+	if sep == 0 {
+		return nil, nil, nil
+	}
+
+	typ := rest[:sep]
+	cmd, attrs, err := parseMaybeJSON(rest[next:])
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return &Node{Value: typ, Next: cmd, Attributes: attrs}, nil, err
+}
... ...
@@ -66,22 +66,23 @@ func init() {
 	// functions. Errors are propagated up by Parse() and the resulting AST can
 	// be incorporated directly into the existing AST as a next.
 	dispatch = map[string]func(string) (*Node, map[string]bool, error){
-		command.User:       parseString,
-		command.Onbuild:    parseSubCommand,
-		command.Workdir:    parseString,
-		command.Env:        parseEnv,
-		command.Label:      parseLabel,
-		command.Maintainer: parseString,
-		command.From:       parseString,
-		command.Add:        parseMaybeJSONToList,
-		command.Copy:       parseMaybeJSONToList,
-		command.Run:        parseMaybeJSON,
-		command.Cmd:        parseMaybeJSON,
-		command.Entrypoint: parseMaybeJSON,
-		command.Expose:     parseStringsWhitespaceDelimited,
-		command.Volume:     parseMaybeJSONToList,
-		command.StopSignal: parseString,
-		command.Arg:        parseNameOrNameVal,
+		command.User:        parseString,
+		command.Onbuild:     parseSubCommand,
+		command.Workdir:     parseString,
+		command.Env:         parseEnv,
+		command.Label:       parseLabel,
+		command.Maintainer:  parseString,
+		command.From:        parseString,
+		command.Add:         parseMaybeJSONToList,
+		command.Copy:        parseMaybeJSONToList,
+		command.Run:         parseMaybeJSON,
+		command.Cmd:         parseMaybeJSON,
+		command.Entrypoint:  parseMaybeJSON,
+		command.Expose:      parseStringsWhitespaceDelimited,
+		command.Volume:      parseMaybeJSONToList,
+		command.StopSignal:  parseString,
+		command.Arg:         parseNameOrNameVal,
+		command.Healthcheck: parseHealthConfig,
 	}
 }
 
new file mode 100644
... ...
@@ -0,0 +1,10 @@
+FROM debian
+ADD check.sh main.sh /app/
+CMD /app/main.sh
+HEALTHCHECK
+HEALTHCHECK --interval=5s --timeout=3s --retries=1 \
+  CMD /app/check.sh --quiet
+HEALTHCHECK CMD
+HEALTHCHECK   CMD   a b
+HEALTHCHECK --timeout=3s CMD ["foo"]
+HEALTHCHECK CONNECT TCP 7000
new file mode 100644
... ...
@@ -0,0 +1,9 @@
+(from "debian")
+(add "check.sh" "main.sh" "/app/")
+(cmd "/app/main.sh")
+(healthcheck)
+(healthcheck ["--interval=5s" "--timeout=3s" "--retries=1"] "CMD" "/app/check.sh --quiet")
+(healthcheck "CMD")
+(healthcheck "CMD" "a b")
+(healthcheck ["--timeout=3s"] "CMD" "foo")
+(healthcheck "CONNECT" "TCP 7000")
new file mode 100644
... ...
@@ -0,0 +1,49 @@
+package container
+
+import (
+	"github.com/Sirupsen/logrus"
+	"github.com/docker/engine-api/types"
+)
+
+// Health holds the current container health-check state
+type Health struct {
+	types.Health
+	stop chan struct{} // Write struct{} to stop the monitor
+}
+
+// String returns a human-readable description of the health-check state
+func (s *Health) String() string {
+	if s.stop == nil {
+		return "no healthcheck"
+	}
+	switch s.Status {
+	case types.Starting:
+		return "health: starting"
+	default: // Healthy and Unhealthy are clear on their own
+		return s.Status
+	}
+}
+
+// OpenMonitorChannel creates and returns a new monitor channel. If there already is one,
+// it returns nil.
+func (s *Health) OpenMonitorChannel() chan struct{} {
+	if s.stop == nil {
+		logrus.Debugf("OpenMonitorChannel")
+		s.stop = make(chan struct{})
+		return s.stop
+	}
+	return nil
+}
+
+// CloseMonitorChannel closes any existing monitor channel.
+func (s *Health) CloseMonitorChannel() {
+	if s.stop != nil {
+		logrus.Debugf("CloseMonitorChannel: waiting for probe to stop")
+		// This channel does not buffer. Once the write succeeds, the monitor
+		// has read the stop request and will not make any further updates
+		// to c.State.Health.
+		s.stop <- struct{}{}
+		s.stop = nil
+		logrus.Debugf("CloseMonitorChannel done")
+	}
+}
... ...
@@ -27,6 +27,7 @@ type State struct {
 	StartedAt         time.Time
 	FinishedAt        time.Time
 	waitChan          chan struct{}
+	Health            *Health
 }
 
 // NewState creates a default state object with a fresh channel for state changes.
... ...
@@ -46,6 +47,9 @@ func (s *State) String() string {
 			return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
 		}
 
+		if h := s.Health; h != nil {
+			return fmt.Sprintf("Up %s (%s)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)), h.String())
+		}
 		return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
 	}
 
... ...
@@ -80,6 +80,25 @@ func merge(userConf, imageConf *containertypes.Config) error {
 			userConf.Entrypoint = imageConf.Entrypoint
 		}
 	}
+	if imageConf.Healthcheck != nil {
+		if userConf.Healthcheck == nil {
+			userConf.Healthcheck = imageConf.Healthcheck
+		} else {
+			if len(userConf.Healthcheck.Test) == 0 {
+				userConf.Healthcheck.Test = imageConf.Healthcheck.Test
+			}
+			if userConf.Healthcheck.Interval == 0 {
+				userConf.Healthcheck.Interval = imageConf.Healthcheck.Interval
+			}
+			if userConf.Healthcheck.Timeout == 0 {
+				userConf.Healthcheck.Timeout = imageConf.Healthcheck.Timeout
+			}
+			if userConf.Healthcheck.Retries == 0 {
+				userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries
+			}
+		}
+	}
+
 	if userConf.WorkingDir == "" {
 		userConf.WorkingDir = imageConf.WorkingDir
 	}
... ...
@@ -14,11 +14,15 @@ import (
 	"github.com/docker/docker/errors"
 	"github.com/docker/docker/libcontainerd"
 	"github.com/docker/docker/pkg/pools"
+	"github.com/docker/docker/pkg/signal"
 	"github.com/docker/docker/pkg/term"
 	"github.com/docker/engine-api/types"
 	"github.com/docker/engine-api/types/strslice"
 )
 
+// Seconds to wait after sending TERM before trying KILL
+const termProcessTimeout = 10
+
 func (d *Daemon) registerExecCommand(container *container.Container, config *exec.Config) {
 	// Storing execs in container in order to kill them gracefully whenever the container is stopped or removed.
 	container.ExecCommands.Add(config.ID, config)
... ...
@@ -130,7 +134,8 @@ func (d *Daemon) ContainerExecCreate(name string, config *types.ExecConfig) (str
 
 // ContainerExecStart starts a previously set up exec instance. The
 // std streams are set up.
-func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
+// If ctx is cancelled, the process is terminated.
+func (d *Daemon) ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
 	var (
 		cStdin           io.ReadCloser
 		cStdout, cStderr io.Writer
... ...
@@ -197,15 +202,28 @@ func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.
 		return nil
 	}
 
-	attachErr := container.AttachStreams(context.Background(), ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)
+	attachErr := container.AttachStreams(ctx, ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)
 
 	if err := d.containerd.AddProcess(c.ID, name, p); err != nil {
 		return err
 	}
 
-	err = <-attachErr
-	if err != nil {
-		return fmt.Errorf("attach failed with error: %v", err)
+	select {
+	case <-ctx.Done():
+		logrus.Debugf("Sending TERM signal to process %v in container %v", name, c.ID)
+		d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["TERM"]))
+		select {
+		case <-time.After(termProcessTimeout * time.Second):
+			logrus.Infof("Container %v, process %v failed to exit within %d seconds of signal TERM - using the force", c.ID, name, termProcessTimeout)
+			d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["KILL"]))
+		case <-attachErr:
+			// TERM signal worked
+		}
+		return fmt.Errorf("context cancelled")
+	case err := <-attachErr:
+		if err != nil {
+			return fmt.Errorf("attach failed with error: %v", err)
+		}
 	}
 	return nil
 }
new file mode 100644
... ...
@@ -0,0 +1,314 @@
+package daemon
+
+import (
+	"bytes"
+	"fmt"
+	"runtime"
+	"strings"
+	"time"
+
+	"golang.org/x/net/context"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/docker/docker/container"
+	"github.com/docker/docker/daemon/exec"
+	"github.com/docker/engine-api/types"
+	"github.com/docker/engine-api/types/strslice"
+)
+
+const (
+	// Longest healthcheck probe output message to store. Longer messages will be truncated.
+	maxOutputLen = 4096
+
+	// Default interval between probe runs (from the end of the first to the start of the second).
+	// Also the time before the first probe.
+	defaultProbeInterval = 30 * time.Second
+
+	// The maximum length of time a single probe run should take. If the probe takes longer
+	// than this, the check is considered to have failed.
+	defaultProbeTimeout = 30 * time.Second
+
+	// Shut down a container if it becomes Unhealthy.
+	defaultExitOnUnhealthy = true
+
+	// Maximum number of entries to record
+	maxLogEntries = 5
+)
+
+const (
+	// Exit status codes that can be returned by the probe command.
+
+	exitStatusHealthy   = 0 // Container is healthy
+	exitStatusUnhealthy = 1 // Container is unhealthy
+	exitStatusStarting  = 2 // Container needs more time to start
+)
+
+// probe implementations know how to run a particular type of probe.
+type probe interface {
+	// Perform one run of the check. Returns the exit code and an optional
+	// short diagnostic string.
+	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
+}
+
+// cmdProbe implements the "CMD" probe type.
+type cmdProbe struct {
+	// Run the command with the system's default shell instead of execing it directly.
+	shell bool
+}
+
+// exec the healthcheck command in the container.
+// Returns the exit code and probe output (if any)
+func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
+	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
+	if p.shell {
+		if runtime.GOOS != "windows" {
+			cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
+		} else {
+			cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
+		}
+	}
+	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
+	execConfig := exec.NewConfig()
+	execConfig.OpenStdin = false
+	execConfig.OpenStdout = true
+	execConfig.OpenStderr = true
+	execConfig.ContainerID = container.ID
+	execConfig.DetachKeys = []byte{}
+	execConfig.Entrypoint = entrypoint
+	execConfig.Args = args
+	execConfig.Tty = false
+	execConfig.Privileged = false
+	execConfig.User = container.Config.User
+
+	d.registerExecCommand(container, execConfig)
+	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
+
+	output := &limitedBuffer{}
+	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
+	if err != nil {
+		return nil, err
+	}
+	info, err := d.getExecConfig(execConfig.ID)
+	if err != nil {
+		return nil, err
+	}
+	if info.ExitCode == nil {
+		return nil, fmt.Errorf("Healthcheck has no exit code!")
+	}
+	// Note: Go's json package will handle invalid UTF-8 for us
+	out := output.String()
+	return &types.HealthcheckResult{
+		End:      time.Now(),
+		ExitCode: *info.ExitCode,
+		Output:   out,
+	}, nil
+}
+
+// Update the container's Status.Health struct based on the latest probe's result.
+func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
+	c.Lock()
+	defer c.Unlock()
+
+	retries := c.Config.Healthcheck.Retries
+	if retries <= 0 {
+		retries = 1 // Default if unset or set to an invalid value
+	}
+
+	h := c.State.Health
+	oldStatus := h.Status
+
+	if len(h.Log) >= maxLogEntries {
+		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
+	} else {
+		h.Log = append(h.Log, result)
+	}
+
+	if result.ExitCode == exitStatusHealthy {
+		h.FailingStreak = 0
+		h.Status = types.Healthy
+	} else if result.ExitCode == exitStatusStarting && c.State.Health.Status == types.Starting {
+		// The container is not ready yet. Remain in the starting state.
+	} else {
+		// Failure (including invalid exit code)
+		h.FailingStreak++
+		if c.State.Health.FailingStreak >= retries {
+			h.Status = types.Unhealthy
+		}
+		// Else we're starting or healthy. Stay in that state.
+	}
+
+	if oldStatus != h.Status {
+		d.LogContainerEvent(c, "health_status: "+h.Status)
+	}
+}
+
+// Run the container's monitoring thread until notified via "stop".
+// There is never more than one monitor thread running per container at a time.
+func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
+	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
+	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
+	for {
+		select {
+		case <-stop:
+			logrus.Debugf("Stop healthcheck monitoring (received while idle)")
+			return
+		case <-time.After(probeInterval):
+			logrus.Debugf("Running health check...")
+			startTime := time.Now()
+			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
+			results := make(chan *types.HealthcheckResult)
+			go func() {
+				result, err := probe.run(ctx, d, c)
+				if err != nil {
+					logrus.Warnf("Health check error: %v", err)
+					results <- &types.HealthcheckResult{
+						ExitCode: -1,
+						Output:   err.Error(),
+						Start:    startTime,
+						End:      time.Now(),
+					}
+				} else {
+					result.Start = startTime
+					logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
+					results <- result
+				}
+				close(results)
+			}()
+			select {
+			case <-stop:
+				logrus.Debugf("Stop healthcheck monitoring (received while probing)")
+				// Stop timeout and kill probe, but don't wait for probe to exit.
+				cancelProbe()
+				return
+			case result := <-results:
+				handleProbeResult(d, c, result)
+				// Stop timeout
+				cancelProbe()
+			case <-ctx.Done():
+				logrus.Debugf("Health check taking too long")
+				handleProbeResult(d, c, &types.HealthcheckResult{
+					ExitCode: -1,
+					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
+					Start:    startTime,
+					End:      time.Now(),
+				})
+				cancelProbe()
+				// Wait for probe to exit (it might take a while to respond to the TERM
+				// signal and we don't want dying probes to pile up).
+				<-results
+			}
+		}
+	}
+}
+
+// Get a suitable probe implementation for the container's healthcheck configuration.
+func getProbe(c *container.Container) probe {
+	config := c.Config.Healthcheck
+	if config == nil || len(config.Test) == 0 {
+		return nil
+	}
+	switch config.Test[0] {
+	case "CMD":
+		return &cmdProbe{shell: false}
+	case "CMD-SHELL":
+		return &cmdProbe{shell: true}
+	default:
+		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
+		return nil
+	}
+}
+
+// Ensure the health-check monitor is running or not, depending on the current
+// state of the container.
+// Called from monitor.go, with c locked.
+func (d *Daemon) updateHealthMonitor(c *container.Container) {
+	h := c.State.Health
+	if h == nil {
+		return // No healthcheck configured
+	}
+
+	probe := getProbe(c)
+	wantRunning := c.Running && !c.Paused && probe != nil
+	if wantRunning {
+		if stop := h.OpenMonitorChannel(); stop != nil {
+			go monitor(d, c, stop, probe)
+		}
+	} else {
+		h.CloseMonitorChannel()
+	}
+}
+
+// Reset the health state for a newly-started, restarted or restored container.
+// initHealthMonitor is called from monitor.go and we should never be running
+// two instances at once.
+// Called with c locked.
+func (d *Daemon) initHealthMonitor(c *container.Container) {
+	if c.Config.Healthcheck == nil {
+		return
+	}
+
+	// This is needed in case we're auto-restarting
+	d.stopHealthchecks(c)
+
+	if c.State.Health == nil {
+		h := &container.Health{}
+		h.Status = types.Starting
+		h.FailingStreak = 0
+		c.State.Health = h
+	}
+
+	d.updateHealthMonitor(c)
+}
+
+// Called when the container is being stopped (whether because the health check is
+// failing or for any other reason).
+func (d *Daemon) stopHealthchecks(c *container.Container) {
+	h := c.State.Health
+	if h != nil {
+		h.CloseMonitorChannel()
+	}
+}
+
+// Buffer up to maxOutputLen bytes. Further data is discarded.
+type limitedBuffer struct {
+	buf       bytes.Buffer
+	truncated bool // indicates that data has been lost
+}
+
+// Append to limitedBuffer while there is room.
+func (b *limitedBuffer) Write(data []byte) (int, error) {
+	bufLen := b.buf.Len()
+	dataLen := len(data)
+	keep := min(maxOutputLen-bufLen, dataLen)
+	if keep > 0 {
+		b.buf.Write(data[:keep])
+	}
+	if keep < dataLen {
+		b.truncated = true
+	}
+	return dataLen, nil
+}
+
+// The contents of the buffer, with "..." appended if it overflowed.
+func (b *limitedBuffer) String() string {
+	out := b.buf.String()
+	if b.truncated {
+		out = out + "..."
+	}
+	return out
+}
+
+// If configuredValue is zero, use defaultValue instead.
+func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
+	if configuredValue == 0 {
+		return defaultValue
+	}
+	return configuredValue
+}
+
+func min(x, y int) int {
+	if x < y {
+		return x
+	}
+	return y
+}
new file mode 100644
... ...
@@ -0,0 +1,112 @@
+package daemon
+
+import (
+	"testing"
+	"time"
+
+	"github.com/docker/docker/container"
+	"github.com/docker/docker/daemon/events"
+	"github.com/docker/engine-api/types"
+	containertypes "github.com/docker/engine-api/types/container"
+	eventtypes "github.com/docker/engine-api/types/events"
+)
+
+func reset(c *container.Container) {
+	c.State = &container.State{}
+	c.State.Health = &container.Health{}
+	c.State.Health.Status = types.Starting
+}
+
+func TestHealthStates(t *testing.T) {
+	e := events.New()
+	_, l, _ := e.Subscribe()
+	defer e.Evict(l)
+
+	expect := func(expected string) {
+		select {
+		case event := <-l:
+			ev := event.(eventtypes.Message)
+			if ev.Status != expected {
+				t.Errorf("Expecting event %#v, but got %#v\n", expected, ev.Status)
+			}
+		case <-time.After(1 * time.Second):
+			t.Errorf("Expecting event %#v, but got nothing\n", expected)
+		}
+	}
+
+	c := &container.Container{
+		CommonContainer: container.CommonContainer{
+			ID:   "container_id",
+			Name: "container_name",
+			Config: &containertypes.Config{
+				Image: "image_name",
+			},
+		},
+	}
+	daemon := &Daemon{
+		EventsService: e,
+	}
+
+	c.Config.Healthcheck = &containertypes.HealthConfig{
+		Retries: 1,
+	}
+
+	reset(c)
+
+	handleResult := func(startTime time.Time, exitCode int) {
+		handleProbeResult(daemon, c, &types.HealthcheckResult{
+			Start:    startTime,
+			End:      startTime,
+			ExitCode: exitCode,
+		})
+	}
+
+	// starting -> failed -> success -> failed
+
+	handleResult(c.State.StartedAt.Add(1*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	handleResult(c.State.StartedAt.Add(2*time.Second), 0)
+	expect("health_status: healthy")
+
+	handleResult(c.State.StartedAt.Add(3*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	// starting -> starting -> starting ->
+	// healthy -> starting (invalid transition)
+
+	reset(c)
+
+	handleResult(c.State.StartedAt.Add(20*time.Second), 2)
+	handleResult(c.State.StartedAt.Add(40*time.Second), 2)
+	if c.State.Health.Status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	}
+
+	handleResult(c.State.StartedAt.Add(50*time.Second), 0)
+	expect("health_status: healthy")
+	handleResult(c.State.StartedAt.Add(60*time.Second), 2)
+	expect("health_status: unhealthy")
+
+	// Test retries
+
+	reset(c)
+	c.Config.Healthcheck.Retries = 3
+
+	handleResult(c.State.StartedAt.Add(20*time.Second), 1)
+	handleResult(c.State.StartedAt.Add(40*time.Second), 1)
+	if c.State.Health.Status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	}
+	if c.State.Health.FailingStreak != 2 {
+		t.Errorf("Expecting FailingStreak=2, but got %d\n", c.State.Health.FailingStreak)
+	}
+	handleResult(c.State.StartedAt.Add(60*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	handleResult(c.State.StartedAt.Add(80*time.Second), 0)
+	expect("health_status: healthy")
+	if c.State.Health.FailingStreak != 0 {
+		t.Errorf("Expecting FailingStreak=0, but got %d\n", c.State.Health.FailingStreak)
+	}
+}
... ...
@@ -108,6 +108,15 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
 		hostConfig.Links = append(hostConfig.Links, fmt.Sprintf("%s:%s", child.Name, linkAlias))
 	}
 
+	var containerHealth *types.Health
+	if container.State.Health != nil {
+		containerHealth = &types.Health{
+			Status:        container.State.Health.Status,
+			FailingStreak: container.State.Health.FailingStreak,
+			Log:           append([]*types.HealthcheckResult{}, container.State.Health.Log...),
+		}
+	}
+
 	containerState := &types.ContainerState{
 		Status:     container.State.StateString(),
 		Running:    container.State.Running,
... ...
@@ -120,6 +129,7 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
 		Error:      container.State.Error,
 		StartedAt:  container.State.StartedAt.Format(time.RFC3339Nano),
 		FinishedAt: container.State.FinishedAt.Format(time.RFC3339Nano),
+		Health:     containerHealth,
 	}
 
 	contJSONBase := &types.ContainerJSONBase{
... ...
@@ -25,6 +25,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 		if runtime.GOOS == "windows" {
 			return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.")
 		}
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "oom")
 	case libcontainerd.StateExit:
 		c.Lock()
... ...
@@ -35,6 +36,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 		attributes := map[string]string{
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 		}
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
 		daemon.Cleanup(c)
 		// FIXME: here is race condition between two RUN instructions in Dockerfile
... ...
@@ -54,6 +56,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 		}
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
+		daemon.updateHealthMonitor(c)
 		return c.ToDisk()
 	case libcontainerd.StateExitProcess:
 		c.Lock()
... ...
@@ -74,18 +77,24 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 			logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e)
 		}
 	case libcontainerd.StateStart, libcontainerd.StateRestore:
+		// Container is already locked in this case
 		c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart)
 		c.HasBeenManuallyStopped = false
 		if err := c.ToDisk(); err != nil {
 			c.Reset(false)
 			return err
 		}
+		daemon.initHealthMonitor(c)
 		daemon.LogContainerEvent(c, "start")
 	case libcontainerd.StatePause:
+		// Container is already locked in this case
 		c.Paused = true
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "pause")
 	case libcontainerd.StateResume:
+		// Container is already locked in this case
 		c.Paused = false
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "unpause")
 	}
 
... ...
@@ -41,6 +41,8 @@ func (daemon *Daemon) containerStop(container *container.Container, seconds int)
 		return nil
 	}
 
+	daemon.stopHealthchecks(container)
+
 	stopSignal := container.StopSignal()
 	// 1. Send a stop signal
 	if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
... ...
@@ -1470,6 +1470,73 @@ The `STOPSIGNAL` instruction sets the system call signal that will be sent to th
 This signal can be a valid unsigned number that matches a position in the kernel's syscall table, for instance 9,
 or a signal name in the format SIGNAME, for instance SIGKILL.
 
+## HEALTHCHECK
+
+The `HEALTHCHECK` instruction has two forms:
+
+* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
+* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)
+
+The `HEALTHCHECK` instruction tells Docker how to test a container to check that
+it is still working. This can detect cases such as a web server that is stuck in
+an infinite loop and unable to handle new connections, even though the server
+process is still running.
+
+When a container has a healthcheck specified, it has a _health status_ in
+addition to its normal status. This status is initially `starting`. Whenever a
+health check passes, it becomes `healthy` (whatever state it was previously in).
+After a certain number of consecutive failures, it becomes `unhealthy`.
+
+The options that can appear before `CMD` are:
+
+* `--interval=DURATION` (default: `30s`)
+* `--timeout=DURATION` (default: `30s`)
+* `--retries=N` (default: `1`)
+
+The health check will first run **interval** seconds after the container is
+started, and then again **interval** seconds after each previous check completes.
+
+If a single run of the check takes longer than **timeout** seconds then the check
+is considered to have failed.
+
+It takes **retries** consecutive failures of the health check for the container
+to be considered `unhealthy`.
+
+There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
+more than one then only the last `HEALTHCHECK` will take effect.
+
+The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
+CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
+see e.g. `ENTRYPOINT` for details).
+
+The command's exit status indicates the health status of the container.
+The possible values are:
+
+- 0: success - the container is healthy and ready for use
+- 1: unhealthy - the container is not working correctly
+- 2: starting - the container is not ready for use yet, but is working correctly
+
+If the probe returns 2 ("starting") when the container has already moved out of the
+"starting" state then it is treated as "unhealthy" instead.
+
+For example, to check every five minutes or so that a web-server is able to
+serve the site's main page within three seconds:
+
+    HEALTHCHECK --interval=5m --timeout=3s \
+      CMD curl -f http://localhost/ || exit 1
+
+To help debug failing probes, any output text (UTF-8 encoded) that the command writes
+on stdout or stderr will be stored in the health status and can be queried with
+`docker inspect`. Such output should be kept short (only the first 4096 bytes
+are stored currently).
+
+When the health status of a container changes, a `health_status` event is
+generated with the new status.
+
+The `HEALTHCHECK` feature was added in Docker 1.12.
+
+
+
 ## Dockerfile examples
 
 Below you can see some examples of Dockerfile syntax. If you're interested in
... ...
@@ -1250,6 +1250,7 @@ Dockerfile instruction and how the operator can override that setting.
     #entrypoint-default-command-to-execute-at-runtime)
 - [EXPOSE (Incoming Ports)](#expose-incoming-ports)
 - [ENV (Environment Variables)](#env-environment-variables)
+ - [HEALTHCHECK](#healthcheck)
 - [VOLUME (Shared Filesystems)](#volume-shared-filesystems)
 - [USER](#user)
 - [WORKDIR](#workdir)
... ...
@@ -1398,6 +1399,65 @@ above, or already defined by the developer with a Dockerfile `ENV`:
 
 Similarly the operator can set the **hostname** with `-h`.
 
+### HEALTHCHECK
+
+```
+  --health-cmd            Command to run to check health
+  --health-interval       Time between running the check
+  --health-retries        Consecutive failures needed to report unhealthy
+  --health-timeout        Maximum time to allow one check to run
+  --no-healthcheck        Disable any container-specified HEALTHCHECK
+```
+
+Example:
+
+    $ docker run --name=test -d \
+        --health-cmd='stat /etc/passwd || exit 1' \
+        --health-interval=2s \
+        busybox sleep 1d
+    $ sleep 2; docker inspect --format='{{.State.Health.Status}}' test
+    healthy
+    $ docker exec test rm /etc/passwd
+    $ sleep 2; docker inspect --format='{{json .State.Health}}' test
+    {
+      "Status": "unhealthy",
+      "FailingStreak": 3,
+      "Log": [
+        {
+          "Start": "2016-05-25T17:22:04.635478668Z",
+          "End": "2016-05-25T17:22:04.7272552Z",
+          "ExitCode": 0,
+          "Output": "  File: /etc/passwd\n  Size: 334       \tBlocks: 8          IO Block: 4096   regular file\nDevice: 32h/50d\tInode: 12          Links: 1\nAccess: (0664/-rw-rw-r--)  Uid: (    0/    root)   Gid: (    0/    root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
+        },
+        {
+          "Start": "2016-05-25T17:22:06.732900633Z",
+          "End": "2016-05-25T17:22:06.822168935Z",
+          "ExitCode": 0,
+          "Output": "  File: /etc/passwd\n  Size: 334       \tBlocks: 8          IO Block: 4096   regular file\nDevice: 32h/50d\tInode: 12          Links: 1\nAccess: (0664/-rw-rw-r--)  Uid: (    0/    root)   Gid: (    0/    root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
+        },
+        {
+          "Start": "2016-05-25T17:22:08.823956535Z",
+          "End": "2016-05-25T17:22:08.897359124Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        },
+        {
+          "Start": "2016-05-25T17:22:10.898802931Z",
+          "End": "2016-05-25T17:22:10.969631866Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        },
+        {
+          "Start": "2016-05-25T17:22:12.971033523Z",
+          "End": "2016-05-25T17:22:13.082015516Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        }
+      ]
+    }
+
+The health status is also displayed in the `docker ps` output.
+
 ### TMPFS (mount tmpfs filesystems)
 
 ```bash
new file mode 100644
... ...
@@ -0,0 +1,154 @@
+package main
+
+import (
+	"encoding/json"
+	"github.com/docker/docker/pkg/integration/checker"
+	"github.com/docker/engine-api/types"
+	"github.com/go-check/check"
+	"strconv"
+	"strings"
+	"time"
+)
+
+func waitForStatus(c *check.C, name string, prev string, expected string) {
+	prev = prev + "\n"
+	expected = expected + "\n"
+	for {
+		out, _ := dockerCmd(c, "inspect", "--format={{.State.Status}}", name)
+		if out == expected {
+			return
+		}
+		c.Check(out, checker.Equals, prev)
+		if out != prev {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func waitForHealthStatus(c *check.C, name string, prev string, expected string) {
+	prev = prev + "\n"
+	expected = expected + "\n"
+	for {
+		out, _ := dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
+		if out == expected {
+			return
+		}
+		c.Check(out, checker.Equals, prev)
+		if out != prev {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func getHealth(c *check.C, name string) *types.Health {
+	out, _ := dockerCmd(c, "inspect", "--format={{json .State.Health}}", name)
+	var health types.Health
+	err := json.Unmarshal([]byte(out), &health)
+	c.Check(err, checker.Equals, nil)
+	return &health
+}
+
+func (s *DockerSuite) TestHealth(c *check.C) {
+	testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows
+
+	imageName := "testhealth"
+	_, err := buildImage(imageName,
+		`FROM busybox
+		RUN echo OK > /status
+		CMD ["/bin/sleep", "120"]
+		STOPSIGNAL SIGKILL
+		HEALTHCHECK --interval=1s --timeout=30s \
+		  CMD cat /status`,
+		true)
+
+	c.Check(err, check.IsNil)
+
+	// No health status before starting
+	name := "test_health"
+	dockerCmd(c, "create", "--name", name, imageName)
+	out, _ := dockerCmd(c, "ps", "-a", "--format={{.Status}}")
+	c.Check(out, checker.Equals, "Created\n")
+
+	// Inspect the options
+	out, _ = dockerCmd(c, "inspect",
+		"--format='timeout={{.Config.Healthcheck.Timeout}} "+
+			"interval={{.Config.Healthcheck.Interval}} "+
+			"retries={{.Config.Healthcheck.Retries}} "+
+			"test={{.Config.Healthcheck.Test}}'", name)
+	c.Check(out, checker.Equals, "timeout=30s interval=1s retries=0 test=[CMD-SHELL cat /status]\n")
+
+	// Start
+	dockerCmd(c, "start", name)
+	waitForHealthStatus(c, name, "starting", "healthy")
+
+	// Make it fail
+	dockerCmd(c, "exec", name, "rm", "/status")
+	waitForHealthStatus(c, name, "healthy", "unhealthy")
+
+	// Inspect the status
+	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
+	c.Check(out, checker.Equals, "unhealthy\n")
+
+	// Make it healthy again
+	dockerCmd(c, "exec", name, "touch", "/status")
+	waitForHealthStatus(c, name, "unhealthy", "healthy")
+
+	// Remove container
+	dockerCmd(c, "rm", "-f", name)
+
+	// Disable the check from the CLI
+	out, _ = dockerCmd(c, "create", "--name=noh", "--no-healthcheck", imageName)
+	out, _ = dockerCmd(c, "inspect", "--format={{.Config.Healthcheck.Test}}", "noh")
+	c.Check(out, checker.Equals, "[NONE]\n")
+	dockerCmd(c, "rm", "noh")
+
+	// Disable the check with a new build
+	_, err = buildImage("no_healthcheck",
+		`FROM testhealth
+		HEALTHCHECK NONE`, true)
+	c.Check(err, check.IsNil)
+
+	out, _ = dockerCmd(c, "inspect", "--format={{.ContainerConfig.Healthcheck.Test}}", "no_healthcheck")
+	c.Check(out, checker.Equals, "[NONE]\n")
+
+	// Enable the checks from the CLI
+	_, _ = dockerCmd(c, "run", "-d", "--name=fatal_healthcheck",
+		"--health-interval=0.5s",
+		"--health-retries=3",
+		"--health-cmd=cat /status",
+		"no_healthcheck")
+	waitForHealthStatus(c, "fatal_healthcheck", "starting", "healthy")
+	health := getHealth(c, "fatal_healthcheck")
+	c.Check(health.Status, checker.Equals, "healthy")
+	c.Check(health.FailingStreak, checker.Equals, 0)
+	last := health.Log[len(health.Log)-1]
+	c.Check(last.ExitCode, checker.Equals, 0)
+	c.Check(last.Output, checker.Equals, "OK\n")
+
+	// Fail the check, which should now make it exit
+	dockerCmd(c, "exec", "fatal_healthcheck", "rm", "/status")
+	waitForStatus(c, "fatal_healthcheck", "running", "exited")
+
+	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", "fatal_healthcheck")
+	c.Check(out, checker.Equals, "unhealthy\n")
+	failsStr, _ := dockerCmd(c, "inspect", "--format={{.State.Health.FailingStreak}}", "fatal_healthcheck")
+	fails, err := strconv.Atoi(strings.TrimSpace(failsStr))
+	c.Check(err, check.IsNil)
+	c.Check(fails >= 3, checker.Equals, true)
+	dockerCmd(c, "rm", "-f", "fatal_healthcheck")
+
+	// Check timeout
+	// Note: if the interval is too small, it seems that Docker spends all its time running health
+	// checks and never gets around to killing it.
+	_, _ = dockerCmd(c, "run", "-d", "--name=test",
+		"--health-interval=1s", "--health-cmd=sleep 5m", "--health-timeout=1ms", imageName)
+	waitForHealthStatus(c, "test", "starting", "unhealthy")
+	health = getHealth(c, "test")
+	last = health.Log[len(health.Log)-1]
+	c.Check(health.Status, checker.Equals, "unhealthy")
+	c.Check(last.ExitCode, checker.Equals, -1)
+	c.Check(last.Output, checker.Equals, "Health check exceeded timeout (1ms)")
+	dockerCmd(c, "rm", "-f", "test")
+}
... ...
@@ -190,6 +190,17 @@ func (clnt *client) Signal(containerID string, sig int) error {
 	return err
 }
 
+func (clnt *client) SignalProcess(containerID string, pid string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	_, err := clnt.remote.apiClient.Signal(context.Background(), &containerd.SignalRequest{
+		Id:     containerID,
+		Pid:    pid,
+		Signal: uint32(sig),
+	})
+	return err
+}
+
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
 	clnt.lock(containerID)
 	defer clnt.unlock(containerID)
... ...
@@ -304,6 +304,25 @@ func (clnt *client) Signal(containerID string, sig int) error {
 	return nil
 }
 
+// While Linux has support for the full range of signals, signals aren't really implemented on Windows.
+// We try to terminate the specified process whatever signal is requested.
+func (clnt *client) SignalProcess(containerID string, processFriendlyName string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	cont, err := clnt.getContainer(containerID)
+	if err != nil {
+		return err
+	}
+
+	for _, p := range cont.processes {
+		if p.friendlyName == processFriendlyName {
+			return hcsshim.TerminateProcessInComputeSystem(containerID, p.systemPid)
+		}
+	}
+
+	return fmt.Errorf("SignalProcess could not find process %s in %s", processFriendlyName, containerID)
+}
+
 // Resize handles a CLI event to resize an interactive docker run or docker exec
 // window.
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
... ...
@@ -34,6 +34,7 @@ type Backend interface {
 type Client interface {
 	Create(containerID string, spec Spec, options ...CreateOption) error
 	Signal(containerID string, sig int) error
+	SignalProcess(containerID string, processFriendlyName string, sig int) error
 	AddProcess(containerID, processFriendlyName string, process Process) error
 	Resize(containerID, processFriendlyName string, width, height int) error
 	Pause(containerID string) error
... ...
@@ -100,6 +100,12 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		flStopSignal        = cmd.String([]string{"-stop-signal"}, signal.DefaultStopSignal, fmt.Sprintf("Signal to stop a container, %v by default", signal.DefaultStopSignal))
 		flIsolation         = cmd.String([]string{"-isolation"}, "", "Container isolation technology")
 		flShmSize           = cmd.String([]string{"-shm-size"}, "", "Size of /dev/shm, default value is 64MB")
+		// Healthcheck
+		flNoHealthcheck  = cmd.Bool([]string{"-no-healthcheck"}, false, "Disable any container-specified HEALTHCHECK")
+		flHealthCmd      = cmd.String([]string{"-health-cmd"}, "", "Command to run to check health")
+		flHealthInterval = cmd.Duration([]string{"-health-interval"}, 0, "Time between running the check")
+		flHealthTimeout  = cmd.Duration([]string{"-health-timeout"}, 0, "Maximum time to allow one check to run")
+		flHealthRetries  = cmd.Int([]string{"-health-retries"}, 0, "Consecutive failures needed to report unhealthy")
 	)
 
 	cmd.Var(&flAttach, []string{"a", "-attach"}, "Attach to STDIN, STDOUT or STDERR")
... ...
@@ -351,6 +357,39 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		return nil, nil, nil, cmd, err
 	}
 
+	// Healthcheck
+	var healthConfig *container.HealthConfig
+	haveHealthSettings := *flHealthCmd != "" ||
+		*flHealthInterval != 0 ||
+		*flHealthTimeout != 0 ||
+		*flHealthRetries != 0
+	if *flNoHealthcheck {
+		if haveHealthSettings {
+			return nil, nil, nil, cmd, fmt.Errorf("--no-healthcheck conflicts with --health-* options")
+		}
+		test := strslice.StrSlice{"NONE"}
+		healthConfig = &container.HealthConfig{Test: test}
+	} else if haveHealthSettings {
+		var probe strslice.StrSlice
+		if *flHealthCmd != "" {
+			args := []string{"CMD-SHELL", *flHealthCmd}
+			probe = strslice.StrSlice(args)
+		}
+		if *flHealthInterval < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-interval cannot be negative")
+		}
+		if *flHealthTimeout < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-timeout cannot be negative")
+		}
+
+		healthConfig = &container.HealthConfig{
+			Test:     probe,
+			Interval: *flHealthInterval,
+			Timeout:  *flHealthTimeout,
+			Retries:  *flHealthRetries,
+		}
+	}
+
 	resources := container.Resources{
 		CgroupParent:         *flCgroupParent,
 		Memory:               flMemory,
... ...
@@ -399,6 +438,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		Entrypoint:      entrypoint,
 		WorkingDir:      *flWorkingDir,
 		Labels:          ConvertKVStringsToMap(labels),
+		Healthcheck:     healthConfig,
 	}
 	if cmd.IsSet("-stop-signal") {
 		config.StopSignal = *flStopSignal
... ...
@@ -9,6 +9,7 @@ import (
 	"runtime"
 	"strings"
 	"testing"
+	"time"
 
 	flag "github.com/docker/docker/pkg/mflag"
 	"github.com/docker/docker/runconfig"
... ...
@@ -584,6 +585,45 @@ func TestParseRestartPolicy(t *testing.T) {
 	}
 }
 
+func TestParseHealth(t *testing.T) {
+	checkOk := func(args ...string) *container.HealthConfig {
+		config, _, _, _, err := parseRun(args)
+		if err != nil {
+			t.Fatalf("%#v: %v", args, err)
+		}
+		return config.Healthcheck
+	}
+	checkError := func(expected string, args ...string) {
+		config, _, _, _, err := parseRun(args)
+		if err == nil {
+			t.Fatalf("Expected error, but got %#v", config)
+		}
+		if err.Error() != expected {
+			t.Fatalf("Expected %#v, got %#v", expected, err)
+		}
+	}
+	health := checkOk("--no-healthcheck", "img", "cmd")
+	if health == nil || len(health.Test) != 1 || health.Test[0] != "NONE" {
+		t.Fatalf("--no-healthcheck failed: %#v", health)
+	}
+
+	health = checkOk("--health-cmd=/check.sh -q", "img", "cmd")
+	if len(health.Test) != 2 || health.Test[0] != "CMD-SHELL" || health.Test[1] != "/check.sh -q" {
+		t.Fatalf("--health-cmd: got %#v", health.Test)
+	}
+	if health.Timeout != 0 {
+		t.Fatalf("--health-cmd: timeout = %f", health.Timeout)
+	}
+
+	checkError("--no-healthcheck conflicts with --health-* options",
+		"--no-healthcheck", "--health-cmd=/check.sh -q", "img", "cmd")
+
+	health = checkOk("--health-timeout=2s", "--health-retries=3", "--health-interval=4.5s", "img", "cmd")
+	if health.Timeout != 2*time.Second || health.Retries != 3 || health.Interval != 4500*time.Millisecond {
+		t.Fatalf("--health-*: got %#v", health)
+	}
+}
+
 func TestParseLoggingOpts(t *testing.T) {
 	// logging opts ko
 	if _, _, _, _, err := parseRun([]string{"--log-driver=none", "--log-opt=anything", "img", "cmd"}); err == nil || err.Error() != "invalid logging opts for driver none" {