This PR adds support for user-defined health-check probes for Docker
containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax plus
some corresponding "docker run" options. It can be used with a restart policy
to automatically restart a container if the check fails.
The `HEALTHCHECK` instruction has two forms:
* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)
The `HEALTHCHECK` instruction tells Docker how to test a container to check that
it is still working. This can detect cases such as a web server that is stuck in
an infinite loop and unable to handle new connections, even though the server
process is still running.
When a container has a healthcheck specified, it has a _health status_ in
addition to its normal status. This status is initially `starting`. Whenever a
health check passes, it becomes `healthy` (whatever state it was previously in).
After a certain number of consecutive failures, it becomes `unhealthy`.
The options that can appear before `CMD` are:
* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)
The health check will first run **interval** seconds after the container is
started, and then again **interval** seconds after each previous check completes.
If a single run of the check takes longer than **timeout** seconds then the check
is considered to have failed.
It takes **retries** consecutive failures of the health check for the container
to be considered `unhealthy`.
There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
more than one then only the last `HEALTHCHECK` will take effect.
The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
see e.g. `ENTRYPOINT` for details).
The command's exit status indicates the health status of the container.
The possible values are:
- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working correctly
If the probe returns 2 ("starting") when the container has already moved out of the
"starting" state then it is treated as "unhealthy" instead.
For example, to check every five minutes or so that a web-server is able to
serve the site's main page within three seconds:
HEALTHCHECK --interval=5m --timeout=3s \
CMD curl -f http://localhost/ || exit 1
To help debug failing probes, any output text (UTF-8 encoded) that the command writes
on stdout or stderr will be stored in the health status and can be queried with
`docker inspect`. Such output should be kept short (only the first 4096 bytes
are stored currently).
When the health status of a container changes, a `health_status` event is
generated with the new status. The health status is also displayed in the
`docker ps` output.
Signed-off-by: Thomas Leonard <thomas.leonard@docker.com>
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
| ... | ... |
@@ -17,7 +17,7 @@ type execBackend interface {
|
| 17 | 17 |
ContainerExecCreate(name string, config *types.ExecConfig) (string, error) |
| 18 | 18 |
ContainerExecInspect(id string) (*backend.ExecInspect, error) |
| 19 | 19 |
ContainerExecResize(name string, height, width int) error |
| 20 |
- ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error |
|
| 20 |
+ ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error |
|
| 21 | 21 |
ExecExists(name string) (bool, error) |
| 22 | 22 |
} |
| 23 | 23 |
|
| ... | ... |
@@ -106,7 +106,8 @@ func (s *containerRouter) postContainerExecStart(ctx context.Context, w http.Res |
| 106 | 106 |
} |
| 107 | 107 |
|
| 108 | 108 |
// Now run the user process in container. |
| 109 |
- if err := s.backend.ContainerExecStart(execName, stdin, stdout, stderr); err != nil {
|
|
| 109 |
+ // Maybe we should pass ctx here if we're not detaching? |
|
| 110 |
+ if err := s.backend.ContainerExecStart(context.Background(), execName, stdin, stdout, stderr); err != nil {
|
|
| 110 | 111 |
if execStartCheck.Detach {
|
| 111 | 112 |
return err |
| 112 | 113 |
} |
| ... | ... |
@@ -22,15 +22,16 @@ import ( |
| 22 | 22 |
) |
| 23 | 23 |
|
| 24 | 24 |
var validCommitCommands = map[string]bool{
|
| 25 |
- "cmd": true, |
|
| 26 |
- "entrypoint": true, |
|
| 27 |
- "env": true, |
|
| 28 |
- "expose": true, |
|
| 29 |
- "label": true, |
|
| 30 |
- "onbuild": true, |
|
| 31 |
- "user": true, |
|
| 32 |
- "volume": true, |
|
| 33 |
- "workdir": true, |
|
| 25 |
+ "cmd": true, |
|
| 26 |
+ "entrypoint": true, |
|
| 27 |
+ "healthcheck": true, |
|
| 28 |
+ "env": true, |
|
| 29 |
+ "expose": true, |
|
| 30 |
+ "label": true, |
|
| 31 |
+ "onbuild": true, |
|
| 32 |
+ "user": true, |
|
| 33 |
+ "volume": true, |
|
| 34 |
+ "workdir": true, |
|
| 34 | 35 |
} |
| 35 | 36 |
|
| 36 | 37 |
// BuiltinAllowedBuildArgs is list of built-in allowed build args |
| ... | ... |
@@ -3,40 +3,42 @@ package command |
| 3 | 3 |
|
| 4 | 4 |
// Define constants for the command strings |
| 5 | 5 |
const ( |
| 6 |
- Env = "env" |
|
| 7 |
- Label = "label" |
|
| 8 |
- Maintainer = "maintainer" |
|
| 9 |
- Add = "add" |
|
| 10 |
- Copy = "copy" |
|
| 11 |
- From = "from" |
|
| 12 |
- Onbuild = "onbuild" |
|
| 13 |
- Workdir = "workdir" |
|
| 14 |
- Run = "run" |
|
| 15 |
- Cmd = "cmd" |
|
| 16 |
- Entrypoint = "entrypoint" |
|
| 17 |
- Expose = "expose" |
|
| 18 |
- Volume = "volume" |
|
| 19 |
- User = "user" |
|
| 20 |
- StopSignal = "stopsignal" |
|
| 21 |
- Arg = "arg" |
|
| 6 |
+ Env = "env" |
|
| 7 |
+ Label = "label" |
|
| 8 |
+ Maintainer = "maintainer" |
|
| 9 |
+ Add = "add" |
|
| 10 |
+ Copy = "copy" |
|
| 11 |
+ From = "from" |
|
| 12 |
+ Onbuild = "onbuild" |
|
| 13 |
+ Workdir = "workdir" |
|
| 14 |
+ Run = "run" |
|
| 15 |
+ Cmd = "cmd" |
|
| 16 |
+ Entrypoint = "entrypoint" |
|
| 17 |
+ Expose = "expose" |
|
| 18 |
+ Volume = "volume" |
|
| 19 |
+ User = "user" |
|
| 20 |
+ StopSignal = "stopsignal" |
|
| 21 |
+ Arg = "arg" |
|
| 22 |
+ Healthcheck = "healthcheck" |
|
| 22 | 23 |
) |
| 23 | 24 |
|
| 24 | 25 |
// Commands is list of all Dockerfile commands |
| 25 | 26 |
var Commands = map[string]struct{}{
|
| 26 |
- Env: {},
|
|
| 27 |
- Label: {},
|
|
| 28 |
- Maintainer: {},
|
|
| 29 |
- Add: {},
|
|
| 30 |
- Copy: {},
|
|
| 31 |
- From: {},
|
|
| 32 |
- Onbuild: {},
|
|
| 33 |
- Workdir: {},
|
|
| 34 |
- Run: {},
|
|
| 35 |
- Cmd: {},
|
|
| 36 |
- Entrypoint: {},
|
|
| 37 |
- Expose: {},
|
|
| 38 |
- Volume: {},
|
|
| 39 |
- User: {},
|
|
| 40 |
- StopSignal: {},
|
|
| 41 |
- Arg: {},
|
|
| 27 |
+ Env: {},
|
|
| 28 |
+ Label: {},
|
|
| 29 |
+ Maintainer: {},
|
|
| 30 |
+ Add: {},
|
|
| 31 |
+ Copy: {},
|
|
| 32 |
+ From: {},
|
|
| 33 |
+ Onbuild: {},
|
|
| 34 |
+ Workdir: {},
|
|
| 35 |
+ Run: {},
|
|
| 36 |
+ Cmd: {},
|
|
| 37 |
+ Entrypoint: {},
|
|
| 38 |
+ Expose: {},
|
|
| 39 |
+ Volume: {},
|
|
| 40 |
+ User: {},
|
|
| 41 |
+ StopSignal: {},
|
|
| 42 |
+ Arg: {},
|
|
| 43 |
+ Healthcheck: {},
|
|
| 42 | 44 |
} |
| ... | ... |
@@ -12,7 +12,9 @@ import ( |
| 12 | 12 |
"regexp" |
| 13 | 13 |
"runtime" |
| 14 | 14 |
"sort" |
| 15 |
+ "strconv" |
|
| 15 | 16 |
"strings" |
| 17 |
+ "time" |
|
| 16 | 18 |
|
| 17 | 19 |
"github.com/Sirupsen/logrus" |
| 18 | 20 |
"github.com/docker/docker/api" |
| ... | ... |
@@ -426,6 +428,111 @@ func cmd(b *Builder, args []string, attributes map[string]bool, original string) |
| 426 | 426 |
return nil |
| 427 | 427 |
} |
| 428 | 428 |
|
| 429 |
+// parseOptInterval(flag) is the duration of flag.Value, or 0 if |
|
| 430 |
+// empty. An error is reported if the value is given and is not positive. |
|
| 431 |
+func parseOptInterval(f *Flag) (time.Duration, error) {
|
|
| 432 |
+ s := f.Value |
|
| 433 |
+ if s == "" {
|
|
| 434 |
+ return 0, nil |
|
| 435 |
+ } |
|
| 436 |
+ d, err := time.ParseDuration(s) |
|
| 437 |
+ if err != nil {
|
|
| 438 |
+ return 0, err |
|
| 439 |
+ } |
|
| 440 |
+ if d <= 0 {
|
|
| 441 |
+ return 0, fmt.Errorf("Interval %#v must be positive", f.name)
|
|
| 442 |
+ } |
|
| 443 |
+ return d, nil |
|
| 444 |
+} |
|
| 445 |
+ |
|
| 446 |
+// HEALTHCHECK foo |
|
| 447 |
+// |
|
| 448 |
+// Set the default healthcheck command to run in the container (which may be empty). |
|
| 449 |
+// Argument handling is the same as RUN. |
|
| 450 |
+// |
|
| 451 |
+func healthcheck(b *Builder, args []string, attributes map[string]bool, original string) error {
|
|
| 452 |
+ if len(args) == 0 {
|
|
| 453 |
+ return fmt.Errorf("HEALTHCHECK requires an argument")
|
|
| 454 |
+ } |
|
| 455 |
+ typ := strings.ToUpper(args[0]) |
|
| 456 |
+ args = args[1:] |
|
| 457 |
+ if typ == "NONE" {
|
|
| 458 |
+ if len(args) != 0 {
|
|
| 459 |
+ return fmt.Errorf("HEALTHCHECK NONE takes no arguments")
|
|
| 460 |
+ } |
|
| 461 |
+ test := strslice.StrSlice{typ}
|
|
| 462 |
+ b.runConfig.Healthcheck = &container.HealthConfig{
|
|
| 463 |
+ Test: test, |
|
| 464 |
+ } |
|
| 465 |
+ } else {
|
|
| 466 |
+ if b.runConfig.Healthcheck != nil {
|
|
| 467 |
+ oldCmd := b.runConfig.Healthcheck.Test |
|
| 468 |
+ if len(oldCmd) > 0 && oldCmd[0] != "NONE" {
|
|
| 469 |
+ fmt.Fprintf(b.Stdout, "Note: overriding previous HEALTHCHECK: %v\n", oldCmd) |
|
| 470 |
+ } |
|
| 471 |
+ } |
|
| 472 |
+ |
|
| 473 |
+ healthcheck := container.HealthConfig{}
|
|
| 474 |
+ |
|
| 475 |
+ flInterval := b.flags.AddString("interval", "")
|
|
| 476 |
+ flTimeout := b.flags.AddString("timeout", "")
|
|
| 477 |
+ flRetries := b.flags.AddString("retries", "")
|
|
| 478 |
+ |
|
| 479 |
+ if err := b.flags.Parse(); err != nil {
|
|
| 480 |
+ return err |
|
| 481 |
+ } |
|
| 482 |
+ |
|
| 483 |
+ switch typ {
|
|
| 484 |
+ case "CMD": |
|
| 485 |
+ cmdSlice := handleJSONArgs(args, attributes) |
|
| 486 |
+ if len(cmdSlice) == 0 {
|
|
| 487 |
+ return fmt.Errorf("Missing command after HEALTHCHECK CMD")
|
|
| 488 |
+ } |
|
| 489 |
+ |
|
| 490 |
+ if !attributes["json"] {
|
|
| 491 |
+ typ = "CMD-SHELL" |
|
| 492 |
+ } |
|
| 493 |
+ |
|
| 494 |
+ healthcheck.Test = strslice.StrSlice(append([]string{typ}, cmdSlice...))
|
|
| 495 |
+ default: |
|
| 496 |
+ return fmt.Errorf("Unknown type %#v in HEALTHCHECK (try CMD)", typ)
|
|
| 497 |
+ } |
|
| 498 |
+ |
|
| 499 |
+ interval, err := parseOptInterval(flInterval) |
|
| 500 |
+ if err != nil {
|
|
| 501 |
+ return err |
|
| 502 |
+ } |
|
| 503 |
+ healthcheck.Interval = interval |
|
| 504 |
+ |
|
| 505 |
+ timeout, err := parseOptInterval(flTimeout) |
|
| 506 |
+ if err != nil {
|
|
| 507 |
+ return err |
|
| 508 |
+ } |
|
| 509 |
+ healthcheck.Timeout = timeout |
|
| 510 |
+ |
|
| 511 |
+ if flRetries.Value != "" {
|
|
| 512 |
+ retries, err := strconv.ParseInt(flRetries.Value, 10, 32) |
|
| 513 |
+ if err != nil {
|
|
| 514 |
+ return err |
|
| 515 |
+ } |
|
| 516 |
+ if retries < 1 {
|
|
| 517 |
+ return fmt.Errorf("--retries must be at least 1 (not %d)", retries)
|
|
| 518 |
+ } |
|
| 519 |
+ healthcheck.Retries = int(retries) |
|
| 520 |
+ } else {
|
|
| 521 |
+ healthcheck.Retries = 0 |
|
| 522 |
+ } |
|
| 523 |
+ |
|
| 524 |
+ b.runConfig.Healthcheck = &healthcheck |
|
| 525 |
+ } |
|
| 526 |
+ |
|
| 527 |
+ if err := b.commit("", b.runConfig.Cmd, fmt.Sprintf("HEALTHCHECK %q", b.runConfig.Healthcheck)); err != nil {
|
|
| 528 |
+ return err |
|
| 529 |
+ } |
|
| 530 |
+ |
|
| 531 |
+ return nil |
|
| 532 |
+} |
|
| 533 |
+ |
|
| 429 | 534 |
// ENTRYPOINT /usr/sbin/nginx |
| 430 | 535 |
// |
| 431 | 536 |
// Set the entrypoint (which defaults to sh -c on linux, or cmd /S /C on Windows) to |
| ... | ... |
@@ -58,22 +58,23 @@ var evaluateTable map[string]func(*Builder, []string, map[string]bool, string) e |
| 58 | 58 |
|
| 59 | 59 |
func init() {
|
| 60 | 60 |
evaluateTable = map[string]func(*Builder, []string, map[string]bool, string) error{
|
| 61 |
- command.Env: env, |
|
| 62 |
- command.Label: label, |
|
| 63 |
- command.Maintainer: maintainer, |
|
| 64 |
- command.Add: add, |
|
| 65 |
- command.Copy: dispatchCopy, // copy() is a go builtin |
|
| 66 |
- command.From: from, |
|
| 67 |
- command.Onbuild: onbuild, |
|
| 68 |
- command.Workdir: workdir, |
|
| 69 |
- command.Run: run, |
|
| 70 |
- command.Cmd: cmd, |
|
| 71 |
- command.Entrypoint: entrypoint, |
|
| 72 |
- command.Expose: expose, |
|
| 73 |
- command.Volume: volume, |
|
| 74 |
- command.User: user, |
|
| 75 |
- command.StopSignal: stopSignal, |
|
| 76 |
- command.Arg: arg, |
|
| 61 |
+ command.Env: env, |
|
| 62 |
+ command.Label: label, |
|
| 63 |
+ command.Maintainer: maintainer, |
|
| 64 |
+ command.Add: add, |
|
| 65 |
+ command.Copy: dispatchCopy, // copy() is a go builtin |
|
| 66 |
+ command.From: from, |
|
| 67 |
+ command.Onbuild: onbuild, |
|
| 68 |
+ command.Workdir: workdir, |
|
| 69 |
+ command.Run: run, |
|
| 70 |
+ command.Cmd: cmd, |
|
| 71 |
+ command.Entrypoint: entrypoint, |
|
| 72 |
+ command.Expose: expose, |
|
| 73 |
+ command.Volume: volume, |
|
| 74 |
+ command.User: user, |
|
| 75 |
+ command.StopSignal: stopSignal, |
|
| 76 |
+ command.Arg: arg, |
|
| 77 |
+ command.Healthcheck: healthcheck, |
|
| 77 | 78 |
} |
| 78 | 79 |
} |
| 79 | 80 |
|
| ... | ... |
@@ -329,3 +329,32 @@ func parseMaybeJSONToList(rest string) (*Node, map[string]bool, error) {
|
| 329 | 329 |
|
| 330 | 330 |
return parseStringsWhitespaceDelimited(rest) |
| 331 | 331 |
} |
| 332 |
+ |
|
| 333 |
+// The HEALTHCHECK command is like parseMaybeJSON, but has an extra type argument. |
|
| 334 |
+func parseHealthConfig(rest string) (*Node, map[string]bool, error) {
|
|
| 335 |
+ // Find end of first argument |
|
| 336 |
+ var sep int |
|
| 337 |
+ for ; sep < len(rest); sep++ {
|
|
| 338 |
+ if unicode.IsSpace(rune(rest[sep])) {
|
|
| 339 |
+ break |
|
| 340 |
+ } |
|
| 341 |
+ } |
|
| 342 |
+ next := sep |
|
| 343 |
+ for ; next < len(rest); next++ {
|
|
| 344 |
+ if !unicode.IsSpace(rune(rest[next])) {
|
|
| 345 |
+ break |
|
| 346 |
+ } |
|
| 347 |
+ } |
|
| 348 |
+ |
|
| 349 |
+ if sep == 0 {
|
|
| 350 |
+ return nil, nil, nil |
|
| 351 |
+ } |
|
| 352 |
+ |
|
| 353 |
+ typ := rest[:sep] |
|
| 354 |
+ cmd, attrs, err := parseMaybeJSON(rest[next:]) |
|
| 355 |
+ if err != nil {
|
|
| 356 |
+ return nil, nil, err |
|
| 357 |
+ } |
|
| 358 |
+ |
|
| 359 |
+ return &Node{Value: typ, Next: cmd, Attributes: attrs}, nil, err
|
|
| 360 |
+} |
| ... | ... |
@@ -66,22 +66,23 @@ func init() {
|
| 66 | 66 |
// functions. Errors are propagated up by Parse() and the resulting AST can |
| 67 | 67 |
// be incorporated directly into the existing AST as a next. |
| 68 | 68 |
dispatch = map[string]func(string) (*Node, map[string]bool, error){
|
| 69 |
- command.User: parseString, |
|
| 70 |
- command.Onbuild: parseSubCommand, |
|
| 71 |
- command.Workdir: parseString, |
|
| 72 |
- command.Env: parseEnv, |
|
| 73 |
- command.Label: parseLabel, |
|
| 74 |
- command.Maintainer: parseString, |
|
| 75 |
- command.From: parseString, |
|
| 76 |
- command.Add: parseMaybeJSONToList, |
|
| 77 |
- command.Copy: parseMaybeJSONToList, |
|
| 78 |
- command.Run: parseMaybeJSON, |
|
| 79 |
- command.Cmd: parseMaybeJSON, |
|
| 80 |
- command.Entrypoint: parseMaybeJSON, |
|
| 81 |
- command.Expose: parseStringsWhitespaceDelimited, |
|
| 82 |
- command.Volume: parseMaybeJSONToList, |
|
| 83 |
- command.StopSignal: parseString, |
|
| 84 |
- command.Arg: parseNameOrNameVal, |
|
| 69 |
+ command.User: parseString, |
|
| 70 |
+ command.Onbuild: parseSubCommand, |
|
| 71 |
+ command.Workdir: parseString, |
|
| 72 |
+ command.Env: parseEnv, |
|
| 73 |
+ command.Label: parseLabel, |
|
| 74 |
+ command.Maintainer: parseString, |
|
| 75 |
+ command.From: parseString, |
|
| 76 |
+ command.Add: parseMaybeJSONToList, |
|
| 77 |
+ command.Copy: parseMaybeJSONToList, |
|
| 78 |
+ command.Run: parseMaybeJSON, |
|
| 79 |
+ command.Cmd: parseMaybeJSON, |
|
| 80 |
+ command.Entrypoint: parseMaybeJSON, |
|
| 81 |
+ command.Expose: parseStringsWhitespaceDelimited, |
|
| 82 |
+ command.Volume: parseMaybeJSONToList, |
|
| 83 |
+ command.StopSignal: parseString, |
|
| 84 |
+ command.Arg: parseNameOrNameVal, |
|
| 85 |
+ command.Healthcheck: parseHealthConfig, |
|
| 85 | 86 |
} |
| 86 | 87 |
} |
| 87 | 88 |
|
| 88 | 89 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,10 @@ |
| 0 |
+FROM debian |
|
| 1 |
+ADD check.sh main.sh /app/ |
|
| 2 |
+CMD /app/main.sh |
|
| 3 |
+HEALTHCHECK |
|
| 4 |
+HEALTHCHECK --interval=5s --timeout=3s --retries=1 \ |
|
| 5 |
+ CMD /app/check.sh --quiet |
|
| 6 |
+HEALTHCHECK CMD |
|
| 7 |
+HEALTHCHECK CMD a b |
|
| 8 |
+HEALTHCHECK --timeout=3s CMD ["foo"] |
|
| 9 |
+HEALTHCHECK CONNECT TCP 7000 |
| 0 | 10 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,9 @@ |
| 0 |
+(from "debian") |
|
| 1 |
+(add "check.sh" "main.sh" "/app/") |
|
| 2 |
+(cmd "/app/main.sh") |
|
| 3 |
+(healthcheck) |
|
| 4 |
+(healthcheck ["--interval=5s" "--timeout=3s" "--retries=1"] "CMD" "/app/check.sh --quiet") |
|
| 5 |
+(healthcheck "CMD") |
|
| 6 |
+(healthcheck "CMD" "a b") |
|
| 7 |
+(healthcheck ["--timeout=3s"] "CMD" "foo") |
|
| 8 |
+(healthcheck "CONNECT" "TCP 7000") |
| 0 | 9 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,49 @@ |
| 0 |
+package container |
|
| 1 |
+ |
|
| 2 |
+import ( |
|
| 3 |
+ "github.com/Sirupsen/logrus" |
|
| 4 |
+ "github.com/docker/engine-api/types" |
|
| 5 |
+) |
|
| 6 |
+ |
|
| 7 |
+// Health holds the current container health-check state |
|
| 8 |
+type Health struct {
|
|
| 9 |
+ types.Health |
|
| 10 |
+ stop chan struct{} // Write struct{} to stop the monitor
|
|
| 11 |
+} |
|
| 12 |
+ |
|
| 13 |
+// String returns a human-readable description of the health-check state |
|
| 14 |
+func (s *Health) String() string {
|
|
| 15 |
+ if s.stop == nil {
|
|
| 16 |
+ return "no healthcheck" |
|
| 17 |
+ } |
|
| 18 |
+ switch s.Status {
|
|
| 19 |
+ case types.Starting: |
|
| 20 |
+ return "health: starting" |
|
| 21 |
+ default: // Healthy and Unhealthy are clear on their own |
|
| 22 |
+ return s.Status |
|
| 23 |
+ } |
|
| 24 |
+} |
|
| 25 |
+ |
|
| 26 |
+// OpenMonitorChannel creates and returns a new monitor channel. If there already is one, |
|
| 27 |
+// it returns nil. |
|
| 28 |
+func (s *Health) OpenMonitorChannel() chan struct{} {
|
|
| 29 |
+ if s.stop == nil {
|
|
| 30 |
+ logrus.Debugf("OpenMonitorChannel")
|
|
| 31 |
+ s.stop = make(chan struct{})
|
|
| 32 |
+ return s.stop |
|
| 33 |
+ } |
|
| 34 |
+ return nil |
|
| 35 |
+} |
|
| 36 |
+ |
|
| 37 |
+// CloseMonitorChannel closes any existing monitor channel. |
|
| 38 |
+func (s *Health) CloseMonitorChannel() {
|
|
| 39 |
+ if s.stop != nil {
|
|
| 40 |
+ logrus.Debugf("CloseMonitorChannel: waiting for probe to stop")
|
|
| 41 |
+ // This channel does not buffer. Once the write succeeds, the monitor |
|
| 42 |
+ // has read the stop request and will not make any further updates |
|
| 43 |
+ // to c.State.Health. |
|
| 44 |
+ s.stop <- struct{}{}
|
|
| 45 |
+ s.stop = nil |
|
| 46 |
+ logrus.Debugf("CloseMonitorChannel done")
|
|
| 47 |
+ } |
|
| 48 |
+} |
| ... | ... |
@@ -27,6 +27,7 @@ type State struct {
|
| 27 | 27 |
StartedAt time.Time |
| 28 | 28 |
FinishedAt time.Time |
| 29 | 29 |
waitChan chan struct{}
|
| 30 |
+ Health *Health |
|
| 30 | 31 |
} |
| 31 | 32 |
|
| 32 | 33 |
// NewState creates a default state object with a fresh channel for state changes. |
| ... | ... |
@@ -46,6 +47,9 @@ func (s *State) String() string {
|
| 46 | 46 |
return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
|
| 47 | 47 |
} |
| 48 | 48 |
|
| 49 |
+ if h := s.Health; h != nil {
|
|
| 50 |
+ return fmt.Sprintf("Up %s (%s)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)), h.String())
|
|
| 51 |
+ } |
|
| 49 | 52 |
return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
|
| 50 | 53 |
} |
| 51 | 54 |
|
| ... | ... |
@@ -80,6 +80,25 @@ func merge(userConf, imageConf *containertypes.Config) error {
|
| 80 | 80 |
userConf.Entrypoint = imageConf.Entrypoint |
| 81 | 81 |
} |
| 82 | 82 |
} |
| 83 |
+ if imageConf.Healthcheck != nil {
|
|
| 84 |
+ if userConf.Healthcheck == nil {
|
|
| 85 |
+ userConf.Healthcheck = imageConf.Healthcheck |
|
| 86 |
+ } else {
|
|
| 87 |
+ if len(userConf.Healthcheck.Test) == 0 {
|
|
| 88 |
+ userConf.Healthcheck.Test = imageConf.Healthcheck.Test |
|
| 89 |
+ } |
|
| 90 |
+ if userConf.Healthcheck.Interval == 0 {
|
|
| 91 |
+ userConf.Healthcheck.Interval = imageConf.Healthcheck.Interval |
|
| 92 |
+ } |
|
| 93 |
+ if userConf.Healthcheck.Timeout == 0 {
|
|
| 94 |
+ userConf.Healthcheck.Timeout = imageConf.Healthcheck.Timeout |
|
| 95 |
+ } |
|
| 96 |
+ if userConf.Healthcheck.Retries == 0 {
|
|
| 97 |
+ userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries |
|
| 98 |
+ } |
|
| 99 |
+ } |
|
| 100 |
+ } |
|
| 101 |
+ |
|
| 83 | 102 |
if userConf.WorkingDir == "" {
|
| 84 | 103 |
userConf.WorkingDir = imageConf.WorkingDir |
| 85 | 104 |
} |
| ... | ... |
@@ -14,11 +14,15 @@ import ( |
| 14 | 14 |
"github.com/docker/docker/errors" |
| 15 | 15 |
"github.com/docker/docker/libcontainerd" |
| 16 | 16 |
"github.com/docker/docker/pkg/pools" |
| 17 |
+ "github.com/docker/docker/pkg/signal" |
|
| 17 | 18 |
"github.com/docker/docker/pkg/term" |
| 18 | 19 |
"github.com/docker/engine-api/types" |
| 19 | 20 |
"github.com/docker/engine-api/types/strslice" |
| 20 | 21 |
) |
| 21 | 22 |
|
| 23 |
+// Seconds to wait after sending TERM before trying KILL |
|
| 24 |
+const termProcessTimeout = 10 |
|
| 25 |
+ |
|
| 22 | 26 |
func (d *Daemon) registerExecCommand(container *container.Container, config *exec.Config) {
|
| 23 | 27 |
// Storing execs in container in order to kill them gracefully whenever the container is stopped or removed. |
| 24 | 28 |
container.ExecCommands.Add(config.ID, config) |
| ... | ... |
@@ -130,7 +134,8 @@ func (d *Daemon) ContainerExecCreate(name string, config *types.ExecConfig) (str |
| 130 | 130 |
|
| 131 | 131 |
// ContainerExecStart starts a previously set up exec instance. The |
| 132 | 132 |
// std streams are set up. |
| 133 |
-func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
|
|
| 133 |
+// If ctx is cancelled, the process is terminated. |
|
| 134 |
+func (d *Daemon) ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
|
|
| 134 | 135 |
var ( |
| 135 | 136 |
cStdin io.ReadCloser |
| 136 | 137 |
cStdout, cStderr io.Writer |
| ... | ... |
@@ -197,15 +202,28 @@ func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io. |
| 197 | 197 |
return nil |
| 198 | 198 |
} |
| 199 | 199 |
|
| 200 |
- attachErr := container.AttachStreams(context.Background(), ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys) |
|
| 200 |
+ attachErr := container.AttachStreams(ctx, ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys) |
|
| 201 | 201 |
|
| 202 | 202 |
if err := d.containerd.AddProcess(c.ID, name, p); err != nil {
|
| 203 | 203 |
return err |
| 204 | 204 |
} |
| 205 | 205 |
|
| 206 |
- err = <-attachErr |
|
| 207 |
- if err != nil {
|
|
| 208 |
- return fmt.Errorf("attach failed with error: %v", err)
|
|
| 206 |
+ select {
|
|
| 207 |
+ case <-ctx.Done(): |
|
| 208 |
+ logrus.Debugf("Sending TERM signal to process %v in container %v", name, c.ID)
|
|
| 209 |
+ d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["TERM"])) |
|
| 210 |
+ select {
|
|
| 211 |
+ case <-time.After(termProcessTimeout * time.Second): |
|
| 212 |
+ logrus.Infof("Container %v, process %v failed to exit within %d seconds of signal TERM - using the force", c.ID, name, termProcessTimeout)
|
|
| 213 |
+ d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["KILL"])) |
|
| 214 |
+ case <-attachErr: |
|
| 215 |
+ // TERM signal worked |
|
| 216 |
+ } |
|
| 217 |
+ return fmt.Errorf("context cancelled")
|
|
| 218 |
+ case err := <-attachErr: |
|
| 219 |
+ if err != nil {
|
|
| 220 |
+ return fmt.Errorf("attach failed with error: %v", err)
|
|
| 221 |
+ } |
|
| 209 | 222 |
} |
| 210 | 223 |
return nil |
| 211 | 224 |
} |
| 212 | 225 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,314 @@ |
| 0 |
+package daemon |
|
| 1 |
+ |
|
| 2 |
+import ( |
|
| 3 |
+ "bytes" |
|
| 4 |
+ "fmt" |
|
| 5 |
+ "runtime" |
|
| 6 |
+ "strings" |
|
| 7 |
+ "time" |
|
| 8 |
+ |
|
| 9 |
+ "golang.org/x/net/context" |
|
| 10 |
+ |
|
| 11 |
+ "github.com/Sirupsen/logrus" |
|
| 12 |
+ "github.com/docker/docker/container" |
|
| 13 |
+ "github.com/docker/docker/daemon/exec" |
|
| 14 |
+ "github.com/docker/engine-api/types" |
|
| 15 |
+ "github.com/docker/engine-api/types/strslice" |
|
| 16 |
+) |
|
| 17 |
+ |
|
| 18 |
+const ( |
|
| 19 |
+ // Longest healthcheck probe output message to store. Longer messages will be truncated. |
|
| 20 |
+ maxOutputLen = 4096 |
|
| 21 |
+ |
|
| 22 |
+ // Default interval between probe runs (from the end of the first to the start of the second). |
|
| 23 |
+ // Also the time before the first probe. |
|
| 24 |
+ defaultProbeInterval = 30 * time.Second |
|
| 25 |
+ |
|
| 26 |
+ // The maximum length of time a single probe run should take. If the probe takes longer |
|
| 27 |
+ // than this, the check is considered to have failed. |
|
| 28 |
+ defaultProbeTimeout = 30 * time.Second |
|
| 29 |
+ |
|
| 30 |
+ // Shut down a container if it becomes Unhealthy. |
|
| 31 |
+ defaultExitOnUnhealthy = true |
|
| 32 |
+ |
|
| 33 |
+ // Maximum number of entries to record |
|
| 34 |
+ maxLogEntries = 5 |
|
| 35 |
+) |
|
| 36 |
+ |
|
| 37 |
+const ( |
|
| 38 |
+ // Exit status codes that can be returned by the probe command. |
|
| 39 |
+ |
|
| 40 |
+ exitStatusHealthy = 0 // Container is healthy |
|
| 41 |
+ exitStatusUnhealthy = 1 // Container is unhealthy |
|
| 42 |
+ exitStatusStarting = 2 // Container needs more time to start |
|
| 43 |
+) |
|
| 44 |
+ |
|
| 45 |
+// probe implementations know how to run a particular type of probe. |
|
| 46 |
+type probe interface {
|
|
| 47 |
+ // Perform one run of the check. Returns the exit code and an optional |
|
| 48 |
+ // short diagnostic string. |
|
| 49 |
+ run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error) |
|
| 50 |
+} |
|
| 51 |
+ |
|
| 52 |
+// cmdProbe implements the "CMD" probe type. |
|
| 53 |
+type cmdProbe struct {
|
|
| 54 |
+ // Run the command with the system's default shell instead of execing it directly. |
|
| 55 |
+ shell bool |
|
| 56 |
+} |
|
| 57 |
+ |
|
| 58 |
+// exec the healthcheck command in the container. |
|
| 59 |
+// Returns the exit code and probe output (if any) |
|
| 60 |
+func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
|
|
| 61 |
+ cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:] |
|
| 62 |
+ if p.shell {
|
|
| 63 |
+ if runtime.GOOS != "windows" {
|
|
| 64 |
+ cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
|
|
| 65 |
+ } else {
|
|
| 66 |
+ cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
|
|
| 67 |
+ } |
|
| 68 |
+ } |
|
| 69 |
+ entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
|
|
| 70 |
+ execConfig := exec.NewConfig() |
|
| 71 |
+ execConfig.OpenStdin = false |
|
| 72 |
+ execConfig.OpenStdout = true |
|
| 73 |
+ execConfig.OpenStderr = true |
|
| 74 |
+ execConfig.ContainerID = container.ID |
|
| 75 |
+ execConfig.DetachKeys = []byte{}
|
|
| 76 |
+ execConfig.Entrypoint = entrypoint |
|
| 77 |
+ execConfig.Args = args |
|
| 78 |
+ execConfig.Tty = false |
|
| 79 |
+ execConfig.Privileged = false |
|
| 80 |
+ execConfig.User = container.Config.User |
|
| 81 |
+ |
|
| 82 |
+ d.registerExecCommand(container, execConfig) |
|
| 83 |
+ d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")) |
|
| 84 |
+ |
|
| 85 |
+ output := &limitedBuffer{}
|
|
| 86 |
+ err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output) |
|
| 87 |
+ if err != nil {
|
|
| 88 |
+ return nil, err |
|
| 89 |
+ } |
|
| 90 |
+ info, err := d.getExecConfig(execConfig.ID) |
|
| 91 |
+ if err != nil {
|
|
| 92 |
+ return nil, err |
|
| 93 |
+ } |
|
| 94 |
+ if info.ExitCode == nil {
|
|
| 95 |
+ return nil, fmt.Errorf("Healthcheck has no exit code!")
|
|
| 96 |
+ } |
|
| 97 |
+ // Note: Go's json package will handle invalid UTF-8 for us |
|
| 98 |
+ out := output.String() |
|
| 99 |
+ return &types.HealthcheckResult{
|
|
| 100 |
+ End: time.Now(), |
|
| 101 |
+ ExitCode: *info.ExitCode, |
|
| 102 |
+ Output: out, |
|
| 103 |
+ }, nil |
|
| 104 |
+} |
|
| 105 |
+ |
|
| 106 |
+// Update the container's Status.Health struct based on the latest probe's result. |
|
| 107 |
+func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
|
|
| 108 |
+ c.Lock() |
|
| 109 |
+ defer c.Unlock() |
|
| 110 |
+ |
|
| 111 |
+ retries := c.Config.Healthcheck.Retries |
|
| 112 |
+ if retries <= 0 {
|
|
| 113 |
+ retries = 1 // Default if unset or set to an invalid value |
|
| 114 |
+ } |
|
| 115 |
+ |
|
| 116 |
+ h := c.State.Health |
|
| 117 |
+ oldStatus := h.Status |
|
| 118 |
+ |
|
| 119 |
+ if len(h.Log) >= maxLogEntries {
|
|
| 120 |
+ h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result) |
|
| 121 |
+ } else {
|
|
| 122 |
+ h.Log = append(h.Log, result) |
|
| 123 |
+ } |
|
| 124 |
+ |
|
| 125 |
+ if result.ExitCode == exitStatusHealthy {
|
|
| 126 |
+ h.FailingStreak = 0 |
|
| 127 |
+ h.Status = types.Healthy |
|
| 128 |
+ } else if result.ExitCode == exitStatusStarting && c.State.Health.Status == types.Starting {
|
|
| 129 |
+ // The container is not ready yet. Remain in the starting state. |
|
| 130 |
+ } else {
|
|
| 131 |
+ // Failure (including invalid exit code) |
|
| 132 |
+ h.FailingStreak++ |
|
| 133 |
+ if c.State.Health.FailingStreak >= retries {
|
|
| 134 |
+ h.Status = types.Unhealthy |
|
| 135 |
+ } |
|
| 136 |
+ // Else we're starting or healthy. Stay in that state. |
|
| 137 |
+ } |
|
| 138 |
+ |
|
| 139 |
+ if oldStatus != h.Status {
|
|
| 140 |
+ d.LogContainerEvent(c, "health_status: "+h.Status) |
|
| 141 |
+ } |
|
| 142 |
+} |
|
| 143 |
+ |
|
| 144 |
+// Run the container's monitoring thread until notified via "stop". |
|
| 145 |
+// There is never more than one monitor thread running per container at a time. |
|
| 146 |
+func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
|
|
| 147 |
+ probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout) |
|
| 148 |
+ probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) |
|
| 149 |
+ for {
|
|
| 150 |
+ select {
|
|
| 151 |
+ case <-stop: |
|
| 152 |
+ logrus.Debugf("Stop healthcheck monitoring (received while idle)")
|
|
| 153 |
+ return |
|
| 154 |
+ case <-time.After(probeInterval): |
|
| 155 |
+ logrus.Debugf("Running health check...")
|
|
| 156 |
+ startTime := time.Now() |
|
| 157 |
+ ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout) |
|
| 158 |
+ results := make(chan *types.HealthcheckResult) |
|
| 159 |
+ go func() {
|
|
| 160 |
+ result, err := probe.run(ctx, d, c) |
|
| 161 |
+ if err != nil {
|
|
| 162 |
+ logrus.Warnf("Health check error: %v", err)
|
|
| 163 |
+ results <- &types.HealthcheckResult{
|
|
| 164 |
+ ExitCode: -1, |
|
| 165 |
+ Output: err.Error(), |
|
| 166 |
+ Start: startTime, |
|
| 167 |
+ End: time.Now(), |
|
| 168 |
+ } |
|
| 169 |
+ } else {
|
|
| 170 |
+ result.Start = startTime |
|
| 171 |
+ logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
|
|
| 172 |
+ results <- result |
|
| 173 |
+ } |
|
| 174 |
+ close(results) |
|
| 175 |
+ }() |
|
| 176 |
+ select {
|
|
| 177 |
+ case <-stop: |
|
| 178 |
+ logrus.Debugf("Stop healthcheck monitoring (received while probing)")
|
|
| 179 |
+ // Stop timeout and kill probe, but don't wait for probe to exit. |
|
| 180 |
+ cancelProbe() |
|
| 181 |
+ return |
|
| 182 |
+ case result := <-results: |
|
| 183 |
+ handleProbeResult(d, c, result) |
|
| 184 |
+ // Stop timeout |
|
| 185 |
+ cancelProbe() |
|
| 186 |
+ case <-ctx.Done(): |
|
| 187 |
+ logrus.Debugf("Health check taking too long")
|
|
| 188 |
+ handleProbeResult(d, c, &types.HealthcheckResult{
|
|
| 189 |
+ ExitCode: -1, |
|
| 190 |
+ Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
|
|
| 191 |
+ Start: startTime, |
|
| 192 |
+ End: time.Now(), |
|
| 193 |
+ }) |
|
| 194 |
+ cancelProbe() |
|
| 195 |
+ // Wait for probe to exit (it might take a while to respond to the TERM |
|
| 196 |
+ // signal and we don't want dying probes to pile up). |
|
| 197 |
+ <-results |
|
| 198 |
+ } |
|
| 199 |
+ } |
|
| 200 |
+ } |
|
| 201 |
+} |
|
| 202 |
+ |
|
| 203 |
+// Get a suitable probe implementation for the container's healthcheck configuration. |
|
| 204 |
+func getProbe(c *container.Container) probe {
|
|
| 205 |
+ config := c.Config.Healthcheck |
|
| 206 |
+ if config == nil || len(config.Test) == 0 {
|
|
| 207 |
+ return nil |
|
| 208 |
+ } |
|
| 209 |
+ switch config.Test[0] {
|
|
| 210 |
+ case "CMD": |
|
| 211 |
+ return &cmdProbe{shell: false}
|
|
| 212 |
+ case "CMD-SHELL": |
|
| 213 |
+ return &cmdProbe{shell: true}
|
|
| 214 |
+ default: |
|
| 215 |
+ logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
|
|
| 216 |
+ return nil |
|
| 217 |
+ } |
|
| 218 |
+} |
|
| 219 |
+ |
|
| 220 |
+// Ensure the health-check monitor is running or not, depending on the current |
|
| 221 |
+// state of the container. |
|
| 222 |
+// Called from monitor.go, with c locked. |
|
| 223 |
+func (d *Daemon) updateHealthMonitor(c *container.Container) {
|
|
| 224 |
+ h := c.State.Health |
|
| 225 |
+ if h == nil {
|
|
| 226 |
+ return // No healthcheck configured |
|
| 227 |
+ } |
|
| 228 |
+ |
|
| 229 |
+ probe := getProbe(c) |
|
| 230 |
+ wantRunning := c.Running && !c.Paused && probe != nil |
|
| 231 |
+ if wantRunning {
|
|
| 232 |
+ if stop := h.OpenMonitorChannel(); stop != nil {
|
|
| 233 |
+ go monitor(d, c, stop, probe) |
|
| 234 |
+ } |
|
| 235 |
+ } else {
|
|
| 236 |
+ h.CloseMonitorChannel() |
|
| 237 |
+ } |
|
| 238 |
+} |
|
| 239 |
+ |
|
| 240 |
+// Reset the health state for a newly-started, restarted or restored container. |
|
| 241 |
+// initHealthMonitor is called from monitor.go and we should never be running |
|
| 242 |
+// two instances at once. |
|
| 243 |
+// Called with c locked. |
|
| 244 |
+func (d *Daemon) initHealthMonitor(c *container.Container) {
|
|
| 245 |
+ if c.Config.Healthcheck == nil {
|
|
| 246 |
+ return |
|
| 247 |
+ } |
|
| 248 |
+ |
|
| 249 |
+ // This is needed in case we're auto-restarting |
|
| 250 |
+ d.stopHealthchecks(c) |
|
| 251 |
+ |
|
| 252 |
+ if c.State.Health == nil {
|
|
| 253 |
+ h := &container.Health{}
|
|
| 254 |
+ h.Status = types.Starting |
|
| 255 |
+ h.FailingStreak = 0 |
|
| 256 |
+ c.State.Health = h |
|
| 257 |
+ } |
|
| 258 |
+ |
|
| 259 |
+ d.updateHealthMonitor(c) |
|
| 260 |
+} |
|
| 261 |
+ |
|
| 262 |
+// Called when the container is being stopped (whether because the health check is |
|
| 263 |
+// failing or for any other reason). |
|
| 264 |
+func (d *Daemon) stopHealthchecks(c *container.Container) {
|
|
| 265 |
+ h := c.State.Health |
|
| 266 |
+ if h != nil {
|
|
| 267 |
+ h.CloseMonitorChannel() |
|
| 268 |
+ } |
|
| 269 |
+} |
|
| 270 |
+ |
|
| 271 |
// Buffer up to maxOutputLen bytes. Further data is discarded.
// Presumably used to cap stored health-check output so a noisy probe cannot
// grow memory without bound — confirm against callers.
type limitedBuffer struct {
	buf       bytes.Buffer // accepted data, at most maxOutputLen bytes
	truncated bool         // indicates that data has been lost
}
|
| 276 |
+ |
|
| 277 |
+// Append to limitedBuffer while there is room. |
|
| 278 |
+func (b *limitedBuffer) Write(data []byte) (int, error) {
|
|
| 279 |
+ bufLen := b.buf.Len() |
|
| 280 |
+ dataLen := len(data) |
|
| 281 |
+ keep := min(maxOutputLen-bufLen, dataLen) |
|
| 282 |
+ if keep > 0 {
|
|
| 283 |
+ b.buf.Write(data[:keep]) |
|
| 284 |
+ } |
|
| 285 |
+ if keep < dataLen {
|
|
| 286 |
+ b.truncated = true |
|
| 287 |
+ } |
|
| 288 |
+ return dataLen, nil |
|
| 289 |
+} |
|
| 290 |
+ |
|
| 291 |
+// The contents of the buffer, with "..." appended if it overflowed. |
|
| 292 |
+func (b *limitedBuffer) String() string {
|
|
| 293 |
+ out := b.buf.String() |
|
| 294 |
+ if b.truncated {
|
|
| 295 |
+ out = out + "..." |
|
| 296 |
+ } |
|
| 297 |
+ return out |
|
| 298 |
+} |
|
| 299 |
+ |
|
| 300 |
// If configuredValue is zero, use defaultValue instead.
// A zero duration means the setting was not specified by the user.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue != 0 {
		return configuredValue
	}
	return defaultValue
}
|
| 307 |
+ |
|
| 308 |
// min returns the smaller of x and y.
func min(x, y int) int {
	if y < x {
		return y
	}
	return x
}
| 0 | 314 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,112 @@ |
| 0 |
+package daemon |
|
| 1 |
+ |
|
| 2 |
+import ( |
|
| 3 |
+ "testing" |
|
| 4 |
+ "time" |
|
| 5 |
+ |
|
| 6 |
+ "github.com/docker/docker/container" |
|
| 7 |
+ "github.com/docker/docker/daemon/events" |
|
| 8 |
+ "github.com/docker/engine-api/types" |
|
| 9 |
+ containertypes "github.com/docker/engine-api/types/container" |
|
| 10 |
+ eventtypes "github.com/docker/engine-api/types/events" |
|
| 11 |
+) |
|
| 12 |
+ |
|
| 13 |
+func reset(c *container.Container) {
|
|
| 14 |
+ c.State = &container.State{}
|
|
| 15 |
+ c.State.Health = &container.Health{}
|
|
| 16 |
+ c.State.Health.Status = types.Starting |
|
| 17 |
+} |
|
| 18 |
+ |
|
| 19 |
// TestHealthStates drives handleProbeResult directly (no real probes run) and
// checks the resulting health-state transitions, the health_status events
// emitted to the daemon event service, and the FailingStreak retry counter.
func TestHealthStates(t *testing.T) {
	e := events.New()
	_, l, _ := e.Subscribe()
	defer e.Evict(l)

	// expect waits up to one second for the next daemon event and checks
	// that its status string matches.
	expect := func(expected string) {
		select {
		case event := <-l:
			ev := event.(eventtypes.Message)
			if ev.Status != expected {
				t.Errorf("Expecting event %#v, but got %#v\n", expected, ev.Status)
			}
		case <-time.After(1 * time.Second):
			t.Errorf("Expecting event %#v, but got nothing\n", expected)
		}
	}

	c := &container.Container{
		CommonContainer: container.CommonContainer{
			ID:   "container_id",
			Name: "container_name",
			Config: &containertypes.Config{
				Image: "image_name",
			},
		},
	}
	daemon := &Daemon{
		EventsService: e,
	}

	c.Config.Healthcheck = &containertypes.HealthConfig{
		Retries: 1,
	}

	reset(c)

	// handleResult feeds one synthetic probe result (zero duration) into the
	// daemon's health handler.
	handleResult := func(startTime time.Time, exitCode int) {
		handleProbeResult(daemon, c, &types.HealthcheckResult{
			Start:    startTime,
			End:      startTime,
			ExitCode: exitCode,
		})
	}

	// starting -> failed -> success -> failed

	handleResult(c.State.StartedAt.Add(1*time.Second), 1)
	expect("health_status: unhealthy")

	handleResult(c.State.StartedAt.Add(2*time.Second), 0)
	expect("health_status: healthy")

	handleResult(c.State.StartedAt.Add(3*time.Second), 1)
	expect("health_status: unhealthy")

	// starting -> starting -> starting ->
	// healthy -> starting (invalid transition)

	reset(c)

	// Exit code 2 keeps the container in "starting" (no event expected).
	handleResult(c.State.StartedAt.Add(20*time.Second), 2)
	handleResult(c.State.StartedAt.Add(40*time.Second), 2)
	if c.State.Health.Status != types.Starting {
		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
	}

	handleResult(c.State.StartedAt.Add(50*time.Second), 0)
	expect("health_status: healthy")
	// Exit code 2 after leaving "starting" is treated as a failure.
	handleResult(c.State.StartedAt.Add(60*time.Second), 2)
	expect("health_status: unhealthy")

	// Test retries

	reset(c)
	c.Config.Healthcheck.Retries = 3

	// Two failures out of three: still "starting", streak counted.
	handleResult(c.State.StartedAt.Add(20*time.Second), 1)
	handleResult(c.State.StartedAt.Add(40*time.Second), 1)
	if c.State.Health.Status != types.Starting {
		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
	}
	if c.State.Health.FailingStreak != 2 {
		t.Errorf("Expecting FailingStreak=2, but got %d\n", c.State.Health.FailingStreak)
	}
	// Third consecutive failure reaches the retry limit.
	handleResult(c.State.StartedAt.Add(60*time.Second), 1)
	expect("health_status: unhealthy")

	// A success resets the failing streak.
	handleResult(c.State.StartedAt.Add(80*time.Second), 0)
	expect("health_status: healthy")
	if c.State.Health.FailingStreak != 0 {
		t.Errorf("Expecting FailingStreak=0, but got %d\n", c.State.Health.FailingStreak)
	}
}
| ... | ... |
@@ -108,6 +108,15 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool) |
| 108 | 108 |
hostConfig.Links = append(hostConfig.Links, fmt.Sprintf("%s:%s", child.Name, linkAlias))
|
| 109 | 109 |
} |
| 110 | 110 |
|
| 111 |
+ var containerHealth *types.Health |
|
| 112 |
+ if container.State.Health != nil {
|
|
| 113 |
+ containerHealth = &types.Health{
|
|
| 114 |
+ Status: container.State.Health.Status, |
|
| 115 |
+ FailingStreak: container.State.Health.FailingStreak, |
|
| 116 |
+ Log: append([]*types.HealthcheckResult{}, container.State.Health.Log...),
|
|
| 117 |
+ } |
|
| 118 |
+ } |
|
| 119 |
+ |
|
| 111 | 120 |
containerState := &types.ContainerState{
|
| 112 | 121 |
Status: container.State.StateString(), |
| 113 | 122 |
Running: container.State.Running, |
| ... | ... |
@@ -120,6 +129,7 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool) |
| 120 | 120 |
Error: container.State.Error, |
| 121 | 121 |
StartedAt: container.State.StartedAt.Format(time.RFC3339Nano), |
| 122 | 122 |
FinishedAt: container.State.FinishedAt.Format(time.RFC3339Nano), |
| 123 |
+ Health: containerHealth, |
|
| 123 | 124 |
} |
| 124 | 125 |
|
| 125 | 126 |
contJSONBase := &types.ContainerJSONBase{
|
| ... | ... |
@@ -25,6 +25,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
|
| 25 | 25 |
if runtime.GOOS == "windows" {
|
| 26 | 26 |
return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.")
|
| 27 | 27 |
} |
| 28 |
+ daemon.updateHealthMonitor(c) |
|
| 28 | 29 |
daemon.LogContainerEvent(c, "oom") |
| 29 | 30 |
case libcontainerd.StateExit: |
| 30 | 31 |
c.Lock() |
| ... | ... |
@@ -35,6 +36,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
|
| 35 | 35 |
attributes := map[string]string{
|
| 36 | 36 |
"exitCode": strconv.Itoa(int(e.ExitCode)), |
| 37 | 37 |
} |
| 38 |
+ daemon.updateHealthMonitor(c) |
|
| 38 | 39 |
daemon.LogContainerEventWithAttributes(c, "die", attributes) |
| 39 | 40 |
daemon.Cleanup(c) |
| 40 | 41 |
// FIXME: here is race condition between two RUN instructions in Dockerfile |
| ... | ... |
@@ -54,6 +56,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
|
| 54 | 54 |
"exitCode": strconv.Itoa(int(e.ExitCode)), |
| 55 | 55 |
} |
| 56 | 56 |
daemon.LogContainerEventWithAttributes(c, "die", attributes) |
| 57 |
+ daemon.updateHealthMonitor(c) |
|
| 57 | 58 |
return c.ToDisk() |
| 58 | 59 |
case libcontainerd.StateExitProcess: |
| 59 | 60 |
c.Lock() |
| ... | ... |
@@ -74,18 +77,24 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
|
| 74 | 74 |
logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e)
|
| 75 | 75 |
} |
| 76 | 76 |
case libcontainerd.StateStart, libcontainerd.StateRestore: |
| 77 |
+ // Container is already locked in this case |
|
| 77 | 78 |
c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart) |
| 78 | 79 |
c.HasBeenManuallyStopped = false |
| 79 | 80 |
if err := c.ToDisk(); err != nil {
|
| 80 | 81 |
c.Reset(false) |
| 81 | 82 |
return err |
| 82 | 83 |
} |
| 84 |
+ daemon.initHealthMonitor(c) |
|
| 83 | 85 |
daemon.LogContainerEvent(c, "start") |
| 84 | 86 |
case libcontainerd.StatePause: |
| 87 |
+ // Container is already locked in this case |
|
| 85 | 88 |
c.Paused = true |
| 89 |
+ daemon.updateHealthMonitor(c) |
|
| 86 | 90 |
daemon.LogContainerEvent(c, "pause") |
| 87 | 91 |
case libcontainerd.StateResume: |
| 92 |
+ // Container is already locked in this case |
|
| 88 | 93 |
c.Paused = false |
| 94 |
+ daemon.updateHealthMonitor(c) |
|
| 89 | 95 |
daemon.LogContainerEvent(c, "unpause") |
| 90 | 96 |
} |
| 91 | 97 |
|
| ... | ... |
@@ -41,6 +41,8 @@ func (daemon *Daemon) containerStop(container *container.Container, seconds int) |
| 41 | 41 |
return nil |
| 42 | 42 |
} |
| 43 | 43 |
|
| 44 |
+ daemon.stopHealthchecks(container) |
|
| 45 |
+ |
|
| 44 | 46 |
stopSignal := container.StopSignal() |
| 45 | 47 |
// 1. Send a stop signal |
| 46 | 48 |
if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
|
| ... | ... |
@@ -1470,6 +1470,73 @@ The `STOPSIGNAL` instruction sets the system call signal that will be sent to th |
| 1470 | 1470 |
This signal can be a valid unsigned number that matches a position in the kernel's syscall table, for instance 9, |
| 1471 | 1471 |
or a signal name in the format SIGNAME, for instance SIGKILL. |
| 1472 | 1472 |
|
| 1473 |
+## HEALTHCHECK |
|
| 1474 |
+ |
|
| 1475 |
+The `HEALTHCHECK` instruction has two forms: |
|
| 1476 |
+ |
|
| 1477 |
+* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container) |
|
| 1478 |
+* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image) |
|
| 1479 |
+ |
|
| 1480 |
+The `HEALTHCHECK` instruction tells Docker how to test a container to check that |
|
| 1481 |
+it is still working. This can detect cases such as a web server that is stuck in |
|
| 1482 |
+an infinite loop and unable to handle new connections, even though the server |
|
| 1483 |
+process is still running. |
|
| 1484 |
+ |
|
| 1485 |
+When a container has a healthcheck specified, it has a _health status_ in |
|
| 1486 |
+addition to its normal status. This status is initially `starting`. Whenever a |
|
| 1487 |
+health check passes, it becomes `healthy` (whatever state it was previously in). |
|
| 1488 |
+After a certain number of consecutive failures, it becomes `unhealthy`. |
|
| 1489 |
+ |
|
| 1490 |
+The options that can appear before `CMD` are: |
|
| 1491 |
+ |
|
| 1492 |
+* `--interval=DURATION` (default: `30s`) |
|
| 1493 |
+* `--timeout=DURATION` (default: `30s`) |
|
| 1494 |
+* `--retries=N` (default: `1`) |
|
| 1495 |
+ |
|
| 1496 |
+The health check will first run **interval** seconds after the container is |
|
| 1497 |
+started, and then again **interval** seconds after each previous check completes. |
|
| 1498 |
+ |
|
| 1499 |
+If a single run of the check takes longer than **timeout** seconds then the check |
|
| 1500 |
+is considered to have failed. |
|
| 1501 |
+ |
|
| 1502 |
+It takes **retries** consecutive failures of the health check for the container |
|
| 1503 |
+to be considered `unhealthy`. |
|
| 1504 |
+ |
|
| 1505 |
+There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list |
|
| 1506 |
+more than one then only the last `HEALTHCHECK` will take effect. |
|
| 1507 |
+ |
|
| 1508 |
+The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK |
|
| 1509 |
+CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands; |
|
| 1510 |
+see e.g. `ENTRYPOINT` for details). |
|
| 1511 |
+ |
|
| 1512 |
+The command's exit status indicates the health status of the container. |
|
| 1513 |
+The possible values are: |
|
| 1514 |
+ |
|
| 1515 |
+- 0: success - the container is healthy and ready for use |
|
| 1516 |
+- 1: unhealthy - the container is not working correctly |
|
| 1517 |
+- 2: starting - the container is not ready for use yet, but is working correctly |
|
| 1518 |
+ |
|
| 1519 |
+If the probe returns 2 ("starting") when the container has already moved out of the
|
|
| 1520 |
+"starting" state then it is treated as "unhealthy" instead. |
|
| 1521 |
+ |
|
| 1522 |
+For example, to check every five minutes or so that a web-server is able to |
|
| 1523 |
+serve the site's main page within three seconds: |
|
| 1524 |
+ |
|
| 1525 |
+ HEALTHCHECK --interval=5m --timeout=3s \ |
|
| 1526 |
+ CMD curl -f http://localhost/ || exit 1 |
|
| 1527 |
+ |
|
| 1528 |
+To help debug failing probes, any output text (UTF-8 encoded) that the command writes |
|
| 1529 |
+on stdout or stderr will be stored in the health status and can be queried with |
|
| 1530 |
+`docker inspect`. Such output should be kept short (only the first 4096 bytes |
|
| 1531 |
+are stored currently). |
|
| 1532 |
+ |
|
| 1533 |
+When the health status of a container changes, a `health_status` event is |
|
| 1534 |
+generated with the new status. |
|
| 1535 |
+ |
|
| 1536 |
+The `HEALTHCHECK` feature was added in Docker 1.12. |
|
| 1537 |
+ |
|
| 1538 |
+ |
|
| 1539 |
+ |
|
| 1473 | 1540 |
## Dockerfile examples |
| 1474 | 1541 |
|
| 1475 | 1542 |
Below you can see some examples of Dockerfile syntax. If you're interested in |
| ... | ... |
@@ -1250,6 +1250,7 @@ Dockerfile instruction and how the operator can override that setting. |
| 1250 | 1250 |
#entrypoint-default-command-to-execute-at-runtime) |
| 1251 | 1251 |
- [EXPOSE (Incoming Ports)](#expose-incoming-ports) |
| 1252 | 1252 |
- [ENV (Environment Variables)](#env-environment-variables) |
| 1253 |
+ - [HEALTHCHECK](#healthcheck) |
|
| 1253 | 1254 |
- [VOLUME (Shared Filesystems)](#volume-shared-filesystems) |
| 1254 | 1255 |
- [USER](#user) |
| 1255 | 1256 |
- [WORKDIR](#workdir) |
| ... | ... |
@@ -1398,6 +1399,65 @@ above, or already defined by the developer with a Dockerfile `ENV`: |
| 1398 | 1398 |
|
| 1399 | 1399 |
Similarly the operator can set the **hostname** with `-h`. |
| 1400 | 1400 |
|
| 1401 |
+### HEALTHCHECK |
|
| 1402 |
+ |
|
| 1403 |
+``` |
|
| 1404 |
+ --health-cmd Command to run to check health |
|
| 1405 |
+ --health-interval Time between running the check |
|
| 1406 |
+ --health-retries Consecutive failures needed to report unhealthy |
|
| 1407 |
+ --health-timeout Maximum time to allow one check to run |
|
| 1408 |
+ --no-healthcheck Disable any container-specified HEALTHCHECK |
|
| 1409 |
+``` |
|
| 1410 |
+ |
|
| 1411 |
+Example: |
|
| 1412 |
+ |
|
| 1413 |
+ $ docker run --name=test -d \ |
|
| 1414 |
+ --health-cmd='stat /etc/passwd || exit 1' \ |
|
| 1415 |
+ --health-interval=2s \ |
|
| 1416 |
+ busybox sleep 1d |
|
| 1417 |
+ $ sleep 2; docker inspect --format='{{.State.Health.Status}}' test
|
|
| 1418 |
+ healthy |
|
| 1419 |
+ $ docker exec test rm /etc/passwd |
|
| 1420 |
+ $ sleep 2; docker inspect --format='{{json .State.Health}}' test
|
|
| 1421 |
+ {
|
|
| 1422 |
+ "Status": "unhealthy", |
|
| 1423 |
+ "FailingStreak": 3, |
|
| 1424 |
+ "Log": [ |
|
| 1425 |
+ {
|
|
| 1426 |
+ "Start": "2016-05-25T17:22:04.635478668Z", |
|
| 1427 |
+ "End": "2016-05-25T17:22:04.7272552Z", |
|
| 1428 |
+ "ExitCode": 0, |
|
| 1429 |
+ "Output": " File: /etc/passwd\n Size: 334 \tBlocks: 8 IO Block: 4096 regular file\nDevice: 32h/50d\tInode: 12 Links: 1\nAccess: (0664/-rw-rw-r--) Uid: ( 0/ root) Gid: ( 0/ root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..." |
|
| 1430 |
+ }, |
|
| 1431 |
+ {
|
|
| 1432 |
+ "Start": "2016-05-25T17:22:06.732900633Z", |
|
| 1433 |
+ "End": "2016-05-25T17:22:06.822168935Z", |
|
| 1434 |
+ "ExitCode": 0, |
|
| 1435 |
+ "Output": " File: /etc/passwd\n Size: 334 \tBlocks: 8 IO Block: 4096 regular file\nDevice: 32h/50d\tInode: 12 Links: 1\nAccess: (0664/-rw-rw-r--) Uid: ( 0/ root) Gid: ( 0/ root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..." |
|
| 1436 |
+ }, |
|
| 1437 |
+ {
|
|
| 1438 |
+ "Start": "2016-05-25T17:22:08.823956535Z", |
|
| 1439 |
+ "End": "2016-05-25T17:22:08.897359124Z", |
|
| 1440 |
+ "ExitCode": 1, |
|
| 1441 |
+ "Output": "stat: can't stat '/etc/passwd': No such file or directory\n" |
|
| 1442 |
+ }, |
|
| 1443 |
+ {
|
|
| 1444 |
+ "Start": "2016-05-25T17:22:10.898802931Z", |
|
| 1445 |
+ "End": "2016-05-25T17:22:10.969631866Z", |
|
| 1446 |
+ "ExitCode": 1, |
|
| 1447 |
+ "Output": "stat: can't stat '/etc/passwd': No such file or directory\n" |
|
| 1448 |
+ }, |
|
| 1449 |
+ {
|
|
| 1450 |
+ "Start": "2016-05-25T17:22:12.971033523Z", |
|
| 1451 |
+ "End": "2016-05-25T17:22:13.082015516Z", |
|
| 1452 |
+ "ExitCode": 1, |
|
| 1453 |
+ "Output": "stat: can't stat '/etc/passwd': No such file or directory\n" |
|
| 1454 |
+ } |
|
| 1455 |
+ ] |
|
| 1456 |
+ } |
|
| 1457 |
+ |
|
| 1458 |
+The health status is also displayed in the `docker ps` output. |
|
| 1459 |
+ |
|
| 1401 | 1460 |
### TMPFS (mount tmpfs filesystems) |
| 1402 | 1461 |
|
| 1403 | 1462 |
```bash |
| 1404 | 1463 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,154 @@ |
| 0 |
+package main |
|
| 1 |
+ |
|
| 2 |
+import ( |
|
| 3 |
+ "encoding/json" |
|
| 4 |
+ "github.com/docker/docker/pkg/integration/checker" |
|
| 5 |
+ "github.com/docker/engine-api/types" |
|
| 6 |
+ "github.com/go-check/check" |
|
| 7 |
+ "strconv" |
|
| 8 |
+ "strings" |
|
| 9 |
+ "time" |
|
| 10 |
+) |
|
| 11 |
+ |
|
| 12 |
+func waitForStatus(c *check.C, name string, prev string, expected string) {
|
|
| 13 |
+ prev = prev + "\n" |
|
| 14 |
+ expected = expected + "\n" |
|
| 15 |
+ for {
|
|
| 16 |
+ out, _ := dockerCmd(c, "inspect", "--format={{.State.Status}}", name)
|
|
| 17 |
+ if out == expected {
|
|
| 18 |
+ return |
|
| 19 |
+ } |
|
| 20 |
+ c.Check(out, checker.Equals, prev) |
|
| 21 |
+ if out != prev {
|
|
| 22 |
+ return |
|
| 23 |
+ } |
|
| 24 |
+ time.Sleep(100 * time.Millisecond) |
|
| 25 |
+ } |
|
| 26 |
+} |
|
| 27 |
+ |
|
| 28 |
+func waitForHealthStatus(c *check.C, name string, prev string, expected string) {
|
|
| 29 |
+ prev = prev + "\n" |
|
| 30 |
+ expected = expected + "\n" |
|
| 31 |
+ for {
|
|
| 32 |
+ out, _ := dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
|
|
| 33 |
+ if out == expected {
|
|
| 34 |
+ return |
|
| 35 |
+ } |
|
| 36 |
+ c.Check(out, checker.Equals, prev) |
|
| 37 |
+ if out != prev {
|
|
| 38 |
+ return |
|
| 39 |
+ } |
|
| 40 |
+ time.Sleep(100 * time.Millisecond) |
|
| 41 |
+ } |
|
| 42 |
+} |
|
| 43 |
+ |
|
| 44 |
+func getHealth(c *check.C, name string) *types.Health {
|
|
| 45 |
+ out, _ := dockerCmd(c, "inspect", "--format={{json .State.Health}}", name)
|
|
| 46 |
+ var health types.Health |
|
| 47 |
+ err := json.Unmarshal([]byte(out), &health) |
|
| 48 |
+ c.Check(err, checker.Equals, nil) |
|
| 49 |
+ return &health |
|
| 50 |
+} |
|
| 51 |
+ |
|
| 52 |
// TestHealth is an end-to-end check of the HEALTHCHECK feature: Dockerfile
// options, inspect output, healthy/unhealthy transitions, disabling checks
// via --no-healthcheck and HEALTHCHECK NONE, enabling via --health-* flags,
// and the probe timeout path.
func (s *DockerSuite) TestHealth(c *check.C) {
	testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows

	// Image whose health check passes while /status exists.
	imageName := "testhealth"
	_, err := buildImage(imageName,
		`FROM busybox
		RUN echo OK > /status
		CMD ["/bin/sleep", "120"]
		STOPSIGNAL SIGKILL
		HEALTHCHECK --interval=1s --timeout=30s \
		  CMD cat /status`,
		true)

	c.Check(err, check.IsNil)

	// No health status before starting
	name := "test_health"
	dockerCmd(c, "create", "--name", name, imageName)
	out, _ := dockerCmd(c, "ps", "-a", "--format={{.Status}}")
	c.Check(out, checker.Equals, "Created\n")

	// Inspect the options
	out, _ = dockerCmd(c, "inspect",
		"--format='timeout={{.Config.Healthcheck.Timeout}} "+
			"interval={{.Config.Healthcheck.Interval}} "+
			"retries={{.Config.Healthcheck.Retries}} "+
			"test={{.Config.Healthcheck.Test}}'", name)
	c.Check(out, checker.Equals, "timeout=30s interval=1s retries=0 test=[CMD-SHELL cat /status]\n")

	// Start
	dockerCmd(c, "start", name)
	waitForHealthStatus(c, name, "starting", "healthy")

	// Make it fail
	dockerCmd(c, "exec", name, "rm", "/status")
	waitForHealthStatus(c, name, "healthy", "unhealthy")

	// Inspect the status
	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
	c.Check(out, checker.Equals, "unhealthy\n")

	// Make it healthy again
	dockerCmd(c, "exec", name, "touch", "/status")
	waitForHealthStatus(c, name, "unhealthy", "healthy")

	// Remove container
	dockerCmd(c, "rm", "-f", name)

	// Disable the check from the CLI
	out, _ = dockerCmd(c, "create", "--name=noh", "--no-healthcheck", imageName)
	out, _ = dockerCmd(c, "inspect", "--format={{.Config.Healthcheck.Test}}", "noh")
	c.Check(out, checker.Equals, "[NONE]\n")
	dockerCmd(c, "rm", "noh")

	// Disable the check with a new build
	_, err = buildImage("no_healthcheck",
		`FROM testhealth
		HEALTHCHECK NONE`, true)
	c.Check(err, check.IsNil)

	out, _ = dockerCmd(c, "inspect", "--format={{.ContainerConfig.Healthcheck.Test}}", "no_healthcheck")
	c.Check(out, checker.Equals, "[NONE]\n")

	// Enable the checks from the CLI
	_, _ = dockerCmd(c, "run", "-d", "--name=fatal_healthcheck",
		"--health-interval=0.5s",
		"--health-retries=3",
		"--health-cmd=cat /status",
		"no_healthcheck")
	waitForHealthStatus(c, "fatal_healthcheck", "starting", "healthy")
	health := getHealth(c, "fatal_healthcheck")
	c.Check(health.Status, checker.Equals, "healthy")
	c.Check(health.FailingStreak, checker.Equals, 0)
	last := health.Log[len(health.Log)-1]
	c.Check(last.ExitCode, checker.Equals, 0)
	c.Check(last.Output, checker.Equals, "OK\n")

	// Fail the check, which should now make it exit
	dockerCmd(c, "exec", "fatal_healthcheck", "rm", "/status")
	waitForStatus(c, "fatal_healthcheck", "running", "exited")

	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", "fatal_healthcheck")
	c.Check(out, checker.Equals, "unhealthy\n")
	failsStr, _ := dockerCmd(c, "inspect", "--format={{.State.Health.FailingStreak}}", "fatal_healthcheck")
	fails, err := strconv.Atoi(strings.TrimSpace(failsStr))
	c.Check(err, check.IsNil)
	// At least "retries" consecutive failures must have been recorded.
	c.Check(fails >= 3, checker.Equals, true)
	dockerCmd(c, "rm", "-f", "fatal_healthcheck")

	// Check timeout
	// Note: if the interval is too small, it seems that Docker spends all its time running health
	// checks and never gets around to killing it.
	_, _ = dockerCmd(c, "run", "-d", "--name=test",
		"--health-interval=1s", "--health-cmd=sleep 5m", "--health-timeout=1ms", imageName)
	waitForHealthStatus(c, "test", "starting", "unhealthy")
	health = getHealth(c, "test")
	last = health.Log[len(health.Log)-1]
	c.Check(health.Status, checker.Equals, "unhealthy")
	c.Check(last.ExitCode, checker.Equals, -1)
	c.Check(last.Output, checker.Equals, "Health check exceeded timeout (1ms)")
	dockerCmd(c, "rm", "-f", "test")
}
| ... | ... |
@@ -190,6 +190,17 @@ func (clnt *client) Signal(containerID string, sig int) error {
|
| 190 | 190 |
return err |
| 191 | 191 |
} |
| 192 | 192 |
|
| 193 |
+func (clnt *client) SignalProcess(containerID string, pid string, sig int) error {
|
|
| 194 |
+ clnt.lock(containerID) |
|
| 195 |
+ defer clnt.unlock(containerID) |
|
| 196 |
+ _, err := clnt.remote.apiClient.Signal(context.Background(), &containerd.SignalRequest{
|
|
| 197 |
+ Id: containerID, |
|
| 198 |
+ Pid: pid, |
|
| 199 |
+ Signal: uint32(sig), |
|
| 200 |
+ }) |
|
| 201 |
+ return err |
|
| 202 |
+} |
|
| 203 |
+ |
|
| 193 | 204 |
func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
|
| 194 | 205 |
clnt.lock(containerID) |
| 195 | 206 |
defer clnt.unlock(containerID) |
| ... | ... |
@@ -304,6 +304,25 @@ func (clnt *client) Signal(containerID string, sig int) error {
|
| 304 | 304 |
return nil |
| 305 | 305 |
} |
| 306 | 306 |
|
| 307 |
+// While Linux has support for the full range of signals, signals aren't really implemented on Windows. |
|
| 308 |
+// We try to terminate the specified process whatever signal is requested. |
|
| 309 |
+func (clnt *client) SignalProcess(containerID string, processFriendlyName string, sig int) error {
|
|
| 310 |
+ clnt.lock(containerID) |
|
| 311 |
+ defer clnt.unlock(containerID) |
|
| 312 |
+ cont, err := clnt.getContainer(containerID) |
|
| 313 |
+ if err != nil {
|
|
| 314 |
+ return err |
|
| 315 |
+ } |
|
| 316 |
+ |
|
| 317 |
+ for _, p := range cont.processes {
|
|
| 318 |
+ if p.friendlyName == processFriendlyName {
|
|
| 319 |
+ return hcsshim.TerminateProcessInComputeSystem(containerID, p.systemPid) |
|
| 320 |
+ } |
|
| 321 |
+ } |
|
| 322 |
+ |
|
| 323 |
+ return fmt.Errorf("SignalProcess could not find process %s in %s", processFriendlyName, containerID)
|
|
| 324 |
+} |
|
| 325 |
+ |
|
| 307 | 326 |
// Resize handles a CLI event to resize an interactive docker run or docker exec |
| 308 | 327 |
// window. |
| 309 | 328 |
func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
|
| ... | ... |
@@ -34,6 +34,7 @@ type Backend interface {
|
| 34 | 34 |
type Client interface {
|
| 35 | 35 |
Create(containerID string, spec Spec, options ...CreateOption) error |
| 36 | 36 |
Signal(containerID string, sig int) error |
| 37 |
+ SignalProcess(containerID string, processFriendlyName string, sig int) error |
|
| 37 | 38 |
AddProcess(containerID, processFriendlyName string, process Process) error |
| 38 | 39 |
Resize(containerID, processFriendlyName string, width, height int) error |
| 39 | 40 |
Pause(containerID string) error |
| ... | ... |
@@ -100,6 +100,12 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host |
| 100 | 100 |
flStopSignal = cmd.String([]string{"-stop-signal"}, signal.DefaultStopSignal, fmt.Sprintf("Signal to stop a container, %v by default", signal.DefaultStopSignal))
|
| 101 | 101 |
flIsolation = cmd.String([]string{"-isolation"}, "", "Container isolation technology")
|
| 102 | 102 |
flShmSize = cmd.String([]string{"-shm-size"}, "", "Size of /dev/shm, default value is 64MB")
|
| 103 |
+ // Healthcheck |
|
| 104 |
+ flNoHealthcheck = cmd.Bool([]string{"-no-healthcheck"}, false, "Disable any container-specified HEALTHCHECK")
|
|
| 105 |
+ flHealthCmd = cmd.String([]string{"-health-cmd"}, "", "Command to run to check health")
|
|
| 106 |
+ flHealthInterval = cmd.Duration([]string{"-health-interval"}, 0, "Time between running the check")
|
|
| 107 |
+ flHealthTimeout = cmd.Duration([]string{"-health-timeout"}, 0, "Maximum time to allow one check to run")
|
|
| 108 |
+ flHealthRetries = cmd.Int([]string{"-health-retries"}, 0, "Consecutive failures needed to report unhealthy")
|
|
| 103 | 109 |
) |
| 104 | 110 |
|
| 105 | 111 |
cmd.Var(&flAttach, []string{"a", "-attach"}, "Attach to STDIN, STDOUT or STDERR")
|
| ... | ... |
@@ -351,6 +357,39 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host |
| 351 | 351 |
return nil, nil, nil, cmd, err |
| 352 | 352 |
} |
| 353 | 353 |
|
| 354 |
+ // Healthcheck |
|
| 355 |
+ var healthConfig *container.HealthConfig |
|
| 356 |
+ haveHealthSettings := *flHealthCmd != "" || |
|
| 357 |
+ *flHealthInterval != 0 || |
|
| 358 |
+ *flHealthTimeout != 0 || |
|
| 359 |
+ *flHealthRetries != 0 |
|
| 360 |
+ if *flNoHealthcheck {
|
|
| 361 |
+ if haveHealthSettings {
|
|
| 362 |
+ return nil, nil, nil, cmd, fmt.Errorf("--no-healthcheck conflicts with --health-* options")
|
|
| 363 |
+ } |
|
| 364 |
+ test := strslice.StrSlice{"NONE"}
|
|
| 365 |
+ healthConfig = &container.HealthConfig{Test: test}
|
|
| 366 |
+ } else if haveHealthSettings {
|
|
| 367 |
+ var probe strslice.StrSlice |
|
| 368 |
+ if *flHealthCmd != "" {
|
|
| 369 |
+ args := []string{"CMD-SHELL", *flHealthCmd}
|
|
| 370 |
+ probe = strslice.StrSlice(args) |
|
| 371 |
+ } |
|
| 372 |
+ if *flHealthInterval < 0 {
|
|
| 373 |
+ return nil, nil, nil, cmd, fmt.Errorf("--health-interval cannot be negative")
|
|
| 374 |
+ } |
|
| 375 |
+ if *flHealthTimeout < 0 {
|
|
| 376 |
+ return nil, nil, nil, cmd, fmt.Errorf("--health-timeout cannot be negative")
|
|
| 377 |
+ } |
|
| 378 |
+ |
|
| 379 |
+ healthConfig = &container.HealthConfig{
|
|
| 380 |
+ Test: probe, |
|
| 381 |
+ Interval: *flHealthInterval, |
|
| 382 |
+ Timeout: *flHealthTimeout, |
|
| 383 |
+ Retries: *flHealthRetries, |
|
| 384 |
+ } |
|
| 385 |
+ } |
|
| 386 |
+ |
|
| 354 | 387 |
resources := container.Resources{
|
| 355 | 388 |
CgroupParent: *flCgroupParent, |
| 356 | 389 |
Memory: flMemory, |
| ... | ... |
@@ -399,6 +438,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host |
| 399 | 399 |
Entrypoint: entrypoint, |
| 400 | 400 |
WorkingDir: *flWorkingDir, |
| 401 | 401 |
Labels: ConvertKVStringsToMap(labels), |
| 402 |
+ Healthcheck: healthConfig, |
|
| 402 | 403 |
} |
| 403 | 404 |
if cmd.IsSet("-stop-signal") {
|
| 404 | 405 |
config.StopSignal = *flStopSignal |
| ... | ... |
@@ -9,6 +9,7 @@ import ( |
| 9 | 9 |
"runtime" |
| 10 | 10 |
"strings" |
| 11 | 11 |
"testing" |
| 12 |
+ "time" |
|
| 12 | 13 |
|
| 13 | 14 |
flag "github.com/docker/docker/pkg/mflag" |
| 14 | 15 |
"github.com/docker/docker/runconfig" |
| ... | ... |
@@ -584,6 +585,45 @@ func TestParseRestartPolicy(t *testing.T) {
|
| 584 | 584 |
} |
| 585 | 585 |
} |
| 586 | 586 |
|
| 587 |
+func TestParseHealth(t *testing.T) {
|
|
| 588 |
+ checkOk := func(args ...string) *container.HealthConfig {
|
|
| 589 |
+ config, _, _, _, err := parseRun(args) |
|
| 590 |
+ if err != nil {
|
|
| 591 |
+ t.Fatalf("%#v: %v", args, err)
|
|
| 592 |
+ } |
|
| 593 |
+ return config.Healthcheck |
|
| 594 |
+ } |
|
| 595 |
+ checkError := func(expected string, args ...string) {
|
|
| 596 |
+ config, _, _, _, err := parseRun(args) |
|
| 597 |
+ if err == nil {
|
|
| 598 |
+ t.Fatalf("Expected error, but got %#v", config)
|
|
| 599 |
+ } |
|
| 600 |
+ if err.Error() != expected {
|
|
| 601 |
+ t.Fatalf("Expected %#v, got %#v", expected, err)
|
|
| 602 |
+ } |
|
| 603 |
+ } |
|
| 604 |
+ health := checkOk("--no-healthcheck", "img", "cmd")
|
|
| 605 |
+ if health == nil || len(health.Test) != 1 || health.Test[0] != "NONE" {
|
|
| 606 |
+ t.Fatalf("--no-healthcheck failed: %#v", health)
|
|
| 607 |
+ } |
|
| 608 |
+ |
|
| 609 |
+ health = checkOk("--health-cmd=/check.sh -q", "img", "cmd")
|
|
| 610 |
+ if len(health.Test) != 2 || health.Test[0] != "CMD-SHELL" || health.Test[1] != "/check.sh -q" {
|
|
| 611 |
+ t.Fatalf("--health-cmd: got %#v", health.Test)
|
|
| 612 |
+ } |
|
| 613 |
+ if health.Timeout != 0 {
|
|
| 614 |
+ t.Fatalf("--health-cmd: timeout = %f", health.Timeout)
|
|
| 615 |
+ } |
|
| 616 |
+ |
|
| 617 |
+ checkError("--no-healthcheck conflicts with --health-* options",
|
|
| 618 |
+ "--no-healthcheck", "--health-cmd=/check.sh -q", "img", "cmd") |
|
| 619 |
+ |
|
| 620 |
+ health = checkOk("--health-timeout=2s", "--health-retries=3", "--health-interval=4.5s", "img", "cmd")
|
|
| 621 |
+ if health.Timeout != 2*time.Second || health.Retries != 3 || health.Interval != 4500*time.Millisecond {
|
|
| 622 |
+ t.Fatalf("--health-*: got %#v", health)
|
|
| 623 |
+ } |
|
| 624 |
+} |
|
| 625 |
+ |
|
| 587 | 626 |
func TestParseLoggingOpts(t *testing.T) {
|
| 588 | 627 |
// logging opts ko |
| 589 | 628 |
if _, _, _, _, err := parseRun([]string{"--log-driver=none", "--log-opt=anything", "img", "cmd"}); err == nil || err.Error() != "invalid logging opts for driver none" {
|