This adds an additional interval to be used by healthchecks during the
start period.
Typically when a container is just starting you want to check if it is
ready more quickly than a typical healthcheck might run. Without this
users have to balance between running healthchecks too frequently vs
taking a very long time to mark a container as healthy for the first
time.
Signed-off-by: Brian Goff <cpuguy83@gmail.com>
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
| ... | ... |
@@ -541,6 +541,14 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo |
| 541 | 541 |
bo.CreateMountpoint = false |
| 542 | 542 |
} |
| 543 | 543 |
} |
| 544 |
+ |
|
| 545 |
+ } |
|
| 546 |
+ |
|
| 547 |
+ if hostConfig != nil && versions.LessThan(version, "1.44") {
|
|
| 548 |
+ if config.Healthcheck != nil {
|
|
| 549 |
+ // StartInterval was added in API 1.44 |
|
| 550 |
+ config.Healthcheck.StartInterval = 0 |
|
| 551 |
+ } |
|
| 544 | 552 |
} |
| 545 | 553 |
|
| 546 | 554 |
if hostConfig != nil && versions.GreaterThanOrEqualTo(version, "1.42") {
|
| ... | ... |
@@ -804,6 +804,12 @@ definitions: |
| 804 | 804 |
1000000 (1 ms). 0 means inherit. |
| 805 | 805 |
type: "integer" |
| 806 | 806 |
format: "int64" |
| 807 |
+ StartInterval: |
|
| 808 |
+ description: | |
|
| 809 |
+ The time to wait between checks in nanoseconds during the start period. |
|
| 810 |
+ It should be 0 or at least 1000000 (1 ms). 0 means inherit. |
|
| 811 |
+ type: "integer" |
|
| 812 |
+ format: "int64" |
|
| 807 | 813 |
|
| 808 | 814 |
Health: |
| 809 | 815 |
description: | |
| ... | ... |
@@ -44,9 +44,10 @@ type HealthConfig struct {
|
| 44 | 44 |
Test []string `json:",omitempty"` |
| 45 | 45 |
|
| 46 | 46 |
// Zero means to inherit. Durations are expressed as integer nanoseconds. |
| 47 |
- Interval time.Duration `json:",omitempty"` // Interval is the time to wait between checks. |
|
| 48 |
- Timeout time.Duration `json:",omitempty"` // Timeout is the time to wait before considering the check to have hung. |
|
| 49 |
- StartPeriod time.Duration `json:",omitempty"` // The start period for the container to initialize before the retries starts to count down. |
|
| 47 |
+ Interval time.Duration `json:",omitempty"` // Interval is the time to wait between checks. |
|
| 48 |
+ Timeout time.Duration `json:",omitempty"` // Timeout is the time to wait before considering the check to have hung. |
|
| 49 |
+ StartPeriod time.Duration `json:",omitempty"` // The start period for the container to initialize before the retries starts to count down. |
|
| 50 |
+ StartInterval time.Duration `json:",omitempty"` // The interval to attempt healthchecks at during the start period |
|
| 50 | 51 |
|
| 51 | 52 |
// Retries is the number of consecutive failures needed to consider a container as unhealthy. |
| 52 | 53 |
// Zero means inherit. |
| ... | ... |
@@ -29,6 +29,9 @@ func (cli *Client) ContainerCreate(ctx context.Context, config *container.Config |
| 29 | 29 |
if err := cli.NewVersionError("1.41", "specify container image platform"); platform != nil && err != nil {
|
| 30 | 30 |
return response, err |
| 31 | 31 |
} |
| 32 |
+ if err := cli.NewVersionError("1.44", "specify health-check start interval"); config != nil && config.Healthcheck != nil && config.Healthcheck.StartInterval != 0 && err != nil {
|
|
| 33 |
+ return response, err |
|
| 34 |
+ } |
|
| 32 | 35 |
|
| 33 | 36 |
if hostConfig != nil {
|
| 34 | 37 |
if versions.LessThan(cli.ClientVersion(), "1.25") {
|
| ... | ... |
@@ -92,6 +92,9 @@ func merge(userConf, imageConf *containertypes.Config) error {
|
| 92 | 92 |
if userConf.Healthcheck.StartPeriod == 0 {
|
| 93 | 93 |
userConf.Healthcheck.StartPeriod = imageConf.Healthcheck.StartPeriod |
| 94 | 94 |
} |
| 95 |
+ if userConf.Healthcheck.StartInterval == 0 {
|
|
| 96 |
+ userConf.Healthcheck.StartInterval = imageConf.Healthcheck.StartInterval |
|
| 97 |
+ } |
|
| 95 | 98 |
if userConf.Healthcheck.Retries == 0 {
|
| 96 | 99 |
userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries |
| 97 | 100 |
} |
| ... | ... |
@@ -248,13 +248,31 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch |
| 248 | 248 |
// There is never more than one monitor thread running per container at a time. |
| 249 | 249 |
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
|
| 250 | 250 |
probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) |
| 251 |
+ startInterval := timeoutWithDefault(c.Config.Healthcheck.StartInterval, defaultProbeInterval) |
|
| 252 |
+ startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod) |
|
| 251 | 253 |
|
| 252 |
- intervalTimer := time.NewTimer(probeInterval) |
|
| 254 |
+ c.Lock() |
|
| 255 |
+ started := c.State.StartedAt |
|
| 256 |
+ c.Unlock() |
|
| 257 |
+ |
|
| 258 |
+ getInterval := func() time.Duration {
|
|
| 259 |
+ if time.Since(started) >= startPeriod {
|
|
| 260 |
+ return probeInterval |
|
| 261 |
+ } |
|
| 262 |
+ c.Lock() |
|
| 263 |
+ status := c.Health.Health.Status |
|
| 264 |
+ c.Unlock() |
|
| 265 |
+ |
|
| 266 |
+ if status == types.Starting {
|
|
| 267 |
+ return startInterval |
|
| 268 |
+ } |
|
| 269 |
+ return probeInterval |
|
| 270 |
+ } |
|
| 271 |
+ |
|
| 272 |
+ intervalTimer := time.NewTimer(getInterval()) |
|
| 253 | 273 |
defer intervalTimer.Stop() |
| 254 | 274 |
|
| 255 | 275 |
for {
|
| 256 |
- intervalTimer.Reset(probeInterval) |
|
| 257 |
- |
|
| 258 | 276 |
select {
|
| 259 | 277 |
case <-stop: |
| 260 | 278 |
log.G(context.TODO()).Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
|
| ... | ... |
@@ -296,6 +314,7 @@ func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe)
|
| 296 | 296 |
cancelProbe() |
| 297 | 297 |
} |
| 298 | 298 |
} |
| 299 |
+ intervalTimer.Reset(getInterval()) |
|
| 299 | 300 |
} |
| 300 | 301 |
} |
| 301 | 302 |
|
| ... | ... |
@@ -24,6 +24,8 @@ keywords: "API, Docker, rcli, REST, documentation" |
| 24 | 24 |
with runtimes which support the feature. |
| 25 | 25 |
`POST /containers/create`, `GET /containers/{id}/json`, and `GET /containers/json` now supports
|
| 26 | 26 |
`BindOptions.ReadOnlyNonRecursive` and `BindOptions.ReadOnlyForceRecursive` to customize the behavior. |
| 27 |
+* `POST /containers/create` now accepts a `HealthConfig.StartInterval` to set the |
|
| 28 |
+ interval for health checks during the start period. |
|
| 27 | 29 |
|
| 28 | 30 |
## v1.43 API changes |
| 29 | 31 |
|
| ... | ... |
@@ -111,6 +111,70 @@ func TestHealthCheckProcessKilled(t *testing.T) {
|
| 111 | 111 |
poll.WaitOn(t, pollForHealthCheckLog(ctx, apiClient, cID, "Health check exceeded timeout (50ms): logs logs logs\n")) |
| 112 | 112 |
} |
| 113 | 113 |
|
| 114 |
+func TestHealthStartInterval(t *testing.T) {
|
|
| 115 |
+ skip.If(t, testEnv.DaemonInfo.OSType == "windows", "The shell commands used in the test healthcheck do not work on Windows") |
|
| 116 |
+ defer setupTest(t)() |
|
| 117 |
+ ctx := context.Background() |
|
| 118 |
+ client := testEnv.APIClient() |
|
| 119 |
+ |
|
| 120 |
+ // Note: Windows is much slower than Linux, so this uses longer intervals/timeouts |
|
| 121 |
+ id := container.Run(ctx, t, client, func(c *container.TestContainerConfig) {
|
|
| 122 |
+ c.Config.Healthcheck = &containertypes.HealthConfig{
|
|
| 123 |
+ Test: []string{"CMD-SHELL", `count="$(cat /tmp/health)"; if [ -z "${count}" ]; then let count=0; fi; let count=${count}+1; echo -n ${count} | tee /tmp/health; if [ ${count} -lt 3 ]; then exit 1; fi`},
|
|
| 124 |
+ Interval: 30 * time.Second, |
|
| 125 |
+ StartInterval: time.Second, |
|
| 126 |
+ StartPeriod: 30 * time.Second, |
|
| 127 |
+ } |
|
| 128 |
+ }) |
|
| 129 |
+ |
|
| 130 |
+ ctxPoll, cancel := context.WithTimeout(ctx, 30*time.Second) |
|
| 131 |
+ defer cancel() |
|
| 132 |
+ |
|
| 133 |
+ dl, _ := ctxPoll.Deadline() |
|
| 134 |
+ |
|
| 135 |
+ poll.WaitOn(t, func(log poll.LogT) poll.Result {
|
|
| 136 |
+ if ctxPoll.Err() != nil {
|
|
| 137 |
+ return poll.Error(ctxPoll.Err()) |
|
| 138 |
+ } |
|
| 139 |
+ inspect, err := client.ContainerInspect(ctxPoll, id) |
|
| 140 |
+ if err != nil {
|
|
| 141 |
+ return poll.Error(err) |
|
| 142 |
+ } |
|
| 143 |
+ if inspect.State.Health.Status != "healthy" {
|
|
| 144 |
+ if len(inspect.State.Health.Log) > 0 {
|
|
| 145 |
+ t.Log(inspect.State.Health.Log[len(inspect.State.Health.Log)-1]) |
|
| 146 |
+ } |
|
| 147 |
+ return poll.Continue("waiting on container to be ready")
|
|
| 148 |
+ } |
|
| 149 |
+ return poll.Success() |
|
| 150 |
+ }, poll.WithDelay(100*time.Millisecond), poll.WithTimeout(time.Until(dl))) |
|
| 151 |
+ cancel() |
|
| 152 |
+ |
|
| 153 |
+ ctxPoll, cancel = context.WithTimeout(ctx, 2*time.Minute) |
|
| 154 |
+ defer cancel() |
|
| 155 |
+ dl, _ = ctxPoll.Deadline() |
|
| 156 |
+ |
|
| 157 |
+ poll.WaitOn(t, func(log poll.LogT) poll.Result {
|
|
| 158 |
+ inspect, err := client.ContainerInspect(ctxPoll, id) |
|
| 159 |
+ if err != nil {
|
|
| 160 |
+ return poll.Error(err) |
|
| 161 |
+ } |
|
| 162 |
+ |
|
| 163 |
+ hLen := len(inspect.State.Health.Log) |
|
| 164 |
+ if hLen < 2 {
|
|
| 165 |
+ return poll.Continue("waiting for more healthcheck results")
|
|
| 166 |
+ } |
|
| 167 |
+ |
|
| 168 |
+ h1 := inspect.State.Health.Log[hLen-1] |
|
| 169 |
+ h2 := inspect.State.Health.Log[hLen-2] |
|
| 170 |
+ if h1.Start.Sub(h2.Start) >= inspect.Config.Healthcheck.Interval {
|
|
| 171 |
+ return poll.Success() |
|
| 172 |
+ } |
|
| 173 |
+ t.Log(h1.Start.Sub(h2.Start)) |
|
| 174 |
+ return poll.Continue("waiting for health check interval to switch from the start interval")
|
|
| 175 |
+ }, poll.WithDelay(time.Second), poll.WithTimeout(time.Until(dl))) |
|
| 176 |
+} |
|
| 177 |
+ |
|
| 114 | 178 |
func pollForHealthCheckLog(ctx context.Context, client client.APIClient, containerID string, expected string) func(log poll.LogT) poll.Result {
|
| 115 | 179 |
return func(log poll.LogT) poll.Result {
|
| 116 | 180 |
inspect, err := client.ContainerInspect(ctx, containerID) |