Browse code

Cleanup container shutdown check and add test

Adds a test case for the case where dockerd gets stuck on startup due to
hanging `daemon.shutdownContainer`

Signed-off-by: Brian Goff <cpuguy83@gmail.com>

Brian Goff authored on 2020/12/20 07:04:06
Showing 3 changed files
... ...
@@ -330,10 +330,15 @@ func (daemon *Daemon) restore() error {
330 330
 
331 331
 			daemon.setStateCounter(c)
332 332
 
333
-			log.WithFields(logrus.Fields{
334
-				"running": c.IsRunning(),
335
-				"paused":  c.IsPaused(),
336
-			}).Debug("restoring container")
333
+			logger := func(c *container.Container) *logrus.Entry {
334
+				return log.WithFields(logrus.Fields{
335
+					"running":    c.IsRunning(),
336
+					"paused":     c.IsPaused(),
337
+					"restarting": c.IsRestarting(),
338
+				})
339
+			}
340
+
341
+			logger(c).Debug("restoring container")
337 342
 
338 343
 			var (
339 344
 				err      error
... ...
@@ -345,17 +350,24 @@ func (daemon *Daemon) restore() error {
345 345
 
346 346
 			alive, _, process, err = daemon.containerd.Restore(context.Background(), c.ID, c.InitializeStdio)
347 347
 			if err != nil && !errdefs.IsNotFound(err) {
348
-				log.WithError(err).Error("failed to restore container with containerd")
348
+				logger(c).WithError(err).Error("failed to restore container with containerd")
349 349
 				return
350 350
 			}
351
-			log.Debugf("alive: %v", alive)
352
-			if !alive && process != nil {
353
-				ec, exitedAt, err = process.Delete(context.Background())
354
-				if err != nil && !errdefs.IsNotFound(err) {
355
-					log.WithError(err).Error("failed to delete container from containerd")
356
-					return
351
+			logger(c).Debugf("alive: %v", alive)
352
+			if !alive {
353
+				// If process is not nil, cleanup dead container from containerd.
354
+				// If process is nil then the above `containerd.Restore` returned an errdefs.NotFoundError,
355
+				// and docker's view of the container state will be updated accorrdingly via SetStopped further down.
356
+				if process != nil {
357
+					logger(c).Debug("cleaning up dead container process")
358
+					ec, exitedAt, err = process.Delete(context.Background())
359
+					if err != nil && !errdefs.IsNotFound(err) {
360
+						logger(c).WithError(err).Error("failed to delete container from containerd")
361
+						return
362
+					}
357 363
 				}
358
-			} else if alive && !daemon.configStore.LiveRestoreEnabled {
364
+			} else if !daemon.configStore.LiveRestoreEnabled {
365
+				logger(c).Debug("shutting down container considered alive by containerd")
359 366
 				if err := daemon.shutdownContainer(c); err != nil && !errdefs.IsNotFound(err) {
360 367
 					log.WithError(err).Error("error shutting down container")
361 368
 					return
... ...
@@ -364,14 +376,16 @@ func (daemon *Daemon) restore() error {
364 364
 			}
365 365
 
366 366
 			if c.IsRunning() || c.IsPaused() {
367
+				logger(c).Debug("syncing container on disk state with real state")
368
+
367 369
 				c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking
368 370
 
369 371
 				if c.IsPaused() && alive {
370 372
 					s, err := daemon.containerd.Status(context.Background(), c.ID)
371 373
 					if err != nil {
372
-						log.WithError(err).Error("failed to get container status")
374
+						logger(c).WithError(err).Error("failed to get container status")
373 375
 					} else {
374
-						log.WithField("state", s).Info("restored container paused")
376
+						logger(c).WithField("state", s).Info("restored container paused")
375 377
 						switch s {
376 378
 						case containerd.Paused, containerd.Pausing:
377 379
 							// nothing to do
... ...
@@ -393,14 +407,15 @@ func (daemon *Daemon) restore() error {
393 393
 				}
394 394
 
395 395
 				if !alive {
396
+					logger(c).Debug("setting stopped state")
396 397
 					c.Lock()
397
-					log.Debug("setting stopped state")
398 398
 					c.SetStopped(&container.ExitStatus{ExitCode: int(ec), ExitedAt: exitedAt})
399 399
 					daemon.Cleanup(c)
400 400
 					if err := c.CheckpointTo(daemon.containersReplica); err != nil {
401 401
 						log.WithError(err).Error("failed to update stopped container state")
402 402
 					}
403 403
 					c.Unlock()
404
+					logger(c).Debug("set stopped state")
404 405
 				}
405 406
 
406 407
 				// we call Mount and then Unmount to get BaseFs of the container
... ...
@@ -411,10 +426,10 @@ func (daemon *Daemon) restore() error {
411 411
 					// stopped/restarted/removed.
412 412
 					// See #29365 for related information.
413 413
 					// The error is only logged here.
414
-					log.WithError(err).Warn("failed to mount container to get BaseFs path")
414
+					logger(c).WithError(err).Warn("failed to mount container to get BaseFs path")
415 415
 				} else {
416 416
 					if err := daemon.Unmount(c); err != nil {
417
-						log.WithError(err).Warn("failed to umount container to get BaseFs path")
417
+						logger(c).WithError(err).Warn("failed to umount container to get BaseFs path")
418 418
 					}
419 419
 				}
420 420
 
... ...
@@ -422,7 +437,7 @@ func (daemon *Daemon) restore() error {
422 422
 				if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() {
423 423
 					options, err := daemon.buildSandboxOptions(c)
424 424
 					if err != nil {
425
-						log.WithError(err).Warn("failed to build sandbox option to restore container")
425
+						logger(c).WithError(err).Warn("failed to build sandbox option to restore container")
426 426
 					}
427 427
 					mapLock.Lock()
428 428
 					activeSandboxes[c.NetworkSettings.SandboxID] = options
... ...
@@ -467,6 +482,7 @@ func (daemon *Daemon) restore() error {
467 467
 				}
468 468
 			}
469 469
 			c.Unlock()
470
+			logger(c).Debug("done restoring container")
470 471
 		}(c)
471 472
 	}
472 473
 	group.Wait()
... ...
@@ -2,13 +2,18 @@ package container // import "github.com/docker/docker/integration/container"
2 2
 
3 3
 import (
4 4
 	"context"
5
+	"encoding/json"
5 6
 	"fmt"
6 7
 	"io/ioutil"
8
+	"path/filepath"
7 9
 	"strconv"
8 10
 	"strings"
9 11
 	"testing"
12
+	"time"
10 13
 
11 14
 	"github.com/docker/docker/api/types"
15
+	containerapi "github.com/docker/docker/api/types/container"
16
+	realcontainer "github.com/docker/docker/container"
12 17
 	"github.com/docker/docker/integration/internal/container"
13 18
 	"github.com/docker/docker/testutil/daemon"
14 19
 	"golang.org/x/sys/unix"
... ...
@@ -166,3 +171,65 @@ func TestDaemonHostGatewayIP(t *testing.T) {
166 166
 	d.Stop(t)
167 167
 
168 168
 }
169
+
170
+// TestRestartDaemonWithRestartingContainer simulates a case where a container is in "restarting" state when
171
+// dockerd is killed (due to machine reset or something else).
172
+//
173
+// Related to moby/moby#41817
174
+//
175
+// In this test we'll change the container state to "restarting".
176
+// This means that the container will not be 'alive' when we attempt to restore in on daemon startup.
177
+//
178
+// We could do the same with `docker run -d --resetart=always busybox:latest exit 1`, and then
179
+// `kill -9` dockerd while the container is in "restarting" state. This is difficult to reproduce reliably
180
+// in an automated test, so we manipulate on disk state instead.
181
+func TestRestartDaemonWithRestartingContainer(t *testing.T) {
182
+	skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run")
183
+	skip.If(t, testEnv.DaemonInfo.OSType == "windows")
184
+
185
+	t.Parallel()
186
+
187
+	d := daemon.New(t)
188
+	defer d.Cleanup(t)
189
+
190
+	d.StartWithBusybox(t, "--iptables=false")
191
+	defer d.Kill()
192
+
193
+	ctx := context.Background()
194
+	client := d.NewClientT(t)
195
+
196
+	// Just create the container, no need to start it to be started.
197
+	// We really want to make sure there is no process running when docker starts back up.
198
+	// We will manipulate the on disk state later
199
+	id := container.Create(ctx, t, client, container.WithRestartPolicy("always"), container.WithCmd("/bin/sh", "-c", "exit 1"))
200
+
201
+	// SIGKILL the daemon
202
+	assert.NilError(t, d.Kill())
203
+
204
+	configPath := filepath.Join(d.Root, "containers", id, "config.v2.json")
205
+	configBytes, err := ioutil.ReadFile(configPath)
206
+	assert.NilError(t, err)
207
+
208
+	var c realcontainer.Container
209
+
210
+	assert.NilError(t, json.Unmarshal(configBytes, &c))
211
+
212
+	c.State = realcontainer.NewState()
213
+	c.SetRestarting(&realcontainer.ExitStatus{ExitCode: 1})
214
+	c.HasBeenStartedBefore = true
215
+
216
+	configBytes, err = json.Marshal(&c)
217
+	assert.NilError(t, err)
218
+	assert.NilError(t, ioutil.WriteFile(configPath, configBytes, 0600))
219
+
220
+	d.Start(t)
221
+
222
+	ctxTimeout, cancel := context.WithTimeout(ctx, 30*time.Second)
223
+	defer cancel()
224
+	chOk, chErr := client.ContainerWait(ctxTimeout, id, containerapi.WaitConditionNextExit)
225
+	select {
226
+	case <-chOk:
227
+	case err := <-chErr:
228
+		assert.NilError(t, err)
229
+	}
230
+}
... ...
@@ -288,6 +288,7 @@ func (d *Daemon) Cleanup(t testing.TB) {
288 288
 func (d *Daemon) Start(t testing.TB, args ...string) {
289 289
 	t.Helper()
290 290
 	if err := d.StartWithError(args...); err != nil {
291
+		d.DumpStackAndQuit() // in case the daemon is stuck
291 292
 		t.Fatalf("[%s] failed to start daemon with arguments %v : %v", d.id, d.args, err)
292 293
 	}
293 294
 }