Adds a test case for the case where dockerd gets stuck on startup due to
hanging `daemon.shutdownContainer`
Signed-off-by: Brian Goff <cpuguy83@gmail.com>
| ... | ... |
@@ -330,10 +330,15 @@ func (daemon *Daemon) restore() error {
|
| 330 | 330 |
|
| 331 | 331 |
daemon.setStateCounter(c) |
| 332 | 332 |
|
| 333 |
- log.WithFields(logrus.Fields{
|
|
| 334 |
- "running": c.IsRunning(), |
|
| 335 |
- "paused": c.IsPaused(), |
|
| 336 |
- }).Debug("restoring container")
|
|
| 333 |
+ logger := func(c *container.Container) *logrus.Entry {
|
|
| 334 |
+ return log.WithFields(logrus.Fields{
|
|
| 335 |
+ "running": c.IsRunning(), |
|
| 336 |
+ "paused": c.IsPaused(), |
|
| 337 |
+ "restarting": c.IsRestarting(), |
|
| 338 |
+ }) |
|
| 339 |
+ } |
|
| 340 |
+ |
|
| 341 |
+ logger(c).Debug("restoring container")
|
|
| 337 | 342 |
|
| 338 | 343 |
var ( |
| 339 | 344 |
err error |
| ... | ... |
@@ -345,17 +350,24 @@ func (daemon *Daemon) restore() error {
|
| 345 | 345 |
|
| 346 | 346 |
alive, _, process, err = daemon.containerd.Restore(context.Background(), c.ID, c.InitializeStdio) |
| 347 | 347 |
if err != nil && !errdefs.IsNotFound(err) {
|
| 348 |
- log.WithError(err).Error("failed to restore container with containerd")
|
|
| 348 |
+ logger(c).WithError(err).Error("failed to restore container with containerd")
|
|
| 349 | 349 |
return |
| 350 | 350 |
} |
| 351 |
- log.Debugf("alive: %v", alive)
|
|
| 352 |
- if !alive && process != nil {
|
|
| 353 |
- ec, exitedAt, err = process.Delete(context.Background()) |
|
| 354 |
- if err != nil && !errdefs.IsNotFound(err) {
|
|
| 355 |
- log.WithError(err).Error("failed to delete container from containerd")
|
|
| 356 |
- return |
|
| 351 |
+ logger(c).Debugf("alive: %v", alive)
|
|
| 352 |
+ if !alive {
|
|
| 353 |
+ // If process is not nil, cleanup dead container from containerd. |
|
| 354 |
+ // If process is nil then the above `containerd.Restore` returned an errdefs.NotFoundError, |
|
| 355 |
+ // and docker's view of the container state will be updated accorrdingly via SetStopped further down. |
|
| 356 |
+ if process != nil {
|
|
| 357 |
+ logger(c).Debug("cleaning up dead container process")
|
|
| 358 |
+ ec, exitedAt, err = process.Delete(context.Background()) |
|
| 359 |
+ if err != nil && !errdefs.IsNotFound(err) {
|
|
| 360 |
+ logger(c).WithError(err).Error("failed to delete container from containerd")
|
|
| 361 |
+ return |
|
| 362 |
+ } |
|
| 357 | 363 |
} |
| 358 |
- } else if alive && !daemon.configStore.LiveRestoreEnabled {
|
|
| 364 |
+ } else if !daemon.configStore.LiveRestoreEnabled {
|
|
| 365 |
+ logger(c).Debug("shutting down container considered alive by containerd")
|
|
| 359 | 366 |
if err := daemon.shutdownContainer(c); err != nil && !errdefs.IsNotFound(err) {
|
| 360 | 367 |
log.WithError(err).Error("error shutting down container")
|
| 361 | 368 |
return |
| ... | ... |
@@ -364,14 +376,16 @@ func (daemon *Daemon) restore() error {
|
| 364 | 364 |
} |
| 365 | 365 |
|
| 366 | 366 |
if c.IsRunning() || c.IsPaused() {
|
| 367 |
+ logger(c).Debug("syncing container on disk state with real state")
|
|
| 368 |
+ |
|
| 367 | 369 |
c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking |
| 368 | 370 |
|
| 369 | 371 |
if c.IsPaused() && alive {
|
| 370 | 372 |
s, err := daemon.containerd.Status(context.Background(), c.ID) |
| 371 | 373 |
if err != nil {
|
| 372 |
- log.WithError(err).Error("failed to get container status")
|
|
| 374 |
+ logger(c).WithError(err).Error("failed to get container status")
|
|
| 373 | 375 |
} else {
|
| 374 |
- log.WithField("state", s).Info("restored container paused")
|
|
| 376 |
+ logger(c).WithField("state", s).Info("restored container paused")
|
|
| 375 | 377 |
switch s {
|
| 376 | 378 |
case containerd.Paused, containerd.Pausing: |
| 377 | 379 |
// nothing to do |
| ... | ... |
@@ -393,14 +407,15 @@ func (daemon *Daemon) restore() error {
|
| 393 | 393 |
} |
| 394 | 394 |
|
| 395 | 395 |
if !alive {
|
| 396 |
+ logger(c).Debug("setting stopped state")
|
|
| 396 | 397 |
c.Lock() |
| 397 |
- log.Debug("setting stopped state")
|
|
| 398 | 398 |
c.SetStopped(&container.ExitStatus{ExitCode: int(ec), ExitedAt: exitedAt})
|
| 399 | 399 |
daemon.Cleanup(c) |
| 400 | 400 |
if err := c.CheckpointTo(daemon.containersReplica); err != nil {
|
| 401 | 401 |
log.WithError(err).Error("failed to update stopped container state")
|
| 402 | 402 |
} |
| 403 | 403 |
c.Unlock() |
| 404 |
+ logger(c).Debug("set stopped state")
|
|
| 404 | 405 |
} |
| 405 | 406 |
|
| 406 | 407 |
// we call Mount and then Unmount to get BaseFs of the container |
| ... | ... |
@@ -411,10 +426,10 @@ func (daemon *Daemon) restore() error {
|
| 411 | 411 |
// stopped/restarted/removed. |
| 412 | 412 |
// See #29365 for related information. |
| 413 | 413 |
// The error is only logged here. |
| 414 |
- log.WithError(err).Warn("failed to mount container to get BaseFs path")
|
|
| 414 |
+ logger(c).WithError(err).Warn("failed to mount container to get BaseFs path")
|
|
| 415 | 415 |
} else {
|
| 416 | 416 |
if err := daemon.Unmount(c); err != nil {
|
| 417 |
- log.WithError(err).Warn("failed to umount container to get BaseFs path")
|
|
| 417 |
+ logger(c).WithError(err).Warn("failed to umount container to get BaseFs path")
|
|
| 418 | 418 |
} |
| 419 | 419 |
} |
| 420 | 420 |
|
| ... | ... |
@@ -422,7 +437,7 @@ func (daemon *Daemon) restore() error {
|
| 422 | 422 |
if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() {
|
| 423 | 423 |
options, err := daemon.buildSandboxOptions(c) |
| 424 | 424 |
if err != nil {
|
| 425 |
- log.WithError(err).Warn("failed to build sandbox option to restore container")
|
|
| 425 |
+ logger(c).WithError(err).Warn("failed to build sandbox option to restore container")
|
|
| 426 | 426 |
} |
| 427 | 427 |
mapLock.Lock() |
| 428 | 428 |
activeSandboxes[c.NetworkSettings.SandboxID] = options |
| ... | ... |
@@ -467,6 +482,7 @@ func (daemon *Daemon) restore() error {
|
| 467 | 467 |
} |
| 468 | 468 |
} |
| 469 | 469 |
c.Unlock() |
| 470 |
+ logger(c).Debug("done restoring container")
|
|
| 470 | 471 |
}(c) |
| 471 | 472 |
} |
| 472 | 473 |
group.Wait() |
| ... | ... |
@@ -2,13 +2,18 @@ package container // import "github.com/docker/docker/integration/container" |
| 2 | 2 |
|
| 3 | 3 |
import ( |
| 4 | 4 |
"context" |
| 5 |
+ "encoding/json" |
|
| 5 | 6 |
"fmt" |
| 6 | 7 |
"io/ioutil" |
| 8 |
+ "path/filepath" |
|
| 7 | 9 |
"strconv" |
| 8 | 10 |
"strings" |
| 9 | 11 |
"testing" |
| 12 |
+ "time" |
|
| 10 | 13 |
|
| 11 | 14 |
"github.com/docker/docker/api/types" |
| 15 |
+ containerapi "github.com/docker/docker/api/types/container" |
|
| 16 |
+ realcontainer "github.com/docker/docker/container" |
|
| 12 | 17 |
"github.com/docker/docker/integration/internal/container" |
| 13 | 18 |
"github.com/docker/docker/testutil/daemon" |
| 14 | 19 |
"golang.org/x/sys/unix" |
| ... | ... |
@@ -166,3 +171,65 @@ func TestDaemonHostGatewayIP(t *testing.T) {
|
| 166 | 166 |
d.Stop(t) |
| 167 | 167 |
|
| 168 | 168 |
} |
| 169 |
+ |
|
| 170 |
+// TestRestartDaemonWithRestartingContainer simulates a case where a container is in "restarting" state when |
|
| 171 |
+// dockerd is killed (due to machine reset or something else). |
|
| 172 |
+// |
|
| 173 |
+// Related to moby/moby#41817 |
|
| 174 |
+// |
|
| 175 |
+// In this test we'll change the container state to "restarting". |
|
| 176 |
+// This means that the container will not be 'alive' when we attempt to restore in on daemon startup. |
|
| 177 |
+// |
|
| 178 |
+// We could do the same with `docker run -d --resetart=always busybox:latest exit 1`, and then |
|
| 179 |
+// `kill -9` dockerd while the container is in "restarting" state. This is difficult to reproduce reliably |
|
| 180 |
+// in an automated test, so we manipulate on disk state instead. |
|
| 181 |
+func TestRestartDaemonWithRestartingContainer(t *testing.T) {
|
|
| 182 |
+ skip.If(t, testEnv.IsRemoteDaemon, "cannot start daemon on remote test run") |
|
| 183 |
+ skip.If(t, testEnv.DaemonInfo.OSType == "windows") |
|
| 184 |
+ |
|
| 185 |
+ t.Parallel() |
|
| 186 |
+ |
|
| 187 |
+ d := daemon.New(t) |
|
| 188 |
+ defer d.Cleanup(t) |
|
| 189 |
+ |
|
| 190 |
+ d.StartWithBusybox(t, "--iptables=false") |
|
| 191 |
+ defer d.Kill() |
|
| 192 |
+ |
|
| 193 |
+ ctx := context.Background() |
|
| 194 |
+ client := d.NewClientT(t) |
|
| 195 |
+ |
|
| 196 |
+ // Just create the container, no need to start it to be started. |
|
| 197 |
+ // We really want to make sure there is no process running when docker starts back up. |
|
| 198 |
+ // We will manipulate the on disk state later |
|
| 199 |
+ id := container.Create(ctx, t, client, container.WithRestartPolicy("always"), container.WithCmd("/bin/sh", "-c", "exit 1"))
|
|
| 200 |
+ |
|
| 201 |
+ // SIGKILL the daemon |
|
| 202 |
+ assert.NilError(t, d.Kill()) |
|
| 203 |
+ |
|
| 204 |
+ configPath := filepath.Join(d.Root, "containers", id, "config.v2.json") |
|
| 205 |
+ configBytes, err := ioutil.ReadFile(configPath) |
|
| 206 |
+ assert.NilError(t, err) |
|
| 207 |
+ |
|
| 208 |
+ var c realcontainer.Container |
|
| 209 |
+ |
|
| 210 |
+ assert.NilError(t, json.Unmarshal(configBytes, &c)) |
|
| 211 |
+ |
|
| 212 |
+ c.State = realcontainer.NewState() |
|
| 213 |
+ c.SetRestarting(&realcontainer.ExitStatus{ExitCode: 1})
|
|
| 214 |
+ c.HasBeenStartedBefore = true |
|
| 215 |
+ |
|
| 216 |
+ configBytes, err = json.Marshal(&c) |
|
| 217 |
+ assert.NilError(t, err) |
|
| 218 |
+ assert.NilError(t, ioutil.WriteFile(configPath, configBytes, 0600)) |
|
| 219 |
+ |
|
| 220 |
+ d.Start(t) |
|
| 221 |
+ |
|
| 222 |
+ ctxTimeout, cancel := context.WithTimeout(ctx, 30*time.Second) |
|
| 223 |
+ defer cancel() |
|
| 224 |
+ chOk, chErr := client.ContainerWait(ctxTimeout, id, containerapi.WaitConditionNextExit) |
|
| 225 |
+ select {
|
|
| 226 |
+ case <-chOk: |
|
| 227 |
+ case err := <-chErr: |
|
| 228 |
+ assert.NilError(t, err) |
|
| 229 |
+ } |
|
| 230 |
+} |
| ... | ... |
@@ -288,6 +288,7 @@ func (d *Daemon) Cleanup(t testing.TB) {
|
| 288 | 288 |
func (d *Daemon) Start(t testing.TB, args ...string) {
|
| 289 | 289 |
t.Helper() |
| 290 | 290 |
if err := d.StartWithError(args...); err != nil {
|
| 291 |
+ d.DumpStackAndQuit() // in case the daemon is stuck |
|
| 291 | 292 |
t.Fatalf("[%s] failed to start daemon with arguments %v : %v", d.id, d.args, err)
|
| 292 | 293 |
} |
| 293 | 294 |
} |