Browse code

Fix race with containerd events stream on restore

Signed-off-by: Kenfe-Mickael Laventure <mickael.laventure@gmail.com>

Kenfe-Mickael Laventure authored on 2016/11/24 07:26:20
Showing 1 changed file
... ...
@@ -405,13 +405,8 @@ func (clnt *client) getContainerLastEventSinceTime(id string, tsp *timestamp.Tim
405 405
 			logrus.Errorf("libcontainerd: failed to get container event for %s: %q", id, err)
406 406
 			return nil, err
407 407
 		}
408
-
409
-		logrus.Debugf("libcontainerd: received past event %#v", e)
410
-
411
-		switch e.Type {
412
-		case StateExit, StatePause, StateResume:
413
-			ev = e
414
-		}
408
+		ev = e
409
+		logrus.Debugf("libcontainerd: received past event %#v", ev)
415 410
 	}
416 411
 
417 412
 	return ev, nil
... ...
@@ -456,30 +451,36 @@ func (clnt *client) Restore(containerID string, attachStdio StdioCallback, optio
456 456
 	// Get its last event
457 457
 	ev, eerr := clnt.getContainerLastEvent(containerID)
458 458
 	if err != nil || cont.Status == "Stopped" {
459
-		if err != nil && !strings.Contains(err.Error(), "container not found") {
460
-			// Legitimate error
461
-			return err
459
+		if err != nil {
460
+			logrus.Warnf("libcontainerd: failed to retrieve container %s state: %v", containerID, err)
462 461
 		}
463
-
464
-		if ev == nil {
465
-			if _, err := clnt.getContainer(containerID); err == nil {
466
-				// If ev is nil and the container is running in containerd,
467
-				// we already consumed all the event of the
468
-				// container, included the "exit" one.
469
-				// Thus we return to avoid overriding the Exit Code.
470
-				logrus.Warnf("libcontainerd: restore was called on a fully synced container (%s)", containerID)
471
-				return nil
472
-			}
473
-			// the container is not running so we need to fix the state within docker
474
-			ev = &containerd.Event{
475
-				Type:   StateExit,
476
-				Status: 1,
462
+		if ev != nil && ev.Pid != InitFriendlyName || ev.Type != StateExit {
463
+			// Wait a while for the exit event
464
+			timeout := time.NewTimer(10 * time.Second)
465
+			tick := time.NewTicker(100 * time.Millisecond)
466
+		stop:
467
+			for {
468
+				select {
469
+				case <-timeout.C:
470
+					break stop
471
+				case <-tick.C:
472
+					ev, eerr = clnt.getContainerLastEvent(containerID)
473
+					if eerr != nil {
474
+						break stop
475
+					}
476
+					if ev != nil && ev.Pid == InitFriendlyName && ev.Type == StateExit {
477
+						break stop
478
+					}
479
+				}
477 480
 			}
481
+			timeout.Stop()
482
+			tick.Stop()
478 483
 		}
479 484
 
480
-		// get the exit status for this container
481
-		ec := uint32(0)
482
-		if eerr == nil && ev.Type == StateExit {
485
+		// get the exit status for this container, if we don't have
486
+		// one, indicate an error
487
+		ec := uint32(255)
488
+		if eerr == nil && ev != nil && ev.Pid == InitFriendlyName && ev.Type == StateExit {
483 489
 			ec = ev.Status
484 490
 		}
485 491
 		clnt.setExited(containerID, ec)