Browse code

Do not rely on "live" event anymore

Signed-off-by: Kenfe-Mickael Laventure <mickael.laventure@gmail.com>

Kenfe-Mickael Laventure authored on 2016/07/12 00:55:39
Showing 4 changed files
... ...
@@ -178,7 +178,7 @@ func (daemon *Daemon) restore() error {
178 178
 			rm := c.RestartManager(false)
179 179
 			if c.IsRunning() || c.IsPaused() {
180 180
 				if err := daemon.containerd.Restore(c.ID, libcontainerd.WithRestartManager(rm)); err != nil {
181
-					logrus.Errorf("Failed to restore with containerd: %q", err)
181
+					logrus.Errorf("Failed to restore %s with containerd: %s", c.ID, err)
182 182
 					return
183 183
 				}
184 184
 				if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() {
... ...
@@ -7,7 +7,7 @@ import (
7 7
 	"os/exec"
8 8
 	"path/filepath"
9 9
 	"strings"
10
-	"time"
10
+	"syscall"
11 11
 
12 12
 	"github.com/docker/docker/pkg/integration/checker"
13 13
 	"github.com/go-check/check"
... ...
@@ -129,7 +129,11 @@ func (s *DockerDaemonSuite) TestDaemonShutdownWithPlugins(c *check.C) {
129 129
 		c.Fatalf("Could not kill daemon: %v", err)
130 130
 	}
131 131
 
132
-	time.Sleep(5 * time.Second)
132
+	for {
133
+		if err := syscall.Kill(s.d.cmd.Process.Pid, 0); err == syscall.ESRCH {
134
+			break
135
+		}
136
+	}
133 137
 
134 138
 	cmd := exec.Command("pgrep", "-f", "plugin-no-remove")
135 139
 	if out, ec, err := runCommandWithOutput(cmd); ec != 1 {
... ...
@@ -281,16 +281,10 @@ func (clnt *client) cleanupOldRootfs(containerID string) {
281 281
 	}
282 282
 }
283 283
 
284
-func (clnt *client) setExited(containerID string) error {
284
+func (clnt *client) setExited(containerID string, exitCode uint32) error {
285 285
 	clnt.lock(containerID)
286 286
 	defer clnt.unlock(containerID)
287 287
 
288
-	var exitCode uint32
289
-	if event, ok := clnt.remote.pastEvents[containerID]; ok {
290
-		exitCode = event.Status
291
-		delete(clnt.remote.pastEvents, containerID)
292
-	}
293
-
294 288
 	err := clnt.backend.StateChanged(containerID, StateInfo{
295 289
 		CommonStateInfo: CommonStateInfo{
296 290
 			State:    StateExit,
... ...
@@ -393,7 +387,7 @@ func (clnt *client) getOrCreateExitNotifier(containerID string) *exitNotifier {
393 393
 	return w
394 394
 }
395 395
 
396
-func (clnt *client) restore(cont *containerd.Container, options ...CreateOption) (err error) {
396
+func (clnt *client) restore(cont *containerd.Container, lastEvent *containerd.Event, options ...CreateOption) (err error) {
397 397
 	clnt.lock(cont.Id)
398 398
 	defer clnt.unlock(cont.Id)
399 399
 
... ...
@@ -441,66 +435,132 @@ func (clnt *client) restore(cont *containerd.Container, options ...CreateOption)
441 441
 		return err
442 442
 	}
443 443
 
444
-	if event, ok := clnt.remote.pastEvents[containerID]; ok {
444
+	if lastEvent != nil {
445 445
 		// This should only be a pause or resume event
446
-		if event.Type == StatePause || event.Type == StateResume {
446
+		if lastEvent.Type == StatePause || lastEvent.Type == StateResume {
447 447
 			return clnt.backend.StateChanged(containerID, StateInfo{
448 448
 				CommonStateInfo: CommonStateInfo{
449
-					State: event.Type,
449
+					State: lastEvent.Type,
450 450
 					Pid:   container.systemPid,
451 451
 				}})
452 452
 		}
453 453
 
454
-		logrus.Warnf("unexpected backlog event: %#v", event)
454
+		logrus.Warnf("unexpected backlog event: %#v", lastEvent)
455 455
 	}
456 456
 
457 457
 	return nil
458 458
 }
459 459
 
460
-func (clnt *client) Restore(containerID string, options ...CreateOption) error {
461
-	if clnt.liveRestore {
462
-		cont, err := clnt.getContainerdContainer(containerID)
463
-		if err == nil && cont.Status != "stopped" {
464
-			if err := clnt.restore(cont, options...); err != nil {
465
-				logrus.Errorf("error restoring %s: %v", containerID, err)
460
+func (clnt *client) getContainerLastEvent(containerID string) (*containerd.Event, error) {
461
+	er := &containerd.EventsRequest{
462
+		Timestamp:  clnt.remote.restoreFromTimestamp,
463
+		StoredOnly: true,
464
+		Id:         containerID,
465
+	}
466
+	events, err := clnt.remote.apiClient.Events(context.Background(), er)
467
+	if err != nil {
468
+		logrus.Errorf("libcontainerd: failed to get container events stream for %s: %q", er.Id, err)
469
+		return nil, err
470
+	}
471
+
472
+	var ev *containerd.Event
473
+	for {
474
+		e, err := events.Recv()
475
+		if err != nil {
476
+			if err.Error() == "EOF" {
477
+				break
466 478
 			}
467
-			return nil
479
+			logrus.Errorf("libcontainerd: failed to get container event for %s: %q", containerID, err)
480
+			return nil, err
481
+		}
482
+
483
+		logrus.Debugf("libcontainerd: received past event %#v", e)
484
+
485
+		switch e.Type {
486
+		case StateExit, StatePause, StateResume:
487
+			ev = e
468 488
 		}
469
-		return clnt.setExited(containerID)
470 489
 	}
471 490
 
491
+	return ev, nil
492
+}
493
+
494
+func (clnt *client) Restore(containerID string, options ...CreateOption) error {
495
+	// Synchronize with live events
496
+	clnt.remote.Lock()
497
+	defer clnt.remote.Unlock()
498
+	// Check that containerd still knows this container.
499
+	//
500
+	// In the unlikely event that Restore for this container process
501
+	// the its past event before the main loop, the event will be
502
+	// processed twice. However, this is not an issue as all those
503
+	// events will do is change the state of the container to be
504
+	// exactly the same.
472 505
 	cont, err := clnt.getContainerdContainer(containerID)
473
-	if err == nil && cont.Status != "stopped" {
474
-		w := clnt.getOrCreateExitNotifier(containerID)
475
-		clnt.lock(cont.Id)
476
-		container := clnt.newContainer(cont.BundlePath)
477
-		container.systemPid = systemPid(cont)
478
-		clnt.appendContainer(container)
479
-		clnt.unlock(cont.Id)
480
-
481
-		container.discardFifos()
482
-
483
-		if err := clnt.Signal(containerID, int(syscall.SIGTERM)); err != nil {
484
-			logrus.Errorf("error sending sigterm to %v: %v", containerID, err)
506
+	// Get its last event
507
+	ev, eerr := clnt.getContainerLastEvent(containerID)
508
+	if err != nil || cont.Status == "Stopped" {
509
+		if err != nil && !strings.Contains(err.Error(), "container not found") {
510
+			// Legitimate error
511
+			return err
512
+		}
513
+
514
+		// If ev is nil, then we already consumed all the event of the
515
+		// container, included the "exit" one.
516
+		// Thus we return to avoid overriding the Exit Code.
517
+		if ev == nil {
518
+			logrus.Warnf("libcontainerd: restore was called on a fully synced container (%s)", containerID)
519
+			return nil
520
+		}
521
+
522
+		// get the exit status for this container
523
+		ec := uint32(0)
524
+		if eerr == nil && ev.Type == StateExit {
525
+			ec = ev.Status
526
+		}
527
+		clnt.setExited(containerID, ec)
528
+
529
+		return nil
530
+	}
531
+
532
+	// container is still alive
533
+	if clnt.liveRestore {
534
+		if err := clnt.restore(cont, ev, options...); err != nil {
535
+			logrus.Errorf("error restoring %s: %v", containerID, err)
536
+		}
537
+		return nil
538
+	}
539
+
540
+	// Kill the container if liveRestore == false
541
+	w := clnt.getOrCreateExitNotifier(containerID)
542
+	clnt.lock(cont.Id)
543
+	container := clnt.newContainer(cont.BundlePath)
544
+	container.systemPid = systemPid(cont)
545
+	clnt.appendContainer(container)
546
+	clnt.unlock(cont.Id)
547
+
548
+	container.discardFifos()
549
+
550
+	if err := clnt.Signal(containerID, int(syscall.SIGTERM)); err != nil {
551
+		logrus.Errorf("error sending sigterm to %v: %v", containerID, err)
552
+	}
553
+	select {
554
+	case <-time.After(10 * time.Second):
555
+		if err := clnt.Signal(containerID, int(syscall.SIGKILL)); err != nil {
556
+			logrus.Errorf("error sending sigkill to %v: %v", containerID, err)
485 557
 		}
486 558
 		select {
487
-		case <-time.After(10 * time.Second):
488
-			if err := clnt.Signal(containerID, int(syscall.SIGKILL)); err != nil {
489
-				logrus.Errorf("error sending sigkill to %v: %v", containerID, err)
490
-			}
491
-			select {
492
-			case <-time.After(2 * time.Second):
493
-			case <-w.wait():
494
-				return nil
495
-			}
559
+		case <-time.After(2 * time.Second):
496 560
 		case <-w.wait():
497 561
 			return nil
498 562
 		}
563
+	case <-w.wait():
564
+		return nil
499 565
 	}
500 566
 
501 567
 	clnt.deleteContainer(containerID)
502 568
 
503
-	return clnt.setExited(containerID)
569
+	return clnt.setExited(containerID, uint32(255))
504 570
 }
505 571
 
506 572
 type exitNotifier struct {
... ...
@@ -21,6 +21,7 @@ import (
21 21
 	sysinfo "github.com/docker/docker/pkg/system"
22 22
 	"github.com/docker/docker/utils"
23 23
 	"github.com/golang/protobuf/ptypes"
24
+	"github.com/golang/protobuf/ptypes/timestamp"
24 25
 	"golang.org/x/net/context"
25 26
 	"google.golang.org/grpc"
26 27
 	"google.golang.org/grpc/grpclog"
... ...
@@ -40,22 +41,22 @@ const (
40 40
 
41 41
 type remote struct {
42 42
 	sync.RWMutex
43
-	apiClient     containerd.APIClient
44
-	daemonPid     int
45
-	stateDir      string
46
-	rpcAddr       string
47
-	startDaemon   bool
48
-	closeManually bool
49
-	debugLog      bool
50
-	rpcConn       *grpc.ClientConn
51
-	clients       []*client
52
-	eventTsPath   string
53
-	pastEvents    map[string]*containerd.Event
54
-	runtime       string
55
-	runtimeArgs   []string
56
-	daemonWaitCh  chan struct{}
57
-	liveRestore   bool
58
-	oomScore      int
43
+	apiClient            containerd.APIClient
44
+	daemonPid            int
45
+	stateDir             string
46
+	rpcAddr              string
47
+	startDaemon          bool
48
+	closeManually        bool
49
+	debugLog             bool
50
+	rpcConn              *grpc.ClientConn
51
+	clients              []*client
52
+	eventTsPath          string
53
+	runtime              string
54
+	runtimeArgs          []string
55
+	daemonWaitCh         chan struct{}
56
+	liveRestore          bool
57
+	oomScore             int
58
+	restoreFromTimestamp *timestamp.Timestamp
59 59
 }
60 60
 
61 61
 // New creates a fresh instance of libcontainerd remote.
... ...
@@ -69,7 +70,6 @@ func New(stateDir string, options ...RemoteOption) (_ Remote, err error) {
69 69
 		stateDir:    stateDir,
70 70
 		daemonPid:   -1,
71 71
 		eventTsPath: filepath.Join(stateDir, eventTimestampFilename),
72
-		pastEvents:  make(map[string]*containerd.Event),
73 72
 	}
74 73
 	for _, option := range options {
75 74
 		if err := option.Apply(r); err != nil {
... ...
@@ -106,6 +106,14 @@ func New(stateDir string, options ...RemoteOption) (_ Remote, err error) {
106 106
 	r.rpcConn = conn
107 107
 	r.apiClient = containerd.NewAPIClient(conn)
108 108
 
109
+	// Get the timestamp to restore from
110
+	t := r.getLastEventTimestamp()
111
+	tsp, err := ptypes.TimestampProto(t)
112
+	if err != nil {
113
+		logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err)
114
+	}
115
+	r.restoreFromTimestamp = tsp
116
+
109 117
 	go r.handleConnectionChange()
110 118
 
111 119
 	if err := r.startEventsMonitor(); err != nil {
... ...
@@ -257,7 +265,8 @@ func (r *remote) getLastEventTimestamp() time.Time {
257 257
 
258 258
 func (r *remote) startEventsMonitor() error {
259 259
 	// First, get past events
260
-	tsp, err := ptypes.TimestampProto(r.getLastEventTimestamp())
260
+	t := r.getLastEventTimestamp()
261
+	tsp, err := ptypes.TimestampProto(t)
261 262
 	if err != nil {
262 263
 		logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err)
263 264
 	}
... ...
@@ -299,7 +308,7 @@ func (r *remote) handleEventStream(events containerd.API_EventsClient) {
299 299
 		}
300 300
 		r.RUnlock()
301 301
 		if container == nil {
302
-			logrus.Errorf("libcontainerd: %q", err)
302
+			logrus.Warnf("libcontainerd: unknown container %s", e.Id)
303 303
 			continue
304 304
 		}
305 305