Signed-off-by: Kenfe-Mickael Laventure <mickael.laventure@gmail.com>
| ... | ... |
@@ -178,7 +178,7 @@ func (daemon *Daemon) restore() error {
|
| 178 | 178 |
rm := c.RestartManager(false) |
| 179 | 179 |
if c.IsRunning() || c.IsPaused() {
|
| 180 | 180 |
if err := daemon.containerd.Restore(c.ID, libcontainerd.WithRestartManager(rm)); err != nil {
|
| 181 |
- logrus.Errorf("Failed to restore with containerd: %q", err)
|
|
| 181 |
+ logrus.Errorf("Failed to restore %s with containerd: %s", c.ID, err)
|
|
| 182 | 182 |
return |
| 183 | 183 |
} |
| 184 | 184 |
if !c.HostConfig.NetworkMode.IsContainer() && c.IsRunning() {
|
| ... | ... |
@@ -7,7 +7,7 @@ import ( |
| 7 | 7 |
"os/exec" |
| 8 | 8 |
"path/filepath" |
| 9 | 9 |
"strings" |
| 10 |
- "time" |
|
| 10 |
+ "syscall" |
|
| 11 | 11 |
|
| 12 | 12 |
"github.com/docker/docker/pkg/integration/checker" |
| 13 | 13 |
"github.com/go-check/check" |
| ... | ... |
@@ -129,7 +129,11 @@ func (s *DockerDaemonSuite) TestDaemonShutdownWithPlugins(c *check.C) {
|
| 129 | 129 |
c.Fatalf("Could not kill daemon: %v", err)
|
| 130 | 130 |
} |
| 131 | 131 |
|
| 132 |
- time.Sleep(5 * time.Second) |
|
| 132 |
+ for {
|
|
| 133 |
+ if err := syscall.Kill(s.d.cmd.Process.Pid, 0); err == syscall.ESRCH {
|
|
| 134 |
+ break |
|
| 135 |
+ } |
|
| 136 |
+ } |
|
| 133 | 137 |
|
| 134 | 138 |
cmd := exec.Command("pgrep", "-f", "plugin-no-remove")
|
| 135 | 139 |
if out, ec, err := runCommandWithOutput(cmd); ec != 1 {
|
| ... | ... |
@@ -281,16 +281,10 @@ func (clnt *client) cleanupOldRootfs(containerID string) {
|
| 281 | 281 |
} |
| 282 | 282 |
} |
| 283 | 283 |
|
| 284 |
-func (clnt *client) setExited(containerID string) error {
|
|
| 284 |
+func (clnt *client) setExited(containerID string, exitCode uint32) error {
|
|
| 285 | 285 |
clnt.lock(containerID) |
| 286 | 286 |
defer clnt.unlock(containerID) |
| 287 | 287 |
|
| 288 |
- var exitCode uint32 |
|
| 289 |
- if event, ok := clnt.remote.pastEvents[containerID]; ok {
|
|
| 290 |
- exitCode = event.Status |
|
| 291 |
- delete(clnt.remote.pastEvents, containerID) |
|
| 292 |
- } |
|
| 293 |
- |
|
| 294 | 288 |
err := clnt.backend.StateChanged(containerID, StateInfo{
|
| 295 | 289 |
CommonStateInfo: CommonStateInfo{
|
| 296 | 290 |
State: StateExit, |
| ... | ... |
@@ -393,7 +387,7 @@ func (clnt *client) getOrCreateExitNotifier(containerID string) *exitNotifier {
|
| 393 | 393 |
return w |
| 394 | 394 |
} |
| 395 | 395 |
|
| 396 |
-func (clnt *client) restore(cont *containerd.Container, options ...CreateOption) (err error) {
|
|
| 396 |
+func (clnt *client) restore(cont *containerd.Container, lastEvent *containerd.Event, options ...CreateOption) (err error) {
|
|
| 397 | 397 |
clnt.lock(cont.Id) |
| 398 | 398 |
defer clnt.unlock(cont.Id) |
| 399 | 399 |
|
| ... | ... |
@@ -441,66 +435,132 @@ func (clnt *client) restore(cont *containerd.Container, options ...CreateOption) |
| 441 | 441 |
return err |
| 442 | 442 |
} |
| 443 | 443 |
|
| 444 |
- if event, ok := clnt.remote.pastEvents[containerID]; ok {
|
|
| 444 |
+ if lastEvent != nil {
|
|
| 445 | 445 |
// This should only be a pause or resume event |
| 446 |
- if event.Type == StatePause || event.Type == StateResume {
|
|
| 446 |
+ if lastEvent.Type == StatePause || lastEvent.Type == StateResume {
|
|
| 447 | 447 |
return clnt.backend.StateChanged(containerID, StateInfo{
|
| 448 | 448 |
CommonStateInfo: CommonStateInfo{
|
| 449 |
- State: event.Type, |
|
| 449 |
+ State: lastEvent.Type, |
|
| 450 | 450 |
Pid: container.systemPid, |
| 451 | 451 |
}}) |
| 452 | 452 |
} |
| 453 | 453 |
|
| 454 |
- logrus.Warnf("unexpected backlog event: %#v", event)
|
|
| 454 |
+ logrus.Warnf("unexpected backlog event: %#v", lastEvent)
|
|
| 455 | 455 |
} |
| 456 | 456 |
|
| 457 | 457 |
return nil |
| 458 | 458 |
} |
| 459 | 459 |
|
| 460 |
-func (clnt *client) Restore(containerID string, options ...CreateOption) error {
|
|
| 461 |
- if clnt.liveRestore {
|
|
| 462 |
- cont, err := clnt.getContainerdContainer(containerID) |
|
| 463 |
- if err == nil && cont.Status != "stopped" {
|
|
| 464 |
- if err := clnt.restore(cont, options...); err != nil {
|
|
| 465 |
- logrus.Errorf("error restoring %s: %v", containerID, err)
|
|
| 460 |
+func (clnt *client) getContainerLastEvent(containerID string) (*containerd.Event, error) {
|
|
| 461 |
+ er := &containerd.EventsRequest{
|
|
| 462 |
+ Timestamp: clnt.remote.restoreFromTimestamp, |
|
| 463 |
+ StoredOnly: true, |
|
| 464 |
+ Id: containerID, |
|
| 465 |
+ } |
|
| 466 |
+ events, err := clnt.remote.apiClient.Events(context.Background(), er) |
|
| 467 |
+ if err != nil {
|
|
| 468 |
+ logrus.Errorf("libcontainerd: failed to get container events stream for %s: %q", er.Id, err)
|
|
| 469 |
+ return nil, err |
|
| 470 |
+ } |
|
| 471 |
+ |
|
| 472 |
+ var ev *containerd.Event |
|
| 473 |
+ for {
|
|
| 474 |
+ e, err := events.Recv() |
|
| 475 |
+ if err != nil {
|
|
| 476 |
+ if err.Error() == "EOF" {
|
|
| 477 |
+ break |
|
| 466 | 478 |
} |
| 467 |
- return nil |
|
| 479 |
+ logrus.Errorf("libcontainerd: failed to get container event for %s: %q", containerID, err)
|
|
| 480 |
+ return nil, err |
|
| 481 |
+ } |
|
| 482 |
+ |
|
| 483 |
+ logrus.Debugf("libcontainerd: received past event %#v", e)
|
|
| 484 |
+ |
|
| 485 |
+ switch e.Type {
|
|
| 486 |
+ case StateExit, StatePause, StateResume: |
|
| 487 |
+ ev = e |
|
| 468 | 488 |
} |
| 469 |
- return clnt.setExited(containerID) |
|
| 470 | 489 |
} |
| 471 | 490 |
|
| 491 |
+ return ev, nil |
|
| 492 |
+} |
|
| 493 |
+ |
|
| 494 |
+func (clnt *client) Restore(containerID string, options ...CreateOption) error {
|
|
| 495 |
+ // Synchronize with live events |
|
| 496 |
+ clnt.remote.Lock() |
|
| 497 |
+ defer clnt.remote.Unlock() |
|
| 498 |
+ // Check that containerd still knows this container. |
|
| 499 |
+ // |
|
| 500 |
+ // In the unlikely event that Restore for this container process |
|
| 501 |
+ // the its past event before the main loop, the event will be |
|
| 502 |
+ // processed twice. However, this is not an issue as all those |
|
| 503 |
+ // events will do is change the state of the container to be |
|
| 504 |
+ // exactly the same. |
|
| 472 | 505 |
cont, err := clnt.getContainerdContainer(containerID) |
| 473 |
- if err == nil && cont.Status != "stopped" {
|
|
| 474 |
- w := clnt.getOrCreateExitNotifier(containerID) |
|
| 475 |
- clnt.lock(cont.Id) |
|
| 476 |
- container := clnt.newContainer(cont.BundlePath) |
|
| 477 |
- container.systemPid = systemPid(cont) |
|
| 478 |
- clnt.appendContainer(container) |
|
| 479 |
- clnt.unlock(cont.Id) |
|
| 480 |
- |
|
| 481 |
- container.discardFifos() |
|
| 482 |
- |
|
| 483 |
- if err := clnt.Signal(containerID, int(syscall.SIGTERM)); err != nil {
|
|
| 484 |
- logrus.Errorf("error sending sigterm to %v: %v", containerID, err)
|
|
| 506 |
+ // Get its last event |
|
| 507 |
+ ev, eerr := clnt.getContainerLastEvent(containerID) |
|
| 508 |
+ if err != nil || cont.Status == "Stopped" {
|
|
| 509 |
+ if err != nil && !strings.Contains(err.Error(), "container not found") {
|
|
| 510 |
+ // Legitimate error |
|
| 511 |
+ return err |
|
| 512 |
+ } |
|
| 513 |
+ |
|
| 514 |
+ // If ev is nil, then we already consumed all the event of the |
|
| 515 |
+ // container, included the "exit" one. |
|
| 516 |
+ // Thus we return to avoid overriding the Exit Code. |
|
| 517 |
+ if ev == nil {
|
|
| 518 |
+ logrus.Warnf("libcontainerd: restore was called on a fully synced container (%s)", containerID)
|
|
| 519 |
+ return nil |
|
| 520 |
+ } |
|
| 521 |
+ |
|
| 522 |
+ // get the exit status for this container |
|
| 523 |
+ ec := uint32(0) |
|
| 524 |
+ if eerr == nil && ev.Type == StateExit {
|
|
| 525 |
+ ec = ev.Status |
|
| 526 |
+ } |
|
| 527 |
+ clnt.setExited(containerID, ec) |
|
| 528 |
+ |
|
| 529 |
+ return nil |
|
| 530 |
+ } |
|
| 531 |
+ |
|
| 532 |
+ // container is still alive |
|
| 533 |
+ if clnt.liveRestore {
|
|
| 534 |
+ if err := clnt.restore(cont, ev, options...); err != nil {
|
|
| 535 |
+ logrus.Errorf("error restoring %s: %v", containerID, err)
|
|
| 536 |
+ } |
|
| 537 |
+ return nil |
|
| 538 |
+ } |
|
| 539 |
+ |
|
| 540 |
+ // Kill the container if liveRestore == false |
|
| 541 |
+ w := clnt.getOrCreateExitNotifier(containerID) |
|
| 542 |
+ clnt.lock(cont.Id) |
|
| 543 |
+ container := clnt.newContainer(cont.BundlePath) |
|
| 544 |
+ container.systemPid = systemPid(cont) |
|
| 545 |
+ clnt.appendContainer(container) |
|
| 546 |
+ clnt.unlock(cont.Id) |
|
| 547 |
+ |
|
| 548 |
+ container.discardFifos() |
|
| 549 |
+ |
|
| 550 |
+ if err := clnt.Signal(containerID, int(syscall.SIGTERM)); err != nil {
|
|
| 551 |
+ logrus.Errorf("error sending sigterm to %v: %v", containerID, err)
|
|
| 552 |
+ } |
|
| 553 |
+ select {
|
|
| 554 |
+ case <-time.After(10 * time.Second): |
|
| 555 |
+ if err := clnt.Signal(containerID, int(syscall.SIGKILL)); err != nil {
|
|
| 556 |
+ logrus.Errorf("error sending sigkill to %v: %v", containerID, err)
|
|
| 485 | 557 |
} |
| 486 | 558 |
select {
|
| 487 |
- case <-time.After(10 * time.Second): |
|
| 488 |
- if err := clnt.Signal(containerID, int(syscall.SIGKILL)); err != nil {
|
|
| 489 |
- logrus.Errorf("error sending sigkill to %v: %v", containerID, err)
|
|
| 490 |
- } |
|
| 491 |
- select {
|
|
| 492 |
- case <-time.After(2 * time.Second): |
|
| 493 |
- case <-w.wait(): |
|
| 494 |
- return nil |
|
| 495 |
- } |
|
| 559 |
+ case <-time.After(2 * time.Second): |
|
| 496 | 560 |
case <-w.wait(): |
| 497 | 561 |
return nil |
| 498 | 562 |
} |
| 563 |
+ case <-w.wait(): |
|
| 564 |
+ return nil |
|
| 499 | 565 |
} |
| 500 | 566 |
|
| 501 | 567 |
clnt.deleteContainer(containerID) |
| 502 | 568 |
|
| 503 |
- return clnt.setExited(containerID) |
|
| 569 |
+ return clnt.setExited(containerID, uint32(255)) |
|
| 504 | 570 |
} |
| 505 | 571 |
|
| 506 | 572 |
type exitNotifier struct {
|
| ... | ... |
@@ -21,6 +21,7 @@ import ( |
| 21 | 21 |
sysinfo "github.com/docker/docker/pkg/system" |
| 22 | 22 |
"github.com/docker/docker/utils" |
| 23 | 23 |
"github.com/golang/protobuf/ptypes" |
| 24 |
+ "github.com/golang/protobuf/ptypes/timestamp" |
|
| 24 | 25 |
"golang.org/x/net/context" |
| 25 | 26 |
"google.golang.org/grpc" |
| 26 | 27 |
"google.golang.org/grpc/grpclog" |
| ... | ... |
@@ -40,22 +41,22 @@ const ( |
| 40 | 40 |
|
| 41 | 41 |
type remote struct {
|
| 42 | 42 |
sync.RWMutex |
| 43 |
- apiClient containerd.APIClient |
|
| 44 |
- daemonPid int |
|
| 45 |
- stateDir string |
|
| 46 |
- rpcAddr string |
|
| 47 |
- startDaemon bool |
|
| 48 |
- closeManually bool |
|
| 49 |
- debugLog bool |
|
| 50 |
- rpcConn *grpc.ClientConn |
|
| 51 |
- clients []*client |
|
| 52 |
- eventTsPath string |
|
| 53 |
- pastEvents map[string]*containerd.Event |
|
| 54 |
- runtime string |
|
| 55 |
- runtimeArgs []string |
|
| 56 |
- daemonWaitCh chan struct{}
|
|
| 57 |
- liveRestore bool |
|
| 58 |
- oomScore int |
|
| 43 |
+ apiClient containerd.APIClient |
|
| 44 |
+ daemonPid int |
|
| 45 |
+ stateDir string |
|
| 46 |
+ rpcAddr string |
|
| 47 |
+ startDaemon bool |
|
| 48 |
+ closeManually bool |
|
| 49 |
+ debugLog bool |
|
| 50 |
+ rpcConn *grpc.ClientConn |
|
| 51 |
+ clients []*client |
|
| 52 |
+ eventTsPath string |
|
| 53 |
+ runtime string |
|
| 54 |
+ runtimeArgs []string |
|
| 55 |
+ daemonWaitCh chan struct{}
|
|
| 56 |
+ liveRestore bool |
|
| 57 |
+ oomScore int |
|
| 58 |
+ restoreFromTimestamp *timestamp.Timestamp |
|
| 59 | 59 |
} |
| 60 | 60 |
|
| 61 | 61 |
// New creates a fresh instance of libcontainerd remote. |
| ... | ... |
@@ -69,7 +70,6 @@ func New(stateDir string, options ...RemoteOption) (_ Remote, err error) {
|
| 69 | 69 |
stateDir: stateDir, |
| 70 | 70 |
daemonPid: -1, |
| 71 | 71 |
eventTsPath: filepath.Join(stateDir, eventTimestampFilename), |
| 72 |
- pastEvents: make(map[string]*containerd.Event), |
|
| 73 | 72 |
} |
| 74 | 73 |
for _, option := range options {
|
| 75 | 74 |
if err := option.Apply(r); err != nil {
|
| ... | ... |
@@ -106,6 +106,14 @@ func New(stateDir string, options ...RemoteOption) (_ Remote, err error) {
|
| 106 | 106 |
r.rpcConn = conn |
| 107 | 107 |
r.apiClient = containerd.NewAPIClient(conn) |
| 108 | 108 |
|
| 109 |
+ // Get the timestamp to restore from |
|
| 110 |
+ t := r.getLastEventTimestamp() |
|
| 111 |
+ tsp, err := ptypes.TimestampProto(t) |
|
| 112 |
+ if err != nil {
|
|
| 113 |
+ logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err)
|
|
| 114 |
+ } |
|
| 115 |
+ r.restoreFromTimestamp = tsp |
|
| 116 |
+ |
|
| 109 | 117 |
go r.handleConnectionChange() |
| 110 | 118 |
|
| 111 | 119 |
if err := r.startEventsMonitor(); err != nil {
|
| ... | ... |
@@ -257,7 +265,8 @@ func (r *remote) getLastEventTimestamp() time.Time {
|
| 257 | 257 |
|
| 258 | 258 |
func (r *remote) startEventsMonitor() error {
|
| 259 | 259 |
// First, get past events |
| 260 |
- tsp, err := ptypes.TimestampProto(r.getLastEventTimestamp()) |
|
| 260 |
+ t := r.getLastEventTimestamp() |
|
| 261 |
+ tsp, err := ptypes.TimestampProto(t) |
|
| 261 | 262 |
if err != nil {
|
| 262 | 263 |
logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err)
|
| 263 | 264 |
} |
| ... | ... |
@@ -299,7 +308,7 @@ func (r *remote) handleEventStream(events containerd.API_EventsClient) {
|
| 299 | 299 |
} |
| 300 | 300 |
r.RUnlock() |
| 301 | 301 |
if container == nil {
|
| 302 |
- logrus.Errorf("libcontainerd: %q", err)
|
|
| 302 |
+ logrus.Warnf("libcontainerd: unknown container %s", e.Id)
|
|
| 303 | 303 |
continue |
| 304 | 304 |
} |
| 305 | 305 |
|