
Merge pull request #36173 from cpuguy83/fix_containerd_crash_spin

Refresh containerd remotes on containerd restarted

Yong Tang authored on 2018/02/08 23:19:29
Showing 2 changed files
... ...
@@ -114,8 +114,21 @@ type client struct {
 	containers map[string]*container
 }
 
+func (c *client) setRemote(remote *containerd.Client) {
+	c.Lock()
+	c.remote = remote
+	c.Unlock()
+}
+
+func (c *client) getRemote() *containerd.Client {
+	c.RLock()
+	remote := c.remote
+	c.RUnlock()
+	return remote
+}
+
 func (c *client) Version(ctx context.Context) (containerd.Version, error) {
-	return c.remote.Version(ctx)
+	return c.getRemote().Version(ctx)
 }
 
 func (c *client) Restore(ctx context.Context, id string, attachStdio StdioCallback) (alive bool, pid int, err error) {
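
This hunk makes the containerd handle swappable: callers read it through getRemote() while the supervisor (second file below) replaces it with setRemote() after a containerd restart. The following self-contained sketch reproduces just that interaction with a stand-in type instead of *containerd.Client; the type, field names, and timings are illustrative, not daemon code. It runs cleanly under go run -race, whereas writing the field directly from two goroutines would be reported as a data race.

package main

import (
	"fmt"
	"sync"
	"time"
)

// remoteConn stands in for *containerd.Client in this sketch.
type remoteConn struct{ addr string }

// client mirrors the pattern in the diff: an embedded sync.RWMutex
// guards the swappable remote handle.
type client struct {
	sync.RWMutex
	remote *remoteConn
}

func (c *client) setRemote(r *remoteConn) {
	c.Lock()
	c.remote = r
	c.Unlock()
}

func (c *client) getRemote() *remoteConn {
	c.RLock()
	r := c.remote
	c.RUnlock()
	return r
}

func main() {
	c := &client{remote: &remoteConn{addr: "initial"}}

	var wg sync.WaitGroup

	// Callers keep reading through getRemote(), as Version/Create/etc. do in the diff.
	wg.Add(1)
	go func() {
		defer wg.Done()
		for i := 0; i < 100; i++ {
			_ = c.getRemote().addr
			time.Sleep(time.Millisecond)
		}
	}()

	// The supervisor swaps in a fresh handle after restarting containerd,
	// as monitorConnection does via setRemote.
	wg.Add(1)
	go func() {
		defer wg.Done()
		for i := 0; i < 10; i++ {
			c.setRemote(&remoteConn{addr: fmt.Sprintf("restart-%d", i)})
			time.Sleep(10 * time.Millisecond)
		}
	}()

	wg.Wait()
	fmt.Println("final remote:", c.getRemote().addr)
}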
... ...
@@ -187,7 +200,7 @@ func (c *client) Create(ctx context.Context, id string, ociSpec *specs.Spec, run
 
 	c.logger.WithField("bundle", bdir).WithField("root", ociSpec.Root.Path).Debug("bundle dir created")
 
-	cdCtr, err := c.remote.NewContainer(ctx, id,
+	cdCtr, err := c.getRemote().NewContainer(ctx, id,
 		containerd.WithSpec(ociSpec),
 		// TODO(mlaventure): when containerd support lcow, revisit runtime value
 		containerd.WithRuntime(fmt.Sprintf("io.containerd.runtime.v1.%s", runtime.GOOS), runtimeOptions))
... ...
@@ -230,7 +243,7 @@ func (c *client) Start(ctx context.Context, id, checkpointDir string, withStdin
 		// remove the checkpoint when we're done
 		defer func() {
 			if cp != nil {
-				err := c.remote.ContentStore().Delete(context.Background(), cp.Digest)
+				err := c.getRemote().ContentStore().Delete(context.Background(), cp.Digest)
 				if err != nil {
 					c.logger.WithError(err).WithFields(logrus.Fields{
 						"ref":    checkpointDir,
... ...
@@ -528,14 +541,14 @@ func (c *client) CreateCheckpoint(ctx context.Context, containerID, checkpointDi
 	}
 	// Whatever happens, delete the checkpoint from containerd
 	defer func() {
-		err := c.remote.ImageService().Delete(context.Background(), img.Name())
+		err := c.getRemote().ImageService().Delete(context.Background(), img.Name())
 		if err != nil {
 			c.logger.WithError(err).WithField("digest", img.Target().Digest).
 				Warnf("failed to delete checkpoint image")
 		}
 	}()
 
-	b, err := content.ReadBlob(ctx, c.remote.ContentStore(), img.Target().Digest)
+	b, err := content.ReadBlob(ctx, c.getRemote().ContentStore(), img.Target().Digest)
 	if err != nil {
 		return errdefs.System(errors.Wrapf(err, "failed to retrieve checkpoint data"))
 	}
... ...
@@ -555,7 +568,7 @@ func (c *client) CreateCheckpoint(ctx context.Context, containerID, checkpointDi
 		return errdefs.System(errors.Wrapf(err, "invalid checkpoint"))
 	}
 
-	rat, err := c.remote.ContentStore().ReaderAt(ctx, cpDesc.Digest)
+	rat, err := c.getRemote().ContentStore().ReaderAt(ctx, cpDesc.Digest)
 	if err != nil {
 		return errdefs.System(errors.Wrapf(err, "failed to get checkpoint reader"))
 	}
... ...
@@ -708,7 +721,7 @@ func (c *client) processEventStream(ctx context.Context) {
 		}
 	}()
 
-	eventStream, err = c.remote.EventService().Subscribe(ctx, &eventsapi.SubscribeRequest{
+	eventStream, err = c.getRemote().EventService().Subscribe(ctx, &eventsapi.SubscribeRequest{
 		Filters: []string{
 			// Filter on both namespace *and* topic. To create an "and" filter,
 			// this must be a single, comma-separated string
... ...
@@ -719,6 +732,8 @@ func (c *client) processEventStream(ctx context.Context) {
 		return
 	}
 
+	c.logger.WithField("namespace", c.namespace).Debug("processing event stream")
+
 	var oomKilled bool
 	for {
 		ev, err = eventStream.Recv()
... ...
@@ -822,7 +837,7 @@ func (c *client) processEventStream(ctx context.Context) {
 }
 
 func (c *client) writeContent(ctx context.Context, mediaType, ref string, r io.Reader) (*types.Descriptor, error) {
-	writer, err := c.remote.ContentStore().Writer(ctx, ref, 0, "")
+	writer, err := c.getRemote().ContentStore().Writer(ctx, ref, 0, "")
 	if err != nil {
 		return nil, err
 	}
... ...
@@ -260,7 +260,7 @@ func (r *remote) startContainerd() error {
 	return nil
 }
 
-func (r *remote) monitorConnection(client *containerd.Client) {
+func (r *remote) monitorConnection(monitor *containerd.Client) {
 	var transientFailureCount = 0
 
 	ticker := time.NewTicker(500 * time.Millisecond)
... ...
@@ -269,7 +269,7 @@ func (r *remote) monitorConnection(client *containerd.Client) {
 	for {
 		<-ticker.C
 		ctx, cancel := context.WithTimeout(r.shutdownContext, healthCheckTimeout)
-		_, err := client.IsServing(ctx)
+		_, err := monitor.IsServing(ctx)
 		cancel()
 		if err == nil {
 			transientFailureCount = 0
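
The two hunks above are the health-check half of the monitor: probe IsServing on a 500ms ticker, bounding each probe with healthCheckTimeout. Below is a hedged standalone sketch of that pattern against a local containerd socket; the socket path, the timeout value, and the shutdown wiring are assumptions for the example, not taken from the daemon.

package main

import (
	"context"
	"log"
	"time"

	"github.com/containerd/containerd"
)

func main() {
	const (
		address            = "/run/containerd/containerd.sock" // assumed socket path
		healthCheckTimeout = 3 * time.Second                   // value assumed; the diff only names the constant
	)

	// Dedicated connection used only for health checks, like the monitor argument above.
	monitor, err := containerd.New(address)
	if err != nil {
		log.Fatal(err)
	}
	defer monitor.Close()

	shutdownCtx, cancelAll := context.WithCancel(context.Background())
	defer cancelAll()

	ticker := time.NewTicker(500 * time.Millisecond)
	defer ticker.Stop()

	for {
		<-ticker.C
		ctx, cancel := context.WithTimeout(shutdownCtx, healthCheckTimeout)
		_, err := monitor.IsServing(ctx)
		cancel()
		if err == nil {
			continue // healthy; the real loop resets transientFailureCount here
		}
		log.Printf("containerd is not responding: %v", err)
		// The daemon counts transient failures and eventually kills and
		// restarts containerd, then refreshes every namespaced remote
		// (see the final hunk below).
	}
}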
... ...
@@ -279,39 +279,69 @@ func (r *remote) monitorConnection(client *containerd.Client) {
 		select {
 		case <-r.shutdownContext.Done():
 			r.logger.Info("stopping healthcheck following graceful shutdown")
-			client.Close()
+			monitor.Close()
 			return
 		default:
 		}
 
 		r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")
 
-		if r.daemonPid != -1 {
-			transientFailureCount++
-			if transientFailureCount >= maxConnectionRetryCount || !system.IsProcessAlive(r.daemonPid) {
-				transientFailureCount = 0
-				if system.IsProcessAlive(r.daemonPid) {
-					r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
-					// Try to get a stack trace
-					syscall.Kill(r.daemonPid, syscall.SIGUSR1)
-					<-time.After(100 * time.Millisecond)
-					system.KillProcess(r.daemonPid)
-				}
-				<-r.daemonWaitCh
-				var err error
-				client.Close()
-				os.Remove(r.GRPC.Address)
-				if err = r.startContainerd(); err != nil {
-					r.logger.WithError(err).Error("failed restarting containerd")
-				} else {
-					newClient, err := containerd.New(r.GRPC.Address)
-					if err != nil {
-						r.logger.WithError(err).Error("failed connect to containerd")
-					} else {
-						client = newClient
-					}
+		if r.daemonPid == -1 {
+			continue
+		}
+
+		transientFailureCount++
+		if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
+			continue
+		}
+
+		transientFailureCount = 0
+		if system.IsProcessAlive(r.daemonPid) {
+			r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
+			// Try to get a stack trace
+			syscall.Kill(r.daemonPid, syscall.SIGUSR1)
+			<-time.After(100 * time.Millisecond)
+			system.KillProcess(r.daemonPid)
+		}
+		<-r.daemonWaitCh
+
+		monitor.Close()
+		os.Remove(r.GRPC.Address)
+		if err := r.startContainerd(); err != nil {
+			r.logger.WithError(err).Error("failed restarting containerd")
+			continue
+		}
+
+		newMonitor, err := containerd.New(r.GRPC.Address)
+		if err != nil {
+			r.logger.WithError(err).Error("failed connect to containerd")
+			continue
+		}
+
+		monitor = newMonitor
+		var wg sync.WaitGroup
+
+		for _, c := range r.clients {
+			wg.Add(1)
+
+			go func(c *client) {
+				defer wg.Done()
+				c.logger.WithField("namespace", c.namespace).Debug("creating new containerd remote client")
+				c.remote.Close()
+
+				remote, err := containerd.New(r.GRPC.Address, containerd.WithDefaultNamespace(c.namespace))
+				if err != nil {
+					r.logger.WithError(err).Error("failed to connect to containerd")
+					// TODO: Better way to handle this?
+					// This *shouldn't* happen, but this could wind up where the daemon
+					// is not able to communicate with an eventually up containerd
+					return
 				}
-			}
+
+				c.setRemote(remote)
+			}(c)
+
+			wg.Wait()
 		}
 	}
 }
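
After restarting containerd, this hunk fans out one reconnect per namespaced client and hands the fresh connection to setRemote. The self-contained sketch below shows that reconnect fan-out pattern in isolation, using containerd.WithDefaultNamespace to scope each connection; the socket path and namespace list are assumptions for the example (here the example waits once after launching all reconnects, and it stores the results in a plain map rather than the daemon's client objects).

package main

import (
	"log"
	"sync"

	"github.com/containerd/containerd"
)

func main() {
	const address = "/run/containerd/containerd.sock" // assumed socket path
	namespaces := []string{"moby", "plugins.moby"}     // assumed namespace list

	remotes := make(map[string]*containerd.Client, len(namespaces))
	var (
		mu sync.Mutex
		wg sync.WaitGroup
	)

	for _, ns := range namespaces {
		wg.Add(1)
		go func(ns string) {
			defer wg.Done()
			// WithDefaultNamespace scopes all calls on this connection to the
			// given namespace, as in the diff above.
			remote, err := containerd.New(address, containerd.WithDefaultNamespace(ns))
			if err != nil {
				log.Printf("failed to connect to containerd for %q: %v", ns, err)
				return
			}
			mu.Lock()
			remotes[ns] = remote
			mu.Unlock()
		}(ns)
	}
	wg.Wait()

	for ns, remote := range remotes {
		log.Printf("namespace %q reconnected", ns)
		remote.Close()
	}
}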