Refresh containerd remotes when containerd is restarted
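
The diff below applies one simple pattern: the per-namespace containerd client handle is only ever read through an RWMutex-guarded accessor, so the connection monitor can swap in a fresh client after containerd is restarted without racing in-flight callers. For context, here is a minimal, self-contained sketch of that pattern; the names (`remoteHolder`, `conn`) are illustrative only and are not the moby types.

```go
package main

import (
	"fmt"
	"sync"
)

// conn stands in for *containerd.Client in this sketch.
type conn struct{ addr string }

// remoteHolder mirrors the role of the libcontainerd client struct:
// it embeds an RWMutex and owns the shared remote handle.
type remoteHolder struct {
	sync.RWMutex
	remote *conn
}

// setRemote replaces the shared handle under the write lock.
func (h *remoteHolder) setRemote(c *conn) {
	h.Lock()
	h.remote = c
	h.Unlock()
}

// getRemote returns the current handle under the read lock. Callers should
// not cache the result across calls, or they may keep using a connection
// that the monitor has already replaced.
func (h *remoteHolder) getRemote() *conn {
	h.RLock()
	c := h.remote
	h.RUnlock()
	return c
}

func main() {
	h := &remoteHolder{remote: &conn{addr: "containerd.sock (old)"}}

	// Stand-in for the connection monitor: it notices the daemon was
	// restarted and swaps in a fresh connection.
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		h.setRemote(&conn{addr: "containerd.sock (new)"})
	}()
	wg.Wait()

	// Every caller goes through getRemote and sees the refreshed handle.
	fmt.Println(h.getRemote().addr)
}
```

The key discipline is that nothing outside the accessors touches the `remote` field directly, which is exactly what the hunks below enforce by replacing every `c.remote.X()` call with `c.getRemote().X()`.
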
@@ -114,8 +114,21 @@ type client struct {
 	containers map[string]*container
 }
 
+func (c *client) setRemote(remote *containerd.Client) {
+	c.Lock()
+	c.remote = remote
+	c.Unlock()
+}
+
+func (c *client) getRemote() *containerd.Client {
+	c.RLock()
+	remote := c.remote
+	c.RUnlock()
+	return remote
+}
+
 func (c *client) Version(ctx context.Context) (containerd.Version, error) {
-	return c.remote.Version(ctx)
+	return c.getRemote().Version(ctx)
 }
 
 func (c *client) Restore(ctx context.Context, id string, attachStdio StdioCallback) (alive bool, pid int, err error) {
@@ -187,7 +200,7 @@ func (c *client) Create(ctx context.Context, id string, ociSpec *specs.Spec, run
 
 	c.logger.WithField("bundle", bdir).WithField("root", ociSpec.Root.Path).Debug("bundle dir created")
 
-	cdCtr, err := c.remote.NewContainer(ctx, id,
+	cdCtr, err := c.getRemote().NewContainer(ctx, id,
 		containerd.WithSpec(ociSpec),
 		// TODO(mlaventure): when containerd support lcow, revisit runtime value
 		containerd.WithRuntime(fmt.Sprintf("io.containerd.runtime.v1.%s", runtime.GOOS), runtimeOptions))
@@ -230,7 +243,7 @@ func (c *client) Start(ctx context.Context, id, checkpointDir string, withStdin
 	// remove the checkpoint when we're done
 	defer func() {
 		if cp != nil {
-			err := c.remote.ContentStore().Delete(context.Background(), cp.Digest)
+			err := c.getRemote().ContentStore().Delete(context.Background(), cp.Digest)
 			if err != nil {
 				c.logger.WithError(err).WithFields(logrus.Fields{
 					"ref": checkpointDir,
@@ -528,14 +541,14 @@ func (c *client) CreateCheckpoint(ctx context.Context, containerID, checkpointDi
 	}
 	// Whatever happens, delete the checkpoint from containerd
 	defer func() {
-		err := c.remote.ImageService().Delete(context.Background(), img.Name())
+		err := c.getRemote().ImageService().Delete(context.Background(), img.Name())
 		if err != nil {
 			c.logger.WithError(err).WithField("digest", img.Target().Digest).
 				Warnf("failed to delete checkpoint image")
 		}
 	}()
 
-	b, err := content.ReadBlob(ctx, c.remote.ContentStore(), img.Target().Digest)
+	b, err := content.ReadBlob(ctx, c.getRemote().ContentStore(), img.Target().Digest)
 	if err != nil {
 		return errdefs.System(errors.Wrapf(err, "failed to retrieve checkpoint data"))
 	}
@@ -555,7 +568,7 @@ func (c *client) CreateCheckpoint(ctx context.Context, containerID, checkpointDi
 		return errdefs.System(errors.Wrapf(err, "invalid checkpoint"))
 	}
 
-	rat, err := c.remote.ContentStore().ReaderAt(ctx, cpDesc.Digest)
+	rat, err := c.getRemote().ContentStore().ReaderAt(ctx, cpDesc.Digest)
 	if err != nil {
 		return errdefs.System(errors.Wrapf(err, "failed to get checkpoint reader"))
 	}
@@ -708,7 +721,7 @@ func (c *client) processEventStream(ctx context.Context) {
 		}
 	}()
 
-	eventStream, err = c.remote.EventService().Subscribe(ctx, &eventsapi.SubscribeRequest{
+	eventStream, err = c.getRemote().EventService().Subscribe(ctx, &eventsapi.SubscribeRequest{
 		Filters: []string{
 			// Filter on both namespace *and* topic. To create an "and" filter,
 			// this must be a single, comma-separated string
@@ -719,6 +732,8 @@ func (c *client) processEventStream(ctx context.Context) {
 		return
 	}
 
+	c.logger.WithField("namespace", c.namespace).Debug("processing event stream")
+
 	var oomKilled bool
 	for {
 		ev, err = eventStream.Recv()
@@ -822,7 +837,7 @@ func (c *client) processEventStream(ctx context.Context) {
 }
 
 func (c *client) writeContent(ctx context.Context, mediaType, ref string, r io.Reader) (*types.Descriptor, error) {
-	writer, err := c.remote.ContentStore().Writer(ctx, ref, 0, "")
+	writer, err := c.getRemote().ContentStore().Writer(ctx, ref, 0, "")
 	if err != nil {
 		return nil, err
 	}
@@ -260,7 +260,7 @@ func (r *remote) startContainerd() error {
 	return nil
 }
 
-func (r *remote) monitorConnection(client *containerd.Client) {
+func (r *remote) monitorConnection(monitor *containerd.Client) {
 	var transientFailureCount = 0
 
 	ticker := time.NewTicker(500 * time.Millisecond)
@@ -269,7 +269,7 @@ func (r *remote) monitorConnection(client *containerd.Client) {
 	for {
 		<-ticker.C
 		ctx, cancel := context.WithTimeout(r.shutdownContext, healthCheckTimeout)
-		_, err := client.IsServing(ctx)
+		_, err := monitor.IsServing(ctx)
 		cancel()
 		if err == nil {
 			transientFailureCount = 0
@@ -279,39 +279,69 @@ func (r *remote) monitorConnection(client *containerd.Client) {
 		select {
 		case <-r.shutdownContext.Done():
 			r.logger.Info("stopping healthcheck following graceful shutdown")
-			client.Close()
+			monitor.Close()
 			return
 		default:
 		}
 
 		r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")
 
-		if r.daemonPid != -1 {
-			transientFailureCount++
-			if transientFailureCount >= maxConnectionRetryCount || !system.IsProcessAlive(r.daemonPid) {
-				transientFailureCount = 0
-				if system.IsProcessAlive(r.daemonPid) {
-					r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
-					// Try to get a stack trace
-					syscall.Kill(r.daemonPid, syscall.SIGUSR1)
-					<-time.After(100 * time.Millisecond)
-					system.KillProcess(r.daemonPid)
-				}
-				<-r.daemonWaitCh
-				var err error
-				client.Close()
-				os.Remove(r.GRPC.Address)
-				if err = r.startContainerd(); err != nil {
-					r.logger.WithError(err).Error("failed restarting containerd")
-				} else {
-					newClient, err := containerd.New(r.GRPC.Address)
-					if err != nil {
-						r.logger.WithError(err).Error("failed connect to containerd")
-					} else {
-						client = newClient
-					}
+		if r.daemonPid == -1 {
+			continue
+		}
+
+		transientFailureCount++
+		if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
+			continue
+		}
+
+		transientFailureCount = 0
+		if system.IsProcessAlive(r.daemonPid) {
+			r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
+			// Try to get a stack trace
+			syscall.Kill(r.daemonPid, syscall.SIGUSR1)
+			<-time.After(100 * time.Millisecond)
+			system.KillProcess(r.daemonPid)
+		}
+		<-r.daemonWaitCh
+
+		monitor.Close()
+		os.Remove(r.GRPC.Address)
+		if err := r.startContainerd(); err != nil {
+			r.logger.WithError(err).Error("failed restarting containerd")
+			continue
+		}
+
+		newMonitor, err := containerd.New(r.GRPC.Address)
+		if err != nil {
+			r.logger.WithError(err).Error("failed connect to containerd")
+			continue
+		}
+
+		monitor = newMonitor
+		var wg sync.WaitGroup
+
+		for _, c := range r.clients {
+			wg.Add(1)
+
+			go func(c *client) {
+				defer wg.Done()
+				c.logger.WithField("namespace", c.namespace).Debug("creating new containerd remote client")
+				c.remote.Close()
+
+				remote, err := containerd.New(r.GRPC.Address, containerd.WithDefaultNamespace(c.namespace))
+				if err != nil {
+					r.logger.WithError(err).Error("failed to connect to containerd")
+					// TODO: Better way to handle this?
+					// This *shouldn't* happen, but this could wind up where the daemon
+					// is not able to communicate with an eventually up containerd
+					return
 				}
-			}
+
+				c.setRemote(remote)
+			}(c)
+
+			wg.Wait()
 		}
 	}
 }