| ... | ... |
@@ -888,6 +888,24 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S |
| 888 | 888 |
registerMetricsPluginCallback(d.PluginStore, metricsSockPath) |
| 889 | 889 |
|
| 890 | 890 |
gopts := []grpc.DialOption{
|
| 891 |
+ // WithBlock makes sure that the following containerd request |
|
| 892 |
+ // is reliable. |
|
| 893 |
+ // |
|
| 894 |
+ // NOTE: In one edge case under high load pressure, the kernel kills |
|
| 895 |
+ // dockerd, containerd and containerd-shims because of OOM. |
|
| 896 |
+ // When both dockerd and containerd restart, containerd |
|
| 897 |
+ // will take time to recover all the existing containers. Before |
|
| 898 |
+ // containerd is serving, dockerd will fail with a gRPC error. |
|
| 899 |
+ // The bad thing is that the restore action will still ignore |
|
| 900 |
+ // any non-NotFound errors and return the running state for |
|
| 901 |
+ // already stopped containers. This is unexpected behavior, and |
|
| 902 |
+ // we need to restart dockerd to make sure that everything is OK. |
|
| 903 |
+ // |
|
| 904 |
+ // It is painful. Adding WithBlock can prevent the edge case. And |
|
| 905 |
+ // in the common case, containerd will be serving shortly. |
|
| 906 |
+ // It does no harm to add WithBlock for the containerd connection. |
|
| 907 |
+ grpc.WithBlock(), |
|
| 908 |
+ |
|
| 891 | 909 |
grpc.WithInsecure(), |
| 892 | 910 |
grpc.WithBackoffMaxDelay(3 * time.Second), |
| 893 | 911 |
grpc.WithContextDialer(dialer.ContextDialer), |