Browse code

Fix panic on daemon restart with running plugin

Scenario:

Daemon is ungracefully shutdown and leaves plugins running (no
live-restore).
Daemon comes back up.
The next time a container tries to use that plugin it will cause a
daemon panic because the plugin client is not set.

This fixes that by ensuring that the plugin does get shutdown.
Note, I do not think there would be any harm in just re-attaching to the
running plugin instead of shutting it down, however historically we shut
down plugins and containers when live-restore is not enabled.

[kir@: consolidate code to deleteTaskAndContainer, a few minor nits]

Signed-off-by: Brian Goff <cpuguy83@gmail.com>
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>

Brian Goff authored on 2018/04/20 23:48:54
Showing 5 changed files
... ...
@@ -58,6 +58,19 @@ type Executor struct {
58 58
 	exitHandler ExitHandler
59 59
 }
60 60
 
61
+// deleteTaskAndContainer deletes plugin task and then plugin container from containerd
62
+func deleteTaskAndContainer(ctx context.Context, cli Client, id string) {
63
+	_, _, err := cli.DeleteTask(ctx, id)
64
+	if err != nil && !errdefs.IsNotFound(err) {
65
+		logrus.WithError(err).WithField("id", id).Error("failed to delete plugin task from containerd")
66
+	}
67
+
68
+	err = cli.Delete(ctx, id)
69
+	if err != nil && !errdefs.IsNotFound(err) {
70
+		logrus.WithError(err).WithField("id", id).Error("failed to delete plugin container from containerd")
71
+	}
72
+}
73
+
61 74
 // Create creates a new container
62 75
 func (e *Executor) Create(id string, spec specs.Spec, stdout, stderr io.WriteCloser) error {
63 76
 	opts := runctypes.RuncOptions{
... ...
@@ -87,34 +100,21 @@ func (e *Executor) Create(id string, spec specs.Spec, stdout, stderr io.WriteClo
87 87
 
88 88
 	_, err = e.client.Start(ctx, id, "", false, attachStreamsFunc(stdout, stderr))
89 89
 	if err != nil {
90
-		if _, _, err2 := e.client.DeleteTask(ctx, id); err2 != nil && !errdefs.IsNotFound(err2) {
91
-			logrus.WithError(err2).WithField("id", id).Warn("Received an error while attempting to clean up containerd plugin task after failed start")
92
-		}
93
-		if err2 := e.client.Delete(ctx, id); err2 != nil && !errdefs.IsNotFound(err2) {
94
-			logrus.WithError(err2).WithField("id", id).Warn("Received an error while attempting to clean up containerd plugin container after failed start")
95
-		}
90
+		deleteTaskAndContainer(ctx, e.client, id)
96 91
 	}
97 92
 	return err
98 93
 }
99 94
 
100 95
 // Restore restores a container
101
-func (e *Executor) Restore(id string, stdout, stderr io.WriteCloser) error {
96
+func (e *Executor) Restore(id string, stdout, stderr io.WriteCloser) (bool, error) {
102 97
 	alive, _, err := e.client.Restore(context.Background(), id, attachStreamsFunc(stdout, stderr))
103 98
 	if err != nil && !errdefs.IsNotFound(err) {
104
-		return err
99
+		return false, err
105 100
 	}
106 101
 	if !alive {
107
-		_, _, err = e.client.DeleteTask(context.Background(), id)
108
-		if err != nil && !errdefs.IsNotFound(err) {
109
-			logrus.WithError(err).Errorf("failed to delete container plugin %s task from containerd", id)
110
-		}
111
-
112
-		err = e.client.Delete(context.Background(), id)
113
-		if err != nil && !errdefs.IsNotFound(err) {
114
-			logrus.WithError(err).Errorf("failed to delete container plugin %s from containerd", id)
115
-		}
102
+		deleteTaskAndContainer(context.Background(), e.client, id)
116 103
 	}
117
-	return nil
104
+	return alive, nil
118 105
 }
119 106
 
120 107
 // IsRunning returns if the container with the given id is running
... ...
@@ -133,14 +133,7 @@ func (e *Executor) Signal(id string, signal int) error {
133 133
 func (e *Executor) ProcessEvent(id string, et libcontainerd.EventType, ei libcontainerd.EventInfo) error {
134 134
 	switch et {
135 135
 	case libcontainerd.EventExit:
136
-		// delete task and container
137
-		if _, _, err := e.client.DeleteTask(context.Background(), id); err != nil {
138
-			logrus.WithError(err).Errorf("failed to delete container plugin %s task from containerd", id)
139
-		}
140
-
141
-		if err := e.client.Delete(context.Background(), id); err != nil {
142
-			logrus.WithError(err).Errorf("failed to delete container plugin %s from containerd", id)
143
-		}
136
+		deleteTaskAndContainer(context.Background(), e.client, id)
144 137
 		return e.exitHandler.HandleExitEvent(ei.ContainerID)
145 138
 	}
146 139
 	return nil
... ...
@@ -37,14 +37,14 @@ var validFullID = regexp.MustCompile(`^([a-f0-9]{64})$`)
37 37
 // Executor is the interface that the plugin manager uses to interact with for starting/stopping plugins
38 38
 type Executor interface {
39 39
 	Create(id string, spec specs.Spec, stdout, stderr io.WriteCloser) error
40
-	Restore(id string, stdout, stderr io.WriteCloser) error
41 40
 	IsRunning(id string) (bool, error)
41
+	Restore(id string, stdout, stderr io.WriteCloser) (alive bool, err error)
42 42
 	Signal(id string, signal int) error
43 43
 }
44 44
 
45
-func (pm *Manager) restorePlugin(p *v2.Plugin) error {
45
+func (pm *Manager) restorePlugin(p *v2.Plugin, c *controller) error {
46 46
 	if p.IsEnabled() {
47
-		return pm.restore(p)
47
+		return pm.restore(p, c)
48 48
 	}
49 49
 	return nil
50 50
 }
... ...
@@ -143,12 +143,15 @@ func (pm *Manager) HandleExitEvent(id string) error {
143 143
 		return err
144 144
 	}
145 145
 
146
-	os.RemoveAll(filepath.Join(pm.config.ExecRoot, id))
146
+	if err := os.RemoveAll(filepath.Join(pm.config.ExecRoot, id)); err != nil && !os.IsNotExist(err) {
147
+		logrus.WithError(err).WithField("id", id).Error("Could not remove plugin bundle dir")
148
+	}
147 149
 
148 150
 	pm.mu.RLock()
149 151
 	c := pm.cMap[p]
150 152
 	if c.exitChan != nil {
151 153
 		close(c.exitChan)
154
+		c.exitChan = nil // ignore duplicate events (containerd issue #2299)
152 155
 	}
153 156
 	restart := c.restart
154 157
 	pm.mu.RUnlock()
... ...
@@ -205,12 +208,15 @@ func (pm *Manager) reload() error { // todo: restore
205 205
 	var wg sync.WaitGroup
206 206
 	wg.Add(len(plugins))
207 207
 	for _, p := range plugins {
208
-		c := &controller{} // todo: remove this
208
+		c := &controller{exitChan: make(chan bool)}
209
+		pm.mu.Lock()
209 210
 		pm.cMap[p] = c
211
+		pm.mu.Unlock()
212
+
210 213
 		go func(p *v2.Plugin) {
211 214
 			defer wg.Done()
212
-			if err := pm.restorePlugin(p); err != nil {
213
-				logrus.Errorf("failed to restore plugin '%s': %s", p.Name(), err)
215
+			if err := pm.restorePlugin(p, c); err != nil {
216
+				logrus.WithError(err).WithField("id", p.GetID()).Error("Failed to restore plugin")
214 217
 				return
215 218
 			}
216 219
 
... ...
@@ -248,7 +254,7 @@ func (pm *Manager) reload() error { // todo: restore
248 248
 			if requiresManualRestore {
249 249
 				// if liveRestore is not enabled, the plugin will be stopped now so we should enable it
250 250
 				if err := pm.enable(p, c, true); err != nil {
251
-					logrus.Errorf("failed to enable plugin '%s': %s", p.Name(), err)
251
+					logrus.WithError(err).WithField("id", p.GetID()).Error("failed to enable plugin")
252 252
 				}
253 253
 			}
254 254
 		}(p)
... ...
@@ -79,7 +79,7 @@ func (pm *Manager) pluginPostStart(p *v2.Plugin, c *controller) error {
79 79
 		client, err := plugins.NewClientWithTimeout(addr.Network()+"://"+addr.String(), nil, p.Timeout())
80 80
 		if err != nil {
81 81
 			c.restart = false
82
-			shutdownPlugin(p, c, pm.executor)
82
+			shutdownPlugin(p, c.exitChan, pm.executor)
83 83
 			return errors.WithStack(err)
84 84
 		}
85 85
 
... ...
@@ -106,7 +106,7 @@ func (pm *Manager) pluginPostStart(p *v2.Plugin, c *controller) error {
106 106
 			c.restart = false
107 107
 			// While restoring plugins, we need to explicitly set the state to disabled
108 108
 			pm.config.Store.SetState(p, false)
109
-			shutdownPlugin(p, c, pm.executor)
109
+			shutdownPlugin(p, c.exitChan, pm.executor)
110 110
 			return err
111 111
 		}
112 112
 
... ...
@@ -117,16 +117,15 @@ func (pm *Manager) pluginPostStart(p *v2.Plugin, c *controller) error {
117 117
 	return pm.save(p)
118 118
 }
119 119
 
120
-func (pm *Manager) restore(p *v2.Plugin) error {
120
+func (pm *Manager) restore(p *v2.Plugin, c *controller) error {
121 121
 	stdout, stderr := makeLoggerStreams(p.GetID())
122
-	if err := pm.executor.Restore(p.GetID(), stdout, stderr); err != nil {
122
+	alive, err := pm.executor.Restore(p.GetID(), stdout, stderr)
123
+	if err != nil {
123 124
 		return err
124 125
 	}
125 126
 
126 127
 	if pm.config.LiveRestoreEnabled {
127
-		c := &controller{}
128
-		if isRunning, _ := pm.executor.IsRunning(p.GetID()); !isRunning {
129
-			// plugin is not running, so follow normal startup procedure
128
+		if !alive {
130 129
 			return pm.enable(p, c, true)
131 130
 		}
132 131
 
... ...
@@ -138,10 +137,16 @@ func (pm *Manager) restore(p *v2.Plugin) error {
138 138
 		return pm.pluginPostStart(p, c)
139 139
 	}
140 140
 
141
+	if alive {
142
+		// TODO(@cpuguy83): Should we always just re-attach to the running plugin instead of doing this?
143
+		c.restart = false
144
+		shutdownPlugin(p, c.exitChan, pm.executor)
145
+	}
146
+
141 147
 	return nil
142 148
 }
143 149
 
144
-func shutdownPlugin(p *v2.Plugin, c *controller, executor Executor) {
150
+func shutdownPlugin(p *v2.Plugin, ec chan bool, executor Executor) {
145 151
 	pluginID := p.GetID()
146 152
 
147 153
 	err := executor.Signal(pluginID, int(unix.SIGTERM))
... ...
@@ -149,7 +154,7 @@ func shutdownPlugin(p *v2.Plugin, c *controller, executor Executor) {
149 149
 		logrus.Errorf("Sending SIGTERM to plugin failed with error: %v", err)
150 150
 	} else {
151 151
 		select {
152
-		case <-c.exitChan:
152
+		case <-ec:
153 153
 			logrus.Debug("Clean shutdown of plugin")
154 154
 		case <-time.After(time.Second * 10):
155 155
 			logrus.Debug("Force shutdown plugin")
... ...
@@ -157,7 +162,7 @@ func shutdownPlugin(p *v2.Plugin, c *controller, executor Executor) {
157 157
 				logrus.Errorf("Sending SIGKILL to plugin failed with error: %v", err)
158 158
 			}
159 159
 			select {
160
-			case <-c.exitChan:
160
+			case <-ec:
161 161
 				logrus.Debug("SIGKILL plugin shutdown")
162 162
 			case <-time.After(time.Second * 10):
163 163
 				logrus.Debug("Force shutdown plugin FAILED")
... ...
@@ -172,7 +177,7 @@ func (pm *Manager) disable(p *v2.Plugin, c *controller) error {
172 172
 	}
173 173
 
174 174
 	c.restart = false
175
-	shutdownPlugin(p, c, pm.executor)
175
+	shutdownPlugin(p, c.exitChan, pm.executor)
176 176
 	pm.config.Store.SetState(p, false)
177 177
 	return pm.save(p)
178 178
 }
... ...
@@ -191,7 +196,7 @@ func (pm *Manager) Shutdown() {
191 191
 		}
192 192
 		if pm.executor != nil && p.IsEnabled() {
193 193
 			c.restart = false
194
-			shutdownPlugin(p, c, pm.executor)
194
+			shutdownPlugin(p, c.exitChan, pm.executor)
195 195
 		}
196 196
 	}
197 197
 	if err := mount.RecursiveUnmount(pm.config.Root); err != nil {
... ...
@@ -3,12 +3,14 @@ package plugin // import "github.com/docker/docker/plugin"
3 3
 import (
4 4
 	"io"
5 5
 	"io/ioutil"
6
+	"net"
6 7
 	"os"
7 8
 	"path/filepath"
8 9
 	"testing"
9 10
 
10 11
 	"github.com/docker/docker/api/types"
11 12
 	"github.com/docker/docker/pkg/mount"
13
+	"github.com/docker/docker/pkg/stringid"
12 14
 	"github.com/docker/docker/pkg/system"
13 15
 	"github.com/docker/docker/plugin/v2"
14 16
 	"github.com/gotestyourself/gotestyourself/skip"
... ...
@@ -59,7 +61,7 @@ func TestManagerWithPluginMounts(t *testing.T) {
59 59
 		t.Fatal(err)
60 60
 	}
61 61
 
62
-	if err := m.Remove(p1.Name(), &types.PluginRmConfig{ForceRemove: true}); err != nil {
62
+	if err := m.Remove(p1.GetID(), &types.PluginRmConfig{ForceRemove: true}); err != nil {
63 63
 		t.Fatal(err)
64 64
 	}
65 65
 	if mounted, err := mount.Mounted(p2Mount); !mounted || err != nil {
... ...
@@ -68,17 +70,18 @@ func TestManagerWithPluginMounts(t *testing.T) {
68 68
 }
69 69
 
70 70
 func newTestPlugin(t *testing.T, name, cap, root string) *v2.Plugin {
71
-	rootfs := filepath.Join(root, name)
71
+	id := stringid.GenerateNonCryptoID()
72
+	rootfs := filepath.Join(root, id)
72 73
 	if err := os.MkdirAll(rootfs, 0755); err != nil {
73 74
 		t.Fatal(err)
74 75
 	}
75 76
 
76
-	p := v2.Plugin{PluginObj: types.Plugin{Name: name}}
77
+	p := v2.Plugin{PluginObj: types.Plugin{ID: id, Name: name}}
77 78
 	p.Rootfs = rootfs
78 79
 	iType := types.PluginInterfaceType{Capability: cap, Prefix: "docker", Version: "1.0"}
79
-	i := types.PluginConfigInterface{Socket: "plugins.sock", Types: []types.PluginInterfaceType{iType}}
80
+	i := types.PluginConfigInterface{Socket: "plugin.sock", Types: []types.PluginInterfaceType{iType}}
80 81
 	p.PluginObj.Config.Interface = i
81
-	p.PluginObj.ID = name
82
+	p.PluginObj.ID = id
82 83
 
83 84
 	return &p
84 85
 }
... ...
@@ -90,8 +93,8 @@ func (e *simpleExecutor) Create(id string, spec specs.Spec, stdout, stderr io.Wr
90 90
 	return errors.New("Create failed")
91 91
 }
92 92
 
93
-func (e *simpleExecutor) Restore(id string, stdout, stderr io.WriteCloser) error {
94
-	return nil
93
+func (e *simpleExecutor) Restore(id string, stdout, stderr io.WriteCloser) (bool, error) {
94
+	return false, nil
95 95
 }
96 96
 
97 97
 func (e *simpleExecutor) IsRunning(id string) (bool, error) {
... ...
@@ -133,7 +136,144 @@ func TestCreateFailed(t *testing.T) {
133 133
 		t.Fatalf("expected Create failed error, got %v", err)
134 134
 	}
135 135
 
136
-	if err := m.Remove(p.Name(), &types.PluginRmConfig{ForceRemove: true}); err != nil {
136
+	if err := m.Remove(p.GetID(), &types.PluginRmConfig{ForceRemove: true}); err != nil {
137
+		t.Fatal(err)
138
+	}
139
+}
140
+
141
+type executorWithRunning struct {
142
+	m         *Manager
143
+	root      string
144
+	exitChans map[string]chan struct{}
145
+}
146
+
147
+func (e *executorWithRunning) Create(id string, spec specs.Spec, stdout, stderr io.WriteCloser) error {
148
+	sockAddr := filepath.Join(e.root, id, "plugin.sock")
149
+	ch := make(chan struct{})
150
+	if e.exitChans == nil {
151
+		e.exitChans = make(map[string]chan struct{})
152
+	}
153
+	e.exitChans[id] = ch
154
+	listenTestPlugin(sockAddr, ch)
155
+	return nil
156
+}
157
+
158
+func (e *executorWithRunning) IsRunning(id string) (bool, error) {
159
+	return true, nil
160
+}
161
+func (e *executorWithRunning) Restore(id string, stdout, stderr io.WriteCloser) (bool, error) {
162
+	return true, nil
163
+}
164
+
165
+func (e *executorWithRunning) Signal(id string, signal int) error {
166
+	ch := e.exitChans[id]
167
+	ch <- struct{}{}
168
+	<-ch
169
+	e.m.HandleExitEvent(id)
170
+	return nil
171
+}
172
+
173
+func TestPluginAlreadyRunningOnStartup(t *testing.T) {
174
+	t.Parallel()
175
+
176
+	root, err := ioutil.TempDir("", t.Name())
177
+	if err != nil {
137 178
 		t.Fatal(err)
138 179
 	}
180
+	defer system.EnsureRemoveAll(root)
181
+
182
+	for _, test := range []struct {
183
+		desc   string
184
+		config ManagerConfig
185
+	}{
186
+		{
187
+			desc: "live-restore-disabled",
188
+			config: ManagerConfig{
189
+				LogPluginEvent: func(_, _, _ string) {},
190
+			},
191
+		},
192
+		{
193
+			desc: "live-restore-enabled",
194
+			config: ManagerConfig{
195
+				LogPluginEvent:     func(_, _, _ string) {},
196
+				LiveRestoreEnabled: true,
197
+			},
198
+		},
199
+	} {
200
+		t.Run(test.desc, func(t *testing.T) {
201
+			config := test.config
202
+			desc := test.desc
203
+			t.Parallel()
204
+
205
+			p := newTestPlugin(t, desc, desc, config.Root)
206
+			p.PluginObj.Enabled = true
207
+
208
+			// Need a short-ish path here so we don't run into unix socket path length issues.
209
+			config.ExecRoot, err = ioutil.TempDir("", "plugintest")
210
+
211
+			executor := &executorWithRunning{root: config.ExecRoot}
212
+			config.CreateExecutor = func(m *Manager) (Executor, error) { executor.m = m; return executor, nil }
213
+
214
+			if err := executor.Create(p.GetID(), specs.Spec{}, nil, nil); err != nil {
215
+				t.Fatal(err)
216
+			}
217
+
218
+			root := filepath.Join(root, desc)
219
+			config.Root = filepath.Join(root, "manager")
220
+			if err := os.MkdirAll(filepath.Join(config.Root, p.GetID()), 0755); err != nil {
221
+				t.Fatal(err)
222
+			}
223
+
224
+			if !p.IsEnabled() {
225
+				t.Fatal("plugin should be enabled")
226
+			}
227
+			if err := (&Manager{config: config}).save(p); err != nil {
228
+				t.Fatal(err)
229
+			}
230
+
231
+			s := NewStore()
232
+			config.Store = s
233
+			if err != nil {
234
+				t.Fatal(err)
235
+			}
236
+			defer system.EnsureRemoveAll(config.ExecRoot)
237
+
238
+			m, err := NewManager(config)
239
+			if err != nil {
240
+				t.Fatal(err)
241
+			}
242
+			defer m.Shutdown()
243
+
244
+			p = s.GetAll()[p.GetID()] // refresh `p` with what the manager knows
245
+			if p.Client() == nil {
246
+				t.Fatal("plugin client should not be nil")
247
+			}
248
+		})
249
+	}
250
+}
251
+
252
+func listenTestPlugin(sockAddr string, exit chan struct{}) (net.Listener, error) {
253
+	if err := os.MkdirAll(filepath.Dir(sockAddr), 0755); err != nil {
254
+		return nil, err
255
+	}
256
+	l, err := net.Listen("unix", sockAddr)
257
+	if err != nil {
258
+		return nil, err
259
+	}
260
+	go func() {
261
+		for {
262
+			conn, err := l.Accept()
263
+			if err != nil {
264
+				return
265
+			}
266
+			conn.Close()
267
+		}
268
+	}()
269
+	go func() {
270
+		<-exit
271
+		l.Close()
272
+		os.Remove(sockAddr)
273
+		exit <- struct{}{}
274
+	}()
275
+	return l, nil
139 276
 }
... ...
@@ -19,7 +19,7 @@ func (pm *Manager) disable(p *v2.Plugin, c *controller) error {
19 19
 	return fmt.Errorf("Not implemented")
20 20
 }
21 21
 
22
-func (pm *Manager) restore(p *v2.Plugin) error {
22
+func (pm *Manager) restore(p *v2.Plugin, c *controller) error {
23 23
 	return fmt.Errorf("Not implemented")
24 24
 }
25 25