Browse code

Make plugin removes more resilient to failure

Before this patch, if the plugin's `config.json` is successfully removed
but the main plugin state dir could not be removed for some reason (e.g.
leaked mount), it will prevent the daemon from being able to be
restarted.

This patches changes this to atomically remove the plugin such that on
daemon restart we can detect that there was an error and re-try. It also
changes the logic so that it only logs errors on restore rather than
erroring out the daemon.

This also removes some code which is now duplicated elsewhere.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>

Brian Goff authored on 2017/06/27 03:54:14
Showing 4 changed files
... ...
@@ -5,6 +5,7 @@ package main
5 5
 import (
6 6
 	"bufio"
7 7
 	"bytes"
8
+	"context"
8 9
 	"encoding/json"
9 10
 	"fmt"
10 11
 	"io"
... ...
@@ -25,6 +26,9 @@ import (
25 25
 	"crypto/x509"
26 26
 
27 27
 	"github.com/cloudflare/cfssl/helpers"
28
+	"github.com/docker/docker/api"
29
+	"github.com/docker/docker/api/types"
30
+	"github.com/docker/docker/client"
28 31
 	"github.com/docker/docker/integration-cli/checker"
29 32
 	"github.com/docker/docker/integration-cli/cli"
30 33
 	"github.com/docker/docker/integration-cli/daemon"
... ...
@@ -2980,3 +2984,45 @@ func (s *DockerDaemonSuite) TestShmSizeReload(c *check.C) {
2980 2980
 	c.Assert(err, check.IsNil, check.Commentf("Output: %s", out))
2981 2981
 	c.Assert(strings.TrimSpace(out), check.Equals, fmt.Sprintf("%v", size))
2982 2982
 }
2983
+
2984
+// TestFailedPluginRemove makes sure that a failed plugin remove does not block
2985
+// the daemon from starting
2986
+func (s *DockerDaemonSuite) TestFailedPluginRemove(c *check.C) {
2987
+	testRequires(c, DaemonIsLinux, IsAmd64, SameHostDaemon)
2988
+	d := daemon.New(c, dockerBinary, dockerdBinary, daemon.Config{})
2989
+	d.Start(c)
2990
+	cli, err := client.NewClient(d.Sock(), api.DefaultVersion, nil, nil)
2991
+	c.Assert(err, checker.IsNil)
2992
+
2993
+	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Second)
2994
+	defer cancel()
2995
+
2996
+	name := "test-plugin-rm-fail"
2997
+	out, err := cli.PluginInstall(ctx, name, types.PluginInstallOptions{
2998
+		Disabled:             true,
2999
+		AcceptAllPermissions: true,
3000
+		RemoteRef:            "cpuguy83/docker-logdriver-test",
3001
+	})
3002
+	c.Assert(err, checker.IsNil)
3003
+	defer out.Close()
3004
+	io.Copy(ioutil.Discard, out)
3005
+
3006
+	ctx, cancel = context.WithTimeout(context.Background(), 30*time.Second)
3007
+	defer cancel()
3008
+	p, _, err := cli.PluginInspectWithRaw(ctx, name)
3009
+	c.Assert(err, checker.IsNil)
3010
+
3011
+	// simulate a bad/partial removal by removing the plugin config.
3012
+	configPath := filepath.Join(d.Root, "plugins", p.ID, "config.json")
3013
+	c.Assert(os.Remove(configPath), checker.IsNil)
3014
+
3015
+	d.Restart(c)
3016
+	ctx, cancel = context.WithTimeout(context.Background(), 30*time.Second)
3017
+	defer cancel()
3018
+	_, err = cli.Ping(ctx)
3019
+	c.Assert(err, checker.IsNil)
3020
+
3021
+	_, _, err = cli.PluginInspectWithRaw(ctx, name)
3022
+	// plugin should be gone since the config.json is gone
3023
+	c.Assert(err, checker.NotNil)
3024
+}
... ...
@@ -13,7 +13,6 @@ import (
13 13
 	"os"
14 14
 	"path"
15 15
 	"path/filepath"
16
-	"sort"
17 16
 	"strings"
18 17
 
19 18
 	"github.com/Sirupsen/logrus"
... ...
@@ -32,6 +31,7 @@ import (
32 32
 	"github.com/docker/docker/pkg/mount"
33 33
 	"github.com/docker/docker/pkg/pools"
34 34
 	"github.com/docker/docker/pkg/progress"
35
+	"github.com/docker/docker/pkg/system"
35 36
 	"github.com/docker/docker/plugin/v2"
36 37
 	refstore "github.com/docker/docker/reference"
37 38
 	"github.com/opencontainers/go-digest"
... ...
@@ -624,14 +624,20 @@ func (pm *Manager) Remove(name string, config *types.PluginRmConfig) error {
624 624
 	}()
625 625
 
626 626
 	id := p.GetID()
627
-	pm.config.Store.Remove(p)
628 627
 	pluginDir := filepath.Join(pm.config.Root, id)
629
-	if err := recursiveUnmount(pluginDir); err != nil {
630
-		logrus.WithField("dir", pluginDir).WithField("id", id).Warn(err)
628
+
629
+	if err := mount.RecursiveUnmount(pluginDir); err != nil {
630
+		return errors.Wrap(err, "error unmounting plugin data")
631 631
 	}
632
-	if err := os.RemoveAll(pluginDir); err != nil {
633
-		logrus.Warnf("unable to remove %q from plugin remove: %v", pluginDir, err)
632
+
633
+	if err := os.Rename(pluginDir, pluginDir+"-removing"); err != nil {
634
+		return errors.Wrap(err, "error performing atomic remove of plugin dir")
635
+	}
636
+
637
+	if err := system.EnsureRemoveAll(pluginDir); err != nil {
638
+		return errors.Wrap(err, "error removing plugin dir")
634 639
 	}
640
+	pm.config.Store.Remove(p)
635 641
 	pm.config.LogPluginEvent(id, name, "remove")
636 642
 	return nil
637 643
 }
... ...
@@ -652,27 +658,6 @@ func getMounts(root string) ([]string, error) {
652 652
 	return mounts, nil
653 653
 }
654 654
 
655
-func recursiveUnmount(root string) error {
656
-	mounts, err := getMounts(root)
657
-	if err != nil {
658
-		return err
659
-	}
660
-
661
-	// sort in reverse-lexicographic order so the root mount will always be last
662
-	sort.Sort(sort.Reverse(sort.StringSlice(mounts)))
663
-
664
-	for i, m := range mounts {
665
-		if err := mount.Unmount(m); err != nil {
666
-			if i == len(mounts)-1 {
667
-				return errors.Wrapf(err, "error performing recursive unmount on %s", root)
668
-			}
669
-			logrus.WithError(err).WithField("mountpoint", m).Warn("could not unmount")
670
-		}
671
-	}
672
-
673
-	return nil
674
-}
675
-
676 655
 // Set sets plugin args
677 656
 func (pm *Manager) Set(name string, args []string) error {
678 657
 	p, err := pm.config.Store.GetV2Plugin(name)
... ...
@@ -163,6 +163,19 @@ func (pm *Manager) StateChanged(id string, e libcontainerd.StateInfo) error {
163 163
 	return nil
164 164
 }
165 165
 
166
+func handleLoadError(err error, id string) {
167
+	if err == nil {
168
+		return
169
+	}
170
+	logger := logrus.WithError(err).WithField("id", id)
171
+	if os.IsNotExist(errors.Cause(err)) {
172
+		// Likely some error while removing on an older version of docker
173
+		logger.Warn("missing plugin config, skipping: this may be caused due to a failed remove and requires manual cleanup.")
174
+		return
175
+	}
176
+	logger.Error("error loading plugin, skipping")
177
+}
178
+
166 179
 func (pm *Manager) reload() error { // todo: restore
167 180
 	dir, err := ioutil.ReadDir(pm.config.Root)
168 181
 	if err != nil {
... ...
@@ -173,9 +186,17 @@ func (pm *Manager) reload() error { // todo: restore
173 173
 		if validFullID.MatchString(v.Name()) {
174 174
 			p, err := pm.loadPlugin(v.Name())
175 175
 			if err != nil {
176
-				return err
176
+				handleLoadError(err, v.Name())
177
+				continue
177 178
 			}
178 179
 			plugins[p.GetID()] = p
180
+		} else {
181
+			if validFullID.MatchString(strings.TrimSuffix(v.Name(), "-removing")) {
182
+				// There was likely some error while removing this plugin, let's try to remove again here
183
+				if err := system.EnsureRemoveAll(v.Name()); err != nil {
184
+					logrus.WithError(err).WithField("id", v.Name()).Warn("error while attempting to clean up previously removed plugin")
185
+				}
186
+			}
179 187
 		}
180 188
 	}
181 189
 
... ...
@@ -204,7 +204,7 @@ func (pm *Manager) upgradePlugin(p *v2.Plugin, configDigest digest.Digest, blobs
204 204
 	// Make sure nothing is mounted
205 205
 	// This could happen if the plugin was disabled with `-f` with active mounts.
206 206
 	// If there is anything in `orig` is still mounted, this should error out.
207
-	if err := recursiveUnmount(orig); err != nil {
207
+	if err := mount.RecursiveUnmount(orig); err != nil {
208 208
 		return err
209 209
 	}
210 210