Browse code

Windows: DetachVhd attempt in cleanup

Signed-off-by: John Howard <jhoward@microsoft.com>

This is a fix for a few related scenarios where it's impossible to remove layers or containers
until the host is rebooted. Generally (or at least easiest to repro) through a forced daemon kill
while a container is running.

Possibly slightly worse than that, as following a host reboot, the scratch layer would possibly be leaked and
left on disk under the dataroot\windowsfilter directory after the container is removed.

One such example of a failure:

1. run a long running container with the --rm flag
docker run --rm -d --name test microsoft/windowsservercore powershell sleep 30
2. Force kill the daemon not allowing it to cleanup. Simulates a crash or a host power-cycle.
3. (re-)Start daemon
4. docker ps -a
PS C:\control> docker ps -a
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
7aff773d782b malloc "powershell start-sl…" 11 seconds ago Removal In Progress malloc
5. Try to remove
PS C:\control> docker rm 7aff
Error response from daemon: container 7aff773d782bbf35d95095369ffcb170b7b8f0e6f8f65d5aff42abf61234855d: driver "windowsfilter" failed to remove root filesystem: rename C:\control\windowsfilter\7aff773d782bbf35d95095369ffcb170b7b8f0e6f8f65d5aff42abf61234855d C:\control\windowsfilter\7aff773d782bbf35d95095369ffcb170b7b8f0e6f8f65d5aff42abf61234855d-removing: Access is denied.
PS C:\control>

Step 5 fails.

John Howard authored on 2018/08/24 07:56:27
Showing 1 changed files
... ...
@@ -6,7 +6,6 @@ import (
6 6
 	"bufio"
7 7
 	"bytes"
8 8
 	"encoding/json"
9
-	"errors"
10 9
 	"fmt"
11 10
 	"io"
12 11
 	"io/ioutil"
... ...
@@ -23,6 +22,7 @@ import (
23 23
 	"github.com/Microsoft/go-winio"
24 24
 	"github.com/Microsoft/go-winio/archive/tar"
25 25
 	"github.com/Microsoft/go-winio/backuptar"
26
+	"github.com/Microsoft/go-winio/vhd"
26 27
 	"github.com/Microsoft/hcsshim"
27 28
 	"github.com/docker/docker/daemon/graphdriver"
28 29
 	"github.com/docker/docker/pkg/archive"
... ...
@@ -33,6 +33,7 @@ import (
33 33
 	"github.com/docker/docker/pkg/reexec"
34 34
 	"github.com/docker/docker/pkg/system"
35 35
 	units "github.com/docker/go-units"
36
+	"github.com/pkg/errors"
36 37
 	"github.com/sirupsen/logrus"
37 38
 	"golang.org/x/sys/windows"
38 39
 )
... ...
@@ -331,7 +332,18 @@ func (d *Driver) Remove(id string) error {
331 331
 	tmpID := fmt.Sprintf("%s-removing", rID)
332 332
 	tmpLayerPath := filepath.Join(d.info.HomeDir, tmpID)
333 333
 	if err := os.Rename(layerPath, tmpLayerPath); err != nil && !os.IsNotExist(err) {
334
-		return err
334
+		if !os.IsPermission(err) {
335
+			return err
336
+		}
337
+		// If permission denied, it's possible that the scratch is still mounted, an
338
+		// artifact after a hard daemon crash for example. Worth a shot to try detaching it
339
+		// before retrying the rename.
340
+		if detachErr := vhd.DetachVhd(filepath.Join(layerPath, "sandbox.vhdx")); detachErr != nil {
341
+			return errors.Wrapf(err, "failed to detach VHD: %s", detachErr)
342
+		}
343
+		if renameErr := os.Rename(layerPath, tmpLayerPath); renameErr != nil && !os.IsNotExist(renameErr) {
344
+			return errors.Wrapf(err, "second rename attempt following detach failed: %s", renameErr)
345
+		}
335 346
 	}
336 347
 	if err := hcsshim.DestroyLayer(d.info, tmpID); err != nil {
337 348
 		logrus.Errorf("Failed to DestroyLayer %s: %s", id, err)