Browse code

Merge pull request #17514 from mavenugo/ugr

Fixing a case of dangling endpoint during ungraceful daemon restart

Brian Goff authored on 2015/10/31 02:38:27
Showing 5 changed files
... ...
@@ -893,6 +893,13 @@ func (container *Container) buildCreateEndpointOptions(n libnetwork.Network) ([]
893 893
 }
894 894
 
895 895
 func (container *Container) allocateNetwork() error {
896
+	sb := container.getNetworkSandbox()
897
+	if sb != nil {
898
+		// Cleanup any stale sandbox left over due to ungraceful daemon shutdown
899
+		if err := sb.Delete(); err != nil {
900
+			logrus.Errorf("failed to cleanup up stale network sandbox for container %s", container.ID)
901
+		}
902
+	}
896 903
 	updateSettings := false
897 904
 	if len(container.NetworkSettings.Networks) == 0 {
898 905
 		mode := container.hostConfig.NetworkMode
... ...
@@ -919,6 +926,18 @@ func (container *Container) allocateNetwork() error {
919 919
 	return container.writeHostConfig()
920 920
 }
921 921
 
922
+func (container *Container) getNetworkSandbox() libnetwork.Sandbox {
923
+	var sb libnetwork.Sandbox
924
+	container.daemon.netController.WalkSandboxes(func(s libnetwork.Sandbox) bool {
925
+		if s.ContainerID() == container.ID {
926
+			sb = s
927
+			return true
928
+		}
929
+		return false
930
+	})
931
+	return sb
932
+}
933
+
922 934
 // ConnectToNetwork connects a container to a network
923 935
 func (container *Container) ConnectToNetwork(idOrName string) error {
924 936
 	if !container.Running {
... ...
@@ -984,14 +1003,7 @@ func (container *Container) connectToNetwork(idOrName string, updateSettings boo
984 984
 		return err
985 985
 	}
986 986
 
987
-	var sb libnetwork.Sandbox
988
-	controller.WalkSandboxes(func(s libnetwork.Sandbox) bool {
989
-		if s.ContainerID() == container.ID {
990
-			sb = s
991
-			return true
992
-		}
993
-		return false
994
-	})
987
+	sb := container.getNetworkSandbox()
995 988
 	if sb == nil {
996 989
 		options, err := container.buildSandboxOptions(n)
997 990
 		if err != nil {
... ...
@@ -21,7 +21,7 @@ clone git github.com/vdemeester/shakers 3c10293ce22b900c27acad7b28656196fcc2f73b
21 21
 clone git golang.org/x/net 3cffabab72adf04f8e3b01c5baf775361837b5fe https://github.com/golang/net.git
22 22
 
23 23
 #get libnetwork packages
24
-clone git github.com/docker/libnetwork 20351a84241aa1278493d74492db947336989be6
24
+clone git github.com/docker/libnetwork 5fc6ba506daa7914f4d58befb38480ec8e9c9f70
25 25
 clone git github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec
26 26
 clone git github.com/hashicorp/go-msgpack 71c2886f5a673a35f909803f38ece5810165097b
27 27
 clone git github.com/hashicorp/memberlist 9a1e242e454d2443df330bdd51a436d5a9058fc4
... ...
@@ -118,6 +118,12 @@ func (d *driver) Leave(nid, eid string) error {
118 118
 		return fmt.Errorf("could not find network with id %s", nid)
119 119
 	}
120 120
 
121
+	ep := n.endpoint(eid)
122
+
123
+	if ep == nil {
124
+		return types.InternalMaskableErrorf("could not find endpoint with id %s", eid)
125
+	}
126
+
121 127
 	if d.notifyCh != nil {
122 128
 		d.notifyCh <- ovNotify{
123 129
 			action: "leave",
... ...
@@ -168,6 +168,7 @@ func (sb *sandbox) Delete() error {
168 168
 	c := sb.controller
169 169
 
170 170
 	// Detach from all endpoints
171
+	retain := false
171 172
 	for _, ep := range sb.getConnectedEndpoints() {
172 173
 		// endpoint in the Gateway network will be cleaned up
173 174
 		// when when sandbox no longer needs external connectivity
... ...
@@ -176,14 +177,22 @@ func (sb *sandbox) Delete() error {
176 176
 		}
177 177
 
178 178
 		if err := ep.Leave(sb); err != nil {
179
+			retain = true
179 180
 			log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
180 181
 		}
181 182
 
182 183
 		if err := ep.Delete(); err != nil {
184
+			retain = true
183 185
 			log.Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err)
184 186
 		}
185 187
 	}
186 188
 
189
+	if retain {
190
+		sb.Lock()
191
+		sb.inDelete = false
192
+		sb.Unlock()
193
+		return fmt.Errorf("could not cleanup all the endpoints in container %s / sandbox %s", sb.containerID, sb.id)
194
+	}
187 195
 	// Container is going away. Path cache in etchosts is most
188 196
 	// likely not required any more. Drop it.
189 197
 	etchosts.Drop(sb.config.hostsPath)
... ...
@@ -3,6 +3,7 @@ package libnetwork
3 3
 import (
4 4
 	"container/heap"
5 5
 	"encoding/json"
6
+	"sync"
6 7
 
7 8
 	"github.com/Sirupsen/logrus"
8 9
 	"github.com/docker/libnetwork/datastore"
... ...
@@ -119,8 +120,9 @@ func (sbs *sbState) DataScope() string {
119 119
 
120 120
 func (sb *sandbox) storeUpdate() error {
121 121
 	sbs := &sbState{
122
-		c:  sb.controller,
123
-		ID: sb.id,
122
+		c:   sb.controller,
123
+		ID:  sb.id,
124
+		Cid: sb.containerID,
124 125
 	}
125 126
 
126 127
 retry:
... ...
@@ -197,15 +199,17 @@ func (c *controller) sandboxCleanup() {
197 197
 
198 198
 		for _, eps := range sbs.Eps {
199 199
 			n, err := c.getNetworkFromStore(eps.Nid)
200
+			var ep *endpoint
200 201
 			if err != nil {
201 202
 				logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err)
202
-				continue
203
-			}
204
-
205
-			ep, err := n.getEndpointFromStore(eps.Eid)
206
-			if err != nil {
207
-				logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
208
-				continue
203
+				n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}}
204
+				ep = &endpoint{id: eps.Eid, network: n}
205
+			} else {
206
+				ep, err = n.getEndpointFromStore(eps.Eid)
207
+				if err != nil {
208
+					logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
209
+					ep = &endpoint{id: eps.Eid, network: n}
210
+				}
209 211
 			}
210 212
 
211 213
 			heap.Push(&sb.endpoints, ep)