Browse code

daemon: rename: don't reload endpoint from datastore

Commit 8b7af1d0f added some code to update the DNSNames of all
endpoints attached to a sandbox by loading a new instance of each
affected endpoints from the datastore through a call to
`Network.EndpointByID()`.

This method then calls `Network.getEndpointFromStore()`, that in
turn calls `store.GetObject()`, which then calls `cache.get()`,
which calls `o.CopyTo(kvObject)`. This effectively creates a fresh
new instance of an Endpoint. However, endpoints are already kept in
memory by Sandbox, meaning we now have two in-memory instances of
the same Endpoint.

As it turns out, libnetwork is built around the idea that no two objects
representing the same thing should leave in-memory, otherwise breaking
mutex locking and optimistic locking (as both instances will have a drifting
version tracking ID -- dbIndex in libnetwork parliance).

In this specific case, this bug materializes by container rename failing
when applied a second time for a given container. An integration test is
added to make sure this won't happen again.

Signed-off-by: Albin Kerouanton <albinker@gmail.com>

Albin Kerouanton authored on 2024/01/24 04:15:45
Showing 7 changed files
... ...
@@ -2,6 +2,7 @@ package daemon // import "github.com/docker/docker/daemon"
2 2
 
3 3
 import (
4 4
 	"context"
5
+	"fmt"
5 6
 	"strings"
6 7
 
7 8
 	"github.com/containerd/log"
... ...
@@ -127,9 +128,9 @@ func (daemon *Daemon) ContainerRename(oldName, newName string) (retErr error) {
127 127
 				return err
128 128
 			}
129 129
 
130
-			ep, err := nw.EndpointByID(epConfig.EndpointID)
131
-			if err != nil {
132
-				return err
130
+			ep := sb.GetEndpoint(epConfig.EndpointID)
131
+			if ep == nil {
132
+				return fmt.Errorf("no endpoint attached to network %s found", nw.Name())
133 133
 			}
134 134
 
135 135
 			oldDNSNames := make([]string, len(epConfig.DNSNames))
... ...
@@ -192,3 +192,25 @@ func TestRenameContainerWithLinkedContainer(t *testing.T) {
192 192
 	assert.NilError(t, err)
193 193
 	assert.Check(t, is.Equal(db1ID, inspect.ID))
194 194
 }
195
+
196
+// Regression test for https://github.com/moby/moby/issues/47186
197
+func TestRenameContainerTwice(t *testing.T) {
198
+	ctx := setupTest(t)
199
+	apiClient := testEnv.APIClient()
200
+
201
+	ctrName := "c0"
202
+	container.Run(ctx, t, apiClient, container.WithName("c0"))
203
+	defer func() {
204
+		container.Remove(ctx, t, apiClient, ctrName, containertypes.RemoveOptions{
205
+			Force: true,
206
+		})
207
+	}()
208
+
209
+	err := apiClient.ContainerRename(ctx, "c0", "c1")
210
+	assert.NilError(t, err)
211
+	ctrName = "c1"
212
+
213
+	err = apiClient.ContainerRename(ctx, "c1", "c2")
214
+	assert.NilError(t, err)
215
+	ctrName = "c2"
216
+}
... ...
@@ -623,7 +623,7 @@ func (ep *Endpoint) addServiceInfoToCluster(sb *Sandbox) error {
623 623
 	// In case the deleteServiceInfoToCluster arrives first, this one is happening after the endpoint is
624 624
 	// removed from the list, in this situation the delete will bail out not finding any data to cleanup
625 625
 	// and the add will bail out not finding the endpoint on the sandbox.
626
-	if err := sb.getEndpoint(ep.ID()); err == nil {
626
+	if err := sb.GetEndpoint(ep.ID()); err == nil {
627 627
 		log.G(context.TODO()).Warnf("addServiceInfoToCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID())
628 628
 		return nil
629 629
 	}
... ...
@@ -692,7 +692,7 @@ func (ep *Endpoint) deleteServiceInfoFromCluster(sb *Sandbox, fullRemove bool, m
692 692
 	// get caught in disableServceInNetworkDB, but we check here to make the
693 693
 	// nature of the condition more clear.
694 694
 	// See comment in addServiceInfoToCluster()
695
-	if err := sb.getEndpoint(ep.ID()); err == nil {
695
+	if err := sb.GetEndpoint(ep.ID()); err == nil {
696 696
 		log.G(context.TODO()).Warnf("deleteServiceInfoFromCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID())
697 697
 		return nil
698 698
 	}
... ...
@@ -194,7 +194,7 @@ func (ep *Endpoint) Info() EndpointInfo {
194 194
 		return ep
195 195
 	}
196 196
 
197
-	return sb.getEndpoint(ep.ID())
197
+	return sb.GetEndpoint(ep.ID())
198 198
 }
199 199
 
200 200
 // Iface returns information about the interface which was assigned to
... ...
@@ -1276,8 +1276,8 @@ func (n *Network) EndpointByName(name string) (*Endpoint, error) {
1276 1276
 	return e, nil
1277 1277
 }
1278 1278
 
1279
-// EndpointByID returns the Endpoint which has the passed id. If not found,
1280
-// the error ErrNoSuchEndpoint is returned.
1279
+// EndpointByID should *never* be called as it's going to create a 2nd instance of an Endpoint. The first one lives in
1280
+// the Sandbox the endpoint is attached to. Instead, the endpoint should be retrieved by calling [Sandbox.Endpoints()].
1281 1281
 func (n *Network) EndpointByID(id string) (*Endpoint, error) {
1282 1282
 	if id == "" {
1283 1283
 		return nil, ErrInvalidID(id)
... ...
@@ -334,7 +334,7 @@ func (sb *Sandbox) removeEndpointRaw(ep *Endpoint) {
334 334
 	}
335 335
 }
336 336
 
337
-func (sb *Sandbox) getEndpoint(id string) *Endpoint {
337
+func (sb *Sandbox) GetEndpoint(id string) *Endpoint {
338 338
 	sb.mu.Lock()
339 339
 	defer sb.mu.Unlock()
340 340
 
... ...
@@ -554,7 +554,7 @@ func (sb *Sandbox) DisableService() (err error) {
554 554
 }
555 555
 
556 556
 func (sb *Sandbox) clearNetworkResources(origEp *Endpoint) error {
557
-	ep := sb.getEndpoint(origEp.id)
557
+	ep := sb.GetEndpoint(origEp.id)
558 558
 	if ep == nil {
559 559
 		return fmt.Errorf("could not find the sandbox endpoint data for endpoint %s",
560 560
 			origEp.id)
... ...
@@ -57,7 +57,7 @@ func (n *Network) findLBEndpointSandbox() (*Endpoint, *Sandbox, error) {
57 57
 	if !ok {
58 58
 		return nil, nil, fmt.Errorf("Unable to get sandbox for %s(%s) in for %s", ep.Name(), ep.ID(), n.ID())
59 59
 	}
60
-	sep := sb.getEndpoint(ep.ID())
60
+	sep := sb.GetEndpoint(ep.ID())
61 61
 	if sep == nil {
62 62
 		return nil, nil, fmt.Errorf("Load balancing endpoint %s(%s) removed from %s", ep.Name(), ep.ID(), n.ID())
63 63
 	}