Browse code

cmd/docker-proxy: UDP: reply to clients with original daddr

When a UDP server is running on a multihomed server, as is the case with
pretty much _all_ Docker hosts (eg. eth0 + docker0), the kernel has to
choose which source address is used when replying to a UDP client. But
that process is based on heuristics and is fallible.

If the address picked doesn't match the original destination address
used by the client, it'll drop the datagram and return an ICMP Port
Unreachable.

To prevent that, we need to:

- `setsockopt(IP_PKTINFO)` on proxy's sockets.
- Extract the original destination address from an ancillary message
every time a new 'UDP connection' is 'established' (ie. every time we
insert a new entry into the UDP conntrack table).
- And finally, pass a control message containing the desired source
address to the kernel, every time we send a response back to the
client.

Also, update the inline comment on read errors in `(*UDPProxy).Run()`.
This comment was misleadingly referencing ECONNREFUSED - Linux's UDP
implementation never returns this error (see [1]). Instead, state why
`net.ErrClosed` is perfectly fine and doesn't need to be logged
(although, docker-proxy currently logs to nowhere).

[1]: https://github.com/search?q=repo%3Atorvalds%2Flinux+ECONNREFUSED+path%3A%2F%5Enet%5C%2F%28ipv4%7Cipv6%29%5C%2F%28udp%7Ctcp%29%2F&type=code

Signed-off-by: Albin Kerouanton <albinker@gmail.com>

Albin Kerouanton authored on 2024/10/03 04:30:36
Showing 5 changed files
... ...
@@ -11,6 +11,8 @@ import (
11 11
 
12 12
 	"github.com/docker/docker/dockerversion"
13 13
 	"github.com/ishidawataru/sctp"
14
+	"golang.org/x/net/ipv4"
15
+	"golang.org/x/net/ipv6"
14 16
 )
15 17
 
16 18
 // The caller is expected to pass-in open file descriptors ...
... ...
@@ -59,9 +61,9 @@ func main() {
59 59
 }
60 60
 
61 61
 func newProxy(config ProxyConfig) (p Proxy, err error) {
62
-	ipv := ipv4
62
+	ipv := ip4
63 63
 	if config.HostIP.To4() == nil {
64
-		ipv = ipv6
64
+		ipv = ip6
65 65
 	}
66 66
 
67 67
 	switch config.Proto {
... ...
@@ -96,6 +98,21 @@ func newProxy(config ProxyConfig) (p Proxy, err error) {
96 96
 			if err != nil {
97 97
 				return nil, fmt.Errorf("failed to listen on %s: %w", hostAddr, err)
98 98
 			}
99
+			// We need to setsockopt(IP_PKTINFO) on the listener to get the destination address as an ancillary
100
+			// message. The daddr will be used as the source address when sending back replies coming from the
101
+			// container to the client. If we don't do this, the kernel will have to pick a source address for us, and
102
+			// it might not pick what the client expects. That would result in ICMP Port Unreachable.
103
+			if ipv == ip4 {
104
+				pc := ipv4.NewPacketConn(listener)
105
+				if err := pc.SetControlMessage(ipv4.FlagDst, true); err != nil {
106
+					return nil, fmt.Errorf("failed to setsockopt(IP_PKTINFO): %w", err)
107
+				}
108
+			} else {
109
+				pc := ipv6.NewPacketConn(listener)
110
+				if err := pc.SetControlMessage(ipv6.FlagDst, true); err != nil {
111
+					return nil, fmt.Errorf("failed to setsockopt(IPV6_RECVPKTINFO): %w", err)
112
+				}
113
+			}
99 114
 		} else {
100 115
 			l, err := net.FilePacketConn(config.ListenSock)
101 116
 			if err != nil {
... ...
@@ -108,7 +125,7 @@ func newProxy(config ProxyConfig) (p Proxy, err error) {
108 108
 			}
109 109
 		}
110 110
 		container := &net.UDPAddr{IP: config.ContainerIP, Port: config.ContainerPort}
111
-		p, err = NewUDPProxy(listener, container)
111
+		p, err = NewUDPProxy(listener, container, ipv)
112 112
 	case "sctp":
113 113
 		var listener *sctp.SCTPListener
114 114
 		if config.ListenSock != nil {
... ...
@@ -7,9 +7,9 @@ type ipVersion string
7 7
 
8 8
 const (
9 9
 	// IPv4 is version 4
10
-	ipv4 ipVersion = "4"
10
+	ip4 ipVersion = "4"
11 11
 	// IPv4 is version 6
12
-	ipv6 ipVersion = "6"
12
+	ip6 ipVersion = "6"
13 13
 )
14 14
 
15 15
 // Proxy defines the behavior of a proxy. It forwards traffic back and forth
... ...
@@ -2,12 +2,15 @@ package main
2 2
 
3 3
 import (
4 4
 	"encoding/binary"
5
+	"errors"
5 6
 	"log"
6 7
 	"net"
7
-	"strings"
8 8
 	"sync"
9 9
 	"syscall"
10 10
 	"time"
11
+
12
+	"golang.org/x/net/ipv4"
13
+	"golang.org/x/net/ipv6"
11 14
 )
12 15
 
13 16
 const (
... ...
@@ -51,19 +54,21 @@ type UDPProxy struct {
51 51
 	backendAddr    *net.UDPAddr
52 52
 	connTrackTable connTrackMap
53 53
 	connTrackLock  sync.Mutex
54
+	ipVer          ipVersion
54 55
 }
55 56
 
56 57
 // NewUDPProxy creates a new UDPProxy.
57
-func NewUDPProxy(listener *net.UDPConn, backendAddr *net.UDPAddr) (*UDPProxy, error) {
58
+func NewUDPProxy(listener *net.UDPConn, backendAddr *net.UDPAddr, ipVer ipVersion) (*UDPProxy, error) {
58 59
 	return &UDPProxy{
59 60
 		listener:       listener,
60 61
 		frontendAddr:   listener.LocalAddr().(*net.UDPAddr),
61 62
 		backendAddr:    backendAddr,
62 63
 		connTrackTable: make(connTrackMap),
64
+		ipVer:          ipVer,
63 65
 	}, nil
64 66
 }
65 67
 
66
-func (proxy *UDPProxy) replyLoop(proxyConn *net.UDPConn, clientAddr *net.UDPAddr, clientKey *connTrackKey) {
68
+func (proxy *UDPProxy) replyLoop(proxyConn *net.UDPConn, serverAddr net.IP, clientAddr *net.UDPAddr, clientKey *connTrackKey) {
67 69
 	defer func() {
68 70
 		proxy.connTrackLock.Lock()
69 71
 		delete(proxy.connTrackTable, *clientKey)
... ...
@@ -71,6 +76,15 @@ func (proxy *UDPProxy) replyLoop(proxyConn *net.UDPConn, clientAddr *net.UDPAddr
71 71
 		proxyConn.Close()
72 72
 	}()
73 73
 
74
+	var oob []byte
75
+	if proxy.ipVer == ip4 {
76
+		cm := &ipv4.ControlMessage{Src: serverAddr}
77
+		oob = cm.Marshal()
78
+	} else {
79
+		cm := &ipv6.ControlMessage{Src: serverAddr}
80
+		oob = cm.Marshal()
81
+	}
82
+
74 83
 	readBuf := make([]byte, UDPBufSize)
75 84
 	for {
76 85
 		proxyConn.SetReadDeadline(time.Now().Add(UDPConnTrackTimeout))
... ...
@@ -88,7 +102,7 @@ func (proxy *UDPProxy) replyLoop(proxyConn *net.UDPConn, clientAddr *net.UDPAddr
88 88
 			return
89 89
 		}
90 90
 		for i := 0; i != read; {
91
-			written, err := proxy.listener.WriteToUDP(readBuf[i:read], clientAddr)
91
+			written, _, err := proxy.listener.WriteMsgUDP(readBuf[i:read], oob, clientAddr)
92 92
 			if err != nil {
93 93
 				return
94 94
 			}
... ...
@@ -100,13 +114,19 @@ func (proxy *UDPProxy) replyLoop(proxyConn *net.UDPConn, clientAddr *net.UDPAddr
100 100
 // Run starts forwarding the traffic using UDP.
101 101
 func (proxy *UDPProxy) Run() {
102 102
 	readBuf := make([]byte, UDPBufSize)
103
+	var oob []byte
104
+	if proxy.ipVer == ip4 {
105
+		oob = ipv4.NewControlMessage(ipv4.FlagDst)
106
+	} else {
107
+		oob = ipv6.NewControlMessage(ipv6.FlagDst)
108
+	}
109
+
103 110
 	for {
104
-		read, from, err := proxy.listener.ReadFromUDP(readBuf)
111
+		read, _, _, from, err := proxy.listener.ReadMsgUDP(readBuf, oob)
105 112
 		if err != nil {
106
-			// NOTE: Apparently ReadFrom doesn't return
107
-			// ECONNREFUSED like Read do (see comment in
108
-			// UDPProxy.replyLoop)
109
-			if !isClosedError(err) {
113
+			// The frontend listener socket might be closed by the signal
114
+			// handler. In that case, don't log anything - it's not an error.
115
+			if !errors.Is(err, net.ErrClosed) {
110 116
 				log.Printf("Stopping proxy on udp/%v for udp/%v (%s)", proxy.frontendAddr, proxy.backendAddr, err)
111 117
 			}
112 118
 			break
... ...
@@ -123,7 +143,15 @@ func (proxy *UDPProxy) Run() {
123 123
 				continue
124 124
 			}
125 125
 			proxy.connTrackTable[*fromKey] = proxyConn
126
-			go proxy.replyLoop(proxyConn, from, fromKey)
126
+
127
+			daddr, err := readDestFromCmsg(oob, proxy.ipVer)
128
+			if err != nil {
129
+				log.Printf("Failed to parse control message: %v", err)
130
+				proxy.connTrackLock.Unlock()
131
+				continue
132
+			}
133
+
134
+			go proxy.replyLoop(proxyConn, daddr, from, fromKey)
127 135
 		}
128 136
 		proxy.connTrackLock.Unlock()
129 137
 		for i := 0; i != read; {
... ...
@@ -137,6 +165,35 @@ func (proxy *UDPProxy) Run() {
137 137
 	}
138 138
 }
139 139
 
140
+func readDestFromCmsg(oob []byte, ipVer ipVersion) (_ net.IP, err error) {
141
+	defer func() {
142
+		// In case of partial upgrade / downgrade, docker-proxy could read
143
+		// control messages from a socket which doesn't have the sockopt
144
+		// IP_PKTINFO enabled. In that case, the control message will be all-0
145
+		// and Go's ControlMessage.Parse() will report an 'invalid header
146
+		// length' error. In that case, ignore the error and return an empty
147
+		// daddr - the kernel will pick a source address for us anyway (but
148
+		// maybe it'll be the wrong one).
149
+		if err != nil && err.Error() == "invalid header length" {
150
+			err = nil
151
+		}
152
+	}()
153
+
154
+	if ipVer == ip4 {
155
+		cm := &ipv4.ControlMessage{}
156
+		if err := cm.Parse(oob); err != nil {
157
+			return nil, err
158
+		}
159
+		return cm.Dst, nil
160
+	}
161
+
162
+	cm := &ipv6.ControlMessage{}
163
+	if err := cm.Parse(oob); err != nil {
164
+		return nil, err
165
+	}
166
+	return cm.Dst, nil
167
+}
168
+
140 169
 // Close stops forwarding the traffic.
141 170
 func (proxy *UDPProxy) Close() {
142 171
 	proxy.listener.Close()
... ...
@@ -146,13 +203,3 @@ func (proxy *UDPProxy) Close() {
146 146
 		conn.Close()
147 147
 	}
148 148
 }
149
-
150
-func isClosedError(err error) bool {
151
-	/* This comparison is ugly, but unfortunately, net.go doesn't export errClosing.
152
-	 * See:
153
-	 * http://golang.org/src/pkg/net/net.go
154
-	 * https://code.google.com/p/go/issues/detail?id=4337
155
-	 * https://groups.google.com/forum/#!msg/golang-nuts/0_aaCvBmOcM/SptmDyX1XJMJ
156
-	 */
157
-	return strings.HasSuffix(err.Error(), "use of closed network connection")
158
-}
... ...
@@ -128,9 +128,9 @@ func TestDisableNAT(t *testing.T) {
128 128
 	}
129 129
 }
130 130
 
131
-// Check that a container on one network can reach a service in a container on
132
-// another network, via a mapped port on the host.
133
-func TestPortMappedHairpin(t *testing.T) {
131
+// Check that a container on one network can reach a TCP service in a container
132
+// on another network, via a mapped port on the host.
133
+func TestPortMappedHairpinTCP(t *testing.T) {
134 134
 	skip.If(t, testEnv.IsRootless)
135 135
 
136 136
 	ctx := setupTest(t)
... ...
@@ -174,6 +174,56 @@ func TestPortMappedHairpin(t *testing.T) {
174 174
 	assert.Check(t, is.Contains(res.Stderr.String(), "404 Not Found"))
175 175
 }
176 176
 
177
+// Check that a container on one network can reach a UDP service in a container
178
+// on another network, via a mapped port on the host.
179
+// Regression test for https://github.com/moby/libnetwork/issues/1729.
180
+func TestPortMappedHairpinUDP(t *testing.T) {
181
+	skip.If(t, testEnv.IsRootless)
182
+
183
+	ctx := setupTest(t)
184
+	d := daemon.New(t)
185
+	d.StartWithBusybox(ctx, t)
186
+	defer d.Stop(t)
187
+	c := d.NewClientT(t)
188
+	defer c.Close()
189
+
190
+	// Find an address on the test host.
191
+	conn, err := net.Dial("tcp4", "hub.docker.com:80")
192
+	assert.NilError(t, err)
193
+	hostAddr := conn.LocalAddr().(*net.TCPAddr).IP.String()
194
+	conn.Close()
195
+
196
+	const serverNetName = "servernet"
197
+	network.CreateNoError(ctx, t, c, serverNetName)
198
+	defer network.RemoveNoError(ctx, t, c, serverNetName)
199
+	const clientNetName = "clientnet"
200
+	network.CreateNoError(ctx, t, c, clientNetName)
201
+	defer network.RemoveNoError(ctx, t, c, clientNetName)
202
+
203
+	serverId := container.Run(ctx, t, c,
204
+		container.WithNetworkMode(serverNetName),
205
+		container.WithExposedPorts("54/udp"),
206
+		container.WithPortMap(nat.PortMap{"54/udp": {{HostIP: "0.0.0.0"}}}),
207
+		container.WithCmd("/bin/sh", "-c", "echo 'foobar.internal 192.168.155.23' | dnsd -c - -p 54"),
208
+	)
209
+	defer c.ContainerRemove(ctx, serverId, containertypes.RemoveOptions{Force: true})
210
+
211
+	inspect := container.Inspect(ctx, t, c, serverId)
212
+	hostPort := inspect.NetworkSettings.Ports["54/udp"][0].HostPort
213
+
214
+	// nslookup gets an answer quickly from the dns server, but then tries to
215
+	// query another DNS server (for some unknown reasons) and times out. Hence,
216
+	// we need >5s to execute this test.
217
+	clientCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
218
+	defer cancel()
219
+	res := container.RunAttach(clientCtx, t, c,
220
+		container.WithNetworkMode(clientNetName),
221
+		container.WithCmd("nslookup", "foobar.internal", net.JoinHostPort(hostAddr, hostPort)),
222
+		container.WithAutoRemove,
223
+	)
224
+	assert.Check(t, is.Contains(res.Stdout.String(), "192.168.155.23"))
225
+}
226
+
177 227
 // Check that a container on an IPv4-only network can have a port mapping
178 228
 // from a specific IPv6 host address (using docker-proxy).
179 229
 // Regression test for https://github.com/moby/moby/issues/48067 (which
... ...
@@ -605,6 +605,22 @@ func bindTCPOrUDP(cfg portBindingReq, port, typ, proto int) (_ portBinding, retE
605 605
 	if domain == syscall.AF_INET6 {
606 606
 		syscall.SetsockoptInt(sd, syscall.IPPROTO_IPV6, syscall.IPV6_V6ONLY, 1)
607 607
 	}
608
+	if typ == syscall.SOCK_DGRAM {
609
+		// Enable IP_PKTINFO for UDP sockets to get the destination address.
610
+		// The destination address will be used as the source address when
611
+		// sending back replies coming from the container.
612
+		lvl := syscall.IPPROTO_IP
613
+		opt := syscall.IP_PKTINFO
614
+		optName := "IP_PKTINFO"
615
+		if domain == syscall.AF_INET6 {
616
+			lvl = syscall.IPPROTO_IPV6
617
+			opt = syscall.IPV6_RECVPKTINFO
618
+			optName = "IPV6_RECVPKTINFO"
619
+		}
620
+		if err := syscall.SetsockoptInt(sd, lvl, opt, 1); err != nil {
621
+			return portBinding{}, fmt.Errorf("failed to setsockopt(%s) for %s: %w", optName, cfg, err)
622
+		}
623
+	}
608 624
 	if err := syscall.Bind(sd, sa); err != nil {
609 625
 		if cfg.HostPort == cfg.HostPortEnd {
610 626
 			return portBinding{}, fmt.Errorf("failed to bind host port for %s: %w", cfg, err)