package portallocator import ( "context" "fmt" "net" "net/netip" "os" "syscall" "github.com/containerd/log" "github.com/docker/docker/daemon/libnetwork/types" "github.com/ishidawataru/sctp" ) type OSAllocator struct { // allocator is used to logically reserve ports, to avoid those we know // are already in use. This is useful to ensure callers don't burn their // retry budget unnecessarily. allocator *PortAllocator } func NewOSAllocator() OSAllocator { return OSAllocator{ allocator: Get(), } } // RequestPortsInRange reserves a port available in the range [portStart, portEnd] // for all the specified addrs, and then try to bind those addresses to allocate // the port from the OS. It returns the allocated port, and all the sockets // bound, or an error if the reserved port isn't available. Callers must take // care of closing the returned sockets. // // Due to the semantic of SO_REUSEADDR, the OSAllocator can't fully determine // if a port is free when binding 0.0.0.0 or ::. If another socket is binding // the same port, but it's not listening to it yet, the bind will succeed but a // subsequent listen might fail. For this reason, RequestPortsInRange doesn't // retry on failure — it's caller's responsibility. // // It's safe for concurrent use. func (pa OSAllocator) RequestPortsInRange(addrs []net.IP, proto types.Protocol, portStart, portEnd int) (_ int, _ []*os.File, retErr error) { port, err := pa.allocator.RequestPortsInRange(addrs, proto.String(), portStart, portEnd) if err != nil { return 0, nil, err } defer func() { if retErr != nil { for _, addr := range addrs { pa.allocator.ReleasePort(addr, proto.String(), port) } } }() var boundSocks []*os.File defer func() { if retErr != nil { for i, sock := range boundSocks { if err := sock.Close(); err != nil { log.G(context.TODO()).WithFields(log.Fields{ "addr": addrs[i], "port": port, }).WithError(err).Warnf("failed to close socket during port allocation") } } } }() for _, addr := range addrs { addr, _ := netip.AddrFromSlice(addr) addrPort := netip.AddrPortFrom(addr.Unmap(), uint16(port)) var sock *os.File switch proto { case types.TCP: sock, err = bindTCPOrUDP(addrPort, syscall.SOCK_STREAM, syscall.IPPROTO_TCP) case types.UDP: sock, err = bindTCPOrUDP(addrPort, syscall.SOCK_DGRAM, syscall.IPPROTO_UDP) case types.SCTP: sock, err = bindSCTP(addrPort) default: return 0, nil, fmt.Errorf("protocol %s not supported", proto) } if err != nil { return 0, nil, err } boundSocks = append(boundSocks, sock) } return port, boundSocks, nil } // ReleasePorts releases a common port reserved for a list of addrs. It doesn't // close the sockets bound by [RequestPortsInRange]. This must be taken care of // independently by the caller. func (pa OSAllocator) ReleasePorts(addrs []net.IP, proto types.Protocol, port int) { for _, addr := range addrs { pa.allocator.ReleasePort(addr, proto.String(), port) } } func bindTCPOrUDP(addr netip.AddrPort, typ int, proto types.Protocol) (_ *os.File, retErr error) { var domain int var sa syscall.Sockaddr if addr.Addr().Unmap().Is4() { domain = syscall.AF_INET sa = &syscall.SockaddrInet4{Addr: addr.Addr().As4(), Port: int(addr.Port())} } else { domain = syscall.AF_INET6 sa = &syscall.SockaddrInet6{Addr: addr.Addr().Unmap().As16(), Port: int(addr.Port())} } sd, err := syscall.Socket(domain, typ|syscall.SOCK_CLOEXEC, int(proto)) if err != nil { return nil, fmt.Errorf("failed to create socket for %s/%s: %w", addr, proto, err) } defer func() { if retErr != nil { syscall.Close(sd) } }() if err := syscall.SetsockoptInt(sd, syscall.SOL_SOCKET, syscall.SO_REUSEADDR, 1); err != nil { return nil, fmt.Errorf("failed to setsockopt(SO_REUSEADDR) for %s/%s: %w", addr, proto, err) } if domain == syscall.AF_INET6 { syscall.SetsockoptInt(sd, syscall.IPPROTO_IPV6, syscall.IPV6_V6ONLY, 1) } if typ == syscall.SOCK_DGRAM { // Enable IP_PKTINFO for UDP sockets to get the destination address. // The destination address will be used as the source address when // sending back replies coming from the container. lvl := syscall.IPPROTO_IP opt := syscall.IP_PKTINFO optName := "IP_PKTINFO" if domain == syscall.AF_INET6 { lvl = syscall.IPPROTO_IPV6 opt = syscall.IPV6_RECVPKTINFO optName = "IPV6_RECVPKTINFO" } if err := syscall.SetsockoptInt(sd, lvl, opt, 1); err != nil { return nil, fmt.Errorf("failed to setsockopt(%s) for %s/%s: %w", optName, addr, proto, err) } } if err := syscall.Bind(sd, sa); err != nil { return nil, fmt.Errorf("failed to bind host port %s/%s: %w", addr, proto, err) } boundSocket := os.NewFile(uintptr(sd), "listener") if boundSocket == nil { return nil, fmt.Errorf("failed to convert socket to file for %s/%s", addr, proto) } return boundSocket, nil } // bindSCTP is based on sctp.ListenSCTP. The socket is created and bound, but // does not start listening. func bindSCTP(addr netip.AddrPort) (_ *os.File, retErr error) { domain := syscall.AF_INET if addr.Addr().Unmap().Is6() { domain = syscall.AF_INET6 } sd, err := syscall.Socket(domain, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, syscall.IPPROTO_SCTP) if err != nil { return nil, fmt.Errorf("failed to create socket for %s/sctp: %w", addr, err) } defer func() { if retErr != nil { syscall.Close(sd) } }() if domain == syscall.AF_INET6 { syscall.SetsockoptInt(sd, syscall.IPPROTO_IPV6, syscall.IPV6_V6ONLY, 1) } if errno := setSCTPInitMsg(sd, sctp.InitMsg{NumOstreams: sctp.SCTP_MAX_STREAM}); errno != 0 { return nil, errno } if err := sctp.SCTPBind(sd, &sctp.SCTPAddr{IPAddrs: []net.IPAddr{{IP: addr.Addr().Unmap().AsSlice()}}, Port: int(addr.Port())}, sctp.SCTP_BINDX_ADD_ADDR); err != nil { return nil, fmt.Errorf("failed to bind host port %s/sctp: %w", addr, err) } boundSocket := os.NewFile(uintptr(sd), "listener") if boundSocket == nil { return nil, fmt.Errorf("failed to convert socket %s/sctp", addr) } return boundSocket, nil }