Browse code

Add default sysctls to allow ping sockets and privileged ports with no capabilities

Currently the default capability CAP_NET_RAW allows users to open ICMP echo
sockets, and CAP_NET_BIND_SERVICE allows binding to ports under 1024.
Both of these are safe operations, and Linux now provides sysctls that
allow them, per container, without any capabilities
for non-root users. Enable these by default. Users can revert to the
previous behaviour by overriding the sysctl values explicitly.

Signed-off-by: Justin Cormack <justin.cormack@docker.com>

Justin Cormack authored on 2020/05/26 23:58:24
Showing 3 changed files
... ...
@@ -716,6 +716,14 @@ func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
716 716
 	}
717 717
 }
718 718
 
719
+// sysctlExists reports whether the sysctl named s has an entry under /proc/sys (each "." in s is
720
+// replaced by "/"); any os.Stat error counts as missing. runc will error if we add any that do not actually exist, so do not add the default ones if running on an old kernel.
721
+func sysctlExists(s string) bool {
722
+	f := filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1))
723
+	_, err := os.Stat(f)
724
+	return err == nil
725
+}
726
+
719 727
 // WithCommonOptions sets common docker options
720 728
 func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
721 729
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
... ...
@@ -768,6 +776,23 @@ func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
768 768
 		s.Hostname = c.Config.Hostname
769 769
 		setLinuxDomainname(c, s)
770 770
 
771
+		// Add default sysctls that are generally safe and useful; currently we
772
+		// grant the capabilities to allow these anyway. You can override if
773
+		// you want to restore the original behaviour.
774
+		// We do not set network sysctls if network namespace is host, or if we are
775
+		// joining an existing namespace, only if we create a new net namespace.
776
+		if c.HostConfig.NetworkMode.IsPrivate() {
777
+			// We cannot set up ping socket support in a user namespace
778
+			if !c.HostConfig.UsernsMode.IsPrivate() && sysctlExists("net.ipv4.ping_group_range") {
779
+				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
780
+				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
781
+			}
782
+			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
783
+			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
784
+				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
785
+			}
786
+		}
787
+
771 788
 		return nil
772 789
 	}
773 790
 }
... ...
@@ -114,7 +114,9 @@ func TestSysctlOverride(t *testing.T) {
114 114
 			Domainname: "baz.cyphar.com",
115 115
 		},
116 116
 		HostConfig: &containertypes.HostConfig{
117
-			Sysctls: map[string]string{},
117
+			NetworkMode: "bridge",
118
+			Sysctls:     map[string]string{},
119
+			UsernsMode:  "host",
118 120
 		},
119 121
 	}
120 122
 	d := setupFakeDaemon(t, c)
... ...
@@ -125,15 +127,51 @@ func TestSysctlOverride(t *testing.T) {
125 125
 	assert.NilError(t, err)
126 126
 	assert.Equal(t, s.Hostname, "foobar")
127 127
 	assert.Equal(t, s.Linux.Sysctl["kernel.domainname"], c.Config.Domainname)
128
+	if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
129
+		assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "0")
130
+	}
131
+	if sysctlExists("net.ipv4.ping_group_range") {
132
+		assert.Equal(t, s.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
133
+	}
128 134
 
129 135
 	// Set an explicit sysctl.
130 136
 	c.HostConfig.Sysctls["kernel.domainname"] = "foobar.net"
131 137
 	assert.Assert(t, c.HostConfig.Sysctls["kernel.domainname"] != c.Config.Domainname)
138
+	c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"] = "1024"
132 139
 
133 140
 	s, err = d.createSpec(c)
134 141
 	assert.NilError(t, err)
135 142
 	assert.Equal(t, s.Hostname, "foobar")
136 143
 	assert.Equal(t, s.Linux.Sysctl["kernel.domainname"], c.HostConfig.Sysctls["kernel.domainname"])
144
+	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"])
145
+}
146
+
147
+// TestSysctlOverrideHost ensures that the implicit network sysctls are not added to the spec
148
+// with host networking, and that an explicitly configured sysctl still appears in the spec.
149
+func TestSysctlOverrideHost(t *testing.T) {
150
+	c := &container.Container{
151
+		Config: &containertypes.Config{},
152
+		HostConfig: &containertypes.HostConfig{
153
+			NetworkMode: "host",
154
+			Sysctls:     map[string]string{},
155
+			UsernsMode:  "host",
156
+		},
157
+	}
158
+	d := setupFakeDaemon(t, c)
159
+	defer cleanupFakeContainer(c)
160
+
161
+	// Ensure that neither implicit sysctl is set: both lookups must compare equal to "".
162
+	s, err := d.createSpec(c)
163
+	assert.NilError(t, err)
164
+	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "")
165
+	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ping_group_range"], "")
166
+
167
+	// Set an explicit sysctl and check that the regenerated spec carries the same value.
168
+	c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"] = "1024"
169
+
170
+	s, err = d.createSpec(c)
171
+	assert.NilError(t, err)
172
+	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"])
137 173
 }
138 174
 
139 175
 func TestGetSourceMount(t *testing.T) {
... ...
@@ -1252,12 +1252,13 @@ func (s *DockerSuite) TestUserNoEffectiveCapabilitiesNetBindService(c *testing.T
1252 1252
 	// test that a root user has default capability CAP_NET_BIND_SERVICE
1253 1253
 	dockerCmd(c, "run", "syscall-test", "socket-test")
1254 1254
 	// test that non root user does not have default capability CAP_NET_BIND_SERVICE
1255
-	icmd.RunCommand(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "socket-test").Assert(c, icmd.Expected{
1255
+	// as we allow this via sysctl, also tweak the sysctl back to default
1256
+	icmd.RunCommand(dockerBinary, "run", "--user", "1000:1000", "--sysctl", "net.ipv4.ip_unprivileged_port_start=1024", "syscall-test", "socket-test").Assert(c, icmd.Expected{
1256 1257
 		ExitCode: 1,
1257 1258
 		Err:      "Permission denied",
1258 1259
 	})
1259 1260
 	// test that root user can drop default capability CAP_NET_BIND_SERVICE
1260
-	icmd.RunCommand(dockerBinary, "run", "--cap-drop", "net_bind_service", "syscall-test", "socket-test").Assert(c, icmd.Expected{
1261
+	icmd.RunCommand(dockerBinary, "run", "--cap-drop", "net_bind_service", "--sysctl", "net.ipv4.ip_unprivileged_port_start=1024", "syscall-test", "socket-test").Assert(c, icmd.Expected{
1261 1262
 		ExitCode: 1,
1262 1263
 		Err:      "Permission denied",
1263 1264
 	})