Browse code

daemon: Maintain container exec-inspect invariant

We have integration tests which assert the invariant that a
GET /containers/{id}/json response lists only IDs of execs which are in
the Running state, according to GET /exec/{id}/json. The invariant could
be violated if those requests were to race the handling of the exec's
task-exit event. The coarse-grained locking of the container ExecStore
when starting an exec task was accidentally synchronizing
(*Daemon).ProcessEvent and (*Daemon).ContainerExecInspect to it just
enough to make it improbable for the integration tests to catch the
invariant violation on execs which exit immediately. Removing the
unnecessary locking made the underlying race condition more likely for
the tests to hit.

Maintain the invariant by deleting the exec from its container's
ExecCommands before clearing its Running flag. Additionally, fix other
potential data races with execs by ensuring that the ExecConfig lock is
held whenever a mutable field is read from or written to.

Signed-off-by: Cory Snider <csnider@mirantis.com>

Cory Snider authored on 2022/08/25 08:35:07
Showing 4 changed files
... ...
@@ -183,6 +183,7 @@ func (daemon *Daemon) ContainerExecStart(ctx context.Context, name string, optio
183 183
 	defer func() {
184 184
 		if err != nil {
185 185
 			ec.Lock()
186
+			ec.Container.ExecCommands.Delete(ec.ID)
186 187
 			ec.Running = false
187 188
 			exitCode := 126
188 189
 			ec.ExitCode = &exitCode
... ...
@@ -190,7 +191,6 @@ func (daemon *Daemon) ContainerExecStart(ctx context.Context, name string, optio
190 190
 				logrus.Errorf("failed to cleanup exec %s streams: %s", ec.Container.ID, err)
191 191
 			}
192 192
 			ec.Unlock()
193
-			ec.Container.ExecCommands.Delete(ec.ID)
194 193
 		}
195 194
 	}()
196 195
 
... ...
@@ -287,7 +287,7 @@ func (daemon *Daemon) ContainerExecStart(ctx context.Context, name string, optio
287 287
 	// close the chan to notify readiness
288 288
 	close(ec.Started)
289 289
 	if err != nil {
290
-		ec.Unlock()
290
+		defer ec.Unlock()
291 291
 		return translateContainerdStartErr(ec.Entrypoint, ec.SetExitCode, err)
292 292
 	}
293 293
 	ec.Unlock()
... ...
@@ -149,14 +149,23 @@ func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container
149 149
 	if err != nil {
150 150
 		return nil, err
151 151
 	}
152
-	if info.ExitCode == nil {
153
-		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
152
+	exitCode, err := func() (int, error) {
153
+		info.Lock()
154
+		defer info.Unlock()
155
+		if info.ExitCode == nil {
156
+			info.Unlock()
157
+			return 0, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
158
+		}
159
+		return *info.ExitCode, nil
160
+	}()
161
+	if err != nil {
162
+		return nil, err
154 163
 	}
155 164
 	// Note: Go's json package will handle invalid UTF-8 for us
156 165
 	out := output.String()
157 166
 	return &types.HealthcheckResult{
158 167
 		End:      time.Now(),
159
-		ExitCode: *info.ExitCode,
168
+		ExitCode: exitCode,
160 169
 		Output:   out,
161 170
 	}, nil
162 171
 }
... ...
@@ -218,6 +218,8 @@ func (daemon *Daemon) ContainerExecInspect(id string) (*backend.ExecInspect, err
218 218
 		return nil, errExecNotFound(id)
219 219
 	}
220 220
 
221
+	e.Lock()
222
+	defer e.Unlock()
221 223
 	pc := inspectExecProcessConfig(e)
222 224
 	var pid int
223 225
 	if e.Process != nil {
... ...
@@ -163,6 +163,13 @@ func (daemon *Daemon) ProcessEvent(id string, e libcontainerdtypes.EventType, ei
163 163
 			ec := int(ei.ExitCode)
164 164
 			execConfig.Lock()
165 165
 			defer execConfig.Unlock()
166
+
167
+			// Remove the exec command from the container's store only and not the
168
+			// daemon's store so that the exec command can be inspected. Remove it
169
+			// before mutating execConfig to maintain the invariant that
170
+			// c.ExecCommands only contain execs in the Running state.
171
+			c.ExecCommands.Delete(execConfig.ID)
172
+
166 173
 			execConfig.ExitCode = &ec
167 174
 			execConfig.Running = false
168 175
 
... ...
@@ -174,10 +181,6 @@ func (daemon *Daemon) ProcessEvent(id string, e libcontainerdtypes.EventType, ei
174 174
 				logrus.Errorf("failed to cleanup exec %s streams: %s", c.ID, err)
175 175
 			}
176 176
 
177
-			// remove the exec command from the container's store only and not the
178
-			// daemon's store so that the exec command can be inspected.
179
-			c.ExecCommands.Delete(execConfig.ID)
180
-
181 177
 			exitCode = ec
182 178
 
183 179
 			go func() {