
container: protect the health status with mutex

Adds a mutex to protect the health status, as well. When running the unit
tests with the race detector, we can see that the Status field is written
without holding any lock. Guarding reads and writes of the status with a
mutex addresses the issue.

Signed-off-by: Stephen J Day <stephen.day@docker.com>

Stephen J Day authored on 2017/11/16 12:28:36
Showing 4 changed files
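The hunks below add locked accessors but never show the struct body itself. A
minimal sketch of what the type plausibly looks like after this commit,
inferred from the accesses in the diff (s.mu, s.stop, s.Health.Status); the
exact field layout and import path are assumptions, not part of the commit:

	package container

	import (
		"sync"

		"github.com/docker/docker/api/types" // assumed import path
	)

	// Health tracks a container's health-check state.
	type Health struct {
		types.Health // embedded API type carrying Status, FailingStreak, Log

		stop chan struct{} // monitor channel, see Open/CloseMonitorChannel
		mu   sync.Mutex    // guards reads and writes of types.Health.Status
	}

One consequence of this shape: the new Status method on *Health shadows the
Status field promoted from the embedded types.Health (the method wins selector
resolution at depth zero), which is why the direct field accesses in the diff
change from s.Status to the explicit s.Health.Status.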
... ...
@@ -16,19 +16,42 @@ type Health struct {
 
 // String returns a human-readable description of the health-check state
 func (s *Health) String() string {
-	// This happens when the monitor has yet to be setup.
-	if s.Status == "" {
-		return types.Unhealthy
-	}
+	status := s.Status()
 
-	switch s.Status {
+	switch status {
 	case types.Starting:
 		return "health: starting"
 	default: // Healthy and Unhealthy are clear on their own
-		return s.Status
+		return s.Health.Status
 	}
 }
 
+// Status returns the current health status.
+//
+// Note that this takes a lock and the value may change after being read.
+func (s *Health) Status() string {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// This happens when the monitor has yet to be setup.
+	if s.Health.Status == "" {
+		return types.Unhealthy
+	}
+
+	return s.Health.Status
+}
+
+// SetStatus writes the current status to the underlying health structure,
+// obeying the locking semantics.
+//
+// Status may be set directly if another lock is used.
+func (s *Health) SetStatus(new string) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.Health.Status = new
+}
+
 // OpenMonitorChannel creates and returns a new monitor channel. If there
 // already is one, it returns nil.
 func (s *Health) OpenMonitorChannel() chan struct{} {
... ...
@@ -53,7 +76,7 @@ func (s *Health) CloseMonitorChannel() {
 		close(s.stop)
 		s.stop = nil
 		// unhealthy when the monitor has stopped for compatibility reasons
-		s.Status = types.Unhealthy
+		s.Health.Status = types.Unhealthy
 		logrus.Debug("CloseMonitorChannel done")
 	}
 }
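The commit message refers to the race detector; below is a minimal,
hypothetical test (not one of the four files changed here) showing the kind
of concurrent access that go test -race would flag before this change. The
package choice and import path are assumptions:

	package container

	import (
		"testing"

		"github.com/docker/docker/api/types" // assumed import path
	)

	func TestHealthStatusConcurrentAccess(t *testing.T) {
		h := &Health{}
		done := make(chan struct{})

		// Writer: flips the status from a background goroutine.
		go func() {
			defer close(done)
			for i := 0; i < 1000; i++ {
				h.SetStatus(types.Healthy) // locked write
			}
		}()

		// Reader: concurrently reads the status on the test goroutine.
		// Before this commit both sides touched h.Status directly and
		// raced; now both go through the mutex.
		for i := 0; i < 1000; i++ {
			_ = h.Status() // locked read
		}
		<-done
	}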
... ...
@@ -129,7 +129,7 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch
 	}
 
 	h := c.State.Health
-	oldStatus := h.Status
+	oldStatus := h.Status()
 
 	if len(h.Log) >= maxLogEntries {
 		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
... ...
@@ -139,14 +139,14 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch
 
 	if result.ExitCode == exitStatusHealthy {
 		h.FailingStreak = 0
-		h.Status = types.Healthy
+		h.SetStatus(types.Healthy)
 	} else { // Failure (including invalid exit code)
 		shouldIncrementStreak := true
 
 		// If the container is starting (i.e. we never had a successful health check)
 		// then we check if we are within the start period of the container in which
 		// case we do not increment the failure streak.
-		if h.Status == types.Starting {
+		if h.Status() == types.Starting {
 			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
 			timeSinceStart := result.Start.Sub(c.State.StartedAt)
 
... ...
@@ -160,7 +160,7 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch
 			h.FailingStreak++
 
 			if h.FailingStreak >= retries {
-				h.Status = types.Unhealthy
+				h.SetStatus(types.Unhealthy)
 			}
 		}
 		// Else we're starting or healthy. Stay in that state.
... ...
@@ -173,8 +173,9 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch
 		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
 	}
 
-	if oldStatus != h.Status {
-		d.LogContainerEvent(c, "health_status: "+h.Status)
+	current := h.Status()
+	if oldStatus != current {
+		d.LogContainerEvent(c, "health_status: "+current)
 	}
 }
 
... ...
@@ -293,11 +294,11 @@ func (d *Daemon) initHealthMonitor(c *container.Container) {
 	d.stopHealthchecks(c)
 
 	if h := c.State.Health; h != nil {
-		h.Status = types.Starting
+		h.SetStatus(types.Starting)
 		h.FailingStreak = 0
 	} else {
 		h := &container.Health{}
-		h.Status = types.Starting
+		h.SetStatus(types.Starting)
 		c.State.Health = h
 	}
 
... ...
@@ -14,7 +14,7 @@ import (
 func reset(c *container.Container) {
 	c.State = &container.State{}
 	c.State.Health = &container.Health{}
-	c.State.Health.Status = types.Starting
+	c.State.Health.SetStatus(types.Starting)
 }
 
 func TestNoneHealthcheck(t *testing.T) {
... ...
@@ -111,8 +111,8 @@ func TestHealthStates(t *testing.T) {
 
 	handleResult(c.State.StartedAt.Add(20*time.Second), 1)
 	handleResult(c.State.StartedAt.Add(40*time.Second), 1)
-	if c.State.Health.Status != types.Starting {
-		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	if status := c.State.Health.Status(); status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", status)
 	}
 	if c.State.Health.FailingStreak != 2 {
 		t.Errorf("Expecting FailingStreak=2, but got %d\n", c.State.Health.FailingStreak)
... ...
@@ -133,15 +133,15 @@ func TestHealthStates(t *testing.T) {
 	c.Config.Healthcheck.StartPeriod = 30 * time.Second
 
 	handleResult(c.State.StartedAt.Add(20*time.Second), 1)
-	if c.State.Health.Status != types.Starting {
-		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	if status := c.State.Health.Status(); status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", status)
 	}
 	if c.State.Health.FailingStreak != 0 {
 		t.Errorf("Expecting FailingStreak=0, but got %d\n", c.State.Health.FailingStreak)
 	}
 	handleResult(c.State.StartedAt.Add(50*time.Second), 1)
-	if c.State.Health.Status != types.Starting {
-		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	if status := c.State.Health.Status(); status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", status)
	}
 	if c.State.Health.FailingStreak != 1 {
 		t.Errorf("Expecting FailingStreak=1, but got %d\n", c.State.Health.FailingStreak)
... ...
@@ -139,7 +139,7 @@ func (daemon *Daemon) getInspectData(container *container.Container) (*types.Con
 	var containerHealth *types.Health
 	if container.State.Health != nil {
 		containerHealth = &types.Health{
-			Status:        container.State.Health.Status,
+			Status:        container.State.Health.Status(),
 			FailingStreak: container.State.Health.FailingStreak,
 			Log:           append([]*types.HealthcheckResult{}, container.State.Health.Log...),
 		}
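
To check the fix the way the commit message describes, the affected packages
can be run under the race detector; the package paths here are assumptions
based on the files shown above:

	go test -race ./container ./daemon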