Browse code

extend health check to start service

Signed-off-by: runshenzhu <runshen.zhu@gmail.com>
Signed-off-by: Runshen Zhu <runshen.zhu@gmail.com>
(cherry picked from commit a99db84b4a966f0f09e81c446e857323a2a3302c)
Signed-off-by: Tibor Vass <tibor@docker.com>

runshenzhu authored on 2016/07/13 02:56:01
Showing 2 changed files
... ...
@@ -139,7 +139,58 @@ func (r *controller) Start(ctx context.Context) error {
139 139
 		return errors.Wrap(err, "starting container failed")
140 140
 	}
141 141
 
142
-	return nil
142
+	// no health check
143
+	if ctnr.Config == nil || ctnr.Config.Healthcheck == nil {
144
+		return nil
145
+	}
146
+
147
+	healthCmd := ctnr.Config.Healthcheck.Test
148
+
149
+	if len(healthCmd) == 0 || healthCmd[0] == "NONE" {
150
+		return nil
151
+	}
152
+
153
+	// wait for container to be healthy
154
+	eventq := r.adapter.events(ctx)
155
+
156
+	var healthErr error
157
+	for {
158
+		select {
159
+		case event := <-eventq:
160
+			if !r.matchevent(event) {
161
+				continue
162
+			}
163
+
164
+			switch event.Action {
165
+			case "die": // exit on terminal events
166
+				ctnr, err := r.adapter.inspect(ctx)
167
+				if err != nil {
168
+					return errors.Wrap(err, "die event received")
169
+				} else if ctnr.State.ExitCode != 0 {
170
+					return &exitError{code: ctnr.State.ExitCode, cause: healthErr}
171
+				}
172
+
173
+				return nil
174
+			case "destroy":
175
+				// If we get here, something has gone wrong but we want to exit
176
+				// and report anyway.
177
+				return ErrContainerDestroyed
178
+			case "health_status: unhealthy":
179
+				// in this case, we stop the container and report unhealthy status
180
+				if err := r.Shutdown(ctx); err != nil {
181
+					return errors.Wrap(err, "unhealthy container shutdown failed")
182
+				}
183
+				// set health check error, and wait for container to fully exit ("die" event)
184
+				healthErr = ErrContainerUnhealthy
185
+			case "health_status: healthy":
186
+				return nil
187
+			}
188
+		case <-ctx.Done():
189
+			return ctx.Err()
190
+		case <-r.closed:
191
+			return r.err
192
+		}
193
+	}
143 194
 }
144 195
 
145 196
 // Wait on the container to exit.
146 197
new file mode 100644
... ...
@@ -0,0 +1,191 @@
0
+// +build !windows
1
+
2
+package main
3
+
4
+import (
5
+	"strconv"
6
+	"strings"
7
+
8
+	"github.com/docker/docker/daemon/cluster/executor/container"
9
+	"github.com/docker/docker/pkg/integration/checker"
10
+	"github.com/docker/engine-api/types/swarm"
11
+	"github.com/go-check/check"
12
+)
13
+
14
+// start a service, and then make its task unhealthy while it is running
15
+// finally, the unhealthy task should be detected and killed
16
+func (s *DockerSwarmSuite) TestServiceHealthRun(c *check.C) {
17
+	testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows
18
+
19
+	d := s.AddDaemon(c, true, true)
20
+
21
+	// build image with health-check
22
+	// note: build with `daemon.buildImageWithOut`; do not use `buildImage`
23
+	imageName := "testhealth"
24
+	_, _, err := d.buildImageWithOut(imageName,
25
+		`FROM busybox
26
+		RUN touch /status
27
+		HEALTHCHECK --interval=1s --timeout=1s --retries=1\
28
+		  CMD cat /status`,
29
+		true)
30
+	c.Check(err, check.IsNil)
31
+
32
+	serviceName := "healthServiceRun"
33
+	out, err := d.Cmd("service", "create", "--name", serviceName, imageName, "top")
34
+	c.Assert(err, checker.IsNil, check.Commentf(out))
35
+	id := strings.TrimSpace(out)
36
+
37
+	var tasks []swarm.Task
38
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
39
+		tasks = d.getServiceTasks(c, id)
40
+		return tasks, nil
41
+	}, checker.HasLen, 1)
42
+
43
+	task := tasks[0]
44
+
45
+	// wait for task to reach the starting state
46
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
47
+		task = d.getTask(c, task.ID)
48
+		return task.Status.State, nil
49
+	}, checker.Equals, swarm.TaskStateStarting)
50
+	containerID := task.Status.ContainerStatus.ContainerID
51
+
52
+	// wait for container to be healthy
53
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
54
+		out, _ := d.Cmd("inspect", "--format={{.State.Health.Status}}", containerID)
55
+		return strings.TrimSpace(out), nil
56
+	}, checker.Equals, "healthy")
57
+
58
+	// make it fail
59
+	d.Cmd("exec", containerID, "rm", "/status")
60
+	// wait for container to be unhealthy
61
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
62
+		out, _ := d.Cmd("inspect", "--format={{.State.Health.Status}}", containerID)
63
+		return strings.TrimSpace(out), nil
64
+	}, checker.Equals, "unhealthy")
65
+
66
+	// Task should be terminated
67
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
68
+		task = d.getTask(c, task.ID)
69
+		return task.Status.State, nil
70
+	}, checker.Equals, swarm.TaskStateFailed)
71
+
72
+	if !strings.Contains(task.Status.Err, container.ErrContainerUnhealthy.Error()) {
73
+		c.Fatal("unhealthy task exits because of other error")
74
+	}
75
+}
76
+
77
+// start a service whose task is unhealthy at the beginning
78
+// its task should stay blocked in the starting stage until the health check passes
79
+func (s *DockerSwarmSuite) TestServiceHealthStart(c *check.C) {
80
+	testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows
81
+
82
+	d := s.AddDaemon(c, true, true)
83
+
84
+	// service started from this image won't pass health check
85
+	imageName := "testhealth"
86
+	_, _, err := d.buildImageWithOut(imageName,
87
+		`FROM busybox
88
+		HEALTHCHECK --interval=1s --timeout=1s --retries=1024\
89
+		  CMD cat /status`,
90
+		true)
91
+	c.Check(err, check.IsNil)
92
+
93
+	serviceName := "healthServiceStart"
94
+	out, err := d.Cmd("service", "create", "--name", serviceName, imageName, "top")
95
+	c.Assert(err, checker.IsNil, check.Commentf(out))
96
+	id := strings.TrimSpace(out)
97
+
98
+	var tasks []swarm.Task
99
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
100
+		tasks = d.getServiceTasks(c, id)
101
+		return tasks, nil
102
+	}, checker.HasLen, 1)
103
+
104
+	task := tasks[0]
105
+
106
+	// wait for task to reach the starting state
107
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
108
+		task = d.getTask(c, task.ID)
109
+		return task.Status.State, nil
110
+	}, checker.Equals, swarm.TaskStateStarting)
111
+
112
+	containerID := task.Status.ContainerStatus.ContainerID
113
+
114
+	// wait for the health check to run and fail at least once
115
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
116
+		out, _ := d.Cmd("inspect", "--format={{.State.Health.FailingStreak}}", containerID)
117
+		failingStreak, _ := strconv.Atoi(strings.TrimSpace(out))
118
+		return failingStreak, nil
119
+	}, checker.GreaterThan, 0)
120
+
121
+	// task should be blocked at starting status
122
+	task = d.getTask(c, task.ID)
123
+	c.Assert(task.Status.State, check.Equals, swarm.TaskStateStarting)
124
+
125
+	// make it healthy
126
+	d.Cmd("exec", containerID, "touch", "/status")
127
+
128
+	// Task should be at running status
129
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
130
+		task = d.getTask(c, task.ID)
131
+		return task.Status.State, nil
132
+	}, checker.Equals, swarm.TaskStateRunning)
133
+}
134
+
135
+// start a service whose task is unhealthy at the beginning
136
+// its task should stay blocked in the starting stage until the health check passes
137
+func (s *DockerSwarmSuite) TestServiceHealthUpdate(c *check.C) {
138
+	testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows
139
+
140
+	d := s.AddDaemon(c, true, true)
141
+
142
+	// service started from this image won't pass health check
143
+	imageName := "testhealth"
144
+	_, _, err := d.buildImageWithOut(imageName,
145
+		`FROM busybox
146
+		HEALTHCHECK --interval=1s --timeout=1s --retries=1024\
147
+		  CMD cat /status`,
148
+		true)
149
+	c.Check(err, check.IsNil)
150
+
151
+	serviceName := "healthServiceStart"
152
+	out, err := d.Cmd("service", "create", "--name", serviceName, imageName, "top")
153
+	c.Assert(err, checker.IsNil, check.Commentf(out))
154
+	id := strings.TrimSpace(out)
155
+
156
+	var tasks []swarm.Task
157
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
158
+		tasks = d.getServiceTasks(c, id)
159
+		return tasks, nil
160
+	}, checker.HasLen, 1)
161
+
162
+	task := tasks[0]
163
+
164
+	// wait for task to reach the starting state
165
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
166
+		task = d.getTask(c, task.ID)
167
+		return task.Status.State, nil
168
+	}, checker.Equals, swarm.TaskStateStarting)
169
+
170
+	containerID := task.Status.ContainerStatus.ContainerID
171
+
172
+	// wait for the health check to run and fail at least once
173
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
174
+		out, _ := d.Cmd("inspect", "--format={{.State.Health.FailingStreak}}", containerID)
175
+		failingStreak, _ := strconv.Atoi(strings.TrimSpace(out))
176
+		return failingStreak, nil
177
+	}, checker.GreaterThan, 0)
178
+
179
+	// task should be blocked at starting status
180
+	task = d.getTask(c, task.ID)
181
+	c.Assert(task.Status.State, check.Equals, swarm.TaskStateStarting)
182
+
183
+	// make it healthy
184
+	d.Cmd("exec", containerID, "touch", "/status")
185
+	// Task should be at running status
186
+	waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
187
+		task = d.getTask(c, task.ID)
188
+		return task.Status.State, nil
189
+	}, checker.Equals, swarm.TaskStateRunning)
190
+}