Browse code

Merge pull request #29554 from cpuguy83/keep_running_count_of_states

Use counter for tracking container states

Brian Goff authored on 2017/05/06 05:15:22
Showing 9 changed files
... ...
@@ -151,6 +151,7 @@ func (daemon *Daemon) create(params types.ContainerCreateConfig, managed bool) (
151 151
 		return nil, err
152 152
 	}
153 153
 	daemon.Register(container)
154
+	stateCtr.set(container.ID, "stopped")
154 155
 	daemon.LogContainerEvent(container, "create")
155 156
 	return container, nil
156 157
 }
... ...
@@ -198,6 +198,7 @@ func (daemon *Daemon) restore() error {
198 198
 			if err := backportMountSpec(c); err != nil {
199 199
 				logrus.Error("Failed to migrate old mounts to use new spec format")
200 200
 			}
201
+			daemon.setStateCounter(c)
201 202
 
202 203
 			if c.IsRunning() || c.IsPaused() {
203 204
 				c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking
... ...
@@ -124,6 +124,7 @@ func (daemon *Daemon) cleanupContainer(container *container.Container, forceRemo
124 124
 				logrus.Error(e)
125 125
 			}
126 126
 			daemon.LogContainerEvent(container, "destroy")
127
+			stateCtr.del(container.ID)
127 128
 		}
128 129
 	}()
129 130
 
... ...
@@ -4,14 +4,12 @@ import (
4 4
 	"fmt"
5 5
 	"os"
6 6
 	"runtime"
7
-	"sync/atomic"
8 7
 	"time"
9 8
 
10 9
 	"github.com/Sirupsen/logrus"
11 10
 	"github.com/docker/docker/api"
12 11
 	"github.com/docker/docker/api/types"
13 12
 	"github.com/docker/docker/cli/debug"
14
-	"github.com/docker/docker/container"
15 13
 	"github.com/docker/docker/daemon/logger"
16 14
 	"github.com/docker/docker/dockerversion"
17 15
 	"github.com/docker/docker/pkg/fileutils"
... ...
@@ -58,18 +56,7 @@ func (daemon *Daemon) SystemInfo() (*types.Info, error) {
58 58
 	}
59 59
 
60 60
 	sysInfo := sysinfo.New(true)
61
-
62
-	var cRunning, cPaused, cStopped int32
63
-	daemon.containers.ApplyAll(func(c *container.Container) {
64
-		switch c.StateString() {
65
-		case "paused":
66
-			atomic.AddInt32(&cPaused, 1)
67
-		case "running":
68
-			atomic.AddInt32(&cRunning, 1)
69
-		default:
70
-			atomic.AddInt32(&cStopped, 1)
71
-		}
72
-	})
61
+	cRunning, cPaused, cStopped := stateCtr.get()
73 62
 
74 63
 	securityOptions := []string{}
75 64
 	if sysInfo.AppArmor {
... ...
@@ -1,9 +1,15 @@
1 1
 package daemon
2 2
 
3
-import "github.com/docker/go-metrics"
3
+import (
4
+	"sync"
5
+
6
+	"github.com/docker/go-metrics"
7
+	"github.com/prometheus/client_golang/prometheus"
8
+)
4 9
 
5 10
 var (
6 11
 	containerActions          metrics.LabeledTimer
12
+	containerStates           metrics.LabeledGauge
7 13
 	imageActions              metrics.LabeledTimer
8 14
 	networkActions            metrics.LabeledTimer
9 15
 	engineVersion             metrics.LabeledGauge
... ...
@@ -11,6 +17,8 @@ var (
11 11
 	engineMemory              metrics.Gauge
12 12
 	healthChecksCounter       metrics.Counter
13 13
 	healthChecksFailedCounter metrics.Counter
14
+
15
+	stateCtr *stateCounter
14 16
 )
15 17
 
16 18
 func init() {
... ...
@@ -25,6 +33,7 @@ func init() {
25 25
 	} {
26 26
 		containerActions.WithValues(a).Update(0)
27 27
 	}
28
+
28 29
 	networkActions = ns.NewLabeledTimer("network_actions", "The number of seconds it takes to process each network action", "action")
29 30
 	engineVersion = ns.NewLabeledGauge("engine", "The version and commit information for the engine process", metrics.Unit("info"),
30 31
 		"version",
... ...
@@ -38,5 +47,60 @@ func init() {
38 38
 	healthChecksCounter = ns.NewCounter("health_checks", "The total number of health checks")
39 39
 	healthChecksFailedCounter = ns.NewCounter("health_checks_failed", "The total number of failed health checks")
40 40
 	imageActions = ns.NewLabeledTimer("image_actions", "The number of seconds it takes to process each image action", "action")
41
+
42
+	stateCtr = newStateCounter(ns.NewDesc("container_states", "The count of containers in various states", metrics.Unit("containers"), "state"))
43
+	ns.Add(stateCtr)
44
+
41 45
 	metrics.Register(ns)
42 46
 }
47
+
48
+type stateCounter struct {
49
+	mu     sync.Mutex
50
+	states map[string]string
51
+	desc   *prometheus.Desc
52
+}
53
+
54
+func newStateCounter(desc *prometheus.Desc) *stateCounter {
55
+	return &stateCounter{
56
+		states: make(map[string]string),
57
+		desc:   desc,
58
+	}
59
+}
60
+
61
+func (ctr *stateCounter) get() (running int, paused int, stopped int) {
62
+	ctr.mu.Lock()
63
+	defer ctr.mu.Unlock()
64
+
65
+	states := map[string]int{
66
+		"running": 0,
67
+		"paused":  0,
68
+		"stopped": 0,
69
+	}
70
+	for _, state := range ctr.states {
71
+		states[state]++
72
+	}
73
+	return states["running"], states["paused"], states["stopped"]
74
+}
75
+
76
+func (ctr *stateCounter) set(id, label string) {
77
+	ctr.mu.Lock()
78
+	ctr.states[id] = label
79
+	ctr.mu.Unlock()
80
+}
81
+
82
+func (ctr *stateCounter) del(id string) {
83
+	ctr.mu.Lock()
84
+	delete(ctr.states, id)
85
+	ctr.mu.Unlock()
86
+}
87
+
88
+func (ctr *stateCounter) Describe(ch chan<- *prometheus.Desc) {
89
+	ch <- ctr.desc
90
+}
91
+
92
+func (ctr *stateCounter) Collect(ch chan<- prometheus.Metric) {
93
+	running, paused, stopped := ctr.get()
94
+	ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(running), "running")
95
+	ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(paused), "paused")
96
+	ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(stopped), "stopped")
97
+}
... ...
@@ -9,10 +9,22 @@ import (
9 9
 
10 10
 	"github.com/Sirupsen/logrus"
11 11
 	"github.com/docker/docker/api/types"
12
+	"github.com/docker/docker/container"
12 13
 	"github.com/docker/docker/libcontainerd"
13 14
 	"github.com/docker/docker/restartmanager"
14 15
 )
15 16
 
17
+func (daemon *Daemon) setStateCounter(c *container.Container) {
18
+	switch c.StateString() {
19
+	case "paused":
20
+		stateCtr.set(c.ID, "paused")
21
+	case "running":
22
+		stateCtr.set(c.ID, "running")
23
+	default:
24
+		stateCtr.set(c.ID, "stopped")
25
+	}
26
+}
27
+
16 28
 // StateChanged updates daemon state changes from containerd
17 29
 func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
18 30
 	c := daemon.containers.Get(id)
... ...
@@ -81,6 +93,8 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
81 81
 			}()
82 82
 		}
83 83
 
84
+		daemon.setStateCounter(c)
85
+
84 86
 		defer c.Unlock()
85 87
 		if err := c.ToDisk(); err != nil {
86 88
 			return err
... ...
@@ -109,15 +123,19 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
109 109
 		c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart)
110 110
 		c.HasBeenManuallyStopped = false
111 111
 		c.HasBeenStartedBefore = true
112
+		daemon.setStateCounter(c)
113
+
112 114
 		if err := c.ToDisk(); err != nil {
113 115
 			c.Reset(false)
114 116
 			return err
115 117
 		}
116 118
 		daemon.initHealthMonitor(c)
119
+
117 120
 		daemon.LogContainerEvent(c, "start")
118 121
 	case libcontainerd.StatePause:
119 122
 		// Container is already locked in this case
120 123
 		c.Paused = true
124
+		daemon.setStateCounter(c)
121 125
 		if err := c.ToDisk(); err != nil {
122 126
 			return err
123 127
 		}
... ...
@@ -126,12 +144,12 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
126 126
 	case libcontainerd.StateResume:
127 127
 		// Container is already locked in this case
128 128
 		c.Paused = false
129
+		daemon.setStateCounter(c)
129 130
 		if err := c.ToDisk(); err != nil {
130 131
 			return err
131 132
 		}
132 133
 		daemon.updateHealthMonitor(c)
133 134
 		daemon.LogContainerEvent(c, "unpause")
134 135
 	}
135
-
136 136
 	return nil
137 137
 }
... ...
@@ -136,7 +136,7 @@ github.com/flynn-archive/go-shlex 3f9db97f856818214da2e1057f8ad84803971cff
136 136
 github.com/Nvveen/Gotty a8b993ba6abdb0e0c12b0125c603323a71c7790c https://github.com/ijc25/Gotty
137 137
 
138 138
 # metrics
139
-github.com/docker/go-metrics 86138d05f285fd9737a99bee2d9be30866b59d72
139
+github.com/docker/go-metrics 8fd5772bf1584597834c6f7961a530f06cbfbb87
140 140
 
141 141
 # composefile
142 142
 github.com/mitchellh/mapstructure f3009df150dadf309fdee4a54ed65c124afad715
... ...
@@ -2,10 +2,67 @@
2 2
 
3 3
 This package is a small wrapper around the Prometheus Go client to help enforce convention and best practices for metrics collection in Docker projects.
4 4
 
5
-## Status
5
+## Best Practices
6 6
 
7
-This project is a work in progress.
8
-It is under heavy development and not intended to be used.
7
+This package is meant to be used for collecting metrics in Docker projects.
8
+It is not meant to be used as a replacement for the prometheus client but to help enforce consistent naming across metrics collected.
9
+If you have not already read the Prometheus best practices on naming and labels, you can find them [here](https://prometheus.io/docs/practices/naming/).
10
+
11
+The following are a few Docker specific rules that will help you name and work with metrics in your project.
12
+
13
+1. Namespace and Subsystem
14
+
15
+This package provides you with a namespace type that allows you to specify the same namespace and subsystem for your metrics.
16
+
17
+```go
18
+ns := metrics.NewNamespace("engine", "daemon", metrics.Labels{
19
+        "version": dockerversion.Version,
20
+        "commit":  dockerversion.GitCommit,
21
+})
22
+```
23
+
24
+In the example above we are creating metrics for the Docker engine's daemon package.
25
+In this example `engine` is the namespace and `daemon` is the subsystem, i.e. the package in which we are collecting the metrics.
26
+
27
+A namespace also allows you to attach constant labels, such as the git commit and version, to every metric created from it.
28
+
29
+2. Declaring your Metrics
30
+
31
+Try to keep all your metric declarations in one file.
32
+This makes it easy for others to see what constant labels are defined on the namespace and what labels are defined on the metrics when they are created.
33
+
34
+3. Use labels instead of multiple metrics
35
+
36
+Labels allow you to define one metric such as the time it takes to perform a certain action on an object.
37
+If we wanted to collect timings on various container actions such as create, start, and delete then we can define one metric called `container_actions` and use labels to specify the type of action.
38
+
39
+
40
+```go
41
+containerActions = ns.NewLabeledTimer("container_actions", "The number of milliseconds it takes to process each container action", "action")
42
+```
43
+
44
+The last parameter is the label name or key.
45
+When adding a data point to the metric you will use the `WithValues` function to specify the `action` that you are collecting for.
46
+
47
+```go
48
+containerActions.WithValues("create").UpdateSince(start)
49
+```
50
+
51
+4. Always use a unit
52
+
53
+The metric name should describe what you are measuring but you also need to provide the unit that it is being measured with.
54
+For a timer, the standard unit is seconds and a counter's standard unit is a total.
55
+For gauges you must provide the unit.
56
+This package provides a standard set of units for use within the Docker projects.
57
+
58
+```go
59
+Nanoseconds Unit = "nanoseconds"
60
+Seconds     Unit = "seconds"
61
+Bytes       Unit = "bytes"
62
+Total       Unit = "total"
63
+```
64
+
65
+If you need a unit that is not defined in the package, please open a PR to add it, but first check whether one of the existing units will work for your metric, e.g. use seconds or nanoseconds rather than adding milliseconds.
9 66
 
10 67
 ## Docs
11 68
 
... ...
@@ -40,21 +40,25 @@ type Namespace struct {
40 40
 //  Only metrics created with the returned namespace will get the new constant
41 41
 //  labels.  The returned namespace must be registered separately.
42 42
 func (n *Namespace) WithConstLabels(labels Labels) *Namespace {
43
-	ns := *n
44
-	ns.metrics = nil // blank this out
45
-	ns.labels = mergeLabels(ns.labels, labels)
46
-	return &ns
43
+	n.mu.Lock()
44
+	ns := &Namespace{
45
+		name:      n.name,
46
+		subsystem: n.subsystem,
47
+		labels:    mergeLabels(n.labels, labels),
48
+	}
49
+	n.mu.Unlock()
50
+	return ns
47 51
 }
48 52
 
49 53
 func (n *Namespace) NewCounter(name, help string) Counter {
50 54
 	c := &counter{pc: prometheus.NewCounter(n.newCounterOpts(name, help))}
51
-	n.addMetric(c)
55
+	n.Add(c)
52 56
 	return c
53 57
 }
54 58
 
55 59
 func (n *Namespace) NewLabeledCounter(name, help string, labels ...string) LabeledCounter {
56 60
 	c := &labeledCounter{pc: prometheus.NewCounterVec(n.newCounterOpts(name, help), labels)}
57
-	n.addMetric(c)
61
+	n.Add(c)
58 62
 	return c
59 63
 }
60 64
 
... ...
@@ -72,7 +76,7 @@ func (n *Namespace) NewTimer(name, help string) Timer {
72 72
 	t := &timer{
73 73
 		m: prometheus.NewHistogram(n.newTimerOpts(name, help)),
74 74
 	}
75
-	n.addMetric(t)
75
+	n.Add(t)
76 76
 	return t
77 77
 }
78 78
 
... ...
@@ -80,7 +84,7 @@ func (n *Namespace) NewLabeledTimer(name, help string, labels ...string) Labeled
80 80
 	t := &labeledTimer{
81 81
 		m: prometheus.NewHistogramVec(n.newTimerOpts(name, help), labels),
82 82
 	}
83
-	n.addMetric(t)
83
+	n.Add(t)
84 84
 	return t
85 85
 }
86 86
 
... ...
@@ -98,7 +102,7 @@ func (n *Namespace) NewGauge(name, help string, unit Unit) Gauge {
98 98
 	g := &gauge{
99 99
 		pg: prometheus.NewGauge(n.newGaugeOpts(name, help, unit)),
100 100
 	}
101
-	n.addMetric(g)
101
+	n.Add(g)
102 102
 	return g
103 103
 }
104 104
 
... ...
@@ -106,7 +110,7 @@ func (n *Namespace) NewLabeledGauge(name, help string, unit Unit, labels ...stri
106 106
 	g := &labeledGauge{
107 107
 		pg: prometheus.NewGaugeVec(n.newGaugeOpts(name, help, unit), labels),
108 108
 	}
109
-	n.addMetric(g)
109
+	n.Add(g)
110 110
 	return g
111 111
 }
112 112
 
... ...
@@ -138,12 +142,24 @@ func (n *Namespace) Collect(ch chan<- prometheus.Metric) {
138 138
 	}
139 139
 }
140 140
 
141
-func (n *Namespace) addMetric(collector prometheus.Collector) {
141
+func (n *Namespace) Add(collector prometheus.Collector) {
142 142
 	n.mu.Lock()
143 143
 	n.metrics = append(n.metrics, collector)
144 144
 	n.mu.Unlock()
145 145
 }
146 146
 
147
+func (n *Namespace) NewDesc(name, help string, unit Unit, labels ...string) *prometheus.Desc {
148
+	if string(unit) != "" {
149
+		name = fmt.Sprintf("%s_%s", name, unit)
150
+	}
151
+	namespace := n.name
152
+	if n.subsystem != "" {
153
+		namespace = fmt.Sprintf("%s_%s", namespace, n.subsystem)
154
+	}
155
+	name = fmt.Sprintf("%s_%s", namespace, name)
156
+	return prometheus.NewDesc(name, help, labels, prometheus.Labels(n.labels))
157
+}
158
+
147 159
 // mergeLabels merges two or more labels objects into a single map, favoring
148 160
 // the later labels.
149 161
 func mergeLabels(lbs ...Labels) Labels {