Use counter for tracking container states
Brian Goff authored on 2017/05/06 05:15:22... | ... |
@@ -151,6 +151,7 @@ func (daemon *Daemon) create(params types.ContainerCreateConfig, managed bool) ( |
151 | 151 |
return nil, err |
152 | 152 |
} |
153 | 153 |
daemon.Register(container) |
154 |
+ stateCtr.set(container.ID, "stopped") |
|
154 | 155 |
daemon.LogContainerEvent(container, "create") |
155 | 156 |
return container, nil |
156 | 157 |
} |
... | ... |
@@ -198,6 +198,7 @@ func (daemon *Daemon) restore() error { |
198 | 198 |
if err := backportMountSpec(c); err != nil { |
199 | 199 |
logrus.Error("Failed to migrate old mounts to use new spec format") |
200 | 200 |
} |
201 |
+ daemon.setStateCounter(c) |
|
201 | 202 |
|
202 | 203 |
if c.IsRunning() || c.IsPaused() { |
203 | 204 |
c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking |
... | ... |
@@ -4,14 +4,12 @@ import ( |
4 | 4 |
"fmt" |
5 | 5 |
"os" |
6 | 6 |
"runtime" |
7 |
- "sync/atomic" |
|
8 | 7 |
"time" |
9 | 8 |
|
10 | 9 |
"github.com/Sirupsen/logrus" |
11 | 10 |
"github.com/docker/docker/api" |
12 | 11 |
"github.com/docker/docker/api/types" |
13 | 12 |
"github.com/docker/docker/cli/debug" |
14 |
- "github.com/docker/docker/container" |
|
15 | 13 |
"github.com/docker/docker/daemon/logger" |
16 | 14 |
"github.com/docker/docker/dockerversion" |
17 | 15 |
"github.com/docker/docker/pkg/fileutils" |
... | ... |
@@ -58,18 +56,7 @@ func (daemon *Daemon) SystemInfo() (*types.Info, error) { |
58 | 58 |
} |
59 | 59 |
|
60 | 60 |
sysInfo := sysinfo.New(true) |
61 |
- |
|
62 |
- var cRunning, cPaused, cStopped int32 |
|
63 |
- daemon.containers.ApplyAll(func(c *container.Container) { |
|
64 |
- switch c.StateString() { |
|
65 |
- case "paused": |
|
66 |
- atomic.AddInt32(&cPaused, 1) |
|
67 |
- case "running": |
|
68 |
- atomic.AddInt32(&cRunning, 1) |
|
69 |
- default: |
|
70 |
- atomic.AddInt32(&cStopped, 1) |
|
71 |
- } |
|
72 |
- }) |
|
61 |
+ cRunning, cPaused, cStopped := stateCtr.get() |
|
73 | 62 |
|
74 | 63 |
securityOptions := []string{} |
75 | 64 |
if sysInfo.AppArmor { |
... | ... |
@@ -1,9 +1,15 @@ |
1 | 1 |
package daemon |
2 | 2 |
|
3 |
-import "github.com/docker/go-metrics" |
|
3 |
+import ( |
|
4 |
+ "sync" |
|
5 |
+ |
|
6 |
+ "github.com/docker/go-metrics" |
|
7 |
+ "github.com/prometheus/client_golang/prometheus" |
|
8 |
+) |
|
4 | 9 |
|
5 | 10 |
var ( |
6 | 11 |
containerActions metrics.LabeledTimer |
12 |
+ containerStates metrics.LabeledGauge |
|
7 | 13 |
imageActions metrics.LabeledTimer |
8 | 14 |
networkActions metrics.LabeledTimer |
9 | 15 |
engineVersion metrics.LabeledGauge |
... | ... |
@@ -11,6 +17,8 @@ var ( |
11 | 11 |
engineMemory metrics.Gauge |
12 | 12 |
healthChecksCounter metrics.Counter |
13 | 13 |
healthChecksFailedCounter metrics.Counter |
14 |
+ |
|
15 |
+ stateCtr *stateCounter |
|
14 | 16 |
) |
15 | 17 |
|
16 | 18 |
func init() { |
... | ... |
@@ -25,6 +33,7 @@ func init() { |
25 | 25 |
} { |
26 | 26 |
containerActions.WithValues(a).Update(0) |
27 | 27 |
} |
28 |
+ |
|
28 | 29 |
networkActions = ns.NewLabeledTimer("network_actions", "The number of seconds it takes to process each network action", "action") |
29 | 30 |
engineVersion = ns.NewLabeledGauge("engine", "The version and commit information for the engine process", metrics.Unit("info"), |
30 | 31 |
"version", |
... | ... |
@@ -38,5 +47,60 @@ func init() { |
38 | 38 |
healthChecksCounter = ns.NewCounter("health_checks", "The total number of health checks") |
39 | 39 |
healthChecksFailedCounter = ns.NewCounter("health_checks_failed", "The total number of failed health checks") |
40 | 40 |
imageActions = ns.NewLabeledTimer("image_actions", "The number of seconds it takes to process each image action", "action") |
41 |
+ |
|
42 |
+ stateCtr = newStateCounter(ns.NewDesc("container_states", "The count of containers in various states", metrics.Unit("containers"), "state")) |
|
43 |
+ ns.Add(stateCtr) |
|
44 |
+ |
|
41 | 45 |
metrics.Register(ns) |
42 | 46 |
} |
47 |
+ |
|
48 |
+type stateCounter struct { |
|
49 |
+ mu sync.Mutex |
|
50 |
+ states map[string]string |
|
51 |
+ desc *prometheus.Desc |
|
52 |
+} |
|
53 |
+ |
|
54 |
+func newStateCounter(desc *prometheus.Desc) *stateCounter { |
|
55 |
+ return &stateCounter{ |
|
56 |
+ states: make(map[string]string), |
|
57 |
+ desc: desc, |
|
58 |
+ } |
|
59 |
+} |
|
60 |
+ |
|
61 |
+func (ctr *stateCounter) get() (running int, paused int, stopped int) { |
|
62 |
+ ctr.mu.Lock() |
|
63 |
+ defer ctr.mu.Unlock() |
|
64 |
+ |
|
65 |
+ states := map[string]int{ |
|
66 |
+ "running": 0, |
|
67 |
+ "paused": 0, |
|
68 |
+ "stopped": 0, |
|
69 |
+ } |
|
70 |
+ for _, state := range ctr.states { |
|
71 |
+ states[state]++ |
|
72 |
+ } |
|
73 |
+ return states["running"], states["paused"], states["stopped"] |
|
74 |
+} |
|
75 |
+ |
|
76 |
+func (ctr *stateCounter) set(id, label string) { |
|
77 |
+ ctr.mu.Lock() |
|
78 |
+ ctr.states[id] = label |
|
79 |
+ ctr.mu.Unlock() |
|
80 |
+} |
|
81 |
+ |
|
82 |
+func (ctr *stateCounter) del(id string) { |
|
83 |
+ ctr.mu.Lock() |
|
84 |
+ delete(ctr.states, id) |
|
85 |
+ ctr.mu.Unlock() |
|
86 |
+} |
|
87 |
+ |
|
88 |
+func (ctr *stateCounter) Describe(ch chan<- *prometheus.Desc) { |
|
89 |
+ ch <- ctr.desc |
|
90 |
+} |
|
91 |
+ |
|
92 |
+func (ctr *stateCounter) Collect(ch chan<- prometheus.Metric) { |
|
93 |
+ running, paused, stopped := ctr.get() |
|
94 |
+ ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(running), "running") |
|
95 |
+ ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(paused), "paused") |
|
96 |
+ ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(stopped), "stopped") |
|
97 |
+} |
... | ... |
@@ -9,10 +9,22 @@ import ( |
9 | 9 |
|
10 | 10 |
"github.com/Sirupsen/logrus" |
11 | 11 |
"github.com/docker/docker/api/types" |
12 |
+ "github.com/docker/docker/container" |
|
12 | 13 |
"github.com/docker/docker/libcontainerd" |
13 | 14 |
"github.com/docker/docker/restartmanager" |
14 | 15 |
) |
15 | 16 |
|
17 |
+func (daemon *Daemon) setStateCounter(c *container.Container) { |
|
18 |
+ switch c.StateString() { |
|
19 |
+ case "paused": |
|
20 |
+ stateCtr.set(c.ID, "paused") |
|
21 |
+ case "running": |
|
22 |
+ stateCtr.set(c.ID, "running") |
|
23 |
+ default: |
|
24 |
+ stateCtr.set(c.ID, "stopped") |
|
25 |
+ } |
|
26 |
+} |
|
27 |
+ |
|
16 | 28 |
// StateChanged updates daemon state changes from containerd |
17 | 29 |
func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { |
18 | 30 |
c := daemon.containers.Get(id) |
... | ... |
@@ -81,6 +93,8 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { |
81 | 81 |
}() |
82 | 82 |
} |
83 | 83 |
|
84 |
+ daemon.setStateCounter(c) |
|
85 |
+ |
|
84 | 86 |
defer c.Unlock() |
85 | 87 |
if err := c.ToDisk(); err != nil { |
86 | 88 |
return err |
... | ... |
@@ -109,15 +123,19 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { |
109 | 109 |
c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart) |
110 | 110 |
c.HasBeenManuallyStopped = false |
111 | 111 |
c.HasBeenStartedBefore = true |
112 |
+ daemon.setStateCounter(c) |
|
113 |
+ |
|
112 | 114 |
if err := c.ToDisk(); err != nil { |
113 | 115 |
c.Reset(false) |
114 | 116 |
return err |
115 | 117 |
} |
116 | 118 |
daemon.initHealthMonitor(c) |
119 |
+ |
|
117 | 120 |
daemon.LogContainerEvent(c, "start") |
118 | 121 |
case libcontainerd.StatePause: |
119 | 122 |
// Container is already locked in this case |
120 | 123 |
c.Paused = true |
124 |
+ daemon.setStateCounter(c) |
|
121 | 125 |
if err := c.ToDisk(); err != nil { |
122 | 126 |
return err |
123 | 127 |
} |
... | ... |
@@ -126,12 +144,12 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { |
126 | 126 |
case libcontainerd.StateResume: |
127 | 127 |
// Container is already locked in this case |
128 | 128 |
c.Paused = false |
129 |
+ daemon.setStateCounter(c) |
|
129 | 130 |
if err := c.ToDisk(); err != nil { |
130 | 131 |
return err |
131 | 132 |
} |
132 | 133 |
daemon.updateHealthMonitor(c) |
133 | 134 |
daemon.LogContainerEvent(c, "unpause") |
134 | 135 |
} |
135 |
- |
|
136 | 136 |
return nil |
137 | 137 |
} |
... | ... |
@@ -136,7 +136,7 @@ github.com/flynn-archive/go-shlex 3f9db97f856818214da2e1057f8ad84803971cff |
136 | 136 |
github.com/Nvveen/Gotty a8b993ba6abdb0e0c12b0125c603323a71c7790c https://github.com/ijc25/Gotty |
137 | 137 |
|
138 | 138 |
# metrics |
139 |
-github.com/docker/go-metrics 86138d05f285fd9737a99bee2d9be30866b59d72 |
|
139 |
+github.com/docker/go-metrics 8fd5772bf1584597834c6f7961a530f06cbfbb87 |
|
140 | 140 |
|
141 | 141 |
# composefile |
142 | 142 |
github.com/mitchellh/mapstructure f3009df150dadf309fdee4a54ed65c124afad715 |
... | ... |
@@ -2,10 +2,67 @@ |
2 | 2 |
|
3 | 3 |
This package is a small wrapper around the prometheus go client to help enforce convention and best practices for metrics collection in Docker projects. |
4 | 4 |
|
5 |
-## Status |
|
5 |
+## Best Practices |
|
6 | 6 |
|
7 |
-This project is a work in progress. |
|
8 |
-It is under heavy development and not intended to be used. |
|
7 |
+This package is meant to be used for collecting metrics in Docker projects. |
|
8 |
+It is not meant to be used as a replacement for the prometheus client but to help enforce consistent naming across metrics collected. |
|
9 |
+If you have not already read the prometheus best practices around naming and labels you can read the page [here](https://prometheus.io/docs/practices/naming/). |
|
10 |
+ |
|
11 |
+The following are a few Docker specific rules that will help you name and work with metrics in your project. |
|
12 |
+ |
|
13 |
+1. Namespace and Subsystem |
|
14 |
+ |
|
15 |
+This package provides you with a namespace type that allows you to specify the same namespace and subsystem for your metrics. |
|
16 |
+ |
|
17 |
+```go |
|
18 |
+ns := metrics.NewNamespace("engine", "daemon", metrics.Labels{ |
|
19 |
+ "version": dockerversion.Version, |
|
20 |
+ "commit": dockerversion.GitCommit, |
|
21 |
+}) |
|
22 |
+``` |
|
23 |
+ |
|
24 |
+In the example above we are creating metrics for the Docker engine's daemon package. |
|
25 |
+In this example `engine` is the namespace, while `daemon` is the subsystem, i.e. the package in which we are collecting the metrics. |
|
26 |
+ |
|
27 |
+A namespace also allows you to attach constant labels to the metrics such as the git commit and version that it is collecting. |
|
28 |
+ |
|
29 |
+2. Declaring your Metrics |
|
30 |
+ |
|
31 |
+Try to keep all your metric declarations in one file. |
|
32 |
+This makes it easy for others to see what constant labels are defined on the namespace and what labels are defined on the metrics when they are created. |
|
33 |
+ |
|
34 |
+3. Use labels instead of multiple metrics |
|
35 |
+ |
|
36 |
+Labels allow you to define one metric such as the time it takes to perform a certain action on an object. |
|
37 |
+If we wanted to collect timings on various container actions such as create, start, and delete then we can define one metric called `container_actions` and use labels to specify the type of action. |
|
38 |
+ |
|
39 |
+ |
|
40 |
+```go |
|
41 |
+containerActions = ns.NewLabeledTimer("container_actions", "The number of milliseconds it takes to process each container action", "action") |
|
42 |
+``` |
|
43 |
+ |
|
44 |
+The last parameter is the label name or key. |
|
45 |
+When adding a data point to the metric you will use the `WithValues` function to specify the `action` that you are collecting for. |
|
46 |
+ |
|
47 |
+```go |
|
48 |
+containerActions.WithValues("create").UpdateSince(start) |
|
49 |
+``` |
|
50 |
+ |
|
51 |
+4. Always use a unit |
|
52 |
+ |
|
53 |
+The metric name should describe what you are measuring but you also need to provide the unit that it is being measured with. |
|
54 |
+For a timer, the standard unit is seconds and a counter's standard unit is a total. |
|
55 |
+For gauges you must provide the unit. |
|
56 |
+This package provides a standard set of units for use within the Docker projects. |
|
57 |
+ |
|
58 |
+```go |
|
59 |
+Nanoseconds Unit = "nanoseconds" |
|
60 |
+Seconds Unit = "seconds" |
|
61 |
+Bytes Unit = "bytes" |
|
62 |
+Total Unit = "total" |
|
63 |
+``` |
|
64 |
+ |
|
65 |
+If you need a unit that is not defined in the package, please open a PR to add it, but first check whether one of the existing units will work for your metric, e.g. using seconds or nanoseconds instead of adding milliseconds. |
|
9 | 66 |
|
10 | 67 |
## Docs |
11 | 68 |
|
... | ... |
@@ -40,21 +40,25 @@ type Namespace struct { |
40 | 40 |
// Only metrics created with the returned namespace will get the new constant |
41 | 41 |
// labels. The returned namespace must be registered separately. |
42 | 42 |
func (n *Namespace) WithConstLabels(labels Labels) *Namespace { |
43 |
- ns := *n |
|
44 |
- ns.metrics = nil // blank this out |
|
45 |
- ns.labels = mergeLabels(ns.labels, labels) |
|
46 |
- return &ns |
|
43 |
+ n.mu.Lock() |
|
44 |
+ ns := &Namespace{ |
|
45 |
+ name: n.name, |
|
46 |
+ subsystem: n.subsystem, |
|
47 |
+ labels: mergeLabels(n.labels, labels), |
|
48 |
+ } |
|
49 |
+ n.mu.Unlock() |
|
50 |
+ return ns |
|
47 | 51 |
} |
48 | 52 |
|
49 | 53 |
func (n *Namespace) NewCounter(name, help string) Counter { |
50 | 54 |
c := &counter{pc: prometheus.NewCounter(n.newCounterOpts(name, help))} |
51 |
- n.addMetric(c) |
|
55 |
+ n.Add(c) |
|
52 | 56 |
return c |
53 | 57 |
} |
54 | 58 |
|
55 | 59 |
func (n *Namespace) NewLabeledCounter(name, help string, labels ...string) LabeledCounter { |
56 | 60 |
c := &labeledCounter{pc: prometheus.NewCounterVec(n.newCounterOpts(name, help), labels)} |
57 |
- n.addMetric(c) |
|
61 |
+ n.Add(c) |
|
58 | 62 |
return c |
59 | 63 |
} |
60 | 64 |
|
... | ... |
@@ -72,7 +76,7 @@ func (n *Namespace) NewTimer(name, help string) Timer { |
72 | 72 |
t := &timer{ |
73 | 73 |
m: prometheus.NewHistogram(n.newTimerOpts(name, help)), |
74 | 74 |
} |
75 |
- n.addMetric(t) |
|
75 |
+ n.Add(t) |
|
76 | 76 |
return t |
77 | 77 |
} |
78 | 78 |
|
... | ... |
@@ -80,7 +84,7 @@ func (n *Namespace) NewLabeledTimer(name, help string, labels ...string) Labeled |
80 | 80 |
t := &labeledTimer{ |
81 | 81 |
m: prometheus.NewHistogramVec(n.newTimerOpts(name, help), labels), |
82 | 82 |
} |
83 |
- n.addMetric(t) |
|
83 |
+ n.Add(t) |
|
84 | 84 |
return t |
85 | 85 |
} |
86 | 86 |
|
... | ... |
@@ -98,7 +102,7 @@ func (n *Namespace) NewGauge(name, help string, unit Unit) Gauge { |
98 | 98 |
g := &gauge{ |
99 | 99 |
pg: prometheus.NewGauge(n.newGaugeOpts(name, help, unit)), |
100 | 100 |
} |
101 |
- n.addMetric(g) |
|
101 |
+ n.Add(g) |
|
102 | 102 |
return g |
103 | 103 |
} |
104 | 104 |
|
... | ... |
@@ -106,7 +110,7 @@ func (n *Namespace) NewLabeledGauge(name, help string, unit Unit, labels ...stri |
106 | 106 |
g := &labeledGauge{ |
107 | 107 |
pg: prometheus.NewGaugeVec(n.newGaugeOpts(name, help, unit), labels), |
108 | 108 |
} |
109 |
- n.addMetric(g) |
|
109 |
+ n.Add(g) |
|
110 | 110 |
return g |
111 | 111 |
} |
112 | 112 |
|
... | ... |
@@ -138,12 +142,24 @@ func (n *Namespace) Collect(ch chan<- prometheus.Metric) { |
138 | 138 |
} |
139 | 139 |
} |
140 | 140 |
|
141 |
-func (n *Namespace) addMetric(collector prometheus.Collector) { |
|
141 |
+func (n *Namespace) Add(collector prometheus.Collector) { |
|
142 | 142 |
n.mu.Lock() |
143 | 143 |
n.metrics = append(n.metrics, collector) |
144 | 144 |
n.mu.Unlock() |
145 | 145 |
} |
146 | 146 |
|
147 |
+func (n *Namespace) NewDesc(name, help string, unit Unit, labels ...string) *prometheus.Desc { |
|
148 |
+ if string(unit) != "" { |
|
149 |
+ name = fmt.Sprintf("%s_%s", name, unit) |
|
150 |
+ } |
|
151 |
+ namespace := n.name |
|
152 |
+ if n.subsystem != "" { |
|
153 |
+ namespace = fmt.Sprintf("%s_%s", namespace, n.subsystem) |
|
154 |
+ } |
|
155 |
+ name = fmt.Sprintf("%s_%s", namespace, name) |
|
156 |
+ return prometheus.NewDesc(name, help, labels, prometheus.Labels(n.labels)) |
|
157 |
+} |
|
158 |
+ |
|
147 | 159 |
// mergeLabels merges two or more labels objects into a single map, favoring |
148 | 160 |
// the later labels. |
149 | 161 |
func mergeLabels(lbs ...Labels) Labels { |