Use counter for tracking container states
| ... | ... |
@@ -151,6 +151,7 @@ func (daemon *Daemon) create(params types.ContainerCreateConfig, managed bool) ( |
| 151 | 151 |
return nil, err |
| 152 | 152 |
} |
| 153 | 153 |
daemon.Register(container) |
| 154 |
+ stateCtr.set(container.ID, "stopped") |
|
| 154 | 155 |
daemon.LogContainerEvent(container, "create") |
| 155 | 156 |
return container, nil |
| 156 | 157 |
} |
| ... | ... |
@@ -198,6 +198,7 @@ func (daemon *Daemon) restore() error {
|
| 198 | 198 |
if err := backportMountSpec(c); err != nil {
|
| 199 | 199 |
logrus.Error("Failed to migrate old mounts to use new spec format")
|
| 200 | 200 |
} |
| 201 |
+ daemon.setStateCounter(c) |
|
| 201 | 202 |
|
| 202 | 203 |
if c.IsRunning() || c.IsPaused() {
|
| 203 | 204 |
c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking |
| ... | ... |
@@ -4,14 +4,12 @@ import ( |
| 4 | 4 |
"fmt" |
| 5 | 5 |
"os" |
| 6 | 6 |
"runtime" |
| 7 |
- "sync/atomic" |
|
| 8 | 7 |
"time" |
| 9 | 8 |
|
| 10 | 9 |
"github.com/Sirupsen/logrus" |
| 11 | 10 |
"github.com/docker/docker/api" |
| 12 | 11 |
"github.com/docker/docker/api/types" |
| 13 | 12 |
"github.com/docker/docker/cli/debug" |
| 14 |
- "github.com/docker/docker/container" |
|
| 15 | 13 |
"github.com/docker/docker/daemon/logger" |
| 16 | 14 |
"github.com/docker/docker/dockerversion" |
| 17 | 15 |
"github.com/docker/docker/pkg/fileutils" |
| ... | ... |
@@ -58,18 +56,7 @@ func (daemon *Daemon) SystemInfo() (*types.Info, error) {
|
| 58 | 58 |
} |
| 59 | 59 |
|
| 60 | 60 |
sysInfo := sysinfo.New(true) |
| 61 |
- |
|
| 62 |
- var cRunning, cPaused, cStopped int32 |
|
| 63 |
- daemon.containers.ApplyAll(func(c *container.Container) {
|
|
| 64 |
- switch c.StateString() {
|
|
| 65 |
- case "paused": |
|
| 66 |
- atomic.AddInt32(&cPaused, 1) |
|
| 67 |
- case "running": |
|
| 68 |
- atomic.AddInt32(&cRunning, 1) |
|
| 69 |
- default: |
|
| 70 |
- atomic.AddInt32(&cStopped, 1) |
|
| 71 |
- } |
|
| 72 |
- }) |
|
| 61 |
+ cRunning, cPaused, cStopped := stateCtr.get() |
|
| 73 | 62 |
|
| 74 | 63 |
securityOptions := []string{}
|
| 75 | 64 |
if sysInfo.AppArmor {
|
| ... | ... |
@@ -1,9 +1,15 @@ |
| 1 | 1 |
package daemon |
| 2 | 2 |
|
| 3 |
-import "github.com/docker/go-metrics" |
|
| 3 |
+import ( |
|
| 4 |
+ "sync" |
|
| 5 |
+ |
|
| 6 |
+ "github.com/docker/go-metrics" |
|
| 7 |
+ "github.com/prometheus/client_golang/prometheus" |
|
| 8 |
+) |
|
| 4 | 9 |
|
| 5 | 10 |
var ( |
| 6 | 11 |
containerActions metrics.LabeledTimer |
| 12 |
+ containerStates metrics.LabeledGauge |
|
| 7 | 13 |
imageActions metrics.LabeledTimer |
| 8 | 14 |
networkActions metrics.LabeledTimer |
| 9 | 15 |
engineVersion metrics.LabeledGauge |
| ... | ... |
@@ -11,6 +17,8 @@ var ( |
| 11 | 11 |
engineMemory metrics.Gauge |
| 12 | 12 |
healthChecksCounter metrics.Counter |
| 13 | 13 |
healthChecksFailedCounter metrics.Counter |
| 14 |
+ |
|
| 15 |
+ stateCtr *stateCounter |
|
| 14 | 16 |
) |
| 15 | 17 |
|
| 16 | 18 |
func init() {
|
| ... | ... |
@@ -25,6 +33,7 @@ func init() {
|
| 25 | 25 |
} {
|
| 26 | 26 |
containerActions.WithValues(a).Update(0) |
| 27 | 27 |
} |
| 28 |
+ |
|
| 28 | 29 |
networkActions = ns.NewLabeledTimer("network_actions", "The number of seconds it takes to process each network action", "action")
|
| 29 | 30 |
engineVersion = ns.NewLabeledGauge("engine", "The version and commit information for the engine process", metrics.Unit("info"),
|
| 30 | 31 |
"version", |
| ... | ... |
@@ -38,5 +47,60 @@ func init() {
|
| 38 | 38 |
healthChecksCounter = ns.NewCounter("health_checks", "The total number of health checks")
|
| 39 | 39 |
healthChecksFailedCounter = ns.NewCounter("health_checks_failed", "The total number of failed health checks")
|
| 40 | 40 |
imageActions = ns.NewLabeledTimer("image_actions", "The number of seconds it takes to process each image action", "action")
|
| 41 |
+ |
|
| 42 |
+ stateCtr = newStateCounter(ns.NewDesc("container_states", "The count of containers in various states", metrics.Unit("containers"), "state"))
|
|
| 43 |
+ ns.Add(stateCtr) |
|
| 44 |
+ |
|
| 41 | 45 |
metrics.Register(ns) |
| 42 | 46 |
} |
| 47 |
+ |
|
| 48 |
+type stateCounter struct {
|
|
| 49 |
+ mu sync.Mutex |
|
| 50 |
+ states map[string]string |
|
| 51 |
+ desc *prometheus.Desc |
|
| 52 |
+} |
|
| 53 |
+ |
|
| 54 |
+func newStateCounter(desc *prometheus.Desc) *stateCounter {
|
|
| 55 |
+ return &stateCounter{
|
|
| 56 |
+ states: make(map[string]string), |
|
| 57 |
+ desc: desc, |
|
| 58 |
+ } |
|
| 59 |
+} |
|
| 60 |
+ |
|
| 61 |
+func (ctr *stateCounter) get() (running int, paused int, stopped int) {
|
|
| 62 |
+ ctr.mu.Lock() |
|
| 63 |
+ defer ctr.mu.Unlock() |
|
| 64 |
+ |
|
| 65 |
+ states := map[string]int{
|
|
| 66 |
+ "running": 0, |
|
| 67 |
+ "paused": 0, |
|
| 68 |
+ "stopped": 0, |
|
| 69 |
+ } |
|
| 70 |
+ for _, state := range ctr.states {
|
|
| 71 |
+ states[state]++ |
|
| 72 |
+ } |
|
| 73 |
+ return states["running"], states["paused"], states["stopped"] |
|
| 74 |
+} |
|
| 75 |
+ |
|
| 76 |
+func (ctr *stateCounter) set(id, label string) {
|
|
| 77 |
+ ctr.mu.Lock() |
|
| 78 |
+ ctr.states[id] = label |
|
| 79 |
+ ctr.mu.Unlock() |
|
| 80 |
+} |
|
| 81 |
+ |
|
| 82 |
+func (ctr *stateCounter) del(id string) {
|
|
| 83 |
+ ctr.mu.Lock() |
|
| 84 |
+ delete(ctr.states, id) |
|
| 85 |
+ ctr.mu.Unlock() |
|
| 86 |
+} |
|
| 87 |
+ |
|
| 88 |
+func (ctr *stateCounter) Describe(ch chan<- *prometheus.Desc) {
|
|
| 89 |
+ ch <- ctr.desc |
|
| 90 |
+} |
|
| 91 |
+ |
|
| 92 |
+func (ctr *stateCounter) Collect(ch chan<- prometheus.Metric) {
|
|
| 93 |
+ running, paused, stopped := ctr.get() |
|
| 94 |
+ ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(running), "running") |
|
| 95 |
+ ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(paused), "paused") |
|
| 96 |
+ ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(stopped), "stopped") |
|
| 97 |
+} |
| ... | ... |
@@ -9,10 +9,22 @@ import ( |
| 9 | 9 |
|
| 10 | 10 |
"github.com/Sirupsen/logrus" |
| 11 | 11 |
"github.com/docker/docker/api/types" |
| 12 |
+ "github.com/docker/docker/container" |
|
| 12 | 13 |
"github.com/docker/docker/libcontainerd" |
| 13 | 14 |
"github.com/docker/docker/restartmanager" |
| 14 | 15 |
) |
| 15 | 16 |
|
| 17 |
+func (daemon *Daemon) setStateCounter(c *container.Container) {
|
|
| 18 |
+ switch c.StateString() {
|
|
| 19 |
+ case "paused": |
|
| 20 |
+ stateCtr.set(c.ID, "paused") |
|
| 21 |
+ case "running": |
|
| 22 |
+ stateCtr.set(c.ID, "running") |
|
| 23 |
+ default: |
|
| 24 |
+ stateCtr.set(c.ID, "stopped") |
|
| 25 |
+ } |
|
| 26 |
+} |
|
| 27 |
+ |
|
| 16 | 28 |
// StateChanged updates daemon state changes from containerd |
| 17 | 29 |
func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
|
| 18 | 30 |
c := daemon.containers.Get(id) |
| ... | ... |
@@ -81,6 +93,8 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
|
| 81 | 81 |
}() |
| 82 | 82 |
} |
| 83 | 83 |
|
| 84 |
+ daemon.setStateCounter(c) |
|
| 85 |
+ |
|
| 84 | 86 |
defer c.Unlock() |
| 85 | 87 |
if err := c.ToDisk(); err != nil {
|
| 86 | 88 |
return err |
| ... | ... |
@@ -109,15 +123,19 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
|
| 109 | 109 |
c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart) |
| 110 | 110 |
c.HasBeenManuallyStopped = false |
| 111 | 111 |
c.HasBeenStartedBefore = true |
| 112 |
+ daemon.setStateCounter(c) |
|
| 113 |
+ |
|
| 112 | 114 |
if err := c.ToDisk(); err != nil {
|
| 113 | 115 |
c.Reset(false) |
| 114 | 116 |
return err |
| 115 | 117 |
} |
| 116 | 118 |
daemon.initHealthMonitor(c) |
| 119 |
+ |
|
| 117 | 120 |
daemon.LogContainerEvent(c, "start") |
| 118 | 121 |
case libcontainerd.StatePause: |
| 119 | 122 |
// Container is already locked in this case |
| 120 | 123 |
c.Paused = true |
| 124 |
+ daemon.setStateCounter(c) |
|
| 121 | 125 |
if err := c.ToDisk(); err != nil {
|
| 122 | 126 |
return err |
| 123 | 127 |
} |
| ... | ... |
@@ -126,12 +144,12 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
|
| 126 | 126 |
case libcontainerd.StateResume: |
| 127 | 127 |
// Container is already locked in this case |
| 128 | 128 |
c.Paused = false |
| 129 |
+ daemon.setStateCounter(c) |
|
| 129 | 130 |
if err := c.ToDisk(); err != nil {
|
| 130 | 131 |
return err |
| 131 | 132 |
} |
| 132 | 133 |
daemon.updateHealthMonitor(c) |
| 133 | 134 |
daemon.LogContainerEvent(c, "unpause") |
| 134 | 135 |
} |
| 135 |
- |
|
| 136 | 136 |
return nil |
| 137 | 137 |
} |
| ... | ... |
@@ -136,7 +136,7 @@ github.com/flynn-archive/go-shlex 3f9db97f856818214da2e1057f8ad84803971cff |
| 136 | 136 |
github.com/Nvveen/Gotty a8b993ba6abdb0e0c12b0125c603323a71c7790c https://github.com/ijc25/Gotty |
| 137 | 137 |
|
| 138 | 138 |
# metrics |
| 139 |
-github.com/docker/go-metrics 86138d05f285fd9737a99bee2d9be30866b59d72 |
|
| 139 |
+github.com/docker/go-metrics 8fd5772bf1584597834c6f7961a530f06cbfbb87 |
|
| 140 | 140 |
|
| 141 | 141 |
# composefile |
| 142 | 142 |
github.com/mitchellh/mapstructure f3009df150dadf309fdee4a54ed65c124afad715 |
| ... | ... |
@@ -2,10 +2,67 @@ |
| 2 | 2 |
|
| 3 | 3 |
This package is a small wrapper around the Prometheus Go client to help enforce convention and best practices for metrics collection in Docker projects. |
| 4 | 4 |
|
| 5 |
-## Status |
|
| 5 |
+## Best Practices |
|
| 6 | 6 |
|
| 7 |
-This project is a work in progress. |
|
| 8 |
-It is under heavy development and not intended to be used. |
|
| 7 |
+This package is meant to be used for collecting metrics in Docker projects. |
|
| 8 |
+It is not meant to be used as a replacement for the prometheus client but to help enforce consistent naming across metrics collected. |
|
| 9 |
+If you have not already read the prometheus best practices around naming and labels you can read the page [here](https://prometheus.io/docs/practices/naming/). |
|
| 10 |
+ |
|
| 11 |
+The following are a few Docker specific rules that will help you name and work with metrics in your project. |
|
| 12 |
+ |
|
| 13 |
+1. Namespace and Subsystem |
|
| 14 |
+ |
|
| 15 |
+This package provides you with a namespace type that allows you to specify the same namespace and subsystem for your metrics. |
|
| 16 |
+ |
|
| 17 |
+```go |
|
| 18 |
+ns := metrics.NewNamespace("engine", "daemon", metrics.Labels{
|
|
| 19 |
+ "version": dockerversion.Version, |
|
| 20 |
+ "commit": dockerversion.GitCommit, |
|
| 21 |
+}) |
|
| 22 |
+``` |
|
| 23 |
+ |
|
| 24 |
+In the example above we are creating metrics for the Docker engine's daemon package. |
|
| 25 |
+In this example `engine` is the namespace, while `daemon` is the subsystem or package where we are collecting the metrics. |
|
| 26 |
+ |
|
| 27 |
+A namespace also allows you to attach constant labels to the metrics such as the git commit and version that it is collecting. |
|
| 28 |
+ |
|
| 29 |
+2. Declaring your Metrics |
|
| 30 |
+ |
|
| 31 |
+Try to keep all your metric declarations in one file. |
|
| 32 |
+This makes it easy for others to see what constant labels are defined on the namespace and what labels are defined on the metrics when they are created. |
|
| 33 |
+ |
|
| 34 |
+3. Use labels instead of multiple metrics |
|
| 35 |
+ |
|
| 36 |
+Labels allow you to define one metric such as the time it takes to perform a certain action on an object. |
|
| 37 |
+If we wanted to collect timings on various container actions such as create, start, and delete then we can define one metric called `container_actions` and use labels to specify the type of action. |
|
| 38 |
+ |
|
| 39 |
+ |
|
| 40 |
+```go |
|
| 41 |
+containerActions = ns.NewLabeledTimer("container_actions", "The number of seconds it takes to process each container action", "action")
|
|
| 42 |
+``` |
|
| 43 |
+ |
|
| 44 |
+The last parameter is the label name or key. |
|
| 45 |
+When adding a data point to the metric you will use the `WithValues` function to specify the `action` that you are collecting for. |
|
| 46 |
+ |
|
| 47 |
+```go |
|
| 48 |
+containerActions.WithValues("create").UpdateSince(start)
|
|
| 49 |
+``` |
|
| 50 |
+ |
|
| 51 |
+4. Always use a unit |
|
| 52 |
+ |
|
| 53 |
+The metric name should describe what you are measuring but you also need to provide the unit that it is being measured with. |
|
| 54 |
+For a timer, the standard unit is seconds and a counter's standard unit is a total. |
|
| 55 |
+For gauges you must provide the unit. |
|
| 56 |
+This package provides a standard set of units for use within the Docker projects. |
|
| 57 |
+ |
|
| 58 |
+```go |
|
| 59 |
+Nanoseconds Unit = "nanoseconds" |
|
| 60 |
+Seconds Unit = "seconds" |
|
| 61 |
+Bytes Unit = "bytes" |
|
| 62 |
+Total Unit = "total" |
|
| 63 |
+``` |
|
| 64 |
+ |
|
| 65 |
+If you need a unit that is not defined in the package, please open a PR to add it, but first check whether one of the existing units will work for your metric, e.g. use seconds or nanoseconds instead of adding milliseconds. |
|
| 9 | 66 |
|
| 10 | 67 |
## Docs |
| 11 | 68 |
|
| ... | ... |
@@ -40,21 +40,25 @@ type Namespace struct {
|
| 40 | 40 |
// Only metrics created with the returned namespace will get the new constant |
| 41 | 41 |
// labels. The returned namespace must be registered separately. |
| 42 | 42 |
func (n *Namespace) WithConstLabels(labels Labels) *Namespace {
|
| 43 |
- ns := *n |
|
| 44 |
- ns.metrics = nil // blank this out |
|
| 45 |
- ns.labels = mergeLabels(ns.labels, labels) |
|
| 46 |
- return &ns |
|
| 43 |
+ n.mu.Lock() |
|
| 44 |
+ ns := &Namespace{
|
|
| 45 |
+ name: n.name, |
|
| 46 |
+ subsystem: n.subsystem, |
|
| 47 |
+ labels: mergeLabels(n.labels, labels), |
|
| 48 |
+ } |
|
| 49 |
+ n.mu.Unlock() |
|
| 50 |
+ return ns |
|
| 47 | 51 |
} |
| 48 | 52 |
|
| 49 | 53 |
func (n *Namespace) NewCounter(name, help string) Counter {
|
| 50 | 54 |
c := &counter{pc: prometheus.NewCounter(n.newCounterOpts(name, help))}
|
| 51 |
- n.addMetric(c) |
|
| 55 |
+ n.Add(c) |
|
| 52 | 56 |
return c |
| 53 | 57 |
} |
| 54 | 58 |
|
| 55 | 59 |
func (n *Namespace) NewLabeledCounter(name, help string, labels ...string) LabeledCounter {
|
| 56 | 60 |
c := &labeledCounter{pc: prometheus.NewCounterVec(n.newCounterOpts(name, help), labels)}
|
| 57 |
- n.addMetric(c) |
|
| 61 |
+ n.Add(c) |
|
| 58 | 62 |
return c |
| 59 | 63 |
} |
| 60 | 64 |
|
| ... | ... |
@@ -72,7 +76,7 @@ func (n *Namespace) NewTimer(name, help string) Timer {
|
| 72 | 72 |
t := &timer{
|
| 73 | 73 |
m: prometheus.NewHistogram(n.newTimerOpts(name, help)), |
| 74 | 74 |
} |
| 75 |
- n.addMetric(t) |
|
| 75 |
+ n.Add(t) |
|
| 76 | 76 |
return t |
| 77 | 77 |
} |
| 78 | 78 |
|
| ... | ... |
@@ -80,7 +84,7 @@ func (n *Namespace) NewLabeledTimer(name, help string, labels ...string) Labeled |
| 80 | 80 |
t := &labeledTimer{
|
| 81 | 81 |
m: prometheus.NewHistogramVec(n.newTimerOpts(name, help), labels), |
| 82 | 82 |
} |
| 83 |
- n.addMetric(t) |
|
| 83 |
+ n.Add(t) |
|
| 84 | 84 |
return t |
| 85 | 85 |
} |
| 86 | 86 |
|
| ... | ... |
@@ -98,7 +102,7 @@ func (n *Namespace) NewGauge(name, help string, unit Unit) Gauge {
|
| 98 | 98 |
g := &gauge{
|
| 99 | 99 |
pg: prometheus.NewGauge(n.newGaugeOpts(name, help, unit)), |
| 100 | 100 |
} |
| 101 |
- n.addMetric(g) |
|
| 101 |
+ n.Add(g) |
|
| 102 | 102 |
return g |
| 103 | 103 |
} |
| 104 | 104 |
|
| ... | ... |
@@ -106,7 +110,7 @@ func (n *Namespace) NewLabeledGauge(name, help string, unit Unit, labels ...stri |
| 106 | 106 |
g := &labeledGauge{
|
| 107 | 107 |
pg: prometheus.NewGaugeVec(n.newGaugeOpts(name, help, unit), labels), |
| 108 | 108 |
} |
| 109 |
- n.addMetric(g) |
|
| 109 |
+ n.Add(g) |
|
| 110 | 110 |
return g |
| 111 | 111 |
} |
| 112 | 112 |
|
| ... | ... |
@@ -138,12 +142,24 @@ func (n *Namespace) Collect(ch chan<- prometheus.Metric) {
|
| 138 | 138 |
} |
| 139 | 139 |
} |
| 140 | 140 |
|
| 141 |
-func (n *Namespace) addMetric(collector prometheus.Collector) {
|
|
| 141 |
+func (n *Namespace) Add(collector prometheus.Collector) {
|
|
| 142 | 142 |
n.mu.Lock() |
| 143 | 143 |
n.metrics = append(n.metrics, collector) |
| 144 | 144 |
n.mu.Unlock() |
| 145 | 145 |
} |
| 146 | 146 |
|
| 147 |
+func (n *Namespace) NewDesc(name, help string, unit Unit, labels ...string) *prometheus.Desc {
|
|
| 148 |
+ if string(unit) != "" {
|
|
| 149 |
+ name = fmt.Sprintf("%s_%s", name, unit)
|
|
| 150 |
+ } |
|
| 151 |
+ namespace := n.name |
|
| 152 |
+ if n.subsystem != "" {
|
|
| 153 |
+ namespace = fmt.Sprintf("%s_%s", namespace, n.subsystem)
|
|
| 154 |
+ } |
|
| 155 |
+ name = fmt.Sprintf("%s_%s", namespace, name)
|
|
| 156 |
+ return prometheus.NewDesc(name, help, labels, prometheus.Labels(n.labels)) |
|
| 157 |
+} |
|
| 158 |
+ |
|
| 147 | 159 |
// mergeLabels merges two or more labels objects into a single map, favoring |
| 148 | 160 |
// the later labels. |
| 149 | 161 |
func mergeLabels(lbs ...Labels) Labels {
|