Browse code

Correct CPU usage calculation in presence of offline CPUs and newer Linux

In https://github.com/torvalds/linux/commit/5ca3726 (released in v4.7-rc1), the
content of the `cpuacct.usage_percpu` file in sysfs was changed to include both
online and offline CPUs. This broke the arithmetic in the stats helpers used by
`docker stats`, which used the length of the `PercpuUsage` array as a
proxy for the number of online CPUs.

Add current number of online CPUs to types.StatsJSON and use it in the
calculation.

Keep a fallback to `len(v.CPUStats.CPUUsage.PercpuUsage)` so this code
continues to work when talking to an older daemon. An old client talking to a
new daemon will ignore the new field and behave as before.

Fixes #28941.

Signed-off-by: Ian Campbell <ian.campbell@docker.com>

Ian Campbell authored on 2017/03/07 02:29:09
Showing 7 changed files
... ...
@@ -3468,6 +3468,10 @@ paths:
3468 3468
         The `precpu_stats` is the CPU statistic of last read, which is used
3469 3469
         for calculating the CPU usage percentage. It is not the same as the
3470 3470
         `cpu_stats` field.
3471
+
3472
+        If either `precpu_stats.online_cpus` or `cpu_stats.online_cpus` is
3473
+        absent or `0` then, for compatibility with older daemons, the length of the
3474
+        corresponding `cpu_usage.percpu_usage` array should be used.
3471 3475
       operationId: "ContainerStats"
3472 3476
       produces: ["application/json"]
3473 3477
       responses:
... ...
@@ -3546,6 +3550,7 @@ paths:
3546 3546
                   total_usage: 100215355
3547 3547
                   usage_in_kernelmode: 30000000
3548 3548
                 system_cpu_usage: 739306590000000
3549
+                online_cpus: 4
3549 3550
                 throttling_data:
3550 3551
                   periods: 0
3551 3552
                   throttled_periods: 0
... ...
@@ -3561,6 +3566,7 @@ paths:
3561 3561
                   total_usage: 100093996
3562 3562
                   usage_in_kernelmode: 30000000
3563 3563
                 system_cpu_usage: 9492140000000
3564
+                online_cpus: 4
3564 3565
                 throttling_data:
3565 3566
                   periods: 0
3566 3567
                   throttled_periods: 0
... ...
@@ -47,6 +47,9 @@ type CPUStats struct {
47 47
 	// System Usage. Linux only.
48 48
 	SystemUsage uint64 `json:"system_cpu_usage,omitempty"`
49 49
 
50
+	// Online CPUs. Linux only.
51
+	OnlineCPUs uint32 `json:"online_cpus,omitempty"`
52
+
50 53
 	// Throttling Data. Linux only.
51 54
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
52 55
 }
... ...
@@ -178,10 +178,14 @@ func calculateCPUPercentUnix(previousCPU, previousSystem uint64, v *types.StatsJ
178 178
 		cpuDelta = float64(v.CPUStats.CPUUsage.TotalUsage) - float64(previousCPU)
179 179
 		// calculate the change for the entire system between readings
180 180
 		systemDelta = float64(v.CPUStats.SystemUsage) - float64(previousSystem)
181
+		onlineCPUs  = float64(v.CPUStats.OnlineCPUs)
181 182
 	)
182 183
 
184
+	if onlineCPUs == 0.0 {
185
+		onlineCPUs = float64(len(v.CPUStats.CPUUsage.PercpuUsage))
186
+	}
183 187
 	if systemDelta > 0.0 && cpuDelta > 0.0 {
184
-		cpuPercent = (cpuDelta / systemDelta) * float64(len(v.CPUStats.CPUUsage.PercpuUsage)) * 100.0
188
+		cpuPercent = (cpuDelta / systemDelta) * onlineCPUs * 100.0
185 189
 	}
186 190
 	return cpuPercent
187 191
 }
... ...
@@ -80,6 +80,12 @@ func (s *Collector) Run() {
80 80
 			continue
81 81
 		}
82 82
 
83
+		onlineCPUs, err := s.getNumberOnlineCPUs()
84
+		if err != nil {
85
+			logrus.Errorf("collecting system online cpu count: %v", err)
86
+			continue
87
+		}
88
+
83 89
 		for _, pair := range pairs {
84 90
 			stats, err := s.supervisor.GetContainerStats(pair.container)
85 91
 			if err != nil {
... ...
@@ -97,6 +103,7 @@ func (s *Collector) Run() {
97 97
 			}
98 98
 			// FIXME: move to containerd on Linux (not Windows)
99 99
 			stats.CPUStats.SystemUsage = systemUsage
100
+			stats.CPUStats.OnlineCPUs = onlineCPUs
100 101
 
101 102
 			pair.publisher.Publish(*stats)
102 103
 		}
... ...
@@ -11,6 +11,11 @@ import (
11 11
 	"github.com/opencontainers/runc/libcontainer/system"
12 12
 )
13 13
 
14
+/*
15
+#include <unistd.h>
16
+*/
17
+import "C"
18
+
14 19
 // platformNewStatsCollector performs platform specific initialisation of the
15 20
 // Collector structure.
16 21
 func platformNewStatsCollector(s *Collector) {
... ...
@@ -64,3 +69,11 @@ func (s *Collector) getSystemCPUUsage() (uint64, error) {
64 64
 	}
65 65
 	return 0, fmt.Errorf("invalid stat format. Error trying to parse the '/proc/stat' file")
66 66
 }
67
+
68
+func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
69
+	i, err := C.sysconf(C._SC_NPROCESSORS_ONLN)
70
+	if err != nil {
71
+		return 0, err
72
+	}
73
+	return uint32(i), nil
74
+}
... ...
@@ -13,3 +13,7 @@ func platformNewStatsCollector(s *Collector) {
13 13
 func (s *Collector) getSystemCPUUsage() (uint64, error) {
14 14
 	return 0, nil
15 15
 }
16
+
17
+func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
18
+	return 0, nil
19
+}
... ...
@@ -24,6 +24,7 @@ keywords: "API, Docker, rcli, REST, documentation"
24 24
 * `POST /build` now accepts `extrahosts` parameter to specify a host to ip mapping to use during the build.
25 25
 * `POST /services/create` and `POST /services/(id or name)/update` now accept a `rollback` value for `FailureAction`.
26 26
 * `POST /services/create` and `POST /services/(id or name)/update` now accept an optional `RollbackConfig` object which specifies rollback options.
27
+* `GET /containers/(id or name)/stats` now includes an `online_cpus` field in both `precpu_stats` and `cpu_stats`. If this field is absent or `0` then, for compatibility with older daemons, the length of the corresponding `cpu_usage.percpu_usage` array should be used.
27 28
 
28 29
 ## v1.26 API changes
29 30