Browse code

Added list of mlock-using processes to peakmem_tracker output

The change makes peakmem_tracker list the processes that lock memory
pages, preventing them from being swapped to disk. This may be helpful
when debugging oom-killer job failures in the gate in cases where dstat
shows that swap is not fully used when the oom-killer is triggered.

The peakmem_tracker service was renamed to memory_tracker to reflect
its new, broader scope.

Needed-By: I5862d92478397eac2e61b8a61ce3437b698678be
Change-Id: I1dca120448ee87930fe903fd81277b58efaefc92

Ihar Hrachyshka authored on 2017/02/10 15:17:37
Showing 4 changed files
... ...
@@ -21,16 +21,22 @@ function start_dstat {
21 21
     # A better kind of sysstat, with the top process per time slice
22 22
     run_process dstat "$TOP_DIR/tools/dstat.sh $LOGDIR"
23 23
 
24
-    # To enable peakmem_tracker add:
25
-    #    enable_service peakmem_tracker
24
+    # To enable memory_tracker add:
25
+    #    enable_service memory_tracker
26 26
     # to your localrc
27
-    run_process peakmem_tracker "$TOP_DIR/tools/peakmem_tracker.sh"
27
+    run_process memory_tracker "$TOP_DIR/tools/memory_tracker.sh"
28
+
29
+    # remove support for the old name when it's no longer used (sometime in Queens)
30
+    if is_service_enabled peakmem_tracker; then
31
+        deprecated "Use of peakmem_tracker in devstack is deprecated, use memory_tracker instead"
32
+        run_process peakmem_tracker "$TOP_DIR/tools/memory_tracker.sh"
33
+    fi
28 34
 }
29 35
 
30 36
 # stop_dstat() stop dstat process
31 37
 function stop_dstat {
32 38
     stop_process dstat
33
-    stop_process peakmem_tracker
39
+    stop_process memory_tracker
34 40
 }
35 41
 
36 42
 # Restore xtrace
37 43
new file mode 100755
... ...
@@ -0,0 +1,118 @@
0
+#!/bin/bash
1
+#
2
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
3
+# not use this file except in compliance with the License. You may obtain
4
+# a copy of the License at
5
+#
6
+#    http://www.apache.org/licenses/LICENSE-2.0
7
+#
8
+# Unless required by applicable law or agreed to in writing, software
9
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
10
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
11
+# License for the specific language governing permissions and limitations
12
+# under the License.
13
+
14
+set -o errexit
15
+
16
+# time to sleep between checks
17
+SLEEP_TIME=20
18
+
19
+# MemAvailable is the best estimation and has built-in heuristics
20
+# around reclaimable memory.  However, it is not available until 3.14
21
+# kernel (e.g. Ubuntu LTS Trusty lacks it).  In that case, we fall
22
+# back to free+buffers+cache as the available memory.
23
+USE_MEM_AVAILABLE=0
24
+if grep -q '^MemAvailable:' /proc/meminfo; then
25
+    USE_MEM_AVAILABLE=1
26
+fi
27
+
28
+function get_mem_unevictable {
29
+    awk '/^Unevictable:/ {print $2}' /proc/meminfo
30
+}
31
+
32
+function get_mem_available {
33
+    if [[ $USE_MEM_AVAILABLE -eq 1 ]]; then
34
+        awk '/^MemAvailable:/ {print $2}' /proc/meminfo
35
+    else
36
+        awk '/^MemFree:/ {free=$2}
37
+            /^Buffers:/ {buffers=$2}
38
+            /^Cached:/  {cached=$2}
39
+            END { print free+buffers+cached }' /proc/meminfo
40
+    fi
41
+}
42
+
43
+function tracker {
44
+    local low_point
45
+    local unevictable_point
46
+    low_point=$(get_mem_available)
47
+    # log mlocked memory at least on first iteration
48
+    unevictable_point=0
49
+    while [ 1 ]; do
50
+
51
+        local mem_available
52
+        mem_available=$(get_mem_available)
53
+
54
+        local unevictable
55
+        unevictable=$(get_mem_unevictable)
56
+
57
+        if [ $mem_available -lt $low_point -o $unevictable -ne $unevictable_point ]; then
58
+            echo "[[["
59
+            date
60
+
61
+            # whenever we see less memory available than last time, dump the
62
+            # snapshot of current usage; i.e. checking the latest entry in the file
63
+            # will give the peak-memory usage
64
+            if [[ $mem_available -lt $low_point ]]; then
65
+                low_point=$mem_available
66
+                echo "---"
67
+                # always available greppable output; given difference in
68
+                # meminfo output as described above...
69
+                echo "memory_tracker low_point: $mem_available"
70
+                echo "---"
71
+                cat /proc/meminfo
72
+                echo "---"
73
+                # would hierarchical view be more useful (-H)?  output is
74
+                # not sorted by usage then, however, and the first
75
+                # question is "what's using up the memory"
76
+                #
77
+                # there are a lot of kernel threads, especially on an 8-cpu
78
+                # system.  do a best-effort removal to improve
79
+                # signal/noise ratio of output.
80
+                ps --sort=-pmem -eo pid:10,pmem:6,rss:15,ppid:10,cputime:10,nlwp:8,wchan:25,args:100 |
81
+                    grep -v ']$'
82
+            fi
83
+            echo "---"
84
+
85
+            # list processes that lock memory from swap
86
+            if [[ $unevictable -ne $unevictable_point ]]; then
87
+                unevictable_point=$unevictable
88
+                sudo ./tools/mlock_report.py
89
+            fi
90
+
91
+            echo "]]]"
92
+        fi
93
+        sleep $SLEEP_TIME
94
+    done
95
+}
96
+
97
+function usage {
98
+    echo "Usage: $0 [-x] [-s N]" 1>&2
99
+    exit 1
100
+}
101
+
102
+while getopts ":s:x" opt; do
103
+    case $opt in
104
+        s)
105
+            SLEEP_TIME=$OPTARG
106
+            ;;
107
+        x)
108
+            set -o xtrace
109
+            ;;
110
+        *)
111
+            usage
112
+            ;;
113
+    esac
114
+done
115
+shift $((OPTIND-1))
116
+
117
+tracker
0 118
new file mode 100755
... ...
@@ -0,0 +1,59 @@
0
+#!/usr/bin/env python
1
+
2
+# This tool lists processes that lock memory pages from swapping to disk.
3
+
4
+import re
5
+import subprocess
6
+
7
+import psutil
8
+
9
+
10
+SUMMARY_REGEX = re.compile(r".*\s+(?P<locked>[\d]+)\s+KB")
11
+
12
+
13
+def main():
14
+    try:
15
+        print _get_report()
16
+    except Exception as e:
17
+        print "Failure listing processes locking memory: %s" % str(e)
18
+
19
+
20
+def _get_report():
21
+    mlock_users = []
22
+    for proc in psutil.process_iter():
23
+        pid = proc.pid
24
+        # sadly psutil does not expose locked pages info, that's why we
25
+        # call pmap and parse the output here
26
+        try:
27
+            out = subprocess.check_output(['pmap', '-XX', str(pid)])
28
+        except subprocess.CalledProcessError as e:
29
+            # 42 means process just vanished, which is ok
30
+            if e.returncode == 42:
31
+                continue
32
+            raise
33
+        last_line = out.splitlines()[-1]
34
+
35
+        # some processes don't provide a memory map, for example those
36
+        # running as kernel services, so we need to skip those that don't
37
+        # match
38
+        result = SUMMARY_REGEX.match(last_line)
39
+        if result:
40
+            locked = int(result.group('locked'))
41
+            if locked:
42
+                mlock_users.append({'name': proc.name(),
43
+                                    'pid': pid,
44
+                                    'locked': locked})
45
+
46
+    # produce a single line log message with per process mlock stats
47
+    if mlock_users:
48
+        return "; ".join(
49
+            "[%(name)s (pid:%(pid)s)]=%(locked)dKB" % args
50
+            # NOTE: sorted() is ascending by 'locked', so heavy users are logged last
51
+            for args in sorted(mlock_users, key=lambda d: d['locked'])
52
+        )
53
+    else:
54
+        return "no locked memory"
55
+
56
+
57
+if __name__ == "__main__":
58
+    main()
0 59
deleted file mode 100755
... ...
@@ -1,98 +0,0 @@
1
-#!/bin/bash
2
-#
3
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
4
-# not use this file except in compliance with the License. You may obtain
5
-# a copy of the License at
6
-#
7
-#    http://www.apache.org/licenses/LICENSE-2.0
8
-#
9
-# Unless required by applicable law or agreed to in writing, software
10
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12
-# License for the specific language governing permissions and limitations
13
-# under the License.
14
-
15
-set -o errexit
16
-
17
-# time to sleep between checks
18
-SLEEP_TIME=20
19
-
20
-# MemAvailable is the best estimation and has built-in heuristics
21
-# around reclaimable memory.  However, it is not available until 3.14
22
-# kernel (i.e. Ubuntu LTS Trusty misses it).  In that case, we fall
23
-# back to free+buffers+cache as the available memory.
24
-USE_MEM_AVAILBLE=0
25
-if grep -q '^MemAvailable:' /proc/meminfo; then
26
-    USE_MEM_AVAILABLE=1
27
-fi
28
-
29
-function get_mem_available {
30
-    if [[ $USE_MEM_AVAILABLE -eq 1 ]]; then
31
-        awk '/^MemAvailable:/ {print $2}' /proc/meminfo
32
-    else
33
-        awk '/^MemFree:/ {free=$2}
34
-            /^Buffers:/ {buffers=$2}
35
-            /^Cached:/  {cached=$2}
36
-            END { print free+buffers+cached }' /proc/meminfo
37
-    fi
38
-}
39
-
40
-# whenever we see less memory available than last time, dump the
41
-# snapshot of current usage; i.e. checking the latest entry in the
42
-# file will give the peak-memory usage
43
-function tracker {
44
-    local low_point
45
-    low_point=$(get_mem_available)
46
-    while [ 1 ]; do
47
-
48
-        local mem_available
49
-        mem_available=$(get_mem_available)
50
-
51
-        if [[ $mem_available -lt $low_point ]]; then
52
-            low_point=$mem_available
53
-            echo "[[["
54
-            date
55
-            echo "---"
56
-            # always available greppable output; given difference in
57
-            # meminfo output as described above...
58
-            echo "peakmem_tracker low_point: $mem_available"
59
-            echo "---"
60
-            cat /proc/meminfo
61
-            echo "---"
62
-            # would hierarchial view be more useful (-H)?  output is
63
-            # not sorted by usage then, however, and the first
64
-            # question is "what's using up the memory"
65
-            #
66
-            # there are a lot of kernel threads, especially on a 8-cpu
67
-            # system.  do a best-effort removal to improve
68
-            # signal/noise ratio of output.
69
-            ps --sort=-pmem -eo pid:10,pmem:6,rss:15,ppid:10,cputime:10,nlwp:8,wchan:25,args:100 |
70
-                grep -v ']$'
71
-            echo "]]]"
72
-        fi
73
-
74
-        sleep $SLEEP_TIME
75
-    done
76
-}
77
-
78
-function usage {
79
-    echo "Usage: $0 [-x] [-s N]" 1>&2
80
-    exit 1
81
-}
82
-
83
-while getopts ":s:x" opt; do
84
-    case $opt in
85
-        s)
86
-            SLEEP_TIME=$OPTARG
87
-            ;;
88
-        x)
89
-            set -o xtrace
90
-            ;;
91
-        *)
92
-            usage
93
-            ;;
94
-    esac
95
-done
96
-shift $((OPTIND-1))
97
-
98
-tracker