This change makes peakmem_tracker also list the processes that lock memory
pages, preventing them from being swapped to disk. This is helpful when
debugging oom-killer job failures in the gate in cases where dstat shows
that swap was not fully used when the oom-killer was triggered.
The peakmem_tracker service is renamed to memory_tracker to reflect
its new, broader scope.
Needed-By: I5862d92478397eac2e61b8a61ce3437b698678be
Change-Id: I1dca120448ee87930fe903fd81277b58efaefc92
| ... | ... |
@@ -21,16 +21,22 @@ function start_dstat {
|
| 21 | 21 |
# A better kind of sysstat, with the top process per time slice |
| 22 | 22 |
run_process dstat "$TOP_DIR/tools/dstat.sh $LOGDIR" |
| 23 | 23 |
|
| 24 |
- # To enable peakmem_tracker add: |
|
| 25 |
- # enable_service peakmem_tracker |
|
| 24 |
+ # To enable memory_tracker add: |
|
| 25 |
+ # enable_service memory_tracker |
|
| 26 | 26 |
# to your localrc |
| 27 |
- run_process peakmem_tracker "$TOP_DIR/tools/peakmem_tracker.sh" |
|
| 27 |
+ run_process memory_tracker "$TOP_DIR/tools/memory_tracker.sh" |
|
| 28 |
+ |
|
| 29 |
+ # remove support for the old name when it's no longer used (sometime in Queens) |
|
| 30 |
+ if is_service_enabled peakmem_tracker; then |
|
| 31 |
+ deprecated "Use of peakmem_tracker in devstack is deprecated, use memory_tracker instead" |
|
| 32 |
+ run_process peakmem_tracker "$TOP_DIR/tools/memory_tracker.sh" |
|
| 33 |
+ fi |
|
| 28 | 34 |
} |
| 29 | 35 |
|
| 30 | 36 |
# stop_dstat() stop dstat process |
| 31 | 37 |
function stop_dstat {
    stop_process dstat
    stop_process memory_tracker
    # start_dstat still launches the tracker under the deprecated
    # peakmem_tracker name when that service is enabled, so it has to be
    # stopped under that name as well; drop this together with the
    # start_dstat fallback once the old name is no longer supported
    stop_process peakmem_tracker
}
| 35 | 41 |
|
| 36 | 42 |
# Restore xtrace |
| 37 | 43 |
new file mode 100755 |
| ... | ... |
@@ -0,0 +1,118 @@ |
| 0 |
+#!/bin/bash |
|
| 1 |
+# |
|
| 2 |
+# Licensed under the Apache License, Version 2.0 (the "License"); you may |
|
| 3 |
+# not use this file except in compliance with the License. You may obtain |
|
| 4 |
+# a copy of the License at |
|
| 5 |
+# |
|
| 6 |
+# http://www.apache.org/licenses/LICENSE-2.0 |
|
| 7 |
+# |
|
| 8 |
+# Unless required by applicable law or agreed to in writing, software |
|
| 9 |
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
| 10 |
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
| 11 |
+# License for the specific language governing permissions and limitations |
|
| 12 |
+# under the License. |
|
| 13 |
+ |
|
| 14 |
# abort the script as soon as any command fails
set -o errexit

# seconds to wait between two consecutive checks
SLEEP_TIME=20

# MemAvailable is the kernel's own estimate of usable memory and already
# accounts for reclaimable pages, but kernels older than 3.14 (e.g. Ubuntu
# LTS Trusty) do not report it. Record whether it is present so that
# get_mem_available can fall back to free+buffers+cache when it is not.
if grep -q '^MemAvailable:' /proc/meminfo; then
    USE_MEM_AVAILABLE=1
else
    USE_MEM_AVAILABLE=0
fi
|
| 27 |
+ |
|
| 28 |
# Print the value of the "Unevictable:" field of /proc/meminfo (second
# column of that line).
function get_mem_unevictable {
    local meminfo=/proc/meminfo
    awk '/^Unevictable:/ {print $2}' "$meminfo"
}
|
| 31 |
+ |
|
| 32 |
# Print the amount of available memory as reported by /proc/meminfo.
# Uses the MemAvailable field when USE_MEM_AVAILABLE is 1, otherwise the
# sum of the MemFree, Buffers and Cached fields.
function get_mem_available {
    if [[ $USE_MEM_AVAILABLE -ne 1 ]]; then
        # no MemAvailable on this kernel: approximate it
        awk '/^MemFree:/ {free=$2}
            /^Buffers:/ {buffers=$2}
            /^Cached:/ {cached=$2}
            END { print free+buffers+cached }' /proc/meminfo
        return
    fi
    awk '/^MemAvailable:/ {print $2}' /proc/meminfo
}
|
| 42 |
+ |
|
| 43 |
# Poll /proc/meminfo forever, sleeping SLEEP_TIME between checks, and print
# a record delimited by "[[[" and "]]]" whenever either
#   * available memory drops below the lowest value seen so far, or
#   * the amount of unevictable (mlocked) memory differs from the value
#     that was last reported.
# Never returns.
function tracker {
    local low_point
    local unevictable_point
    local mem_available
    local unevictable
    local tools_dir

    # mlock_report.py lives next to this script; locate it from the script's
    # own path so the tracker does not depend on the current directory
    tools_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)

    low_point=$(get_mem_available)
    # start from 0 so that memory which is already locked when the tracker
    # starts is reported on the first iteration
    unevictable_point=0

    while true; do
        mem_available=$(get_mem_available)
        unevictable=$(get_mem_unevictable)

        # [[ ]] evaluates an empty operand as 0 instead of failing with a
        # syntax error the way "[ ... -o ... ]" does
        if [[ $mem_available -lt $low_point || $unevictable -ne $unevictable_point ]]; then
            echo "[[["
            date

            # whenever we see less memory available than last time, dump the
            # snapshot of current usage; i.e. checking the latest entry in the file
            # will give the peak-memory usage
            if [[ $mem_available -lt $low_point ]]; then
                low_point=$mem_available
                echo "---"
                # always available greppable output; given difference in
                # meminfo output as described above...
                echo "memory_tracker low_point: $mem_available"
                echo "---"
                cat /proc/meminfo
                echo "---"
                # would hierarchical view be more useful (-H)? output is
                # not sorted by usage then, however, and the first
                # question is "what's using up the memory"
                #
                # there are a lot of kernel threads, especially on a 8-cpu
                # system. do a best-effort removal to improve
                # signal/noise ratio of output.
                ps --sort=-pmem -eo pid:10,pmem:6,rss:15,ppid:10,cputime:10,nlwp:8,wchan:25,args:100 |
                    grep -v ']$'
            fi
            echo "---"

            # list processes that lock memory from swap
            if [[ $unevictable -ne $unevictable_point ]]; then
                unevictable_point=$unevictable
                # the report is diagnostics only: with errexit enabled a
                # failing report would otherwise terminate the tracker
                sudo "$tools_dir/mlock_report.py" ||
                    echo "memory_tracker: mlock_report.py failed"
            fi

            echo "]]]"
        fi
        sleep $SLEEP_TIME
    done
}
|
| 96 |
+ |
|
| 97 |
# Print the command line synopsis on stderr and terminate the script with
# exit status 1.
function usage {
    printf '%s\n' "Usage: $0 [-x] [-s N]" >&2
    exit 1
}
|
| 101 |
+ |
|
| 102 |
# Command line handling: "-s N" overrides SLEEP_TIME and "-x" turns on
# shell tracing; any other result from getopts ends in usage.
while getopts ":s:x" opt; do
    if [[ $opt == s ]]; then
        SLEEP_TIME=$OPTARG
    elif [[ $opt == x ]]; then
        set -o xtrace
    else
        usage
    fi
done
shift $((OPTIND-1))

# enter the polling loop
tracker
| 0 | 118 |
new file mode 100755 |
| ... | ... |
@@ -0,0 +1,59 @@ |
| 0 |
+#!/usr/bin/env python |
|
| 1 |
+ |
|
| 2 |
+# This tool lists processes that lock memory pages from swapping to disk. |
|
| 3 |
+ |
|
| 4 |
+import re |
|
| 5 |
+import subprocess |
|
| 6 |
+ |
|
| 7 |
+import psutil |
|
| 8 |
+ |
|
| 9 |
+ |
|
| 10 |
+SUMMARY_REGEX = re.compile(r".*\s+(?P<locked>[\d]+)\s+KB") |
|
| 11 |
+ |
|
| 12 |
+ |
|
| 13 |
def main():
    """Print the locked-memory report, or a one-line failure message.

    Any exception raised while the report is built is caught and written to
    stdout as "Failure listing processes locking memory: <error>", so this
    function itself does not raise for such errors.
    """
    # A parenthesised single-argument print is valid both as a Python 2
    # print statement and as a Python 3 function call, so the script works
    # whichever interpreter "python" resolves to.
    try:
        print(_get_report())
    except Exception as e:
        print("Failure listing processes locking memory: %s" % str(e))
|
| 18 |
+ |
|
| 19 |
+ |
|
| 20 |
def _get_report():
    """Build a one-line summary of the processes that hold locked memory.

    Runs ``pmap -XX <pid>`` for every process returned by
    ``psutil.process_iter()`` and matches SUMMARY_REGEX against the last
    line of its output; processes whose last line does not match, or whose
    captured value is zero, are left out.

    Returns:
        A "; "-joined string of "[name (pid:N)]=<locked>KB" entries ordered
        from the largest locked value to the smallest, or the string
        "no locked memory" when no process qualifies.

    Raises:
        subprocess.CalledProcessError: pmap exited with a status other
            than 42.
    """
    mlock_users = []
    for proc in psutil.process_iter():
        pid = proc.pid
        # sadly psutil does not expose locked pages info, that's why we
        # call to pmap and parse the output here
        try:
            out = subprocess.check_output(['pmap', '-XX', str(pid)])
        except subprocess.CalledProcessError as e:
            # 42 means process just vanished, which is ok
            if e.returncode == 42:
                continue
            raise

        # check_output returns bytes on Python 3, which a str pattern
        # cannot match; on Python 2 the output already is a str
        if not isinstance(out, str):
            out = out.decode('utf-8', 'replace')

        lines = out.splitlines()
        if not lines:
            # nothing printed at all: there is no summary line to parse
            continue

        # some processes don't provide a memory map, for example those
        # running as kernel services, so we need to skip those that don't
        # match
        result = SUMMARY_REGEX.match(lines[-1])
        if not result:
            continue

        locked = int(result.group('locked'))
        if not locked:
            continue

        try:
            name = proc.name()
        except psutil.NoSuchProcess:
            # the process exited after pmap ran; nothing left to report
            continue

        mlock_users.append({'name': name, 'pid': pid, 'locked': locked})

    if not mlock_users:
        return "no locked memory"

    # produce a single line log message with per process mlock stats,
    # heaviest users first
    mlock_users.sort(key=lambda d: d['locked'], reverse=True)
    return "; ".join(
        "[%(name)s (pid:%(pid)s)]=%(locked)dKB" % args
        for args in mlock_users
    )
|
| 55 |
+ |
|
| 56 |
+ |
|
| 57 |
+if __name__ == "__main__": |
|
| 58 |
+ main() |
| 0 | 59 |
deleted file mode 100755 |
| ... | ... |
@@ -1,98 +0,0 @@ |
| 1 |
-#!/bin/bash |
|
| 2 |
-# |
|
| 3 |
-# Licensed under the Apache License, Version 2.0 (the "License"); you may |
|
| 4 |
-# not use this file except in compliance with the License. You may obtain |
|
| 5 |
-# a copy of the License at |
|
| 6 |
-# |
|
| 7 |
-# http://www.apache.org/licenses/LICENSE-2.0 |
|
| 8 |
-# |
|
| 9 |
-# Unless required by applicable law or agreed to in writing, software |
|
| 10 |
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
|
| 11 |
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
|
| 12 |
-# License for the specific language governing permissions and limitations |
|
| 13 |
-# under the License. |
|
| 14 |
- |
|
| 15 |
-set -o errexit |
|
| 16 |
- |
|
| 17 |
-# time to sleep between checks |
|
| 18 |
-SLEEP_TIME=20 |
|
| 19 |
- |
|
| 20 |
-# MemAvailable is the best estimation and has built-in heuristics |
|
| 21 |
-# around reclaimable memory. However, it is not available until 3.14 |
|
| 22 |
-# kernel (i.e. Ubuntu LTS Trusty misses it). In that case, we fall |
|
| 23 |
-# back to free+buffers+cache as the available memory. |
|
| 24 |
-USE_MEM_AVAILBLE=0 |
|
| 25 |
-if grep -q '^MemAvailable:' /proc/meminfo; then |
|
| 26 |
- USE_MEM_AVAILABLE=1 |
|
| 27 |
-fi |
|
| 28 |
- |
|
| 29 |
-function get_mem_available {
|
|
| 30 |
- if [[ $USE_MEM_AVAILABLE -eq 1 ]]; then |
|
| 31 |
- awk '/^MemAvailable:/ {print $2}' /proc/meminfo
|
|
| 32 |
- else |
|
| 33 |
- awk '/^MemFree:/ {free=$2}
|
|
| 34 |
- /^Buffers:/ {buffers=$2}
|
|
| 35 |
- /^Cached:/ {cached=$2}
|
|
| 36 |
- END { print free+buffers+cached }' /proc/meminfo
|
|
| 37 |
- fi |
|
| 38 |
-} |
|
| 39 |
- |
|
| 40 |
-# whenever we see less memory available than last time, dump the |
|
| 41 |
-# snapshot of current usage; i.e. checking the latest entry in the |
|
| 42 |
-# file will give the peak-memory usage |
|
| 43 |
-function tracker {
|
|
| 44 |
- local low_point |
|
| 45 |
- low_point=$(get_mem_available) |
|
| 46 |
- while [ 1 ]; do |
|
| 47 |
- |
|
| 48 |
- local mem_available |
|
| 49 |
- mem_available=$(get_mem_available) |
|
| 50 |
- |
|
| 51 |
- if [[ $mem_available -lt $low_point ]]; then |
|
| 52 |
- low_point=$mem_available |
|
| 53 |
- echo "[[[" |
|
| 54 |
- date |
|
| 55 |
- echo "---" |
|
| 56 |
- # always available greppable output; given difference in |
|
| 57 |
- # meminfo output as described above... |
|
| 58 |
- echo "peakmem_tracker low_point: $mem_available" |
|
| 59 |
- echo "---" |
|
| 60 |
- cat /proc/meminfo |
|
| 61 |
- echo "---" |
|
| 62 |
- # would hierarchial view be more useful (-H)? output is |
|
| 63 |
- # not sorted by usage then, however, and the first |
|
| 64 |
- # question is "what's using up the memory" |
|
| 65 |
- # |
|
| 66 |
- # there are a lot of kernel threads, especially on a 8-cpu |
|
| 67 |
- # system. do a best-effort removal to improve |
|
| 68 |
- # signal/noise ratio of output. |
|
| 69 |
- ps --sort=-pmem -eo pid:10,pmem:6,rss:15,ppid:10,cputime:10,nlwp:8,wchan:25,args:100 | |
|
| 70 |
- grep -v ']$' |
|
| 71 |
- echo "]]]" |
|
| 72 |
- fi |
|
| 73 |
- |
|
| 74 |
- sleep $SLEEP_TIME |
|
| 75 |
- done |
|
| 76 |
-} |
|
| 77 |
- |
|
| 78 |
-function usage {
|
|
| 79 |
- echo "Usage: $0 [-x] [-s N]" 1>&2 |
|
| 80 |
- exit 1 |
|
| 81 |
-} |
|
| 82 |
- |
|
| 83 |
-while getopts ":s:x" opt; do |
|
| 84 |
- case $opt in |
|
| 85 |
- s) |
|
| 86 |
- SLEEP_TIME=$OPTARG |
|
| 87 |
- ;; |
|
| 88 |
- x) |
|
| 89 |
- set -o xtrace |
|
| 90 |
- ;; |
|
| 91 |
- *) |
|
| 92 |
- usage |
|
| 93 |
- ;; |
|
| 94 |
- esac |
|
| 95 |
-done |
|
| 96 |
-shift $((OPTIND-1)) |
|
| 97 |
- |
|
| 98 |
-tracker |