Added list of mlock-using processes to peakmem_tracker output

This change makes peakmem_tracker list the processes that lock memory
pages, preventing them from being swapped to disk. This can help when
debugging oom-killer job failures in the gate in cases where dstat shows
that swap was not fully used when the oom-killer was triggered.

The peakmem_tracker service was renamed to memory_tracker to reflect
its new, broader scope.

Needed-By: I5862d92478397eac2e61b8a61ce3437b698678be
Change-Id: I1dca120448ee87930fe903fd81277b58efaefc92
This commit is contained in:
Ihar Hrachyshka 2017-02-10 06:17:37 +00:00
parent 23d03b697f
commit 2b4735f1b3
3 changed files with 111 additions and 26 deletions

View File

@ -21,16 +21,22 @@ function start_dstat {
# A better kind of sysstat, with the top process per time slice # A better kind of sysstat, with the top process per time slice
run_process dstat "$TOP_DIR/tools/dstat.sh $LOGDIR" run_process dstat "$TOP_DIR/tools/dstat.sh $LOGDIR"
# To enable peakmem_tracker add: # To enable memory_tracker add:
# enable_service peakmem_tracker # enable_service memory_tracker
# to your localrc # to your localrc
run_process peakmem_tracker "$TOP_DIR/tools/peakmem_tracker.sh" run_process memory_tracker "$TOP_DIR/tools/memory_tracker.sh"
# remove support for the old name when it's no longer used (sometime in Queens)
if is_service_enabled peakmem_tracker; then
deprecated "Use of peakmem_tracker in devstack is deprecated, use memory_tracker instead"
run_process peakmem_tracker "$TOP_DIR/tools/memory_tracker.sh"
fi
} }
# stop_dstat() stop dstat process
#
# Stops the dstat process and the memory tracker. The tracker is stopped
# under its current name (memory_tracker) and, when the deprecated
# peakmem_tracker service is enabled, under that legacy name as well.
function stop_dstat {
    stop_process dstat
    stop_process memory_tracker
    # start_dstat still launches the tracker under the deprecated
    # peakmem_tracker name when that service is enabled, so mirror that
    # check here; otherwise the legacy-named process is never stopped.
    if is_service_enabled peakmem_tracker; then
        stop_process peakmem_tracker
    fi
}
# Restore xtrace # Restore xtrace

View File

@ -21,11 +21,15 @@ SLEEP_TIME=20
# around reclaimable memory. However, it is not available until 3.14 # around reclaimable memory. However, it is not available until 3.14
# kernel (i.e. Ubuntu LTS Trusty misses it). In that case, we fall # kernel (i.e. Ubuntu LTS Trusty misses it). In that case, we fall
# back to free+buffers+cache as the available memory. # back to free+buffers+cache as the available memory.
# Flag consulted by get_mem_available: 1 when /proc/meminfo exposes a
# MemAvailable field, 0 otherwise. Only the correctly spelled variable is
# set; the misspelled USE_MEM_AVAILBLE is never read by get_mem_available.
USE_MEM_AVAILABLE=0
if grep -q '^MemAvailable:' /proc/meminfo; then
    USE_MEM_AVAILABLE=1
fi
# get_mem_unevictable [meminfo_file]
#
# Print the numeric value of the "Unevictable:" field of a meminfo-style
# file (default: /proc/meminfo). Prints 0 when the field is absent so that
# callers doing integer comparisons on the result never see an empty string.
function get_mem_unevictable {
    local meminfo=${1:-/proc/meminfo}
    awk '
        /^Unevictable:/ {print $2; found = 1; exit}
        END {if (!found) print 0}
    ' "$meminfo"
}
function get_mem_available { function get_mem_available {
if [[ $USE_MEM_AVAILABLE -eq 1 ]]; then if [[ $USE_MEM_AVAILABLE -eq 1 ]]; then
awk '/^MemAvailable:/ {print $2}' /proc/meminfo awk '/^MemAvailable:/ {print $2}' /proc/meminfo
@ -37,40 +41,56 @@ function get_mem_available {
fi fi
} }
# whenever we see less memory available than last time, dump the
# snapshot of current usage; i.e. checking the latest entry in the
# file will give the peak-memory usage
# tracker
#
# Loop forever, sampling available and unevictable memory every
# $SLEEP_TIME seconds. A "[[[" ... "]]]" delimited record is written to
# stdout whenever available memory drops below the lowest value seen so
# far, or the unevictable amount differs from the last reported value.
#
# Uses: get_mem_available, get_mem_unevictable, SLEEP_TIME
function tracker {
    local low_point
    local unevictable_point
    low_point=$(get_mem_available)
    # log mlocked memory at least on first iteration
    unevictable_point=0
    while true; do

        local mem_available
        mem_available=$(get_mem_available)

        local unevictable
        unevictable=$(get_mem_unevictable)

        # [[ ]] with || replaces the obsolescent "[ ... -o ... ]" form; it
        # also evaluates an empty sample as 0 instead of raising a test
        # syntax error on every iteration.
        if [[ "$mem_available" -lt "$low_point" || "$unevictable" -ne "$unevictable_point" ]]; then
            echo "[[["
            date

            # whenever we see less memory available than last time, dump the
            # snapshot of current usage; i.e. checking the latest entry in the file
            # will give the peak-memory usage
            if [[ "$mem_available" -lt "$low_point" ]]; then
                low_point=$mem_available
                echo "---"
                # always available greppable output; given difference in
                # meminfo output as described above...
                echo "memory_tracker low_point: $mem_available"
                echo "---"
                cat /proc/meminfo
                echo "---"
                # would hierarchial view be more useful (-H)? output is
                # not sorted by usage then, however, and the first
                # question is "what's using up the memory"
                #
                # there are a lot of kernel threads, especially on a 8-cpu
                # system. do a best-effort removal to improve
                # signal/noise ratio of output.
                ps --sort=-pmem -eo pid:10,pmem:6,rss:15,ppid:10,cputime:10,nlwp:8,wchan:25,args:100 |
                    grep -v ']$'
            fi
            echo "---"

            # list processes that lock memory from swap
            if [[ "$unevictable" -ne "$unevictable_point" ]]; then
                unevictable_point=$unevictable
                # NOTE: relative path, resolved against the current working
                # directory of the tracker process
                sudo ./tools/mlock_report.py
            fi

            echo "]]]"
        fi
        sleep "$SLEEP_TIME"
    done
}

59
tools/mlock_report.py Executable file
View File

@ -0,0 +1,59 @@
#!/usr/bin/env python
# This tool lists processes that lock memory pages from swapping to disk.
import re
import subprocess
import psutil
SUMMARY_REGEX = re.compile(r".*\s+(?P<locked>[\d]+)\s+KB")
def main():
    """Print the locked-memory report, or a failure message on any error.

    Any exception raised while building the report is caught and reported
    on stdout instead of propagating, so the script is best-effort.
    """
    try:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(_get_report())
    except Exception as e:
        print("Failure listing processes locking memory: %s" % str(e))
def _get_report():
    """Build a one-line summary of processes that hold locked memory.

    Runs ``pmap -XX <pid>`` for every process yielded by
    ``psutil.process_iter()`` and matches the last line of its output
    against SUMMARY_REGEX, whose ``locked`` group is the number preceding
    "KB". Processes with a non-zero value are reported.

    Returns:
        str: "; "-joined entries of the form "[name (pid:N)]=XKB", largest
        locked amount first, or "no locked memory" when nothing qualifies.

    Raises:
        subprocess.CalledProcessError: pmap exited with a status other
            than 42.
    """
    mlock_users = []
    for proc in psutil.process_iter():
        pid = proc.pid
        # sadly psutil does not expose locked pages info, that's why we
        # call to pmap and parse the output here
        try:
            # universal_newlines=True yields text instead of bytes, so the
            # str-based SUMMARY_REGEX also matches on Python 3
            out = subprocess.check_output(['pmap', '-XX', str(pid)],
                                          universal_newlines=True)
        except subprocess.CalledProcessError as e:
            # 42 means process just vanished, which is ok
            if e.returncode == 42:
                continue
            raise

        lines = out.splitlines()
        if not lines:
            # nothing to parse; avoid IndexError on empty output
            continue

        # some processes don't provide a memory map, for example those
        # running as kernel services, so we need to skip those that don't
        # match
        result = SUMMARY_REGEX.match(lines[-1])
        if not result:
            continue

        locked = int(result.group('locked'))
        if not locked:
            continue

        try:
            name = proc.name()
        except psutil.NoSuchProcess:
            # the process exited between the pmap call and the name lookup
            continue

        mlock_users.append({'name': name,
                            'pid': pid,
                            'locked': locked})

    if not mlock_users:
        return "no locked memory"

    # produce a single line log message with per process mlock stats
    return "; ".join(
        "[%(name)s (pid:%(pid)s)]=%(locked)dKB" % args
        # log heavy users first
        for args in sorted(mlock_users,
                           key=lambda d: d['locked'],
                           reverse=True)
    )
# Run the report only when this file is executed as a script.
if __name__ == "__main__":
    main()