Added list of mlock-using processes to peakmem_tracker output
This change makes peakmem_tracker list the processes that lock memory pages, preventing them from being swapped to disk. This can help when debugging oom-killer job failures in the gate in cases where dstat shows that swap was not fully used when the oom-killer was triggered. The peakmem_tracker service is renamed to memory_tracker to reflect its new, broader scope. Needed-By: I5862d92478397eac2e61b8a61ce3437b698678be Change-Id: I1dca120448ee87930fe903fd81277b58efaefc92
This commit is contained in:
parent
23d03b697f
commit
2b4735f1b3
14
lib/dstat
14
lib/dstat
@ -21,16 +21,22 @@ function start_dstat {
|
|||||||
# A better kind of sysstat, with the top process per time slice
|
# A better kind of sysstat, with the top process per time slice
|
||||||
run_process dstat "$TOP_DIR/tools/dstat.sh $LOGDIR"
|
run_process dstat "$TOP_DIR/tools/dstat.sh $LOGDIR"
|
||||||
|
|
||||||
# To enable peakmem_tracker add:
|
# To enable memory_tracker add:
|
||||||
# enable_service peakmem_tracker
|
# enable_service memory_tracker
|
||||||
# to your localrc
|
# to your localrc
|
||||||
run_process peakmem_tracker "$TOP_DIR/tools/peakmem_tracker.sh"
|
run_process memory_tracker "$TOP_DIR/tools/memory_tracker.sh"
|
||||||
|
|
||||||
|
# remove support for the old name when it's no longer used (sometime in Queens)
|
||||||
|
if is_service_enabled peakmem_tracker; then
|
||||||
|
deprecated "Use of peakmem_tracker in devstack is deprecated, use memory_tracker instead"
|
||||||
|
run_process peakmem_tracker "$TOP_DIR/tools/memory_tracker.sh"
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# stop_dstat() stop dstat process
|
# stop_dstat() stop dstat process
|
||||||
function stop_dstat {
|
function stop_dstat {
|
||||||
stop_process dstat
|
stop_process dstat
|
||||||
stop_process peakmem_tracker
|
stop_process memory_tracker
|
||||||
}
|
}
|
||||||
|
|
||||||
# Restore xtrace
|
# Restore xtrace
|
||||||
|
@ -21,11 +21,15 @@ SLEEP_TIME=20
|
|||||||
# around reclaimable memory. However, it is not available until 3.14
|
# around reclaimable memory. However, it is not available until 3.14
|
||||||
# kernel (i.e. Ubuntu LTS Trusty misses it). In that case, we fall
|
# kernel (i.e. Ubuntu LTS Trusty misses it). In that case, we fall
|
||||||
# back to free+buffers+cache as the available memory.
|
# back to free+buffers+cache as the available memory.
|
||||||
USE_MEM_AVAILBLE=0
|
USE_MEM_AVAILABLE=0
|
||||||
if grep -q '^MemAvailable:' /proc/meminfo; then
|
if grep -q '^MemAvailable:' /proc/meminfo; then
|
||||||
USE_MEM_AVAILABLE=1
|
USE_MEM_AVAILABLE=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Print the second field of the "Unevictable:" line of /proc/meminfo.
# Nothing is printed if no line starts with "Unevictable:".
function get_mem_unevictable {
    local meminfo=/proc/meminfo
    awk '/^Unevictable:/ {print $2}' "$meminfo"
}
|
||||||
|
|
||||||
function get_mem_available {
|
function get_mem_available {
|
||||||
if [[ $USE_MEM_AVAILABLE -eq 1 ]]; then
|
if [[ $USE_MEM_AVAILABLE -eq 1 ]]; then
|
||||||
awk '/^MemAvailable:/ {print $2}' /proc/meminfo
|
awk '/^MemAvailable:/ {print $2}' /proc/meminfo
|
||||||
@ -37,40 +41,56 @@ function get_mem_available {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# whenever we see less memory available than last time, dump the
|
|
||||||
# snapshot of current usage; i.e. checking the latest entry in the
|
|
||||||
# file will give the peak-memory usage
|
|
||||||
function tracker {
|
function tracker {
|
||||||
local low_point
|
local low_point
|
||||||
|
local unevictable_point
|
||||||
low_point=$(get_mem_available)
|
low_point=$(get_mem_available)
|
||||||
|
# log mlocked memory at least on first iteration
|
||||||
|
unevictable_point=0
|
||||||
while [ 1 ]; do
|
while [ 1 ]; do
|
||||||
|
|
||||||
local mem_available
|
local mem_available
|
||||||
mem_available=$(get_mem_available)
|
mem_available=$(get_mem_available)
|
||||||
|
|
||||||
if [[ $mem_available -lt $low_point ]]; then
|
local unevictable
|
||||||
low_point=$mem_available
|
unevictable=$(get_mem_unevictable)
|
||||||
|
|
||||||
|
if [ $mem_available -lt $low_point -o $unevictable -ne $unevictable_point ]; then
|
||||||
echo "[[["
|
echo "[[["
|
||||||
date
|
date
|
||||||
|
|
||||||
|
# whenever we see less memory available than last time, dump the
|
||||||
|
# snapshot of current usage; i.e. checking the latest entry in the file
|
||||||
|
# will give the peak-memory usage
|
||||||
|
if [[ $mem_available -lt $low_point ]]; then
|
||||||
|
low_point=$mem_available
|
||||||
|
echo "---"
|
||||||
|
# always available greppable output; given difference in
|
||||||
|
# meminfo output as described above...
|
||||||
|
echo "memory_tracker low_point: $mem_available"
|
||||||
|
echo "---"
|
||||||
|
cat /proc/meminfo
|
||||||
|
echo "---"
|
||||||
|
# would hierarchical view be more useful (-H)? output is
|
||||||
|
# not sorted by usage then, however, and the first
|
||||||
|
# question is "what's using up the memory"
|
||||||
|
#
|
||||||
|
# there are a lot of kernel threads, especially on a 8-cpu
|
||||||
|
# system. do a best-effort removal to improve
|
||||||
|
# signal/noise ratio of output.
|
||||||
|
ps --sort=-pmem -eo pid:10,pmem:6,rss:15,ppid:10,cputime:10,nlwp:8,wchan:25,args:100 |
|
||||||
|
grep -v ']$'
|
||||||
|
fi
|
||||||
echo "---"
|
echo "---"
|
||||||
# always available greppable output; given difference in
|
|
||||||
# meminfo output as described above...
|
# list processes that lock memory from swap
|
||||||
echo "peakmem_tracker low_point: $mem_available"
|
if [[ $unevictable -ne $unevictable_point ]]; then
|
||||||
echo "---"
|
unevictable_point=$unevictable
|
||||||
cat /proc/meminfo
|
sudo ./tools/mlock_report.py
|
||||||
echo "---"
|
fi
|
||||||
# would hierarchial view be more useful (-H)? output is
|
|
||||||
# not sorted by usage then, however, and the first
|
|
||||||
# question is "what's using up the memory"
|
|
||||||
#
|
|
||||||
# there are a lot of kernel threads, especially on a 8-cpu
|
|
||||||
# system. do a best-effort removal to improve
|
|
||||||
# signal/noise ratio of output.
|
|
||||||
ps --sort=-pmem -eo pid:10,pmem:6,rss:15,ppid:10,cputime:10,nlwp:8,wchan:25,args:100 |
|
|
||||||
grep -v ']$'
|
|
||||||
echo "]]]"
|
echo "]]]"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
sleep $SLEEP_TIME
|
sleep $SLEEP_TIME
|
||||||
done
|
done
|
||||||
}
|
}
|
59
tools/mlock_report.py
Executable file
59
tools/mlock_report.py
Executable file
@ -0,0 +1,59 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# This tool lists processes that lock memory pages from swapping to disk.
|
||||||
|
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
|
||||||
|
# Matched against the last line of `pmap -XX` output. Because the leading
# ".*" is greedy, the "locked" group captures the LAST integer on the line
# that is followed by whitespace and "KB".
# NOTE(review): presumably that is the total of pmap's "Locked" column --
# confirm against the pmap version in use.
SUMMARY_REGEX = re.compile(r".*\s+(?P<locked>[\d]+)\s+KB")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print the locked-memory report, or a failure notice.

    Any exception raised while building the report is caught and printed
    as a one-line message instead of propagating, so a failure here does
    not abort the caller.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # bare print statement is a syntax error on Python 3.
    try:
        print(_get_report())
    except Exception as e:
        print("Failure listing processes locking memory: %s" % str(e))
|
||||||
|
|
||||||
|
|
||||||
|
def _get_report():
    """Return a one-line summary of processes that hold locked memory.

    Runs ``pmap -XX <pid>`` for every process yielded by
    ``psutil.process_iter()`` and applies SUMMARY_REGEX to the last line
    of its output. Processes with a non-zero captured value are reported.

    Returns:
        A "; "-joined string of ``[name (pid:N)]=<locked>KB`` entries,
        largest value first, or "no locked memory" if nothing qualifies.

    Raises:
        subprocess.CalledProcessError: if pmap exits with a non-zero
            status other than 42.
    """
    mlock_users = []
    for proc in psutil.process_iter():
        pid = proc.pid
        # sadly psutil does not expose locked pages info, that's why we
        # call to pmap and parse the output here
        try:
            # universal_newlines=True makes check_output return text on
            # both Python 2 and 3, so the str regex below can match it
            out = subprocess.check_output(['pmap', '-XX', str(pid)],
                                          universal_newlines=True)
        except subprocess.CalledProcessError as e:
            # 42 means process just vanished, which is ok
            if e.returncode == 42:
                continue
            raise

        lines = out.splitlines()
        if not lines:
            # nothing to parse; avoid IndexError on empty output
            continue

        # some processes don't provide a memory map, for example those
        # running as kernel services, so we need to skip those that don't
        # match
        result = SUMMARY_REGEX.match(lines[-1])
        if not result:
            continue

        locked = int(result.group('locked'))
        if not locked:
            continue

        try:
            name = proc.name()
        except psutil.NoSuchProcess:
            # the process exited after pmap ran; same benign race as the
            # pmap exit status 42 case above
            continue

        mlock_users.append({'name': name,
                            'pid': pid,
                            'locked': locked})

    if not mlock_users:
        return "no locked memory"

    # produce a single line log message with per process mlock stats;
    # reverse=True so that heavy users really are logged first
    return "; ".join(
        "[%(name)s (pid:%(pid)s)]=%(locked)dKB" % args
        for args in sorted(mlock_users,
                           key=lambda d: d['locked'],
                           reverse=True)
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
Loading…
x
Reference in New Issue
Block a user