Implement CNI cache file cleanup for stale files
It has been observed on systems running for months to years that the CNI cache files (representing attributes of network attachment definitions of pods) can accumulate in large numbers in the /var/lib/cni/results/ and /var/lib/cni/multus/ directories. The cache files in /var/lib/cni/results/ have a naming signature of: <type>-<pod id>-<interface name>, while the cache files in /var/lib/cni/multus/ have a naming signature of: <pod id>. Normally these files are cleaned up automatically (I believe this is the responsibility of containerd). It has been seen that this happens reliably when one manually deletes a pod. The issue has been reproduced in the case of a host being manually rebooted. In this case, the pods are re-created when the host comes back up, but with a different pod id than was used before. _Most_ of the time the cache files from the previous instantiation of the pod are then deleted, but occasionally a few are missed by the internal garbage collection mechanism. Once a cache file from the previous instantiation of a pod escapes garbage collection, it seems to be left as a stale file for all subsequent reboots. Over time, this can cause these stale files to accumulate and take up disk space unnecessarily. The script will be called once by the k8s-pod-recovery service on system startup, and then periodically via a cron job installed by puppet. The cleanup mechanism analyzes the cache files by name and compares them with the id(s) of the currently running pods. Any stale files detected are deleted. Test Plan: PASS: Verify existing pods do not have their cache files removed. PASS: Verify files younger than the specified 'olderthan' time are not removed. PASS: Verify stale cache files for pods that do not exist anymore are removed. PASS: Verify the script does not run if kubelet is not up yet.
Failure Path: PASS: Verify files not matching the naming signature (pod id embedded in file name) are not processed Regression: PASS: Verify system install PASS: Verify feature logging Partial-Bug: 1947386 Signed-off-by: Steven Webster <steven.webster@windriver.com> Change-Id: I0ce06646001e52d1cc6d204b924f41d049264b4c
This commit is contained in:
parent
5bcbe8ea8a
commit
5d1a26b89d
@ -170,6 +170,7 @@ kubernetes-1.21.3-kubeadm
|
|||||||
kubernetes-1.21.3-client
|
kubernetes-1.21.3-client
|
||||||
containerd
|
containerd
|
||||||
k8s-pod-recovery
|
k8s-pod-recovery
|
||||||
|
k8s-cni-cache-cleanup
|
||||||
containernetworking-plugins
|
containernetworking-plugins
|
||||||
|
|
||||||
# resource-agents
|
# resource-agents
|
||||||
|
@ -66,6 +66,7 @@ kubernetes/chartmuseum
|
|||||||
kubernetes/armada-helm-toolkit
|
kubernetes/armada-helm-toolkit
|
||||||
kubernetes/armada
|
kubernetes/armada
|
||||||
kubernetes/k8s-pod-recovery
|
kubernetes/k8s-pod-recovery
|
||||||
|
kubernetes/k8s-cni-cache-cleanup
|
||||||
kubernetes/plugins/isolcpus-device-plugin
|
kubernetes/plugins/isolcpus-device-plugin
|
||||||
python/python-kubernetes
|
python/python-kubernetes
|
||||||
grub/grubby
|
grub/grubby
|
||||||
|
4
kubernetes/k8s-cni-cache-cleanup/centos/build_srpm.data
Normal file
4
kubernetes/k8s-cni-cache-cleanup/centos/build_srpm.data
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
SRC_DIR="."
|
||||||
|
COPY_LIST="$FILES_BASE/*"
|
||||||
|
|
||||||
|
TIS_PATCH_VER=PKG_GITREVCOUNT
|
@ -0,0 +1,214 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# Copyright (c) 2021 Wind River Systems, Inc.
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
#
|
||||||
|
|
||||||
|
# Script name, used as the syslog tag. "$0" is quoted so an install path
# containing whitespace does not confuse basename.
NAME=$(basename -- "$0")

# Directories holding the CNI cache files that are examined.
RESULTSDIR="/var/lib/cni/results"
MULTUSDIR="/var/lib/cni/multus"

# Snapshot of the running containers: only the "PodID" lines and the
# pod.name label lines of the verbose crictl listing are kept.
PODS=$(crictl ps -v 2> /dev/null | grep -w -E 'PodID|pod.name')

# Parallel arrays: PODIDS[i] is the pod id and PODNAMES[i] the pod name of
# the i-th listed container. mapfile keeps one element per input line, so
# the arrays are not subject to word splitting or globbing.
mapfile -t PODIDS < <(grep PodID <<< "${PODS}" | awk '{print $2}')
mapfile -t PODNAMES < <(grep -w pod.name <<< "${PODS}" | awk '{print $3}')

# Minimum time kubelet must have been up before any cleanup is attempted.
KUBELET_UPTIME_MINUTES=5

# Length (in characters) of a pod id embedded in a cache file name.
POD_ID_LENGTH=64

# Defaults, overridable from the command line (-d and -o respectively).
DELETE="no"
OLDERTHANHOURS=1
|
||||||
|
|
||||||
|
# Send an informational message to syslog (daemon facility).
# Every argument is forwarded to logger(1); the entry is tagged with the
# script name and the PID of this shell.
LOG() {
    logger -p daemon.info -t "${NAME}($$): " "$@"
}
|
||||||
|
|
||||||
|
# Send an error message to syslog (daemon facility).
# Every argument is forwarded to logger(1); the entry is tagged with the
# script name and the PID of this shell.
function ERROR {
    # "err" is the canonical priority name; "error" is only a deprecated
    # alias of it.
    logger -p daemon.err -t "${NAME}($$): " "${@}"
}
|
||||||
|
|
||||||
|
# Determine the age of a file in whole hours.
# The age is measured from the file's last status change time (stat %Z).
# Arguments: $1 - path of the file to examine
# Outputs:   the age in hours (rounded down) on stdout; nothing when the
#            file cannot be examined
# Returns:   0 on success, 1 when stat fails
function file_age {
    local file=${1}
    local -r SECONDSPERHOUR=3600
    local now old

    # Stop here on failure: an empty timestamp would otherwise turn the
    # arithmetic below into a syntax error.
    old=$(stat -c %Z -- "${file}") || return 1
    now=$(date +%s)

    echo $(( (now - old) / SECONDSPERHOUR ))
}
|
||||||
|
|
||||||
|
# Determine the pod id associated with a result CNI cache file.
# Arguments: $1 - path of the cache file
# Outputs:   the 64 character pod id embedded in the file name, or an empty
#            line when the name does not follow the expected pattern
function results_cni_cache_file_to_pod_id {
    local path=${1}
    local ret=""
    local file

    # A valid CNI cache results file looks like:
    # type-pod_id-interface_name
    local -r RESULTS_REGEX='^.*-([0-9a-zA-Z]{64})-[0-9a-zA-Z]+$'

    # Quoted so that a path containing whitespace is not split into a
    # name/suffix pair by basename.
    file=$(basename -- "${path}")

    if [[ ${file} =~ ${RESULTS_REGEX} ]]; then
        ret=${BASH_REMATCH[1]}
    fi

    echo "${ret}"
}
|
||||||
|
|
||||||
|
# Determine the pod id associated with a multus CNI cache file.
# Arguments: $1 - path of the cache file
# Outputs:   the pod id (the file name itself) when it is exactly 64
#            alphanumeric characters, otherwise an empty line
function multus_cni_cache_file_to_pod_id {
    local path=${1}
    local ret=""
    local file

    # A valid CNI cache multus file is simply the pod id
    local -r MULTUS_REGEX='^([0-9a-zA-Z]{64})$'

    # Quoted so that a path containing whitespace is not split into a
    # name/suffix pair by basename.
    file=$(basename -- "${path}")

    if [[ ${file} =~ ${MULTUS_REGEX} ]]; then
        ret=${BASH_REMATCH[1]}
    fi

    echo "${ret}"
}
|
||||||
|
|
||||||
|
# Determine the pod id associated with a CNI cache file.
# The directory holding the file selects which naming convention applies.
# Arguments: $1 - path of the cache file
# Outputs:   the pod id, or an empty line when the file is in neither cache
#            directory or its name is not recognized
function cni_cache_file_to_pod_id {
    local path=${1}
    local ret=""
    local dir

    dir=$(dirname -- "${path}")

    if [[ "${dir}" == "${RESULTSDIR}" ]]; then
        ret=$(results_cni_cache_file_to_pod_id "${path}")
    elif [[ "${dir}" == "${MULTUSDIR}" ]]; then
        ret=$(multus_cni_cache_file_to_pod_id "${path}")
    fi

    echo "${ret}"
}
|
||||||
|
|
||||||
|
# Determine the original pod name from a CNI cache file (if any).
# The name is the value that follows the "K8S_POD_NAME" key inside the file.
# Arguments: $1 - path of the cache file
# Outputs:   the pod name, or "unknown" when the file is unreadable or
#            carries no K8S_POD_NAME entry
function cache_file_to_pod_name {
    local path=${1}
    local ret=""

    if [[ -r "${path}" ]]; then
        # The closing quote and comma in the pattern keep K8S_POD_NAMESPACE
        # from matching. Stop at the first hit so the result is one line.
        ret=$(sed -n '/K8S_POD_NAME","/{s/.*K8S_POD_NAME","\([^"]*\).*/\1/p;q}' < "${path}")
    fi

    echo "${ret:-unknown}"
}
|
||||||
|
|
||||||
|
# Given a CNI cache id, return the existing pod name (if any).
# Looks the id up in the PODIDS array and reports the entry at the same
# index of PODNAMES.
# Arguments: $1 - pod id extracted from a cache file name
# Outputs:   a non-empty identifier when the pod is running (its name, or
#            the id itself if no name is recorded at that index); an empty
#            line when the id is not in PODIDS
function get_pod {
    local cacheid=${1}
    local ret=""
    local i

    for i in "${!PODIDS[@]}"; do
        if [[ "${PODIDS[i]}" == "${cacheid}" ]]; then
            # Callers treat an empty result as "pod does not exist", so a
            # matched pod must never yield an empty string.
            ret=${PODNAMES[i]:-${cacheid}}
            break
        fi
    done

    echo "${ret}"
}
|
||||||
|
|
||||||
|
# Determine if the CNI cache file is old enough to process.
# Globals:   OLDERTHANHOURS (read) - minimum age, in hours
# Arguments: $1 - age of the file in hours
# Outputs:   the age when it is at least OLDERTHANHOURS; an empty line when
#            the file is too young, or when either value is not a whole
#            non-negative number (nothing is deleted on bad input)
function check_cache_file_age {
    local age=${1}
    local ret=""
    local -r uint='^[0-9]+$'

    if [[ ${age} =~ ${uint} && ${OLDERTHANHOURS} =~ ${uint} ]]; then
        # 10# forces base 10 so values such as "08" are not read as octal.
        if (( 10#${age} >= 10#${OLDERTHANHOURS} )); then
            ret=${age}
        fi
    fi

    echo "${ret}"
}
|
||||||
|
|
||||||
|
# Determine how long kubelet has been up in minutes.
# The start reference is the WatchdogTimestamp property that systemd reports
# for the kubelet unit.
# Outputs:   minutes elapsed since that timestamp (rounded down) on stdout;
#            0 when the timestamp is missing or cannot be parsed
function kubelet_uptime {
    local -r SECONDSPERMINUTE=60
    local timestamp started now
    local minutes=0

    timestamp=$(systemctl show kubelet --property WatchdogTimestamp | awk -F= '{print $2}')

    # Check the conversion too: an unparsable timestamp would otherwise leave
    # an empty operand in the arithmetic below.
    if [[ -n "${timestamp}" ]] && started=$(date --date="${timestamp}" +%s 2> /dev/null); then
        now=$(date +%s)
        minutes=$(( (now - started) / SECONDSPERMINUTE ))
    else
        ERROR "Failed to get kubelet uptime."
    fi

    echo "${minutes}"
}
|
||||||
|
|
||||||
|
# Wait for kubelet to be up for long enough to process CNI cache files.
# Polls kubelet_uptime every 30 seconds, for at most 31 attempts.
# Globals:   KUBELET_UPTIME_MINUTES (read) - required uptime in minutes
# Returns:   0 once kubelet has been up for the required time, 1 if that
#            never happened within the retry budget
function check_kubelet {
    local retries=0
    local uptime remaining

    while (( retries <= 30 )); do
        uptime=$(kubelet_uptime)
        # Treat anything that is not a whole number as "not up yet" so the
        # comparisons below cannot fail on an empty or garbled value.
        [[ ${uptime} =~ ^[0-9]+$ ]] || uptime=0

        if (( uptime >= KUBELET_UPTIME_MINUTES )); then
            return 0
        fi

        remaining=$(( KUBELET_UPTIME_MINUTES - uptime ))
        LOG "Waiting for kubelet to be up for ${remaining} minutes ..."
        retries=$(( retries + 1 ))
        sleep 30
    done

    return 1
}
|
||||||
|
|
||||||
|
# Parse the command line options.
#   -o <hours>  only process stale files that are at least <hours> old
#   -d          delete stale files instead of only reporting them
# Globals:   OLDERTHANHOURS, DELETE (written)
# Arguments: the script's positional parameters
# Exits with status 2 on an unknown option, a missing option argument or a
# non-numeric -o value.
function parse_args {
    local OPT OPTARG
    local OPTIND=1

    while getopts :o:d OPT; do
        case ${OPT} in
            o)
                # Reject values that are not whole numbers: they would be
                # evaluated as 0 in the later numeric comparison and make
                # every stale file eligible for deletion.
                if [[ ! ${OPTARG} =~ ^[0-9]+$ ]]; then
                    echo "${0##*/}: invalid value for -o: '${OPTARG}'" >&2
                    echo "usage: ${0##*/} [-d] [-o older_than_hours]" >&2
                    exit 2
                fi
                OLDERTHANHOURS=${OPTARG}
                ;;
            d)
                DELETE="yes"
                ;;
            *)
                echo "usage: ${0##*/} [-d] [-o older_than_hours]" >&2
                exit 2
                ;;
        esac
    done
}

parse_args "$@"
|
||||||
|
|
||||||
|
# Do nothing unless kubelet has been up long enough.
if ! check_kubelet; then
    LOG "Kubelet must be up for a minimum of ${KUBELET_UPTIME_MINUTES} minutes. Not running CNI cache cleanup."
    exit 1
fi

# Take the snapshot of running pods only now, after the (possibly long) wait
# for kubelet, so that it reflects the pods that exist at cleanup time.
# A failing crictl must abort the run: an empty pod list would otherwise make
# every sufficiently old cache file look stale.
if ! PODS=$(crictl ps -v 2> /dev/null); then
    ERROR "Failed to list running containers. Not running CNI cache cleanup."
    exit 1
fi
mapfile -t PODIDS < <(grep -w PodID <<< "${PODS}" | awk '{print $2}')
mapfile -t PODNAMES < <(grep -w pod.name <<< "${PODS}" | awk '{print $3}')

# Examine every file in both cache directories. An unmatched glob stays a
# literal pattern and is skipped by the pod id length check below.
for f in "${RESULTSDIR}"/* "${MULTUSDIR}"/*; do
    cacheid=$(cni_cache_file_to_pod_id "${f}")
    if [[ ${#cacheid} -ne ${POD_ID_LENGTH} ]]; then
        # Unrecognized file pattern, skip.
        continue
    fi

    existing_podname=$(get_pod "${cacheid}")
    if [[ -n "${existing_podname}" ]]; then
        LOG "Pod ${existing_podname} exists. Not cleaning up CNI cache file(s)."
        continue
    fi

    age=$(file_age "${f}")
    if [[ -z "${age}" ]]; then
        # The file may have vanished in the meantime. An empty age must not
        # reach the arithmetic in the message below.
        ERROR "Unable to determine the age of CNI cache file ${f}. Skipping."
        continue
    fi

    if [[ ! $(check_cache_file_age "${age}") ]]; then
        LOG "Stale CNI cache file ${f} detected. Cleanup to occur after $((${OLDERTHANHOURS} - ${age})) hour(s)."
        continue
    fi

    # Read the pod name before the file is removed. Afterwards there is
    # nothing left to read and the name would always be "unknown".
    cache_podname=$(cache_file_to_pod_name "${f}")

    if [[ "${DELETE}" == "yes" ]]; then
        if ! rm -f -- "${f}"; then
            ERROR "Failed to delete stale CNI cache file ${f}."
            continue
        fi
        action="Deleted"
    else
        action="Detected"
    fi

    LOG "${action} stale CNI cache file ${f}: [age: ${age} hours old, podname: ${cache_podname}]."
done
|
||||||
|
|
@ -0,0 +1,27 @@
|
|||||||
|
Name: k8s-cni-cache-cleanup
|
||||||
|
Version: 1.0
|
||||||
|
Release: 0%{?_tis_dist}.%{tis_patch_ver}
|
||||||
|
Summary: Kubernetes CNI Cache Cleanup Utility
|
||||||
|
License: Apache-2.0
|
||||||
|
Group: base
|
||||||
|
Packager: Wind River <info@windriver.com>
|
||||||
|
URL: unknown
|
||||||
|
Source0: k8s-cni-cache-cleanup
|
||||||
|
|
||||||
|
Requires: /bin/bash
|
||||||
|
|
||||||
|
%description
|
||||||
|
%{summary}
|
||||||
|
|
||||||
|
%define local_dir /usr/local
|
||||||
|
%define local_sbindir %{local_dir}/sbin
|
||||||
|
|
||||||
|
%prep
|
||||||
|
|
||||||
|
%install
|
||||||
|
install -d %{buildroot}%{local_sbindir}
|
||||||
|
install -m 755 %{SOURCE0} %{buildroot}%{local_sbindir}/k8s-cni-cache-cleanup
|
||||||
|
|
||||||
|
%files
|
||||||
|
%defattr(-,root,root,-)
|
||||||
|
%{local_sbindir}/k8s-cni-cache-cleanup
|
@ -19,7 +19,7 @@
|
|||||||
|
|
||||||
. /etc/platform/platform.conf
|
. /etc/platform/platform.conf
|
||||||
|
|
||||||
export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin
|
export PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin:/usr/local/sbin
|
||||||
export KUBECONFIG=/etc/kubernetes/admin.conf
|
export KUBECONFIG=/etc/kubernetes/admin.conf
|
||||||
CONF_DIR=/etc/k8s-post-recovery.d
|
CONF_DIR=/etc/k8s-post-recovery.d
|
||||||
SLEEP_DELAY_SEC=15
|
SLEEP_DELAY_SEC=15
|
||||||
@ -74,6 +74,16 @@ function _wait_for_systemd {
|
|||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Run the CNI cache cleanup utility in delete mode, removing stale cache
# files (those not associated with any running pod) that are at least
# 1 hour old. A failure of the utility is logged.
function _do_cni_cache_cleanup {
    LOG "Starting CNI cache cleanup..."
    if ! k8s-cni-cache-cleanup -o 1 -d; then
        ERROR "Failed to run CNI cache cleanup."
    fi
}
|
||||||
|
|
||||||
function _wait_for_pod_stabilization {
|
function _wait_for_pod_stabilization {
|
||||||
|
|
||||||
local extra_args=$1
|
local extra_args=$1
|
||||||
@ -298,6 +308,7 @@ function start {
|
|||||||
_wait_for_systemd
|
_wait_for_systemd
|
||||||
_examine_pods 'recover'
|
_examine_pods 'recover'
|
||||||
_examine_pods 'verify'
|
_examine_pods 'verify'
|
||||||
|
_do_cni_cache_cleanup
|
||||||
}
|
}
|
||||||
|
|
||||||
function stop {
|
function stop {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user