Update Airflow logrotate logic
The current logrotate logic deletes logs that are more than X days old in the Airflow log path; however, the Airflow log archive may still reach 100% usage and cause the airflow-worker to crash-loop. This PS adds logic to logrotate.sh to delete the oldest logs and empty directories when the Airflow log archive exceeds the maximum usage specified in values.yaml. Change-Id: I3dcb80901d7dd36da6812850a1f54e7ebf3b1cf2
This commit is contained in:
parent
b5469c39ec
commit
5f92be2f07
@ -149,6 +149,8 @@ spec:
|
||||
imagePullPolicy: {{ .Values.images.pull_policy }}
|
||||
{{ tuple $envAll $envAll.Values.pod.resources.airflow.logrotate | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
|
||||
env:
|
||||
- name: PERCENT_MAX_LOG_FS_USAGE
|
||||
value: {{ .Values.logrotate.percent_max_log_fs_usage | quote }}
|
||||
- name: DAYS_BEFORE_LOG_DELETION
|
||||
value: {{ .Values.logrotate.days_before_deletion | quote }}
|
||||
- name: LOGROTATE_PATH
|
||||
|
@ -146,6 +146,7 @@ volume_worker:
|
||||
|
||||
logrotate:
|
||||
days_before_deletion: 30
|
||||
percent_max_log_fs_usage: 80
|
||||
|
||||
# typically overridden by environmental
|
||||
# values, but should include all endpoints
|
||||
|
@ -16,6 +16,10 @@
|
||||
|
||||
set -ex
|
||||
|
||||
# Print the used-space percentage of a filesystem as bare digits (no '%').
# Arguments: $1 - optional path on the filesystem to inspect;
#                 defaults to /usr/local/airflow/logs/
# Outputs:   the digits of df's 'pcent' column on stdout
# Returns:   the exit status of the df | grep pipeline
get_usage() {
  # Options are placed before the operand and terminated with '--' so the
  # path can never be parsed as an option.
  df --output=pcent -- "${1:-/usr/local/airflow/logs/}" | grep -o '[0-9]*'
}
|
||||
|
||||
while true; do
|
||||
|
||||
# Delete logs that are more than DAYS_BEFORE_LOG_DELETION days old in the directories
|
||||
@ -23,6 +27,14 @@ while true; do
|
||||
# Delete empty directories under the Airflow log path
|
||||
# Expansions are quoted so the path is never word-split or globbed, and
# -mindepth 1 keeps find from removing ${LOGROTATE_PATH} itself when that
# directory happens to be empty.
find "${LOGROTATE_PATH}" -mindepth 1 \( -type f -name '*.log' -mtime +"${DAYS_BEFORE_LOG_DELETION}" -o -type d -empty \) -print -delete
|
||||
|
||||
# Delete oldest logs and empty directories when
|
||||
# the Airflow log path filesystem reaches max usage
|
||||
CURR_USAGE=$(get_usage)
while [ "${CURR_USAGE}" -gt "${PERCENT_MAX_LOG_FS_USAGE}" ]; do
  # Pick the single oldest candidate (a *.log file or an empty directory).
  # Each record is "<mtime> <path>"; sorting on the leading timestamp puts
  # the oldest entry first. -mindepth 1 excludes ${LOGROTATE_PATH} itself.
  OLDEST=$(find "${LOGROTATE_PATH}" -mindepth 1 \( -type f -name '*.log' -o -type d -empty \) -printf '%T+ %p\n' | sort | head -n 1)

  # Nothing left that this script may delete: stop instead of spinning
  # forever while something else keeps the filesystem above the threshold.
  if [ -z "${OLDEST}" ]; then
    break
  fi

  # Strip the leading "<mtime> " field; the remainder is the path, which may
  # itself contain spaces. '--' protects against paths starting with '-'.
  rm -rf -- "${OLDEST#* }"

  CURR_USAGE=$(get_usage)
done
|
||||
|
||||
# Sleep for 1 hr between each wait loop
|
||||
sleep 3600
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user