[mariadb] Refactor liveness/readiness probes

* Move all probes into single script to reduce code duplication
* Check free disk percent, fail when we consume 99% to avoid
  data corruption
* Do not restart container when SST is in progress

Change-Id: I6efc7596753dc988aa9edd7ade4d57107db98bdd
This commit is contained in:
Vasyl Saienko 2024-11-11 16:39:41 +00:00
parent 842f0f11dc
commit 174f6f5bd5
8 changed files with 155 additions and 142 deletions

View File

@ -15,7 +15,7 @@ apiVersion: v1
appVersion: v10.6.7
description: OpenStack-Helm MariaDB
name: mariadb
version: 0.2.59
version: 0.2.60
home: https://mariadb.com/kb/en/
icon: http://badges.mariadb.org/mariadb-badge-180x60.png
sources:

View File

@ -0,0 +1,139 @@
#!/usr/bin/env bash
###########################################################################
# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#########################################################################
# Exit immediately when a command fails outside of a conditional context.
set -e
# Command line for the mysql client used by the probe functions below. It is
# expanded unquoted at the call sites so the shell splits it into the program
# name and its options. The --ssl-* options are rendered only when
# .Values.manifests.certificates is set.
MYSQL="mysql \
--defaults-file=/etc/mysql/admin_user.cnf \
--host=localhost \
{{- if .Values.manifests.certificates }}
--ssl-verify-server-cert=false \
--ssl-ca=/etc/mysql/certs/ca.crt \
--ssl-key=/etc/mysql/certs/tls.key \
--ssl-cert=/etc/mysql/certs/tls.crt \
{{- end }}
--connect-timeout 2"
#######################################
# Print the value of a single server variable or status counter.
# Globals:   MYSQL (read) - mysql client command line, expanded unquoted
# Arguments: $1 - what to show: "variables" or "status"
#            $2 - exact name of the variable to look up
# Outputs:   the last whitespace-separated field of the matching row on
#            stdout; nothing when no row carries that name
#######################################
mysql_query () {
  # Keep the arguments local so callers' globals are never clobbered.
  local table=$1
  local key=$2
  # The name is handed to awk as data (-v) and compared for equality, so it is
  # never interpreted as a regular expression or as awk program text.
  $MYSQL -e "show ${table} like \"${key}\"" \
    | awk -v key="${key}" '$1 == key { print $NF; exit }'
}
# Print the command-line synopsis on stderr and terminate with status 1.
usage () {
  printf '%s\n' "Usage: $0 [-t <liveness|readiness>] [-d <percent>]" >&2
  exit 1
}
# Parse the command line:
#   -t <type>     probe to run, stored in PROBE_TYPE
#   -d <percent>  disk usage threshold, stored in DISK_ALARM_LIMIT
# Any other flag, or a flag missing its argument, prints the usage and exits.
PROBE_TYPE=""
while getopts ':t:d:' opt; do
  if [[ "${opt}" == "t" ]]; then
    PROBE_TYPE="${OPTARG}"
  elif [[ "${opt}" == "d" ]]; then
    DISK_ALARM_LIMIT="${OPTARG}"
  else
    usage
  fi
done
# Drop the parsed options so only positional arguments remain.
shift "$(( OPTIND - 1 ))"
# Print the used-space percentage (digits only) of the filesystem holding $1.
# Prints an empty line when df cannot report on the path.
_disk_used_percent () {
  local raw
  raw=$(df --output=pcent -- "$1" 2>/dev/null | tail -n 1) || raw=''
  printf '%s\n' "${raw//[!0-9]/}"
}

# Exit 1 when the filesystem holding $1 is used at or above $2 percent.
# An empty path is ignored; an unreadable usage figure is reported on stderr
# and skipped, so a df problem alone never marks the pod unready.
_fail_if_disk_full () {
  local partition=$1
  local limit=$2
  local used
  if [ -z "${partition}" ]; then
    return 0
  fi
  used=$(_disk_used_percent "${partition}")
  if [ -z "${used}" ]; then
    echo "Unable to determine disk utilization of ${partition}, skipping check" 1>&2
    return 0
  fi
  if [ "${used}" -ge "${limit}" ]; then
    echo "[ALARM] Critical high disk space utilization of ${partition}"
    exit 1
  fi
}

#######################################
# Readiness probe: the node must answer queries, have free disk space and be
# a synced member of the primary Galera component.
# Globals:   MYSQL (read), DISK_ALARM_LIMIT (read, defaults to 100)
# Arguments: none
# Outputs:   a one-line reason on stdout when the probe fails
# Returns:   exits the script with status 1 on the first failed check
#######################################
check_readiness () {
  local limit="${DISK_ALARM_LIMIT:-100}"
  # Local, lower-case names: the server's tmpdir must not overwrite the
  # shell's own TMPDIR variable.
  local data_dir tmp_dirs tmp_dir

  if ! $MYSQL -e 'select 1' > /dev/null 2>&1 ; then
    echo "Select from mysql failed"
    exit 1
  fi

  data_dir=$(mysql_query variables datadir)
  tmp_dirs=$(mysql_query variables tmpdir)

  _fail_if_disk_full "${data_dir}" "${limit}"
  # tmpdir may hold several colon-separated directories; check each of them.
  while [ -n "${tmp_dirs}" ]; do
    tmp_dir=${tmp_dirs%%:*}
    if [ "${tmp_dir}" = "${tmp_dirs}" ]; then
      tmp_dirs=''
    else
      tmp_dirs=${tmp_dirs#*:}
    fi
    _fail_if_disk_full "${tmp_dir}" "${limit}"
  done

  if [ "x$(mysql_query status wsrep_ready)" != "xON" ]; then
    echo "WSREP says the node can not receive queries"
    exit 1
  fi
  if [ "x$(mysql_query status wsrep_connected)" != "xON" ]; then
    echo "WSREP not connected"
    exit 1
  fi
  if [ "x$(mysql_query status wsrep_cluster_status)" != "xPrimary" ]; then
    echo "Not in primary cluster"
    exit 1
  fi
  if [ "x$(mysql_query status wsrep_local_state_comment)" != "xSynced" ]; then
    echo "WSREP not synced"
    exit 1
  fi
}
#######################################
# Liveness probe: decides whether the container must be restarted.
# Globals:   MYSQL (read)
# Arguments: none
# Outputs:   a one-line explanation on stdout for every early exit
# Returns:   exits 0 while mysql_upgrade runs, exits 1 on a failed check,
#            returns normally otherwise
#######################################
check_liveness () {
  local sst_marker="/var/lib/mysql/sst_in_progress"

  # While mysql_upgrade is running nothing else is checked.
  if pidof mysql_upgrade &> /dev/null; then
    echo "The process mysql_upgrade is active. Skip rest checks"
    exit 0
  fi

  if ! pidof mysqld &> /dev/null; then
    echo "The mysqld pid not found"
    exit 1
  fi

  # NOTE(mkarpin): an SST can take a long time on large databases, and
  # killing mysqld in the middle of it may destroy all data on the node.
  if [[ -f "${sst_marker}" ]]; then
    echo "SST is still in progress, skip further checks as mysql won't respond"
    return 0
  fi

  # NOTE(vsaienko): MariaDB can get stuck during IST or when the neighbours'
  # IPs change, so make sure the server still answers on its socket.
  if ! $MYSQL -e "show status like 'wsrep_cluster_status'" &> /dev/null; then
    echo "Can't connect to mysql socket"
    exit 1
  fi

  # Detect a node that is not connected to the wsrep provider.
  if [[ "$(mysql_query status wsrep_ready)" != "ON" ]]; then
    echo "WSREP says the node can not receive queries"
    exit 1
  fi
  if [[ "$(mysql_query status wsrep_connected)" != "ON" ]]; then
    echo "WSREP not connected"
    exit 1
  fi
}
# Run the probe selected with -t; anything else is reported and answered with
# the usage text.
if [[ "${PROBE_TYPE}" == "liveness" ]]; then
  check_liveness
elif [[ "${PROBE_TYPE}" == "readiness" ]]; then
  check_readiness
else
  echo "Unknown probe type: ${PROBE_TYPE}"
  usage
fi

View File

@ -1,68 +0,0 @@
#!/usr/bin/env bash
{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
# Exit immediately when a command fails outside of a conditional context.
set -e
# Command line for the mysql client used by the checks below; it is expanded
# unquoted at the call sites so the shell splits it into program and options.
# The --ssl-* options are rendered only when .Values.manifests.certificates
# is set.
MYSQL="mysql \
--defaults-file=/etc/mysql/admin_user.cnf \
--host=localhost \
{{- if .Values.manifests.certificates }}
--ssl-verify-server-cert=false \
--ssl-ca=/etc/mysql/certs/ca.crt \
--ssl-key=/etc/mysql/certs/tls.key \
--ssl-cert=/etc/mysql/certs/tls.crt \
{{- end }}
--connect-timeout 2"
# Print the value of the status variable named by $1: the last field of the
# first output line matching that name. Prints nothing when no line matches.
mysql_status_query () {
STATUS=$1
$MYSQL -e "show status like \"${STATUS}\"" | \
awk "/${STATUS}/ { print \$NF; exit }"
}
{{- if eq (int .Values.pod.replicas.server) 1 }}
# Rendered when exactly one server replica is configured: the probe only
# verifies that a trivial query succeeds.
if ! $MYSQL -e 'select 1' > /dev/null 2>&1 ; then
exit 1
fi
{{- else }}
# Rendered for any other replica count: the probe inspects the wsrep status.
if [ -f /var/lib/mysql/sst_in_progress ]; then
# SST in progress, with this node receiving a snapshot.
# MariaDB won't be up yet; avoid killing.
exit 0
fi
if [ "x$(mysql_status_query wsrep_ready)" != "xON" ]; then
# WSREP reports that the node cannot receive queries
exit 1
fi
if [ "x$(mysql_status_query wsrep_connected)" != "xON" ]; then
# WSREP is not connected
exit 1
fi
if [ "x$(mysql_status_query wsrep_cluster_status)" != "xPrimary" ]; then
# Not in primary cluster
exit 1
fi
wsrep_local_state_comment=$(mysql_status_query wsrep_local_state_comment)
if [ "x${wsrep_local_state_comment}" != "xSynced" ] && [ "x${wsrep_local_state_comment}" != "xDonor/Desynced" ]; then
# WSREP not synced or not sending SST
exit 1
fi
{{- end }}

View File

@ -1,60 +0,0 @@
#!/usr/bin/env bash
{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
# Exit immediately when a command fails outside of a conditional context.
set -e
# Command line for the mysql client used by the checks below; it is expanded
# unquoted at the call sites so the shell splits it into program and options.
# The --ssl-* options are rendered only when .Values.manifests.certificates
# is set.
MYSQL="mysql \
--defaults-file=/etc/mysql/admin_user.cnf \
--host=localhost \
{{- if .Values.manifests.certificates }}
--ssl-verify-server-cert=false \
--ssl-ca=/etc/mysql/certs/ca.crt \
--ssl-key=/etc/mysql/certs/tls.key \
--ssl-cert=/etc/mysql/certs/tls.crt \
{{- end }}
--connect-timeout 2"
# Print the value of the status variable named by $1: the last field of the
# first output line matching that name. Prints nothing when no line matches.
mysql_status_query () {
STATUS=$1
$MYSQL -e "show status like \"${STATUS}\"" | \
awk "/${STATUS}/ { print \$NF; exit }"
}
# The server must answer a trivial query before anything else is checked.
if ! $MYSQL -e 'select 1' > /dev/null 2>&1 ; then
exit 1
fi
{{- if gt (int .Values.pod.replicas.server) 1 }}
# Rendered only when more than one server replica is configured: the node
# must additionally be a synced member of the primary cluster.
if [ "x$(mysql_status_query wsrep_ready)" != "xON" ]; then
# WSREP reports that the node cannot receive queries
exit 1
fi
if [ "x$(mysql_status_query wsrep_connected)" != "xON" ]; then
# WSREP is not connected
exit 1
fi
if [ "x$(mysql_status_query wsrep_cluster_status)" != "xPrimary" ]; then
# Not in primary cluster
exit 1
fi
if [ "x$(mysql_status_query wsrep_local_state_comment)" != "xSynced" ]; then
# WSREP not synced
exit 1
fi
{{- end }}

View File

@ -27,10 +27,8 @@ data:
image-repo-sync.sh: |
{{- include "helm-toolkit.scripts.image_repo_sync" . | indent 4 }}
{{- end }}
readiness.sh: |
{{ tuple "bin/_readiness.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
liveness.sh: |
{{ tuple "bin/_liveness.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
health.sh: |
{{ tuple "bin/_health.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
start.py: |
{{ tuple "bin/_start.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
test.sh: |

View File

@ -1,7 +1,7 @@
{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
@ -15,12 +15,18 @@ limitations under the License.
{{- define "mariadbReadinessProbe" }}
exec:
command:
- /tmp/readiness.sh
- /tmp/health.sh
- -t
- readiness
- -d
- {{ .Values.pod.probes.server.mariadb.readiness.disk_usage_percent | quote }}
{{- end }}
{{- define "mariadbLivenessProbe" }}
exec:
command:
- /tmp/liveness.sh
- /tmp/health.sh
- -t
- liveness
{{- end }}
{{- if (.Values.global).subchart_release_name }}
@ -226,12 +232,8 @@ spec:
subPath: stop.sh
readOnly: true
- name: mariadb-bin
mountPath: /tmp/readiness.sh
subPath: readiness.sh
readOnly: true
- name: mariadb-bin
mountPath: /tmp/liveness.sh
subPath: liveness.sh
mountPath: /tmp/health.sh
subPath: health.sh
readOnly: true
- name: mariadb-etc
mountPath: /etc/mysql/my.cnf

View File

@ -65,6 +65,7 @@ pod:
mariadb:
readiness:
enabled: true
disk_usage_percent: 99
params:
initialDelaySeconds: 30
periodSeconds: 30

View File

@ -75,4 +75,5 @@ mariadb:
- 0.2.57 Remove useless retries on conflicts during cm update
- 0.2.58 Prevent TypeError in get_active_endpoint function
- 0.2.59 Give more time on resolving configmap update conflicts
- 0.2.60 Refactor liveness/readiness probes
...