[grafana] Migrator job is mariadb-fail-proof

The main goal of this PS is to make sure the migrator can complete the migrations even if the mariadb galera cluster drops the migrator's connection, leaving the database in an inconsistent state. It may happen that migration_log has a record of a successfully performed migration while the database schema is missing the corresponding entity, so any further attempt to re-run the migrator fails because an entity the migrator expects to be present is missing.

Also, the migrator now runs the mariadb image as its main image, with the grafana binaries mounted at /usr/share/grafana. The migrator job container runs under the nobody user's uid.

This PS runs migrator in a safe way:
- prepares database backup
- runs a single instance of grafana as migrator with log file as a background process in a loop
- constantly checks the log file in the main process
- once the migrations have completed, it stops the grafana-server process and completes the job
- in case of a migration error it restores the previously prepared backup so the grafana-server that is running in a background loop can re-try the migration
- the database operations are prefixed with code that makes sure the database is reachable.

Change-Id: I4e1542b62777f25c08ddd2cb74f0a0e7bfea5145
This commit is contained in:
Markin, Sergiy 2023-01-04 22:01:20 -06:00
parent 05ba56e0a9
commit 00846e2e02
8 changed files with 176 additions and 17 deletions

View File

@ -15,7 +15,7 @@ apiVersion: v1
appVersion: v8.5.10
description: OpenStack-Helm Grafana
name: grafana
version: 0.1.17
version: 0.1.18
home: https://grafana.com/
sources:
- https://github.com/grafana/grafana

View File

@ -17,15 +17,101 @@ set -exo pipefail
# Sub-command to run; falls back to "start" when the script gets no arguments.
COMMAND="${@:-start}"
# Grafana port, rendered by Helm from the grafana "internal" endpoint.
PORT={{ tuple "grafana" "internal" "grafana" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
# Pid file handed to grafana-server through --pidfile in start().
PIDFILE=/tmp/pid
# Database host and port, rendered by Helm from the oslo_db "direct" endpoint.
DB_HOST={{ tuple "oslo_db" "direct" . | include "helm-toolkit.endpoints.hostname_fqdn_endpoint_lookup" }}
DB_PORT={{ tuple "oslo_db" "direct" "mysql" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
# Common client options for the mysql / mysqldump calls in run_migrator().
# The value is expanded unquoted there, so it is split on whitespace into
# separate options. Credentials are read from /tmp/my.cnf; the TLS options are
# rendered only when .Values.manifests.certificates is enabled.
MYSQL_PARAMS=" \
--defaults-file=/tmp/my.cnf \
--host=${DB_HOST} \
--port=${DB_PORT}
{{- if .Values.manifests.certificates }}
--ssl-verify-server-cert=false \
--ssl-ca=/etc/mysql/certs/ca.crt \
--ssl-key=/etc/mysql/certs/tls.key \
--ssl-cert=/etc/mysql/certs/tls.crt \
{{- end }}
"
# Replaces the current shell with grafana-server.
# Globals: PIDFILE (read) - path handed to grafana-server via --pidfile.
# Does not return on success because the server is exec'ed in place.
function start () {
  local grafana_home=/usr/share/grafana
  local -a server_args=(
    "-homepath=${grafana_home}"
    "-config=/etc/grafana/grafana.ini"
    "--pidfile=${PIDFILE}"
  )
  exec "${grafana_home}/bin/grafana-server" "${server_args[@]}"
}
# Blocks until the grafana database answers a trivial query.
# Globals: MYSQL_PARAMS, DB_HOST (read)
# The probe is the condition of the until loop, so a failing mysql call never
# trips errexit and no set +e / set -e toggling is needed around it.
function migrator_wait_for_database () {
  echo "Making sure the database is reachable...."
  # MYSQL_PARAMS is intentionally unquoted: it is a whitespace separated list
  # of client options that has to be split into individual arguments.
  until mysql ${MYSQL_PARAMS} grafana -e "select 1;"
  do
    echo "Database ${DB_HOST} is not reachable. Sleeping for 10 seconds..."
    sleep 10
  done
}

# Runs the grafana database migrations in a way that survives a database
# connection being dropped in the middle of a migration.
#
# Steps:
#   1. wait for the database and dump it to a backup file;
#   2. run grafana-server (via start) in a background restart loop, with its
#      output copied to a log file;
#   3. poll the log file: on "migrations completed" stop grafana and return;
#      on "migration failed" stop grafana, drop every table, restore the
#      backup and let the background loop retry.
#
# Globals:  MYSQL_PARAMS, DB_HOST (read)
# Calls:    start, stop (defined elsewhere in this script), mysql, mysqldump
# Returns:  0 once the migrations have completed
function run_migrator () {
  local BACKUP_FILE LOG_FILE STOP_FLAG RESTART_LOOP_PID TABLES TABLE
  BACKUP_FILE=$(mktemp)
  LOG_FILE=$(mktemp)
  STOP_FLAG=$(mktemp)
  # mktemp creates the file, but the flag must only exist while a restore is
  # in progress; otherwise the restart loop below would park forever after the
  # first grafana exit that is not followed by a handled migration failure.
  rm -f "${STOP_FLAG}"

  migrator_wait_for_database

  echo "Preparing initial database backup..."
  mysqldump ${MYSQL_PARAMS} --add-drop-table --quote-names grafana > "${BACKUP_FILE}"
  echo "Backup SQL file ${BACKUP_FILE}"
  ls -lh "${BACKUP_FILE}"

  # grafana-server is launched only by this restart loop, so exactly one
  # instance is running the migrations at any time.
  {
    # this is the background process that re-starts grafana-server
    # in order to process grafana database migration
    set +e
    while true
    do
      start 2>&1 | tee "${LOG_FILE}"
      sleep 10
      echo "Restarting the grafana-server..."
      stop
      echo "Emptying log file..."
      echo > "${LOG_FILE}"
      # Do not restart while the main process is restoring the backup.
      while [ -f "${STOP_FLAG}" ]
      do
        echo "Lock file still exists - ${STOP_FLAG}..."
        ls -la "${STOP_FLAG}"
        echo "Waiting for lock file to get removed..."
        sleep 5
      done
      echo "Lock file is removed, proceeding with grafana re-start.."
    done
  } &
  RESTART_LOOP_PID=$!

  until grep -F "migrations completed" "${LOG_FILE}"
  do
    echo "The migrations are not completed yet..."
    if grep -F "migration failed" "${LOG_FILE}"
    then
      echo "Locking server restart by placing a flag file ${STOP_FLAG} .."
      touch "${STOP_FLAG}"
      echo "Migration failure has been detected. Stopping the grafana-server..."
      # Best effort: grafana may already have exited on its own.
      stop || true
      migrator_wait_for_database
      echo "Cleaning the database..."
      # One table name per line and no header; an empty database yields an
      # empty list instead of a failing pipeline. --defaults-file has to stay
      # the first option, so the extra flags go after MYSQL_PARAMS.
      TABLES=$(mysql ${MYSQL_PARAMS} --batch --skip-column-names grafana -e "show tables;")
      for TABLE in ${TABLES}
      do
        echo "${TABLE}"
        mysql ${MYSQL_PARAMS} grafana -e "drop table ${TABLE};"
      done
      echo "Restoring the database backup..."
      mysql ${MYSQL_PARAMS} grafana < "${BACKUP_FILE}"
      echo "Removing lock file ${STOP_FLAG} ..."
      rm -f "${STOP_FLAG}"
      echo "${STOP_FLAG} has been removed"
    fi
    sleep 10
  done

  # End the restart loop first so it cannot bring grafana-server back up
  # after the final stop.
  kill "${RESTART_LOOP_PID}" 2>/dev/null || true
  wait "${RESTART_LOOP_PID}" 2>/dev/null || true
  stop
  rm -f "${BACKUP_FILE}" "${LOG_FILE}"
}
function stop () {

View File

@ -20,6 +20,19 @@ limitations under the License.
{{- $serviceAccountName := "grafana-run-migrator" }}
{{ tuple $envAll "run_migrator" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
---
# ConfigMap carrying the script run by the prepare-grafana-migrator init
# container of the migrator job. The script copies the contents of
# /usr/share/grafana into /usr/share/grafana-prepare, where the init container
# mounts the grafana-binary-image emptyDir volume. The main migrator container
# runs the mariadb image and mounts that same volume at /usr/share/grafana.
apiVersion: v1
kind: ConfigMap
metadata:
name: prepare-grafana-migrator
annotations:
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
data:
prepare-grafana-migrator.sh: |
#!/bin/bash
set -xe
cp -av /usr/share/grafana/* /usr/share/grafana-prepare/
exit 0
---
apiVersion: batch/v1
kind: Job
metadata:
@ -36,7 +49,7 @@ spec:
annotations:
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
configmap-etc-hash: {{ tuple "configmap-etc.yaml" . | include "helm-toolkit.utils.hash" }}
{{ dict "envAll" $envAll "podName" "grafana-run-migrator" "containerNames" (list "grafana-run-migrator" "init") | include "helm-toolkit.snippets.kubernetes_mandatory_access_control_annotation" | indent 8 }}
{{ dict "envAll" $envAll "podName" "grafana-run-migrator" "containerNames" (list "prepare-grafana-migrator" "grafana-run-migrator" "init") | include "helm-toolkit.snippets.kubernetes_mandatory_access_control_annotation" | indent 8 }}
spec:
{{ dict "envAll" $envAll "application" "run_migrator" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
serviceAccountName: {{ $serviceAccountName }}
@ -45,9 +58,24 @@ spec:
{{ .Values.labels.job.node_selector_key }}: {{ .Values.labels.job.node_selector_value | quote }}
initContainers:
{{ tuple $envAll "run_migrator" list | include "helm-toolkit.snippets.kubernetes_entrypoint_init_container" | indent 8 }}
- name: prepare-grafana-migrator
{{ tuple $envAll "grafana" | include "helm-toolkit.snippets.image" | indent 10 }}
{{ dict "envAll" $envAll "application" "run_migrator" "container" "prepare_grafana_migrator" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }}
command:
- /tmp/prepare-grafana-migrator.sh
resources: {}
volumeMounts:
- name: pod-tmp
mountPath: /tmp
- name: grafana-binary-image
mountPath: /usr/share/grafana-prepare
- name: prepare-grafana-migrator
mountPath: /tmp/prepare-grafana-migrator.sh
readOnly: true
subPath: prepare-grafana-migrator.sh
containers:
- name: grafana-run-migrator
{{ tuple $envAll "grafana" | include "helm-toolkit.snippets.image" | indent 10 }}
{{ tuple $envAll "mariadb" | include "helm-toolkit.snippets.image" | indent 10 }}
{{ tuple $envAll $envAll.Values.pod.resources.run_migrator | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
{{ dict "envAll" $envAll "application" "run_migrator" "container" "grafana_run_migrator" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }}
command:
@ -56,12 +84,12 @@ spec:
ports:
- name: dashboard
containerPort: {{ tuple "grafana" "internal" "grafana" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
readinessProbe:
httpGet:
path: /login
port: {{ tuple "grafana" "internal" "grafana" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
initialDelaySeconds: 30
timeoutSeconds: 30
# readinessProbe:
# httpGet:
# path: /login
# port: {{ tuple "grafana" "internal" "grafana" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }}
# initialDelaySeconds: 30
# timeoutSeconds: 30
env:
- name: GF_SECURITY_ADMIN_USER
valueFrom:
@ -103,6 +131,8 @@ spec:
mountPath: {{ .Values.conf.grafana.paths.alerting }}
- name: pod-csv-grafana
mountPath: {{ .Values.conf.grafana.paths.csv }}
- name: grafana-binary-image
mountPath: /usr/share/grafana
- name: grafana-bin
mountPath: /tmp/grafana.sh
subPath: grafana.sh
@ -119,6 +149,9 @@ spec:
- name: grafana-etc
mountPath: /etc/grafana/ldap.toml
subPath: ldap.toml
- name: grafana-db
mountPath: /tmp/my.cnf
subPath: my.cnf
- name: data
mountPath: /var/lib/grafana/data
{{- range $group, $dashboards := .Values.conf.dashboards }}
@ -146,6 +179,8 @@ spec:
emptyDir: {}
- name: pod-csv-grafana
emptyDir: {}
- name: grafana-binary-image
emptyDir: {}
- name: grafana-bin
configMap:
name: grafana-bin
@ -154,6 +189,10 @@ spec:
secret:
secretName: grafana-etc
defaultMode: 0444
- name: grafana-db
secret:
secretName: grafana-db
defaultMode: 0444
{{- range $group, $dashboards := .Values.conf.dashboards }}
- name: grafana-dashboards-{{$group}}
configMap:
@ -162,6 +201,10 @@ spec:
{{- end }}
- name: data
emptyDir: {}
- name: prepare-grafana-migrator
configMap:
defaultMode: 0555
name: prepare-grafana-migrator
{{- dict "enabled" $envAll.Values.manifests.certificates "name" $envAll.Values.endpoints.oslo_db.auth.admin.secret.tls.internal | include "helm-toolkit.snippets.tls_volume" | indent 8 }}
{{ if $mounts_grafana.volumes }}{{ toYaml $mounts_grafana.volumes | indent 8 }}{{ end }}
{{- end }}

View File

@ -30,4 +30,12 @@ data:
DB_CONNECTION: {{ $connection | b64enc -}}
{{- end }}
{{- end }}
---
# Secret holding the my.cnf client option file rendered from
# secrets/_my.cnf.tpl. The migrator job mounts it at /tmp/my.cnf, which
# grafana.sh passes to the mysql client through --defaults-file.
apiVersion: v1
kind: Secret
metadata:
name: grafana-db
type: Opaque
data:
my.cnf: {{ tuple "secrets/_my.cnf.tpl" . | include "helm-toolkit.utils.template" | b64enc }}
{{- end }}

View File

@ -0,0 +1,17 @@
{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
# Client credentials for the mysql / mysqldump calls of the grafana migrator;
# the file is mounted at /tmp/my.cnf and selected with --defaults-file.
# NOTE(review): the values are rendered unquoted - a password containing '#'
# or surrounding whitespace would presumably need quoting; confirm.
[client]
user = {{ .Values.endpoints.oslo_db.auth.admin.username }}
password = {{ .Values.endpoints.oslo_db.auth.admin.password }}

View File

@ -18,6 +18,7 @@
images:
tags:
grafana: docker.io/grafana/grafana:8.5.10
mariadb: docker.io/openstackhelm/mariadb:latest-ubuntu_focal
dep_check: quay.io/airshipit/kubernetes-entrypoint:v1.0.0
db_init: docker.io/openstackhelm/heat:stein-ubuntu_bionic
grafana_db_session_sync: docker.io/openstackhelm/heat:stein-ubuntu_bionic
@ -49,7 +50,6 @@ pod:
security_context:
dashboard:
pod:
# The correct grafana uid = 472
runAsUser: 472
container:
grafana:
@ -57,7 +57,6 @@ pod:
readOnlyRootFilesystem: true
db_init:
pod:
# The correct grafana uid = 472
runAsUser: 472
container:
grafana_db_init_session:
@ -68,7 +67,6 @@ pod:
readOnlyRootFilesystem: true
db_session_sync:
pod:
# The correct grafana uid = 472
runAsUser: 472
container:
grafana_db_session_sync:
@ -76,7 +74,6 @@ pod:
readOnlyRootFilesystem: true
set_admin_user:
pod:
# The correct grafana uid = 472
runAsUser: 472
container:
grafana_set_admin_password:
@ -84,15 +81,21 @@ pod:
readOnlyRootFilesystem: true
run_migrator:
pod:
# The correct grafana uid = 472
runAsUser: 472
container:
prepare_grafana_migrator:
runAsUser: 0
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
grafana_run_migrator:
runAsUser: 65534
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
grafana_set_admin_password:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
test:
pod:
# The correct grafana uid = 472
runAsUser: 472
container:
helm_tests:

View File

@ -19,6 +19,7 @@ pod:
init: runtime/default
grafana-run-migrator:
grafana-run-migrator: runtime/default
prepare-grafana-migrator: runtime/default
init: runtime/default
grafana-test:
init: runtime/default

View File

@ -18,4 +18,5 @@ grafana:
- 0.1.15 Added OCI registry authentication
- 0.1.16 Grafana 8.5.10 with unified alerting
- 0.1.17 Fix uid for the user grafana
- 0.1.18 Migrator job is now mariadb-fail-proof
...