From 00846e2e025319692b1fb48230fc8c1d66c29e73 Mon Sep 17 00:00:00 2001 From: "Markin, Sergiy" Date: Wed, 4 Jan 2023 22:01:20 -0600 Subject: [PATCH] [grafana] Migrator job is mariadb-fail-proof The main goal of this PS is to make sure the migrator can complete the migrations even if the mariadb galera cluster drops the migrator connection, leaving the database in an inconsistent state. It may happen that migration_log has a record of a successfully performed migration while the database schema is missing an entity, so any further attempts to re-run the migrator fail because an entity the migrator expects to be present is missing. Also, the migrator now runs the mariadb image as its main image, and the grafana binaries are mounted at /usr/share/grafana. The migrator job container runs under the nobody user uid. This PS runs the migrator in a safe way: - prepares a database backup - runs a single instance of grafana as the migrator, with a log file, as a background process in a loop - constantly checks the log file in the main process - once the migrations are completed, it stops the grafana-server process and completes the job - in case of a migration error, it restores the previously prepared backup so that the grafana-server running in the background loop can re-try the migration - the database operations are prefixed with code that makes sure the database is reachable. 
Change-Id: I4e1542b62777f25c08ddd2cb74f0a0e7bfea5145 --- grafana/Chart.yaml | 2 +- grafana/templates/bin/_grafana.sh.tpl | 90 ++++++++++++++++++++++++- grafana/templates/job-run-migrator.yaml | 59 +++++++++++++--- grafana/templates/secret-db.yaml | 8 +++ grafana/templates/secrets/_my.cnf.tpl | 17 +++++ grafana/values.yaml | 15 +++-- grafana/values_overrides/apparmor.yaml | 1 + releasenotes/notes/grafana.yaml | 1 + 8 files changed, 176 insertions(+), 17 deletions(-) create mode 100644 grafana/templates/secrets/_my.cnf.tpl diff --git a/grafana/Chart.yaml b/grafana/Chart.yaml index b700ba947..fa717efd1 100644 --- a/grafana/Chart.yaml +++ b/grafana/Chart.yaml @@ -15,7 +15,7 @@ apiVersion: v1 appVersion: v8.5.10 description: OpenStack-Helm Grafana name: grafana -version: 0.1.17 +version: 0.1.18 home: https://grafana.com/ sources: - https://github.com/grafana/grafana diff --git a/grafana/templates/bin/_grafana.sh.tpl b/grafana/templates/bin/_grafana.sh.tpl index 0c5ad8fdb..19e57dcf5 100644 --- a/grafana/templates/bin/_grafana.sh.tpl +++ b/grafana/templates/bin/_grafana.sh.tpl @@ -17,15 +17,101 @@ set -exo pipefail COMMAND="${@:-start}" PORT={{ tuple "grafana" "internal" "grafana" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} PIDFILE=/tmp/pid +DB_HOST={{ tuple "oslo_db" "direct" . | include "helm-toolkit.endpoints.hostname_fqdn_endpoint_lookup" }} +DB_PORT={{ tuple "oslo_db" "direct" "mysql" . 
| include "helm-toolkit.endpoints.endpoint_port_lookup" }} +MYSQL_PARAMS=" \ + --defaults-file=/tmp/my.cnf \ + --host=${DB_HOST} \ + --port=${DB_PORT} +{{- if .Values.manifests.certificates }} + --ssl-verify-server-cert=false \ + --ssl-ca=/etc/mysql/certs/ca.crt \ + --ssl-key=/etc/mysql/certs/tls.key \ + --ssl-cert=/etc/mysql/certs/tls.crt \ +{{- end }} + " function start () { exec /usr/share/grafana/bin/grafana-server -homepath=/usr/share/grafana -config=/etc/grafana/grafana.ini --pidfile="$PIDFILE" } function run_migrator () { - start & - timeout 60 bash -c "until timeout 5 bash -c ' "${BACKUP_FILE}" + echo "Backup SQL file ${BACKUP_FILE}" + ls -lh "${BACKUP_FILE}" + { + # this is the background process that re-starts grafana-server + # in prder to process grafana database migration + set +e + while true + do + start 2>&1 | tee "$LOG_FILE" + sleep 10 + echo "Restarting the grafana-server..." + stop + echo "Emptying log file..." + echo > "$LOG_FILE" + while [ -f ${STOP_FLAG} ] + do + echo "Lock file still exists - ${STOP_FLAG}..." + ls -la ${STOP_FLAG} + echo "Waiting for lock file to get removed..." + sleep 5 + done + echo "Lock file is removed, proceeding with grafana re-start.." + done + set -e + } & + until cat "${LOG_FILE}" | grep -E "migrations completed" + do + echo "The migrations are not completed yet..." + if cat "${LOG_FILE}" | grep -E "migration failed" + then + echo "Locking server restart by placing a flag file ${STOP_FLAG} .." + touch "${STOP_FLAG}" + echo "Migration failure has been detected. Stopping the grafana-server..." + set +e + stop + set -e + echo "Making sure the database is reachable...." + set +e + until mysql ${MYSQL_PARAMS} grafana -e "select 1;" + do + echo \"Database ${DB_HOST} is not reachable. Sleeping for 10 seconds...\" + sleep 10 + done + set -e + echo "Cleaning the database..." 
+ TABLES=$( + mysql ${MYSQL_PARAMS} grafana -e "show tables\G;" | grep Tables | cut -d " " -f 2 + ) + for TABLE in ${TABLES} + do + echo ${TABLE} + mysql ${MYSQL_PARAMS} grafana -e "drop table ${TABLE};" + done + echo "Restoring the database backup..." + mysql ${MYSQL_PARAMS} grafana < "${BACKUP_FILE}" + echo "Removing lock file ${STOP_FLAG} ..." + rm -f "${STOP_FLAG}" + echo "${STOP_FLAG} has been removed" + fi + sleep 10 + done stop + rm -f "${BACKUP_FILE}" } function stop () { diff --git a/grafana/templates/job-run-migrator.yaml b/grafana/templates/job-run-migrator.yaml index 86b3dce70..e8d64c19c 100644 --- a/grafana/templates/job-run-migrator.yaml +++ b/grafana/templates/job-run-migrator.yaml @@ -20,6 +20,19 @@ limitations under the License. {{- $serviceAccountName := "grafana-run-migrator" }} {{ tuple $envAll "run_migrator" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }} --- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prepare-grafana-migrator + annotations: + {{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }} +data: + prepare-grafana-migrator.sh: | + #!/bin/bash + set -xe + cp -av /usr/share/grafana/* /usr/share/grafana-prepare/ + exit 0 +--- apiVersion: batch/v1 kind: Job metadata: @@ -36,7 +49,7 @@ spec: annotations: configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }} configmap-etc-hash: {{ tuple "configmap-etc.yaml" . 
| include "helm-toolkit.utils.hash" }} -{{ dict "envAll" $envAll "podName" "grafana-run-migrator" "containerNames" (list "grafana-run-migrator" "init") | include "helm-toolkit.snippets.kubernetes_mandatory_access_control_annotation" | indent 8 }} +{{ dict "envAll" $envAll "podName" "grafana-run-migrator" "containerNames" (list "prepare-grafana-migrator" "grafana-run-migrator" "init") | include "helm-toolkit.snippets.kubernetes_mandatory_access_control_annotation" | indent 8 }} spec: {{ dict "envAll" $envAll "application" "run_migrator" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }} serviceAccountName: {{ $serviceAccountName }} @@ -45,9 +58,24 @@ spec: {{ .Values.labels.job.node_selector_key }}: {{ .Values.labels.job.node_selector_value | quote }} initContainers: {{ tuple $envAll "run_migrator" list | include "helm-toolkit.snippets.kubernetes_entrypoint_init_container" | indent 8 }} + - name: prepare-grafana-migrator +{{ tuple $envAll "grafana" | include "helm-toolkit.snippets.image" | indent 10 }} +{{ dict "envAll" $envAll "application" "run_migrator" "container" "prepare_grafana_migrator" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }} + command: + - /tmp/prepare-grafana-migrator.sh + resources: {} + volumeMounts: + - name: pod-tmp + mountPath: /tmp + - name: grafana-binary-image + mountPath: /usr/share/grafana-prepare + - name: prepare-grafana-migrator + mountPath: /tmp/prepare-grafana-migrator.sh + readOnly: true + subPath: prepare-grafana-migrator.sh containers: - name: grafana-run-migrator -{{ tuple $envAll "grafana" | include "helm-toolkit.snippets.image" | indent 10 }} +{{ tuple $envAll "mariadb" | include "helm-toolkit.snippets.image" | indent 10 }} {{ tuple $envAll $envAll.Values.pod.resources.run_migrator | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }} {{ dict "envAll" $envAll "application" "run_migrator" "container" "grafana_run_migrator" | include 
"helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }} command: @@ -56,12 +84,12 @@ spec: ports: - name: dashboard containerPort: {{ tuple "grafana" "internal" "grafana" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} - readinessProbe: - httpGet: - path: /login - port: {{ tuple "grafana" "internal" "grafana" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} - initialDelaySeconds: 30 - timeoutSeconds: 30 + # readinessProbe: + # httpGet: + # path: /login + # port: {{ tuple "grafana" "internal" "grafana" . | include "helm-toolkit.endpoints.endpoint_port_lookup" }} + # initialDelaySeconds: 30 + # timeoutSeconds: 30 env: - name: GF_SECURITY_ADMIN_USER valueFrom: @@ -103,6 +131,8 @@ spec: mountPath: {{ .Values.conf.grafana.paths.alerting }} - name: pod-csv-grafana mountPath: {{ .Values.conf.grafana.paths.csv }} + - name: grafana-binary-image + mountPath: /usr/share/grafana - name: grafana-bin mountPath: /tmp/grafana.sh subPath: grafana.sh @@ -119,6 +149,9 @@ spec: - name: grafana-etc mountPath: /etc/grafana/ldap.toml subPath: ldap.toml + - name: grafana-db + mountPath: /tmp/my.cnf + subPath: my.cnf - name: data mountPath: /var/lib/grafana/data {{- range $group, $dashboards := .Values.conf.dashboards }} @@ -146,6 +179,8 @@ spec: emptyDir: {} - name: pod-csv-grafana emptyDir: {} + - name: grafana-binary-image + emptyDir: {} - name: grafana-bin configMap: name: grafana-bin @@ -154,6 +189,10 @@ spec: secret: secretName: grafana-etc defaultMode: 0444 + - name: grafana-db + secret: + secretName: grafana-db + defaultMode: 0444 {{- range $group, $dashboards := .Values.conf.dashboards }} - name: grafana-dashboards-{{$group}} configMap: @@ -162,6 +201,10 @@ spec: {{- end }} - name: data emptyDir: {} + - name: prepare-grafana-migrator + configMap: + defaultMode: 0555 + name: prepare-grafana-migrator {{- dict "enabled" $envAll.Values.manifests.certificates "name" $envAll.Values.endpoints.oslo_db.auth.admin.secret.tls.internal | include 
"helm-toolkit.snippets.tls_volume" | indent 8 }} {{ if $mounts_grafana.volumes }}{{ toYaml $mounts_grafana.volumes | indent 8 }}{{ end }} {{- end }} diff --git a/grafana/templates/secret-db.yaml b/grafana/templates/secret-db.yaml index a05697e74..5d50ec8c3 100644 --- a/grafana/templates/secret-db.yaml +++ b/grafana/templates/secret-db.yaml @@ -30,4 +30,12 @@ data: DB_CONNECTION: {{ $connection | b64enc -}} {{- end }} {{- end }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: grafana-db +type: Opaque +data: + my.cnf: {{ tuple "secrets/_my.cnf.tpl" . | include "helm-toolkit.utils.template" | b64enc }} {{- end }} diff --git a/grafana/templates/secrets/_my.cnf.tpl b/grafana/templates/secrets/_my.cnf.tpl new file mode 100644 index 000000000..ca7acfec7 --- /dev/null +++ b/grafana/templates/secrets/_my.cnf.tpl @@ -0,0 +1,17 @@ +{{/* + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */}} + + [client] + user = {{ .Values.endpoints.oslo_db.auth.admin.username }} + password = {{ .Values.endpoints.oslo_db.auth.admin.password }} diff --git a/grafana/values.yaml b/grafana/values.yaml index 0c99ac67a..f29730ccf 100644 --- a/grafana/values.yaml +++ b/grafana/values.yaml @@ -18,6 +18,7 @@ images: tags: grafana: docker.io/grafana/grafana:8.5.10 + mariadb: docker.io/openstackhelm/mariadb:latest-ubuntu_focal dep_check: quay.io/airshipit/kubernetes-entrypoint:v1.0.0 db_init: docker.io/openstackhelm/heat:stein-ubuntu_bionic grafana_db_session_sync: docker.io/openstackhelm/heat:stein-ubuntu_bionic @@ -49,7 +50,6 @@ pod: security_context: dashboard: pod: - # The correct grafana uid = 472 runAsUser: 472 container: grafana: @@ -57,7 +57,6 @@ pod: readOnlyRootFilesystem: true db_init: pod: - # The correct grafana uid = 472 runAsUser: 472 container: grafana_db_init_session: @@ -68,7 +67,6 @@ pod: readOnlyRootFilesystem: true db_session_sync: pod: - # The correct grafana uid = 472 runAsUser: 472 container: grafana_db_session_sync: @@ -76,7 +74,6 @@ pod: readOnlyRootFilesystem: true set_admin_user: pod: - # The correct grafana uid = 472 runAsUser: 472 container: grafana_set_admin_password: @@ -84,15 +81,21 @@ pod: readOnlyRootFilesystem: true run_migrator: pod: - # The correct grafana uid = 472 runAsUser: 472 container: + prepare_grafana_migrator: + runAsUser: 0 + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + grafana_run_migrator: + runAsUser: 65534 + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true grafana_set_admin_password: allowPrivilegeEscalation: false readOnlyRootFilesystem: true test: pod: - # The correct grafana uid = 472 runAsUser: 472 container: helm_tests: diff --git a/grafana/values_overrides/apparmor.yaml b/grafana/values_overrides/apparmor.yaml index d1decc929..4693d2929 100644 --- a/grafana/values_overrides/apparmor.yaml +++ b/grafana/values_overrides/apparmor.yaml @@ -19,6 +19,7 @@ pod: init: runtime/default 
grafana-run-migrator: grafana-run-migrator: runtime/default + prepare-grafana-migrator: runtime/default init: runtime/default grafana-test: init: runtime/default diff --git a/releasenotes/notes/grafana.yaml b/releasenotes/notes/grafana.yaml index f70621410..bf72dd0ff 100644 --- a/releasenotes/notes/grafana.yaml +++ b/releasenotes/notes/grafana.yaml @@ -18,4 +18,5 @@ grafana: - 0.1.15 Added OCI registry authentication - 0.1.16 Grafana 8.5.10 with unified alerting - 0.1.17 Fix uid for the user grafana + - 0.1.18 Migrator job is now mariadb-fail-proof ...