[ceph-mon] Add a post-apply job to restart mons after mgrs

If the OnDelete pod restart strategy is used for the ceph-mon
daemonset, run a post-apply job to restart the ceph-mon pods one
at a time. Otherwise the mons could restart before the mgrs, which
can be problematic in some upgrade scenarios.

Change-Id: I57f87130e95088217c3cfe73512caaae41d3ef22
This commit is contained in:
Stephen Taylor 2022-02-07 14:03:01 -07:00
parent ea2c0115c4
commit ae17a61836
7 changed files with 289 additions and 1 deletions

View File

@ -15,6 +15,6 @@ apiVersion: v1
appVersion: v1.0.0 appVersion: v1.0.0
description: OpenStack-Helm Ceph Mon description: OpenStack-Helm Ceph Mon
name: ceph-mon name: ceph-mon
version: 0.1.18 version: 0.1.19
home: https://github.com/ceph/ceph home: https://github.com/ceph/ceph
... ...

View File

@ -0,0 +1,132 @@
#!/bin/bash
{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
# Run everything below under the C locale.
export LC_ALL=C

# ADMIN_KEYRING may be supplied by the environment; when it is unset or
# empty, derive the path from the cluster name.
: "${ADMIN_KEYRING:=/etc/ceph/${CLUSTER}.client.admin.keyring}"

# Both the cluster config and the admin keyring must be present before any
# ceph command is attempted; abort with status 1 if either is missing.
[[ -f /etc/ceph/${CLUSTER}.conf ]] || {
  echo "ERROR- /etc/ceph/${CLUSTER}.conf must exist; get it from your existing mon"
  exit 1
}
[[ -f ${ADMIN_KEYRING} ]] || {
  echo "ERROR- ${ADMIN_KEYRING} must exist; get it from your existing mon"
  exit 1
}

# Print the cluster status to the job log before anything is restarted.
ceph --cluster ${CLUSTER} -s
#######################################
# Block until every container named "ceph-mon" in the pods labelled
# component=mon reports ready.
# Globals:
#   CEPH_NAMESPACE (read) - used as the namespace when $1 is omitted
# Arguments:
#   $1 - namespace to query (optional, defaults to $CEPH_NAMESPACE)
#   $2 - timeout in seconds (optional, default 1800)
# Outputs:
#   On timeout, a failure message and the current mon pod list on stdout.
#   A retry notice on stderr whenever the pod query itself fails.
# Returns:
#   0 once all mon containers are ready. Exits the script with status 1
#   when the timeout elapses first.
#######################################
function wait_for_pods() {
  local namespace=${1:-${CEPH_NAMESPACE:-}}
  local timeout=${2:-1800}
  local deadline pods_json state mon_state
  deadline=$(date -ud "${timeout} seconds" +%s)
  # Selecting containers with "ceph-mon" name and
  # counting them based on "ready" field.
  local count_pods=".items | map(.status.containerStatuses | .[] | \
    select(.name==\"ceph-mon\")) | \
    group_by(.ready) | map({(.[0].ready | tostring): length}) | .[]"
  local min_mons="add | if .true >= (.false + .true) \
    then \"pass\" else \"fail\" end"
  while true; do
    # Only evaluate readiness when the query itself succeeded. With no input
    # at all the filter above yields "pass", so a failed kubectl call would
    # otherwise be reported as "all mons ready".
    if pods_json=$(kubectl get pods --namespace="${namespace}" -l component=mon -o json); then
      state=$(jq "${count_pods}" <<< "${pods_json}")
      mon_state=$(jq -s "${min_mons}" <<< "${state}")
      # Leave while loop if all mons are ready.
      if [[ "${mon_state}" == \"pass\" ]]; then
        break
      fi
    else
      echo "Unable to list mon pods in namespace ${namespace}; retrying" >&2
    fi
    sleep 5
    if [[ $(date -u +%s) -gt ${deadline} ]]; then
      echo -e "Containers failed to start after $timeout seconds\n"
      kubectl get pods --namespace "${namespace}" -o wide -l component=mon
      exit 1
    fi
  done
}
#######################################
# Verify that every daemonset labelled component=mon is fully rolled out.
# A daemonset counts as ready when currentNumberScheduled,
# desiredNumberScheduled, numberAvailable, numberReady and
# updatedNumberScheduled in its status all hold the same value.
# Globals:
#   CEPH_NAMESPACE (read)
# Arguments:
#   None
# Outputs:
#   One status line per daemonset on stdout (plus the flattened status JSON
#   echoed by grep when numberAvailable is present).
# Returns:
#   0 when all daemonsets are ready. Exits the script with status 1 as soon
#   as one daemonset is not, so the job is reported as failed.
#######################################
function check_ds() {
  local ds ds_query ds_check
  local currentNumberScheduled desiredNumberScheduled numberAvailable
  local numberReady updatedNumberScheduled
  for ds in $(kubectl get ds --namespace="${CEPH_NAMESPACE}" -l component=mon --no-headers=true | awk '{print $1}'); do
    ds_query=$(kubectl get ds -n "${CEPH_NAMESPACE}" "${ds}" -o json | jq -r .status)
    # Left unquoted on purpose: this flattens the JSON to one line, so grep
    # logs the whole status object exactly as before.
    # shellcheck disable=SC2086
    if echo $ds_query | grep -i "numberAvailable"; then
      currentNumberScheduled=$(jq -r .currentNumberScheduled <<< "${ds_query}")
      desiredNumberScheduled=$(jq -r .desiredNumberScheduled <<< "${ds_query}")
      numberAvailable=$(jq -r .numberAvailable <<< "${ds_query}")
      numberReady=$(jq -r .numberReady <<< "${ds_query}")
      updatedNumberScheduled=$(jq -r .updatedNumberScheduled <<< "${ds_query}")
      # Count the distinct values; exactly one means all counters agree.
      ds_check=$(printf '%s\n' \
        "${currentNumberScheduled}" "${desiredNumberScheduled}" \
        "${numberAvailable}" "${numberReady}" "${updatedNumberScheduled}" \
        | sort -u | wc -l)
      if [[ "${ds_check}" -ne 1 ]]; then
        echo "Some pods in daemonset $ds are not ready"
        # Explicit non-zero status: a bare "exit" would return the status of
        # the echo above (0) and hide the failure.
        exit 1
      else
        echo "All pods in deamonset $ds are ready"
      fi
    else
      echo "There are no mons under daemonset $ds"
    fi
  done
}
#######################################
# Restart the mon pods one at a time: delete a pod, wait for all mon
# containers to become ready again, print the cluster status, then move on
# to the next pod.
# Globals:
#   CEPH_NAMESPACE (read)
# Arguments:
#   None
# Outputs:
#   Progress messages and the "ceph -s" output on stdout.
#######################################
function restart_mons() {
  local pod mon_pods
  mon_pods=$(kubectl get po -n "${CEPH_NAMESPACE}" -l component=mon --no-headers | awk '{print $1}')

  # Word splitting is intended here: one pod name per word, so the loop body
  # never sees an empty name.
  for pod in ${mon_pods}; do
    echo "Restarting pod $pod"
    kubectl delete pod -n "${CEPH_NAMESPACE}" "${pod}"
    echo "Waiting for the pod $pod to restart"
    # The pod will not be ready in first 60 seconds. Thus we can reduce
    # amount of queries to kubernetes.
    sleep 60
    # Pass the namespace explicitly, as the top-level call does; without it
    # wait_for_pods queries with an empty --namespace value.
    wait_for_pods "${CEPH_NAMESPACE}"
    ceph -s
  done
}
# Wait for the mons to be ready before deciding whether to restart them.
wait_for_pods "${CEPH_NAMESPACE}"

# require_upgrade counts mon daemonsets whose pods are not all updated;
# max_release tracks the highest observedGeneration among those daemonsets.
require_upgrade=0
max_release=0

for ds in $(kubectl get ds --namespace="${CEPH_NAMESPACE}" -l component=mon --no-headers=true | awk '{print $1}'); do
  # Fetch the status once so every field below comes from the same snapshot
  # instead of from separate queries that can race with a status update.
  ds_status=$(kubectl get ds -n "${CEPH_NAMESPACE}" "${ds}" -o json | jq -r .status)
  updatedNumberScheduled=$(jq -r .updatedNumberScheduled <<< "${ds_status}")
  desiredNumberScheduled=$(jq -r .desiredNumberScheduled <<< "${ds_status}")
  if [[ "${updatedNumberScheduled}" != "${desiredNumberScheduled}" ]]; then
    # grep also echoes the matching status line into the job log.
    if grep -i "numberAvailable" <<< "${ds_status}"; then
      require_upgrade=$((require_upgrade+1))
      _release=$(jq -r .observedGeneration <<< "${ds_status}")
      max_release=$(( max_release > _release ? max_release : _release ))
    fi
  fi
done

echo "Latest revision of the helm chart(s) is : $max_release"

if [[ "${max_release}" -gt 1 ]]; then
  if [[ "${require_upgrade}" -gt 0 ]]; then
    echo "Restart ceph-mon pods one at a time to prevent disruption"
    restart_mons
  fi

  # Check all the ceph-mon daemonsets
  echo "checking DS"
  check_ds
else
  echo "No revisions found for upgrade"
fi

View File

@ -29,6 +29,8 @@ data:
bootstrap.sh: | bootstrap.sh: |
{{ tuple "bin/_bootstrap.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} {{ tuple "bin/_bootstrap.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
{{- end }} {{- end }}
post-apply.sh: |
{{ tuple "bin/_post-apply.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
init-dirs.sh: | init-dirs.sh: |
{{ tuple "bin/_init-dirs.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} {{ tuple "bin/_init-dirs.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}

View File

@ -0,0 +1,143 @@
{{/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
{{- if eq .Values.pod.lifecycle.upgrades.daemonsets.pod_replacement_strategy "OnDelete" }}
{{- if and .Values.manifests.job_post_apply }}
{{- $envAll := . }}
{{- $serviceAccountName := printf "%s-%s" .Release.Name "post-apply" }}
{{ tuple $envAll "post-apply" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
---
# ClusterRole granting the post-apply job's service account the API access
# that post-apply.sh uses through kubectl.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: {{ $serviceAccountName }}
rules:
# Core API group: post-apply.sh lists mon pods and deletes them one by one.
- apiGroups:
- ''
resources:
- pods
- events
# NOTE(review): jobs are served by the "batch" API group (granted further
# down), so this core-group entry looks redundant - confirm and drop.
- jobs
- pods/exec
verbs:
- create
- get
- delete
- list
# Read-only access to daemonsets: post-apply.sh inspects their status.
- apiGroups:
- 'apps'
resources:
- daemonsets
verbs:
- get
- list
# Read-only access to jobs.
# NOTE(review): presumably needed by the entrypoint init container to
# resolve job dependencies - confirm.
- apiGroups:
- 'batch'
resources:
- jobs
verbs:
- get
- list
---
# Binds the ClusterRole of the same name to the post-apply service account
# in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ $serviceAccountName }}
subjects:
- kind: ServiceAccount
name: {{ $serviceAccountName }}
namespace: {{ $envAll.Release.Namespace }}
roleRef:
kind: ClusterRole
name: {{ $serviceAccountName }}
apiGroup: rbac.authorization.k8s.io
---
# Job that runs /tmp/post-apply.sh once, using the post-apply service
# account, to restart the ceph-mon pods after the chart has been applied.
apiVersion: batch/v1
kind: Job
metadata:
name: {{ $serviceAccountName }}
labels:
{{ tuple $envAll "ceph-upgrade" "post-apply" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
annotations:
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
spec:
template:
metadata:
labels:
{{ tuple $envAll "ceph-upgrade" "post-apply" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
annotations:
# Records a hash of configmap-bin.yaml on the pod template.
configmap-bin-hash: {{ tuple "configmap-bin.yaml" . | include "helm-toolkit.utils.hash" }}
{{ dict "envAll" $envAll "podName" "ceph-mon-post-apply" "containerNames" (list "ceph-mon-post-apply" "init" ) | include "helm-toolkit.snippets.kubernetes_mandatory_access_control_annotation" | indent 8 }}
spec:
{{ dict "envAll" $envAll "application" "post_apply" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
serviceAccountName: {{ $serviceAccountName }}
# A failing script (non-zero exit) makes the container restart in place.
restartPolicy: OnFailure
nodeSelector:
{{ .Values.labels.job.node_selector_key }}: {{ .Values.labels.job.node_selector_value }}
initContainers:
{{ tuple $envAll "post-apply" list | include "helm-toolkit.snippets.kubernetes_entrypoint_init_container" | indent 8 }}
containers:
- name: ceph-mon-post-apply
{{ tuple $envAll "ceph_config_helper" | include "helm-toolkit.snippets.image" | indent 10 }}
# NOTE(review): reuses the resource settings of the bootstrap job
# (pod.resources.jobs.bootstrap) rather than a dedicated post-apply entry.
{{ tuple $envAll $envAll.Values.pod.resources.jobs.bootstrap | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
{{ dict "envAll" $envAll "application" "post_apply" "container" "ceph_mon_post_apply" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }}
env:
# CLUSTER and CEPH_NAMESPACE are read by post-apply.sh.
- name: CLUSTER
value: "ceph"
- name: CEPH_NAMESPACE
value: {{ .Release.Namespace }}
# NOTE(review): RELEASE_GROUP_NAME is not referenced by the post-apply.sh
# added in this change - confirm it is needed.
- name: RELEASE_GROUP_NAME
value: {{ .Release.Name }}
command:
- /tmp/post-apply.sh
volumeMounts:
- name: pod-tmp
mountPath: /tmp
- name: pod-etc-ceph
mountPath: /etc/ceph
- name: ceph-mon-bin
mountPath: /tmp/post-apply.sh
subPath: post-apply.sh
readOnly: true
# NOTE(review): the configmap hunk in this change only adds the
# post-apply.sh key; verify that a wait-for-pods.sh key exists in the bin
# configmap, otherwise this mount has nothing to project.
- name: ceph-mon-bin
mountPath: /tmp/wait-for-pods.sh
subPath: wait-for-pods.sh
readOnly: true
# Config file and admin keyring that post-apply.sh checks for on startup.
- name: ceph-mon-etc
mountPath: /etc/ceph/ceph.conf
subPath: ceph.conf
readOnly: true
- name: ceph-mon-admin-keyring
mountPath: /etc/ceph/ceph.client.admin.keyring
subPath: ceph.client.admin.keyring
readOnly: true
volumes:
- name: pod-tmp
emptyDir: {}
- name: pod-etc-ceph
emptyDir: {}
# Scripts from the release's "bin" configmap, mounted read+execute (0555).
- name: ceph-mon-bin
configMap:
name: {{ printf "%s-%s" $envAll.Release.Name "bin" | quote }}
defaultMode: 0555
# Files from the release's "etc" configmap, mounted read-only (0444).
- name: ceph-mon-etc
configMap:
name: {{ printf "%s-%s" $envAll.Release.Name "etc" | quote }}
defaultMode: 0444
- name: ceph-mon-admin-keyring
secret:
secretName: {{ .Values.secrets.keyrings.admin }}
{{- end }}
{{- end }}

View File

@ -111,6 +111,13 @@ pod:
ceph-osd-keyring-generator: ceph-osd-keyring-generator:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
post_apply:
pod:
runAsUser: 65534
container:
ceph_mon_post_apply:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
dns_policy: "ClusterFirstWithHostNet" dns_policy: "ClusterFirstWithHostNet"
replicas: replicas:
mgr: 2 mgr: 2
@ -452,6 +459,7 @@ manifests:
job_image_repo_sync: true job_image_repo_sync: true
job_bootstrap: true job_bootstrap: true
job_keyring: true job_keyring: true
job_post_apply: true
service_mon: true service_mon: true
service_mgr: true service_mgr: true
service_mon_discovery: true service_mon_discovery: true

View File

@ -30,6 +30,8 @@ pod:
ceph-osd-keyring-generator: ceph-osd-keyring-generator:
ceph-osd-keyring-generator: runtime/default ceph-osd-keyring-generator: runtime/default
init: runtime/default init: runtime/default
ceph-mon-post-apply:
ceph-mon-post-apply: runtime/default
bootstrap: bootstrap:
enabled: true enabled: true
manifests: manifests:

View File

@ -19,4 +19,5 @@ ceph-mon:
- 0.1.16 Correct Ceph Mon Check Ports - 0.1.16 Correct Ceph Mon Check Ports
- 0.1.17 Skip monmap endpoint check for missing mons - 0.1.17 Skip monmap endpoint check for missing mons
- 0.1.18 Move ceph-mgr deployment to the ceph-mon chart - 0.1.18 Move ceph-mgr deployment to the ceph-mon chart
- 0.1.19 Add a post-apply job to restart mons after mgrs
... ...