[ceph] Fix for ceph-osd pods restart
This PS updates the ceph-osd pod containers to make sure OSD pods do not get stuck at deletion. It also adds a similar on-delete lifecycle hook to kill the log-runner container process before a pod restart, and adds a wait_for_degraded_objects function to the helm-test pod to make sure newly deployed pods have joined the Ceph cluster and it is safe to proceed with the next ceph-osd chart release upgrade.

Change-Id: Ib31a5e1a82526906bff8c64ce1b199e3495b44b2
This commit is contained in:
parent cd1ee943f9
commit 7811e90f4e
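A rough way to exercise the fix on a test deployment (hypothetical commands; they assume the chart runs in the ceph namespace with kubectl and ceph admin access, none of which this commit dictates):

  # Pick one OSD pod and delete it; with the stop-script fixes below it
  # should terminate within its grace period instead of hanging at deletion.
  OSD_POD=$(kubectl -n ceph get pods -l component=osd -o name | head -1)
  kubectl -n ceph delete "${OSD_POD}" --wait=true

  # Watch the cluster reabsorb the restarted OSD; the helm test below
  # performs the same degraded-objects wait automatically.
  ceph -s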
ceph-osd/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.54
+version: 0.1.55
 home: https://github.com/ceph/ceph
 ...
ceph-osd/templates/bin/_helm-tests.sh.tpl
@@ -16,6 +16,17 @@ limitations under the License.
 
 set -ex
 
+function wait_for_degraded_objects () {
+  echo "#### Start: Checking for degraded objects ####"
+
+  # Loop until no degraded objects
+  while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep 'degraded'`" ]]
+  do
+    sleep 30
+    ceph -s
+  done
+}
+
 function check_osd_count() {
   echo "#### Start: Checking OSD count ####"
   noup_flag=$(ceph osd stat | awk '/noup/ {print $2}')
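wait_for_degraded_objects keys off the human-readable status text. A sketch of the same loop against structured output, in case the text format ever changes (it assumes a Ceph release whose `ceph -s -f json` exposes pgmap.degraded_objects and a test image with jq, neither of which this chart guarantees):

  # Poll the structured status until no objects are reported degraded.
  while [ "$(ceph --cluster ${CLUSTER} -s -f json | jq '.pgmap.degraded_objects // 0')" -gt 0 ]
  do
    sleep 30
    ceph -s
  done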
@@ -38,20 +49,26 @@ function check_osd_count() {
       fi
     done
     echo "Caution: noup flag is set. ${count} OSDs in up/new state. Required number of OSDs: ${MIN_OSDS}."
     if [ $MIN_OSDS -gt $count ]; then
       exit 1
     fi
     exit 0
   else
     if [ "${num_osd}" -eq 0 ]; then
       echo "There are no osds in the cluster"
       exit 1
     elif [ "${num_in_osds}" -ge "${MIN_OSDS}" ] && [ "${num_up_osds}" -ge "${MIN_OSDS}" ]; then
       echo "Required number of OSDs (${MIN_OSDS}) are UP and IN status"
       exit 0
     else
       echo "Required number of OSDs (${MIN_OSDS}) are NOT UP and IN status. Cluster shows OSD count=${num_osd}, UP=${num_up_osds}, IN=${num_in_osds}"
       exit 1
     fi
   fi
 }
 
-check_osd_count
+# in case the chart has been re-installed in order to make changes to daemonset
+# we do not need rack_by_rack restarts
+# but we need to wait until all re-installed ceph-osd pods are healthy
+# and there are no degraded objects
+while true; do
+  check_osd_count
+  sleep 10
+done
+wait_for_degraded_objects
+ceph -s
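For reference, the MIN_OSDS threshold used above is derived earlier in the script (outside this hunk) from REQUIRED_PERCENT_OF_OSDS, which the test pod receives via its environment (see pod-helm-tests.yaml below). A worked sketch of that integer arithmetic, with illustrative numbers:

  num_osd=10                       # OSDs known to the cluster
  REQUIRED_PERCENT_OF_OSDS=75      # from the pod environment
  MIN_OSDS=$(( num_osd * REQUIRED_PERCENT_OF_OSDS / 100 ))  # 7; integer division truncates
  if [ ${MIN_OSDS} -lt 1 ]; then MIN_OSDS=1; fi             # never require zero OSDs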
ceph-osd/templates/bin/_post-apply.sh.tpl
@@ -111,7 +111,7 @@ function wait_for_pgs () {
     else
       (( pgs_ready+=1 ))
     fi
-    sleep 3
+    sleep 30
   done
 }
 
@@ -121,7 +121,7 @@ function wait_for_degraded_objects () {
   # Loop until no degraded objects
   while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep 'degraded'`" ]]
   do
-    sleep 3
+    sleep 30
     ceph -s
   done
 }
@@ -132,7 +132,7 @@ function wait_for_degraded_and_misplaced_objects () {
   # Loop until no degraded or misplaced objects
   while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep 'degraded\|misplaced'`" ]]
   do
-    sleep 3
+    sleep 30
     ceph -s
   done
 }
@@ -148,14 +148,17 @@ function restart_by_rack() {
     echo "hosts count under $rack are: ${#hosts_in_rack[@]}"
     for host in ${hosts_in_rack[@]}
     do
-      echo "host is : $host"
-      if [[ ! -z "$host" ]]; then
-        pods_on_host=`kubectl get po -n $CEPH_NAMESPACE -l component=osd -o wide |grep $host|awk '{print $1}'`
-        echo "Restartig the pods under host $host"
-        kubectl delete po -n $CEPH_NAMESPACE $pods_on_host
-      fi
+      echo "host is : $host"
+      if [[ ! -z "$host" ]]; then
+        pods_on_host=$(kubectl get po -n "$CEPH_NAMESPACE" -l component=osd -o wide |grep "$host"|awk '{print $1}' | tr '\n' ' '|sed 's/ *$//g')
+        echo "Restarting the pods under host $host"
+        for pod in ${pods_on_host}
+        do
+          kubectl delete pod -n "$CEPH_NAMESPACE" "${pod}" || true
+        done
+      fi
     done
-    echo "waiting for the pods under rack $rack from restart"
+    echo "waiting for the pods under host $host from restart"
     # The pods will not be ready in first 60 seconds. Thus we can reduce
     # amount of queries to kubernetes.
     sleep 60
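The per-pod loop with `|| true` is the functional change here: with the old bulk `kubectl delete po ... $pods_on_host`, any pod that vanished between the listing and the delete failed the whole command and, if the script runs under set -e, aborted the rack restart. A toy comparison (hypothetical pod names, for illustration only):

  # bulk: exits non-zero if pod-b is already gone, stopping the script
  kubectl delete po -n "$CEPH_NAMESPACE" pod-a pod-b pod-c

  # per-pod: a missing pod-b no longer prevents pod-c from restarting
  for pod in pod-a pod-b pod-c; do
    kubectl delete pod -n "$CEPH_NAMESPACE" "${pod}" || true
  done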
ceph-osd/templates/bin/osd/_log-runner-stop.sh.tpl (new file, 26 lines)
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+{{/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+
+set -ex
+
+source /tmp/utils-resolveLocations.sh
+
+TAIL_PID="$(cat /tmp/ceph-log-runner.pid)"
+while kill -0 ${TAIL_PID} >/dev/null 2>&1;
+do
+  kill -9 ${TAIL_PID};
+  sleep 1;
+done
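This script is what the on-delete hook for the log-runner container invokes (see the configmap and daemonset wiring below), and it assumes /tmp/ceph-log-runner.pid has been written by the log-runner itself, which the next hunk adds. Since `tail` holds no state worth flushing, SIGKILL in a loop is acceptable here; a process that did need to clean up would want a graceful window first — a sketch, not what the chart does:

  TAIL_PID="$(cat /tmp/ceph-log-runner.pid)"
  kill -SIGTERM "${TAIL_PID}" 2>/dev/null || true   # ask nicely first
  sleep 5                                           # grace period; tune to terminationGracePeriodSeconds
  while kill -0 "${TAIL_PID}" >/dev/null 2>&1; do   # still alive? force it
    kill -9 "${TAIL_PID}"
    sleep 1
  done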
ceph-osd/templates/bin/osd/_log-runner.sh.tpl
@@ -25,8 +25,9 @@ function tail_file () {
   while $keep_running; do
     tail --retry -f "${log_file}" &
     tail_pid=$!
+    echo $tail_pid > /tmp/ceph-log-runner.pid
     wait $tail_pid
-    sleep 1
+    sleep 10
   done
 }
 
ceph-osd/templates/bin/osd/_stop.sh.tpl
@@ -18,15 +18,18 @@ set -ex
 
 source /tmp/utils-resolveLocations.sh
 
+CEPH_OSD_PID="$(cat /run/ceph-osd.pid)"
+while kill -0 ${CEPH_OSD_PID} >/dev/null 2>&1; do
+  kill -SIGTERM ${CEPH_OSD_PID}
+  sleep 1
+done
+
 if [ "x${STORAGE_TYPE%-*}" == "xblock" ]; then
   OSD_DEVICE=$(readlink -f ${STORAGE_LOCATION})
   OSD_JOURNAL=$(readlink -f ${JOURNAL_LOCATION})
   if [ "x${STORAGE_TYPE#*-}" == "xlogical" ]; then
-    CEPH_OSD_PID="$(cat /run/ceph-osd.pid)"
-    while kill -0 ${CEPH_OSD_PID} >/dev/null 2>&1; do
-      kill -SIGTERM ${CEPH_OSD_PID}
-      sleep 1
-    done
     umount "$(findmnt -S "${OSD_DEVICE}1" | tail -n +2 | awk '{ print $1 }')"
   fi
 fi
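Hoisting the SIGTERM loop out of the block-logical branch means the OSD daemon is signaled for every storage layout, not only block-logical ones, which appears to be the stuck-at-deletion culprit this PS addresses. The umount pipeline that remains in the logical branch works like this (hypothetical device and mountpoint, for illustration):

  # findmnt -S <source> prints a header row plus the mount backed by that
  # device; `tail -n +2` drops the header and awk keeps the first column,
  # i.e. the mount target that umount needs.
  $ findmnt -S /dev/sdb1
  TARGET                   SOURCE    FSTYPE OPTIONS
  /var/lib/ceph/osd/ceph-3 /dev/sdb1 xfs    rw,noatime
  $ findmnt -S /dev/sdb1 | tail -n +2 | awk '{ print $1 }'
  /var/lib/ceph/osd/ceph-3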
ceph-osd/templates/configmap-bin.yaml
@@ -56,6 +56,8 @@ data:
 {{ tuple "bin/osd/_check.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   osd-stop.sh: |
 {{ tuple "bin/osd/_stop.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
+  log-runner-stop.sh: |
+{{ tuple "bin/osd/_log-runner-stop.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   init-dirs.sh: |
 {{ tuple "bin/_init-dirs.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
   helm-tests.sh: |
ceph-osd/templates/daemonset-osd.yaml
@@ -327,6 +327,10 @@ spec:
             - name: pod-var-log
               mountPath: /var/log/ceph
               readOnly: false
+            - name: ceph-osd-bin
+              mountPath: /tmp/log-runner-stop.sh
+              subPath: log-runner-stop.sh
+              readOnly: true
         - name: ceph-osd-default
{{ tuple $envAll "ceph_osd" | include "helm-toolkit.snippets.image" | indent 10 }}
{{ tuple $envAll $envAll.Values.pod.resources.osd | include "helm-toolkit.snippets.kubernetes_resources" | indent 10 }}
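Mounting the rendered script at /tmp/log-runner-stop.sh in the log-runner container is what makes it callable from that container's on-delete lifecycle hook. One way to exercise the hook path by hand (hypothetical pod selection; assumes the ceph namespace and a container named log-runner, per this chart):

  OSD_POD=$(kubectl -n ceph get pods -l component=osd -o name | head -1)
  kubectl -n ceph exec "${OSD_POD#pod/}" -c log-runner -- /tmp/log-runner-stop.sh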
ceph-osd/templates/pod-helm-tests.yaml
@@ -41,6 +41,8 @@ spec:
{{ tuple $envAll $envAll.Values.pod.resources.jobs.tests | include "helm-toolkit.snippets.kubernetes_resources" | indent 6 }}
{{ dict "envAll" $envAll "application" "test" "container" "ceph_cluster_helm_test" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 6 }}
       env:
         - name: CLUSTER
           value: "ceph"
+        - name: CEPH_DEPLOYMENT_NAMESPACE
+          value: {{ .Release.Namespace }}
         - name: REQUIRED_PERCENT_OF_OSDS
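CEPH_DEPLOYMENT_NAMESPACE lets the test pod address resources in its own release namespace. The new wait logic then runs as part of the ordinary chart test; a typical invocation, assuming a release named ceph-osd in the ceph namespace and that the test pod follows this chart's <release>-test naming:

  helm test ceph-osd --namespace ceph --timeout 900s
  kubectl -n ceph logs -f ceph-osd-test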
releasenotes/notes/ceph-osd.yaml
@@ -55,4 +55,5 @@ ceph-osd:
   - 0.1.52 Use quay.io/airshipit/kubernetes-entrypoint:latest-ubuntu_focal by default
   - 0.1.53 Update ceph-daemon to be able to use tini init system
   - 0.1.54 Remove use of tini for ceph-daemon
+  - 0.1.55 Update ceph-osd pod containers to make sure OSD pods are properly terminated at restart
 ...