[ceph-osd] Allow for unconditional OSD restart

This change allows OSDs to be restarted unconditionally by the
ceph-osd chart. This can be useful in upgrade scenarios where
ceph-osd pods are unhealthy during the upgrade.

Change-Id: I6de98db2b4eb1d76411e1dbffa65c263de3aecee
This commit is contained in:
Stephen Taylor 2022-04-04 13:35:49 -06:00
parent 50063c809c
commit 76fb2562c6
5 changed files with 39 additions and 25 deletions

View File

@ -15,6 +15,6 @@ apiVersion: v1
appVersion: v1.0.0 appVersion: v1.0.0
description: OpenStack-Helm Ceph OSD description: OpenStack-Helm Ceph OSD
name: ceph-osd name: ceph-osd
version: 0.1.38 version: 0.1.39
home: https://github.com/ceph/ceph home: https://github.com/ceph/ceph
... ...

View File

@ -188,8 +188,11 @@ done
echo "Latest revision of the helm chart(s) is : $max_release" echo "Latest revision of the helm chart(s) is : $max_release"
if [[ $max_release -gt 1 ]]; then # If flags are set that will prevent recovery, don't restart OSDs
if [[ $require_upgrade -gt 0 ]]; then ceph -s | grep "noup\|noin\|nobackfill\|norebalance\|norecover" > /dev/null
if [[ $? -ne 0 ]]; then
if [[ "$UNCONDITIONAL_OSD_RESTART" == "true" ]] || [[ $max_release -gt 1 ]]; then
if [[ "$UNCONDITIONAL_OSD_RESTART" == "true" ]] || [[ $require_upgrade -gt 0 ]]; then
if [[ "$DISRUPTIVE_OSD_RESTART" == "true" ]]; then if [[ "$DISRUPTIVE_OSD_RESTART" == "true" ]]; then
echo "restarting all osds simultaneously" echo "restarting all osds simultaneously"
kubectl -n $CEPH_NAMESPACE delete pod -l component=osd kubectl -n $CEPH_NAMESPACE delete pod -l component=osd
@ -216,3 +219,6 @@ if [[ $max_release -gt 1 ]]; then
else else
echo "No revisions found for upgrade" echo "No revisions found for upgrade"
fi fi
else
echo "Skipping OSD restarts because flags are set that would prevent recovery"
fi

View File

@ -104,6 +104,8 @@ spec:
value: {{ .Values.conf.ceph.target.required_percent_of_osds | ceil | quote }} value: {{ .Values.conf.ceph.target.required_percent_of_osds | ceil | quote }}
- name: DISRUPTIVE_OSD_RESTART - name: DISRUPTIVE_OSD_RESTART
value: {{ .Values.conf.storage.disruptive_osd_restart | quote }} value: {{ .Values.conf.storage.disruptive_osd_restart | quote }}
- name: UNCONDITIONAL_OSD_RESTART
value: {{ .Values.conf.storage.unconditional_osd_restart | quote }}
command: command:
- /tmp/post-apply.sh - /tmp/post-apply.sh
volumeMounts: volumeMounts:

View File

@ -293,6 +293,11 @@ conf:
# OSD restarts more quickly with disruption. # OSD restarts more quickly with disruption.
disruptive_osd_restart: "false" disruptive_osd_restart: "false"
# The post-apply job will try to determine if OSDs need to be restarted and
# only restart them if necessary. Set this value to "true" to restart OSDs
# unconditionally.
unconditional_osd_restart: "false"
# NOTE(portdirect): for heterogeneous clusters the overrides section can be used to define # NOTE(portdirect): for heterogeneous clusters the overrides section can be used to define
# OSD pods that will be deployed upon specific nodes. # OSD pods that will be deployed upon specific nodes.
# overrides: # overrides:

View File

@ -39,4 +39,5 @@ ceph-osd:
- 0.1.36 Add OSD device location pre-check - 0.1.36 Add OSD device location pre-check
- 0.1.37 Add a disruptive OSD restart to the post-apply job - 0.1.37 Add a disruptive OSD restart to the post-apply job
- 0.1.38 Skip pod wait in post-apply job when disruptive - 0.1.38 Skip pod wait in post-apply job when disruptive
- 0.1.39 Allow for unconditional OSD restart
... ...