From 8d6cc364b7d227013df29d87874bea9ad9cf0b17 Mon Sep 17 00:00:00 2001 From: Stephen Taylor Date: Fri, 7 Jul 2023 08:35:54 -0600 Subject: [PATCH] [ceph-osd] Extend the ceph-osd post-apply job PG wait In some cases, especially for disruptive OSD restarts on upgrade, PGs can take longer than the allowed ~30 seconds to get into a peering state. In these cases, the post-apply job fails prematurely instead of allowing time for the OSDs and PGs to recover. This change extends that timeout to ~10 minutes instead to allow the PGs plenty of recovery time. The only negative effect of this change is that a legitimate failure where the PGs can't recover will take 10 minutes to fail the post-apply job instead of 30 seconds. Change-Id: I9c22bb692385dbb7bc2816233c83c7472e071dd4 --- ceph-osd/Chart.yaml | 2 +- ceph-osd/templates/bin/_post-apply.sh.tpl | 4 ++-- releasenotes/notes/ceph-osd.yaml | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml index 6fdb49ffd..d10448c15 100644 --- a/ceph-osd/Chart.yaml +++ b/ceph-osd/Chart.yaml @@ -15,6 +15,6 @@ apiVersion: v1 appVersion: v1.0.0 description: OpenStack-Helm Ceph OSD name: ceph-osd -version: 0.1.44 +version: 0.1.45 home: https://github.com/ceph/ceph ... diff --git a/ceph-osd/templates/bin/_post-apply.sh.tpl b/ceph-osd/templates/bin/_post-apply.sh.tpl index 74229676c..42732612a 100644 --- a/ceph-osd/templates/bin/_post-apply.sh.tpl +++ b/ceph-osd/templates/bin/_post-apply.sh.tpl @@ -97,8 +97,8 @@ function wait_for_pgs () { while [[ $pgs_ready -lt 3 ]]; do pgs_state=$(ceph --cluster ${CLUSTER} pg ls -f json | jq -c "${query}") if [[ $(jq -c '. | select(.state | contains("peering") | not)' <<< "${pgs_state}") ]]; then - if [[ $pgs_inactive -gt 10 ]]; then - # If inactive PGs aren't peering, fail + if [[ $pgs_inactive -gt 200 ]]; then + # If inactive PGs aren't peering after ~10 minutes, fail echo "Failure, found inactive PGs that aren't peering" exit 1 fi diff --git a/releasenotes/notes/ceph-osd.yaml b/releasenotes/notes/ceph-osd.yaml index e101fcf3d..e4c50b4ed 100644 --- a/releasenotes/notes/ceph-osd.yaml +++ b/releasenotes/notes/ceph-osd.yaml @@ -45,4 +45,5 @@ ceph-osd: - 0.1.42 Added OCI registry authentication - 0.1.43 Update all Ceph images to Focal - 0.1.44 Update Ceph to 17.2.6 + - 0.1.45 Extend the ceph-osd post-apply job PG wait ...