Improve stability of Ceph cluster deployment script

Do not fail when some of the mon pods are not found while
checking their status; skip them and retry the check instead.

This is to avoid situations like the following:
```
2025-07-10 14:53:17.670728 | primary | + MON_PODS='rook-ceph-mon-a-canary-6d7bf54997-mtzmt
2025-07-10 14:53:17.670767 | primary | rook-ceph-mon-b-canary-7ff47b6fc6-sbtjh
2025-07-10 14:53:17.670781 | primary | rook-ceph-mon-c-canary-68cf8fb595-4jptf'
2025-07-10 14:53:17.670786 | primary | + for MON_POD in $MON_PODS
2025-07-10 14:53:17.670791 | primary | + kubectl get pod --namespace=ceph rook-ceph-mon-a-canary-6d7bf54997-mtzmt
2025-07-10 14:53:17.824501 | primary | + kubectl wait --namespace=ceph --for=condition=ready pod/rook-ceph-mon-a-canary-6d7bf54997-mtzmt --timeout=600s
2025-07-10 14:53:17.897216 | primary | Error from server (NotFound): pods "rook-ceph-mon-a-canary-6d7bf54997-mtzmt" not found

```

Change-Id: I7f10df4d9b395a5775aa3afd42e17dbd09855304
Signed-off-by: Vladimir Kozhukalov <kozhukalov@gmail.com>
This commit is contained in:
Vladimir Kozhukalov
2025-07-10 16:25:56 -05:00
parent 6c4dcad32d
commit 087091663f

View File

@@ -371,13 +371,25 @@ helm osh wait-for-pods rook-ceph
kubectl wait --namespace=ceph --for=condition=ready pod --selector=app=rook-ceph-tools --timeout=600s
# Wait for all monitor pods to be ready
MON_PODS=$(kubectl get pods --namespace=ceph --selector=app=rook-ceph-mon --no-headers | awk '{ print $1 }')
for MON_POD in $MON_PODS; do
if kubectl get pod --namespace=ceph "$MON_POD" > /dev/null 2>&1; then
kubectl wait --namespace=ceph --for=condition=ready "pod/$MON_POD" --timeout=600s
else
echo "Pod $MON_POD not found, skipping..."
fi
# Poll until every pod labelled app=rook-ceph-mon reports Ready, for at most
# 30 minutes (1800 s). The pod list is re-read on every pass because a pod
# that was listed may no longer exist by the time it is checked; such pods are
# skipped and the whole check is retried on the next pass.
wait_start_time=$(date +%s)
mon_pods_wait_ok=0
while (( $(date +%s) - wait_start_time < 1800 )); do
  sleep 30
  MON_PODS=$(kubectl get pods --namespace=ceph --selector=app=rook-ceph-mon --no-headers | awk '{ print $1 }')
  MON_PODS_NUM=0
  MON_PODS_READY=0
  # Word-splitting of $MON_PODS is intentional: one pod name per word.
  for MON_POD in $MON_PODS; do
    MON_PODS_NUM=$((MON_PODS_NUM + 1))
    if kubectl get pod --namespace=ceph "$MON_POD" > /dev/null 2>&1; then
      # The pod can still vanish between the check above and this wait, so a
      # failed wait is treated as "not ready" rather than as a fatal error.
      if kubectl wait --namespace=ceph --for=condition=ready "pod/$MON_POD" --timeout=60s; then
        MON_PODS_READY=$((MON_PODS_READY + 1))
      else
        echo "Pod $MON_POD not ready, skipping..."
      fi
    else
      echo "Pod $MON_POD not found, skipping..."
    fi
  done
  # An empty listing must not count as success: 0 ready out of 0 found would
  # otherwise satisfy the equality check below.
  if (( MON_PODS_NUM == 0 )); then
    echo "No monitor pods found yet, retrying..."
    continue
  fi
  if (( MON_PODS_READY == MON_PODS_NUM )); then
    echo "Monitor pods are ready. Moving on."
    mon_pods_wait_ok=1
    break
  fi
done
# Running out of time is a deployment failure; do not continue silently.
if (( mon_pods_wait_ok == 0 )); then
  echo "Timed out after 1800s waiting for monitor pods to become ready." >&2
  exit 1
fi
echo "=========== CEPH K8S PODS LIST ============"