Fix the restore-in-progress check

When performing the backup and restore (B&R) procedure with the
wipe_ceph_osds flag set and the rook-ceph backend configured, an
error occurred when removing the app.

This happened because the app's lifecycle queried the DB to check
whether a restore was in progress, and that check always returned
False, as the corresponding insert had not yet been performed when
this task ran.

To fix this, the database query has been replaced by a check for
the existence of the '/etc/platform/.restore_in_progress' flag file.

Test Plan:
- PASS: Build rook-ceph app
- PASS: optimized AIO-SX B&R with wipe_ceph_osds flag
- PASS: legacy STD + DX B&R with wipe_ceph_osds flag

Partial-Bug: 2086473

Change-Id: Ica3befe51ff08a53eb1b33af12e96fa4358e6c0f
Signed-off-by: Erickson Silva de Oliveira <Erickson.SilvadeOliveira@windriver.com>
This commit is contained in:
Erickson Silva de Oliveira 2024-11-02 15:55:44 -03:00
parent 0670391cad
commit 3fb190bfe6

View File

@ -12,6 +12,7 @@
import re import re
import json import json
from os import path
from time import sleep from time import sleep
from subprocess import run from subprocess import run
from string import Template from string import Template
@ -152,11 +153,8 @@ class RookCephAppLifecycleOperator(base.AppLifecycleOperator):
:param app: AppOperator.Application object :param app: AppOperator.Application object
""" """
LOG.info("Cleaning up the ceph cluster")
if (not self._verify_restore_in_progress(app_op._dbapi) or self.cluster_cleanup(app_op, context)
cutils.is_aio_simplex_system(app_op._dbapi)):
LOG.info("Cleaning up the ceph cluster")
self.cluster_cleanup(app_op, context)
LOG.info("Removing ceph alarms") LOG.info("Removing ceph alarms")
self.remove_alarms(app_op) self.remove_alarms(app_op)
@ -485,7 +483,7 @@ class RookCephAppLifecycleOperator(base.AppLifecycleOperator):
# ------- # -------
# Conditionally force clean the cluster in cleanup jobs not completed successfully # Conditionally force clean the cluster in cleanup jobs not completed successfully
# ------- # -------
if not is_jobs_completed: if not is_jobs_completed and not path.isfile(constants.RESTORE_IN_PROGRESS_FLAG):
LOG.info("Cleanup Jobs did not completed. Force removing finalizers and wiping OSDs") LOG.info("Cleanup Jobs did not completed. Force removing finalizers and wiping OSDs")
self.wipe_all_osds(app_op._dbapi, context) self.wipe_all_osds(app_op._dbapi, context)
self.remove_resource_finalizers() self.remove_resource_finalizers()
@ -885,7 +883,7 @@ class RookCephAppLifecycleOperator(base.AppLifecycleOperator):
dbapi = app_op._dbapi dbapi = app_op._dbapi
# Check if is being called by backup and restore process # Check if is being called by backup and restore process
if not self._verify_restore_in_progress(dbapi): if not path.isfile(constants.RESTORE_IN_PROGRESS_FLAG):
# CHECK AND FAIL: All hosts must be unlocked/enabled/{avaliable,degraded} # CHECK AND FAIL: All hosts must be unlocked/enabled/{avaliable,degraded}
hosts = self.get_hosts(dbapi, {}, only_rook=True) hosts = self.get_hosts(dbapi, {}, only_rook=True)
for host in hosts: for host in hosts:
@ -1816,16 +1814,6 @@ class RookCephAppLifecycleOperator(base.AppLifecycleOperator):
namespace="rook-ceph" namespace="rook-ceph"
) )
def _verify_restore_in_progress(self, dbapi):
"""Check if restore is in progress"""
try:
dbapi.restore_get_one(
filters={'state': constants.RESTORE_STATE_IN_PROGRESS})
except exception.NotFound:
return False
else:
return True
def create_job_to_rm_mon_data(self, hostname, mon_name): def create_job_to_rm_mon_data(self, hostname, mon_name):
LOG.info("Creating job to remove mon-%s data from %s" % (mon_name, hostname)) LOG.info("Creating job to remove mon-%s data from %s" % (mon_name, hostname))
remove_mon_job_template = self.get_rm_mon_data_job_template() remove_mon_job_template = self.get_rm_mon_data_job_template()