Improve dcmanager-state scalability
This commit includes the following changes:

- Implement the new fm-api methods for raising/clearing alarms in
  batches. The new keep_existing_alarm option is also used to make sure
  existing alarms are not updated, since we no longer check whether an
  alarm exists before raising it again.
- Move from FaultAPIs to FaultAPIsV2, which raises exceptions when FM
  reports an error, preventing state from continuing without
  clearing/raising alarms while FM is offline. This can happen during a
  swact, where the FM process is stopped before state.
- Introduce a db call to get the subcloud object and the current status
  of its endpoints instead of receiving a simplified subcloud through
  RPC. The reason is that dcmanager-audit processes faster than state,
  so until state updates, audit keeps sending information, causing
  duplicated updates and slowing down the time it takes to update every
  subcloud.
- Convert all logs to a default format with the subcloud name at the
  start, for better traceability. E.g.: "Subcloud: subcloud1. <msg>".
- Remove the unused function update_subcloud_sync_endpoint_type.

Test plan:
- PASS: Deploy a subcloud and verify state communicates to cert-mon
  that it became online and then updates the dc_cert endpoint after
  receiving the response.
- PASS: Manage the subcloud and verify all endpoints are updated and
  the final sync status is in-sync.
- PASS: Force a subcloud to have an out-of-sync kube root-ca and
  kubernetes and verify state correctly updates the db and raises the
  alarms.
- PASS: Turn off the subcloud and verify:
  - Subcloud availability was updated in db
  - All endpoints were updated in db
  - Dcorch was notified
  - All endpoint alarms were cleared
  - The offline alarm was raised
- PASS: Unmanage the subcloud and verify all endpoints, with the
  exception of dc_cert, were updated to unknown.
- PASS: Unmanage and stop the fm-mgs service and turn off the subcloud.
  Verify the subcloud is not updated to offline until fm comes back on.
- PASS: Perform scale tests and verify that updating availability and
  endpoints is faster.

Story: 2011311
Task: 52283

Depends-On: https://review.opendev.org/c/starlingx/fault/+/952671

Change-Id: I8792e1cbf8eb0af0cc9dd1be25987fac2503ecee
Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
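For reference, below is a minimal standalone sketch of the batch alarm pattern this change relies on (mirroring the new _raise_and_clear_subcloud_alarms_list helper in the diff). The FaultAPIsV2 batch methods and the keep_existing_alarm option come from the dependent fault change, and the ALARM_OUT_OF_SYNC constant name is an assumption, so treat the exact signatures as illustrative only:

from fm_api import constants as fm_const
from fm_api import fm_api

# FaultAPIsV2 raises on FM errors instead of failing silently, so a state
# update can be aborted (and retried later) while FM is unavailable, e.g.
# during a swact when the FM process is stopped before dcmanager-state.
fm = fm_api.FaultAPIsV2()

# Assumed to match dcmanager's module-level ALARM_OUT_OF_SYNC constant.
ALARM_OUT_OF_SYNC = fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC


def build_out_of_sync_fault(subcloud_name: str, endpoint: str) -> fm_api.Fault:
    """Build (but do not yet raise) an out-of-sync fault for one endpoint."""
    return fm_api.Fault(
        alarm_id=ALARM_OUT_OF_SYNC,
        alarm_state=fm_const.FM_ALARM_STATE_SET,
        entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
        entity_instance_id=f"subcloud={subcloud_name}.resource={endpoint}",
        severity=fm_const.FM_ALARM_SEVERITY_MAJOR,
        reason_text=f"{subcloud_name} {endpoint} sync_status is out-of-sync",
        alarm_type=fm_const.FM_ALARM_TYPE_0,
        probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2,
        proposed_repair_action="If problem persists contact next level of support",
        service_affecting=False,
        # New option from the dependent fault change: FM keeps an alarm that
        # already exists, so no get_fault() round-trip is needed per endpoint.
        keep_existing_alarm=True,
    )


def raise_and_clear_alarms(faults_to_raise, faults_to_clear):
    """Raise/clear subcloud alarms in batches (one FM call per list)."""
    if faults_to_clear:
        # faults_to_clear is a list of (alarm_id, entity_instance_id) tuples.
        fm.clear_faults_list(faults_to_clear)
    if faults_to_raise:
        # faults_to_raise is a list of fm_api.Fault objects.
        fm.set_faults(faults_to_raise)

Because FaultAPIsV2 raises on failure, the state manager invokes the batch helper before writing sync statuses to the database, so an endpoint is never recorded as updated while its alarm change is still pending.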
@@ -459,14 +459,12 @@ class SubcloudAuditManager(manager.Manager):
audit_kube_rootca_update = True
break
LOG.info(
"Triggered subcloud audit: firmware=(%s) kube=(%s) kube-rootca=(%s) "
"software=(%s)"
% (
audit_firmware,
audit_kubernetes,
audit_kube_rootca_update,
audit_software,
)
"Triggered subcloud audit: "
f"firmware=({audit_firmware}) "
f"kube=({audit_kubernetes}) "
f"kube-rootca=({audit_kube_rootca_update}) "
f"software=({audit_software}) "
f"update_state_only=({update_subcloud_state})"
)
(
firmware_audit_data,
@@ -570,20 +570,12 @@ class SubcloudAuditWorkerManager(manager.Manager):
)

if availability_data or (endpoint_data and any(endpoint_data.values())):
simplified_subcloud = {
"id": subcloud.id,
"name": subcloud.name,
"availability_status": subcloud.availability_status,
"management_state": subcloud.management_state,
"deploy_status": subcloud.deploy_status,
"region_name": subcloud.region_name,
}

try:
# If a value is not None, an update should be sent to the rpc client
bulk_update_subcloud_availability_and_endpoint_status(
self.context,
simplified_subcloud,
subcloud.id,
subcloud.name,
availability_data,
endpoint_data,
)
@@ -197,9 +197,11 @@ def subcloud_get(context, subcloud_id):
return IMPL.Connection(context).subcloud_get(subcloud_id)


def subcloud_get_with_status(context, subcloud_id):
def subcloud_get_with_status(context, subcloud_id, endpoint_type=None):
"""Retrieve a subcloud and all endpoint sync statuses."""
return IMPL.Connection(context).subcloud_get_with_status(subcloud_id)
return IMPL.Connection(context).subcloud_get_with_status(
subcloud_id, endpoint_type=endpoint_type
)


def subcloud_get_by_name(context, name) -> models.Subcloud:
@@ -449,8 +449,8 @@ class Connection(object):
return result

@require_context()
def subcloud_get_with_status(self, subcloud_id):
result = (
def subcloud_get_with_status(self, subcloud_id, endpoint_type=None):
query = (
model_query(self.context, models.Subcloud, models.SubcloudStatus)
.outerjoin(
models.SubcloudStatus,
@@ -459,9 +459,12 @@ class Connection(object):
)
.filter(models.Subcloud.id == subcloud_id)
.filter(models.Subcloud.deleted == 0)
.order_by(models.SubcloudStatus.endpoint_type)
.all()
)
if endpoint_type:
query = query.filter(
models.SubcloudStatus.endpoint_type == endpoint_type,
)
result = query.order_by(models.SubcloudStatus.endpoint_type).all()

if not result:
raise exception.SubcloudNotFound(subcloud_id=subcloud_id)
@@ -70,14 +70,15 @@ class SubcloudStateClient(RPCClient):
)

def bulk_update_subcloud_availability_and_endpoint_status(
self, ctxt, simplified_subcloud, availability_data, endpoint_data
self, ctxt, subcloud_id, subcloud_name, availability_data, endpoint_data
):
# Note: This is an asynchronous operation.
return self.cast(
ctxt,
self.make_msg(
"bulk_update_subcloud_availability_and_endpoint_status",
simplified_subcloud=simplified_subcloud,
subcloud_id=subcloud_id,
subcloud_name=subcloud_name,
availability_data=availability_data,
endpoint_data=endpoint_data,
),
@@ -25,6 +25,7 @@ import oslo_messaging
|
||||
from oslo_service import service
|
||||
|
||||
from dccommon import consts as dccommon_consts
|
||||
from dccommon import utils as cutils
|
||||
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.common import context
|
||||
@@ -77,7 +78,7 @@ class DCManagerStateService(service.Service):
|
||||
self.subcloud_state_manager = SubcloudStateManager()
|
||||
|
||||
def start(self):
|
||||
LOG.info("Starting %s", self.__class__.__name__)
|
||||
LOG.info(f"Starting {self.__class__.__name__}")
|
||||
utils.set_open_file_limit(cfg.CONF.state_worker_rlimit_nofile)
|
||||
self._init_managers()
|
||||
target = oslo_messaging.Target(
|
||||
@@ -99,10 +100,10 @@ class DCManagerStateService(service.Service):
|
||||
self._rpc_server.wait()
|
||||
LOG.info("Engine service stopped successfully")
|
||||
except Exception as ex:
|
||||
LOG.error("Failed to stop engine service: %s", str(ex))
|
||||
LOG.error(f"Failed to stop engine service: {str(ex)}")
|
||||
|
||||
def stop(self):
|
||||
LOG.info("Stopping %s", self.__class__.__name__)
|
||||
LOG.info(f"Stopping {self.__class__.__name__}")
|
||||
self._stop_rpc_server()
|
||||
# Terminate the engine process
|
||||
LOG.info("All threads were gone, terminating engine")
|
||||
@@ -111,20 +112,21 @@ class DCManagerStateService(service.Service):
|
||||
@request_context
|
||||
def update_subcloud_endpoint_status(
|
||||
self,
|
||||
context,
|
||||
subcloud_name=None,
|
||||
subcloud_region=None,
|
||||
endpoint_type=None,
|
||||
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
|
||||
alarmable=True,
|
||||
ignore_endpoints=None,
|
||||
):
|
||||
context: context.RequestContext,
|
||||
subcloud_name: str = None,
|
||||
subcloud_region: str = None,
|
||||
endpoint_type: str = None,
|
||||
sync_status: str = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
|
||||
alarmable: bool = True,
|
||||
ignore_endpoints: list[str] = None,
|
||||
) -> None:
|
||||
# Updates subcloud endpoint sync status
|
||||
LOG.info(
|
||||
"Handling update_subcloud_endpoint_status request for subcloud: "
|
||||
f"({subcloud_name if subcloud_name is not None else subcloud_region}) "
|
||||
name = subcloud_name if subcloud_name is not None else subcloud_region
|
||||
msg = (
|
||||
"Handling update_subcloud_endpoint_status request. "
|
||||
f"endpoint: ({endpoint_type}) status: ({sync_status})"
|
||||
)
|
||||
cutils.log_subcloud_msg(LOG.info, msg, name)
|
||||
|
||||
self.subcloud_state_manager.update_subcloud_endpoint_status(
|
||||
context,
|
||||
@@ -164,17 +166,16 @@ class DCManagerStateService(service.Service):
|
||||
@request_context
|
||||
def update_subcloud_availability(
|
||||
self,
|
||||
context,
|
||||
subcloud_name,
|
||||
subcloud_region,
|
||||
availability_status,
|
||||
update_state_only=False,
|
||||
audit_fail_count=None,
|
||||
):
|
||||
context: context.RequestContext,
|
||||
subcloud_name: str,
|
||||
subcloud_region: str,
|
||||
availability_status: str,
|
||||
update_state_only: bool = False,
|
||||
audit_fail_count: int = None,
|
||||
) -> None:
|
||||
# Updates subcloud availability
|
||||
LOG.info(
|
||||
"Handling update_subcloud_availability request for: %s" % subcloud_name
|
||||
)
|
||||
msg = "Handling update_subcloud_availability request"
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
self.subcloud_state_manager.update_subcloud_availability(
|
||||
context,
|
||||
subcloud_region,
|
||||
@@ -184,14 +185,17 @@ class DCManagerStateService(service.Service):
|
||||
)
|
||||
|
||||
def bulk_update_subcloud_availability_and_endpoint_status(
|
||||
self, context, simplified_subcloud, availability_data, endpoint_data
|
||||
):
|
||||
LOG.debug(
|
||||
"Handling bulk_update_subcloud_availability_and_endpoint_status request "
|
||||
f"for subcloud: {simplified_subcloud['name']}"
|
||||
)
|
||||
self,
|
||||
context: context.RequestContext,
|
||||
subcloud_id: int,
|
||||
subcloud_name: str,
|
||||
availability_data: dict,
|
||||
endpoint_data: dict[str, str],
|
||||
) -> None:
|
||||
msg = "Handling bulk_update_subcloud_availability_and_endpoint_status request"
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
|
||||
manager = self.subcloud_state_manager
|
||||
manager.bulk_update_subcloud_availability_and_endpoint_status(
|
||||
context, simplified_subcloud, availability_data, endpoint_data
|
||||
context, subcloud_id, subcloud_name, availability_data, endpoint_data
|
||||
)
|
||||
|
@@ -10,20 +10,19 @@
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
# Copyright (c) 2017-2024 Wind River Systems, Inc.
|
||||
# Copyright (c) 2017-2025 Wind River Systems, Inc.
|
||||
#
|
||||
# The right to copy, distribute, modify, or otherwise make use
|
||||
# of this software may be licensed only pursuant to the terms
|
||||
# of an applicable Wind River license agreement.
|
||||
#
|
||||
|
||||
import copy
|
||||
|
||||
from fm_api import constants as fm_const
|
||||
from fm_api import fm_api
|
||||
from oslo_log import log as logging
|
||||
|
||||
from dccommon import consts as dccommon_consts
|
||||
from dccommon import utils as cutils
|
||||
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.common import context
|
||||
@@ -31,7 +30,7 @@ from dcmanager.common import exceptions
|
||||
from dcmanager.common import manager
|
||||
from dcmanager.common import utils
|
||||
from dcmanager.db import api as db_api
|
||||
from dcmanager.db.sqlalchemy.models import Subcloud
|
||||
from dcmanager.db.sqlalchemy import models
|
||||
from dcmanager.rpc import client as rpc_client
|
||||
from dcorch.rpc import client as dcorch_rpc_client
|
||||
|
||||
@@ -43,7 +42,7 @@ def sync_update_subcloud_endpoint_status(func):
|
||||
"""Synchronized lock decorator for _update_subcloud_endpoint_status."""
|
||||
|
||||
def _get_lock_and_call(*args, **kwargs):
|
||||
"""Get a single fair lock per subcloud based on subcloud region."""
|
||||
"""Get a single fair lock per subcloud based on subcloud name/region."""
|
||||
|
||||
# subcloud region is the 3rd argument to
|
||||
# _update_subcloud_endpoint_status()
|
||||
@@ -67,18 +66,18 @@ class SubcloudStateManager(manager.Manager):
|
||||
)
|
||||
self.context = context.get_admin_context()
|
||||
self.dcorch_rpc_client = dcorch_rpc_client.EngineWorkerClient()
|
||||
self.fm_api = fm_api.FaultAPIs()
|
||||
self.fm_api = fm_api.FaultAPIsV2()
|
||||
self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient()
|
||||
|
||||
def _do_update_subcloud_endpoint_status(
|
||||
self,
|
||||
context,
|
||||
subcloud_id,
|
||||
endpoint_type,
|
||||
sync_status,
|
||||
alarmable,
|
||||
ignore_endpoints=None,
|
||||
):
|
||||
context: context.RequestContext,
|
||||
subcloud_id: int,
|
||||
endpoint_type: str,
|
||||
sync_status: str,
|
||||
alarmable: bool,
|
||||
ignore_endpoints: list[str],
|
||||
) -> None:
|
||||
"""Update online/managed subcloud endpoint status
|
||||
|
||||
:param context: request context object
|
||||
@@ -90,211 +89,114 @@ class SubcloudStateManager(manager.Manager):
|
||||
endpoint_type is None)
|
||||
"""
|
||||
|
||||
if ignore_endpoints is None:
|
||||
ignore_endpoints = []
|
||||
endpoint_status_dict = {}
|
||||
endpoint_to_update_list = []
|
||||
faults_to_raise = []
|
||||
faults_to_clear = []
|
||||
|
||||
# The subcloud object will always be the same, so we just keep the last one
|
||||
for subcloud, endpoint_status in db_api.subcloud_get_with_status(
|
||||
context,
|
||||
subcloud_id,
|
||||
endpoint_type=endpoint_type,
|
||||
):
|
||||
endpoint_status_dict[endpoint_status.endpoint_type] = endpoint_status
|
||||
|
||||
if endpoint_type:
|
||||
status = endpoint_status_dict.get(endpoint_type)
|
||||
if status and status.sync_status == sync_status:
|
||||
msg = f"Sync status ({sync_status}) did not change - ignoring update"
|
||||
cutils.log_subcloud_msg(LOG.debug, msg, subcloud.name)
|
||||
return
|
||||
elif not status:
|
||||
msg = f"Subcloud: {subcloud.name}. Endpoint {endpoint_type} not found"
|
||||
raise exceptions.BadRequest(
|
||||
resource="subcloud",
|
||||
msg=msg,
|
||||
)
|
||||
|
||||
self._trigger_subcloud_audits_after_identity_sync(
|
||||
context,
|
||||
subcloud_id,
|
||||
subcloud,
|
||||
sync_status,
|
||||
endpoint_type,
|
||||
endpoint_status_dict,
|
||||
)
|
||||
|
||||
for endpoint in endpoint_status_dict.values():
|
||||
if not endpoint_type and endpoint.endpoint_type in ignore_endpoints:
|
||||
continue
|
||||
endpoint_to_update_list.append(endpoint.endpoint_type)
|
||||
|
||||
entity_instance_id = (
|
||||
f"subcloud={subcloud.name}.resource={endpoint.endpoint_type}"
|
||||
)
|
||||
if sync_status != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC:
|
||||
faults_to_clear.append((ALARM_OUT_OF_SYNC, entity_instance_id))
|
||||
elif alarmable and (sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC):
|
||||
fault = self._create_fault_out_of_sync(subcloud.name, endpoint_type)
|
||||
faults_to_raise.append(fault)
|
||||
|
||||
subcloud_status_list = []
|
||||
subcloud = None
|
||||
original_identity_status = None
|
||||
# retrieve the info from the db for this subcloud.
|
||||
# subcloud_id should not be None
|
||||
try:
|
||||
for subcloud, subcloud_status in db_api.subcloud_get_with_status(
|
||||
context, subcloud_id
|
||||
):
|
||||
if subcloud_status:
|
||||
subcloud_status_list.append(
|
||||
db_api.subcloud_endpoint_status_db_model_to_dict(
|
||||
subcloud_status
|
||||
)
|
||||
)
|
||||
if (
|
||||
subcloud_status.endpoint_type
|
||||
== dccommon_consts.ENDPOINT_TYPE_IDENTITY
|
||||
):
|
||||
original_identity_status = subcloud_status.sync_status
|
||||
# We first want to raise/clear any alarms because in case of an
|
||||
# unresponsive FM, like we have during a swact, the operations wont process
|
||||
# again if the DB state is already the correct one, leading to a persistent
|
||||
# alarm
|
||||
self._raise_and_clear_subcloud_alarms_list(
|
||||
subcloud.name, faults_to_raise, faults_to_clear
|
||||
)
|
||||
if endpoint_to_update_list:
|
||||
db_api.subcloud_status_update_endpoints(
|
||||
context, subcloud_id, endpoint_to_update_list, sync_status
|
||||
)
|
||||
except Exception as e:
|
||||
LOG.exception(e)
|
||||
msg = f"Failed to update subcloud endpoint status: {e}"
|
||||
cutils.log_subcloud_msg(LOG.error, msg, subcloud.name)
|
||||
raise e
|
||||
|
||||
if subcloud:
|
||||
if endpoint_type:
|
||||
# updating a single endpoint on a single subcloud
|
||||
for subcloud_status in subcloud_status_list:
|
||||
if subcloud_status["endpoint_type"] == endpoint_type:
|
||||
if subcloud_status["sync_status"] == sync_status:
|
||||
# No change in the sync_status
|
||||
LOG.debug(
|
||||
"Sync status (%s) for subcloud %s did not change "
|
||||
"- ignore update" % (sync_status, subcloud.name)
|
||||
)
|
||||
return
|
||||
# We found the endpoint
|
||||
break
|
||||
else:
|
||||
# We did not find the endpoint
|
||||
raise exceptions.BadRequest(
|
||||
resource="subcloud",
|
||||
msg="Endpoint %s not found for subcloud" % endpoint_type,
|
||||
)
|
||||
def _trigger_subcloud_audits_after_identity_sync(
|
||||
self,
|
||||
context: context.RequestContext,
|
||||
subcloud_id: int,
|
||||
subcloud: models.Subcloud,
|
||||
sync_status: str,
|
||||
endpoint_type: str,
|
||||
endpoint_status_dict: dict[str, models.SubcloudStatus],
|
||||
) -> None:
|
||||
"""Trigger audits for a subcloud after the first identity sync is complete
|
||||
|
||||
LOG.info(
|
||||
"Updating subcloud:%s endpoint:%s sync:%s"
|
||||
% (subcloud.name, endpoint_type, sync_status)
|
||||
)
|
||||
db_api.subcloud_status_update(
|
||||
context, subcloud_id, endpoint_type, sync_status
|
||||
:param context: request context object
|
||||
:param subcloud_id: id of the subcloud to update
|
||||
:param subcloud: subcloud object
|
||||
:param sync_status: sync status to set
|
||||
:param endpoint_type: endpoint type to update
|
||||
:param endpoint_status_dict: dict of endpoint types and their status
|
||||
"""
|
||||
is_sync_not_unknown = sync_status != dccommon_consts.SYNC_STATUS_UNKNOWN
|
||||
identity_endpoint = endpoint_status_dict.get(
|
||||
dccommon_consts.ENDPOINT_TYPE_IDENTITY
|
||||
)
|
||||
is_identity_unknown = (
|
||||
identity_endpoint
|
||||
and identity_endpoint.sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
|
||||
)
|
||||
if (
|
||||
endpoint_type == dccommon_consts.ENDPOINT_TYPE_IDENTITY
|
||||
and is_sync_not_unknown
|
||||
and is_identity_unknown
|
||||
):
|
||||
if not subcloud.first_identity_sync_complete:
|
||||
db_api.subcloud_update(
|
||||
context, subcloud_id, first_identity_sync_complete=True
|
||||
)
|
||||
msg = "Request for audits after updating identity out of unknown"
|
||||
cutils.log_subcloud_msg(LOG.debug, msg, subcloud.name)
|
||||
self.audit_rpc_client.trigger_subcloud_audits(context, subcloud_id)
|
||||
|
||||
# Trigger subcloud audits for the subcloud after
|
||||
# its identity endpoint turns to other status from unknown
|
||||
is_sync_unknown = sync_status != dccommon_consts.SYNC_STATUS_UNKNOWN
|
||||
is_identity_unknown = (
|
||||
original_identity_status == dccommon_consts.SYNC_STATUS_UNKNOWN
|
||||
)
|
||||
if (
|
||||
endpoint_type == dccommon_consts.ENDPOINT_TYPE_IDENTITY
|
||||
and is_sync_unknown
|
||||
and is_identity_unknown
|
||||
):
|
||||
if not subcloud.first_identity_sync_complete:
|
||||
db_api.subcloud_update(
|
||||
context, subcloud_id, first_identity_sync_complete=True
|
||||
)
|
||||
LOG.debug(
|
||||
"Request for audits for %s after updating "
|
||||
"identity out of unknown" % subcloud.name
|
||||
)
|
||||
self.audit_rpc_client.trigger_subcloud_audits(context, subcloud_id)
|
||||
|
||||
entity_instance_id = "subcloud=%s.resource=%s" % (
|
||||
subcloud.name,
|
||||
endpoint_type,
|
||||
)
|
||||
fault = self.fm_api.get_fault(ALARM_OUT_OF_SYNC, entity_instance_id)
|
||||
|
||||
if (sync_status != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC) and fault:
|
||||
try:
|
||||
self.fm_api.clear_fault(ALARM_OUT_OF_SYNC, entity_instance_id)
|
||||
except Exception as e:
|
||||
LOG.exception(e)
|
||||
|
||||
elif (
|
||||
not fault
|
||||
and alarmable
|
||||
and (sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
|
||||
):
|
||||
entity_type_id = fm_const.FM_ENTITY_TYPE_SUBCLOUD
|
||||
try:
|
||||
|
||||
fault = fm_api.Fault(
|
||||
alarm_id=ALARM_OUT_OF_SYNC,
|
||||
alarm_state=fm_const.FM_ALARM_STATE_SET,
|
||||
entity_type_id=entity_type_id,
|
||||
entity_instance_id=entity_instance_id,
|
||||
severity=fm_const.FM_ALARM_SEVERITY_MAJOR,
|
||||
reason_text=(
|
||||
"%s %s sync_status is out-of-sync"
|
||||
% (subcloud.name, endpoint_type)
|
||||
),
|
||||
alarm_type=fm_const.FM_ALARM_TYPE_0,
|
||||
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2,
|
||||
proposed_repair_action=(
|
||||
"If problem persists contact next level of support"
|
||||
),
|
||||
service_affecting=False,
|
||||
)
|
||||
|
||||
self.fm_api.set_fault(fault)
|
||||
|
||||
except Exception as e:
|
||||
LOG.exception(e)
|
||||
|
||||
else:
|
||||
# update all endpoints on this subcloud
|
||||
LOG.info(
|
||||
"Updating all endpoints on subcloud: %s sync: %s "
|
||||
"ignore_endpoints: %s"
|
||||
% (subcloud.name, sync_status, ignore_endpoints)
|
||||
)
|
||||
|
||||
# TODO(yuxing): The following code can be further optimized when
|
||||
# batch alarm clearance APIs are available, so we don't need to
|
||||
# loop over all the endpoints of a given subcloud, e.g.
|
||||
# if not ignore_endpoints:
|
||||
# db_api.subcloud_status_update_endpoints_all(...)
|
||||
# else:
|
||||
# db_api.subcloud_status_update_endpoints(...)
|
||||
endpoint_to_update_list = []
|
||||
for entry in subcloud_status_list:
|
||||
endpoint = entry[consts.ENDPOINT_TYPE]
|
||||
if endpoint in ignore_endpoints:
|
||||
# Do not update this endpoint
|
||||
continue
|
||||
endpoint_to_update_list.append(endpoint)
|
||||
|
||||
entity_instance_id = "subcloud=%s.resource=%s" % (
|
||||
subcloud.name,
|
||||
endpoint,
|
||||
)
|
||||
|
||||
fault = self.fm_api.get_fault(ALARM_OUT_OF_SYNC, entity_instance_id)
|
||||
|
||||
# TODO(yuxing): batch clear all the out-of-sync alarms of a
|
||||
# given subcloud if fm_api support it. Be careful with the
|
||||
# dc-cert endpoint when adding the above; the endpoint
|
||||
# alarm must remain for offline subclouds.
|
||||
if (
|
||||
sync_status != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC
|
||||
) and fault:
|
||||
try:
|
||||
self.fm_api.clear_fault(
|
||||
ALARM_OUT_OF_SYNC, entity_instance_id
|
||||
)
|
||||
except Exception as e:
|
||||
LOG.exception(e)
|
||||
|
||||
elif (
|
||||
not fault
|
||||
and alarmable
|
||||
and (sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
|
||||
):
|
||||
entity_type_id = fm_const.FM_ENTITY_TYPE_SUBCLOUD
|
||||
try:
|
||||
fault = fm_api.Fault(
|
||||
alarm_id=ALARM_OUT_OF_SYNC,
|
||||
alarm_state=fm_const.FM_ALARM_STATE_SET,
|
||||
entity_type_id=entity_type_id,
|
||||
entity_instance_id=entity_instance_id,
|
||||
severity=fm_const.FM_ALARM_SEVERITY_MAJOR,
|
||||
reason_text=(
|
||||
"%s %s sync_status is out-of-sync"
|
||||
% (subcloud.name, endpoint)
|
||||
),
|
||||
alarm_type=fm_const.FM_ALARM_TYPE_0,
|
||||
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2,
|
||||
proposed_repair_action=(
|
||||
"If problem persists contact next level of support"
|
||||
),
|
||||
service_affecting=False,
|
||||
)
|
||||
|
||||
self.fm_api.set_fault(fault)
|
||||
except Exception as e:
|
||||
LOG.exception(e)
|
||||
|
||||
if endpoint_to_update_list:
|
||||
try:
|
||||
db_api.subcloud_status_update_endpoints(
|
||||
context, subcloud_id, endpoint_to_update_list, sync_status
|
||||
)
|
||||
except Exception as e:
|
||||
LOG.exception(e)
|
||||
|
||||
else:
|
||||
LOG.error("Subcloud not found:%s" % subcloud_id)
|
||||
|
||||
def _should_update_endpoint_status(self, subcloud, endpoint_type, sync_status):
|
||||
def _should_update_endpoint_status(
|
||||
self, subcloud: models.Subcloud, endpoint_type: str, sync_status: str
|
||||
) -> bool:
|
||||
"""Verifies if the subcloud's endpoint should have its sync status updated"""
|
||||
|
||||
# Rules for updating sync status:
|
||||
@@ -332,13 +234,13 @@ class SubcloudStateManager(manager.Manager):
|
||||
@sync_update_subcloud_endpoint_status
|
||||
def _update_subcloud_endpoint_status(
|
||||
self,
|
||||
context,
|
||||
subcloud_region,
|
||||
endpoint_type=None,
|
||||
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
|
||||
alarmable=True,
|
||||
ignore_endpoints=None,
|
||||
):
|
||||
context: context.RequestContext,
|
||||
subcloud_region: str,
|
||||
endpoint_type: str,
|
||||
sync_status: str,
|
||||
alarmable: bool,
|
||||
ignore_endpoints: list[str],
|
||||
) -> None:
|
||||
"""Update subcloud endpoint status
|
||||
|
||||
:param context: request context object
|
||||
@@ -350,9 +252,6 @@ class SubcloudStateManager(manager.Manager):
|
||||
endpoint_type is None)
|
||||
"""
|
||||
|
||||
if ignore_endpoints is None:
|
||||
ignore_endpoints = []
|
||||
|
||||
if not subcloud_region:
|
||||
raise exceptions.BadRequest(
|
||||
resource="subcloud", msg="Subcloud region not provided"
|
||||
@@ -379,19 +278,18 @@ class SubcloudStateManager(manager.Manager):
|
||||
LOG.exception(e)
|
||||
raise e
|
||||
else:
|
||||
LOG.info(
|
||||
"Ignoring subcloud sync_status update for subcloud:%s "
|
||||
"availability:%s management:%s endpoint:%s sync:%s"
|
||||
% (
|
||||
subcloud.name,
|
||||
subcloud.availability_status,
|
||||
subcloud.management_state,
|
||||
endpoint_type,
|
||||
sync_status,
|
||||
)
|
||||
msg = (
|
||||
f"Ignoring subcloud sync_status update. "
|
||||
f"Availability: {subcloud.availability_status}; "
|
||||
f"Management:{subcloud.management_state}; "
|
||||
f"Endpoint:{endpoint_type}; "
|
||||
f"sync_status:{sync_status}"
|
||||
)
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud.name)
|
||||
|
||||
def _create_fault(self, subcloud_name, endpoint):
|
||||
def _create_fault_out_of_sync(
|
||||
self, subcloud_name: str, endpoint: str
|
||||
) -> fm_api.Fault:
|
||||
"""Creates a fault for an endpoint out-of-sync
|
||||
|
||||
:param subcloud_name: subcloud's name
|
||||
@@ -412,21 +310,79 @@ class SubcloudStateManager(manager.Manager):
|
||||
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2,
|
||||
proposed_repair_action="If problem persists contact next level of support",
|
||||
service_affecting=False,
|
||||
keep_existing_alarm=True,
|
||||
)
|
||||
|
||||
def _create_fault_offline(self, subcloud_name: str) -> fm_api.Fault:
|
||||
"""Creates a fault for an offline subcloud
|
||||
|
||||
:param subcloud_name: subcloud's name
|
||||
|
||||
:return: an FM fault object
|
||||
:rtype: Fault
|
||||
"""
|
||||
entity_instance_id = f"subcloud={subcloud_name}"
|
||||
return fm_api.Fault(
|
||||
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
|
||||
alarm_state=fm_const.FM_ALARM_STATE_SET,
|
||||
entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
|
||||
entity_instance_id=entity_instance_id,
|
||||
severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
|
||||
reason_text=("%s is offline" % subcloud_name),
|
||||
alarm_type=fm_const.FM_ALARM_TYPE_0,
|
||||
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
|
||||
proposed_repair_action=(
|
||||
"Wait for subcloud to become online; if problem persists "
|
||||
"contact next level of support."
|
||||
),
|
||||
service_affecting=True,
|
||||
keep_existing_alarm=True,
|
||||
)
|
||||
|
||||
def bulk_update_subcloud_availability_and_endpoint_status(
|
||||
self, context, simplified_subcloud, availability_data, endpoint_data
|
||||
):
|
||||
self,
|
||||
context: context.RequestContext,
|
||||
subcloud_id: int,
|
||||
subcloud_name: str,
|
||||
availability_data: dict,
|
||||
endpoint_data: dict[str, str],
|
||||
) -> None:
|
||||
"""Bulk update subcloud availability and endpoint status
|
||||
|
||||
:param context: request context object
|
||||
:param subcloud_id: id of the subcloud to update
|
||||
:param subcloud_name: name of the subcloud to update
|
||||
:param availability_data: a dict containing the availability status,
|
||||
update_state_only and audit_fail_count
|
||||
:param endpoint_data: a dict containing the endpoint as key and its sync
|
||||
status as value
|
||||
"""
|
||||
# This bulk update is executed as part of the audit process in dcmanager and
|
||||
# its related endpoints. This method is not used by dcorch and cert-mon.
|
||||
|
||||
# When the request is performed through RPC, the subcloud object is sent as
|
||||
# a dict and needs to be redefined as a model
|
||||
subcloud = Subcloud(**simplified_subcloud)
|
||||
# The subcloud object will always be the same, so we just keep the last one
|
||||
unchanged_endpoints = []
|
||||
for subcloud, endpoint_status in db_api.subcloud_get_with_status(
|
||||
context,
|
||||
subcloud_id,
|
||||
):
|
||||
if (
|
||||
endpoint_data.get(endpoint_status.endpoint_type)
|
||||
== endpoint_status.sync_status
|
||||
):
|
||||
unchanged_endpoints.append(endpoint_status.endpoint_type)
|
||||
del endpoint_data[endpoint_status.endpoint_type]
|
||||
if unchanged_endpoints:
|
||||
msg = (
|
||||
"The following endpoints are already set to updated values, "
|
||||
f"not updating: {unchanged_endpoints}"
|
||||
)
|
||||
cutils.log_subcloud_msg(LOG.debug, msg, subcloud_name)
|
||||
|
||||
if availability_data:
|
||||
self.update_subcloud_availability(
|
||||
context,
|
||||
subcloud.name,
|
||||
subcloud.region_name,
|
||||
availability_data["availability_status"],
|
||||
availability_data["update_state_only"],
|
||||
@@ -438,14 +394,17 @@ class SubcloudStateManager(manager.Manager):
|
||||
|
||||
@sync_update_subcloud_endpoint_status
|
||||
def _do_bulk_update_subcloud_endpoint_status(
|
||||
self, context, region_name, subcloud_id, subcloud_name, endpoint_data
|
||||
):
|
||||
self,
|
||||
context: context.RequestContext,
|
||||
subcloud_name: str,
|
||||
subcloud_id: int,
|
||||
endpoint_data: dict[str, str],
|
||||
) -> None:
|
||||
"""Updates an online and managed subcloud's endpoints sync status
|
||||
|
||||
:param context: request context object
|
||||
:param region_name: region name of subcloud to update
|
||||
:param subcloud_id: id of the subcloud to update
|
||||
:param subcloud_name: name of the subcloud to update
|
||||
:param subcloud_id: id of the subcloud to update
|
||||
:param endpoint_data: a dict containing the endpoint as key and its sync
|
||||
status as value
|
||||
"""
|
||||
@@ -455,56 +414,23 @@ class SubcloudStateManager(manager.Manager):
|
||||
# the difference that only the required endpoints will be update and that'll
|
||||
# happen at once.
|
||||
status_to_set = [f"{key} ({value})" for key, value in endpoint_data.items()]
|
||||
LOG.info(
|
||||
f"Updating endpoints on subcloud: {subcloud_name} "
|
||||
f"endpoints: {', '.join(status_to_set)}"
|
||||
)
|
||||
msg = f"Updating endpoints: {', '.join(status_to_set)}"
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
|
||||
# For each endpoint in endpoint_data, decide whether an alarm should be set
|
||||
# or not and create it in case it's necessary.
|
||||
faults_to_set = dict()
|
||||
entity_instance_id = f"subcloud={subcloud_name}"
|
||||
faults_to_set = []
|
||||
faults_to_clear = []
|
||||
|
||||
# Acquire all existing alarms with the specified alarm_id for a subcloud.
|
||||
faults = self.fm_api.get_faults_by_id_n_eid(
|
||||
ALARM_OUT_OF_SYNC, entity_instance_id
|
||||
)
|
||||
|
||||
# Create a dictionary with the endpoint as key and fault as value
|
||||
if faults:
|
||||
for fault in faults:
|
||||
# The entity_instance_id is made out of
|
||||
# subcloud={subcloud.name}.resource={endpoint}
|
||||
endpoint = fault.entity_instance_id.split("resource=")[1]
|
||||
|
||||
# The uuid reset is necessary to avoid warnings in postgres.log
|
||||
# related to adding an element with an existing uuid
|
||||
fault.uuid = None
|
||||
faults_to_set[endpoint] = fault
|
||||
|
||||
# Copy the original dictionary created and, for each endpoint to be updated,
|
||||
# verify if it either needs to set or clear an alarm
|
||||
endpoints_with_faults = copy.deepcopy(list(faults_to_set.keys()))
|
||||
for endpoint, sync_status in endpoint_data.items():
|
||||
has_fault = True if endpoint in endpoints_with_faults else False
|
||||
|
||||
if sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC and not has_fault:
|
||||
faults_to_set[endpoint] = self._create_fault(subcloud_name, endpoint)
|
||||
elif sync_status != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC and has_fault:
|
||||
del faults_to_set[endpoint]
|
||||
|
||||
# If the original dictionary created and the updated one are different, i.e.
|
||||
# if an endpoint is removed or added to it, clear all faults for the subcloud
|
||||
# and set the updated dictionary.
|
||||
if set(endpoints_with_faults) != set(list(faults_to_set.keys())):
|
||||
try:
|
||||
self.fm_api.clear_fault(ALARM_OUT_OF_SYNC, entity_instance_id)
|
||||
self.fm_api.set_faults(faults_to_set.values())
|
||||
except Exception as e:
|
||||
LOG.exception(
|
||||
f"An error occurred when updating subcloud {subcloud_name} "
|
||||
f"alarms: {e}"
|
||||
if sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC:
|
||||
faults_to_set.append(
|
||||
self._create_fault_out_of_sync(subcloud_name, endpoint)
|
||||
)
|
||||
elif sync_status == dccommon_consts.SYNC_STATUS_IN_SYNC:
|
||||
entity_instance_id = f"subcloud={subcloud_name}.resource={endpoint}"
|
||||
faults_to_clear.append((ALARM_OUT_OF_SYNC, entity_instance_id))
|
||||
self._raise_and_clear_subcloud_alarms_list(
|
||||
subcloud_name, faults_to_set, faults_to_clear
|
||||
)
|
||||
|
||||
try:
|
||||
db_api.subcloud_status_bulk_update_endpoints(
|
||||
@@ -513,12 +439,15 @@ class SubcloudStateManager(manager.Manager):
|
||||
endpoint_data,
|
||||
)
|
||||
except Exception as e:
|
||||
LOG.exception(
|
||||
f"An error occured when updating the subcloud {subcloud_name}'s"
|
||||
f"endpoint status: {e}"
|
||||
)
|
||||
msg = f"An error occured when updating the subcloud endpoint status: {e}"
|
||||
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
|
||||
|
||||
def _bulk_update_subcloud_endpoint_status(self, context, subcloud, endpoint_data):
|
||||
def _bulk_update_subcloud_endpoint_status(
|
||||
self,
|
||||
context: context.RequestContext,
|
||||
subcloud: models.Subcloud,
|
||||
endpoint_data: dict[str, str],
|
||||
) -> None:
|
||||
"""Update the sync status of a list of subcloud endpoints
|
||||
|
||||
:param context: current context object
|
||||
@@ -540,31 +469,31 @@ class SubcloudStateManager(manager.Manager):
|
||||
try:
|
||||
self._do_bulk_update_subcloud_endpoint_status(
|
||||
context,
|
||||
subcloud.region_name,
|
||||
subcloud.id,
|
||||
subcloud.name,
|
||||
subcloud.id,
|
||||
endpoints_to_update,
|
||||
)
|
||||
except Exception as e:
|
||||
LOG.exception(e)
|
||||
raise e
|
||||
else:
|
||||
LOG.info(
|
||||
"Ignoring bulk_update_subcloud_endpoint_status for subcloud: "
|
||||
f"{subcloud.name} availability: {subcloud.availability_status} "
|
||||
f"management: {subcloud.management_state} endpoints: "
|
||||
f"{', '.join(endpoint_data.keys())}"
|
||||
msg = (
|
||||
f"No endpoints to update the status; "
|
||||
f"Availability: {subcloud.availability_status}; "
|
||||
f"Management: {subcloud.management_state};"
|
||||
f"Endpoints: {', '.join(endpoint_data.keys())}"
|
||||
)
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud.name)
|
||||
|
||||
def update_subcloud_endpoint_status(
|
||||
self,
|
||||
context,
|
||||
subcloud_region=None,
|
||||
endpoint_type=None,
|
||||
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
|
||||
alarmable=True,
|
||||
ignore_endpoints=None,
|
||||
):
|
||||
context: context.RequestContext,
|
||||
subcloud_region: str = None,
|
||||
endpoint_type: str = None,
|
||||
sync_status: str = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
|
||||
alarmable: bool = True,
|
||||
ignore_endpoints: list[str] = None,
|
||||
) -> None:
|
||||
"""Update subcloud endpoint status
|
||||
|
||||
:param context: request context object
|
||||
@@ -602,94 +531,102 @@ class SubcloudStateManager(manager.Manager):
|
||||
|
||||
def _update_subcloud_state(
|
||||
self,
|
||||
context,
|
||||
subcloud_name,
|
||||
subcloud_region,
|
||||
management_state,
|
||||
availability_status,
|
||||
):
|
||||
context: context.RequestContext,
|
||||
subcloud_name: str,
|
||||
subcloud_region: str,
|
||||
management_state: str,
|
||||
availability_status: str,
|
||||
) -> None:
|
||||
try:
|
||||
LOG.info(
|
||||
"Notifying dcorch, subcloud:%s management: %s, availability:%s"
|
||||
% (subcloud_name, management_state, availability_status)
|
||||
msg = (
|
||||
f"Notifying dcorch, management: {management_state}, "
|
||||
f"availability:{availability_status}"
|
||||
)
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
|
||||
self.dcorch_rpc_client.update_subcloud_states(
|
||||
context, subcloud_region, management_state, availability_status
|
||||
)
|
||||
|
||||
except Exception:
|
||||
LOG.exception(
|
||||
"Problem informing dcorch of subcloud state change, subcloud: %s"
|
||||
% subcloud_name
|
||||
)
|
||||
msg = "Problem informing dcorch of subcloud state change"
|
||||
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
|
||||
|
||||
def _raise_and_clear_subcloud_alarms_list(
|
||||
self,
|
||||
subcloud_name: str,
|
||||
faults_to_raise: list[fm_api.Fault] = None,
|
||||
faults_to_clear: list[tuple[str, str]] = None,
|
||||
) -> None:
|
||||
"""Raise/clear a list of subcloud alarms
|
||||
|
||||
:param faults_to_raise: list of faults to raise
|
||||
:param faults_to_clear: list of faults to clear
|
||||
"""
|
||||
if faults_to_clear:
|
||||
try:
|
||||
self.fm_api.clear_faults_list(faults_to_clear)
|
||||
except Exception as e:
|
||||
msg = "Failed to clear alarms from list"
|
||||
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
|
||||
raise e
|
||||
|
||||
if faults_to_raise:
|
||||
try:
|
||||
self.fm_api.set_faults(faults_to_raise)
|
||||
except Exception as e:
|
||||
msg = "Failed to raise alarms from list"
|
||||
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
|
||||
raise e
|
||||
|
||||
def _raise_or_clear_subcloud_status_alarm(
|
||||
self, subcloud_name, availability_status, deploy_status=None
|
||||
):
|
||||
entity_instance_id = "subcloud=%s" % subcloud_name
|
||||
fault = self.fm_api.get_fault(
|
||||
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, entity_instance_id
|
||||
)
|
||||
self, subcloud_name: str, availability_status: str, deploy_status: str = None
|
||||
) -> None:
|
||||
entity_instance_id = f"subcloud={subcloud_name}"
|
||||
|
||||
if fault and (availability_status == dccommon_consts.AVAILABILITY_ONLINE):
|
||||
if availability_status == dccommon_consts.AVAILABILITY_ONLINE:
|
||||
try:
|
||||
self.fm_api.clear_fault(
|
||||
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, entity_instance_id
|
||||
)
|
||||
except Exception:
|
||||
LOG.exception(
|
||||
"Failed to clear offline alarm for subcloud: %s", subcloud_name
|
||||
)
|
||||
except Exception as e:
|
||||
msg = "Failed to clear offline alarm"
|
||||
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
|
||||
raise e
|
||||
|
||||
# Raise the alarm if the subcloud became offline and it's not a
|
||||
# secondary subcloud
|
||||
elif not fault and (
|
||||
elif (
|
||||
availability_status == dccommon_consts.AVAILABILITY_OFFLINE
|
||||
and deploy_status != consts.DEPLOY_STATE_SECONDARY
|
||||
):
|
||||
try:
|
||||
fault = fm_api.Fault(
|
||||
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
|
||||
alarm_state=fm_const.FM_ALARM_STATE_SET,
|
||||
entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
|
||||
entity_instance_id=entity_instance_id,
|
||||
severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
|
||||
reason_text=("%s is offline" % subcloud_name),
|
||||
alarm_type=fm_const.FM_ALARM_TYPE_0,
|
||||
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
|
||||
proposed_repair_action=(
|
||||
"Wait for subcloud to become online; if problem persists "
|
||||
"contact next level of support."
|
||||
),
|
||||
service_affecting=True,
|
||||
)
|
||||
|
||||
fault = self._create_fault_offline(subcloud_name)
|
||||
self.fm_api.set_fault(fault)
|
||||
except Exception:
|
||||
LOG.exception(
|
||||
"Failed to raise offline alarm for subcloud: %s", subcloud_name
|
||||
)
|
||||
except Exception as e:
|
||||
msg = "Failed to raise offline alarm"
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
raise e
|
||||
|
||||
def update_subcloud_availability(
|
||||
self,
|
||||
context,
|
||||
subcloud_region,
|
||||
availability_status,
|
||||
update_state_only=False,
|
||||
audit_fail_count=None,
|
||||
subcloud=None,
|
||||
):
|
||||
context: context.RequestContext,
|
||||
subcloud_name: str,
|
||||
subcloud_region: str,
|
||||
availability_status: str,
|
||||
update_state_only: bool = False,
|
||||
audit_fail_count: int = None,
|
||||
subcloud: models.Subcloud = None,
|
||||
) -> None:
|
||||
if subcloud is None:
|
||||
try:
|
||||
subcloud = db_api.subcloud_get_by_region_name(context, subcloud_region)
|
||||
except Exception:
|
||||
LOG.exception(
|
||||
"Failed to get subcloud by region name %s" % subcloud_region
|
||||
)
|
||||
msg = f"Failed to get subcloud by region name: {subcloud_region}"
|
||||
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
|
||||
raise
|
||||
|
||||
if update_state_only:
|
||||
msg = "Received update_state_only request"
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
# Ensure that the status alarm is consistent with the
|
||||
# subcloud's availability. This is required to compensate
|
||||
# for rare alarm update failures, which may occur during
|
||||
@@ -716,12 +653,19 @@ class SubcloudStateManager(manager.Manager):
|
||||
except exceptions.SubcloudNotFound:
|
||||
# slim possibility subcloud could have been deleted since
|
||||
# we found it in db, ignore this benign error.
|
||||
LOG.info(
|
||||
"Ignoring SubcloudNotFound when attempting "
|
||||
"audit_fail_count update: %s" % subcloud.name
|
||||
msg = (
|
||||
"Ignoring SubcloudNotFound when attempting audit_fail_count update"
|
||||
)
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
return
|
||||
else:
|
||||
if availability_status == subcloud.availability_status:
|
||||
msg = (
|
||||
"Availability status hasn't changed from "
|
||||
f"{availability_status}, not updating"
|
||||
)
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
return
|
||||
self._raise_or_clear_subcloud_status_alarm(
|
||||
subcloud.name, availability_status
|
||||
)
|
||||
@@ -748,16 +692,15 @@ class SubcloudStateManager(manager.Manager):
|
||||
except exceptions.SubcloudNotFound:
|
||||
# slim possibility subcloud could have been deleted since
|
||||
# we found it in db, ignore this benign error.
|
||||
LOG.info(
|
||||
"Ignoring SubcloudNotFound when attempting state update: %s"
|
||||
% subcloud.name
|
||||
)
|
||||
msg = "Ignoring SubcloudNotFound when attempting state update"
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
return
|
||||
|
||||
if availability_status == dccommon_consts.AVAILABILITY_ONLINE:
|
||||
# Subcloud is going online
|
||||
# Tell cert-mon to audit endpoint certificate.
|
||||
LOG.info("Request for online audit for %s" % subcloud.name)
|
||||
msg = "Request for online audit"
|
||||
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
|
||||
dc_notification = rpc_client.DCManagerNotifications()
|
||||
dc_notification.subcloud_online(context, subcloud.region_name)
|
||||
# Trigger all the audits for the subcloud so it can update the
|
||||
@@ -772,48 +715,3 @@ class SubcloudStateManager(manager.Manager):
|
||||
updated_subcloud.management_state,
|
||||
availability_status,
|
||||
)
|
||||
|
||||
def update_subcloud_sync_endpoint_type(
|
||||
self, context, subcloud_region, endpoint_type_list, openstack_installed
|
||||
):
|
||||
operation = "add" if openstack_installed else "remove"
|
||||
func_switcher = {
|
||||
"add": (
|
||||
self.dcorch_rpc_client.add_subcloud_sync_endpoint_type,
|
||||
db_api.subcloud_status_create,
|
||||
),
|
||||
"remove": (
|
||||
self.dcorch_rpc_client.remove_subcloud_sync_endpoint_type,
|
||||
db_api.subcloud_status_delete,
|
||||
),
|
||||
}
|
||||
|
||||
try:
|
||||
subcloud = db_api.subcloud_get_by_region_name(context, subcloud_region)
|
||||
except Exception:
|
||||
LOG.exception("Failed to get subcloud by region name: %s" % subcloud_region)
|
||||
raise
|
||||
|
||||
try:
|
||||
# Notify dcorch to add/remove sync endpoint type list
|
||||
func_switcher[operation][0](
|
||||
self.context, subcloud_region, endpoint_type_list
|
||||
)
|
||||
LOG.info(
|
||||
"Notifying dcorch, subcloud: %s new sync endpoint: %s"
|
||||
% (subcloud.name, endpoint_type_list)
|
||||
)
|
||||
|
||||
# Update subcloud status table by adding/removing openstack sync
|
||||
# endpoint types
|
||||
for endpoint_type in endpoint_type_list:
|
||||
func_switcher[operation][1](self.context, subcloud.id, endpoint_type)
|
||||
# Update openstack_installed of subcloud table
|
||||
db_api.subcloud_update(
|
||||
self.context, subcloud.id, openstack_installed=openstack_installed
|
||||
)
|
||||
except Exception:
|
||||
LOG.exception(
|
||||
"Problem informing dcorch of subcloud sync endpoint "
|
||||
"type change, subcloud: %s" % subcloud.name
|
||||
)
|
||||
|
@@ -384,16 +384,6 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
|
||||
values.update(kwargs)
|
||||
return db_api.subcloud_create(ctxt, **values)
|
||||
|
||||
def create_simplified_subcloud(self, subcloud):
|
||||
return {
|
||||
"id": subcloud.id,
|
||||
"name": subcloud.name,
|
||||
"availability_status": subcloud.availability_status,
|
||||
"management_state": subcloud.management_state,
|
||||
"deploy_status": subcloud.deploy_status,
|
||||
"region_name": subcloud.region_name,
|
||||
}
|
||||
|
||||
def test_init(self):
|
||||
am = subcloud_audit_worker_manager.SubcloudAuditWorkerManager()
|
||||
self.assertIsNotNone(am)
|
||||
@@ -465,7 +455,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
|
||||
self._set_all_audits_in_sync()
|
||||
self.update_subcloud_availability_and_endpoint_status.assert_called_once_with(
|
||||
mock.ANY,
|
||||
self.create_simplified_subcloud(subcloud),
|
||||
subcloud.id,
|
||||
subcloud.name,
|
||||
self.availability_data,
|
||||
self.endpoint_data,
|
||||
)
|
||||
@@ -535,7 +526,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
|
||||
self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0)
|
||||
self.update_subcloud_availability_and_endpoint_status.assert_called_with(
|
||||
mock.ANY,
|
||||
self.create_simplified_subcloud(subcloud),
|
||||
subcloud.id,
|
||||
subcloud.name,
|
||||
self.availability_data,
|
||||
self.endpoint_data,
|
||||
)
|
||||
@@ -600,7 +592,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
|
||||
self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0)
|
||||
self.update_subcloud_availability_and_endpoint_status.assert_called_with(
|
||||
mock.ANY,
|
||||
self.create_simplified_subcloud(subcloud),
|
||||
subcloud.id,
|
||||
subcloud.name,
|
||||
self.availability_data,
|
||||
self.endpoint_data,
|
||||
)
|
||||
@@ -693,7 +686,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
|
||||
self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, True, None)
|
||||
self.update_subcloud_availability_and_endpoint_status.assert_called_with(
|
||||
mock.ANY,
|
||||
self.create_simplified_subcloud(subcloud),
|
||||
subcloud.id,
|
||||
subcloud.name,
|
||||
self.availability_data,
|
||||
self.endpoint_data,
|
||||
)
|
||||
@@ -781,7 +775,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
|
||||
self._set_all_audits_in_sync()
|
||||
self.update_subcloud_availability_and_endpoint_status.assert_called_once_with(
|
||||
mock.ANY,
|
||||
self.create_simplified_subcloud(subcloud),
|
||||
subcloud.id,
|
||||
subcloud.name,
|
||||
self.availability_data,
|
||||
self.endpoint_data,
|
||||
)
|
||||
@@ -976,7 +971,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
|
||||
self._update_availability(dccommon_consts.AVAILABILITY_OFFLINE, False, 2)
|
||||
self.update_subcloud_availability_and_endpoint_status.assert_called_with(
|
||||
mock.ANY,
|
||||
self.create_simplified_subcloud(subcloud),
|
||||
subcloud.id,
|
||||
subcloud.name,
|
||||
self.availability_data,
|
||||
self.endpoint_data,
|
||||
)
|
||||
|
@@ -472,16 +472,6 @@ class BaseTestSubcloudManager(base.DCManagerTestCase):
|
||||
values.update(kwargs)
|
||||
return db_api.subcloud_create(ctxt, **values)
|
||||
|
||||
def create_simplified_subcloud(self, subcloud):
|
||||
return {
|
||||
"id": subcloud.id,
|
||||
"name": subcloud.name,
|
||||
"availability_status": subcloud.availability_status,
|
||||
"management_state": subcloud.management_state,
|
||||
"deploy_status": subcloud.deploy_status,
|
||||
"region_name": subcloud.region_name,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def create_subcloud_peer_group_static(ctxt, **kwargs):
|
||||
values = {
|
||||
@@ -2377,7 +2367,10 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
|
||||
self.assertEqual(status.sync_status, dccommon_consts.SYNC_STATUS_UNKNOWN)
|
||||
|
||||
ssm.update_subcloud_availability(
|
||||
self.ctx, self.subcloud.region_name, dccommon_consts.AVAILABILITY_ONLINE
|
||||
self.ctx,
|
||||
self.subcloud.name,
|
||||
self.subcloud.region_name,
|
||||
dccommon_consts.AVAILABILITY_ONLINE,
|
||||
)
|
||||
|
||||
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, "subcloud1")
|
||||
@@ -2423,7 +2416,8 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
|
||||
ssm = subcloud_state_manager.SubcloudStateManager()
|
||||
ssm.bulk_update_subcloud_availability_and_endpoint_status(
|
||||
self.ctx,
|
||||
self.create_simplified_subcloud(self.subcloud),
|
||||
self.subcloud.id,
|
||||
self.subcloud.name,
|
||||
availability_data,
|
||||
endpoint_data,
|
||||
)
|
||||
@@ -2457,7 +2451,7 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
|
||||
|
||||
When the endpoint's status in the database is the same as the one it'll be
|
||||
updated to, ensure that, instead of validating, bulk_update_endpoint_status
|
||||
sets the same value in the database
|
||||
just skip it
|
||||
"""
|
||||
|
||||
db_api.subcloud_update(
|
||||
@@ -2475,23 +2469,25 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
|
||||
ssm = subcloud_state_manager.SubcloudStateManager()
|
||||
ssm.bulk_update_subcloud_availability_and_endpoint_status(
|
||||
self.ctx,
|
||||
self.create_simplified_subcloud(self.subcloud),
|
||||
self.subcloud.id,
|
||||
self.subcloud.name,
|
||||
None,
|
||||
endpoint_data,
|
||||
)
|
||||
|
||||
self.assertEqual(mock_db.call_count, 1)
|
||||
|
||||
# Re-executing the method should result in the same amount of call counts
|
||||
# Re-executing the method should result in no extra calls
|
||||
# for the database query since there are no updates
|
||||
ssm.bulk_update_subcloud_availability_and_endpoint_status(
|
||||
self.ctx,
|
||||
self.create_simplified_subcloud(self.subcloud),
|
||||
self.subcloud.id,
|
||||
self.subcloud.name,
|
||||
None,
|
||||
endpoint_data,
|
||||
)
|
||||
|
||||
self.assertEqual(mock_db.call_count, 2)
|
||||
self.assertEqual(mock_db.call_count, 1)
|
||||
|
||||
@mock.patch.object(
|
||||
subcloud_state_manager.SubcloudStateManager,
|
||||
@@ -2513,6 +2509,7 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
|
||||
with mock.patch.object(db_api, "subcloud_update") as subcloud_update_mock:
|
||||
ssm.update_subcloud_availability(
|
||||
self.ctx,
|
||||
self.subcloud.name,
|
||||
self.subcloud.region_name,
|
||||
availability_status=dccommon_consts.AVAILABILITY_ONLINE,
|
||||
update_state_only=True,
|
||||
@@ -2557,7 +2554,10 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
|
||||
self.assertEqual(status.sync_status, dccommon_consts.SYNC_STATUS_UNKNOWN)
|
||||
|
||||
ssm.update_subcloud_availability(
|
||||
self.ctx, self.subcloud.region_name, dccommon_consts.AVAILABILITY_ONLINE
|
||||
self.ctx,
|
||||
self.subcloud.name,
|
||||
self.subcloud.region_name,
|
||||
dccommon_consts.AVAILABILITY_ONLINE,
|
||||
)
|
||||
|
||||
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, "subcloud1")
|
||||
@@ -2614,6 +2614,7 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
|
||||
audit_fail_count = 1
|
||||
ssm.update_subcloud_availability(
|
||||
self.ctx,
|
||||
self.subcloud.name,
|
||||
self.subcloud.region_name,
|
||||
availability_status=None,
|
||||
audit_fail_count=audit_fail_count,
|
||||
@@ -2633,6 +2634,7 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
|
||||
audit_fail_count = audit_fail_count + 1
|
||||
ssm.update_subcloud_availability(
|
||||
self.ctx,
|
||||
self.subcloud.name,
|
||||
self.subcloud.region_name,
|
||||
dccommon_consts.AVAILABILITY_OFFLINE,
|
||||
audit_fail_count=audit_fail_count,
|
||||
|