Improve dcmanager-state scalability

This commit includes the following changes:

- Implement the new fm-api methods for raising/clearing alarms in
batches. The new keep_existing_alarm option is also used so that
existing alarms are not needlessly updated, since we no longer check
whether an alarm exists before raising it again (see the sketch after
this list).

- Move from FaultAPIs to FaultAPIsV2, which raises exceptions when FM
returns an error. This prevents state from continuing without
clearing/raising alarms while FM is offline, which can happen during a
swact when the FM process is stopped before state.

- Introduce a db call to get the subcloud object and the current
endpoint statuses instead of receiving a simplified subcloud through
RPC. This is done because dcmanager-audit processes faster than state,
so until state finishes updating, audit keeps sending the same
information, causing duplicated updates and slowing down how long it
takes to update every subcloud.

- Convert all logs to a default format with the subcloud name at the
start for better traceability, e.g. "Subcloud: subcloud1. <msg>" (see
the sketch after this list).

- Remove the unused function update_subcloud_sync_endpoint_type.
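
A minimal sketch of the batch raise/clear pattern described in the first
two bullets, assuming the set_faults/clear_faults_list methods and the
keep_existing_alarm flag added by the dependent fault change; it is
illustrative only, not the code introduced by this commit:

    from fm_api import constants as fm_const
    from fm_api import fm_api

    fm = fm_api.FaultAPIsV2()  # V2 raises exceptions on FM errors

    offline_fault = fm_api.Fault(
        alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
        alarm_state=fm_const.FM_ALARM_STATE_SET,
        entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
        entity_instance_id="subcloud=subcloud1",
        severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
        reason_text="subcloud1 is offline",
        alarm_type=fm_const.FM_ALARM_TYPE_0,
        probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
        proposed_repair_action=(
            "Wait for subcloud to become online; if problem persists "
            "contact next level of support."
        ),
        service_affecting=True,
        keep_existing_alarm=True,  # leave an identical existing alarm untouched
    )

    # One FM round-trip raises/clears many alarms. Because V2 raises on
    # FM errors instead of silently failing, the caller can stop before
    # touching the db and retry the whole update later (e.g. when FM is
    # stopped during a swact).
    fm.set_faults([offline_fault])
    fm.clear_faults_list(
        [(fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, "subcloud=subcloud2")]
    )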
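
For the log-format bullet, a hedged stand-in for the shared helper used
throughout the diff; the real implementation lives in dccommon.utils and
may differ, only the "Subcloud: <name>. <msg>" prefix comes from this
message:

    from oslo_log import log as logging

    LOG = logging.getLogger(__name__)

    def log_subcloud_msg(log_func, msg, subcloud_name=None):
        # Hypothetical sketch: prefix every message with the subcloud name.
        prefix = f"Subcloud: {subcloud_name}. " if subcloud_name else ""
        log_func(f"{prefix}{msg}")

    log_subcloud_msg(LOG.info, "Handling update_subcloud_availability request", "subcloud1")
    # logs: "Subcloud: subcloud1. Handling update_subcloud_availability request"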

Test plan:
  - PASS: Deploy a subcloud and verify state notifies cert-mon that the
          subcloud became online and then updates the dc_cert endpoint
          after receiving the response.
  - PASS: Manage the subcloud and verify all endpoints are updated and
          the final sync status is in-sync.
  - PASS: Force a subcloud to have an out-of-sync kube root-ca and
          kubernetes and verify state correctly updates the db and
          raises the alarms.
  - PASS: Turn off the subcloud and verify:
            - Subcloud availability was updated in db
            - All endpoints were updated in db
            - Dcorch was notified
            - All endpoints alarms were cleared
            - The offline alarm was raised
  - PASS: Unmanage the subcloud and verify all endpoints, with the
          exception of dc_cert, were updated to unknown.
  - PASS: Unmanage and stop the fm-mgs service and turn off the
          subcloud. Verify the subcloud is not updated to offline
          until fm comes back on.
  - PASS: Perform scale tests and verify that updating availability
          and endpoints is faster.

Story: 2011311
Task: 52283

Depends-on: https://review.opendev.org/c/starlingx/fault/+/952671

Change-Id: I8792e1cbf8eb0af0cc9dd1be25987fac2503ecee
Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>

View File

@@ -459,14 +459,12 @@ class SubcloudAuditManager(manager.Manager):
audit_kube_rootca_update = True
break
LOG.info(
"Triggered subcloud audit: firmware=(%s) kube=(%s) kube-rootca=(%s) "
"software=(%s)"
% (
audit_firmware,
audit_kubernetes,
audit_kube_rootca_update,
audit_software,
)
"Triggered subcloud audit: "
f"firmware=({audit_firmware}) "
f"kube=({audit_kubernetes}) "
f"kube-rootca=({audit_kube_rootca_update}) "
f"software=({audit_software}) "
f"update_state_only=({update_subcloud_state})"
)
(
firmware_audit_data,

View File

@@ -570,20 +570,12 @@ class SubcloudAuditWorkerManager(manager.Manager):
)
if availability_data or (endpoint_data and any(endpoint_data.values())):
simplified_subcloud = {
"id": subcloud.id,
"name": subcloud.name,
"availability_status": subcloud.availability_status,
"management_state": subcloud.management_state,
"deploy_status": subcloud.deploy_status,
"region_name": subcloud.region_name,
}
try:
# If a value is not None, an update should be sent to the rpc client
bulk_update_subcloud_availability_and_endpoint_status(
self.context,
simplified_subcloud,
subcloud.id,
subcloud.name,
availability_data,
endpoint_data,
)

View File

@@ -197,9 +197,11 @@ def subcloud_get(context, subcloud_id):
return IMPL.Connection(context).subcloud_get(subcloud_id)
def subcloud_get_with_status(context, subcloud_id):
def subcloud_get_with_status(context, subcloud_id, endpoint_type=None):
"""Retrieve a subcloud and all endpoint sync statuses."""
return IMPL.Connection(context).subcloud_get_with_status(subcloud_id)
return IMPL.Connection(context).subcloud_get_with_status(
subcloud_id, endpoint_type=endpoint_type
)
def subcloud_get_by_name(context, name) -> models.Subcloud:
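
A hedged usage sketch for the extended signature above (not part of the
change); the ENDPOINT_TYPE_IDENTITY constant and the (subcloud, status)
row shape are taken from other hunks in this commit:

    from dccommon import consts as dccommon_consts
    from dcmanager.db import api as db_api

    def get_identity_sync_status(context, subcloud_id):
        # Filtering on endpoint_type returns only the matching status row,
        # avoiding a load of every endpoint status for the subcloud.
        for subcloud, status in db_api.subcloud_get_with_status(
            context, subcloud_id, endpoint_type=dccommon_consts.ENDPOINT_TYPE_IDENTITY
        ):
            return subcloud.name, status.sync_status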

View File

@@ -449,8 +449,8 @@ class Connection(object):
return result
@require_context()
def subcloud_get_with_status(self, subcloud_id):
result = (
def subcloud_get_with_status(self, subcloud_id, endpoint_type=None):
query = (
model_query(self.context, models.Subcloud, models.SubcloudStatus)
.outerjoin(
models.SubcloudStatus,
@@ -459,9 +459,12 @@ class Connection(object):
)
.filter(models.Subcloud.id == subcloud_id)
.filter(models.Subcloud.deleted == 0)
.order_by(models.SubcloudStatus.endpoint_type)
.all()
)
if endpoint_type:
query = query.filter(
models.SubcloudStatus.endpoint_type == endpoint_type,
)
result = query.order_by(models.SubcloudStatus.endpoint_type).all()
if not result:
raise exception.SubcloudNotFound(subcloud_id=subcloud_id)

View File

@@ -70,14 +70,15 @@ class SubcloudStateClient(RPCClient):
)
def bulk_update_subcloud_availability_and_endpoint_status(
self, ctxt, simplified_subcloud, availability_data, endpoint_data
self, ctxt, subcloud_id, subcloud_name, availability_data, endpoint_data
):
# Note: This is an asynchronous operation.
return self.cast(
ctxt,
self.make_msg(
"bulk_update_subcloud_availability_and_endpoint_status",
simplified_subcloud=simplified_subcloud,
subcloud_id=subcloud_id,
subcloud_name=subcloud_name,
availability_data=availability_data,
endpoint_data=endpoint_data,
),
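
A hedged example of a caller building the new payload; the dict keys
follow the state manager docstring later in this change, while the
import path for SubcloudStateClient is assumed:

    from dccommon import consts as dccommon_consts
    from dcmanager.rpc import client as rpc_client  # assumed module for SubcloudStateClient

    def report_audit_result(ctxt, subcloud):
        # Hypothetical caller mirroring how the audit worker invokes the cast.
        availability_data = {
            "availability_status": dccommon_consts.AVAILABILITY_ONLINE,
            "update_state_only": False,
            "audit_fail_count": 0,
        }
        # endpoint type -> sync status
        endpoint_data = {
            dccommon_consts.ENDPOINT_TYPE_IDENTITY: dccommon_consts.SYNC_STATUS_IN_SYNC,
        }
        # Asynchronous cast: only the id and name travel over RPC; state
        # fetches the full subcloud and its statuses from the db itself.
        client = rpc_client.SubcloudStateClient()
        client.bulk_update_subcloud_availability_and_endpoint_status(
            ctxt, subcloud.id, subcloud.name, availability_data, endpoint_data
        )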

View File

@@ -25,6 +25,7 @@ import oslo_messaging
from oslo_service import service
from dccommon import consts as dccommon_consts
from dccommon import utils as cutils
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
from dcmanager.common import consts
from dcmanager.common import context
@@ -77,7 +78,7 @@ class DCManagerStateService(service.Service):
self.subcloud_state_manager = SubcloudStateManager()
def start(self):
LOG.info("Starting %s", self.__class__.__name__)
LOG.info(f"Starting {self.__class__.__name__}")
utils.set_open_file_limit(cfg.CONF.state_worker_rlimit_nofile)
self._init_managers()
target = oslo_messaging.Target(
@@ -99,10 +100,10 @@ class DCManagerStateService(service.Service):
self._rpc_server.wait()
LOG.info("Engine service stopped successfully")
except Exception as ex:
LOG.error("Failed to stop engine service: %s", str(ex))
LOG.error(f"Failed to stop engine service: {str(ex)}")
def stop(self):
LOG.info("Stopping %s", self.__class__.__name__)
LOG.info(f"Stopping {self.__class__.__name__}")
self._stop_rpc_server()
# Terminate the engine process
LOG.info("All threads were gone, terminating engine")
@@ -111,20 +112,21 @@ class DCManagerStateService(service.Service):
@request_context
def update_subcloud_endpoint_status(
self,
context,
subcloud_name=None,
subcloud_region=None,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
alarmable=True,
ignore_endpoints=None,
):
context: context.RequestContext,
subcloud_name: str = None,
subcloud_region: str = None,
endpoint_type: str = None,
sync_status: str = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
alarmable: bool = True,
ignore_endpoints: list[str] = None,
) -> None:
# Updates subcloud endpoint sync status
LOG.info(
"Handling update_subcloud_endpoint_status request for subcloud: "
f"({subcloud_name if subcloud_name is not None else subcloud_region}) "
name = subcloud_name if subcloud_name is not None else subcloud_region
msg = (
"Handling update_subcloud_endpoint_status request. "
f"endpoint: ({endpoint_type}) status: ({sync_status})"
)
cutils.log_subcloud_msg(LOG.info, msg, name)
self.subcloud_state_manager.update_subcloud_endpoint_status(
context,
@@ -164,17 +166,16 @@ class DCManagerStateService(service.Service):
@request_context
def update_subcloud_availability(
self,
context,
subcloud_name,
subcloud_region,
availability_status,
update_state_only=False,
audit_fail_count=None,
):
context: context.RequestContext,
subcloud_name: str,
subcloud_region: str,
availability_status: str,
update_state_only: bool = False,
audit_fail_count: int = None,
) -> None:
# Updates subcloud availability
LOG.info(
"Handling update_subcloud_availability request for: %s" % subcloud_name
)
msg = "Handling update_subcloud_availability request"
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
self.subcloud_state_manager.update_subcloud_availability(
context,
subcloud_region,
@@ -184,14 +185,17 @@ class DCManagerStateService(service.Service):
)
def bulk_update_subcloud_availability_and_endpoint_status(
self, context, simplified_subcloud, availability_data, endpoint_data
):
LOG.debug(
"Handling bulk_update_subcloud_availability_and_endpoint_status request "
f"for subcloud: {simplified_subcloud['name']}"
)
self,
context: context.RequestContext,
subcloud_id: int,
subcloud_name: str,
availability_data: dict,
endpoint_data: dict[str, str],
) -> None:
msg = "Handling bulk_update_subcloud_availability_and_endpoint_status request"
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
manager = self.subcloud_state_manager
manager.bulk_update_subcloud_availability_and_endpoint_status(
context, simplified_subcloud, availability_data, endpoint_data
context, subcloud_id, subcloud_name, availability_data, endpoint_data
)

View File

@@ -10,20 +10,19 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2017-2024 Wind River Systems, Inc.
# Copyright (c) 2017-2025 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
import copy
from fm_api import constants as fm_const
from fm_api import fm_api
from oslo_log import log as logging
from dccommon import consts as dccommon_consts
from dccommon import utils as cutils
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
from dcmanager.common import consts
from dcmanager.common import context
@@ -31,7 +30,7 @@ from dcmanager.common import exceptions
from dcmanager.common import manager
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.db.sqlalchemy.models import Subcloud
from dcmanager.db.sqlalchemy import models
from dcmanager.rpc import client as rpc_client
from dcorch.rpc import client as dcorch_rpc_client
@@ -43,7 +42,7 @@ def sync_update_subcloud_endpoint_status(func):
"""Synchronized lock decorator for _update_subcloud_endpoint_status."""
def _get_lock_and_call(*args, **kwargs):
"""Get a single fair lock per subcloud based on subcloud region."""
"""Get a single fair lock per subcloud based on subcloud name/region."""
# subcloud region is the 3rd argument to
# _update_subcloud_endpoint_status()
@@ -67,18 +66,18 @@ class SubcloudStateManager(manager.Manager):
)
self.context = context.get_admin_context()
self.dcorch_rpc_client = dcorch_rpc_client.EngineWorkerClient()
self.fm_api = fm_api.FaultAPIs()
self.fm_api = fm_api.FaultAPIsV2()
self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient()
def _do_update_subcloud_endpoint_status(
self,
context,
subcloud_id,
endpoint_type,
sync_status,
alarmable,
ignore_endpoints=None,
):
context: context.RequestContext,
subcloud_id: int,
endpoint_type: str,
sync_status: str,
alarmable: bool,
ignore_endpoints: list[str],
) -> None:
"""Update online/managed subcloud endpoint status
:param context: request context object
@@ -90,211 +89,114 @@ class SubcloudStateManager(manager.Manager):
endpoint_type is None)
"""
if ignore_endpoints is None:
ignore_endpoints = []
endpoint_status_dict = {}
endpoint_to_update_list = []
faults_to_raise = []
faults_to_clear = []
# The subcloud object will always be the same, so we just keep the last one
for subcloud, endpoint_status in db_api.subcloud_get_with_status(
context,
subcloud_id,
endpoint_type=endpoint_type,
):
endpoint_status_dict[endpoint_status.endpoint_type] = endpoint_status
if endpoint_type:
status = endpoint_status_dict.get(endpoint_type)
if status and status.sync_status == sync_status:
msg = f"Sync status ({sync_status}) did not change - ignoring update"
cutils.log_subcloud_msg(LOG.debug, msg, subcloud.name)
return
elif not status:
msg = f"Subcloud: {subcloud.name}. Endpoint {endpoint_type} not found"
raise exceptions.BadRequest(
resource="subcloud",
msg=msg,
)
self._trigger_subcloud_audits_after_identity_sync(
context,
subcloud_id,
subcloud,
sync_status,
endpoint_type,
endpoint_status_dict,
)
for endpoint in endpoint_status_dict.values():
if not endpoint_type and endpoint.endpoint_type in ignore_endpoints:
continue
endpoint_to_update_list.append(endpoint.endpoint_type)
entity_instance_id = (
f"subcloud={subcloud.name}.resource={endpoint.endpoint_type}"
)
if sync_status != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC:
faults_to_clear.append((ALARM_OUT_OF_SYNC, entity_instance_id))
elif alarmable and (sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC):
fault = self._create_fault_out_of_sync(subcloud.name, endpoint_type)
faults_to_raise.append(fault)
subcloud_status_list = []
subcloud = None
original_identity_status = None
# retrieve the info from the db for this subcloud.
# subcloud_id should not be None
try:
for subcloud, subcloud_status in db_api.subcloud_get_with_status(
context, subcloud_id
):
if subcloud_status:
subcloud_status_list.append(
db_api.subcloud_endpoint_status_db_model_to_dict(
subcloud_status
)
)
if (
subcloud_status.endpoint_type
== dccommon_consts.ENDPOINT_TYPE_IDENTITY
):
original_identity_status = subcloud_status.sync_status
# We first want to raise/clear any alarms because in case of an
# unresponsive FM, like we have during a swact, the operations won't process
# again if the DB state is already the correct one, leading to a persistent
# alarm
self._raise_and_clear_subcloud_alarms_list(
subcloud.name, faults_to_raise, faults_to_clear
)
if endpoint_to_update_list:
db_api.subcloud_status_update_endpoints(
context, subcloud_id, endpoint_to_update_list, sync_status
)
except Exception as e:
LOG.exception(e)
msg = f"Failed to update subcloud endpoint status: {e}"
cutils.log_subcloud_msg(LOG.error, msg, subcloud.name)
raise e
if subcloud:
if endpoint_type:
# updating a single endpoint on a single subcloud
for subcloud_status in subcloud_status_list:
if subcloud_status["endpoint_type"] == endpoint_type:
if subcloud_status["sync_status"] == sync_status:
# No change in the sync_status
LOG.debug(
"Sync status (%s) for subcloud %s did not change "
"- ignore update" % (sync_status, subcloud.name)
)
return
# We found the endpoint
break
else:
# We did not find the endpoint
raise exceptions.BadRequest(
resource="subcloud",
msg="Endpoint %s not found for subcloud" % endpoint_type,
)
def _trigger_subcloud_audits_after_identity_sync(
self,
context: context.RequestContext,
subcloud_id: int,
subcloud: models.Subcloud,
sync_status: str,
endpoint_type: str,
endpoint_status_dict: dict[str, models.SubcloudStatus],
) -> None:
"""Trigger audits for a subcloud after the first identity sync is complete
LOG.info(
"Updating subcloud:%s endpoint:%s sync:%s"
% (subcloud.name, endpoint_type, sync_status)
)
db_api.subcloud_status_update(
context, subcloud_id, endpoint_type, sync_status
:param context: request context object
:param subcloud_id: id of the subcloud to update
:param subcloud: subcloud object
:param sync_status: sync status to set
:param endpoint_type: endpoint type to update
:param endpoint_status_dict: dict of endpoint types and their status
"""
is_sync_not_unknown = sync_status != dccommon_consts.SYNC_STATUS_UNKNOWN
identity_endpoint = endpoint_status_dict.get(
dccommon_consts.ENDPOINT_TYPE_IDENTITY
)
is_identity_unknown = (
identity_endpoint
and identity_endpoint.sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
)
if (
endpoint_type == dccommon_consts.ENDPOINT_TYPE_IDENTITY
and is_sync_not_unknown
and is_identity_unknown
):
if not subcloud.first_identity_sync_complete:
db_api.subcloud_update(
context, subcloud_id, first_identity_sync_complete=True
)
msg = "Request for audits after updating identity out of unknown"
cutils.log_subcloud_msg(LOG.debug, msg, subcloud.name)
self.audit_rpc_client.trigger_subcloud_audits(context, subcloud_id)
# Trigger subcloud audits for the subcloud after
# its identity endpoint turns to other status from unknown
is_sync_unknown = sync_status != dccommon_consts.SYNC_STATUS_UNKNOWN
is_identity_unknown = (
original_identity_status == dccommon_consts.SYNC_STATUS_UNKNOWN
)
if (
endpoint_type == dccommon_consts.ENDPOINT_TYPE_IDENTITY
and is_sync_unknown
and is_identity_unknown
):
if not subcloud.first_identity_sync_complete:
db_api.subcloud_update(
context, subcloud_id, first_identity_sync_complete=True
)
LOG.debug(
"Request for audits for %s after updating "
"identity out of unknown" % subcloud.name
)
self.audit_rpc_client.trigger_subcloud_audits(context, subcloud_id)
entity_instance_id = "subcloud=%s.resource=%s" % (
subcloud.name,
endpoint_type,
)
fault = self.fm_api.get_fault(ALARM_OUT_OF_SYNC, entity_instance_id)
if (sync_status != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC) and fault:
try:
self.fm_api.clear_fault(ALARM_OUT_OF_SYNC, entity_instance_id)
except Exception as e:
LOG.exception(e)
elif (
not fault
and alarmable
and (sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
):
entity_type_id = fm_const.FM_ENTITY_TYPE_SUBCLOUD
try:
fault = fm_api.Fault(
alarm_id=ALARM_OUT_OF_SYNC,
alarm_state=fm_const.FM_ALARM_STATE_SET,
entity_type_id=entity_type_id,
entity_instance_id=entity_instance_id,
severity=fm_const.FM_ALARM_SEVERITY_MAJOR,
reason_text=(
"%s %s sync_status is out-of-sync"
% (subcloud.name, endpoint_type)
),
alarm_type=fm_const.FM_ALARM_TYPE_0,
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2,
proposed_repair_action=(
"If problem persists contact next level of support"
),
service_affecting=False,
)
self.fm_api.set_fault(fault)
except Exception as e:
LOG.exception(e)
else:
# update all endpoints on this subcloud
LOG.info(
"Updating all endpoints on subcloud: %s sync: %s "
"ignore_endpoints: %s"
% (subcloud.name, sync_status, ignore_endpoints)
)
# TODO(yuxing): The following code can be further optimized when
# batch alarm clearance APIs are available, so we don't need to
# loop over all the endpoints of a given subcloud, e.g.
# if not ignore_endpoints:
# db_api.subcloud_status_update_endpoints_all(...)
# else:
# db_api.subcloud_status_update_endpoints(...)
endpoint_to_update_list = []
for entry in subcloud_status_list:
endpoint = entry[consts.ENDPOINT_TYPE]
if endpoint in ignore_endpoints:
# Do not update this endpoint
continue
endpoint_to_update_list.append(endpoint)
entity_instance_id = "subcloud=%s.resource=%s" % (
subcloud.name,
endpoint,
)
fault = self.fm_api.get_fault(ALARM_OUT_OF_SYNC, entity_instance_id)
# TODO(yuxing): batch clear all the out-of-sync alarms of a
# given subcloud if fm_api support it. Be careful with the
# dc-cert endpoint when adding the above; the endpoint
# alarm must remain for offline subclouds.
if (
sync_status != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC
) and fault:
try:
self.fm_api.clear_fault(
ALARM_OUT_OF_SYNC, entity_instance_id
)
except Exception as e:
LOG.exception(e)
elif (
not fault
and alarmable
and (sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
):
entity_type_id = fm_const.FM_ENTITY_TYPE_SUBCLOUD
try:
fault = fm_api.Fault(
alarm_id=ALARM_OUT_OF_SYNC,
alarm_state=fm_const.FM_ALARM_STATE_SET,
entity_type_id=entity_type_id,
entity_instance_id=entity_instance_id,
severity=fm_const.FM_ALARM_SEVERITY_MAJOR,
reason_text=(
"%s %s sync_status is out-of-sync"
% (subcloud.name, endpoint)
),
alarm_type=fm_const.FM_ALARM_TYPE_0,
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2,
proposed_repair_action=(
"If problem persists contact next level of support"
),
service_affecting=False,
)
self.fm_api.set_fault(fault)
except Exception as e:
LOG.exception(e)
if endpoint_to_update_list:
try:
db_api.subcloud_status_update_endpoints(
context, subcloud_id, endpoint_to_update_list, sync_status
)
except Exception as e:
LOG.exception(e)
else:
LOG.error("Subcloud not found:%s" % subcloud_id)
def _should_update_endpoint_status(self, subcloud, endpoint_type, sync_status):
def _should_update_endpoint_status(
self, subcloud: models.Subcloud, endpoint_type: str, sync_status: str
) -> bool:
"""Verifies if the subcloud's endpoint should have its sync status updated"""
# Rules for updating sync status:
@@ -332,13 +234,13 @@ class SubcloudStateManager(manager.Manager):
@sync_update_subcloud_endpoint_status
def _update_subcloud_endpoint_status(
self,
context,
subcloud_region,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
alarmable=True,
ignore_endpoints=None,
):
context: context.RequestContext,
subcloud_region: str,
endpoint_type: str,
sync_status: str,
alarmable: bool,
ignore_endpoints: list[str],
) -> None:
"""Update subcloud endpoint status
:param context: request context object
@@ -350,9 +252,6 @@ class SubcloudStateManager(manager.Manager):
endpoint_type is None)
"""
if ignore_endpoints is None:
ignore_endpoints = []
if not subcloud_region:
raise exceptions.BadRequest(
resource="subcloud", msg="Subcloud region not provided"
@@ -379,19 +278,18 @@ class SubcloudStateManager(manager.Manager):
LOG.exception(e)
raise e
else:
LOG.info(
"Ignoring subcloud sync_status update for subcloud:%s "
"availability:%s management:%s endpoint:%s sync:%s"
% (
subcloud.name,
subcloud.availability_status,
subcloud.management_state,
endpoint_type,
sync_status,
)
msg = (
f"Ignoring subcloud sync_status update. "
f"Availability: {subcloud.availability_status}; "
f"Management:{subcloud.management_state}; "
f"Endpoint:{endpoint_type}; "
f"sync_status:{sync_status}"
)
cutils.log_subcloud_msg(LOG.info, msg, subcloud.name)
def _create_fault(self, subcloud_name, endpoint):
def _create_fault_out_of_sync(
self, subcloud_name: str, endpoint: str
) -> fm_api.Fault:
"""Creates a fault for an endpoint out-of-sync
:param subcloud_name: subcloud's name
@@ -412,21 +310,79 @@ class SubcloudStateManager(manager.Manager):
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2,
proposed_repair_action="If problem persists contact next level of support",
service_affecting=False,
keep_existing_alarm=True,
)
def _create_fault_offline(self, subcloud_name: str) -> fm_api.Fault:
"""Creates a fault for an offline subcloud
:param subcloud_name: subcloud's name
:return: an FM fault object
:rtype: Fault
"""
entity_instance_id = f"subcloud={subcloud_name}"
return fm_api.Fault(
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
alarm_state=fm_const.FM_ALARM_STATE_SET,
entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
entity_instance_id=entity_instance_id,
severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
reason_text=("%s is offline" % subcloud_name),
alarm_type=fm_const.FM_ALARM_TYPE_0,
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
proposed_repair_action=(
"Wait for subcloud to become online; if problem persists "
"contact next level of support."
),
service_affecting=True,
keep_existing_alarm=True,
)
def bulk_update_subcloud_availability_and_endpoint_status(
self, context, simplified_subcloud, availability_data, endpoint_data
):
self,
context: context.RequestContext,
subcloud_id: int,
subcloud_name: str,
availability_data: dict,
endpoint_data: dict[str, str],
) -> None:
"""Bulk update subcloud availability and endpoint status
:param context: request context object
:param subcloud_id: id of the subcloud to update
:param subcloud_name: name of the subcloud to update
:param availability_data: a dict containing the availability status,
update_state_only and audit_fail_count
:param endpoint_data: a dict containing the endpoint as key and its sync
status as value
"""
# This bulk update is executed as part of the audit process in dcmanager and
# its related endpoints. This method is not used by dcorch and cert-mon.
# When the request is performed through RPC, the subcloud object is sent as
# a dict and needs to be redefined as a model
subcloud = Subcloud(**simplified_subcloud)
# The subcloud object will always be the same, so we just keep the last one
unchanged_endpoints = []
for subcloud, endpoint_status in db_api.subcloud_get_with_status(
context,
subcloud_id,
):
if (
endpoint_data.get(endpoint_status.endpoint_type)
== endpoint_status.sync_status
):
unchanged_endpoints.append(endpoint_status.endpoint_type)
del endpoint_data[endpoint_status.endpoint_type]
if unchanged_endpoints:
msg = (
"The following endpoints are already set to updated values, "
f"not updating: {unchanged_endpoints}"
)
cutils.log_subcloud_msg(LOG.debug, msg, subcloud_name)
if availability_data:
self.update_subcloud_availability(
context,
subcloud.name,
subcloud.region_name,
availability_data["availability_status"],
availability_data["update_state_only"],
@@ -438,14 +394,17 @@ class SubcloudStateManager(manager.Manager):
@sync_update_subcloud_endpoint_status
def _do_bulk_update_subcloud_endpoint_status(
self, context, region_name, subcloud_id, subcloud_name, endpoint_data
):
self,
context: context.RequestContext,
subcloud_name: str,
subcloud_id: int,
endpoint_data: dict[str, str],
) -> None:
"""Updates an online and managed subcloud's endpoints sync status
:param context: request context object
:param region_name: region name of subcloud to update
:param subcloud_id: id of the subcloud to update
:param subcloud_name: name of the subcloud to update
:param subcloud_id: id of the subcloud to update
:param endpoint_data: a dict containing the endpoint as key and its sync
status as value
"""
@@ -455,56 +414,23 @@ class SubcloudStateManager(manager.Manager):
# the difference that only the required endpoints will be updated and that'll
# happen at once.
status_to_set = [f"{key} ({value})" for key, value in endpoint_data.items()]
LOG.info(
f"Updating endpoints on subcloud: {subcloud_name} "
f"endpoints: {', '.join(status_to_set)}"
)
msg = f"Updating endpoints: {', '.join(status_to_set)}"
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
# For each endpoint in endpoint_data, decide whether an alarm should be set
# or not and create it in case it's necessary.
faults_to_set = dict()
entity_instance_id = f"subcloud={subcloud_name}"
faults_to_set = []
faults_to_clear = []
# Acquire all existing alarms with the specified alarm_id for a subcloud.
faults = self.fm_api.get_faults_by_id_n_eid(
ALARM_OUT_OF_SYNC, entity_instance_id
)
# Create a dictionary with the endpoint as key and fault as value
if faults:
for fault in faults:
# The entity_instance_id is made out of
# subcloud={subcloud.name}.resource={endpoint}
endpoint = fault.entity_instance_id.split("resource=")[1]
# The uuid reset is necessary to avoid warnings in postgres.log
# related to adding an element with an existing uuid
fault.uuid = None
faults_to_set[endpoint] = fault
# Copy the original dictionary created and, for each endpoint to be updated,
# verify if it either needs to set or clear an alarm
endpoints_with_faults = copy.deepcopy(list(faults_to_set.keys()))
for endpoint, sync_status in endpoint_data.items():
has_fault = True if endpoint in endpoints_with_faults else False
if sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC and not has_fault:
faults_to_set[endpoint] = self._create_fault(subcloud_name, endpoint)
elif sync_status != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC and has_fault:
del faults_to_set[endpoint]
# If the original dictionary created and the updated one are different, i.e.
# if an endpoint is removed or added to it, clear all faults for the subcloud
# and set the updated dictionary.
if set(endpoints_with_faults) != set(list(faults_to_set.keys())):
try:
self.fm_api.clear_fault(ALARM_OUT_OF_SYNC, entity_instance_id)
self.fm_api.set_faults(faults_to_set.values())
except Exception as e:
LOG.exception(
f"An error occurred when updating subcloud {subcloud_name} "
f"alarms: {e}"
if sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC:
faults_to_set.append(
self._create_fault_out_of_sync(subcloud_name, endpoint)
)
elif sync_status == dccommon_consts.SYNC_STATUS_IN_SYNC:
entity_instance_id = f"subcloud={subcloud_name}.resource={endpoint}"
faults_to_clear.append((ALARM_OUT_OF_SYNC, entity_instance_id))
self._raise_and_clear_subcloud_alarms_list(
subcloud_name, faults_to_set, faults_to_clear
)
try:
db_api.subcloud_status_bulk_update_endpoints(
@@ -513,12 +439,15 @@ class SubcloudStateManager(manager.Manager):
endpoint_data,
)
except Exception as e:
LOG.exception(
f"An error occured when updating the subcloud {subcloud_name}'s"
f"endpoint status: {e}"
)
msg = f"An error occured when updating the subcloud endpoint status: {e}"
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
def _bulk_update_subcloud_endpoint_status(self, context, subcloud, endpoint_data):
def _bulk_update_subcloud_endpoint_status(
self,
context: context.RequestContext,
subcloud: models.Subcloud,
endpoint_data: dict[str, str],
) -> None:
"""Update the sync status of a list of subcloud endpoints
:param context: current context object
@@ -540,31 +469,31 @@ class SubcloudStateManager(manager.Manager):
try:
self._do_bulk_update_subcloud_endpoint_status(
context,
subcloud.region_name,
subcloud.id,
subcloud.name,
subcloud.id,
endpoints_to_update,
)
except Exception as e:
LOG.exception(e)
raise e
else:
LOG.info(
"Ignoring bulk_update_subcloud_endpoint_status for subcloud: "
f"{subcloud.name} availability: {subcloud.availability_status} "
f"management: {subcloud.management_state} endpoints: "
f"{', '.join(endpoint_data.keys())}"
msg = (
f"No endpoints to update the status; "
f"Availability: {subcloud.availability_status}; "
f"Management: {subcloud.management_state};"
f"Endpoints: {', '.join(endpoint_data.keys())}"
)
cutils.log_subcloud_msg(LOG.info, msg, subcloud.name)
def update_subcloud_endpoint_status(
self,
context,
subcloud_region=None,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
alarmable=True,
ignore_endpoints=None,
):
context: context.RequestContext,
subcloud_region: str = None,
endpoint_type: str = None,
sync_status: str = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
alarmable: bool = True,
ignore_endpoints: list[str] = None,
) -> None:
"""Update subcloud endpoint status
:param context: request context object
@@ -602,94 +531,102 @@ class SubcloudStateManager(manager.Manager):
def _update_subcloud_state(
self,
context,
subcloud_name,
subcloud_region,
management_state,
availability_status,
):
context: context.RequestContext,
subcloud_name: str,
subcloud_region: str,
management_state: str,
availability_status: str,
) -> None:
try:
LOG.info(
"Notifying dcorch, subcloud:%s management: %s, availability:%s"
% (subcloud_name, management_state, availability_status)
msg = (
f"Notifying dcorch, management: {management_state}, "
f"availability:{availability_status}"
)
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
self.dcorch_rpc_client.update_subcloud_states(
context, subcloud_region, management_state, availability_status
)
except Exception:
LOG.exception(
"Problem informing dcorch of subcloud state change, subcloud: %s"
% subcloud_name
)
msg = "Problem informing dcorch of subcloud state change"
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
def _raise_and_clear_subcloud_alarms_list(
self,
subcloud_name: str,
faults_to_raise: list[fm_api.Fault] = None,
faults_to_clear: list[tuple[str, str]] = None,
) -> None:
"""Raise/clear a list of subcloud alarms
:param faults_to_raise: list of faults to raise
:param faults_to_clear: list of faults to clear
"""
if faults_to_clear:
try:
self.fm_api.clear_faults_list(faults_to_clear)
except Exception as e:
msg = "Failed to clear alarms from list"
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
raise e
if faults_to_raise:
try:
self.fm_api.set_faults(faults_to_raise)
except Exception as e:
msg = "Failed to raise alarms from list"
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
raise e
def _raise_or_clear_subcloud_status_alarm(
self, subcloud_name, availability_status, deploy_status=None
):
entity_instance_id = "subcloud=%s" % subcloud_name
fault = self.fm_api.get_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, entity_instance_id
)
self, subcloud_name: str, availability_status: str, deploy_status: str = None
) -> None:
entity_instance_id = f"subcloud={subcloud_name}"
if fault and (availability_status == dccommon_consts.AVAILABILITY_ONLINE):
if availability_status == dccommon_consts.AVAILABILITY_ONLINE:
try:
self.fm_api.clear_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, entity_instance_id
)
except Exception:
LOG.exception(
"Failed to clear offline alarm for subcloud: %s", subcloud_name
)
except Exception as e:
msg = "Failed to clear offline alarm"
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
raise e
# Raise the alarm if the subcloud became offline and it's not a
# secondary subcloud
elif not fault and (
elif (
availability_status == dccommon_consts.AVAILABILITY_OFFLINE
and deploy_status != consts.DEPLOY_STATE_SECONDARY
):
try:
fault = fm_api.Fault(
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
alarm_state=fm_const.FM_ALARM_STATE_SET,
entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
entity_instance_id=entity_instance_id,
severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
reason_text=("%s is offline" % subcloud_name),
alarm_type=fm_const.FM_ALARM_TYPE_0,
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
proposed_repair_action=(
"Wait for subcloud to become online; if problem persists "
"contact next level of support."
),
service_affecting=True,
)
fault = self._create_fault_offline(subcloud_name)
self.fm_api.set_fault(fault)
except Exception:
LOG.exception(
"Failed to raise offline alarm for subcloud: %s", subcloud_name
)
except Exception as e:
msg = "Failed to raise offline alarm"
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
raise e
def update_subcloud_availability(
self,
context,
subcloud_region,
availability_status,
update_state_only=False,
audit_fail_count=None,
subcloud=None,
):
context: context.RequestContext,
subcloud_name: str,
subcloud_region: str,
availability_status: str,
update_state_only: bool = False,
audit_fail_count: int = None,
subcloud: models.Subcloud = None,
) -> None:
if subcloud is None:
try:
subcloud = db_api.subcloud_get_by_region_name(context, subcloud_region)
except Exception:
LOG.exception(
"Failed to get subcloud by region name %s" % subcloud_region
)
msg = f"Failed to get subcloud by region name: {subcloud_region}"
cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
raise
if update_state_only:
msg = "Received update_state_only request"
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
# Ensure that the status alarm is consistent with the
# subcloud's availability. This is required to compensate
# for rare alarm update failures, which may occur during
@@ -716,12 +653,19 @@ class SubcloudStateManager(manager.Manager):
except exceptions.SubcloudNotFound:
# slim possibility subcloud could have been deleted since
# we found it in db, ignore this benign error.
LOG.info(
"Ignoring SubcloudNotFound when attempting "
"audit_fail_count update: %s" % subcloud.name
msg = (
"Ignoring SubcloudNotFound when attempting audit_fail_count update"
)
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
return
else:
if availability_status == subcloud.availability_status:
msg = (
"Availability status hasn't changed from "
f"{availability_status}, not updating"
)
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
return
self._raise_or_clear_subcloud_status_alarm(
subcloud.name, availability_status
)
@@ -748,16 +692,15 @@ class SubcloudStateManager(manager.Manager):
except exceptions.SubcloudNotFound:
# slim possibility subcloud could have been deleted since
# we found it in db, ignore this benign error.
LOG.info(
"Ignoring SubcloudNotFound when attempting state update: %s"
% subcloud.name
)
msg = "Ignoring SubcloudNotFound when attempting state update"
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
return
if availability_status == dccommon_consts.AVAILABILITY_ONLINE:
# Subcloud is going online
# Tell cert-mon to audit endpoint certificate.
LOG.info("Request for online audit for %s" % subcloud.name)
msg = "Request for online audit"
cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
dc_notification = rpc_client.DCManagerNotifications()
dc_notification.subcloud_online(context, subcloud.region_name)
# Trigger all the audits for the subcloud so it can update the
@@ -772,48 +715,3 @@ class SubcloudStateManager(manager.Manager):
updated_subcloud.management_state,
availability_status,
)
def update_subcloud_sync_endpoint_type(
self, context, subcloud_region, endpoint_type_list, openstack_installed
):
operation = "add" if openstack_installed else "remove"
func_switcher = {
"add": (
self.dcorch_rpc_client.add_subcloud_sync_endpoint_type,
db_api.subcloud_status_create,
),
"remove": (
self.dcorch_rpc_client.remove_subcloud_sync_endpoint_type,
db_api.subcloud_status_delete,
),
}
try:
subcloud = db_api.subcloud_get_by_region_name(context, subcloud_region)
except Exception:
LOG.exception("Failed to get subcloud by region name: %s" % subcloud_region)
raise
try:
# Notify dcorch to add/remove sync endpoint type list
func_switcher[operation][0](
self.context, subcloud_region, endpoint_type_list
)
LOG.info(
"Notifying dcorch, subcloud: %s new sync endpoint: %s"
% (subcloud.name, endpoint_type_list)
)
# Update subcloud status table by adding/removing openstack sync
# endpoint types
for endpoint_type in endpoint_type_list:
func_switcher[operation][1](self.context, subcloud.id, endpoint_type)
# Update openstack_installed of subcloud table
db_api.subcloud_update(
self.context, subcloud.id, openstack_installed=openstack_installed
)
except Exception:
LOG.exception(
"Problem informing dcorch of subcloud sync endpoint "
"type change, subcloud: %s" % subcloud.name
)

View File

@@ -384,16 +384,6 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
values.update(kwargs)
return db_api.subcloud_create(ctxt, **values)
def create_simplified_subcloud(self, subcloud):
return {
"id": subcloud.id,
"name": subcloud.name,
"availability_status": subcloud.availability_status,
"management_state": subcloud.management_state,
"deploy_status": subcloud.deploy_status,
"region_name": subcloud.region_name,
}
def test_init(self):
am = subcloud_audit_worker_manager.SubcloudAuditWorkerManager()
self.assertIsNotNone(am)
@@ -465,7 +455,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
self._set_all_audits_in_sync()
self.update_subcloud_availability_and_endpoint_status.assert_called_once_with(
mock.ANY,
self.create_simplified_subcloud(subcloud),
subcloud.id,
subcloud.name,
self.availability_data,
self.endpoint_data,
)
@@ -535,7 +526,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0)
self.update_subcloud_availability_and_endpoint_status.assert_called_with(
mock.ANY,
self.create_simplified_subcloud(subcloud),
subcloud.id,
subcloud.name,
self.availability_data,
self.endpoint_data,
)
@@ -600,7 +592,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0)
self.update_subcloud_availability_and_endpoint_status.assert_called_with(
mock.ANY,
self.create_simplified_subcloud(subcloud),
subcloud.id,
subcloud.name,
self.availability_data,
self.endpoint_data,
)
@@ -693,7 +686,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, True, None)
self.update_subcloud_availability_and_endpoint_status.assert_called_with(
mock.ANY,
self.create_simplified_subcloud(subcloud),
subcloud.id,
subcloud.name,
self.availability_data,
self.endpoint_data,
)
@@ -781,7 +775,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
self._set_all_audits_in_sync()
self.update_subcloud_availability_and_endpoint_status.assert_called_once_with(
mock.ANY,
self.create_simplified_subcloud(subcloud),
subcloud.id,
subcloud.name,
self.availability_data,
self.endpoint_data,
)
@@ -976,7 +971,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
self._update_availability(dccommon_consts.AVAILABILITY_OFFLINE, False, 2)
self.update_subcloud_availability_and_endpoint_status.assert_called_with(
mock.ANY,
self.create_simplified_subcloud(subcloud),
subcloud.id,
subcloud.name,
self.availability_data,
self.endpoint_data,
)

View File

@@ -472,16 +472,6 @@ class BaseTestSubcloudManager(base.DCManagerTestCase):
values.update(kwargs)
return db_api.subcloud_create(ctxt, **values)
def create_simplified_subcloud(self, subcloud):
return {
"id": subcloud.id,
"name": subcloud.name,
"availability_status": subcloud.availability_status,
"management_state": subcloud.management_state,
"deploy_status": subcloud.deploy_status,
"region_name": subcloud.region_name,
}
@staticmethod
def create_subcloud_peer_group_static(ctxt, **kwargs):
values = {
@@ -2377,7 +2367,10 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
self.assertEqual(status.sync_status, dccommon_consts.SYNC_STATUS_UNKNOWN)
ssm.update_subcloud_availability(
self.ctx, self.subcloud.region_name, dccommon_consts.AVAILABILITY_ONLINE
self.ctx,
self.subcloud.name,
self.subcloud.region_name,
dccommon_consts.AVAILABILITY_ONLINE,
)
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, "subcloud1")
@@ -2423,7 +2416,8 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
ssm = subcloud_state_manager.SubcloudStateManager()
ssm.bulk_update_subcloud_availability_and_endpoint_status(
self.ctx,
self.create_simplified_subcloud(self.subcloud),
self.subcloud.id,
self.subcloud.name,
availability_data,
endpoint_data,
)
@@ -2457,7 +2451,7 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
When the endpoint's status in the database is the same as the one it'll be
updated to, ensure that, instead of validating, bulk_update_endpoint_status
sets the same value in the database
just skip it
"""
db_api.subcloud_update(
@@ -2475,23 +2469,25 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
ssm = subcloud_state_manager.SubcloudStateManager()
ssm.bulk_update_subcloud_availability_and_endpoint_status(
self.ctx,
self.create_simplified_subcloud(self.subcloud),
self.subcloud.id,
self.subcloud.name,
None,
endpoint_data,
)
self.assertEqual(mock_db.call_count, 1)
# Re-executing the method should result in the same amount of call counts
# Re-executing the method should result in no extra calls
# for the database query since there are no updates
ssm.bulk_update_subcloud_availability_and_endpoint_status(
self.ctx,
self.create_simplified_subcloud(self.subcloud),
self.subcloud.id,
self.subcloud.name,
None,
endpoint_data,
)
self.assertEqual(mock_db.call_count, 2)
self.assertEqual(mock_db.call_count, 1)
@mock.patch.object(
subcloud_state_manager.SubcloudStateManager,
@@ -2513,6 +2509,7 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
with mock.patch.object(db_api, "subcloud_update") as subcloud_update_mock:
ssm.update_subcloud_availability(
self.ctx,
self.subcloud.name,
self.subcloud.region_name,
availability_status=dccommon_consts.AVAILABILITY_ONLINE,
update_state_only=True,
@@ -2557,7 +2554,10 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
self.assertEqual(status.sync_status, dccommon_consts.SYNC_STATUS_UNKNOWN)
ssm.update_subcloud_availability(
self.ctx, self.subcloud.region_name, dccommon_consts.AVAILABILITY_ONLINE
self.ctx,
self.subcloud.name,
self.subcloud.region_name,
dccommon_consts.AVAILABILITY_ONLINE,
)
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, "subcloud1")
@@ -2614,6 +2614,7 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
audit_fail_count = 1
ssm.update_subcloud_availability(
self.ctx,
self.subcloud.name,
self.subcloud.region_name,
availability_status=None,
audit_fail_count=audit_fail_count,
@@ -2633,6 +2634,7 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
audit_fail_count = audit_fail_count + 1
ssm.update_subcloud_availability(
self.ctx,
self.subcloud.name,
self.subcloud.region_name,
dccommon_consts.AVAILABILITY_OFFLINE,
audit_fail_count=audit_fail_count,