Files
distcloud/distributedcloud/dcmanager/state/service.py
Victor Romano a3ddcf472d Improve dcmanager-state scalability
This commit includes the following changes:

- Implement the new fm-api methods regarding raising/clearing alarms
in batches. The new keep_existing_alarms option was also implemented
to make sure we don't update alarms, as we're not checking if they
exist before trying to raise them again.

- Moving from FaultAPIs to FaultAPIsV2, which raises exceptions if
there's an error in FM, preventing state from continuing without
clearing/raising alarms when FM is offline. This can happen during a
swact where FM process is stopped before state.

- Introduce a db call to get the subcloud object and current status
of endpoint instead of receiving a simplified subcloud through RPC.
The reason for doing this instead of a simplified subcloud is that
dcmanager-audit is faster to process than state, so until state
updates, audit will keep sending information causing duplicated
updates, slowing down the time it takes to update every subcloud.

- Convert all logs into a default format with the subcloud name at
the start, for better traceability. E.g: "Subcloud: subcloud1. <msg>".

- Removed unused function update_subcloud_sync_endpoint_type.

Test plan:
  - PASS: Deploy a subcloud and verify state communicates to cert-mon
          that it became online and then updates the dc_cert endpoint
          after receiving the response.
  - PASS: Manage the subcloud and verify all endpoints are updated and
          the final sync status is in-sync.
  - PASS: Force a subcloud to have an out-of-sync kube root-ca and
          kubernetes and verify state correctly updates the db and
          raises the alarms.
  - PASS: Turn off the subcloud and verify:
            - Subcloud availability was updated in db
            - All endpoints were updated in db
            - Dcorch was notified
            - All endpoints alarms were cleared
            - The offline alarm was raised
  - PASS: Unmanage the subcloud and verify all endpoints, with the
          exception of dc_cert, were updated to unknown.
  - PASS: Unmanage and stop the fm-mgr service and turn off the
          subcloud. Verify the subcloud is not updated to offline
          until fm comes back on.
  - PASS: Perform scale tests and verify that updating availability
          and endpoints is faster.

Story: 2011311
Task: 52283

Depends-on: https://review.opendev.org/c/starlingx/fault/+/952671

Change-Id: I8792e1cbf8eb0af0cc9dd1be25987fac2503ecee
Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
2025-06-16 09:39:59 -03:00

202 lines
7.3 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2017-2025 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
import functools
from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging
from oslo_service import service
from dccommon import consts as dccommon_consts
from dccommon import utils as cutils
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import messaging as rpc_messaging
from dcmanager.common import utils
from dcmanager.state.subcloud_state_manager import SubcloudStateManager
LOG = logging.getLogger(__name__)
def request_context(func):
    """Decorate an RPC endpoint to normalize its context argument.

    The incoming ``ctx`` is rebuilt as a ``context.RequestContext`` when
    the RPC layer hands over a serialized/foreign context object, and any
    ``DCManagerException`` raised by the endpoint is converted into an
    oslo.messaging ``ExpectedException`` so the dispatcher forwards it to
    the caller without logging a server-side traceback.
    """

    @functools.wraps(func)
    def wrapped(self, ctx, *args, **kwargs):
        needs_rebuild = ctx is not None and not isinstance(
            ctx, context.RequestContext
        )
        if needs_rebuild:
            ctx = context.RequestContext.from_dict(ctx.to_dict())
        try:
            return func(self, ctx, *args, **kwargs)
        except exceptions.DCManagerException:
            # ExpectedException captures sys.exc_info(), so the original
            # error still reaches the RPC client.
            raise oslo_messaging.rpc.dispatcher.ExpectedException()

    return wrapped
class DCManagerStateService(service.Service):
    """Lifecycle manager for a running dcmanager-state service.

    - All the methods in here are called from the RPC client.
    - If a RPC call does not have a corresponding method here, an exception
      will be thrown.
    - Arguments to these calls are added dynamically and will be treated as
      keyword arguments by the RPC client.
    """

    def __init__(self, host):
        super().__init__()
        # NOTE: the host argument is ignored; cfg.CONF.host is used instead.
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_DC_MANAGER_STATE
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.engine_id = None
        self.target = None
        self._rpc_server = None
        self.subcloud_state_manager = None
        self.audit_rpc_client = None

    def _init_managers(self):
        # Each worker process builds its own state manager post-fork.
        self.subcloud_state_manager = SubcloudStateManager()

    def start(self):
        """Bring up the RPC server and helper clients for this worker."""
        LOG.info(f"Starting {self.__class__.__name__}")
        utils.set_open_file_limit(cfg.CONF.state_worker_rlimit_nofile)
        self._init_managers()
        self.target = oslo_messaging.Target(
            version=self.rpc_api_version, server=self.host, topic=self.topic
        )
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()
        # Used to notify dcmanager-audit
        self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient()
        super().start()

    def _stop_rpc_server(self):
        """Stop the RPC server so no new requests are accepted."""
        LOG.debug(_("Attempting to stop engine service..."))
        try:
            self._rpc_server.stop()
            self._rpc_server.wait()
        except Exception as ex:
            LOG.error(f"Failed to stop engine service: {str(ex)}")
        else:
            LOG.info("Engine service stopped successfully")

    def stop(self):
        """Shut down RPC handling, then terminate the engine."""
        LOG.info(f"Stopping {self.__class__.__name__}")
        self._stop_rpc_server()
        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine")
        super().stop()

    @request_context
    def update_subcloud_endpoint_status(
        self,
        context: context.RequestContext,
        subcloud_name: str = None,
        subcloud_region: str = None,
        endpoint_type: str = None,
        sync_status: str = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
        alarmable: bool = True,
        ignore_endpoints: list[str] = None,
    ) -> None:
        """Update the sync status of a single subcloud endpoint."""
        name = subcloud_region if subcloud_name is None else subcloud_name
        msg = (
            "Handling update_subcloud_endpoint_status request. "
            f"endpoint: ({endpoint_type}) status: ({sync_status})"
        )
        cutils.log_subcloud_msg(LOG.info, msg, name)
        self.subcloud_state_manager.update_subcloud_endpoint_status(
            context,
            subcloud_region,
            endpoint_type,
            sync_status,
            alarmable,
            ignore_endpoints,
        )
        # When an endpoint's sync status drops to unknown, trigger the
        # matching audit right away so the real status is re-established
        # as soon as possible.
        if sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN:
            audit_triggers = {
                dccommon_consts.AUDIT_TYPE_SOFTWARE: (
                    self.audit_rpc_client.trigger_software_audit
                ),
                dccommon_consts.ENDPOINT_TYPE_FIRMWARE: (
                    self.audit_rpc_client.trigger_firmware_audit
                ),
                dccommon_consts.ENDPOINT_TYPE_KUBERNETES: (
                    self.audit_rpc_client.trigger_kubernetes_audit
                ),
            }
            trigger_audit = audit_triggers.get(endpoint_type)
            if trigger_audit is not None:
                trigger_audit(context)

    @request_context
    def update_subcloud_availability(
        self,
        context: context.RequestContext,
        subcloud_name: str,
        subcloud_region: str,
        availability_status: str,
        update_state_only: bool = False,
        audit_fail_count: int = None,
    ) -> None:
        """Update the availability status of a subcloud."""
        cutils.log_subcloud_msg(
            LOG.info, "Handling update_subcloud_availability request", subcloud_name
        )
        self.subcloud_state_manager.update_subcloud_availability(
            context,
            subcloud_region,
            availability_status,
            update_state_only,
            audit_fail_count,
        )

    def bulk_update_subcloud_availability_and_endpoint_status(
        self,
        context: context.RequestContext,
        subcloud_id: int,
        subcloud_name: str,
        availability_data: dict,
        endpoint_data: dict[str, str],
    ) -> None:
        """Update a subcloud's availability and endpoint statuses in bulk."""
        msg = "Handling bulk_update_subcloud_availability_and_endpoint_status request"
        cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
        # Delegate the whole batch to the state manager in a single call.
        self.subcloud_state_manager.bulk_update_subcloud_availability_and_endpoint_status(
            context, subcloud_id, subcloud_name, availability_data, endpoint_data
        )