
This commit includes the following changes:

- Implement the new fm-api methods for raising/clearing alarms in
  batches. The new keep_existing_alarms option was also implemented to
  ensure alarms are not updated needlessly, since we don't check whether
  an alarm already exists before trying to raise it again.
- Move from FaultAPIs to FaultAPIsV2, which raises exceptions if there's
  an error in FM, preventing state from continuing and leaving alarms
  unraised/uncleared while FM is offline. This can happen during a swact,
  where the FM process is stopped before state.
- Introduce a db call to get the subcloud object and the current status
  of each endpoint instead of receiving a simplified subcloud through
  RPC. The reason for doing this is that dcmanager-audit processes
  faster than state, so until state updates, audit keeps sending
  information, causing duplicated updates and slowing down the time it
  takes to update every subcloud.
- Convert all logs to a default format with the subcloud name at the
  start, for better traceability. E.g.: "Subcloud: subcloud1. <msg>".
- Remove the unused function update_subcloud_sync_endpoint_type.

Test plan:
- PASS: Deploy a subcloud and verify state communicates to cert-mon that
  it became online and then updates the dc_cert endpoint after receiving
  the response.
- PASS: Manage the subcloud and verify all endpoints are updated and the
  final sync status is in-sync.
- PASS: Force a subcloud to have an out-of-sync kube root-ca and
  kubernetes and verify state correctly updates the db and raises the
  alarms.
- PASS: Turn off the subcloud and verify:
  - Subcloud availability was updated in db
  - All endpoints were updated in db
  - Dcorch was notified
  - All endpoint alarms were cleared
  - The offline alarm was raised
- PASS: Unmanage the subcloud and verify all endpoints, with the
  exception of dc_cert, were updated to unknown.
- PASS: Unmanage and stop the fm-mgr service and turn off the subcloud.
  Verify the subcloud is not updated to offline until fm comes back on.
- PASS: Perform scale tests and verify that updating availability and
  endpoints is faster.

Story: 2011311
Task: 52283
Depends-On: https://review.opendev.org/c/starlingx/fault/+/952671
Change-Id: I8792e1cbf8eb0af0cc9dd1be25987fac2503ecee
Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
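As a rough illustration of the unified log format described above, here is a
minimal sketch of a helper equivalent to the log_subcloud_msg call used in the
file below (the actual dccommon.utils implementation may differ; this version
is an assumption based on the documented format):

    # Hypothetical stand-in for dccommon.utils.log_subcloud_msg: prefixes
    # each message with the subcloud name when one is known.
    def log_subcloud_msg(log_func, msg, subcloud_name=None):
        prefix = f"Subcloud: {subcloud_name}. " if subcloud_name else ""
        log_func(f"{prefix}{msg}")

    # Usage: log_subcloud_msg(LOG.info, "Handling request", "subcloud1")
    # logs "Subcloud: subcloud1. Handling request"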
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2017-2025 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#

import functools

from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging
from oslo_service import service

from dccommon import consts as dccommon_consts
from dccommon import utils as cutils
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import messaging as rpc_messaging
from dcmanager.common import utils
from dcmanager.state.subcloud_state_manager import SubcloudStateManager

LOG = logging.getLogger(__name__)


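# Decorator for the RPC endpoint methods below: coerces the incoming
# context into a RequestContext and converts DCManagerException into
# oslo.messaging's ExpectedException, so expected failures are returned
# to the RPC caller without a server-side traceback.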
def request_context(func):
    @functools.wraps(func)
    def wrapped(self, ctx, *args, **kwargs):
        if ctx is not None and not isinstance(ctx, context.RequestContext):
            ctx = context.RequestContext.from_dict(ctx.to_dict())
        try:
            return func(self, ctx, *args, **kwargs)
        except exceptions.DCManagerException:
            raise oslo_messaging.rpc.dispatcher.ExpectedException()

    return wrapped


class DCManagerStateService(service.Service):
    """Lifecycle manager for a running service.

    - All the methods in here are called from the RPC client.
    - If an RPC call does not have a corresponding method here, an exception
      will be thrown.
    - Arguments to these calls are added dynamically and will be treated as
      keyword arguments by the RPC client.
    """

    def __init__(self, host):
        super(DCManagerStateService, self).__init__()
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_DC_MANAGER_STATE
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.engine_id = None
        self.target = None
        self._rpc_server = None
        self.subcloud_state_manager = None
        self.audit_rpc_client = None

    def _init_managers(self):
        self.subcloud_state_manager = SubcloudStateManager()

    def start(self):
        LOG.info(f"Starting {self.__class__.__name__}")
        utils.set_open_file_limit(cfg.CONF.state_worker_rlimit_nofile)
        self._init_managers()
        target = oslo_messaging.Target(
            version=self.rpc_api_version, server=self.host, topic=self.topic
        )
        self.target = target
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()
        # Used to notify dcmanager-audit
        self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient()

        super(DCManagerStateService, self).start()

    def _stop_rpc_server(self):
        # Stop RPC connection to prevent new requests
        LOG.debug(_("Attempting to stop engine service..."))
        try:
            self._rpc_server.stop()
            self._rpc_server.wait()
            LOG.info("Engine service stopped successfully")
        except Exception as ex:
            LOG.error(f"Failed to stop engine service: {str(ex)}")

    def stop(self):
        LOG.info(f"Stopping {self.__class__.__name__}")
        self._stop_rpc_server()
        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine")
        super(DCManagerStateService, self).stop()

    @request_context
    def update_subcloud_endpoint_status(
        self,
        context: context.RequestContext,
        subcloud_name: str = None,
        subcloud_region: str = None,
        endpoint_type: str = None,
        sync_status: str = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
        alarmable: bool = True,
        ignore_endpoints: list[str] = None,
    ) -> None:
        # Updates subcloud endpoint sync status
        name = subcloud_name if subcloud_name is not None else subcloud_region
        msg = (
            "Handling update_subcloud_endpoint_status request. "
            f"endpoint: ({endpoint_type}) status: ({sync_status})"
        )
        cutils.log_subcloud_msg(LOG.info, msg, name)

        self.subcloud_state_manager.update_subcloud_endpoint_status(
            context,
            subcloud_region,
            endpoint_type,
            sync_status,
            alarmable,
            ignore_endpoints,
        )

        # If the software sync status is being set to unknown, trigger the
        # software audit so it can update the sync status ASAP.
        if (
            endpoint_type == dccommon_consts.AUDIT_TYPE_SOFTWARE
            and sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
        ):
            self.audit_rpc_client.trigger_software_audit(context)

        # If the firmware sync status is being set to unknown, trigger the
        # firmware audit so it can update the sync status ASAP.
        if (
            endpoint_type == dccommon_consts.ENDPOINT_TYPE_FIRMWARE
            and sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
        ):
            self.audit_rpc_client.trigger_firmware_audit(context)

        # If the kubernetes sync status is being set to unknown, trigger the
        # kubernetes audit so it can update the sync status ASAP.
        if (
            endpoint_type == dccommon_consts.ENDPOINT_TYPE_KUBERNETES
            and sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
        ):
            self.audit_rpc_client.trigger_kubernetes_audit(context)

    @request_context
    def update_subcloud_availability(
        self,
        context: context.RequestContext,
        subcloud_name: str,
        subcloud_region: str,
        availability_status: str,
        update_state_only: bool = False,
        audit_fail_count: int = None,
    ) -> None:
        # Updates subcloud availability
        msg = "Handling update_subcloud_availability request"
        cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
        self.subcloud_state_manager.update_subcloud_availability(
            context,
            subcloud_region,
            availability_status,
            update_state_only,
            audit_fail_count,
        )

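    # Unlike the per-endpoint handler above, the bulk variant below updates
    # the availability and all endpoint statuses of a subcloud in a single
    # RPC, reducing the per-subcloud update time at scale.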
    def bulk_update_subcloud_availability_and_endpoint_status(
        self,
        context: context.RequestContext,
        subcloud_id: int,
        subcloud_name: str,
        availability_data: dict,
        endpoint_data: dict[str, str],
    ) -> None:
        msg = "Handling bulk_update_subcloud_availability_and_endpoint_status request"
        cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)

        manager = self.subcloud_state_manager
        manager.bulk_update_subcloud_availability_and_endpoint_status(
            context, subcloud_id, subcloud_name, availability_data, endpoint_data
        )
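
# A hedged caller-side sketch (the names below are assumptions, not the
# actual dcmanager-audit code): the bulk handler above lets the audit send
# one RPC carrying both the availability change and every endpoint status,
# e.g.:
#
#     availability_data = {"availability_status": availability, ...}
#     endpoint_data = {
#         dccommon_consts.ENDPOINT_TYPE_KUBERNETES: (
#             dccommon_consts.SYNC_STATUS_UNKNOWN
#         ),
#     }
#     state_rpc_client.bulk_update_subcloud_availability_and_endpoint_status(
#         ctxt, subcloud.id, subcloud.name, availability_data, endpoint_data
#     )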