
This commit includes the following changes:
- Implement the new fm-api methods for raising/clearing alarms in batches. The new keep_existing_alarm option was also implemented to make sure we don't update alarms, as we're not checking whether they exist before trying to raise them again.
- Move from FaultAPIs to FaultAPIsV2, which raises exceptions if there's an error in FM, preventing state from continuing without clearing/raising alarms when FM is offline. This can happen during a swact, where the FM process is stopped before state.
- Introduce a db call to get the subcloud object and the current status of the endpoints instead of receiving a simplified subcloud through RPC. The reason for doing this instead of using a simplified subcloud is that dcmanager-audit is faster to process than state, so until state updates, audit will keep sending information, causing duplicated updates and increasing the time it takes to update every subcloud.
- Convert all logs into a default format with the subcloud name at the start, for better traceability. E.g.: "Subcloud: subcloud1. <msg>".
- Remove the unused function update_subcloud_sync_endpoint_type.

Test plan:
- PASS: Deploy a subcloud and verify state communicates to cert-mon that it became online and then updates the dc_cert endpoint after receiving the response.
- PASS: Manage the subcloud and verify all endpoints are updated and the final sync status is in-sync.
- PASS: Force a subcloud to have an out-of-sync kube root-ca and kubernetes and verify state correctly updates the db and raises the alarms.
- PASS: Turn off the subcloud and verify:
  - Subcloud availability was updated in db
  - All endpoints were updated in db
  - Dcorch was notified
  - All endpoint alarms were cleared
  - The offline alarm was raised
- PASS: Unmanage the subcloud and verify all endpoints, with the exception of dc_cert, were updated to unknown.
- PASS: Unmanage the subcloud, stop the fm-mgr service and turn off the subcloud. Verify the subcloud is not updated to offline until fm comes back on.
- PASS: Perform scale tests and verify that updating availability and endpoints is faster.

Story: 2011311
Task: 52283
Depends-on: https://review.opendev.org/c/starlingx/fault/+/952671
Change-Id: I8792e1cbf8eb0af0cc9dd1be25987fac2503ecee
Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
718 lines
28 KiB
Python
718 lines
28 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
# Copyright (c) 2017-2025 Wind River Systems, Inc.
|
|
#
|
|
# The right to copy, distribute, modify, or otherwise make use
|
|
# of this software may be licensed only pursuant to the terms
|
|
# of an applicable Wind River license agreement.
|
|
#
|
|
|
|
from fm_api import constants as fm_const
|
|
from fm_api import fm_api
|
|
from oslo_log import log as logging
|
|
|
|
from dccommon import consts as dccommon_consts
|
|
from dccommon import utils as cutils
|
|
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
|
|
from dcmanager.common import consts
|
|
from dcmanager.common import context
|
|
from dcmanager.common import exceptions
|
|
from dcmanager.common import manager
|
|
from dcmanager.common import utils
|
|
from dcmanager.db import api as db_api
|
|
from dcmanager.db.sqlalchemy import models
|
|
from dcmanager.rpc import client as rpc_client
|
|
from dcorch.rpc import client as dcorch_rpc_client
|
|
|
|
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)
|
|
# FM alarm id used for every endpoint out-of-sync fault that this module
# raises or clears.
ALARM_OUT_OF_SYNC = fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC
|
|
|
|
|
|
def sync_update_subcloud_endpoint_status(func):
    """Serialize calls to ``func`` behind a lock chosen at call time.

    The third positional argument of every call (``args[2]``) is handed to
    ``utils.synchronized`` together with ``external=True`` and ``fair=True``,
    so the decorated method has to be invoked positionally as
    ``(self, context, <lock key>, ...)``.

    NOTE(review): in this file the third positional argument is the subcloud
    region for _update_subcloud_endpoint_status() and the subcloud name for
    _do_bulk_update_subcloud_endpoint_status(). The two methods therefore
    only share a lock key when a subcloud's name and region are equal -
    confirm that this is intended.

    :param func: the method to wrap
    :return: a wrapper that returns whatever ``func`` returns
    """
    # Imported here because the module has no other use for functools.
    import functools

    # functools.wraps keeps the name and docstring of the decorated method,
    # which would otherwise be reported as "_get_lock_and_call".
    @functools.wraps(func)
    def _get_lock_and_call(*args, **kwargs):
        # The lock key is only known once the call arguments are available,
        # so the synchronized wrapper is built on every invocation.
        @utils.synchronized(args[2], external=True, fair=True)
        def _call_func(*args, **kwargs):
            return func(*args, **kwargs)

        return _call_func(*args, **kwargs)

    return _get_lock_and_call
|
|
|
|
|
|
class SubcloudStateManager(manager.Manager):
    """Manages tasks related to subclouds."""

    def __init__(self, *args, **kwargs):
        """Initialize the manager and the clients used for state updates.

        Positional and keyword arguments are forwarded to the parent manager,
        which is always given the "subcloud_manager" service name.
        """
        LOG.debug("SubcloudStateManager initialization...")

        super().__init__(*args, service_name="subcloud_manager", **kwargs)

        # Admin context plus the dcorch, FM and audit clients used by the
        # update methods of this class.
        self.context = context.get_admin_context()
        self.dcorch_rpc_client = dcorch_rpc_client.EngineWorkerClient()
        self.fm_api = fm_api.FaultAPIsV2()
        self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient()
|
|
|
|
def _do_update_subcloud_endpoint_status(
    self,
    context: context.RequestContext,
    subcloud_id: int,
    endpoint_type: str,
    sync_status: str,
    alarmable: bool,
    ignore_endpoints: list[str],
) -> None:
    """Update online/managed subcloud endpoint status

    Reads the subcloud and its endpoint statuses from the database, raises or
    clears the out-of-sync alarms and then stores the new sync status.

    :param context: request context object
    :param subcloud_id: id of subcloud to update
    :param endpoint_type: endpoint type to update; a falsy value updates
        every endpoint that is not listed in ignore_endpoints
    :param sync_status: sync status to set
    :param alarmable: controls raising an alarm if applicable
    :param ignore_endpoints: list of endpoints to ignore (only used if
        endpoint_type is None)
    :raises exceptions.BadRequest: when no endpoint status could be read for
        the subcloud, or when endpoint_type was requested but not found
    """

    ignore_endpoints = ignore_endpoints or []
    endpoint_status_dict = {}
    endpoint_to_update_list = []
    faults_to_raise = []
    faults_to_clear = []

    # Every row carries the same subcloud object, so we just keep the last
    # one. It stays None when the query yields no rows at all.
    subcloud = None
    for subcloud, endpoint_status in db_api.subcloud_get_with_status(
        context,
        subcloud_id,
        endpoint_type=endpoint_type,
    ):
        endpoint_status_dict[endpoint_status.endpoint_type] = endpoint_status

    if subcloud is None:
        # Without rows there is no subcloud object to take the name from, so
        # fail explicitly instead of hitting an unbound variable below.
        if endpoint_type:
            msg = f"Subcloud id: {subcloud_id}. Endpoint {endpoint_type} not found"
        else:
            msg = f"Subcloud id: {subcloud_id}. No endpoint status found"
        raise exceptions.BadRequest(resource="subcloud", msg=msg)

    if endpoint_type:
        status = endpoint_status_dict.get(endpoint_type)
        if not status:
            msg = f"Subcloud: {subcloud.name}. Endpoint {endpoint_type} not found"
            raise exceptions.BadRequest(
                resource="subcloud",
                msg=msg,
            )
        if status.sync_status == sync_status:
            msg = f"Sync status ({sync_status}) did not change - ignoring update"
            cutils.log_subcloud_msg(LOG.debug, msg, subcloud.name)
            return

        self._trigger_subcloud_audits_after_identity_sync(
            context,
            subcloud_id,
            subcloud,
            sync_status,
            endpoint_type,
            endpoint_status_dict,
        )

    is_out_of_sync = sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC

    for endpoint in endpoint_status_dict.values():
        current_endpoint = endpoint.endpoint_type
        if not endpoint_type and current_endpoint in ignore_endpoints:
            continue
        endpoint_to_update_list.append(current_endpoint)

        if not is_out_of_sync:
            entity_instance_id = (
                f"subcloud={subcloud.name}.resource={current_endpoint}"
            )
            faults_to_clear.append((ALARM_OUT_OF_SYNC, entity_instance_id))
        elif alarmable:
            # The fault must name the endpoint being processed; the
            # endpoint_type parameter is None when all endpoints are updated.
            fault = self._create_fault_out_of_sync(subcloud.name, current_endpoint)
            faults_to_raise.append(fault)

    try:
        # We first want to raise/clear any alarms because in case of an
        # unresponsive FM, like we have during a swact, the operations wont
        # process again if the DB state is already the correct one, leading
        # to a persistent alarm
        self._raise_and_clear_subcloud_alarms_list(
            subcloud.name, faults_to_raise, faults_to_clear
        )
        if endpoint_to_update_list:
            db_api.subcloud_status_update_endpoints(
                context, subcloud_id, endpoint_to_update_list, sync_status
            )
    except Exception as e:
        msg = f"Failed to update subcloud endpoint status: {e}"
        cutils.log_subcloud_msg(LOG.error, msg, subcloud.name)
        raise
|
|
|
|
def _trigger_subcloud_audits_after_identity_sync(
    self,
    context: context.RequestContext,
    subcloud_id: int,
    subcloud: models.Subcloud,
    sync_status: str,
    endpoint_type: str,
    endpoint_status_dict: dict[str, models.SubcloudStatus],
) -> None:
    """Request subcloud audits when identity leaves the unknown sync status

    Nothing is done unless the update targets the identity endpoint, the new
    sync status is not unknown and the stored identity status is unknown.

    :param context: request context object
    :param subcloud_id: id of the subcloud to update
    :param subcloud: subcloud object
    :param sync_status: sync status to set
    :param endpoint_type: endpoint type to update
    :param endpoint_status_dict: dict of endpoint types and their status
    """
    unknown = dccommon_consts.SYNC_STATUS_UNKNOWN
    identity = dccommon_consts.ENDPOINT_TYPE_IDENTITY

    # Only an identity update that sets a known status is of interest
    if endpoint_type != identity or sync_status == unknown:
        return

    # ... and only while the stored identity status is still unknown
    stored_identity = endpoint_status_dict.get(identity)
    if not stored_identity or stored_identity.sync_status != unknown:
        return

    if not subcloud.first_identity_sync_complete:
        db_api.subcloud_update(
            context, subcloud_id, first_identity_sync_complete=True
        )

    cutils.log_subcloud_msg(
        LOG.debug,
        "Request for audits after updating identity out of unknown",
        subcloud.name,
    )
    self.audit_rpc_client.trigger_subcloud_audits(context, subcloud_id)
|
|
|
|
def _should_update_endpoint_status(
    self, subcloud: models.Subcloud, endpoint_type: str, sync_status: str
) -> bool:
    """Tell whether an endpoint of the subcloud may receive the sync status

    :param subcloud: subcloud object
    :param endpoint_type: endpoint type that would be updated
    :param sync_status: sync status that would be set
    :return: True when the update should be applied
    """

    # Most endpoints are audited only when the subcloud is managed and
    # online. An exception is the dc-cert endpoint, which is audited
    # whenever the subcloud is online (managed or unmanaged).
    #
    # This means if a subcloud is going offline or unmanaged, then
    # the sync status update must be done first.

    # Secondary subclouds only accept the 'unknown' sync status
    if subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY:
        return sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN

    # Any status other than in-sync is always applied
    if sync_status != dccommon_consts.SYNC_STATUS_IN_SYNC:
        return True

    # in-sync requires an online subcloud ...
    if subcloud.availability_status != dccommon_consts.AVAILABILITY_ONLINE:
        return False

    # ... that is managed, unless the endpoint is dc-cert
    return (
        subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED
        or endpoint_type == dccommon_consts.ENDPOINT_TYPE_DC_CERT
    )
|
|
|
|
@sync_update_subcloud_endpoint_status
def _update_subcloud_endpoint_status(
    self,
    context: context.RequestContext,
    subcloud_region: str,
    endpoint_type: str,
    sync_status: str,
    alarmable: bool,
    ignore_endpoints: list[str],
) -> None:
    """Update the endpoint status of the subcloud found by its region

    The subcloud is looked up by region name and the update is applied only
    when _should_update_endpoint_status() allows it; otherwise the request
    is logged and ignored.

    :param context: request context object
    :param subcloud_region: name of subcloud region to update
    :param endpoint_type: endpoint type to update
    :param sync_status: sync status to set
    :param alarmable: controls raising an alarm if applicable
    :param ignore_endpoints: list of endpoints to ignore (only used if
        endpoint_type is None)
    :raises exceptions.BadRequest: when no subcloud region is given
    """

    if not subcloud_region:
        raise exceptions.BadRequest(
            resource="subcloud", msg="Subcloud region not provided"
        )

    try:
        subcloud = db_api.subcloud_get_by_region_name(context, subcloud_region)
    except Exception as error:
        LOG.exception(error)
        raise

    if not self._should_update_endpoint_status(
        subcloud, endpoint_type, sync_status
    ):
        msg = (
            "Ignoring subcloud sync_status update. "
            f"Availability: {subcloud.availability_status}; "
            f"Management:{subcloud.management_state}; "
            f"Endpoint:{endpoint_type}; "
            f"sync_status:{sync_status}"
        )
        cutils.log_subcloud_msg(LOG.info, msg, subcloud.name)
        return

    # update a single subcloud
    try:
        self._do_update_subcloud_endpoint_status(
            context,
            subcloud.id,
            endpoint_type,
            sync_status,
            alarmable,
            ignore_endpoints,
        )
    except Exception as error:
        LOG.exception(error)
        raise
|
|
|
|
def _create_fault_out_of_sync(
    self, subcloud_name: str, endpoint: str
) -> fm_api.Fault:
    """Build the FM fault that reports an out-of-sync endpoint

    :param subcloud_name: subcloud's name
    :param endpoint: the endpoint that is out-of-sync

    :return: an FM fault object
    :rtype: Fault
    """

    fault_fields = {
        "alarm_id": ALARM_OUT_OF_SYNC,
        "alarm_state": fm_const.FM_ALARM_STATE_SET,
        "entity_type_id": fm_const.FM_ENTITY_TYPE_SUBCLOUD,
        "entity_instance_id": f"subcloud={subcloud_name}.resource={endpoint}",
        "severity": fm_const.FM_ALARM_SEVERITY_MAJOR,
        "reason_text": f"{subcloud_name} {endpoint} sync_status is out-of-sync",
        "alarm_type": fm_const.FM_ALARM_TYPE_0,
        "probable_cause": fm_const.ALARM_PROBABLE_CAUSE_2,
        "proposed_repair_action": (
            "If problem persists contact next level of support"
        ),
        "service_affecting": False,
        "keep_existing_alarm": True,
    }
    return fm_api.Fault(**fault_fields)
|
|
|
|
def _create_fault_offline(self, subcloud_name: str) -> fm_api.Fault:
    """Build the FM fault that reports an offline subcloud

    :param subcloud_name: subcloud's name

    :return: an FM fault object
    :rtype: Fault
    """
    reason = "%s is offline" % subcloud_name
    repair_action = (
        "Wait for subcloud to become online; if problem persists "
        "contact next level of support."
    )

    return fm_api.Fault(
        alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
        alarm_state=fm_const.FM_ALARM_STATE_SET,
        entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
        entity_instance_id=f"subcloud={subcloud_name}",
        severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
        reason_text=reason,
        alarm_type=fm_const.FM_ALARM_TYPE_0,
        probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
        proposed_repair_action=repair_action,
        service_affecting=True,
        keep_existing_alarm=True,
    )
|
|
|
|
def bulk_update_subcloud_availability_and_endpoint_status(
    self,
    context: context.RequestContext,
    subcloud_id: int,
    subcloud_name: str,
    availability_data: dict,
    endpoint_data: dict[str, str],
) -> None:
    """Bulk update subcloud availability and endpoint status

    Endpoints whose stored sync status already matches the requested one are
    dropped before the update. The caller's endpoint_data is not modified.

    :param context: request context object
    :param subcloud_id: id of the subcloud to update
    :param subcloud_name: name of the subcloud to update
    :param availability_data: a dict containing the availability status,
        update_state_only and audit_fail_count
    :param endpoint_data: a dict containing the endpoint as key and its sync
        status as value
    :raises exceptions.BadRequest: when there is something to update but no
        endpoint status could be read for the subcloud
    """
    # This bulk update is executed as part of the audit process in dcmanager and
    # its related endpoints. This method is not used by dcorch and cert-mon.

    # Filter a copy so the dict owned by the caller is left untouched.
    pending_endpoints = dict(endpoint_data) if endpoint_data else {}
    unchanged_endpoints = []

    # Every row carries the same subcloud object, so we just keep the last
    # one. It stays None when the query yields no rows at all.
    subcloud = None
    for subcloud, endpoint_status in db_api.subcloud_get_with_status(
        context,
        subcloud_id,
    ):
        endpoint = endpoint_status.endpoint_type
        # The membership test keeps an endpoint that was not requested from
        # matching a stored status of None.
        if (
            endpoint in pending_endpoints
            and pending_endpoints[endpoint] == endpoint_status.sync_status
        ):
            unchanged_endpoints.append(endpoint)
            del pending_endpoints[endpoint]

    if unchanged_endpoints:
        msg = (
            "The following endpoints are already set to updated values, "
            f"not updating: {unchanged_endpoints}"
        )
        cutils.log_subcloud_msg(LOG.debug, msg, subcloud_name)

    if not availability_data and not pending_endpoints:
        return

    if subcloud is None:
        # Both updates below need the subcloud object read from the database.
        msg = f"No endpoint status found for subcloud id {subcloud_id}"
        cutils.log_subcloud_msg(LOG.error, msg, subcloud_name)
        raise exceptions.BadRequest(resource="subcloud", msg=msg)

    if availability_data:
        self.update_subcloud_availability(
            context,
            subcloud.name,
            subcloud.region_name,
            availability_data["availability_status"],
            availability_data["update_state_only"],
            availability_data["audit_fail_count"],
            subcloud,
        )

    # NOTE(review): the subcloud object read above is reused here, so the
    # endpoint filtering presumably still sees the availability that was
    # stored before the update above - confirm this is intended.
    if pending_endpoints:
        self._bulk_update_subcloud_endpoint_status(
            context, subcloud, pending_endpoints
        )
|
|
|
|
@sync_update_subcloud_endpoint_status
def _do_bulk_update_subcloud_endpoint_status(
    self,
    context: context.RequestContext,
    subcloud_name: str,
    subcloud_id: int,
    endpoint_data: dict[str, str],
) -> None:
    """Updates an online and managed subcloud's endpoints sync status

    The out-of-sync alarms are raised/cleared first and the database is
    updated afterwards. A failure in the alarm step propagates to the
    caller; a failure in the database step is logged and not re-raised.

    :param context: request context object
    :param subcloud_name: name of the subcloud to update
    :param subcloud_id: id of the subcloud to update
    :param endpoint_data: a dict containing the endpoint as key and its sync
        status as value
    """

    # This bulk update is executed as part of the audit process and, because of
    # that, the logic is similar to _do_update_subcloud_endpoint_status but with
    # the difference that only the required endpoints will be updated and
    # that'll happen at once.
    status_to_set = [f"{key} ({value})" for key, value in endpoint_data.items()]
    msg = f"Updating endpoints: {', '.join(status_to_set)}"
    cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)

    faults_to_set = []
    faults_to_clear = []

    for endpoint, sync_status in endpoint_data.items():
        if sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC:
            faults_to_set.append(
                self._create_fault_out_of_sync(subcloud_name, endpoint)
            )
        else:
            # Same rule as _do_update_subcloud_endpoint_status: any status
            # other than out-of-sync (including unknown, which is what an
            # offline subcloud gets) clears the out-of-sync alarm.
            entity_instance_id = f"subcloud={subcloud_name}.resource={endpoint}"
            faults_to_clear.append((ALARM_OUT_OF_SYNC, entity_instance_id))

    # Done outside the try block below so that an FM failure stops the update
    # before the database is touched.
    self._raise_and_clear_subcloud_alarms_list(
        subcloud_name, faults_to_set, faults_to_clear
    )

    try:
        db_api.subcloud_status_bulk_update_endpoints(
            context,
            subcloud_id,
            endpoint_data,
        )
    except Exception as e:
        # The database failure is logged with its traceback and not re-raised.
        msg = f"An error occured when updating the subcloud endpoint status: {e}"
        cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
|
|
|
|
def _bulk_update_subcloud_endpoint_status(
    self,
    context: context.RequestContext,
    subcloud: models.Subcloud,
    endpoint_data: dict[str, str],
) -> None:
    """Apply the sync statuses that are allowed for the subcloud's endpoints

    Each endpoint is checked with _should_update_endpoint_status() and the
    accepted ones are updated in a single call.

    :param context: current context object
    :param subcloud: subcloud object
    :param endpoint_data: a dict containing the endpoint as key and its sync
        status as value
    """

    endpoints_to_update = {
        endpoint_type: sync_status
        for endpoint_type, sync_status in endpoint_data.items()
        if self._should_update_endpoint_status(subcloud, endpoint_type, sync_status)
    }

    if not endpoints_to_update:
        msg = (
            "No endpoints to update the status; "
            f"Availability: {subcloud.availability_status}; "
            f"Management: {subcloud.management_state};"
            f"Endpoints: {', '.join(endpoint_data.keys())}"
        )
        cutils.log_subcloud_msg(LOG.info, msg, subcloud.name)
        return

    # Update all the necessary endpoints for a single subcloud
    try:
        self._do_bulk_update_subcloud_endpoint_status(
            context,
            subcloud.name,
            subcloud.id,
            endpoints_to_update,
        )
    except Exception as error:
        LOG.exception(error)
        raise
|
|
|
|
def update_subcloud_endpoint_status(
    self,
    context: context.RequestContext,
    subcloud_region: str = None,
    endpoint_type: str = None,
    sync_status: str = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
    alarmable: bool = True,
    ignore_endpoints: list[str] = None,
) -> None:
    """Update the endpoint status of one subcloud or of all subclouds

    :param context: request context object
    :param subcloud_region: region of subcloud to update; when it is not
        given, every subcloud returned by the database is updated
    :param endpoint_type: endpoint type to update
    :param sync_status: sync status to set
    :param alarmable: controls raising an alarm if applicable
    :param ignore_endpoints: list of endpoints to ignore (only used if
        endpoint_type is None)
    """

    ignore_endpoints = [] if ignore_endpoints is None else ignore_endpoints

    if subcloud_region:
        regions = (subcloud_region,)
    else:
        # update all subclouds
        regions = (
            subcloud.region_name for subcloud in db_api.subcloud_get_all(context)
        )

    for region in regions:
        self._update_subcloud_endpoint_status(
            context,
            region,
            endpoint_type,
            sync_status,
            alarmable,
            ignore_endpoints,
        )
|
|
|
|
def _update_subcloud_state(
    self,
    context: context.RequestContext,
    subcloud_name: str,
    subcloud_region: str,
    management_state: str,
    availability_status: str,
) -> None:
    """Send the subcloud's management and availability states to dcorch

    Any failure is logged with its traceback and is not propagated.

    :param context: request context object
    :param subcloud_name: subcloud's name, used in the log messages
    :param subcloud_region: subcloud's region, sent to dcorch
    :param management_state: management state to report
    :param availability_status: availability status to report
    """
    try:
        cutils.log_subcloud_msg(
            LOG.info,
            f"Notifying dcorch, management: {management_state}, "
            f"availability:{availability_status}",
            subcloud_name,
        )

        self.dcorch_rpc_client.update_subcloud_states(
            context, subcloud_region, management_state, availability_status
        )
    except Exception:
        cutils.log_subcloud_msg(
            LOG.exception,
            "Problem informing dcorch of subcloud state change",
            subcloud_name,
        )
|
|
|
|
def _raise_and_clear_subcloud_alarms_list(
    self,
    subcloud_name: str,
    faults_to_raise: list[fm_api.Fault] = None,
    faults_to_clear: list[tuple[str, str]] = None,
) -> None:
    """Clear and then raise the given subcloud alarms in batches

    Empty or missing lists are skipped. An FM failure is logged with its
    traceback and re-raised.

    :param subcloud_name: subcloud's name, used in the log messages
    :param faults_to_raise: list of faults to raise
    :param faults_to_clear: list of faults to clear
    """
    if faults_to_clear:
        try:
            self.fm_api.clear_faults_list(faults_to_clear)
        except Exception:
            cutils.log_subcloud_msg(
                LOG.exception, "Failed to clear alarms from list", subcloud_name
            )
            raise

    if faults_to_raise:
        try:
            self.fm_api.set_faults(faults_to_raise)
        except Exception:
            cutils.log_subcloud_msg(
                LOG.exception, "Failed to raise alarms from list", subcloud_name
            )
            raise
|
|
|
|
def _raise_or_clear_subcloud_status_alarm(
    self, subcloud_name: str, availability_status: str, deploy_status: str = None
) -> None:
    """Align the subcloud offline alarm with the availability status

    The offline alarm is cleared for an online subcloud and raised for an
    offline one, unless its deploy status is secondary. Any other
    availability status leaves the alarm untouched.

    :param subcloud_name: subcloud's name
    :param availability_status: availability status being applied
    :param deploy_status: subcloud's deploy status, used to skip raising the
        alarm for secondary subclouds
    """
    entity_instance_id = f"subcloud={subcloud_name}"

    if availability_status == dccommon_consts.AVAILABILITY_ONLINE:
        try:
            self.fm_api.clear_fault(
                fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE, entity_instance_id
            )
        except Exception:
            msg = "Failed to clear offline alarm"
            cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
            raise

    elif (
        availability_status == dccommon_consts.AVAILABILITY_OFFLINE
        and deploy_status != consts.DEPLOY_STATE_SECONDARY
    ):
        try:
            fault = self._create_fault_offline(subcloud_name)
            self.fm_api.set_fault(fault)
        except Exception:
            # Logged as an exception, like the clear path above, so the
            # failure is reported with its traceback instead of at info level.
            msg = "Failed to raise offline alarm"
            cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
            raise
|
|
|
|
def update_subcloud_availability(
    self,
    context: context.RequestContext,
    subcloud_name: str,
    subcloud_region: str,
    availability_status: str,
    update_state_only: bool = False,
    audit_fail_count: int = None,
    subcloud: models.Subcloud = None,
) -> None:
    """Update the availability of a subcloud and notify the other services

    Three kinds of request are handled:

    - update_state_only: the offline alarm is re-aligned with the
      availability status and dcorch is notified; the database is not
      written.
    - availability_status is None: only the audit fail count is stored.
    - otherwise: when the availability status changed, the offline alarm is
      raised/cleared, the endpoints are set to unknown if the subcloud went
      offline, the database is updated, the online notifications are sent
      if it went online and dcorch is notified.

    :param context: request context object
    :param subcloud_name: subcloud's name, used in the log messages
    :param subcloud_region: subcloud's region, used to look the subcloud up
        when no subcloud object is given
    :param availability_status: availability status to set
    :param update_state_only: only re-send the current state
    :param audit_fail_count: audit fail count to store
    :param subcloud: subcloud object, read from the database when omitted
    """
    if subcloud is None:
        try:
            subcloud = db_api.subcloud_get_by_region_name(context, subcloud_region)
        except Exception:
            msg = f"Failed to get subcloud by region name: {subcloud_region}"
            cutils.log_subcloud_msg(LOG.exception, msg, subcloud_name)
            raise

    if update_state_only:
        msg = "Received update_state_only request"
        cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
        # Ensure that the status alarm is consistent with the
        # subcloud's availability. This is required to compensate
        # for rare alarm update failures, which may occur during
        # availability updates. The deploy status is passed along so that
        # the offline alarm is not raised for secondary subclouds.
        self._raise_or_clear_subcloud_status_alarm(
            subcloud.name, availability_status, subcloud.deploy_status
        )

        # Nothing has changed, but we want to send a state update for this
        # subcloud as an audit. Get the most up-to-date data.
        self._update_subcloud_state(
            context,
            subcloud.name,
            subcloud.region_name,
            subcloud.management_state,
            availability_status,
        )
        return

    if availability_status is None:
        # only update the audit fail count
        # NOTE(review): this branch uses the manager's admin context while
        # the other database calls use the request context - confirm.
        try:
            db_api.subcloud_update(
                self.context, subcloud.id, audit_fail_count=audit_fail_count
            )
        except exceptions.SubcloudNotFound:
            # slim possibility subcloud could have been deleted since
            # we found it in db, ignore this benign error.
            msg = (
                "Ignoring SubcloudNotFound when attempting audit_fail_count update"
            )
            cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
        return

    if availability_status == subcloud.availability_status:
        msg = (
            "Availability status hasn't changed from "
            f"{availability_status}, not updating"
        )
        cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
        return

    # The deploy status is passed along so that the offline alarm is not
    # raised for secondary subclouds.
    self._raise_or_clear_subcloud_status_alarm(
        subcloud.name, availability_status, subcloud.deploy_status
    )

    if availability_status == dccommon_consts.AVAILABILITY_OFFLINE:
        # Subcloud is going offline, set all endpoint statuses to
        # unknown.
        endpoint_data = dict.fromkeys(
            dccommon_consts.AUDIT_TYPES_LIST, dccommon_consts.SYNC_STATUS_UNKNOWN
        )
        self._bulk_update_subcloud_endpoint_status(
            context, subcloud, endpoint_data
        )

    try:
        updated_subcloud = db_api.subcloud_update(
            context,
            subcloud.id,
            availability_status=availability_status,
            audit_fail_count=audit_fail_count,
        )
    except exceptions.SubcloudNotFound:
        # slim possibility subcloud could have been deleted since
        # we found it in db, ignore this benign error.
        msg = "Ignoring SubcloudNotFound when attempting state update"
        cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
        return

    if availability_status == dccommon_consts.AVAILABILITY_ONLINE:
        # Subcloud is going online
        # Tell cert-mon to audit endpoint certificate.
        msg = "Request for online audit"
        cutils.log_subcloud_msg(LOG.info, msg, subcloud_name)
        dc_notification = rpc_client.DCManagerNotifications()
        dc_notification.subcloud_online(context, subcloud.region_name)
        # Trigger all the audits for the subcloud so it can update the
        # sync status ASAP.
        self.audit_rpc_client.trigger_subcloud_audits(context, subcloud.id)

    # Send dcorch a state update
    self._update_subcloud_state(
        context,
        subcloud.name,
        subcloud.region_name,
        updated_subcloud.management_state,
        availability_status,
    )
|