
In order to allow for faster subcloud audits, introduce a new
"audit-worker" process. By default there will be four worker processes
to perform the actual subcloud audits.

The main audit process scans the DB for subclouds which need auditing,
based on the new audit start/end timestamps. It then sends RPC messages
to the "audit-worker" processes to request an audit for a list of
subcloud IDs.

When an "audit-worker" process receives the "audit_subclouds" RPC
message, it loops over each of the specified subclouds. For each
subcloud it updates the DB to indicate that the audit is starting,
performs essentially the same audit that is currently done in the audit
process, and then updates the DB to indicate that the audit has been
completed for that subcloud.

Story: 2007267
Task: 41336

Signed-off-by: Chris Friesen <chris.friesen@windriver.com>
Change-Id: Ifb3dd363fd337d24f2c3f7aaa3549624fffaceca
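The dispatch side of this flow (the main audit process that scans the DB and casts "audit_subclouds" to the workers) is not part of the file below. The following is a minimal, self-contained sketch of that flow, assuming a periodic scan of per-subcloud audit timestamps; every name in it (AuditRecord, FakeWorkerRpcClient, audit_is_due, AUDIT_INTERVAL, BATCH_SIZE) is an illustrative stand-in, not the real dcmanager API:

# Illustrative sketch only -- these classes and constants are invented
# stand-ins, not the real dcmanager interfaces.
import datetime
from dataclasses import dataclass
from typing import List, Optional

AUDIT_INTERVAL = datetime.timedelta(seconds=30)   # assumed audit period
BATCH_SIZE = 10                                   # assumed batch size


@dataclass
class AuditRecord:
    """Per-subcloud audit bookkeeping row (audit start/end timestamps)."""
    subcloud_id: int
    audit_started_at: Optional[datetime.datetime] = None
    audit_finished_at: Optional[datetime.datetime] = None


class FakeWorkerRpcClient:
    """Stand-in for the RPC client used to reach the audit-worker processes."""

    def audit_subclouds(self, subcloud_ids: List[int]) -> None:
        # The real client would cast an "audit_subclouds" RPC message.
        print("audit_subclouds cast for:", subcloud_ids)


def audit_is_due(rec: AuditRecord, now: datetime.datetime) -> bool:
    """A subcloud is due if it was never audited, or its last audit
    finished at least one audit interval ago and no audit is in flight."""
    if rec.audit_finished_at is None:
        return True
    in_flight = (rec.audit_started_at is not None and
                 rec.audit_started_at > rec.audit_finished_at)
    return not in_flight and now - rec.audit_finished_at >= AUDIT_INTERVAL


def dispatch_audits(records: List[AuditRecord],
                    rpc: FakeWorkerRpcClient) -> None:
    """Scan the audit records and fan due subclouds out to the workers
    in fixed-size batches, one RPC message per batch."""
    now = datetime.datetime.utcnow()
    due = [r.subcloud_id for r in records if audit_is_due(r, now)]
    for i in range(0, len(due), BATCH_SIZE):
        rpc.audit_subclouds(due[i:i + BATCH_SIZE])


if __name__ == "__main__":
    # 25 never-audited subclouds produce three batches of 10, 10 and 5.
    dispatch_audits([AuditRecord(subcloud_id=n) for n in range(25)],
                    FakeWorkerRpcClient())

The worker side of this exchange, the audit_subclouds() handler that such a dispatcher would drive, is implemented in the file below.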
# Copyright 2017 Ericsson AB.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2017-2021 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#

import os

from keystoneauth1 import exceptions as keystone_exceptions
from oslo_config import cfg
from oslo_log import log as logging

from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver

from dcmanager.audit import alarm_aggregation
from dcmanager.audit import firmware_audit
from dcmanager.audit import patch_audit
from dcmanager.audit.subcloud_audit_manager import HELM_APP_OPENSTACK
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import manager
from dcmanager.common import scheduler
from dcmanager.db import api as db_api
from dcmanager.rpc import client as dcmanager_rpc_client

CONF = cfg.CONF
LOG = logging.getLogger(__name__)

# We will update the state of each subcloud in the dcorch about once per hour.
# Calculate how many iterations that will be.
SUBCLOUD_STATE_UPDATE_ITERATIONS = \
    dccommon_consts.SECONDS_IN_HOUR / CONF.scheduler.subcloud_audit_interval


class SubcloudAuditWorkerManager(manager.Manager):
    """Manages tasks related to audits."""

    def __init__(self, *args, **kwargs):
        LOG.debug(_('SubcloudAuditWorkerManager initialization...'))

        super(SubcloudAuditWorkerManager, self).__init__(
            service_name="subcloud_audit_worker_manager")
        self.context = context.get_admin_context()
        self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient()
        # Keeps track of greenthreads we create to do work.
        self.thread_group_manager = scheduler.ThreadGroupManager(
            thread_pool_size=100)
        # Track workers created for each subcloud.
        self.subcloud_workers = dict()
        self.alarm_aggr = alarm_aggregation.AlarmAggregation(self.context)
        self.patch_audit = patch_audit.PatchAudit(
            self.context, self.dcmanager_rpc_client)
        self.firmware_audit = firmware_audit.FirmwareAudit(
            self.context, self.dcmanager_rpc_client)
        self.pid = os.getpid()

    def audit_subclouds(self, context, subcloud_ids, patch_audit_data,
                        firmware_audit_data, do_openstack_audit):
        """Run audits of the specified subcloud(s)."""

        LOG.debug('PID: %s, subclouds to audit: %s, do_openstack_audit: %s' %
                  (self.pid, subcloud_ids, do_openstack_audit))

        for subcloud_id in subcloud_ids:
            # Retrieve the subcloud and subcloud audit info.
            try:
                subcloud = db_api.subcloud_get(self.context, subcloud_id)
                subcloud_audits = db_api.subcloud_audits_get_and_start_audit(
                    self.context, subcloud_id)
            except exceptions.SubcloudNotFound:
                # The subcloud may have been deleted since the list of
                # subclouds to audit was created.
                LOG.info('Ignoring SubcloudNotFound when auditing subcloud %s'
                         % subcloud_id)
                continue

            LOG.debug("PID: %s, starting audit of subcloud: %s." %
                      (self.pid, subcloud.name))

            # Include failed deploy status states in the auditable list
            # so that the subcloud can be set as offline.
            if (subcloud.deploy_status not in
                    [consts.DEPLOY_STATE_DONE,
                     consts.DEPLOY_STATE_DEPLOYING,
                     consts.DEPLOY_STATE_DEPLOY_FAILED,
                     consts.DEPLOY_STATE_INSTALL_FAILED,
                     consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
                     consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
                     consts.DEPLOY_STATE_MIGRATED]):
                LOG.debug("Skip subcloud %s audit, deploy_status: %s" %
                          (subcloud.name, subcloud.deploy_status))
                # This DB API call will set the "audit_finished_at" timestamp
                # so the subcloud won't get audited again for a while.
                db_api.subcloud_audits_end_audit(self.context, subcloud_id)
                continue

            # Check the per-subcloud audit flags.
            do_patch_audit = subcloud_audits.patch_audit_requested
            do_load_audit = subcloud_audits.load_audit_requested
            do_firmware_audit = subcloud_audits.firmware_audit_requested
            update_subcloud_state = subcloud_audits.state_update_requested

            # Create a new greenthread for each subcloud to allow the audits
            # to be done in parallel. If there are not enough greenthreads
            # in the pool, this will block until one becomes available.
            self.subcloud_workers[subcloud.name] = \
                self.thread_group_manager.start(self._do_audit_subcloud,
                                                subcloud,
                                                update_subcloud_state,
                                                do_openstack_audit,
                                                patch_audit_data,
                                                firmware_audit_data,
                                                do_patch_audit,
                                                do_load_audit,
                                                do_firmware_audit)

    def _update_subcloud_availability(self, subcloud_name,
                                      availability_status=None,
                                      update_state_only=False,
                                      audit_fail_count=None):
        try:
            self.dcmanager_rpc_client.update_subcloud_availability(
                self.context, subcloud_name, availability_status,
                update_state_only, audit_fail_count)
            LOG.info('Notifying dcmanager, subcloud:%s, availability:%s' %
                     (subcloud_name,
                      availability_status))
        except Exception:
            LOG.exception('Problem informing dcmanager of subcloud '
                          'availability state change, subcloud: %s'
                          % subcloud_name)

    @staticmethod
    def _get_subcloud_availability_status(subcloud_name, sysinv_client):
        """Check whether the subcloud is online.

        If at least one instance of each service group reported by
        servicegroup-list is active, declare the subcloud online.
        """
        avail_to_set = consts.AVAILABILITY_OFFLINE
        svc_groups = None

        # Get a list of service groups in the subcloud.
        try:
            svc_groups = sysinv_client.get_service_groups()
        except Exception as e:
            LOG.warn('Cannot retrieve service groups for '
                     'subcloud: %s, %s' % (subcloud_name, e))

        if svc_groups:
            active_sgs = []
            inactive_sgs = []

            # Build two lists, one of active service groups,
            # one with non-active.
            for sg in svc_groups:
                if sg.state != consts.SERVICE_GROUP_STATUS_ACTIVE:
                    inactive_sgs.append(sg.service_group_name)
                else:
                    active_sgs.append(sg.service_group_name)

            # Create a list of service groups that are only present
            # in the non-active list.
            inactive_only = [sg for sg in inactive_sgs if
                             sg not in active_sgs]

            # An empty inactive-only list and a non-empty active list
            # means we're good to go.
            if not inactive_only and active_sgs:
                avail_to_set = consts.AVAILABILITY_ONLINE
            else:
                LOG.info("Subcloud:%s has non-active "
                         "service groups: %s" %
                         (subcloud_name, inactive_only))
        return avail_to_set

    def _audit_subcloud_openstack_app(self, subcloud_name, sysinv_client,
                                      openstack_installed):
        openstack_installed_current = False
        # Get a list of installed apps in the subcloud.
        try:
            apps = sysinv_client.get_applications()
        except Exception:
            LOG.exception('Cannot retrieve installed apps for subcloud:%s'
                          % subcloud_name)
            return

        for app in apps:
            if app.name == HELM_APP_OPENSTACK and app.active:
                # The audit found the openstack app installed and active
                # in the subcloud.
                openstack_installed_current = True
                break

        endpoint_type_list = dccommon_consts.ENDPOINT_TYPES_LIST_OS
        if openstack_installed_current != openstack_installed:
            # The installed state of the openstack app has changed, so
            # tell dcmanager to update the sync endpoint types accordingly.
            self.dcmanager_rpc_client.update_subcloud_sync_endpoint_type(
                self.context,
                subcloud_name,
                endpoint_type_list,
                openstack_installed_current)

    def _do_audit_subcloud(self, subcloud, update_subcloud_state,
                           do_audit_openstack, patch_audit_data,
                           firmware_audit_data, do_patch_audit,
                           do_load_audit, do_firmware_audit):
        # Do the actual subcloud audit.
        try:
            self._audit_subcloud(subcloud, update_subcloud_state,
                                 do_audit_openstack, patch_audit_data,
                                 firmware_audit_data, do_patch_audit,
                                 do_load_audit, do_firmware_audit)
        except Exception:
            LOG.exception("Got exception auditing subcloud: %s"
                          % subcloud.name)

        # Update the audit completion timestamp so it doesn't get
        # audited again for a while.
        db_api.subcloud_audits_end_audit(self.context, subcloud.id)
        # Remove the worker for this subcloud.
        self.subcloud_workers.pop(subcloud.name, None)
        LOG.debug("PID: %s, done auditing subcloud: %s." %
                  (self.pid, subcloud.name))

    def _audit_subcloud(self, subcloud, update_subcloud_state,
                        do_audit_openstack, patch_audit_data,
                        firmware_audit_data, do_patch_audit,
                        do_load_audit, do_firmware_audit):
        """Audit a single subcloud."""

        avail_status_current = subcloud.availability_status
        audit_fail_count = subcloud.audit_fail_count
        subcloud_name = subcloud.name

        # Set defaults to None and offline so we will still set the offline
        # status if we encounter an error.
        sysinv_client = None
        fm_client = None
        avail_to_set = consts.AVAILABILITY_OFFLINE

        try:
            os_client = OpenStackDriver(region_name=subcloud_name,
                                        thread_name='subcloud-audit')
            sysinv_client = os_client.sysinv_client
            fm_client = os_client.fm_client
        except (keystone_exceptions.EndpointNotFound,
                keystone_exceptions.ConnectFailure,
                keystone_exceptions.ConnectTimeout,
                IndexError):
            if avail_status_current == consts.AVAILABILITY_OFFLINE:
                LOG.info("Identity or Platform endpoint for %s not "
                         "found, ignoring for offline "
                         "subcloud." % subcloud_name)
                return
            else:
                # The subcloud will be marked as offline below.
                LOG.error("Identity or Platform endpoint for online "
                          "subcloud: %s not found." % subcloud_name)

        except Exception:
            LOG.exception("Failed to get OS Client for subcloud: %s"
                          % subcloud_name)

        # Check availability of the subcloud.
        if sysinv_client:
            avail_to_set = self._get_subcloud_availability_status(
                subcloud_name, sysinv_client)

        if avail_to_set == consts.AVAILABILITY_OFFLINE:
            if audit_fail_count < consts.AVAIL_FAIL_COUNT_MAX:
                audit_fail_count = audit_fail_count + 1
            if (avail_status_current == consts.AVAILABILITY_ONLINE) and \
                    (audit_fail_count < consts.AVAIL_FAIL_COUNT_TO_ALARM):
                # Do not set offline until we have failed the audit
                # the requisite number of times.
                avail_to_set = consts.AVAILABILITY_ONLINE
        else:
            # In the case of a one-off blip, we may need to set the
            # fail count back to 0.
            audit_fail_count = 0

        if avail_to_set != avail_status_current:

            if avail_to_set == consts.AVAILABILITY_ONLINE:
                audit_fail_count = 0

            LOG.info('Setting new availability status: %s '
                     'on subcloud: %s' %
                     (avail_to_set, subcloud_name))
            self._update_subcloud_availability(
                subcloud_name,
                availability_status=avail_to_set,
                audit_fail_count=audit_fail_count)

        elif audit_fail_count != subcloud.audit_fail_count:
            self._update_subcloud_availability(
                subcloud_name,
                availability_status=None,
                audit_fail_count=audit_fail_count)

        elif update_subcloud_state:
            # Nothing has changed, but we want to send a state update for
            # this subcloud as an audit.
            LOG.debug('Updating subcloud state unconditionally for subcloud %s'
                      % subcloud_name)
            self._update_subcloud_availability(
                subcloud_name,
                availability_status=avail_status_current,
                update_state_only=True)

        # If the subcloud is managed and online, audit additional resources.
        if (subcloud.management_state == consts.MANAGEMENT_MANAGED and
                avail_to_set == consts.AVAILABILITY_ONLINE):
            # Get the alarm summary and store it in the db.
            if fm_client:
                self.alarm_aggr.update_alarm_summary(subcloud_name, fm_client)

            # If we have patch audit data, audit the subcloud.
            if do_patch_audit and patch_audit_data:
                self.patch_audit.subcloud_patch_audit(subcloud_name,
                                                      patch_audit_data,
                                                      do_load_audit)

            # Perform the firmware audit.
            if do_firmware_audit:
                self.firmware_audit.subcloud_firmware_audit(
                    subcloud_name, firmware_audit_data)

            # Audit the openstack application in the subcloud.
            if do_audit_openstack and sysinv_client:
                self._audit_subcloud_openstack_app(
                    subcloud_name, sysinv_client, subcloud.openstack_installed)