distcloud/distributedcloud/dcmanager/audit/subcloud_audit_manager.py
Hugo Brito c2c7ab93ef Update endpoint caches post network reconfig
After switching the management network to the admin network,
we need to update the service endpoint caches of all
dcmanager, dcorch and cert-mon workers with the new admin values.

The fanout parameter is added to the cast calls of the
RPC clients (dcorch and audit); it makes the method cast to
all servers listening on a topic rather than to just one of
them.
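
As a reference, a minimal sketch of a fanout cast with
oslo.messaging (topic, method name and arguments here are
illustrative, not the actual dcmanager client code):

    import oslo_messaging
    from oslo_config import cfg

    transport = oslo_messaging.get_rpc_transport(cfg.CONF)
    target = oslo_messaging.Target(topic='dcmanager-audit-worker')
    client = oslo_messaging.RPCClient(transport, target)

    ctxt = {}  # request context (dict or oslo context object)
    # fanout=True delivers the cast to every server listening on
    # the topic, not just one of them
    client.prepare(fanout=True).cast(
        ctxt, 'update_subcloud_endpoints',
        subcloud_name='subcloud1', endpoints={'identity': 'http://...'})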

Test Plan:
PASS: dcmanager subcloud update (using admin network parameters)
1. Endpoints for the subcloud updated with admin ip value
2. subcloud availability = online
PASS: Verify that the subcloud is online shortly after successful
completion of the subcloud_update playbook
PASS: Verify that the service endpoints are updated in all workers'
endpoint caches for the subcloud
PASS: Manage the subcloud and verify that both dcmanager and dcorch
audits are working as expected
PASS: Perform an Identity sync:
1. openstack --os-region-name SystemController user create <new_user>
--domain <domain> --project <project> --password <password>
2. Log in to the subcloud and verify the new user: openstack user list
PASS: Verify that the master token is refreshed successfully after an hour

Story: 2010319
Task: 47556

Depends-On: https://review.opendev.org/c/starlingx/config/+/877323

Signed-off-by: Hugo Brito <hugo.brito@windriver.com>
Change-Id: I149c864382b7c63d424f736bdb4eaac2a787b709
2023-03-14 11:44:15 -03:00


# Copyright 2017 Ericsson AB.
# Copyright (c) 2017-2023 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import eventlet
import os
import time
from tsconfig.tsconfig import CONFIG_PATH
from oslo_config import cfg
from oslo_log import log as logging
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack import sysinv_v1
from dcmanager.audit import firmware_audit
from dcmanager.audit import kube_rootca_update_audit
from dcmanager.audit import kubernetes_audit
from dcmanager.audit import patch_audit
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
from dcmanager.audit import utils as audit_utils
from dcmanager.common import context
from dcmanager.common.i18n import _
from dcmanager.common import manager
from dcmanager.db import api as db_api

CONF = cfg.CONF
LOG = logging.getLogger(__name__)

# We will update the state of each subcloud in the dcorch about once per hour.
# Calculate how many iterations that will be.
SUBCLOUD_STATE_UPDATE_ITERATIONS = \
dccommon_consts.SECONDS_IN_HOUR // CONF.scheduler.subcloud_audit_interval
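# For example, with a 30-second subcloud_audit_interval this is
# 3600 // 30 = 120 audit iterations between state updates.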
# Patch audit normally happens every CONF.scheduler.patch_audit_interval
# seconds, but can be forced to happen on the next audit interval by calling
# trigger_patch_audit.

# Name of starlingx openstack helm application
HELM_APP_OPENSTACK = 'openstack'

# Every 4 audits triggers a kubernetes audit
KUBERNETES_AUDIT_RATE = 4

# Every 4 audits triggers a kube rootca update audit
KUBE_ROOTCA_UPDATE_AUDIT_RATE = 4
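# Note: both rates are counted in patch-audit cycles (see
# _get_audits_needed), not in base subcloud_audit_interval ticks.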


class SubcloudAuditManager(manager.Manager):
    """Manages tasks related to audits."""

# Used to force patch audit on the next interval
force_patch_audit = False
# Used to force firmware audit on the next interval
force_firmware_audit = False
# Used to force kube rootca update audit on the next interval
force_kube_rootca_update_audit = False
# Used to force kubernetes audit on the next interval
force_kubernetes_audit = False

    def __init__(self, *args, **kwargs):
LOG.debug(_('SubcloudAuditManager initialization...'))
super(SubcloudAuditManager, self).__init__(
service_name="subcloud_audit_manager")
self.context = context.get_admin_context()
        self.audit_worker_rpc_client = \
            dcmanager_audit_rpc_client.ManagerAuditWorkerClient()
# Number of audits since last subcloud state update
self.audit_count = SUBCLOUD_STATE_UPDATE_ITERATIONS - 2
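        # Starting two below the threshold means the first dcorch state
        # update fires on the second audit cycle after startup rather
        # than a full hour later.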
# Number of patch audits
self.patch_audit_count = 0
self.patch_audit = patch_audit.PatchAudit(
self.context, None)
# trigger a patch audit on startup
self.patch_audit_time = 0
self.firmware_audit = firmware_audit.FirmwareAudit(
self.context, None)
self.kubernetes_audit = kubernetes_audit.KubernetesAudit(
self.context, None)
self.kube_rootca_update_audit = \
kube_rootca_update_audit.KubeRootcaUpdateAudit(self.context, None)

    def _add_missing_endpoints(self):
# Update this flag file based on the most recent new endpoint
file_path = os.path.join(CONFIG_PATH,
'.kube_rootca_update_endpoint_added')
# If file exists on the controller, all the endpoints have been
# added to DB since last time an endpoint was added
if not os.path.isfile(file_path):
# Ensures all endpoints exist for all subclouds
# If the endpoint doesn't exist, an entry will be made
# in endpoint_status table
for subcloud in db_api.subcloud_get_all(self.context):
subcloud_statuses = \
db_api.subcloud_status_get_all(self.context,
subcloud.id)
# Use set difference to find missing endpoints
endpoint_type_set = set(dccommon_consts.ENDPOINT_TYPES_LIST)
subcloud_set = set()
for subcloud_status in subcloud_statuses:
subcloud_set.add(subcloud_status.endpoint_type)
missing_endpoints = list(endpoint_type_set - subcloud_set)
for endpoint in missing_endpoints:
db_api.subcloud_status_create(self.context,
subcloud.id,
endpoint)
# Add a flag on a replicated filesystem to avoid re-running
# the DB checks for missing subcloud endpoints
open(file_path, 'w').close()

    @classmethod
    def trigger_firmware_audit(cls, context):
        """Trigger firmware audit at next interval.

        This can be called from outside the dcmanager audit
        """
cls.force_firmware_audit = True

    @classmethod
    def reset_force_firmware_audit(cls):
cls.force_firmware_audit = False

    @classmethod
    def trigger_kubernetes_audit(cls, context):
        """Trigger kubernetes audit at next interval.

        This can be called from outside the dcmanager audit
        """
cls.force_kubernetes_audit = True

    @classmethod
    def reset_force_kubernetes_audit(cls):
cls.force_kubernetes_audit = False

    @classmethod
    def trigger_kube_rootca_update_audit(cls, context):
        """Trigger kubernetes rootca update audit at next interval.

        This can be called from outside the dcmanager audit
        """
cls.force_kube_rootca_update_audit = True

    @classmethod
    def reset_force_kube_rootca_update_audit(cls):
cls.force_kube_rootca_update_audit = False

    @classmethod
    def trigger_patch_audit(cls, context):
        """Trigger patch audit at next interval.

        This can be called from outside the dcmanager audit
        """
cls.force_patch_audit = True

    @classmethod
    def trigger_load_audit(cls, context):
"""Trigger load audit of all subclouds at next audit."""
audit_utils.request_subcloud_audits(context, audit_load=True)

    @classmethod
    def reset_force_patch_audit(cls):
cls.force_patch_audit = False

    def trigger_subcloud_audits(self, context, subcloud_id, exclude_endpoints):
"""Trigger all subcloud audits for one subcloud."""
values = {
'patch_audit_requested': True,
'firmware_audit_requested': True,
'load_audit_requested': True,
'kubernetes_audit_requested': True,
'kube_rootca_update_audit_requested': True,
}
        # For any endpoints excluded from the audit, set the corresponding
        # request flag to False in the DB to disable that audit explicitly.
if exclude_endpoints:
for exclude_endpoint in exclude_endpoints:
exclude_request = dccommon_consts.ENDPOINT_AUDIT_REQUESTS.get(
exclude_endpoint)
if exclude_request:
values.update({exclude_request: False})
db_api.subcloud_audits_update(context, subcloud_id, values)

    def trigger_subcloud_patch_load_audits(self, context, subcloud_id):
"""Trigger subcloud patch and load audits for one subcloud."""
values = {
"patch_audit_requested": True,
"load_audit_requested": True,
}
db_api.subcloud_audits_update(context, subcloud_id, values)

    def trigger_subcloud_endpoints_update(self, context, subcloud_name, endpoints):
"""Trigger update endpoints of services for a subcloud region."""
self.audit_worker_rpc_client.update_subcloud_endpoints(
context, subcloud_name, endpoints)

    def periodic_subcloud_audit(self):
"""Audit availability of subclouds."""
# Verify subclouds have all the endpoints in DB
self._add_missing_endpoints()
# For any subclouds that were in the middle of being audited
# when dcmanager-audit was shut down, fix up the timestamps so we'll
# audit them and request all sub-audits.
# (This is for swact and process restart.)
db_api.subcloud_audits_fix_expired_audits(
self.context, datetime.datetime.utcnow(), trigger_audits=True)
# Blanket catch all exceptions in the audit so that the audit
# does not die.
while True:
try:
eventlet.greenthread.sleep(
CONF.scheduler.subcloud_audit_interval)
self._periodic_subcloud_audit_loop()
except eventlet.greenlet.GreenletExit:
# We have been told to exit
return
except Exception:
LOG.exception("Error in periodic subcloud audit loop")

    def _get_audits_needed(self):
"""Returns which (if any) extra audits are needed."""
audit_patch = False
audit_load = False
audit_firmware = False
audit_kubernetes = False
audit_kube_rootca_updates = False
current_time = time.time()
# Determine whether to trigger a patch audit of each subcloud
if (SubcloudAuditManager.force_patch_audit or
(current_time - self.patch_audit_time >=
CONF.scheduler.patch_audit_interval)):
LOG.info("Trigger patch audit")
audit_patch = True
self.patch_audit_time = current_time
self.patch_audit_count += 1
# Check subcloud software version every other patch audit cycle
if (self.patch_audit_count % 2 != 0 or
SubcloudAuditManager.force_patch_audit):
LOG.info("Trigger load audit")
audit_load = True
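            # "% 4 == 1" fires on the 1st, 5th, 9th, ... patch audit, so
            # the first patch audit after startup also triggers the
            # firmware/kubernetes/rootca sub-audits below.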
if (self.patch_audit_count % 4 == 1):
LOG.info("Trigger firmware audit")
audit_firmware = True
# Reset force_firmware_audit only when firmware audit has been fired
SubcloudAuditManager.reset_force_firmware_audit()
if (self.patch_audit_count % KUBERNETES_AUDIT_RATE == 1):
LOG.info("Trigger kubernetes audit")
audit_kubernetes = True
# Reset force_kubernetes_audit only when kubernetes audit has been fired
SubcloudAuditManager.reset_force_kubernetes_audit()
if (self.patch_audit_count % KUBE_ROOTCA_UPDATE_AUDIT_RATE == 1):
LOG.info("Trigger kube rootca update audit")
audit_kube_rootca_updates = True
# Reset force_kube_rootca_update_audit only if audit is fired
SubcloudAuditManager.reset_force_kube_rootca_update_audit()
# the force_patch_audit flag is also used to evaluate audit_load
# so reset it here, even if it is not set
SubcloudAuditManager.reset_force_patch_audit()
# Trigger a firmware audit as it is changed through proxy
if SubcloudAuditManager.force_firmware_audit:
LOG.info("Trigger firmware audit")
audit_firmware = True
SubcloudAuditManager.reset_force_firmware_audit()
# Trigger a kubernetes audit as it is changed through proxy
if SubcloudAuditManager.force_kubernetes_audit:
LOG.info("Trigger kubernetes audit")
audit_kubernetes = True
SubcloudAuditManager.reset_force_kubernetes_audit()
# Trigger a kube rootca update audit as it is changed through proxy
if SubcloudAuditManager.force_kube_rootca_update_audit:
LOG.info("Trigger kube rootca update audit")
audit_kube_rootca_updates = True
SubcloudAuditManager.reset_force_kube_rootca_update_audit()
return (audit_patch, audit_load, audit_firmware,
audit_kubernetes, audit_kube_rootca_updates)

    def _get_audit_data(self,
audit_patch,
audit_firmware,
audit_kubernetes,
audit_kube_rootca_updates):
"""Return the patch / firmware / kubernetes audit data as needed."""
patch_audit_data = None
firmware_audit_data = None
kubernetes_audit_data = None
kube_rootca_update_audit_data = None
if audit_patch:
# Query RegionOne patches and software version
patch_audit_data = self.patch_audit.get_regionone_audit_data()
if audit_firmware:
# Query RegionOne firmware
firmware_audit_data = self.firmware_audit.get_regionone_audit_data()
if audit_kubernetes:
# Query RegionOne kubernetes version info
kubernetes_audit_data = self.kubernetes_audit.get_regionone_audit_data()
if audit_kube_rootca_updates:
# Query RegionOne kube rootca update info
kube_rootca_update_audit_data = \
self.kube_rootca_update_audit.get_regionone_audit_data()
return (patch_audit_data, firmware_audit_data,
kubernetes_audit_data, kube_rootca_update_audit_data)

    def _periodic_subcloud_audit_loop(self):
"""Audit availability of subclouds loop."""
# We will be running in our own green thread here.
LOG.debug('Triggered subcloud audit.')
self.audit_count += 1
# Determine whether to trigger a state update to each subcloud.
if self.audit_count >= SUBCLOUD_STATE_UPDATE_ITERATIONS:
update_subcloud_state = True
self.audit_count = 0
else:
update_subcloud_state = False
# Determine whether we want to trigger specialty audits.
(audit_patch, audit_load, audit_firmware,
audit_kubernetes,
audit_kube_rootca_update) = self._get_audits_needed()
# Set desired audit flags for all subclouds.
audit_utils.request_subcloud_audits(
self.context, update_subcloud_state, audit_patch, audit_load,
audit_firmware, audit_kubernetes, audit_kube_rootca_update)
do_openstack_audit = False
        # The feature of syncing openstack resources to the subclouds was
        # not completed; therefore, auditing the openstack application is
        # disabled.
# Determine whether OpenStack is installed in central cloud
# os_client = OpenStackDriver(region_name=consts.DEFAULT_REGION_NAME,
# thread_name='dcmanager-audit')
# sysinv_client = os_client.sysinv_client
# This could be optimized in the future by attempting to get just the
# one application. However, sysinv currently treats this as a failure
# if the application is not installed and generates warning logs, so it
# would require changes to handle this gracefully.
# apps = sysinv_client.get_applications()
# for app in apps:
# if app.name.endswith(HELM_APP_OPENSTACK):
# do_openstack_audit = True
# break
current_time = datetime.datetime.utcnow()
last_audit_threshold = current_time - datetime.timedelta(
seconds=CONF.scheduler.subcloud_audit_interval)
        # The sysinv and patching subcloud REST API timeouts are 600 seconds
        # and we need to be greater than that, so let's go with that plus an
        # extra audit interval.
last_audit_fixup_threshold = current_time - datetime.timedelta(
seconds=(sysinv_v1.SYSINV_CLIENT_REST_DEFAULT_TIMEOUT +
CONF.scheduler.subcloud_audit_interval))
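        # e.g. a 600 s REST timeout plus a 30 s audit interval puts the
        # fixup threshold 630 s behind current_time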
# Fix up any stale audit timestamps for subclouds that started an
# audit but never finished it.
start = datetime.datetime.utcnow()
num_fixed = db_api.subcloud_audits_fix_expired_audits(
self.context, last_audit_fixup_threshold)
end = datetime.datetime.utcnow()
if num_fixed > 0:
LOG.info('Fixed up subcloud audit timestamp for %s subclouds.' % num_fixed)
LOG.info('Fixup took %s seconds' % (end - start))
subcloud_ids = []
subcloud_audits = db_api.subcloud_audits_get_all_need_audit(
self.context, last_audit_threshold)
# Now check whether any of these subclouds need patch audit or firmware
# audit data and grab it if needed.
if not audit_patch:
for audit in subcloud_audits:
# Currently the load audit is done as part of the patch audit.
# It might make sense to split it out.
if audit.patch_audit_requested or audit.load_audit_requested:
audit_patch = True
LOG.debug("DB says patch audit needed")
break
if not audit_firmware:
for audit in subcloud_audits:
if audit.firmware_audit_requested:
LOG.debug("DB says firmware audit needed")
audit_firmware = True
break
if not audit_kubernetes:
for audit in subcloud_audits:
if audit.kubernetes_audit_requested:
LOG.debug("DB says kubernetes audit needed")
audit_kubernetes = True
break
if not audit_kube_rootca_update:
for audit in subcloud_audits:
if audit.kube_rootca_update_audit_requested:
LOG.debug("DB says kube-rootca-update audit needed")
audit_kube_rootca_update = True
break
LOG.info("Triggered subcloud audit: patch=(%s) firmware=(%s) "
"kube=(%s) kube-rootca=(%s)"
% (audit_patch, audit_firmware,
audit_kubernetes, audit_kube_rootca_update))
(patch_audit_data, firmware_audit_data,
kubernetes_audit_data, kube_rootca_update_audit_data) = \
self._get_audit_data(audit_patch,
audit_firmware,
audit_kubernetes,
audit_kube_rootca_update)
LOG.debug("patch_audit_data: %s, "
"firmware_audit_data: %s, "
"kubernetes_audit_data: %s, "
"kube_rootca_update_audit_data: : %s, "
% (patch_audit_data,
firmware_audit_data,
kubernetes_audit_data,
kube_rootca_update_audit_data))
# We want a chunksize of at least 1 so add the number of workers.
        chunksize = ((len(subcloud_audits) + CONF.audit_worker_workers) //
                     CONF.audit_worker_workers)
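        # e.g. 25 subclouds and 4 audit workers: (25 + 4) // 4 = 7 per batch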
for audit in subcloud_audits:
subcloud_ids.append(audit.subcloud_id)
if len(subcloud_ids) == chunksize:
# We've gathered a batch of subclouds, send it for processing.
self.audit_worker_rpc_client.audit_subclouds(
self.context,
subcloud_ids,
patch_audit_data,
firmware_audit_data,
kubernetes_audit_data,
do_openstack_audit,
kube_rootca_update_audit_data)
LOG.debug('Sent subcloud audit request message for subclouds: %s' % subcloud_ids)
subcloud_ids = []
if len(subcloud_ids) > 0:
# We've got a partial batch...send it off for processing.
self.audit_worker_rpc_client.audit_subclouds(
self.context,
subcloud_ids,
patch_audit_data,
firmware_audit_data,
kubernetes_audit_data,
do_openstack_audit,
kube_rootca_update_audit_data)
LOG.debug('Sent final subcloud audit request message for subclouds: %s' % subcloud_ids)
else:
LOG.debug('Done sending audit request messages.')