
This commit introduces DC Kubernetes upgrade orchestration. - Added a new kubernetes endpoint for DC audit The audit verifies that the kubernetes active-target-version in the subcloud matches that of the system controller. - Wired the kubernetes strategy to the kube orch thread - Added protections to orchestrator shutdown for when 'start' encounters a problem. - Added 'kubernetes' as a valid strategy type for: create / abort / delete / show - Updated the endpoint flag file name to trigger updating the new endpoint - Added a kubernetes orch thread similar to the sw upgrade orch thread with these states: - updating kube patches - creating vim patch strategy - applying vim patch strategy - deleting vim patch strategy - creating vim kube upgrade strategy - applying vim kube upgrade strategy Patch vim orchestration steps will be skipped if there are no patches that need to be applied in the subcloud. Change-Id: Ic508ed1a504dba177c100a9caf783e87bc10193a Story: 2008137 Task: 41177 Depends-On: https://review.opendev.org/c/starlingx/nfv/+/767421 Signed-off-by: albailey <al.bailey@windriver.com>
388 lines
17 KiB
Python
388 lines
17 KiB
Python
# Copyright 2017 Ericsson AB.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
# implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
# Copyright (c) 2017-2021 Wind River Systems, Inc.
|
|
#
|
|
# The right to copy, distribute, modify, or otherwise make use
|
|
# of this software may be licensed only pursuant to the terms
|
|
# of an applicable Wind River license agreement.
|
|
#
|
|
|
|
import os
|
|
|
|
from keystoneauth1 import exceptions as keystone_exceptions
|
|
from oslo_config import cfg
|
|
from oslo_log import log as logging
|
|
|
|
from dccommon import consts as dccommon_consts
|
|
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
|
|
|
|
from dcmanager.audit import alarm_aggregation
|
|
from dcmanager.audit import firmware_audit
|
|
from dcmanager.audit import kubernetes_audit
|
|
from dcmanager.audit import patch_audit
|
|
from dcmanager.audit.subcloud_audit_manager import HELM_APP_OPENSTACK
|
|
from dcmanager.common import consts
|
|
from dcmanager.common import context
|
|
from dcmanager.common import exceptions
|
|
from dcmanager.common.i18n import _
|
|
from dcmanager.common import manager
|
|
from dcmanager.common import scheduler
|
|
from dcmanager.db import api as db_api
|
|
from dcmanager.rpc import client as dcmanager_rpc_client
|
|
|
|
CONF = cfg.CONF
LOG = logging.getLogger(__name__)

# We will update the state of each subcloud in the dcorch about once per hour.
# Calculate how many iterations that will be.
# NOTE: this uses true division, so the result may be a float; consumers
# presumably compare an iteration counter against it rather than index
# with it — confirm before changing to floor division.
SUBCLOUD_STATE_UPDATE_ITERATIONS = \
    dccommon_consts.SECONDS_IN_HOUR / CONF.scheduler.subcloud_audit_interval
|
|
|
|
|
|
class SubcloudAuditWorkerManager(manager.Manager):
    """Manages tasks related to audits."""

    def __init__(self, *args, **kwargs):
        LOG.debug(_('SubcloudAuditWorkerManager initialization...'))

        super(SubcloudAuditWorkerManager, self).__init__(
            service_name="subcloud_audit_worker_manager")

        # Admin context and RPC handle shared by every audit helper below.
        self.context = context.get_admin_context()
        self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient()

        # Pool of greenthreads used to audit subclouds in parallel.
        self.thread_group_manager = scheduler.ThreadGroupManager(
            thread_pool_size=100)
        # Maps subcloud name -> greenthread currently auditing it.
        self.subcloud_workers = {}

        # Per-domain audit helpers, all sharing the same context/RPC client.
        self.alarm_aggr = alarm_aggregation.AlarmAggregation(self.context)
        self.patch_audit = patch_audit.PatchAudit(
            self.context, self.dcmanager_rpc_client)
        self.firmware_audit = firmware_audit.FirmwareAudit(
            self.context, self.dcmanager_rpc_client)
        self.kubernetes_audit = kubernetes_audit.KubernetesAudit(
            self.context, self.dcmanager_rpc_client)

        # Recorded once for log correlation across worker processes.
        self.pid = os.getpid()
|
|
|
|
def audit_subclouds(self, context, subcloud_ids, patch_audit_data,
|
|
firmware_audit_data, kubernetes_audit_data,
|
|
do_openstack_audit):
|
|
"""Run audits of the specified subcloud(s)"""
|
|
|
|
LOG.debug('PID: %s, subclouds to audit: %s, do_openstack_audit: %s' %
|
|
(self.pid, subcloud_ids, do_openstack_audit))
|
|
|
|
for subcloud_id in subcloud_ids:
|
|
# Retrieve the subcloud and subcloud audit info
|
|
try:
|
|
subcloud = db_api.subcloud_get(self.context, subcloud_id)
|
|
subcloud_audits = db_api.subcloud_audits_get_and_start_audit(
|
|
self.context, subcloud_id)
|
|
except exceptions.SubcloudNotFound:
|
|
# Possibility subcloud could have been deleted since the list of
|
|
# subclouds to audit was created.
|
|
LOG.info('Ignoring SubcloudNotFound when auditing subcloud %s' %
|
|
subcloud_id)
|
|
continue
|
|
|
|
LOG.debug("PID: %s, starting audit of subcloud: %s." %
|
|
(self.pid, subcloud.name))
|
|
|
|
# Include failure deploy status states in the auditable list
|
|
# so that the subcloud can be set as offline
|
|
if (subcloud.deploy_status not in
|
|
[consts.DEPLOY_STATE_DONE,
|
|
consts.DEPLOY_STATE_DEPLOYING,
|
|
consts.DEPLOY_STATE_DEPLOY_FAILED,
|
|
consts.DEPLOY_STATE_INSTALL_FAILED,
|
|
consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
|
|
consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
|
|
consts.DEPLOY_STATE_MIGRATED]):
|
|
LOG.debug("Skip subcloud %s audit, deploy_status: %s" %
|
|
(subcloud.name, subcloud.deploy_status))
|
|
# This DB API call will set the "audit_finished_at" timestamp
|
|
# so it won't get audited again for a while.
|
|
audits_done = []
|
|
db_api.subcloud_audits_end_audit(self.context,
|
|
subcloud_id, audits_done)
|
|
continue
|
|
|
|
# Check the per-subcloud audit flags
|
|
do_patch_audit = subcloud_audits.patch_audit_requested
|
|
do_load_audit = subcloud_audits.load_audit_requested
|
|
do_firmware_audit = subcloud_audits.firmware_audit_requested
|
|
do_kubernetes_audit = subcloud_audits.kubernetes_audit_requested
|
|
update_subcloud_state = subcloud_audits.state_update_requested
|
|
|
|
# Create a new greenthread for each subcloud to allow the audits
|
|
# to be done in parallel. If there are not enough greenthreads
|
|
# in the pool, this will block until one becomes available.
|
|
self.subcloud_workers[subcloud.name] = \
|
|
self.thread_group_manager.start(self._do_audit_subcloud,
|
|
subcloud,
|
|
update_subcloud_state,
|
|
do_openstack_audit,
|
|
patch_audit_data,
|
|
firmware_audit_data,
|
|
kubernetes_audit_data,
|
|
do_patch_audit,
|
|
do_load_audit,
|
|
do_firmware_audit,
|
|
do_kubernetes_audit)
|
|
|
|
def _update_subcloud_availability(self, subcloud_name,
|
|
availability_status=None,
|
|
update_state_only=False,
|
|
audit_fail_count=None):
|
|
try:
|
|
self.dcmanager_rpc_client.update_subcloud_availability(
|
|
self.context, subcloud_name, availability_status,
|
|
update_state_only, audit_fail_count)
|
|
LOG.info('Notifying dcmanager, subcloud:%s, availability:%s' %
|
|
(subcloud_name,
|
|
availability_status))
|
|
except Exception:
|
|
LOG.exception('Problem informing dcmanager of subcloud '
|
|
'availability state change, subcloud: %s'
|
|
% subcloud_name)
|
|
|
|
@staticmethod
|
|
def _get_subcloud_availability_status(subcloud_name, sysinv_client):
|
|
"""For each subcloud, if at least one service is active in each
|
|
|
|
service of servicegroup-list then declare the subcloud online.
|
|
"""
|
|
avail_to_set = consts.AVAILABILITY_OFFLINE
|
|
svc_groups = None
|
|
|
|
# get a list of service groups in the subcloud
|
|
try:
|
|
svc_groups = sysinv_client.get_service_groups()
|
|
except Exception as e:
|
|
LOG.warn('Cannot retrieve service groups for '
|
|
'subcloud: %s, %s' % (subcloud_name, e))
|
|
|
|
if svc_groups:
|
|
active_sgs = []
|
|
inactive_sgs = []
|
|
|
|
# Build 2 lists, 1 of active service groups,
|
|
# one with non-active.
|
|
for sg in svc_groups:
|
|
if sg.state != consts.SERVICE_GROUP_STATUS_ACTIVE:
|
|
inactive_sgs.append(sg.service_group_name)
|
|
else:
|
|
active_sgs.append(sg.service_group_name)
|
|
|
|
# Create a list of service groups that are only present
|
|
# in non-active list
|
|
inactive_only = [sg for sg in inactive_sgs if
|
|
sg not in active_sgs]
|
|
|
|
# An empty inactive only list and a non-empty active list
|
|
# means we're good to go.
|
|
if not inactive_only and active_sgs:
|
|
avail_to_set = \
|
|
consts.AVAILABILITY_ONLINE
|
|
else:
|
|
LOG.info("Subcloud:%s has non-active "
|
|
"service groups: %s" %
|
|
(subcloud_name, inactive_only))
|
|
return avail_to_set
|
|
|
|
def _audit_subcloud_openstack_app(self, subcloud_name, sysinv_client,
|
|
openstack_installed):
|
|
openstack_installed_current = False
|
|
# get a list of installed apps in the subcloud
|
|
try:
|
|
apps = sysinv_client.get_applications()
|
|
except Exception:
|
|
LOG.exception('Cannot retrieve installed apps for subcloud:%s'
|
|
% subcloud_name)
|
|
return
|
|
|
|
for app in apps:
|
|
if app.name == HELM_APP_OPENSTACK and app.active:
|
|
# audit find openstack app is installed and active in
|
|
# the subcloud
|
|
openstack_installed_current = True
|
|
break
|
|
|
|
endpoint_type_list = dccommon_consts.ENDPOINT_TYPES_LIST_OS
|
|
if openstack_installed_current and not openstack_installed:
|
|
self.dcmanager_rpc_client.update_subcloud_sync_endpoint_type(
|
|
self.context,
|
|
subcloud_name,
|
|
endpoint_type_list,
|
|
openstack_installed_current)
|
|
elif not openstack_installed_current and openstack_installed:
|
|
self.dcmanager_rpc_client.update_subcloud_sync_endpoint_type(
|
|
self.context,
|
|
subcloud_name,
|
|
endpoint_type_list,
|
|
openstack_installed_current)
|
|
|
|
def _do_audit_subcloud(self,
|
|
subcloud,
|
|
update_subcloud_state,
|
|
do_audit_openstack,
|
|
patch_audit_data,
|
|
firmware_audit_data,
|
|
kubernetes_audit_data,
|
|
do_patch_audit,
|
|
do_load_audit,
|
|
do_firmware_audit,
|
|
do_kubernetes_audit):
|
|
audits_done = []
|
|
# Do the actual subcloud audit.
|
|
try:
|
|
audits_done = self._audit_subcloud(
|
|
subcloud, update_subcloud_state, do_audit_openstack,
|
|
patch_audit_data, firmware_audit_data, kubernetes_audit_data,
|
|
do_patch_audit,
|
|
do_load_audit, do_firmware_audit, do_kubernetes_audit)
|
|
except Exception:
|
|
LOG.exception("Got exception auditing subcloud: %s" % subcloud.name)
|
|
|
|
# Update the audit completion timestamp so it doesn't get
|
|
# audited again for a while.
|
|
db_api.subcloud_audits_end_audit(self.context,
|
|
subcloud.id, audits_done)
|
|
# Remove the worker for this subcloud
|
|
self.subcloud_workers.pop(subcloud.name, None)
|
|
LOG.debug("PID: %s, done auditing subcloud: %s." %
|
|
(self.pid, subcloud.name))
|
|
|
|
    def _audit_subcloud(self, subcloud, update_subcloud_state,
                        do_audit_openstack, patch_audit_data,
                        firmware_audit_data, kubernetes_audit_data,
                        do_patch_audit, do_load_audit, do_firmware_audit,
                        do_kubernetes_audit):
        """Audit a single subcloud.

        Determines the subcloud's availability from its service groups,
        applies the fail-count debounce before declaring it offline,
        notifies dcmanager of any state change, and — when the subcloud
        is managed and online — runs the requested sub-audits (alarms,
        patch/load, firmware, kubernetes, openstack app).

        Returns the list of audit names that were actually performed,
        which the caller records via subcloud_audits_end_audit.
        """

        avail_status_current = subcloud.availability_status
        audit_fail_count = subcloud.audit_fail_count
        subcloud_name = subcloud.name
        audits_done = []

        # Set defaults to None and disabled so we will still set disabled
        # status if we encounter an error.

        sysinv_client = None
        fm_client = None
        avail_to_set = consts.AVAILABILITY_OFFLINE

        # Build clients for the subcloud region.  Any failure here leaves
        # sysinv_client as None so the subcloud is treated as offline.
        try:
            os_client = OpenStackDriver(region_name=subcloud_name,
                                        thread_name='subcloud-audit')
            sysinv_client = os_client.sysinv_client
            fm_client = os_client.fm_client
        except (keystone_exceptions.EndpointNotFound,
                keystone_exceptions.ConnectFailure,
                keystone_exceptions.ConnectTimeout,
                IndexError):
            if avail_status_current == consts.AVAILABILITY_OFFLINE:
                # Already offline: nothing new to report, bail out early.
                LOG.info("Identity or Platform endpoint for %s not "
                         "found, ignoring for offline "
                         "subcloud." % subcloud_name)
                return audits_done
            else:
                # The subcloud will be marked as offline below.
                LOG.error("Identity or Platform endpoint for online "
                          "subcloud: %s not found." % subcloud_name)

        except Exception:
            LOG.exception("Failed to get OS Client for subcloud: %s"
                          % subcloud_name)

        # Check availability of the subcloud
        if sysinv_client:
            avail_to_set = self._get_subcloud_availability_status(
                subcloud_name, sysinv_client)

        if avail_to_set == consts.AVAILABILITY_OFFLINE:
            # Count the failure, capped at AVAIL_FAIL_COUNT_MAX.
            if audit_fail_count < consts.AVAIL_FAIL_COUNT_MAX:
                audit_fail_count = audit_fail_count + 1
            if (avail_status_current == consts.AVAILABILITY_ONLINE) and \
                    (audit_fail_count < consts.AVAIL_FAIL_COUNT_TO_ALARM):
                # Do not set offline until we have failed audit
                # the requisite number of times
                avail_to_set = consts.AVAILABILITY_ONLINE
        else:
            # In the case of a one off blip, we may need to set the
            # fail count back to 0
            audit_fail_count = 0

        if avail_to_set != avail_status_current:

            if avail_to_set == consts.AVAILABILITY_ONLINE:
                audit_fail_count = 0

            LOG.info('Setting new availability status: %s '
                     'on subcloud: %s' %
                     (avail_to_set, subcloud_name))
            self._update_subcloud_availability(
                subcloud_name,
                availability_status=avail_to_set,
                audit_fail_count=audit_fail_count)

        elif audit_fail_count != subcloud.audit_fail_count:
            # Status unchanged but the fail count moved; persist it
            # without changing the availability status.
            self._update_subcloud_availability(
                subcloud_name,
                availability_status=None,
                audit_fail_count=audit_fail_count)

        elif update_subcloud_state:
            # Nothing has changed, but we want to send a state update for this
            # subcloud as an audit.
            LOG.debug('Updating subcloud state unconditionally for subcloud %s'
                      % subcloud_name)
            self._update_subcloud_availability(
                subcloud_name,
                availability_status=avail_status_current,
                update_state_only=True)

        # If subcloud is managed and online, audit additional resources
        if (subcloud.management_state == consts.MANAGEMENT_MANAGED and
                avail_to_set == consts.AVAILABILITY_ONLINE):
            # Get alarm summary and store in db,
            if fm_client:
                self.alarm_aggr.update_alarm_summary(subcloud_name, fm_client)

            # If we have patch audit data, audit the subcloud
            if do_patch_audit and patch_audit_data:
                self.patch_audit.subcloud_patch_audit(subcloud_name,
                                                      patch_audit_data,
                                                      do_load_audit)
                audits_done.append('patch')
                # The load audit is bundled into the patch audit call above.
                if do_load_audit:
                    audits_done.append('load')
            # Perform firmware audit
            if do_firmware_audit:
                self.firmware_audit.subcloud_firmware_audit(subcloud_name,
                                                            firmware_audit_data)
                audits_done.append('firmware')
            # Perform kubernetes audit
            if do_kubernetes_audit:
                self.kubernetes_audit.subcloud_kubernetes_audit(
                    subcloud_name,
                    kubernetes_audit_data)
                audits_done.append('kubernetes')
            # Audit openstack application in the subcloud
            if do_audit_openstack and sysinv_client:
                self._audit_subcloud_openstack_app(
                    subcloud_name, sysinv_client, subcloud.openstack_installed)
        return audits_done