Move subcloud audit to separate process

Remove the subcloud audit from the dcmanager-manager process.
Create a dcmanager-audit process and associated files.
Add new RPC calls for dcmanager-audit to notify dcmanager of
subcloud availability and sync endpoint type changes.
Update dcmanager to handle availability and sync endpoint
type updates from dcmanager-audit.
Reduce the subcloud audit interval to 20 seconds.
Create/update unit tests to verify the implementation
changes.

Story: 2007267
Task: 39637

Change-Id: Iff408166753f22ce3616d34e267ca1155ac43042
Signed-off-by: Tao Liu <tao.liu@windriver.com>
Tao Liu
2020-05-05 08:59:59 -05:00
parent d46516c46d
commit 5c8377047b
25 changed files with 1398 additions and 563 deletions
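
For orientation, the new flow is: dcmanager-audit decides a subcloud's availability, then notifies dcmanager over RPC, and dcmanager persists the change and informs dcorch. Below is a minimal, self-contained sketch of that hand-off; the class and method names mirror the diff, but the in-process call is a stand-in for the real oslo.messaging transport (topics "dcmanager" and "dcmanager-audit").

# Toy model of the audit -> manager notification path. The direct method
# call below stands in for an oslo.messaging RPC over the "dcmanager" topic.

class SubcloudManager(object):
    def update_subcloud_availability(self, subcloud_name, availability_status,
                                     update_state_only=False,
                                     audit_fail_count=None):
        # The real code updates the DB, raises/clears the offline alarm and
        # notifies dcorch; here we just record the request.
        print('manager: %s -> %s (state_only=%s, fail_count=%s)' %
              (subcloud_name, availability_status, update_state_only,
               audit_fail_count))

class ManagerClient(object):
    """Stand-in for dcmanager.rpc.client.ManagerClient."""
    def __init__(self, manager):
        self.manager = manager  # the real client holds an RPC transport

    def update_subcloud_availability(self, subcloud_name, availability_status,
                                     update_state_only=False,
                                     audit_fail_count=None):
        self.manager.update_subcloud_availability(
            subcloud_name, availability_status, update_state_only,
            audit_fail_count)

# dcmanager-audit side: after an audit pass, push the result to dcmanager.
client = ManagerClient(SubcloudManager())
client.update_subcloud_availability('subcloud1', 'online', audit_fail_count=0)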

View File

@@ -28,6 +28,7 @@ Source9: dcmanager.conf
Source10: dcorch.conf
Source11: dcdbsync.conf
Source12: clean-dcorch
Source13: dcmanager-audit.service
BuildArch: noarch
@@ -130,6 +131,7 @@ install -d -m 755 %{buildroot}%{_sysconfdir}/dcmanager/
# install systemd unit files
install -p -D -m 644 %{SOURCE1} %{buildroot}%{_unitdir}/dcmanager-api.service
install -p -D -m 644 %{SOURCE2} %{buildroot}%{_unitdir}/dcmanager-manager.service
install -p -D -m 644 %{SOURCE13} %{buildroot}%{_unitdir}/dcmanager-audit.service
install -p -D -m 644 %{SOURCE9} %{buildroot}%{_tmpfilesdir}
# install default config files
cd %{_builddir}/%{pypi_name}-%{version} && oslo-config-generator --config-file ./dcmanager/config-generator.conf --output-file %{_builddir}/%{pypi_name}-%{version}%{_sysconfdir}/dcmanager/dcmanager.conf.sample
@@ -185,6 +187,8 @@ install -m 755 -D -p %{SOURCE12} %{buildroot}/%{_bindir}/clean-dcorch
%exclude %{python2_sitelib}/dcmanager/tests
%{_bindir}/dcmanager-api
%{_unitdir}/dcmanager-api.service
%{_bindir}/dcmanager-audit
%{_unitdir}/dcmanager-audit.service
%{_bindir}/dcmanager-manager
%{_unitdir}/dcmanager-manager.service
%{_bindir}/dcmanager-manage

View File

@@ -0,0 +1,12 @@
[Unit]
Description=DC Manager Audit Service
After=syslog-ng.service network-online.target dcmanager-manager.service

[Service]
Type=simple
User=root
ExecStart=/usr/bin/dcmanager-audit --config-file /etc/dcmanager/dcmanager.conf
Restart=on-failure

[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,93 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
import six
from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging
from oslo_service import service
from dcmanager.audit.subcloud_audit_manager import SubcloudAuditManager
from dcmanager.common import consts
from dcmanager.common.i18n import _
from dcmanager.common import messaging as rpc_messaging
from dcmanager.common import scheduler
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
class DCManagerAuditService(service.Service):
"""Lifecycle manager for a running audit service."""
def __init__(self):
super(DCManagerAuditService, self).__init__()
self.host = cfg.CONF.host
# To be used by the sw update manager to trigger the patch audit
self.rpc_api_version = consts.RPC_API_VERSION
self.topic = consts.TOPIC_DC_MANAGER_AUDIT
# The following are initialized here, but assigned in start() which
# happens after the fork when spawning multiple worker processes
self.TG = None
self.target = None
self._rpc_server = None
self.subcloud_audit_manager = None
def start(self):
self.init_tgm()
self.init_audit_managers()
target = oslo_messaging.Target(version=self.rpc_api_version,
server=self.host,
topic=self.topic)
self.target = target
self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
self._rpc_server.start()
super(DCManagerAuditService, self).start()
def init_tgm(self):
self.TG = scheduler.ThreadGroupManager()
def init_audit_managers(self):
self.subcloud_audit_manager = SubcloudAuditManager()
# Audit availability of all subclouds.
# Note this will run in a separate green thread
self.TG.start(self.subcloud_audit_manager.periodic_subcloud_audit)
def _stop_rpc_server(self):
# Stop RPC connection to prevent new requests
LOG.debug(_("Attempting to stop engine service..."))
try:
self._rpc_server.stop()
self._rpc_server.wait()
LOG.info('Engine service stopped successfully')
except Exception as ex:
LOG.error('Failed to stop engine service: %s',
six.text_type(ex))
def stop(self):
self._stop_rpc_server()
self.TG.stop()
# Terminate the engine process
LOG.info("All threads were gone, terminating engine")
super(DCManagerAuditService, self).stop()

View File

@@ -0,0 +1,339 @@
# Copyright 2017 Ericsson AB.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2017-2020 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
import eventlet
from keystoneauth1 import exceptions as keystone_exceptions
from oslo_config import cfg
from oslo_log import log as logging
from sysinv.common import constants as sysinv_constants
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dcmanager.audit import alarm_aggregation
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import manager
from dcmanager.common import scheduler
from dcmanager.db import api as db_api
from dcmanager.rpc import client as dcmanager_rpc_client
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
# We will update the state of each subcloud in the dcorch about once per hour.
# Calculate how many iterations that will be.
SUBCLOUD_STATE_UPDATE_ITERATIONS = \
dccommon_consts.SECONDS_IN_HOUR / CONF.scheduler.subcloud_audit_interval
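# With the new default interval of 20 seconds this works out to
# 3600 / 20 = 180 audit iterations between state refreshes.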
class SubcloudAuditManager(manager.Manager):
"""Manages tasks related to audits."""
def __init__(self, *args, **kwargs):
LOG.debug(_('SubcloudAuditManager initialization...'))
super(SubcloudAuditManager, self).__init__(
service_name="subcloud_audit_manager")
self.context = context.get_admin_context()
self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient()
# Keeps track of greenthreads we create to do work.
self.thread_group_manager = scheduler.ThreadGroupManager(
thread_pool_size=100)
# Track workers created for each subcloud.
self.subcloud_workers = dict()
# Number of audits since last subcloud state update
self.audit_count = 0
self.alarm_aggr = alarm_aggregation.AlarmAggregation(self.context)
def periodic_subcloud_audit(self):
"""Audit availability of subclouds."""
# Blanket catch all exceptions in the audit so that the audit
# does not die.
while True:
try:
eventlet.greenthread.sleep(
CONF.scheduler.subcloud_audit_interval)
self._periodic_subcloud_audit_loop()
except eventlet.greenlet.GreenletExit:
# We have been told to exit
return
except Exception:
LOG.exception("Error in periodic subcloud audit loop")
def _periodic_subcloud_audit_loop(self):
"""Audit availability of subclouds loop."""
# We will be running in our own green thread here.
LOG.info('Triggered subcloud audit.')
self.audit_count += 1
# Determine whether to trigger a state update to each subcloud
if self.audit_count >= SUBCLOUD_STATE_UPDATE_ITERATIONS:
update_subcloud_state = True
else:
update_subcloud_state = False
openstack_installed = False
# The feature of syncing openstack resources to the subclouds was not
# completed; therefore, auditing the openstack application is disabled.
# Determine whether OpenStack is installed in central cloud
# os_client = OpenStackDriver(region_name=consts.DEFAULT_REGION_NAME,
# thread_name='dcmanager-audit')
# sysinv_client = os_client.sysinv_client
# This could be optimized in the future by attempting to get just the
# one application. However, sysinv currently treats this as a failure
# if the application is not installed and generates warning logs, so it
# would require changes to handle this gracefully.
# apps = sysinv_client.get_applications()
# for app in apps:
# if app.name == sysinv_constants.HELM_APP_OPENSTACK and app.active:
# openstack_installed = True
# break
for subcloud in db_api.subcloud_get_all(self.context):
if (subcloud.deploy_status not in
[consts.DEPLOY_STATE_DONE,
consts.DEPLOY_STATE_DEPLOYING,
consts.DEPLOY_STATE_DEPLOY_FAILED]):
LOG.debug("Skip subcloud %s audit, deploy_status: %s" %
(subcloud.name, subcloud.deploy_status))
continue
# Create a new greenthread for each subcloud to allow the audits
# to be done in parallel. If there are not enough greenthreads
# in the pool, this will block until one becomes available.
self.subcloud_workers[subcloud.name] = \
self.thread_group_manager.start(self._audit_subcloud,
subcloud.name,
update_subcloud_state,
openstack_installed)
# Wait for all greenthreads to complete
LOG.info('Waiting for subcloud audits to complete.')
for thread in self.subcloud_workers.values():
thread.wait()
# Clear the list of workers before next audit
self.subcloud_workers = dict()
LOG.info('All subcloud audits have completed.')
def _update_subcloud_availability(self, subcloud_name,
availability_status=None,
update_state_only=False,
audit_fail_count=None):
try:
self.dcmanager_rpc_client.update_subcloud_availability(
self.context, subcloud_name, availability_status,
update_state_only, audit_fail_count)
LOG.info('Notifying dcmanager, subcloud:%s, availability:%s' %
(subcloud_name,
availability_status))
except Exception:
LOG.exception('Problem informing dcmanager of subcloud '
'availability state change, subcloud: %s'
% subcloud_name)
@staticmethod
def _get_subcloud_availability_status(subcloud_name, sysinv_client):
"""For each subcloud, if at least one service is active in each
service of servicegroup-list then declare the subcloud online.
"""
avail_to_set = consts.AVAILABILITY_OFFLINE
svc_groups = None
# get a list of service groups in the subcloud
try:
svc_groups = sysinv_client.get_service_groups()
except Exception as e:
LOG.warn('Cannot retrieve service groups for '
'subcloud: %s, %s' % (subcloud_name, e))
if svc_groups:
active_sgs = []
inactive_sgs = []
# Build 2 lists, 1 of active service groups,
# one with non-active.
for sg in svc_groups:
if sg.state != consts.SERVICE_GROUP_STATUS_ACTIVE:
inactive_sgs.append(sg.service_group_name)
else:
active_sgs.append(sg.service_group_name)
# Create a list of service groups that are only present
# in non-active list
inactive_only = [sg for sg in inactive_sgs if
sg not in active_sgs]
# An empty inactive only list and a non-empty active list
# means we're good to go.
if not inactive_only and active_sgs:
avail_to_set = \
consts.AVAILABILITY_ONLINE
else:
LOG.info("Subcloud:%s has non-active "
"service groups: %s" %
(subcloud_name, inactive_only))
return avail_to_set
def _audit_subcloud_openstack_app(self, subcloud_name, sysinv_client,
openstack_installed):
openstack_installed_current = False
# get a list of installed apps in the subcloud
try:
apps = sysinv_client.get_applications()
except Exception:
LOG.exception('Cannot retrieve installed apps for subcloud:%s'
% subcloud_name)
return
for app in apps:
if app.name == sysinv_constants.HELM_APP_OPENSTACK \
and app.active:
# the audit found the openstack app installed and active in
# the subcloud
openstack_installed_current = True
break
endpoint_type_list = dccommon_consts.ENDPOINT_TYPES_LIST_OS
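# Only notify dcmanager when the installed state transitions; if the
# current state matches what is recorded, neither branch below fires.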
if openstack_installed_current and not openstack_installed:
self.dcmanager_rpc_client.update_subcloud_sync_endpoint_type(
self.context,
subcloud_name,
endpoint_type_list,
openstack_installed_current)
elif not openstack_installed_current and openstack_installed:
self.dcmanager_rpc_client.update_subcloud_sync_endpoint_type(
self.context,
subcloud_name,
endpoint_type_list,
openstack_installed_current)
def _audit_subcloud(self, subcloud_name, update_subcloud_state,
audit_openstack):
"""Audit a single subcloud."""
# Retrieve the subcloud
try:
subcloud = db_api.subcloud_get_by_name(self.context, subcloud_name)
except exceptions.SubcloudNotFound:
# Possibility subcloud could have been deleted since the list of
# subclouds to audit was created.
LOG.info('Ignoring SubcloudNotFound when auditing subcloud %s' %
subcloud_name)
return
avail_status_current = subcloud.availability_status
audit_fail_count = subcloud.audit_fail_count
# Set defaults to None and disabled so we will still set disabled
# status if we encounter an error.
sysinv_client = None
fm_client = None
avail_to_set = consts.AVAILABILITY_OFFLINE
try:
os_client = OpenStackDriver(region_name=subcloud_name,
thread_name='subcloud-audit')
sysinv_client = os_client.sysinv_client
fm_client = os_client.fm_client
except (keystone_exceptions.EndpointNotFound,
keystone_exceptions.ConnectFailure,
keystone_exceptions.ConnectTimeout,
IndexError):
if avail_status_current == consts.AVAILABILITY_OFFLINE:
LOG.info("Identity or Platform endpoint for %s not "
"found, ignoring for offline "
"subcloud." % subcloud_name)
return
else:
# The subcloud will be marked as offline below.
LOG.error("Identity or Platform endpoint for online "
"subcloud: %s not found." % subcloud_name)
except Exception:
LOG.exception("Failed to get OS Client for subcloud: %s"
% subcloud_name)
# Check availability of the subcloud
if sysinv_client:
avail_to_set = self._get_subcloud_availability_status(
subcloud_name, sysinv_client)
if avail_to_set == consts.AVAILABILITY_OFFLINE:
if audit_fail_count < consts.AVAIL_FAIL_COUNT_MAX:
audit_fail_count = audit_fail_count + 1
if (avail_status_current == consts.AVAILABILITY_ONLINE) and \
(audit_fail_count < consts.AVAIL_FAIL_COUNT_TO_ALARM):
# Do not set offline until we have failed audit
# the requisite number of times
avail_to_set = consts.AVAILABILITY_ONLINE
else:
# In the case of a one-off blip, we may need to set the
# fail count back to 0
audit_fail_count = 0
if avail_to_set != avail_status_current:
if avail_to_set == consts.AVAILABILITY_ONLINE:
audit_fail_count = 0
LOG.info('Setting new availability status: %s '
'on subcloud: %s' %
(avail_to_set, subcloud_name))
self._update_subcloud_availability(
subcloud_name,
availability_status=avail_to_set,
audit_fail_count=audit_fail_count)
elif audit_fail_count != subcloud.audit_fail_count:
self._update_subcloud_availability(
subcloud_name,
availability_status=None,
audit_fail_count=audit_fail_count)
elif update_subcloud_state:
# Nothing has changed, but we want to send a state update for this
# subcloud as an audit.
self._update_subcloud_availability(
subcloud_name,
availability_status=avail_status_current,
update_state_only=True)
self.audit_count = 0
if avail_to_set == consts.AVAILABILITY_ONLINE:
# If subcloud is online, get alarm summary and store in db.
if fm_client:
self.alarm_aggr.update_alarm_summary(subcloud_name, fm_client)
# Audit openstack application in the subcloud
if audit_openstack and sysinv_client:
self._audit_subcloud_openstack_app(
subcloud_name, sysinv_client, subcloud.openstack_installed)
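
To make the availability rule above concrete, here is a small self-contained rendering of the same decision logic; the service-group names and states are invented for illustration:

# Illustrative data only; real state comes from sysinv get_service_groups().
from collections import namedtuple

ServiceGroup = namedtuple('ServiceGroup', ['service_group_name', 'state'])

svc_groups = [
    ServiceGroup('web-services', 'active'),
    ServiceGroup('web-services', 'standby'),      # second instance, not active
    ServiceGroup('storage-services', 'active'),
]

active_sgs = [sg.service_group_name for sg in svc_groups
              if sg.state == 'active']
inactive_sgs = [sg.service_group_name for sg in svc_groups
                if sg.state != 'active']

# A group only counts against the subcloud if no instance of it is active.
inactive_only = [sg for sg in inactive_sgs if sg not in active_sgs]

# Online requires some active groups and no group that is inactive everywhere.
print('online' if active_sgs and not inactive_only else 'offline')  # online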

View File

@@ -0,0 +1,63 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
"""
DC Manager Audit Service.
"""
import eventlet
eventlet.monkey_patch()
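# Note: monkey patching must happen before the oslo libraries below are
# imported, so that they pick up eventlet-friendly threading primitives.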
from oslo_config import cfg
from oslo_i18n import _lazy
from oslo_log import log as logging
from oslo_service import service
from dcmanager.common import config
from dcmanager.common import messaging
_lazy.enable_lazy()
config.register_options()
config.register_keystone_options()
LOG = logging.getLogger('dcmanager.audit')
CONF = cfg.CONF
def main():
logging.register_options(CONF)
CONF(project='dcmanager', prog='dcmanager-audit')
logging.setup(cfg.CONF, 'dcmanager-audit')
logging.set_defaults()
messaging.setup()
from dcmanager.audit import service as audit
srv = audit.DCManagerAuditService()
launcher = service.launch(cfg.CONF,
srv, workers=CONF.audit_workers)
LOG.info("Configuration:")
cfg.CONF.log_opt_values(LOG, logging.INFO)
launcher.wait()
if __name__ == '__main__':
main()

View File

@@ -11,7 +11,7 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2017 Wind River Systems, Inc.
# Copyright (c) 2017-2020 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
@@ -112,7 +112,7 @@ scheduler_opts = [
default=True,
help='boolean value for enable/disable periodic tasks'),
cfg.IntOpt('subcloud_audit_interval',
default=180,
default=20,
help='periodic time interval for subcloud audit'),
cfg.IntOpt('patch_audit_interval',
default=10,
@@ -122,6 +122,8 @@ scheduler_opts = [
common_opts = [
cfg.IntOpt('workers', default=1,
help='number of workers'),
cfg.IntOpt('audit_workers', default=1,
help='number of audit workers'),
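# Consumed by dcmanager-audit at startup:
# service.launch(cfg.CONF, srv, workers=CONF.audit_workers).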
cfg.StrOpt('host',
default='localhost',
help='hostname of the machine')

View File

@@ -23,6 +23,8 @@ RPC_API_VERSION = "1.0"
TOPIC_DC_MANAGER = "dcmanager"
TOPIC_DC_MANAGER_AUDIT = "dcmanager-audit"
PATCH_VAULT_DIR = "/opt/dc-vault/patches"
# Well known region names

View File

@@ -10,7 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2017 Wind River Systems, Inc.
# Copyright (c) 2017-2020 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms

View File

@@ -36,10 +36,10 @@ from dccommon.drivers.openstack import vim
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common import scheduler
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.manager.patch_audit_manager import PatchAuditManager
from dcmanager.manager import scheduler
LOG = logging.getLogger(__name__)

View File

@@ -34,9 +34,8 @@ from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import messaging as rpc_messaging
from dcmanager.common import scheduler
from dcmanager.manager.patch_audit_manager import PatchAuditManager
from dcmanager.manager import scheduler
from dcmanager.manager.subcloud_audit_manager import SubcloudAuditManager
from dcmanager.manager.subcloud_manager import SubcloudManager
from dcmanager.manager.sw_update_manager import SwUpdateManager
@@ -81,7 +80,6 @@ class DCManagerService(service.Service):
self.target = None
self._rpc_server = None
self.subcloud_manager = None
self.subcloud_audit_manager = None
self.sw_update_manager = None
self.patch_audit_manager = None
@@ -89,8 +87,6 @@ class DCManagerService(service.Service):
self.TG = scheduler.ThreadGroupManager()
def init_audit_managers(self):
self.subcloud_audit_manager = SubcloudAuditManager(
subcloud_manager=self.subcloud_manager)
self.patch_audit_manager = PatchAuditManager(
subcloud_manager=self.subcloud_manager)
@@ -116,18 +112,9 @@ class DCManagerService(service.Service):
super(DCManagerService, self).start()
if self.periodic_enable:
LOG.info("Adding periodic tasks for the manager to perform")
self.TG.add_timer(cfg.CONF.scheduler.subcloud_audit_interval,
self.subcloud_audit, initial_delay=10)
self.TG.add_timer(cfg.CONF.scheduler.patch_audit_interval,
self.patch_audit, initial_delay=60)
def subcloud_audit(self):
# Audit availability of all subclouds.
# Note this will run in a separate green thread
LOG.debug("Subcloud audit job started at: %s",
time.strftime("%c"))
self.subcloud_audit_manager.periodic_subcloud_audit()
def patch_audit(self):
# Audit patch status of all subclouds.
# Note this will run in a separate green thread
@@ -189,6 +176,32 @@ class DCManagerService(service.Service):
return
@request_context
def update_subcloud_availability(self, context,
subcloud_name,
availability_status,
update_state_only=False,
audit_fail_count=None):
# Updates subcloud availability
LOG.info("Handling update_subcloud_availability request for: %s" %
subcloud_name)
self.subcloud_manager.update_subcloud_availability(
context,
subcloud_name,
availability_status,
update_state_only,
audit_fail_count)
@request_context
def update_subcloud_sync_endpoint_type(self, context, subcloud_name,
endpoint_type_list,
openstack_installed):
# Updates subcloud sync endpoint type
LOG.info("Handling update_subcloud_sync_endpoint_type request for: %s"
% subcloud_name)
self.subcloud_manager.update_subcloud_sync_endpoint_type(
context, subcloud_name, endpoint_type_list, openstack_installed)
@request_context
def create_sw_update_strategy(self, context, payload):
# Creates a software update strategy

View File

@@ -1,422 +0,0 @@
# Copyright 2017 Ericsson AB.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2017-2020 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
from keystoneauth1 import exceptions as keystone_exceptions
from oslo_config import cfg
from oslo_log import log as logging
from fm_api import constants as fm_const
from fm_api import fm_api
from sysinv.common import constants as sysinv_constants
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dcorch.rpc import client as dcorch_rpc_client
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import manager
from dcmanager.db import api as db_api
from dcmanager.manager import alarm_aggregation
from dcmanager.manager import scheduler
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
# We will update the state of each subcloud in the dcorch about once per hour.
# Calculate how many iterations that will be.
SUBCLOUD_STATE_UPDATE_ITERATIONS = \
dccommon_consts.SECONDS_IN_HOUR / CONF.scheduler.subcloud_audit_interval
class SubcloudAuditManager(manager.Manager):
"""Manages tasks related to audits."""
def __init__(self, *args, **kwargs):
LOG.debug(_('SubcloudAuditManager initialization...'))
super(SubcloudAuditManager, self).__init__(
service_name="subcloud_audit_manager")
self.context = context.get_admin_context()
self.dcorch_rpc_client = dcorch_rpc_client.EngineClient()
self.fm_api = fm_api.FaultAPIs()
self.subcloud_manager = kwargs['subcloud_manager']
# Keeps track of greenthreads we create to do work.
self.thread_group_manager = scheduler.ThreadGroupManager(
thread_pool_size=100)
# Track workers created for each subcloud.
self.subcloud_workers = dict()
# Number of audits since last subcloud state update
self.audit_count = 0
self.alarm_aggr = alarm_aggregation.AlarmAggregation(self.context)
def periodic_subcloud_audit(self):
"""Audit availability of subclouds."""
# Blanket catch all exceptions in the audit so that the audit
# does not die.
try:
self._periodic_subcloud_audit_loop()
except Exception as e:
LOG.exception(e)
def _periodic_subcloud_audit_loop(self):
"""Audit availability of subclouds loop."""
# We will be running in our own green thread here.
LOG.info('Triggered subcloud audit.')
self.audit_count += 1
# Determine whether to trigger a state update to each subcloud
if self.audit_count >= SUBCLOUD_STATE_UPDATE_ITERATIONS:
update_subcloud_state = True
else:
update_subcloud_state = False
# Determine whether OpenStack is installed in central cloud
os_client = OpenStackDriver(region_name=consts.DEFAULT_REGION_NAME,
thread_name='dcmanager')
sysinv_client = os_client.sysinv_client
# This could be optimized in the future by attempting to get just the
# one application. However, sysinv currently treats this as a failure
# if the application is not installed and generates warning logs, so it
# would require changes to handle this gracefully.
apps = sysinv_client.get_applications()
openstack_installed = False
for app in apps:
if app.name == sysinv_constants.HELM_APP_OPENSTACK and app.active:
openstack_installed = True
break
for subcloud in db_api.subcloud_get_all(self.context):
if (subcloud.deploy_status not in
[consts.DEPLOY_STATE_DONE,
consts.DEPLOY_STATE_DEPLOYING,
consts.DEPLOY_STATE_DEPLOY_FAILED]):
LOG.debug("Skip subcloud %s audit, deploy_status: %s" %
(subcloud.name, subcloud.deploy_status))
continue
# Create a new greenthread for each subcloud to allow the audits
# to be done in parallel. If there are not enough greenthreads
# in the pool, this will block until one becomes available.
self.subcloud_workers[subcloud.name] = \
self.thread_group_manager.start(self._audit_subcloud,
subcloud.name,
update_subcloud_state,
openstack_installed)
# Wait for all greenthreads to complete
LOG.info('Waiting for subcloud audits to complete.')
for thread in self.subcloud_workers.values():
thread.wait()
# Clear the list of workers before next audit
self.subcloud_workers = dict()
LOG.info('All subcloud audits have completed.')
def _audit_subcloud(self, subcloud_name, update_subcloud_state,
audit_openstack):
"""Audit a single subcloud."""
# Retrieve the subcloud
try:
subcloud = db_api.subcloud_get_by_name(self.context, subcloud_name)
except exceptions.SubcloudNotFound:
# Possibility subcloud could have been deleted since the list of
# subclouds to audit was created.
LOG.info('Ignoring SubcloudNotFound when auditing subcloud %s' %
subcloud_name)
return
# For each subcloud, if at least one service is active in
# each service of servicegroup-list then declare the subcloud online.
subcloud_id = subcloud.id
avail_status_current = subcloud.availability_status
audit_fail_count = subcloud.audit_fail_count
# Set defaults to None and disabled so we will still set disabled
# status if we encounter an error.
sysinv_client = None
fm_client = None
svc_groups = None
avail_to_set = consts.AVAILABILITY_OFFLINE
try:
os_client = OpenStackDriver(region_name=subcloud_name,
thread_name='dcmanager')
sysinv_client = os_client.sysinv_client
fm_client = os_client.fm_client
except (keystone_exceptions.EndpointNotFound,
keystone_exceptions.ConnectFailure,
keystone_exceptions.ConnectTimeout,
IndexError):
if avail_status_current == consts.AVAILABILITY_OFFLINE:
LOG.info("Identity or Platform endpoint for %s not "
"found, ignoring for offline "
"subcloud." % subcloud_name)
return
else:
# The subcloud will be marked as offline below.
LOG.error("Identity or Platform endpoint for online "
"subcloud: %s not found." % subcloud_name)
except Exception as e:
LOG.exception(e)
if sysinv_client:
# get a list of service groups in the subcloud
try:
svc_groups = sysinv_client.get_service_groups()
except Exception as e:
svc_groups = None
LOG.warn('Cannot retrieve service groups for '
'subcloud: %s, %s' % (subcloud_name, e))
if svc_groups:
active_sgs = []
inactive_sgs = []
# Build 2 lists, 1 of active service groups,
# one with non-active.
for sg in svc_groups:
if sg.state != consts.SERVICE_GROUP_STATUS_ACTIVE:
inactive_sgs.append(sg.service_group_name)
else:
active_sgs.append(sg.service_group_name)
# Create a list of service groups that are only present
# in non-active list
inactive_only = [sg for sg in inactive_sgs if
sg not in active_sgs]
# An empty inactive only list and a non-empty active list
# means we're good to go.
if not inactive_only and active_sgs:
avail_to_set = \
consts.AVAILABILITY_ONLINE
else:
LOG.info("Subcloud:%s has non-active "
"service groups: %s" %
(subcloud_name, inactive_only))
if avail_to_set == consts.AVAILABILITY_OFFLINE:
if audit_fail_count < consts.AVAIL_FAIL_COUNT_MAX:
audit_fail_count = audit_fail_count + 1
if (avail_status_current == consts.AVAILABILITY_ONLINE) and \
(audit_fail_count < consts.AVAIL_FAIL_COUNT_TO_ALARM):
# Do not set offline until we have failed audit
# the requisite number of times
avail_to_set = consts.AVAILABILITY_ONLINE
else:
# In the case of a one off blip, we may need to set the
# fail count back to 0
audit_fail_count = 0
if avail_to_set != avail_status_current:
if avail_to_set == consts.AVAILABILITY_ONLINE:
audit_fail_count = 0
LOG.info('Setting new availability status: %s '
'on subcloud: %s' %
(avail_to_set, subcloud_name))
entity_instance_id = "subcloud=%s" % subcloud_name
fault = self.fm_api.get_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
entity_instance_id)
if fault and (avail_to_set == consts.AVAILABILITY_ONLINE):
try:
self.fm_api.clear_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
entity_instance_id)
except Exception as e:
LOG.exception(e)
elif not fault and \
(avail_to_set == consts.AVAILABILITY_OFFLINE):
try:
fault = fm_api.Fault(
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
alarm_state=fm_const.FM_ALARM_STATE_SET,
entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
entity_instance_id=entity_instance_id,
severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
reason_text=('%s is offline' % subcloud_name),
alarm_type=fm_const.FM_ALARM_TYPE_0,
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
proposed_repair_action="Wait for subcloud to "
"become online; if "
"problem persists contact "
"next level of support.",
service_affecting=True)
self.fm_api.set_fault(fault)
except Exception as e:
LOG.exception(e)
try:
updated_subcloud = db_api.subcloud_update(
self.context,
subcloud_id,
management_state=None,
availability_status=avail_to_set,
software_version=None,
description=None, location=None,
audit_fail_count=audit_fail_count)
except exceptions.SubcloudNotFound:
# slim possibility subcloud could have been deleted since
# we found it in db, ignore this benign error.
LOG.info('Ignoring SubcloudNotFound when attempting state'
' update: %s' % subcloud_name)
return
try:
self.dcorch_rpc_client.\
update_subcloud_states(self.context,
subcloud_name,
updated_subcloud.management_state,
avail_to_set)
LOG.info('Notifying dcorch, subcloud:%s management: %s, '
'availability:%s' %
(subcloud_name,
updated_subcloud.management_state,
avail_to_set))
except Exception as e:
LOG.exception(e)
LOG.warn('Problem informing dcorch of subcloud '
'state change, subcloud: %s' % subcloud_name)
if avail_to_set == consts.AVAILABILITY_OFFLINE:
# Subcloud is going offline, set all endpoint statuses to
# unknown.
try:
self.subcloud_manager.update_subcloud_endpoint_status(
self.context,
subcloud_name=subcloud_name,
endpoint_type=None,
sync_status=consts.SYNC_STATUS_UNKNOWN)
except exceptions.SubcloudNotFound:
LOG.info('Ignoring SubcloudNotFound when attempting '
'sync_status update: %s' % subcloud_name)
return
elif audit_fail_count != subcloud.audit_fail_count:
try:
db_api.subcloud_update(self.context, subcloud_id,
management_state=None,
availability_status=None,
software_version=None,
description=None, location=None,
audit_fail_count=audit_fail_count)
except exceptions.SubcloudNotFound:
# slim possibility subcloud could have been deleted since
# we found it in db, ignore this benign error.
LOG.info('Ignoring SubcloudNotFound when attempting '
'audit_fail_count update: %s' % subcloud_name)
return
elif update_subcloud_state:
# Nothing has changed, but we want to send a state update for this
# subcloud as an audit. Get the most up-to-date data.
subcloud = db_api.subcloud_get_by_name(self.context, subcloud_name)
self.dcorch_rpc_client. \
update_subcloud_states(self.context,
subcloud_name,
subcloud.management_state,
subcloud.availability_status)
self.audit_count = 0
# If subcloud is online, get alarm summary and store in db.
subcloud = db_api.subcloud_get_by_name(self.context, subcloud_name)
if (subcloud.availability_status == consts.AVAILABILITY_ONLINE) and \
fm_client:
self.alarm_aggr.update_alarm_summary(subcloud_name, fm_client)
# Audit openstack application in the subcloud
if audit_openstack and sysinv_client:
# get a list of installed apps in the subcloud
try:
apps = sysinv_client.get_applications()
except Exception as e:
LOG.warn('Cannot retrieve installed apps for '
'subcloud:%s, %s' % (subcloud_name, e))
return
openstack_installed = subcloud.openstack_installed
openstack_installed_current = False
for app in apps:
if app.name == sysinv_constants.HELM_APP_OPENSTACK\
and app.active:
# audit find openstack app is installed and active in
# the subcloud
openstack_installed_current = True
break
dcm_update_func = None
dco_update_func = None
if openstack_installed_current and not openstack_installed:
dcm_update_func = db_api.subcloud_status_create
# TODO(andy.ning): This RPC will block for the duration of the
# initial sync. It needs to be made non-blocking.
dco_update_func = self.dcorch_rpc_client.\
add_subcloud_sync_endpoint_type
elif not openstack_installed_current and openstack_installed:
dcm_update_func = db_api.subcloud_status_delete
dco_update_func = self.dcorch_rpc_client.\
remove_subcloud_sync_endpoint_type
if dcm_update_func and dco_update_func:
endpoint_type_list = dccommon_consts.ENDPOINT_TYPES_LIST_OS
try:
# Notify dcorch to add/remove sync endpoint type list
dco_update_func(self.context, subcloud_name,
endpoint_type_list)
LOG.info('Notifying dcorch, subcloud: %s new sync'
' endpoint: %s' % (subcloud_name,
endpoint_type_list))
# Update subcloud status table by adding/removing
# openstack sync endpoint types.
for endpoint_type in endpoint_type_list:
dcm_update_func(self.context, subcloud_id,
endpoint_type)
# Update openstack_installed of subcloud table
db_api.subcloud_update(
self.context, subcloud_id,
openstack_installed=openstack_installed_current)
except exceptions.SubcloudNotFound:
LOG.info('Ignoring SubcloudNotFound when attempting'
' openstack_installed update: %s'
% subcloud_name)
except Exception as e:
LOG.exception(e)
LOG.warn('Problem informing dcorch of subcloud '
'sync endpoint type change, subcloud: %s'
% subcloud_name)

View File

@@ -1041,3 +1041,153 @@ class SubcloudManager(manager.Manager):
self._update_subcloud_endpoint_status(
context, subcloud.name, endpoint_type, sync_status,
alarmable)
def _update_subcloud_state(self, context, subcloud_name,
management_state, availability_status):
try:
self.dcorch_rpc_client.update_subcloud_states(
context, subcloud_name, management_state, availability_status)
LOG.info('Notifying dcorch, subcloud:%s management: %s, '
'availability:%s' %
(subcloud_name,
management_state,
availability_status))
except Exception:
LOG.exception('Problem informing dcorch of subcloud state change,'
' subcloud: %s' % subcloud_name)
def _raise_or_clear_subcloud_status_alarm(self, subcloud_name,
availability_status):
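# Raise the offline alarm when an online subcloud goes offline and clear
# it when the subcloud comes back; get_fault() guards against duplicates.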
entity_instance_id = "subcloud=%s" % subcloud_name
fault = self.fm_api.get_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
entity_instance_id)
if fault and (availability_status == consts.AVAILABILITY_ONLINE):
try:
self.fm_api.clear_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
entity_instance_id)
except Exception:
LOG.exception("Failed to clear offline alarm for subcloud: %s",
subcloud_name)
elif not fault and \
(availability_status == consts.AVAILABILITY_OFFLINE):
try:
fault = fm_api.Fault(
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
alarm_state=fm_const.FM_ALARM_STATE_SET,
entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
entity_instance_id=entity_instance_id,
severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
reason_text=('%s is offline' % subcloud_name),
alarm_type=fm_const.FM_ALARM_TYPE_0,
probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
proposed_repair_action="Wait for subcloud to "
"become online; if "
"problem persists contact "
"next level of support.",
service_affecting=True)
self.fm_api.set_fault(fault)
except Exception:
LOG.exception("Failed to raise offline alarm for subcloud: %s",
subcloud_name)
def update_subcloud_availability(self, context, subcloud_name,
availability_status,
update_state_only=False,
audit_fail_count=None):
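# Three cases, matching the audit's three call sites: update_state_only
# resends the current state unchanged, a None availability_status only
# updates audit_fail_count, and anything else is a real state transition.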
try:
subcloud = db_api.subcloud_get_by_name(context, subcloud_name)
except Exception:
LOG.exception("Failed to get subcloud by name: %s" % subcloud_name)
# subcloud would be unbound below; bail out if the lookup failed.
return
if update_state_only:
# Nothing has changed, but we want to send a state update for this
# subcloud as an audit. Get the most up-to-date data.
self._update_subcloud_state(context, subcloud_name,
subcloud.management_state,
availability_status)
elif availability_status is None:
# only update the audit fail count
try:
db_api.subcloud_update(self.context, subcloud.id,
audit_fail_count=audit_fail_count)
except exceptions.SubcloudNotFound:
# slim possibility subcloud could have been deleted since
# we found it in db, ignore this benign error.
LOG.info('Ignoring SubcloudNotFound when attempting '
'audit_fail_count update: %s' % subcloud_name)
return
else:
self._raise_or_clear_subcloud_status_alarm(subcloud_name,
availability_status)
if availability_status == consts.AVAILABILITY_OFFLINE:
# Subcloud is going offline, set all endpoint statuses to
# unknown.
self._update_subcloud_endpoint_status(
context, subcloud_name, endpoint_type=None,
sync_status=consts.SYNC_STATUS_UNKNOWN)
try:
updated_subcloud = db_api.subcloud_update(
context,
subcloud.id,
availability_status=availability_status,
audit_fail_count=audit_fail_count)
except exceptions.SubcloudNotFound:
# slim possibility subcloud could have been deleted since
# we found it in db, ignore this benign error.
LOG.info('Ignoring SubcloudNotFound when attempting state'
' update: %s' % subcloud_name)
return
# Send dcorch a state update
self._update_subcloud_state(context, subcloud_name,
updated_subcloud.management_state,
availability_status)
def update_subcloud_sync_endpoint_type(self, context,
subcloud_name,
endpoint_type_list,
openstack_installed):
operation = 'add' if openstack_installed else 'remove'
func_switcher = {
'add': (
self.dcorch_rpc_client.add_subcloud_sync_endpoint_type,
db_api.subcloud_status_create
),
'remove': (
self.dcorch_rpc_client.remove_subcloud_sync_endpoint_type,
db_api.subcloud_status_delete
)
}
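# The table pairs the dcorch RPC with the matching subcloud_status DB
# helper so the add and remove paths stay symmetric.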
try:
subcloud = db_api.subcloud_get_by_name(context, subcloud_name)
except Exception:
LOG.exception("Failed to get subcloud by name: %s" % subcloud_name)
# subcloud would be unbound below; bail out if the lookup failed.
return
try:
# Notify dcorch to add/remove sync endpoint type list
func_switcher[operation][0](self.context, subcloud_name,
endpoint_type_list)
LOG.info('Notifying dcorch, subcloud: %s new sync endpoint: %s' %
(subcloud_name, endpoint_type_list))
# Update subcloud status table by adding/removing openstack sync
# endpoint types
for endpoint_type in endpoint_type_list:
func_switcher[operation][1](self.context, subcloud.id,
endpoint_type)
# Update openstack_installed of subcloud table
db_api.subcloud_update(self.context, subcloud.id,
openstack_installed=openstack_installed)
except Exception:
LOG.exception('Problem informing dcorch of subcloud sync endpoint'
' type change, subcloud: %s' % subcloud_name)

View File

@@ -30,9 +30,9 @@ from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common import scheduler
from dcmanager.db import api as db_api
from dcmanager.manager.patch_audit_manager import PatchAuditManager
from dcmanager.manager import scheduler
LOG = logging.getLogger(__name__)

View File

@@ -89,6 +89,31 @@ class ManagerClient(object):
endpoint_type=endpoint_type,
sync_status=sync_status))
def update_subcloud_availability(self, ctxt,
subcloud_name,
availability_status,
update_state_only=False,
audit_fail_count=None):
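# Blocking RPC: the audit waits until dcmanager has applied the
# availability change before continuing.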
return self.call(
ctxt,
self.make_msg('update_subcloud_availability',
subcloud_name=subcloud_name,
availability_status=availability_status,
update_state_only=update_state_only,
audit_fail_count=audit_fail_count))
def update_subcloud_sync_endpoint_type(self, ctxt,
subcloud_name,
endpoint_type_list,
openstack_installed):
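# Fire-and-forget RPC: a cast returns immediately, so the (potentially
# slow) endpoint sync setup does not block the audit.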
return self.cast(
ctxt,
self.make_msg('update_subcloud_sync_endpoint_type',
subcloud_name=subcloud_name,
endpoint_type_list=endpoint_type_list,
openstack_installed=openstack_installed))
def create_sw_update_strategy(self, ctxt, payload):
return self.call(ctxt, self.make_msg('create_sw_update_strategy',
payload=payload))

View File

@@ -20,8 +20,8 @@
import mock
from dccommon.drivers.openstack import sdk_platform as sdk
from dcmanager.audit import alarm_aggregation
from dcmanager.common import exceptions
from dcmanager.manager import alarm_aggregation
from dcmanager.tests import base
from dcmanager.tests import utils

View File

@@ -0,0 +1,45 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
from dcmanager.audit import service
from dcmanager.common import scheduler
from dcmanager.tests import base
from dcmanager.tests import utils
from oslo_config import cfg
CONF = cfg.CONF
class TestDCManagerAuditService(base.DCManagerTestCase):
def setUp(self):
super(TestDCManagerAuditService, self).setUp()
self.tenant_id = 'fake_admin'
self.thm = scheduler.ThreadGroupManager()
self.context = utils.dummy_context(user='test_user',
tenant=self.tenant_id)
self.service_obj = service.DCManagerAuditService()
def test_init(self):
self.assertEqual(self.service_obj.host, 'localhost')
self.assertEqual(self.service_obj.topic, 'dcmanager-audit')
def test_init_tgm(self):
self.service_obj.init_tgm()
self.assertIsNotNone(self.service_obj.TG)

View File

@@ -24,19 +24,19 @@ import sys
sys.modules['fm_core'] = mock.Mock()
from dccommon import consts as dccommon_consts
from dcmanager.audit import subcloud_audit_manager
from dcmanager.common import consts
from dcmanager.db.sqlalchemy import api as db_api
from dcmanager.manager import subcloud_audit_manager
from dcmanager.manager import subcloud_manager
# from dcmanager.manager import subcloud_manager
from dcmanager.tests import base
class FakeDCOrchAPI(object):
class FakeDCManagerAPI(object):
def __init__(self):
self.update_subcloud_states = mock.MagicMock()
self.add_subcloud_sync_endpoint_type = mock.MagicMock()
self.update_subcloud_availability = mock.MagicMock()
self.update_subcloud_sync_endpoint_type = mock.MagicMock()
class FakeAlarmAggregation(object):
@@ -201,11 +201,11 @@ class TestAuditManager(base.DCManagerTestCase):
def setUp(self):
super(TestAuditManager, self).setUp()
# Mock the DCOrch API
self.fake_dcorch_api = FakeDCOrchAPI()
p = mock.patch('dcorch.rpc.client.EngineClient')
self.mock_dcorch_api = p.start()
self.mock_dcorch_api.return_value = self.fake_dcorch_api
# Mock the DCManager API
self.fake_dcmanager_api = FakeDCManagerAPI()
p = mock.patch('dcmanager.rpc.client.ManagerClient')
self.mock_dcmanager_api = p.start()
self.mock_dcmanager_api.return_value = self.fake_dcmanager_api
self.addCleanup(p.stop)
# Mock the OpenStackDriver
@@ -250,63 +250,46 @@ class TestAuditManager(base.DCManagerTestCase):
return db_api.subcloud_create(ctxt, **values)
def test_init(self):
sm = subcloud_manager.SubcloudManager()
am = subcloud_audit_manager.SubcloudAuditManager(subcloud_manager=sm)
am = subcloud_audit_manager.SubcloudAuditManager()
self.assertIsNotNone(am)
self.assertEqual('subcloud_audit_manager', am.service_name)
self.assertEqual('localhost', am.host)
self.assertEqual(self.ctx, am.context)
def test_periodic_subcloud_audit(self):
mock_sm = mock.Mock()
am = subcloud_audit_manager.SubcloudAuditManager(
subcloud_manager=mock_sm)
am.periodic_subcloud_audit()
am = subcloud_audit_manager.SubcloudAuditManager()
am._periodic_subcloud_audit_loop()
def test_audit_subcloud_online(self):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
mock_sm = mock.Mock()
am = subcloud_audit_manager.SubcloudAuditManager(
subcloud_manager=mock_sm)
# No stx-openstack application
self.fake_openstack_client.sysinv_client.get_application_results = []
am = subcloud_audit_manager.SubcloudAuditManager()
# Audit the subcloud
am._audit_subcloud(subcloud.name, update_subcloud_state=False,
audit_openstack=False)
# Verify the subcloud was set to online
self.fake_dcorch_api.update_subcloud_states.assert_called_with(
mock.ANY, subcloud.name, consts.MANAGEMENT_UNMANAGED,
consts.AVAILABILITY_ONLINE)
self.fake_dcmanager_api.update_subcloud_availability.assert_called_with(
mock.ANY, subcloud.name, consts.AVAILABILITY_ONLINE,
False, 0)
# Verify the openstack endpoints were not added
self.fake_dcorch_api.add_subcloud_sync_endpoint_type.\
# Verify the openstack endpoints were not updated
self.fake_dcmanager_api.update_subcloud_sync_endpoint_type.\
assert_not_called()
# Verify the subcloud openstack_installed was not updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
self.assertEqual(updated_subcloud.openstack_installed, False)
# Verify alarm update is called
self.fake_alarm_aggr.update_alarm_summary.assert_called_with(
'subcloud1', self.fake_openstack_client.fm_client)
subcloud.name, self.fake_openstack_client.fm_client)
def test_audit_subcloud_online_no_change(self):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
mock_sm = mock.Mock()
am = subcloud_audit_manager.SubcloudAuditManager(
subcloud_manager=mock_sm)
# No stx-openstack application
self.fake_openstack_client.sysinv_client.get_application_results = []
am = subcloud_audit_manager.SubcloudAuditManager()
# Set the subcloud to online
db_api.subcloud_update(
@@ -318,15 +301,12 @@ class TestAuditManager(base.DCManagerTestCase):
audit_openstack=False)
# Verify the subcloud state was not updated
self.fake_dcorch_api.update_subcloud_states.assert_not_called()
# Verify the openstack endpoints were not added
self.fake_dcorch_api.add_subcloud_sync_endpoint_type.\
self.fake_dcmanager_api.update_subcloud_availability.\
assert_not_called()
# Verify the subcloud openstack_installed was not updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
self.assertEqual(updated_subcloud.openstack_installed, False)
# Verify the openstack endpoints were not added
self.fake_dcmanager_api.update_subcloud_sync_endpoint_type.\
assert_not_called()
# Verify alarm update is called
self.fake_alarm_aggr.update_alarm_summary.assert_called_with(
@@ -337,12 +317,7 @@ class TestAuditManager(base.DCManagerTestCase):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
mock_sm = mock.Mock()
am = subcloud_audit_manager.SubcloudAuditManager(
subcloud_manager=mock_sm)
# No stx-openstack application
self.fake_openstack_client.sysinv_client.get_application_results = []
am = subcloud_audit_manager.SubcloudAuditManager()
# Set the subcloud to online
db_api.subcloud_update(
@@ -354,30 +329,24 @@ class TestAuditManager(base.DCManagerTestCase):
audit_openstack=False)
# Verify the subcloud state was updated even though no change
self.fake_dcorch_api.update_subcloud_states.assert_called_with(
mock.ANY, 'subcloud1', consts.MANAGEMENT_UNMANAGED,
consts.AVAILABILITY_ONLINE)
self.fake_dcmanager_api.update_subcloud_availability.assert_called_with(
mock.ANY, subcloud.name, consts.AVAILABILITY_ONLINE,
True, None)
# Verify the openstack endpoints were not added
self.fake_dcorch_api.add_subcloud_sync_endpoint_type.\
# Verify the openstack endpoints were not updated
self.fake_dcmanager_api.update_subcloud_sync_endpoint_type.\
assert_not_called()
# Verify the subcloud openstack_installed was not updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
self.assertEqual(updated_subcloud.openstack_installed, False)
# Verify alarm update is called
self.fake_alarm_aggr.update_alarm_summary.assert_called_with(
'subcloud1', self.fake_openstack_client.fm_client)
def test_audit_subcloud_offline(self):
def test_audit_subcloud_go_offline(self):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
mock_sm = mock.Mock()
am = subcloud_audit_manager.SubcloudAuditManager(
subcloud_manager=mock_sm)
am = subcloud_audit_manager.SubcloudAuditManager()
# Set the subcloud to online
db_api.subcloud_update(
@@ -394,54 +363,152 @@ class TestAuditManager(base.DCManagerTestCase):
am._audit_subcloud(subcloud.name, update_subcloud_state=False,
audit_openstack=False)
# Verify the subcloud was not set to offline
self.fake_dcorch_api.update_subcloud_states.assert_not_called()
# Verify the audit fail count was updated
audit_fail_count = 1
self.fake_dcmanager_api.update_subcloud_availability.\
assert_called_with(mock.ANY, subcloud.name,
None, False, audit_fail_count)
# Verify the audit_fail_count was updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
self.assertEqual(updated_subcloud.audit_fail_count, 1)
db_api.subcloud_update(self.ctx, subcloud.id,
audit_fail_count=audit_fail_count)
# Audit the subcloud again
am._audit_subcloud(subcloud.name, update_subcloud_state=False,
audit_openstack=False)
audit_fail_count = audit_fail_count + 1
# Verify the subcloud was set to offline
self.fake_dcorch_api.update_subcloud_states.assert_called_with(
mock.ANY, 'subcloud1', consts.MANAGEMENT_UNMANAGED,
consts.AVAILABILITY_OFFLINE)
self.fake_dcmanager_api.update_subcloud_availability.\
assert_called_with(mock.ANY, subcloud.name,
consts.AVAILABILITY_OFFLINE, False,
audit_fail_count)
# Verify the subcloud availability was updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
self.assertEqual(updated_subcloud.availability_status,
consts.AVAILABILITY_OFFLINE)
# Verify alarm update is called
# Verify alarm update is called only once
self.fake_alarm_aggr.update_alarm_summary.assert_called_once_with(
'subcloud1', self.fake_openstack_client.fm_client)
def test_audit_subcloud_online_with_openstack(self):
subcloud.name, self.fake_openstack_client.fm_client)
def test_audit_subcloud_offline_no_change(self):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
mock_sm = mock.Mock()
am = subcloud_audit_manager.SubcloudAuditManager(
subcloud_manager=mock_sm)
am = subcloud_audit_manager.SubcloudAuditManager()
db_api.subcloud_update(self.ctx, subcloud.id,
audit_fail_count=consts.AVAIL_FAIL_COUNT_MAX)
# Mark a service group as inactive
self.fake_openstack_client.sysinv_client.get_service_groups_result = \
copy.deepcopy(FAKE_SERVICE_GROUPS)
self.fake_openstack_client.sysinv_client. \
get_service_groups_result[3].state = 'inactive'
# Audit the subcloud
am._audit_subcloud(subcloud.name, update_subcloud_state=False,
audit_openstack=True)
# Verify the subcloud was set to online
self.fake_dcorch_api.update_subcloud_states.assert_called_with(
mock.ANY, 'subcloud1', consts.MANAGEMENT_UNMANAGED,
consts.AVAILABILITY_ONLINE)
# Verify the subcloud state was not updated
self.fake_dcmanager_api.update_subcloud_availability.\
assert_not_called()
# Verify the openstack endpoints were not updated
self.fake_dcmanager_api.update_subcloud_sync_endpoint_type.\
assert_not_called()
# Verify alarm update is not called
self.fake_alarm_aggr.update_alarm_summary.assert_not_called()
def test_audit_subcloud_online_with_openstack_installed(self):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
am = subcloud_audit_manager.SubcloudAuditManager()
# Set the subcloud to online
db_api.subcloud_update(
self.ctx, subcloud.id,
availability_status=consts.AVAILABILITY_ONLINE)
# Audit the subcloud
am._audit_subcloud(subcloud.name, update_subcloud_state=False,
audit_openstack=True)
# Verify the subcloud state was not updated
self.fake_dcmanager_api.update_subcloud_availability.\
assert_not_called()
# Verify the openstack endpoints were added
self.fake_dcorch_api.add_subcloud_sync_endpoint_type.\
assert_called_with(mock.ANY, 'subcloud1',
dccommon_consts.ENDPOINT_TYPES_LIST_OS)
# self.fake_dcmanager_api.update_subcloud_sync_endpoint_type.\
# assert_called_with(mock.ANY, 'subcloud1',
# dccommon_consts.ENDPOINT_TYPES_LIST_OS,
# True)
# Verify the subcloud openstack_installed was updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
self.assertEqual(updated_subcloud.openstack_installed, True)
# Verify alarm update is called
self.fake_alarm_aggr.update_alarm_summary.assert_called_once_with(
'subcloud1', self.fake_openstack_client.fm_client)
def test_audit_subcloud_online_with_openstack_removed(self):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
am = subcloud_audit_manager.SubcloudAuditManager()
# Set the subcloud to online and openstack installed
db_api.subcloud_update(
self.ctx, subcloud.id,
availability_status=consts.AVAILABILITY_ONLINE,
openstack_installed=True)
# Remove stx-openstack application
FAKE_APPLICATIONS.pop(1)
# Audit the subcloud
am._audit_subcloud(subcloud.name, update_subcloud_state=False,
audit_openstack=True)
# Verify the subcloud state was not updated
self.fake_dcmanager_api.update_subcloud_availability.\
assert_not_called()
# Verify the openstack endpoints were removed
self.fake_dcmanager_api.update_subcloud_sync_endpoint_type.\
assert_called_with(mock.ANY, 'subcloud1',
dccommon_consts.ENDPOINT_TYPES_LIST_OS, False)
# Verify alarm update is called
self.fake_alarm_aggr.update_alarm_summary.assert_called_once_with(
'subcloud1', self.fake_openstack_client.fm_client)
def test_audit_subcloud_online_with_openstack_inactive(self):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
am = subcloud_audit_manager.SubcloudAuditManager()
# Set the subcloud to online and openstack installed
db_api.subcloud_update(
self.ctx, subcloud.id,
availability_status=consts.AVAILABILITY_ONLINE,
openstack_installed=True)
# stx-openstack application is not active
FAKE_APPLICATIONS[1].active = False
# Audit the subcloud
am._audit_subcloud(subcloud.name, update_subcloud_state=False,
audit_openstack=True)
# Verify the subcloud state was not updated
self.fake_dcmanager_api.update_subcloud_availability.\
assert_not_called()
# Verify the openstack endpoints were removed
self.fake_dcmanager_api.update_subcloud_sync_endpoint_type.\
assert_called_with(mock.ANY, 'subcloud1',
dccommon_consts.ENDPOINT_TYPES_LIST_OS, False)
# Verify alarm update is called
self.fake_alarm_aggr.update_alarm_summary.assert_called_once_with(
'subcloud1', self.fake_openstack_client.fm_client)

View File

@@ -22,7 +22,7 @@ import mock
import sys
sys.modules['fm_core'] = mock.Mock()
from dcmanager.manager import scheduler
from dcmanager.common import scheduler
from dcmanager.manager import service
from dcmanager.tests import base
from dcmanager.tests import utils
@@ -56,10 +56,8 @@ class TestDCManagerService(base.DCManagerTestCase):
self.service_obj.init_tgm()
self.assertIsNotNone(self.service_obj.TG)
@mock.patch.object(service, 'SubcloudAuditManager')
def test_init_audit_managers(self, mock_audit_manager):
def test_init_audit_managers(self):
self.service_obj.init_audit_managers()
self.assertIsNotNone(self.service_obj.subcloud_audit_manager)
self.assertIsNotNone(self.service_obj.patch_audit_manager)
@mock.patch.object(service, 'SwUpdateManager')
@@ -72,29 +70,16 @@ class TestDCManagerService(base.DCManagerTestCase):
@mock.patch.object(service, 'SwUpdateManager')
@mock.patch.object(service, 'SubcloudManager')
@mock.patch.object(service, 'SubcloudAuditManager')
@mock.patch.object(service, 'rpc_messaging')
def test_start(self, mock_rpc, mock_audit_manager, mock_subcloud_manager,
def test_start(self, mock_rpc, mock_subcloud_manager,
mock_sw_update_manager):
self.service_obj.start()
mock_rpc.get_rpc_server.assert_called_once_with(
self.service_obj.target, self.service_obj)
mock_rpc.get_rpc_server().start.assert_called_once_with()
@mock.patch.object(service, 'SubcloudAuditManager')
@mock.patch.object(service, 'PatchAuditManager')
def test_periodic_audit_subclouds(self, mock_patch_audit_manager,
mock_subcloud_audit_manager):
self.service_obj.init_tgm()
self.service_obj.init_audit_managers()
self.service_obj.subcloud_audit()
mock_subcloud_audit_manager().periodic_subcloud_audit.\
assert_called_once_with()
@mock.patch.object(service, 'SubcloudAuditManager')
@mock.patch.object(service, 'PatchAuditManager')
def test_periodic_audit_patches(self, mock_patch_audit_manager,
mock_subcloud_audit_manager):
def test_periodic_audit_patches(self, mock_patch_audit_manager):
self.service_obj.init_tgm()
self.service_obj.init_audit_managers()
self.service_obj.patch_audit()
@@ -137,20 +122,18 @@ class TestDCManagerService(base.DCManagerTestCase):
@mock.patch.object(service, 'SwUpdateManager')
@mock.patch.object(service, 'SubcloudManager')
@mock.patch.object(service, 'SubcloudAuditManager')
@mock.patch.object(service, 'rpc_messaging')
def test_stop_rpc_server(self, mock_rpc, mock_audit_manager,
mock_subcloud_manager, mock_sw_update_manager):
def test_stop_rpc_server(self, mock_rpc, mock_subcloud_manager,
mock_sw_update_manager):
self.service_obj.start()
self.service_obj._stop_rpc_server()
mock_rpc.get_rpc_server().stop.assert_called_once_with()
@mock.patch.object(service, 'SwUpdateManager')
@mock.patch.object(service, 'SubcloudManager')
@mock.patch.object(service, 'SubcloudAuditManager')
@mock.patch.object(service, 'rpc_messaging')
def test_stop(self, mock_rpc, mock_audit_manager,
mock_subcloud_manager, mock_sw_update_manager):
def test_stop(self, mock_rpc, mock_subcloud_manager,
mock_sw_update_manager):
self.service_obj.start()
self.service_obj.stop()
mock_rpc.get_rpc_server().stop.assert_called_once_with()
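
The signature changes in this file follow from how mock.patch.object injects arguments: each decorator supplies one positional mock, applied bottom-up, so dropping the SubcloudAuditManager patch removes exactly one parameter from each affected test. A small self-contained illustration:

import mock

class Widget(object):
    def ping(self):
        return 'real'

# Decorators apply bottom-up: the one closest to the function provides
# the first injected mock argument.
@mock.patch.object(Widget, 'ping')
def check(mock_ping):
    mock_ping.return_value = 'fake'
    assert Widget().ping() == 'fake'

check()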

View File

@@ -27,7 +27,9 @@ sys.modules['fm_core'] = mock.Mock()
import threading
from dccommon import consts as dccommon_consts
from dcmanager.common import consts
from dcmanager.common import exceptions
from dcmanager.db.sqlalchemy import api as db_api
from dcmanager.manager import subcloud_manager
from dcmanager.tests import base
@@ -39,6 +41,7 @@ class FakeDCOrchAPI(object):
def __init__(self):
self.update_subcloud_states = mock.MagicMock()
self.add_subcloud_sync_endpoint_type = mock.MagicMock()
self.remove_subcloud_sync_endpoint_type = mock.MagicMock()
self.del_subcloud = mock.MagicMock()
self.add_subcloud = mock.MagicMock()
@@ -443,3 +446,136 @@ class TestSubcloudManager(base.DCManagerTestCase):
self.assertIsNotNone(updated_subcloud_status)
self.assertEqual(updated_subcloud_status.sync_status,
consts.SYNC_STATUS_OUT_OF_SYNC)
def test_update_subcloud_availability_go_online(self):
# create a subcloud
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
self.assertEqual(subcloud.availability_status,
consts.AVAILABILITY_OFFLINE)
sm = subcloud_manager.SubcloudManager()
sm.update_subcloud_availability(self.ctx, subcloud.name,
consts.AVAILABILITY_ONLINE)
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
# Verify the subcloud was set to online
self.assertEqual(updated_subcloud.availability_status,
consts.AVAILABILITY_ONLINE)
# Verify notifying dcorch
self.fake_dcorch_api.update_subcloud_states.assert_called_once_with(
self.ctx, subcloud.name, updated_subcloud.management_state,
consts.AVAILABILITY_ONLINE)
def test_update_subcloud_availability_go_offline(self):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
# Set the subcloud to online/managed
db_api.subcloud_update(self.ctx, subcloud.id,
management_state=consts.MANAGEMENT_MANAGED,
availability_status=consts.AVAILABILITY_ONLINE)
sm = subcloud_manager.SubcloudManager()
# create sync statuses for endpoints and set them to in-sync
for endpoint in [dcorch_consts.ENDPOINT_TYPE_PLATFORM,
dcorch_consts.ENDPOINT_TYPE_IDENTITY,
dcorch_consts.ENDPOINT_TYPE_PATCHING,
dcorch_consts.ENDPOINT_TYPE_FM,
dcorch_consts.ENDPOINT_TYPE_NFV]:
db_api.subcloud_status_create(
self.ctx, subcloud.id, endpoint)
sm.update_subcloud_endpoint_status(
self.ctx, subcloud_name=subcloud.name,
endpoint_type=endpoint,
sync_status=consts.SYNC_STATUS_IN_SYNC)
# Audit fails once
audit_fail_count = 1
sm.update_subcloud_availability(self.ctx, subcloud.name,
availability_status=None,
audit_fail_count=audit_fail_count)
# Verify the subcloud availability was not updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
self.assertEqual(updated_subcloud.availability_status,
consts.AVAILABILITY_ONLINE)
# Verify dcorch was not notified
self.fake_dcorch_api.update_subcloud_states.assert_not_called()
# Verify the audit_fail_count was updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
self.assertEqual(updated_subcloud.audit_fail_count, audit_fail_count)
# Audit fails again
audit_fail_count = audit_fail_count + 1
sm.update_subcloud_availability(self.ctx, subcloud.name,
consts.AVAILABILITY_OFFLINE,
audit_fail_count=audit_fail_count)
# Verify the subcloud availability was updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, 'subcloud1')
self.assertEqual(updated_subcloud.availability_status,
consts.AVAILABILITY_OFFLINE)
# Verify notifying dcorch
self.fake_dcorch_api.update_subcloud_states.assert_called_once_with(
self.ctx, subcloud.name, updated_subcloud.management_state,
consts.AVAILABILITY_OFFLINE)
# Verify all endpoint statuses set to unknown
for subcloud, subcloud_status in db_api. \
subcloud_get_with_status(self.ctx, subcloud.id):
self.assertIsNotNone(subcloud_status)
self.assertEqual(subcloud_status.sync_status,
consts.SYNC_STATUS_UNKNOWN)
def test_update_subcloud_sync_endpoint_type(self):
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
self.assertIsNotNone(subcloud)
sm = subcloud_manager.SubcloudManager()
endpoint_type_list = dccommon_consts.ENDPOINT_TYPES_LIST_OS
# Test openstack app installed
openstack_installed = True
sm.update_subcloud_sync_endpoint_type(self.ctx, subcloud.name,
endpoint_type_list,
openstack_installed)
# Verify notifying dcorch to add subcloud sync endpoint type
self.fake_dcorch_api.add_subcloud_sync_endpoint_type.\
assert_called_once_with(self.ctx, subcloud.name,
endpoint_type_list)
# Verify the subcloud status created for os endpoints
for endpoint in endpoint_type_list:
subcloud_status = db_api.subcloud_status_get(
self.ctx, subcloud.id, endpoint)
self.assertIsNotNone(subcloud_status)
self.assertEqual(subcloud_status.sync_status,
consts.SYNC_STATUS_UNKNOWN)
# Verify the subcloud openstack_installed was updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, subcloud.name)
self.assertEqual(updated_subcloud.openstack_installed, True)
# Test openstack app removed
openstack_installed = False
sm.update_subcloud_sync_endpoint_type(self.ctx, subcloud.name,
endpoint_type_list,
openstack_installed)
# Verify notifying dcorch to remove subcloud sync endpoint type
self.fake_dcorch_api.remove_subcloud_sync_endpoint_type.\
assert_called_once_with(self.ctx, subcloud.name,
endpoint_type_list)
# Verify the subcloud status is deleted for os endpoints
for endpoint in endpoint_type_list:
self.assertRaises(exceptions.SubcloudStatusNotFound,
db_api.subcloud_status_get, self.ctx,
subcloud.id, endpoint)
# Verify the subcloud openstack_installed was updated
updated_subcloud = db_api.subcloud_get_by_name(self.ctx, subcloud.name)
self.assertEqual(updated_subcloud.openstack_installed, False)
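
test_update_subcloud_availability_go_offline pins down the intended contract: a single audit failure only records audit_fail_count (availability_status=None means "no transition yet"), while a later call that actually passes AVAILABILITY_OFFLINE flips the subcloud offline, resets every endpoint sync status to unknown, and notifies dcorch. A condensed sketch of the manager-side handler, with the endpoint-reset helper assumed for illustration:

# Sketch only: shaped by the behaviour the tests above assert; the real
# SubcloudManager method may be structured differently.
def update_subcloud_availability(self, context, subcloud_name,
                                 availability_status,
                                 audit_fail_count=None):
    subcloud = db_api.subcloud_get_by_name(context, subcloud_name)
    if availability_status is None:
        # Offline threshold not reached: remember the count, change nothing.
        db_api.subcloud_update(context, subcloud.id,
                               audit_fail_count=audit_fail_count)
        return
    db_api.subcloud_update(context, subcloud.id,
                           availability_status=availability_status,
                           audit_fail_count=audit_fail_count)
    if availability_status == consts.AVAILABILITY_OFFLINE:
        # Sync status cannot be trusted for an unreachable subcloud.
        self._set_all_endpoints_unknown(context, subcloud)  # hypothetical
    self.dcorch_rpc_client.update_subcloud_states(
        context, subcloud_name, subcloud.management_state,
        availability_status)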

View File

@@ -100,7 +100,7 @@ class EngineClient(object):
def add_subcloud_sync_endpoint_type(self, ctxt, subcloud_name,
endpoint_type_list):
return self.call(
return self.cast(
ctxt,
self.make_msg('add_subcloud_sync_endpoint_type',
subcloud_name=subcloud_name,
@@ -108,7 +108,7 @@ class EngineClient(object):
def remove_subcloud_sync_endpoint_type(self, ctxt, subcloud_name,
endpoint_type_list):
return self.call(
return self.cast(
ctxt,
self.make_msg('remove_subcloud_sync_endpoint_type',
subcloud_name=subcloud_name,
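
The two hunks above switch these notifications from call() to cast(). In oslo.messaging, call() blocks the client until the server method returns, propagating the result or exception, while cast() enqueues the request and returns immediately with no result. Since the caller only needs to notify the dcorch engine, cast() avoids stalling on it. A minimal illustration (the topic name is assumed):

from oslo_config import cfg
import oslo_messaging

transport = oslo_messaging.get_rpc_transport(cfg.CONF)
target = oslo_messaging.Target(topic='dcorch-engine')  # topic is assumed
client = oslo_messaging.RPCClient(transport, target)
ctxt = {}

# call(): synchronous round trip; waits for and returns the server's result.
result = client.call(ctxt, 'add_subcloud_sync_endpoint_type',
                     subcloud_name='subcloud1', endpoint_type_list=[])

# cast(): fire-and-forget; returns None immediately, so the caller never
# blocks on (or hears about) the server-side outcome.
client.cast(ctxt, 'add_subcloud_sync_endpoint_type',
            subcloud_name='subcloud1', endpoint_type_list=[])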

View File

@@ -0,0 +1,322 @@
#!/bin/sh
# OpenStack DC Manager Audit Service (dcmanager-audit)
#
# Description:
# Manages an OpenStack DC Manager Audit Service (dcmanager-audit)
# process as an HA resource
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# See usage() function below for more details ...
#
# OCF instance parameters:
# OCF_RESKEY_binary
# OCF_RESKEY_config
# OCF_RESKEY_user
# OCF_RESKEY_pid
# OCF_RESKEY_additional_parameters
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
#######################################################################
# Fill in some defaults if no values are specified
OCF_RESKEY_binary_default="/usr/bin/dcmanager-audit"
OCF_RESKEY_config_default="/etc/dcmanager/dcmanager.conf"
OCF_RESKEY_user_default="root"
OCF_RESKEY_pid_default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid"
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
#######################################################################
usage() {
cat <<UEND
usage: $0 (start|stop|validate-all|meta-data|status|monitor)
$0 manages an OpenStack DC Manager Audit service (dcmanager-audit) process as an HA resource
The 'start' operation starts the dcmanager-audit service.
The 'stop' operation stops the dcmanager-audit service.
The 'validate-all' operation reports whether the parameters are valid
The 'meta-data' operation reports this RA's meta-data information
The 'status' operation reports whether the dcmanager-audit service is running
The 'monitor' operation reports whether the dcmanager-audit service seems to be working
UEND
}
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="dcmanager-audit">
<version>1.0</version>
<longdesc lang="en">
Resource agent for the DC Manager Audit Service (dcmanager-audit)
</longdesc>
<shortdesc lang="en">Manages the OpenStack DC Manager Audit Service (dcmanager-audit)</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the DC Manager Audit Service binary (dcmanager-audit)
</longdesc>
<shortdesc lang="en">DC Manager Audit Service binary (dcmanager-audit)</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Location of the DC Manager Audit Service (dcmanager-audit) configuration file
</longdesc>
<shortdesc lang="en">DC Manager Audit Service (dcmanager-audit) config file</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running DC Manager Audit Service (dcmanager-audit)
</longdesc>
<shortdesc lang="en">DC Manager Audit Service (dcmanager-audit) user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>
<parameter name="pid" unique="0" required="0">
<longdesc lang="en">
The pid file to use for this DC Manager Audit Service (dcmanager-audit) instance
</longdesc>
<shortdesc lang="en">DC Manager Audit Service (dcmanager-audit) pid file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}" />
</parameter>
<parameter name="additional_parameters" unique="0" required="0">
<longdesc lang="en">
Additional parameters to pass on to dcmanager-audit
</longdesc>
<shortdesc lang="en">Additional parameters for dcmanager-audit</shortdesc>
<content type="string" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" timeout="10" interval="5" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
#######################################################################
# Functions invoked by resource manager actions
dcmanager_audit_validate() {
local rc
check_binary $OCF_RESKEY_binary
check_binary curl
check_binary tr
check_binary grep
check_binary cut
check_binary head
# A config file on shared storage that is not available
# during probes is OK.
if [ ! -f $OCF_RESKEY_config ]; then
if ! ocf_is_probe; then
ocf_log err "Config $OCF_RESKEY_config doesn't exist"
return $OCF_ERR_INSTALLED
fi
ocf_log warn "Config $OCF_RESKEY_config not available during a probe"
fi
getent passwd $OCF_RESKEY_user >/dev/null 2>&1
rc=$?
if [ $rc -ne 0 ]; then
ocf_log err "User $OCF_RESKEY_user doesn't exist"
return $OCF_ERR_INSTALLED
fi
true
}
dcmanager_audit_status() {
local pid
local rc
if [ ! -f $OCF_RESKEY_pid ]; then
ocf_log info "DC Manager Audit Service (dcmanager-audit) is not running"
return $OCF_NOT_RUNNING
else
pid=`cat $OCF_RESKEY_pid`
fi
ocf_run -warn kill -s 0 $pid
rc=$?
if [ $rc -eq 0 ]; then
return $OCF_SUCCESS
else
ocf_log info "Old PID file found, but DC Manager Audit Service (dcmanager-audit) is not running"
rm -f $OCF_RESKEY_pid
return $OCF_NOT_RUNNING
fi
}
dcmanager_audit_monitor() {
local rc
dcmanager_audit_status
rc=$?
# If status returned anything but success, return that immediately
if [ $rc -ne $OCF_SUCCESS ]; then
return $rc
fi
ocf_log debug "DC Manager Audit Service (dcmanager-audit) monitor succeeded"
return $OCF_SUCCESS
}
dcmanager_audit_start() {
local rc
dcmanager_audit_status
rc=$?
if [ $rc -eq $OCF_SUCCESS ]; then
ocf_log info "DC Manager Audit Service (dcmanager-audit) already running"
return $OCF_SUCCESS
fi
# Change the working dir to /, to be sure it's accessible
cd /
# run the actual dcmanager-audit daemon. Don't use ocf_run as we're sending the tool's output
# straight to /dev/null anyway and using ocf_run would break stdout-redirection here.
su ${OCF_RESKEY_user} -s /bin/sh -c "${OCF_RESKEY_binary} --config-file=$OCF_RESKEY_config \
$OCF_RESKEY_additional_parameters"' >> /dev/null 2>&1 & echo $!' > $OCF_RESKEY_pid
# Spin waiting for the server to come up.
# Let the CRM/LRM time us out if required
while true; do
dcmanager_audit_monitor
rc=$?
[ $rc -eq $OCF_SUCCESS ] && break
if [ $rc -ne $OCF_NOT_RUNNING ]; then
ocf_log err "DC Manager Audit Service (dcmanager-audit) start failed"
exit $OCF_ERR_GENERIC
fi
sleep 1
done
ocf_log info "DC Manager Audit Service (dcmanager-audit) started"
return $OCF_SUCCESS
}
dcmanager_audit_confirm_stop() {
local my_binary
local my_processes
my_binary=`which ${OCF_RESKEY_binary}`
my_processes=`pgrep -l -f "^(python|/usr/bin/python|/usr/bin/python2) ${my_binary}([^\w-]|$)"`
if [ -n "${my_processes}" ]
then
ocf_log info "About to SIGKILL the following: ${my_processes}"
pkill -KILL -f "^(python|/usr/bin/python|/usr/bin/python2) ${my_binary}([^\w-]|$)"
fi
}
dcmanager_audit_stop() {
local rc
local pid
dcmanager_audit_status
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ]; then
ocf_log info "DC Manager Audit Service (dcmanager-audit) already stopped"
dcmanager_audit_confirm_stop
return $OCF_SUCCESS
fi
# Try SIGTERM
pid=`cat $OCF_RESKEY_pid`
ocf_run kill -s TERM $pid
rc=$?
if [ $rc -ne 0 ]; then
ocf_log err "DC Manager Audit Service (dcmanager-audit) couldn't be stopped"
dcmanager_audit_confirm_stop
exit $OCF_ERR_GENERIC
fi
# stop waiting
shutdown_timeout=15
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5))
fi
count=0
while [ $count -lt $shutdown_timeout ]; do
dcmanager_audit_status
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ]; then
break
fi
count=`expr $count + 1`
sleep 1
ocf_log debug "DC Manager Audit Service (dcmanager-audit) still hasn't stopped yet. Waiting ..."
done
dcmanager_audit_status
rc=$?
if [ $rc -ne $OCF_NOT_RUNNING ]; then
# SIGTERM didn't help either, try SIGKILL
ocf_log info "DC Manager Audit Service (dcmanager-audit) failed to stop after ${shutdown_timeout}s \
using SIGTERM. Trying SIGKILL ..."
ocf_run kill -s KILL $pid
fi
dcmanager_audit_confirm_stop
ocf_log info "DC Manager Audit Service (dcmanager-audit) stopped"
rm -f $OCF_RESKEY_pid
return $OCF_SUCCESS
}
#######################################################################
case "$1" in
meta-data) meta_data
exit $OCF_SUCCESS;;
usage|help) usage
exit $OCF_SUCCESS;;
esac
# Anything except meta-data and help must pass validation
dcmanager_audit_validate || exit $?
# What kind of method was invoked?
case "$1" in
start) dcmanager_audit_start;;
stop) dcmanager_audit_stop;;
status) dcmanager_audit_status;;
monitor) dcmanager_audit_monitor;;
validate-all) ;;
*) usage
exit $OCF_ERR_UNIMPLEMENTED;;
esac

View File

@@ -29,6 +29,7 @@ packages =
[entry_points]
console_scripts =
dcmanager-api = dcmanager.cmd.api:main
dcmanager-audit = dcmanager.cmd.audit:main
dcmanager-manager = dcmanager.cmd.manager:main
dcmanager-manage = dcmanager.cmd.manage:main
dcorch-api = dcorch.cmd.api:main
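
The new console_scripts entry makes setuptools generate a dcmanager-audit executable that invokes dcmanager.cmd.audit:main. A plausible minimal shape for that module, following the usual oslo_service launcher pattern (the import path and details are assumptions, not the literal file):

from oslo_config import cfg
from oslo_log import log as logging
from oslo_service import service

# Assumed location of the audit service class for this sketch.
from dcmanager.audit import service as audit_service

CONF = cfg.CONF

def main():
    logging.register_options(CONF)
    CONF(project='dcmanager')
    logging.setup(CONF, 'dcmanager-audit')
    # Launch the audit service and block until it exits.
    launcher = service.launch(CONF, audit_service.DCManagerAuditService())
    launcher.wait()

if __name__ == '__main__':
    main()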