From 2366a4b86e7f0127275dba9e94c4a90dd11578ce Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Tue, 30 May 2023 11:18:32 -0700 Subject: [PATCH] Adds service steps A huge list of initial work for service steps * Adds service_step verb * Adds service_step db/object/API field on the node object for the status. * Increments the API version to 1.87 for both changes. * Increments the RPC API version to 1.57. * Adds initial testing to facilitate ensurance that supplied steps are passed through and executed upon. Does not: * Have tests for starting the agent ramdisk, although this is relatively boiler plate. * Have a collection of pre-decorated steps available for immediate consumption. Change-Id: I5b9dd928f24dff7877a4ab8dc7b743058cace994 --- .../contributor/webapi-version-history.rst | 13 + ironic/api/controllers/v1/node.py | 73 +- ironic/api/controllers/v1/utils.py | 8 +- ironic/api/controllers/v1/versions.py | 6 +- ironic/common/faults.py | 6 +- ironic/common/neutron.py | 9 + ironic/common/release_mappings.py | 6 +- ironic/common/states.py | 74 +- ironic/conductor/manager.py | 78 +- ironic/conductor/rpcapi.py | 27 +- ironic/conductor/servicing.py | 377 ++++++ ironic/conductor/steps.py | 90 +- ironic/conductor/utils.py | 85 +- ironic/conf/conductor.py | 7 + ironic/conf/neutron.py | 19 +- .../aa2384fee727_add_service_steps.py | 35 + ironic/db/sqlalchemy/models.py | 1 + ironic/drivers/base.py | 177 +++ ironic/drivers/modules/agent_base.py | 149 ++- ironic/drivers/modules/deploy_utils.py | 80 ++ ironic/drivers/modules/fake.py | 5 + ironic/drivers/modules/network/flat.py | 25 + ironic/drivers/modules/network/neutron.py | 30 + ironic/objects/node.py | 8 +- .../unit/api/controllers/v1/test_node.py | 78 +- ironic/tests/unit/common/test_neutron.py | 25 + ironic/tests/unit/conductor/test_manager.py | 47 + ironic/tests/unit/conductor/test_rpcapi.py | 8 + ironic/tests/unit/conductor/test_servicing.py | 1184 +++++++++++++++++ ironic/tests/unit/conductor/test_steps.py | 88 +- 
ironic/tests/unit/conductor/test_utils.py | 127 ++ ironic/tests/unit/db/utils.py | 3 +- .../unit/drivers/modules/test_agent_base.py | 205 ++- ironic/tests/unit/objects/test_objects.py | 2 +- .../add-service-steps-deb45c9a0e77a647.yaml | 19 + 35 files changed, 3109 insertions(+), 65 deletions(-) create mode 100644 ironic/conductor/servicing.py create mode 100644 ironic/db/sqlalchemy/alembic/versions/aa2384fee727_add_service_steps.py create mode 100644 ironic/tests/unit/conductor/test_servicing.py create mode 100644 releasenotes/notes/add-service-steps-deb45c9a0e77a647.yaml diff --git a/doc/source/contributor/webapi-version-history.rst b/doc/source/contributor/webapi-version-history.rst index a08b674c2c..efd34d77ee 100644 --- a/doc/source/contributor/webapi-version-history.rst +++ b/doc/source/contributor/webapi-version-history.rst @@ -2,6 +2,19 @@ REST API Version History ======================== +1.87 (Bobcat) +------------- + +Adds the ``service`` provision state verb to allow modifications +via the "steps" interface to occur with a baremetal node. With this +functionality comes a ``service_step`` field on the ``/v1/nodes`` +based resources, which indicates the current step. + +1.86 (Bobcat) +------------- + +Adds a ``firmware_interface`` field to the ``/v1/nodes`` resources. + 1.85 (Bobcat) ------------- diff --git a/ironic/api/controllers/v1/node.py b/ironic/api/controllers/v1/node.py index ea6fcdca1b..a477f5f157 100644 --- a/ironic/api/controllers/v1/node.py +++ b/ironic/api/controllers/v1/node.py @@ -56,9 +56,12 @@ from ironic import objects CONF = ironic.conf.CONF LOG = log.getLogger(__name__) -_CLEAN_STEPS_SCHEMA = { + + +# TODO(TheJulia): We *really* need to just have *one* schema. 
+_STEPS_SCHEMA = { "$schema": "http://json-schema.org/schema#", - "title": "Clean steps schema", + "title": "Steps schema", "type": "array", # list of clean steps "items": { @@ -124,7 +127,8 @@ PROVISION_ACTION_STATES = (ir_states.VERBS['manage'], ir_states.VERBS['provide'], ir_states.VERBS['abort'], ir_states.VERBS['adopt'], - ir_states.VERBS['unhold']) + ir_states.VERBS['unhold'], + ir_states.VERBS['service']) _NODES_CONTROLLER_RESERVED_WORDS = None @@ -950,7 +954,8 @@ class NodeStatesController(rest.RestController): def _do_provision_action(self, rpc_node, target, configdrive=None, clean_steps=None, deploy_steps=None, - rescue_password=None, disable_ramdisk=None): + rescue_password=None, disable_ramdisk=None, + service_steps=None): topic = api.request.rpcapi.get_topic_for(rpc_node) # Note that there is a race condition. The node state(s) could change # by the time the RPC call is made and the TaskManager manager gets a @@ -993,6 +998,17 @@ class NodeStatesController(rest.RestController): api.request.rpcapi.do_node_clean( api.request.context, rpc_node.uuid, clean_steps, disable_ramdisk, topic=topic) + elif target == ir_states.VERBS['service']: + if not service_steps: + msg = (_('"service_steps" is required when setting ' + 'target provision state to ' + '%s') % ir_states.VERBS['service']) + raise exception.ClientSideError( + msg, status_code=http_client.BAD_REQUEST) + _check_service_steps(service_steps) + api.request.rpcapi.do_node_service( + api.request.context, rpc_node.uuid, service_steps, + disable_ramdisk, topic=topic) elif target in PROVISION_ACTION_STATES: api.request.rpcapi.do_provisioning_action( api.request.context, rpc_node.uuid, target, topic) @@ -1008,10 +1024,12 @@ class NodeStatesController(rest.RestController): clean_steps=args.types(type(None), list), deploy_steps=args.types(type(None), list), rescue_password=args.string, - disable_ramdisk=args.boolean) + disable_ramdisk=args.boolean, + service_steps=args.types(type(None), list)) def 
provision(self, node_ident, target, configdrive=None, clean_steps=None, deploy_steps=None, - rescue_password=None, disable_ramdisk=None): + rescue_password=None, disable_ramdisk=None, + service_steps=None): """Asynchronous trigger the provisioning of the node. This will set the target provision state of the node, and a @@ -1069,11 +1087,31 @@ class NodeStatesController(rest.RestController): inside the rescue environment. This is required (and only valid), when target is "rescue". :param disable_ramdisk: Whether to skip booting ramdisk for cleaning. + :param service_steps: A list of service steps that will be performed on + the node. A service step is a dictionary with required keys + 'interface', 'step', 'priority' and 'args'. If specified, the value + for 'args' is a keyword variable argument dictionary that is passed + to the service step method.:: + + { 'interface': , + 'step': , + 'args': {: , ..., : } + 'priority': } + + For example (this isn't a real example, this service step doesn't + exist):: + + { 'interface': 'deploy', + 'step': 'upgrade_firmware', + 'args': {'force': True}, + 'priority': 90 } + :raises: NodeLocked (HTTP 409) if the node is currently locked. :raises: ClientSideError (HTTP 409) if the node is already being provisioned. :raises: InvalidParameterValue (HTTP 400), if validation of - clean_steps, deploy_steps or power driver interface fails. + clean_steps, deploy_steps, service_steps or power driver + interface fails. :raises: InvalidStateRequested (HTTP 400) if the requested transition is not possible from the current state. 
:raises: NodeInMaintenance (HTTP 400), if operation cannot be @@ -1140,9 +1178,13 @@ class NodeStatesController(rest.RestController): if not api_utils.allow_unhold_verb(): raise exception.NotAcceptable() + if target == ir_states.VERBS['service']: + if not api_utils.allow_service_verb(): + raise exception.NotAcceptable() + self._do_provision_action(rpc_node, target, configdrive, clean_steps, deploy_steps, rescue_password, - disable_ramdisk) + disable_ramdisk, service_steps) # Set the HTTP Location Header url_args = '/'.join([node_ident, 'states']) @@ -1156,7 +1198,7 @@ def _check_clean_steps(clean_steps): clean_steps parameter of :func:`NodeStatesController.provision`. :raises: InvalidParameterValue if validation of steps fails. """ - _check_steps(clean_steps, 'clean', _CLEAN_STEPS_SCHEMA) + _check_steps(clean_steps, 'clean', _STEPS_SCHEMA) def _check_deploy_steps(deploy_steps): @@ -1169,6 +1211,16 @@ def _check_deploy_steps(deploy_steps): _check_steps(deploy_steps, 'deploy', _DEPLOY_STEPS_SCHEMA) +def _check_service_steps(service_steps): + """Ensure all necessary keys are present and correct in steps for service + + :param service_steps: a list of steps. For more details, see the + service_steps parameter of :func:`NodeStatesController.provision`. + :raises: InvalidParameterValue if validation of steps fails. + """ + _check_steps(service_steps, 'service', _STEPS_SCHEMA) + + def _check_steps(steps, step_type, schema): """Ensure all necessary keys are present and correct in steps. 
@@ -1429,6 +1481,7 @@ def _get_fields_for_node_query(fields=None): 'retired', 'retired_reason', 'secure_boot', + 'service_step', 'shard', 'storage_interface', 'target_power_state', @@ -2105,7 +2158,7 @@ class NodesController(rest.RestController): 'instance_info', 'driver_internal_info', 'clean_step', 'deploy_step', 'raid_config', 'target_raid_config', - 'traits', 'network_data'] + 'traits', 'network_data', 'service_step'] _subcontroller_map = { 'ports': port.PortsController, diff --git a/ironic/api/controllers/v1/utils.py b/ironic/api/controllers/v1/utils.py index 269cfa3d26..10c631df81 100644 --- a/ironic/api/controllers/v1/utils.py +++ b/ironic/api/controllers/v1/utils.py @@ -808,7 +808,8 @@ VERSIONED_FIELDS = { 'secure_boot': versions.MINOR_75_NODE_BOOT_MODE, 'shard': versions.MINOR_82_NODE_SHARD, 'parent_node': versions.MINOR_83_PARENT_CHILD_NODES, - 'firmware_interface': versions.MINOR_86_FIRMWARE_INTERFACE + 'firmware_interface': versions.MINOR_86_FIRMWARE_INTERFACE, + 'service_step': versions.MINOR_87_SERVICE } for field in V31_FIELDS: @@ -1957,6 +1958,11 @@ def allow_unhold_verb(): return api.request.version.minor >= versions.MINOR_85_UNHOLD_VERB +def allow_service_verb(): + """Check if the service verb may be passed to the API.""" + return api.request.version.minor >= versions.MINOR_87_SERVICE + + def check_allow_deploy_steps(target, deploy_steps): """Check if deploy steps are allowed""" diff --git a/ironic/api/controllers/v1/versions.py b/ironic/api/controllers/v1/versions.py index d0c81efa5b..bcdcd2e421 100644 --- a/ironic/api/controllers/v1/versions.py +++ b/ironic/api/controllers/v1/versions.py @@ -123,6 +123,8 @@ BASE_VERSION = 1 # v1.83: Add child node modeling # v1.84: Add ramdisk callback to continue inspection. 
# v1.85: Add unhold verb +# v1.86: Add firmware interface +# v1.87: Add service verb MINOR_0_JUNO = 0 MINOR_1_INITIAL_VERSION = 1 MINOR_2_AVAILABLE_STATE = 2 @@ -210,6 +212,8 @@ MINOR_83_PARENT_CHILD_NODES = 83 MINOR_84_CONTINUE_INSPECTION = 84 MINOR_85_UNHOLD_VERB = 85 MINOR_86_FIRMWARE_INTERFACE = 86 +MINOR_87_SERVICE = 87 + # When adding another version, update: # - MINOR_MAX_VERSION @@ -217,7 +221,7 @@ MINOR_86_FIRMWARE_INTERFACE = 86 # explanation of what changed in the new version # - common/release_mappings.py, RELEASE_MAPPING['master']['api'] -MINOR_MAX_VERSION = MINOR_86_FIRMWARE_INTERFACE +MINOR_MAX_VERSION = MINOR_87_SERVICE # String representations of the minor and maximum versions _MIN_VERSION_STRING = '{}.{}'.format(BASE_VERSION, MINOR_1_INITIAL_VERSION) diff --git a/ironic/common/faults.py b/ironic/common/faults.py index b794950779..4d068f2ee9 100644 --- a/ironic/common/faults.py +++ b/ironic/common/faults.py @@ -24,4 +24,8 @@ RESCUE_ABORT_FAILURE = 'rescue abort failure' """ Node is moved to maintenance due to failure of cleaning up during rescue abort. """ -VALID_FAULTS = (POWER_FAILURE, CLEAN_FAILURE, RESCUE_ABORT_FAILURE) +SERVICE_FAILURE = 'service failure' +""" Node is moved to maintenance due to failure of a service operation. 
""" + +VALID_FAULTS = (POWER_FAILURE, CLEAN_FAILURE, RESCUE_ABORT_FAILURE, + SERVICE_FAILURE) diff --git a/ironic/common/neutron.py b/ironic/common/neutron.py index df5b5bd7a8..7a5d92dd85 100644 --- a/ironic/common/neutron.py +++ b/ironic/common/neutron.py @@ -989,3 +989,12 @@ class NeutronNetworkInterfaceMixin(object): # Fall back to non-managed in-band inspection raise exception.UnsupportedDriverExtension( driver=task.node.driver, extension='inspection') + + def get_servicing_network_uuid(self, task): + servicing_network = ( + task.node.driver_info.get('servicing_network') + or CONF.neutron.servicing_network + ) + return validate_network( + servicing_network, _('servicing network'), + context=task.context) diff --git a/ironic/common/release_mappings.py b/ironic/common/release_mappings.py index d76bd16f75..4fdc705385 100644 --- a/ironic/common/release_mappings.py +++ b/ironic/common/release_mappings.py @@ -574,12 +574,12 @@ RELEASE_MAPPING = { } }, 'master': { - 'api': '1.86', - 'rpc': '1.56', + 'api': '1.87', + 'rpc': '1.57', 'objects': { 'Allocation': ['1.1'], 'BIOSSetting': ['1.1'], - 'Node': ['1.39', '1.38', '1.37'], + 'Node': ['1.40', '1.39', '1.38', '1.37'], 'NodeHistory': ['1.0'], 'NodeInventory': ['1.0'], 'Conductor': ['1.3'], diff --git a/ironic/common/states.py b/ironic/common/states.py index b3f913da44..868c0d143e 100644 --- a/ironic/common/states.py +++ b/ironic/common/states.py @@ -53,6 +53,7 @@ VERBS = { 'rescue': 'rescue', 'unrescue': 'unrescue', 'unhold': 'unhold', + 'service': 'service', } """ Mapping of state-changing events that are PUT to the REST API @@ -235,11 +236,27 @@ UNRESCUEFAIL = 'unrescue failed' UNRESCUING = 'unrescuing' """ Node is being restored from rescue mode (to active state). """ +SERVICE = 'service' +""" Node is being requested to be modified through a service step. """ + +SERVICING = 'servicing' +""" Node is actively being changed by a service step. 
""" + +SERVICEWAIT = 'service wait' +""" Node is waiting for an operation to complete. """ + +SERVICEFAIL = 'service failed' +""" Node has failed in a service step execution. """ + +SERVICEHOLD = 'service hold' +""" Node is being held for direct intervention from a service step. """ + + # NOTE(kaifeng): INSPECTING is allowed to keep backwards compatibility, # starting from API 1.39 node update is disallowed in this state. UPDATE_ALLOWED_STATES = (DEPLOYFAIL, INSPECTING, INSPECTFAIL, INSPECTWAIT, CLEANFAIL, ERROR, VERIFYING, ADOPTFAIL, RESCUEFAIL, - UNRESCUEFAIL) + UNRESCUEFAIL, SERVICE, SERVICEHOLD, SERVICEFAIL) """Transitional states in which we allow updating a node.""" DELETE_ALLOWED_STATES = (MANAGEABLE, ENROLL, ADOPTFAIL) @@ -250,7 +267,7 @@ STABLE_STATES = (ENROLL, MANAGEABLE, AVAILABLE, ACTIVE, ERROR, RESCUE) UNSTABLE_STATES = (DEPLOYING, DEPLOYWAIT, CLEANING, CLEANWAIT, VERIFYING, DELETING, INSPECTING, INSPECTWAIT, ADOPTING, RESCUING, - RESCUEWAIT, UNRESCUING) + RESCUEWAIT, UNRESCUING, SERVICING, SERVICEWAIT) """States that can be changed without external request.""" STUCK_STATES_TREATED_AS_FAIL = (DEPLOYING, CLEANING, VERIFYING, INSPECTING, @@ -272,12 +289,15 @@ _FASTTRACK_LOOKUP_ALLOWED_STATES = (ENROLL, MANAGEABLE, AVAILABLE, DEPLOYING, DEPLOYWAIT, CLEANING, CLEANWAIT, INSPECTING, INSPECTWAIT, - RESCUING, RESCUEWAIT) + RESCUING, RESCUEWAIT, + SERVICING, SERVICEWAIT, + SERVICEHOLD) FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES) """States where API lookups are permitted with fast track enabled.""" FAILURE_STATES = frozenset((DEPLOYFAIL, CLEANFAIL, INSPECTFAIL, - RESCUEFAIL, UNRESCUEFAIL, ADOPTFAIL)) + RESCUEFAIL, UNRESCUEFAIL, ADOPTFAIL, + SERVICEFAIL)) ############## @@ -594,3 +614,49 @@ machine.add_transition(ADOPTFAIL, ADOPTING, 'adopt') # A node that failed adoption can be moved back to manageable machine.add_transition(ADOPTFAIL, MANAGEABLE, 'manage') + +# Add service* states +machine.add_state(SERVICING, target=ACTIVE, 
**watchers) +machine.add_state(SERVICEWAIT, target=ACTIVE, **watchers) +machine.add_state(SERVICEFAIL, target=ACTIVE, **watchers) +machine.add_state(SERVICEHOLD, target=ACTIVE, **watchers) + +# A node in service can be returned to active +machine.add_transition(SERVICING, ACTIVE, 'done') + +# A node in active can be serviced +machine.add_transition(ACTIVE, SERVICING, 'service') + +# A node in servicing can be failed +machine.add_transition(SERVICING, SERVICEFAIL, 'fail') + +# A node in service can enter a wait state +machine.add_transition(SERVICING, SERVICEWAIT, 'wait') + +# A node in service can be held +machine.add_transition(SERVICING, SERVICEHOLD, 'hold') +machine.add_transition(SERVICEWAIT, SERVICEHOLD, 'hold') + +# A held node in service can get more service steps to start over +machine.add_transition(SERVICEHOLD, SERVICING, 'service') + +# A held node in service can be removed from service +machine.add_transition(SERVICEHOLD, SERVICEWAIT, 'unhold') + +# A node in service wait can resume +machine.add_transition(SERVICEWAIT, SERVICING, 'resume') + +# A node in service wait can fail +machine.add_transition(SERVICEWAIT, SERVICEFAIL, 'fail') + +# A node in service wait can be aborted +machine.add_transition(SERVICEWAIT, SERVICEFAIL, 'abort') + +# A node in service hold can be aborted +machine.add_transition(SERVICEHOLD, SERVICEFAIL, 'abort') + +# A node in service fail can re-enter service +machine.add_transition(SERVICEFAIL, SERVICING, 'service') + +# A node in service fail can be rescued +machine.add_transition(SERVICEFAIL, RESCUING, 'rescue') diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index fc53595908..0807193c4c 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -66,6 +66,7 @@ from ironic.conductor import deployments from ironic.conductor import inspection from ironic.conductor import notification_utils as notify_utils from ironic.conductor import periodics +from ironic.conductor import servicing from 
ironic.conductor import steps as conductor_steps from ironic.conductor import task_manager from ironic.conductor import utils @@ -93,7 +94,7 @@ class ConductorManager(base_manager.BaseConductorManager): # NOTE(rloo): This must be in sync with rpcapi.ConductorAPI's. # NOTE(pas-ha): This also must be in sync with # ironic.common.release_mappings.RELEASE_MAPPING['master'] - RPC_API_VERSION = '1.56' + RPC_API_VERSION = '1.57' target = messaging.Target(version=RPC_API_VERSION) @@ -3710,6 +3711,81 @@ class ConductorManager(base_manager.BaseConductorManager): task, inventory, plugin_data), err_handler=utils.provisioning_error_handler) + @METRICS.timer('ConductorManager.do_node_service') + @messaging.expected_exceptions(exception.InvalidParameterValue, + exception.InvalidStateRequested, + exception.NodeInMaintenance, + exception.NodeLocked, + exception.NoFreeConductorWorker, + exception.ConcurrentActionLimit) + def do_node_service(self, context, node_id, service_steps, + disable_ramdisk=False): + """RPC method to initiate node service. + + :param context: an admin context. + :param node_id: the ID or UUID of a node. + :param service_steps: an ordered list of steps that will be + performed on the node. A step is a dictionary with required + keys 'interface' and 'step', and optional key 'args'. If + specified, the 'args' arguments are passed to the service step + method.:: + + { 'interface': , + 'step': , + 'args': {: , ..., : } } + + For example (this isn't a real example, this service step + doesn't exist):: + + { 'interface': 'deploy', + 'step': 'upgrade_firmware', + 'args': {'force': True} } + :param disable_ramdisk: Optional. Whether to disable the ramdisk boot. + :raises: InvalidParameterValue if power validation fails. + :raises: InvalidStateRequested if the node is not in active state. + :raises: NodeLocked if node is locked by another conductor. + :raises: NoFreeConductorWorker when there is no free worker to start + async task. 
+ :raises: ConcurrentActionLimit If this action would exceed the + configured limits of the deployment. + """ + self._concurrent_action_limit(action='service') + with task_manager.acquire(context, node_id, shared=False, + purpose='node service') as task: + node = task.node + if node.maintenance: + raise exception.NodeInMaintenance(op=_('service'), + node=node.uuid) + # NOTE(TheJulia): service.do_node_service() will also make similar + # calls to validate power & network, but we are doing it again + # here so that the user gets immediate feedback of any issues. + # This behaviour (of validating) is consistent with other methods + # like self.do_node_deploy(). + try: + task.driver.power.validate(task) + task.driver.network.validate(task) + except exception.InvalidParameterValue as e: + msg = (_('Validation of node %(node)s for servicing ' + 'failed: %(msg)s') % + {'node': node.uuid, 'msg': e}) + raise exception.InvalidParameterValue(msg) + try: + task.process_event( + 'service', + callback=self._spawn_worker, + call_args=(servicing.do_node_service, task, service_steps, + disable_ramdisk), + err_handler=utils.provisioning_error_handler, + target_state=states.ACTIVE) + except exception.InvalidState: + raise exception.InvalidStateRequested( + action='service', node=node.uuid, + state=node.provision_state) + + +# NOTE(TheJulia): This is the end of the class definition for the +# conductor manager. Methods for RPC and stuffs should go above this +# point in the File. Everything below is a helper or periodic. 
@METRICS.timer('get_vendor_passthru_metadata') def get_vendor_passthru_metadata(route_dict): diff --git a/ironic/conductor/rpcapi.py b/ironic/conductor/rpcapi.py index 5a8582f888..f268a17aee 100644 --- a/ironic/conductor/rpcapi.py +++ b/ironic/conductor/rpcapi.py @@ -153,12 +153,13 @@ class ConductorAPI(object): heartbeat | 1.55 - Added change_node_boot_mode | 1.56 - Added continue_inspection + | 1.57 - Added do_node_service """ # NOTE(rloo): This must be in sync with manager.ConductorManager's. # NOTE(pas-ha): This also must be in sync with # ironic.common.release_mappings.RELEASE_MAPPING['master'] - RPC_API_VERSION = '1.56' + RPC_API_VERSION = '1.57' def __init__(self, topic=None): super(ConductorAPI, self).__init__() @@ -1409,3 +1410,27 @@ class ConductorAPI(object): cctxt = self._prepare_call(topic=topic, version='1.56') return cctxt.call(context, 'continue_inspection', node_id=node_id, inventory=inventory, plugin_data=plugin_data) + + def do_node_service(self, context, node_id, service_steps, + disable_ramdisk=None, topic=None): + """Signal to conductor service to perform servicing on a node. + + :param context: request context. + :param node_id: node ID or UUID. + :param service_steps: a list of service step dictionaries. + :param disable_ramdisk: Whether to skip booting ramdisk for service. + :param topic: RPC topic. Defaults to self.topic. + :raises: InvalidParameterValue if validation of power driver interface + failed. + :raises: InvalidStateRequested if servicing can not be performed. + :raises: NodeInMaintenance if node is in maintenance mode. + :raises: NodeLocked if node is locked by another conductor. + :raises: NoFreeConductorWorker when there is no free worker to start + async task. 
+ """ + cctxt = self._prepare_call(topic=topic, version='1.57') + return cctxt.call( + context, 'do_node_service', + node_id=node_id, + service_steps=service_steps, + disable_ramdisk=disable_ramdisk) diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py new file mode 100644 index 0000000000..4a14de19d8 --- /dev/null +++ b/ironic/conductor/servicing.py @@ -0,0 +1,377 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Functionality related to servicing.""" + +from oslo_log import log + +from ironic.common import exception +from ironic.common.i18n import _ +from ironic.common import states +from ironic.conductor import steps as conductor_steps +from ironic.conductor import task_manager +from ironic.conductor import utils +from ironic.conf import CONF +from ironic.drivers import utils as driver_utils +from ironic import objects + +LOG = log.getLogger(__name__) + + +@task_manager.require_exclusive_lock +def do_node_service(task, service_steps=None, disable_ramdisk=False): + """Internal RPC method to perform servicing of a node. + + :param task: a TaskManager instance with an exclusive lock on its node + :param service_steps: The list of service steps to perform. If none, step + validation will fail. + :param disable_ramdisk: Whether to skip booting ramdisk for servicing. + """ + node = task.node + try: + # NOTE(ghe): Valid power and network values are needed to perform + # a service operation. 
+ task.driver.power.validate(task) + if not disable_ramdisk: + task.driver.network.validate(task) + except (exception.InvalidParameterValue, exception.NetworkError) as e: + msg = (_('Validation of node %(node)s for service failed: %(msg)s') % + {'node': node.uuid, 'msg': e}) + return utils.servicing_error_handler(task, msg) + + utils.wipe_service_internal_info(task) + node.set_driver_internal_info('service_steps', service_steps) + node.set_driver_internal_info('service_disable_ramdisk', + disable_ramdisk) + task.node.save() + + # Allow the deploy driver to set up the ramdisk again (necessary for IPA) + try: + if not disable_ramdisk: + prepare_result = task.driver.deploy.prepare_service(task) + else: + LOG.info('Skipping preparing for service in-band service since ' + 'out-of-band only service has been requested for node ' + '%s', node.uuid) + prepare_result = None + except Exception as e: + msg = (_('Failed to prepare node %(node)s for service: %(e)s') + % {'node': node.uuid, 'e': e}) + return utils.servicing_error_handler(task, msg, traceback=True) + + if prepare_result == states.SERVICEWAIT: + # Prepare is asynchronous, the deploy driver will need to + # set node.driver_internal_info['service_steps'] and + # node.service_step and then make an RPC call to + # continue_node_service to start service operations. + task.process_event('wait') + return + try: + conductor_steps.set_node_service_steps( + task, disable_ramdisk=disable_ramdisk) + except Exception as e: + # Catch all exceptions and follow the error handling + # path so things are cleaned up properly. 
+ msg = (_('Cannot service node %(node)s: %(msg)s') + % {'node': node.uuid, 'msg': e}) + return utils.servicing_error_handler(task, msg) + + steps = node.driver_internal_info.get('service_steps', []) + if not steps: + _tear_down_node_service(task, disable_ramdisk=disable_ramdisk) + step_index = 0 if steps else None + do_next_service_step(task, step_index, disable_ramdisk=disable_ramdisk) + + +@utils.fail_on_error(utils.servicing_error_handler, + _("Unexpected error when processing next service step"), + traceback=True) +@task_manager.require_exclusive_lock +def do_next_service_step(task, step_index, disable_ramdisk=None): + """Do service, starting from the specified service step. + + :param task: a TaskManager instance with an exclusive lock + :param step_index: The first service step in the list to execute. This + is the index (from 0) into the list of service steps in the node's + driver_internal_info['service_steps']. Is None if there are no steps + to execute. + :param disable_ramdisk: Whether to skip booting ramdisk for service. + """ + node = task.node + # NOTE: Unlike cleaning, servicing always returns the node to the + # ACTIVE provision state once all steps are complete. 
+ if step_index is None: + steps = [] + else: + assert node.driver_internal_info.get('service_steps') is not None, \ + f"BUG: No steps for {node.uuid}, step index is {step_index}" + steps = node.driver_internal_info['service_steps'][step_index:] + + if disable_ramdisk is None: + disable_ramdisk = node.driver_internal_info.get( + 'service_disable_ramdisk', False) + + LOG.info('Executing service on node %(node)s, remaining steps: ' + '%(steps)s', {'node': node.uuid, 'steps': steps}) + # Execute each step until we hit an async step or run out of steps + for ind, step in enumerate(steps): + # Save which step we're about to start so we can restart + # if necessary + node.service_step = step + node.set_driver_internal_info('service_step_index', step_index + ind) + node.save() + eocn = step.get('execute_on_child_nodes', False) + result = None + try: + if not eocn: + LOG.info('Executing %(step)s on node %(node)s', + {'step': step, 'node': node.uuid}) + use_step_handler = conductor_steps.use_reserved_step_handler( + task, step) + if use_step_handler: + if use_step_handler == conductor_steps.EXIT_STEPS: + # Exit the step, i.e. 
hold step + return + # if use_step_handler == conductor_steps.USED_HANDLER + # Then we have completed the needful in the handler, + # but since there is no other value to check now, + # we know we just need to skip execute_deploy_step + else: + interface = getattr(task.driver, step.get('interface')) + result = interface.execute_service_step(task, step) + else: + LOG.info('Executing %(step)s on child nodes for node ' + '%(node)s.', + {'step': step, 'node': node.uuid}) + result = execute_step_on_child_nodes(task, step) + + except Exception as e: + if isinstance(e, exception.AgentConnectionFailed): + if task.node.driver_internal_info.get('service_reboot'): + LOG.info('Agent is not yet running on node %(node)s ' + 'after service reboot, waiting for agent to ' + 'come up to run next service step %(step)s.', + {'node': node.uuid, 'step': step}) + node.set_driver_internal_info('skip_current_service_step', + False) + task.process_event('wait') + return + if isinstance(e, exception.AgentInProgress): + LOG.info('Conductor attempted to process service step for ' + 'node %(node)s. Agent indicated it is presently ' + 'executing a command. Error: %(error)s', + {'node': task.node.uuid, + 'error': e}) + node.set_driver_internal_info( + 'skip_current_service_step', False) + task.process_event('wait') + return + + msg = (_('Node %(node)s failed step %(step)s: ' + '%(exc)s') % + {'node': node.uuid, 'exc': e, + 'step': node.service_step}) + if not disable_ramdisk: + driver_utils.collect_ramdisk_logs(task.node, label='service') + utils.servicing_error_handler(task, msg, traceback=True) + return + + # Check if the step is done or not. The step should return + # states.SERVICEWAIT if the step is still being executed, or + # None if the step is done. 
+ if result == states.SERVICEWAIT: + # Kill this worker, the async step will make an RPC call to + # continue_node_service to continue service + LOG.info('Service step %(step)s on node %(node)s being ' + 'executed asynchronously, waiting for driver.', + {'node': node.uuid, 'step': step}) + task.process_event('wait') + return + elif result is not None: + msg = (_('While executing step %(step)s on node ' + '%(node)s, step returned invalid value: %(val)s') + % {'step': step, 'node': node.uuid, 'val': result}) + return utils.servicing_error_handler(task, msg) + LOG.info('Node %(node)s finished service step %(step)s', + {'node': node.uuid, 'step': step}) + utils.wipe_service_internal_info(task) + if CONF.agent.deploy_logs_collect == 'always' and not disable_ramdisk: + driver_utils.collect_ramdisk_logs(task.node, label='service') + _tear_down_node_service(task, disable_ramdisk) + + +def _tear_down_node_service(task, disable_ramdisk): + """Clean up a node from service. + + :param task: A Taskmanager object. + :returns: None + """ + task.node.service_step = None + utils.wipe_service_internal_info(task) + task.node.save() + if not disable_ramdisk: + try: + task.driver.deploy.tear_down_service(task) + except Exception as e: + msg = (_('Failed to tear down from service for node %(node)s, ' + 'reason: %(err)s') + % {'node': task.node.uuid, 'err': e}) + return utils.servicing_error_handler(task, msg, + traceback=True, + tear_down_service=False) + LOG.info('Node %s service complete.', task.node.uuid) + task.process_event('done') + + +def execute_step_on_child_nodes(task, step): + """Execute a requested step against a child node. + + :param task: The TaskManager object for the parent node. + :param step: The requested step to be executed. + :returns: None on Success, the resulting error message if a + failure has occured. + """ + # NOTE(TheJulia): We could just use nodeinfo list calls against + # dbapi. 
+ # NOTE(TheJulia): We validate the data in advance in the API + # with the original request context. + eocn = step.get('execute_on_child_nodes') + child_nodes = step.get('limit_child_node_execution', []) + filters = {'parent_node': task.node.uuid} + if eocn and len(child_nodes) >= 1: + filters['uuid_in'] = child_nodes + child_nodes = objects.Node.list( + task.context, + filters=filters, + fields=['uuid'] + ) + for child_node in child_nodes: + result = None + LOG.info('Executing step %(step)s on child node %(node)s for parent ' + 'node %(parent_node)s', + {'step': step, + 'node': child_node.uuid, + 'parent_node': task.node.uuid}) + with task_manager.acquire(task.context, + child_node.uuid, + purpose='execute step') as child_task: + interface = getattr(child_task.driver, step.get('interface')) + LOG.info('Executing %(step)s on node %(node)s', + {'step': step, 'node': child_task.node.uuid}) + if not conductor_steps.use_reserved_step_handler(child_task, step): + result = interface.execute_service_step(child_task, step) + if result is not None: + if (result == states.SERVICEWAIT + and CONF.conductor.permit_child_node_step_async_result): + # Operator has chosen to permit this due to some reason + # NOTE(TheJulia): This is where we would likely wire agent + # error handling if we ever implicitly allowed child node + # deploys to take place with the agent from a parent node + # being deployed. + continue + msg = (_('While executing step %(step)s on child node ' + '%(node)s, step returned invalid value: %(val)s') + % {'step': step, 'node': child_task.node.uuid, + 'val': result}) + LOG.error(msg) + # Only None or states.SERVICEWAIT are possible paths forward + # in the parent step execution code, so returning the message + # means it will be logged. 
+    return msg
+
+
+def get_last_error(node):
+    last_error = _('By request, the service operation was aborted')
+    if node.service_step:
+        last_error += (
+            _(' during or after the completion of step "%s"')
+            % conductor_steps.step_id(node.service_step)
+        )
+    return last_error
+
+
+@task_manager.require_exclusive_lock
+def do_node_service_abort(task):
+    """Internal method to abort an ongoing operation.
+
+    :param task: a TaskManager instance with an exclusive lock
+    """
+    node = task.node
+    try:
+        task.driver.deploy.tear_down_service(task)
+    except Exception as e:
+        log_msg = (_('Failed to tear down service for node %(node)s '
+                     'after aborting the operation. Error: %(err)s') %
+                   {'node': node.uuid, 'err': e})
+        error_msg = _('Failed to tear down service after aborting '
+                      'the operation')
+        utils.servicing_error_handler(task, log_msg,
+                                      errmsg=error_msg,
+                                      traceback=True,
+                                      tear_down_service=False,
+                                      set_fail_state=False)
+        return
+
+    last_error = get_last_error(node)
+    info_message = _('Service operation aborted for node %s') % node.uuid
+    if node.service_step:
+        info_message += (
+            _(' during or after the completion of step "%s"')
+            % node.service_step
+        )
+
+    node.last_error = last_error
+    node.service_step = None
+    utils.wipe_service_internal_info(task)
+    node.save()
+    LOG.info(info_message)
+
+
+@utils.fail_on_error(utils.servicing_error_handler,
+                     _("Unexpected error when processing next service step"),
+                     traceback=True)
+@task_manager.require_exclusive_lock
+def continue_node_service(task):
+    """Continue servicing after finishing an async service step.
+
+    This function calculates which step has to run next and passes control
+    into do_next_service_step.
+ + :param task: a TaskManager instance with an exclusive lock + """ + node = task.node + + next_step_index = utils.update_next_step_index(task, 'service') + + # If this isn't the final service step in the service operation + # and it is flagged to abort after the service step that just + # finished, we abort the operation. + if node.service_step.get('abort_after'): + step_name = node.service_step['step'] + if next_step_index is not None: + LOG.debug('The service operation for node %(node)s was ' + 'marked to be aborted after step "%(step)s ' + 'completed. Aborting now that it has completed.', + {'node': task.node.uuid, 'step': step_name}) + + task.process_event('fail') + do_node_service_abort(task) + return + + LOG.debug('The service operation for node %(node)s was ' + 'marked to be aborted after step "%(step)s" ' + 'completed. However, since there are no more ' + 'service steps after this, the abort is not going ' + 'to be done.', {'node': node.uuid, + 'step': step_name}) + + do_next_service_step(task, next_step_index) diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py index bf8043bb19..1c2895d813 100644 --- a/ironic/conductor/steps.py +++ b/ironic/conductor/steps.py @@ -54,6 +54,8 @@ DEPLOYING_INTERFACE_PRIORITY = { 'raid': 1, } +SERVICING_INTERFACE_PRIORITY = DEPLOYING_INTERFACE_PRIORITY.copy() + VERIFYING_INTERFACE_PRIORITY = { # When two verify steps have the same priority, their order is determined # by which interface is implementing the verify step. The verifying step of @@ -127,6 +129,15 @@ def _sorted_steps(steps, sort_step_key): return sorted(steps, key=sort_step_key, reverse=True) +def _service_step_key(step): + """Sort by priority, then interface priority in event of tie. + + :param step: deploy step dict to get priority for. 
+ """ + return (step.get('priority'), + SERVICING_INTERFACE_PRIORITY[step.get('interface')]) + + def is_equivalent(step1, step2): """Compare steps, ignoring their priority.""" return (step1.get('interface') == step2.get('interface') @@ -240,6 +251,26 @@ def _get_deployment_steps(task, enabled=False, sort=True): enabled=enabled, sort_step_key=sort_key) +def _get_service_steps(task, enabled=False, sort=True): + """Get service steps for task.node. + + :param task: A TaskManager object + :param enabled: If True, returns only enabled (priority > 0) steps. If + False, returns all clean steps. + :param sort: If True, the steps are sorted from highest priority to lowest + priority. For steps having the same priority, they are sorted from + highest interface priority to lowest. + :raises: NodeServicingFailure if there was a problem getting the + clean steps. + :returns: A list of clean step dictionaries + """ + sort_key = _service_step_key if sort else None + service_steps = _get_steps(task, SERVICING_INTERFACE_PRIORITY, + 'get_service_steps', enabled=enabled, + sort_step_key=sort_key) + return service_steps + + def _get_verify_steps(task, enabled=False, sort=True): """Get verify steps for task.node. @@ -455,6 +486,34 @@ def set_node_deployment_steps(task, reset_current=True, skip_missing=False): node.save() +def set_node_service_steps(task, disable_ramdisk=False): + """Set up the node with clean step information for cleaning. + + For automated cleaning, get the clean steps from the driver. + For manual cleaning, the user's clean steps are known but need to be + validated against the driver's clean steps. + + :param disable_ramdisk: If `True`, only steps with requires_ramdisk=False + are accepted. + :raises: InvalidParameterValue if there is a problem with the user's + clean steps. + :raises: NodeCleaningFailure if there was a problem getting the + clean steps. 
+ """ + node = task.node + steps = _validate_user_service_steps( + task, node.driver_internal_info.get('service_steps', []), + disable_ramdisk=disable_ramdisk) + LOG.debug('List of the steps for service of node %(node)s: ' + '%(steps)s', {'node': node.uuid, + 'steps': steps}) + + node.service_step = {} + node.set_driver_internal_info('service_steps', steps) + node.set_driver_internal_info('service_step_index', None) + node.save() + + def step_id(step): """Return the 'ID' of a deploy step. @@ -705,7 +764,6 @@ def _validate_user_steps(task, user_steps, driver_steps, step_type, err = error_prefix or '' err += '; '.join(errors) raise exception.InvalidParameterValue(err=err) - return result @@ -769,6 +827,36 @@ def _validate_user_deploy_steps(task, user_steps, error_prefix=None, skip_missing=skip_missing) +def _validate_user_service_steps(task, user_steps, disable_ramdisk=False): + """Validate the user-specified service steps. + + :param task: A TaskManager object + :param user_steps: a list of clean steps. A clean step is a dictionary + with required keys 'interface' and 'step', and optional key 'args':: + + { 'interface': , + 'step': , + 'args': {: , ..., : } } + + For example:: + + { 'interface': 'deploy', + 'step': 'upgrade_firmware', + 'args': {'force': True} } + :param disable_ramdisk: If `True`, only steps with requires_ramdisk=False + are accepted. + :raises: InvalidParameterValue if validation of clean steps fails. + :raises: NodeCleaningFailure if there was a problem getting the + clean steps from the driver. + :return: validated clean steps update with information from the driver + """ + # We call with enabled = False below so we pickup auto-disabled + # steps, since service steps are not automagic like cleaning can be. 
+ driver_steps = _get_service_steps(task, enabled=False, sort=False) + return _validate_user_steps(task, user_steps, driver_steps, 'service', + disable_ramdisk=disable_ramdisk) + + def _get_validated_user_deploy_steps(task, deploy_steps=None, skip_missing=False): """Validate the deploy steps for a node. diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 0f7d20fdb0..1255c8691f 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -574,6 +574,20 @@ def wipe_cleaning_internal_info(task): node.del_driver_internal_info('steps_validated') +def wipe_service_internal_info(task): + """Remove temporary servicing fields from driver_internal_info.""" + wipe_token_and_url(task) + node = task.node + node.set_driver_internal_info('service_steps', None) + node.del_driver_internal_info('agent_cached_service_steps') + node.del_driver_internal_info('service_step_index') + node.del_driver_internal_info('service_reboot') + node.del_driver_internal_info('service_polling') + node.del_driver_internal_info('service_disable_ramdisk') + node.del_driver_internal_info('skip_current_service_step') + node.del_driver_internal_info('steps_validated') + + def deploying_error_handler(task, logmsg, errmsg=None, traceback=False, clean_up=True): """Put a failed node in DEPLOYFAIL. @@ -1209,7 +1223,7 @@ def _get_node_next_steps(task, step_type, skip_current_step=True): :returns: index of the next step; None if there are none to execute. """ - valid_types = set(['clean', 'deploy']) + valid_types = set(['clean', 'deploy', 'service']) if step_type not in valid_types: # NOTE(rloo): No need to i18n this, since this would be a # developer error; it isn't user-facing. @@ -1745,3 +1759,72 @@ def get_token_project_from_request(ctx): except AttributeError: LOG.warning('Attempted to identify requestor project ID value, ' 'however we were unable to do so. 
Possible older API?')
+
+
+def servicing_error_handler(task, logmsg, errmsg=None, traceback=False,
+                            tear_down_service=True, set_fail_state=True,
+                            set_maintenance=None):
+    """Put a failed node in SERVICEFAIL and maintenance (if needed).
+
+    :param task: a TaskManager instance.
+    :param logmsg: Message to be logged.
+    :param errmsg: Message for the user. Optional, if not provided `logmsg` is
+        used.
+    :param traceback: Whether to log a traceback. Defaults to False.
+    :param tear_down_service: Whether to clean up the PXE and DHCP files after
+        service. Default to True.
+    :param set_fail_state: Whether to set node to failed state. Default to
+        True.
+    :param set_maintenance: Whether to set maintenance mode. If None,
+        maintenance mode will be set if and only if a service step is being
+        executed on a node.
+    """
+    if set_maintenance is None:
+        set_maintenance = bool(task.node.service_step)
+
+    errmsg = errmsg or logmsg
+    LOG.error(logmsg, exc_info=traceback)
+    node = task.node
+    if set_maintenance:
+        node.fault = faults.SERVICE_FAILURE
+        node.maintenance = True
+
+    if tear_down_service:
+        try:
+            task.driver.deploy.tear_down_service(task)
+        except Exception as e:
+            msg2 = ('Failed to tear down servicing on node %(uuid)s, '
+                    'reason: %(err)s' % {'err': e, 'uuid': node.uuid})
+            LOG.exception(msg2)
+            errmsg = _('%s. Also failed to tear down servicing.') % errmsg
+
+    if node.provision_state in (
+            states.SERVICING,
+            states.SERVICEWAIT,
+            states.SERVICEFAIL):
+        # Clear service step, msg should already include current step
+        node.service_step = {}
+        # Clear any leftover metadata about servicing
+        node.del_driver_internal_info('service_step_index')
+        node.del_driver_internal_info('servicing_reboot')
+        node.del_driver_internal_info('servicing_polling')
+        node.del_driver_internal_info('skip_current_service_step')
+        # We don't need to keep the old agent URL, or token
+        # as it should change upon the next servicing attempt.
+ wipe_token_and_url(task) + # For manual cleaning, the target provision state is MANAGEABLE, whereas + # for automated cleaning, it is AVAILABLE. + node_history_record(node, event=errmsg, event_type=states.SERVICING, + error=True) + # NOTE(dtantsur): avoid overwriting existing maintenance_reason + if not node.maintenance_reason and set_maintenance: + node.maintenance_reason = errmsg + + if CONF.conductor.poweroff_in_servicefail: + # NOTE(NobodyCam): Power off node in service fail + node_power_action(task, states.POWER_OFF) + + node.save() + + if set_fail_state and node.provision_state != states.SERVICEFAIL: + task.process_event('fail') diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 47e98551b0..dd75fe4132 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -356,6 +356,13 @@ opts = [ 'when using Cleaning to perform ' 'hardware-transformative actions such as ' 'firmware upgrade.')), + cfg.BoolOpt('poweroff_in_servicefail', + default=False, + help=_('If True power off nodes in the ``service failed`` ' + 'state. Default False. Option may be unsafe ' + 'when using service to perform ' + 'hardware-transformative actions such as ' + 'firmware upgrade.')), cfg.BoolOpt('permit_child_node_step_async_result', default=False, mutable=True, diff --git a/ironic/conf/neutron.py b/ironic/conf/neutron.py index 0acfab5aa6..7bf529e9ee 100644 --- a/ironic/conf/neutron.py +++ b/ironic/conf/neutron.py @@ -128,7 +128,24 @@ opts = [ 'different CLID/IAID. Due to non-identical identifiers ' 'multiple addresses must be reserved for the host to ' 'ensure each step of the boot process can successfully ' - 'lease addresses.')) + 'lease addresses.')), + cfg.StrOpt('servicing_network', + mutable=True, + help=_('Neutron network UUID or name for booting the ramdisk ' + 'for service mode. Required for "neutron" ' + 'network interface, if service mode will be used. It ' + 'is not used for the "flat" or "noop" network ' + 'interfaces. 
If a name is provided, it must be unique ' + 'among all networks or service will fail.')), + cfg.ListOpt('servicing_network_security_groups', + default=[], + mutable=True, + help=_('List of Neutron Security Group UUIDs to be applied ' + 'during the node service process. Optional for the ' + '"neutron" network interface and not used for the ' + '"flat" or "noop" network interfaces. If not ' + 'specified, the default security group is used.')), + ] diff --git a/ironic/db/sqlalchemy/alembic/versions/aa2384fee727_add_service_steps.py b/ironic/db/sqlalchemy/alembic/versions/aa2384fee727_add_service_steps.py new file mode 100644 index 0000000000..54597b538d --- /dev/null +++ b/ironic/db/sqlalchemy/alembic/versions/aa2384fee727_add_service_steps.py @@ -0,0 +1,35 @@ +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Add service_steps + +Revision ID: aa2384fee727 +Revises: d163df1bab88 +Create Date: 2023-05-25 11:50:05.285602 + +""" + + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = 'aa2384fee727' +down_revision = 'd163df1bab88' + + +def upgrade(): + op.add_column('nodes', sa.Column('service_step', sa.Text(), + nullable=True)) diff --git a/ironic/db/sqlalchemy/models.py b/ironic/db/sqlalchemy/models.py index 3333164258..5660d7b1dd 100644 --- a/ironic/db/sqlalchemy/models.py +++ b/ironic/db/sqlalchemy/models.py @@ -217,6 +217,7 @@ class NodeBase(Base): secure_boot = Column(Boolean, nullable=True) shard = Column(String(255), nullable=True) parent_node = Column(String(36), nullable=True) + service_step = Column(db_types.JsonEncodedDict) class Node(NodeBase): diff --git a/ironic/drivers/base.py b/ironic/drivers/base.py index 1e0cd49719..f41ff74c5c 100644 --- a/ironic/drivers/base.py +++ b/ironic/drivers/base.py @@ -247,6 +247,7 @@ class BaseInterface(object, metaclass=abc.ABCMeta): instance.clean_steps = [] instance.deploy_steps = [] instance.verify_steps = [] + instance.service_steps = [] for n, method in inspect.getmembers(instance, inspect.ismethod): if getattr(method, '_is_clean_step', False): # Create a CleanStep to represent this method @@ -271,6 +272,15 @@ class BaseInterface(object, metaclass=abc.ABCMeta): 'priority': method._verify_step_priority, 'interface': instance.interface_type} instance.verify_steps.append(step) + if getattr(method, '_is_service_step', False): + step = {'step': method.__name__, + 'priority': method._service_step_priority, + 'abortable': method._service_step_abortable, + 'argsinfo': method._service_step_argsinfo, + 'interface': instance.interface_type, + 'requires_ramdisk': + method._service_step_requires_ramdisk} + instance.service_steps.append(step) if instance.clean_steps: LOG.debug('Found clean steps %(steps)s for interface ' @@ -287,6 +297,11 @@ class BaseInterface(object, metaclass=abc.ABCMeta): '%(interface)s', {'steps': instance.deploy_steps, 'interface': instance.interface_type}) + if instance.service_steps: + LOG.debug('Found service steps %(steps)s for interface ' + '%(interface)s', + {'steps': 
instance.service_steps, + 'interface': instance.interface_type}) return instance @@ -411,6 +426,35 @@ class BaseInterface(object, metaclass=abc.ABCMeta): """ return self._execute_step(task, step) + def get_service_steps(self, task): + """Get a list of service steps for the interface. + + This function will return all service steps (both enabled and disabled) + for the interface, in an unordered list. + + :param task: A TaskManager object, useful for interfaces overriding + this function + :raises NodeServiceFailure: if there is a problem getting the steps + from the driver. For example, when a node (using an agent driver) + has just been enrolled and the agent isn't alive yet to be queried + for the available clean steps. + :returns: A list of clean step dictionaries + """ + return self.service_steps + + def execute_service_step(self, task, step): + """Execute the service step on task.node. + + A verify step must take a single positional argument: a TaskManager + object. It does not take keyword variable arguments. + + :param task: A TaskManager object + :param step: The deploy step dictionary representing the step to + execute + :returns: None if this method has completed synchronously + """ + return self._execute_step(task, step) + class DeployInterface(BaseInterface): """Interface for deploy-related actions.""" @@ -545,6 +589,38 @@ class DeployInterface(BaseInterface): 'the driver %(driver)s does not support heartbeating', {'node': task.node.uuid, 'driver': task.node.driver}) + def tear_down_service(self, task): + """Tear down after servicing is completed. + + Given that servicing is complete, do all cleanup and tear + down necessary to allow the node to be returned to an active + state. + + :param task: A TaskManager instance containing the node to act on. + """ + pass + + def prepare_service(self, task): + """Prepare the node for servicing tasks. 
+ + For example, nodes that use the Ironic Python Agent will need to + boot the ramdisk in order to do in-band service tasks. + + If the function is asynchronous, the driver will need to handle + settings node.driver_internal_info['service_steps'] and + node.service_step, as they would be set in + ironic.conductor.manager._do_node_service, but cannot be set when + this is asynchronous. After, the interface should make an RPC call + to continue_node_servicing to start cleaning. + + :param task: A TaskManager instance containing the node to act on. + :returns: If this function is going to be asynchronous, should return + `states.SERVICEWAIT`. Otherwise, should return `None`. + The interface will need to call _get_cleaning_steps and then RPC + to continue_node_service. + """ + pass + class BootInterface(BaseInterface): """Interface for boot-related actions.""" @@ -1710,6 +1786,28 @@ class NetworkInterface(BaseInterface): """ return task.node.network_data or {} + def add_servicing_network(self, task): + """Add the servicing network to the node. + + :param task: A TaskManager instance. + :returns: a dictionary in the form {port.uuid: neutron_port['id']} + :raises: NetworkError + :raises: InvalidParameterValue, if the network interface configuration + is invalid. + """ + return {} + + def remove_servicing_network(self, task): + """Removes the servicing network from a node. + + :param task: A TaskManager instance. + :raises: NetworkError + :raises: InvalidParameterValue, if the network interface configuration + is invalid. + :raises: MissingParameterValue, if some parameters are missing. + """ + pass + class StorageInterface(BaseInterface, metaclass=abc.ABCMeta): """Base class for storage interfaces.""" @@ -2023,3 +2121,82 @@ def verify_step(priority): return func return decorator + + +def service_step(priority=None, abortable=False, argsinfo=None, + requires_ramdisk=True): + """Decorator for service steps. + + Service steps may be used in performing service upon a node. 
+ + For service, the steps will be executed in a similar fashion + to cleaning, but the steps and order of execution must be + explicitly specified by the user when invoking the servicing API. + + Decorated service steps must take as the only a single positional + argument, a TaskManager object, in addition to a keyword arguments + variable (as described in argsinfo). + + Service steps can be either synchronous or asynchronous. If the step is + synchronous, it should return `None` when finished, and the conductor + will continue on to the next step. While the clean step is executing, the + node will be in `states.SERVICING` provision state. If the step is + asynchronous, the step should return `states.SERVICEWAIT` to the + conductor before it starts the asynchronous work. When the step is + complete, the step should make an RPC call to `continue_node_service` to + move to the next step in servicing. The node will be in + `states.SERVICEWAIT` provision state during the asynchronous work. + + Examples:: + + class MyInterface(base.BaseInterface): + @base.service_step() + def example_service(self, task): + # do some service actions + + @base.service_step(priority=0, abortable=True, argsinfo= + {'size': {'description': 'size of widget (MB)', + 'required': True}}) + def advanced_service(self, task, **kwargs): + # do some advanced magical service + + :param priority: an integer priority, defaults to None which maps to 0. + Priorities are not considered, by default but exists + should this functionality be adopted later on to align + with the steps framework. + :param abortable: Boolean value. Whether the clean step is abortable + or not; defaults to False. + :param argsinfo: a dictionary of keyword arguments where key is the name of + the argument and value is a dictionary as follows:: + + 'description': . Required. This should include + possible values. + 'required': Boolean. Optional; default is False. True if this + argument is required. 
If so, it must be specified in + the service request; false if it is optional. + :param requires_ramdisk: Whether this step requires the ramdisk + to be running. Should be set to False for purely out-of-band steps. + :raises InvalidParameterValue: if any of the arguments are invalid + """ + def decorator(func): + func._is_service_step = True + if isinstance(priority, int): + func._service_step_priority = priority + else: + # Service steps are only invoked by operators in a model + # like manual cleaning, so there is no need to explicitly + # require it on the decorator. + func._service_step_priority = 0 + + if isinstance(abortable, bool): + func._service_step_abortable = abortable + else: + raise exception.InvalidParameterValue( + _('"abortable" must be a Boolean value instead of "%s"') + % abortable) + + _validate_argsinfo(argsinfo) + func._service_step_argsinfo = argsinfo + func._service_step_requires_ramdisk = requires_ramdisk + return func + return decorator diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py index 94f316e380..d5a66ce39a 100644 --- a/ironic/drivers/modules/agent_base.py +++ b/ironic/drivers/modules/agent_base.py @@ -32,6 +32,7 @@ from ironic.common import states from ironic.common import utils from ironic.conductor import cleaning from ironic.conductor import deployments +from ironic.conductor import servicing from ironic.conductor import steps as conductor_steps from ironic.conductor import task_manager from ironic.conductor import utils as manager_utils @@ -85,21 +86,23 @@ VENDOR_PROPERTIES = { __HEARTBEAT_RECORD_ONLY = (states.ENROLL, states.MANAGEABLE, states.AVAILABLE, states.CLEANING, states.DEPLOYING, states.RESCUING, - states.DEPLOYHOLD, states.CLEANHOLD) + states.DEPLOYHOLD, states.CLEANHOLD, + states.SERVICING, states.SERVICEHOLD) _HEARTBEAT_RECORD_ONLY = frozenset(__HEARTBEAT_RECORD_ONLY) _HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT, # These are allowed but don't 
cause any actions since # they're also in HEARTBEAT_RECORD_ONLY. states.DEPLOYING, states.CLEANING, states.RESCUING, - states.DEPLOYHOLD, states.CLEANHOLD) + states.DEPLOYHOLD, states.CLEANHOLD, states.SERVICING, + states.SERVICEWAIT, states.SERVICEHOLD) HEARTBEAT_ALLOWED = frozenset(_HEARTBEAT_ALLOWED) _FASTTRACK_HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT, states.ENROLL, states.MANAGEABLE, states.AVAILABLE, states.DEPLOYING, states.CLEANHOLD, - states.DEPLOYHOLD) + states.DEPLOYHOLD, states.SERVICEHOLD) FASTTRACK_HEARTBEAT_ALLOWED = frozenset(_FASTTRACK_HEARTBEAT_ALLOWED) @@ -164,11 +167,12 @@ def _get_post_step_hook(node, step_type): """Get post clean/deploy step hook for the currently executing step. :param node: a node object - :param step_type: 'clean' or 'deploy' + :param step_type: 'clean' or 'deploy' or 'service' :returns: a method if there is a post clean step hook for this clean step; None otherwise """ - step_obj = node.clean_step if step_type == 'clean' else node.deploy_step + + step_obj = getattr(node, "%s_step" % step_type) interface = step_obj.get('interface') step = step_obj.get('step') try: @@ -178,17 +182,16 @@ def _get_post_step_hook(node, step_type): def _post_step_reboot(task, step_type): - """Reboots a node out of band after a clean/deploy step that requires it. + """Reboots a node out of band after a step that requires it. If an agent step has 'reboot_requested': True, reboots the node when the step is completed. Will put the node in CLEANFAIL/DEPLOYFAIL if the node cannot be rebooted. :param task: a TaskManager instance - :param step_type: 'clean' or 'deploy' + :param step_type: 'clean' or 'deploy' or 'service' """ - current_step = (task.node.clean_step if step_type == 'clean' - else task.node.deploy_step) + current_step = getattr(task.node, '%s_step' % step_type) try: # NOTE(fellypefca): ensure that the baremetal node boots back into # the ramdisk after reboot. 
@@ -205,16 +208,22 @@ def _post_step_reboot(task, step_type): if step_type == 'clean': manager_utils.cleaning_error_handler(task, msg, traceback=traceback) - else: + elif step_type == 'deploy': manager_utils.deploying_error_handler(task, msg, traceback=traceback) + elif step_type == 'service': + manager_utils.servicing_error_handler(task, msg, + traceback=traceback) return # Signify that we've rebooted if step_type == 'clean': task.node.set_driver_internal_info('cleaning_reboot', True) - else: + elif step_type == 'deploy': task.node.set_driver_internal_info('deployment_reboot', True) + elif step_type == 'service': + task.node.set_driver_internal_info('servicing_reboot', True) + if not task.node.driver_internal_info.get( 'agent_secret_token_pregenerated', False): # Wipes out the existing recorded token because the machine will @@ -261,8 +270,7 @@ def _get_completed_command(task, commands, step_type): last_result = last_command.get('command_result') or {} last_step = last_result.get('%s_step' % step_type) - current_step = (task.node.clean_step if step_type == 'clean' - else task.node.deploy_step) + current_step = getattr(task.node, '%s_step' % step_type) if last_command['command_status'] == 'RUNNING': LOG.debug('%(type)s step still running for node %(node)s: %(step)s', {'step': last_step, 'node': task.node.uuid, @@ -410,7 +418,10 @@ def _continue_steps(task, step_type): cleaning.continue_node_clean(task) else: task.process_event('resume') - deployments.continue_node_deploy(task) + if step_type == 'deploy': + deployments.continue_node_deploy(task) + else: + servicing.continue_node_service(task) class HeartbeatMixin(object): @@ -439,11 +450,18 @@ class HeartbeatMixin(object): """ return self.refresh_steps(task, 'clean') - def process_next_step(self, task, step_type): - """Start the next clean/deploy step if the previous one is complete. 
+ def refresh_service_steps(self, task): + """Refresh the node's cached service steps :param task: a TaskManager instance - :param step_type: "clean" or "deploy" + """ + return self.refresh_steps(task, 'service') + + def process_next_step(self, task, step_type): + """Start the next step if the previous one is complete. + + :param task: a TaskManager instance + :param step_type: "clean", "deploy", "service" """ def continue_cleaning(self, task): @@ -453,6 +471,13 @@ class HeartbeatMixin(object): """ return self.process_next_step(task, 'clean') + def continue_servicing(self, task): + """Start the next cleaning step if the previous one is complete. + + :param task: a TaskManager instance + """ + return self.process_next_step(task, 'service') + def heartbeat_allowed(self, node): if utils.fast_track_enabled(node): return node.provision_state in FASTTRACK_HEARTBEAT_ALLOWED @@ -480,6 +505,12 @@ class HeartbeatMixin(object): 'maintenance mode', node.uuid) last_error = _('Rescue aborted as node is in maintenance mode') manager_utils.rescuing_error_handler(task, last_error) + elif (node.provision_state in (states.SERVICING, states.SERVICEWAIT) + and not CONF.conductor.allow_provisioning_in_maintenance): + LOG.error('Aborting service for node %s, as it is in ' + 'maintenance mode', node.uuid) + last_error = _('Service aborted as node is in maintenance mode') + manager_utils.servicing_error_handler(task, last_error) else: LOG.warning('Heartbeat from node %(node)s in ' 'maintenance mode; not taking any action.', @@ -559,6 +590,37 @@ class HeartbeatMixin(object): states.RESCUEWAIT): manager_utils.rescuing_error_handler(task, last_error) + def _heartbeat_service_wait(self, task): + node = task.node + msg = _('Failed checking if service is done') + try: + node.touch_provisioning() + if not node.service_step: + LOG.debug('Node %s just booted to start %s service', + node.uuid) + msg = _('Node failed to start the first service step') + task.process_event('resume') + # First, cache 
the service steps + self.refresh_service_steps(task) + # Then set/verify node servicesteps and start service + conductor_steps.set_node_service_steps(task) + servicing.continue_node_service(task) + else: + msg = _('Node failed to check service progress') + # Check if the driver is polling for completion of a step, + # via the 'cleaning_polling' flag. + polling = node.driver_internal_info.get( + 'service_polling', False) + if not polling: + self.continue_servicing(task) + except Exception as e: + last_error = _('%(msg)s: %(exc)s') % {'msg': msg, 'exc': e} + log_msg = ('Asynchronous exception for node %(node)s: %(err)s' % + {'node': task.node.uuid, 'err': last_error}) + if node.provision_state in (states.SERVICING, states.SERVICEWAIT): + manager_utils.servicing_error_handler(task, log_msg, + errmsg=last_error) + @METRICS.timer('HeartbeatMixin.heartbeat') def heartbeat(self, task, callback_url, agent_version, agent_verify_ca=None, agent_status=None, @@ -616,13 +678,14 @@ class HeartbeatMixin(object): if node.maintenance: return self._heartbeat_in_maintenance(task) - if node.provision_state == states.DEPLOYWAIT: self._heartbeat_deploy_wait(task) elif node.provision_state == states.CLEANWAIT: self._heartbeat_clean_wait(task) elif node.provision_state == states.RESCUEWAIT: self._heartbeat_rescue_wait(task) + elif node.provision_state == states.SERVICEWAIT: + self._heartbeat_service_wait(task) def _finalize_rescue(self, task): """Call ramdisk to prepare rescue mode and verify result. @@ -744,6 +807,35 @@ class AgentBaseMixin(object): deploy_utils.tear_down_inband_cleaning( task, manage_boot=self.should_manage_boot(task)) + @METRICS.timer('AgentBaseMixin.prepare_cleaning') + def prepare_service(self, task): + """Boot into the agent to prepare for cleaning. + + :param task: a TaskManager object containing the node + :raises: NodeCleaningFailure, NetworkError if the previous cleaning + ports cannot be removed or if new cleaning ports cannot be created. 
+ :raises: InvalidParameterValue if cleaning network UUID config option + has an invalid value. + :returns: states.CLEANWAIT to signify an asynchronous prepare + """ + result = deploy_utils.prepare_inband_service( + task, manage_boot=self.should_manage_boot(task)) + if result is None: + # Fast-track, ensure the steps are available. + self.refresh_steps(task, 'service') + return result + + @METRICS.timer('AgentBaseMixin.tear_down_service') + def tear_down_service(self, task): + """Clean up the PXE and DHCP files after cleaning. + + :param task: a TaskManager object containing the node + :raises: NodeServiceFailure, NetworkError if the cleaning ports cannot + be removed + """ + deploy_utils.tear_down_inband_service( + task, manage_boot=self.should_manage_boot(task)) + @METRICS.timer('AgentBaseMixin.get_clean_steps') def get_clean_steps(self, task): """Get the list of clean steps from the agent. @@ -785,7 +877,6 @@ class AgentBaseMixin(object): 'Previously cached steps: %(steps)s', {'node': node.uuid, 'type': step_type, 'steps': previous_steps}) - client = agent_client.get_client(task) call = getattr(client, 'get_%s_steps' % step_type) try: @@ -936,32 +1027,34 @@ class AgentBaseMixin(object): set to True, this method will coordinate the reboot once the step is completed. """ - assert step_type in ('clean', 'deploy') + assert step_type in ('clean', 'deploy', 'service') node = task.node client = agent_client.get_client(task) agent_commands = client.get_commands_status(task.node) - if _freshly_booted(agent_commands, step_type): - field = ('cleaning_reboot' if step_type == 'clean' - else 'deployment_reboot') + if step_type == 'clean': + field = 'cleaning_reboot' + elif step_type == 'service': + field = 'servicing_reboot' + else: + # TODO(TheJulia): One day we should standardize the field + # names here, but we also need to balance human ability + # to understand what is going on so *shrug*. 
def prepare_inband_service(task, manage_boot=True):
    """Boot a service ramdisk on the node.

    :param task: a TaskManager instance.
    :param manage_boot: If True, prepare the agent ramdisk boot via the
        node's boot interface; if False, skip in-band boot preparation
        (booting is managed externally).
    :raises: NetworkError if the tenant ports cannot be removed.
    :raises: InvalidParameterValue when the wrong power state is specified
        or the wrong driver info is specified for power management.
    :raises: other exceptions by the node's power driver if something
        wrong occurred during the power action.
    :raises: any boot interface's prepare_ramdisk exceptions.
    :returns: Returns states.SERVICEWAIT
    """
    # NOTE(review): this is a module-level helper called as
    # prepare_inband_service(task, manage_boot=...); the original
    # signature (self, task) made every call raise TypeError.
    manager_utils.node_power_action(task, states.POWER_OFF)
    # NOTE(TheJulia): Revealing that the power is off at any time can
    # cause external power sync to decide that the node must be off.
    # This may result in a serviced instance being turned off
    # unexpectedly after service has started.
    # TODO(TheJulia): Once we have power/state callbacks to nova,
    # the reset of the power_state can be removed.
    task.node.power_state = states.POWER_ON
    task.node.save()

    task.driver.boot.clean_up_instance(task)
    with manager_utils.power_state_for_network_configuration(task):
        task.driver.network.unconfigure_tenant_networks(task)
        # Network interfaces implement add_servicing_network(); the
        # previously-called add_service_network() does not exist.
        task.driver.network.add_servicing_network(task)
    if manage_boot and CONF.agent.manage_agent_boot:
        # prepare_ramdisk will set the boot device
        prepare_agent_boot(task)
    manager_utils.node_power_action(task, states.POWER_ON)

    return states.SERVICEWAIT
def tear_down_inband_service(task, manage_boot=True):
    """Tears down the environment setup for in-band service.

    This method does the following:
    1. Powers off the bare metal node (unless there was a service
       failure).
    2. If 'manage_boot' is True, calls the 'clean_up_ramdisk' method of
       the boot interface to clean up the environment that was set up for
       booting the agent ramdisk.
    3. Removes the servicing network ports, restores the tenant network,
       and prepares the instance boot again.

    :param task: a TaskManager object containing the node
    :param manage_boot: If this is set to True, this method calls the
        'clean_up_ramdisk' method of boot interface to clean up the agent
        ramdisk. If False, it skips this step.
    :raises: NetworkError, NodeServiceFailure if the service ports cannot
        be removed.
    """
    node = task.node
    service_failure = (node.fault == faults.SERVICE_FAILURE)

    # Leave a failed node powered on so it can be inspected.
    if not service_failure:
        manager_utils.node_power_action(task, states.POWER_OFF)

    if manage_boot:
        task.driver.boot.clean_up_ramdisk(task)

    # NOTE(review): the original body removed the (nonexistent)
    # 'service_network' twice and restored power twice; deduplicated to a
    # single removal of the servicing network inside the power context.
    power_state_to_restore = manager_utils.power_on_node_if_needed(task)
    with manager_utils.power_state_for_network_configuration(task):
        task.driver.network.remove_servicing_network(task)
        task.driver.network.configure_tenant_networks(task)
    task.driver.boot.prepare_instance(task)
    manager_utils.restore_power_state_if_needed(
        task, power_state_to_restore)

    # Signal completion through the task's state machine instead of
    # returning a state.
    task.process_event('done')
FlatNetwork(common.NeutronVIFPortIDMixin, """ return self._remove_service_network( task, self.get_inspection_network_uuid(task), 'inspection') + + def add_servicing_network(self, task): + """Add the rescuing network to a node. + + Flat network does not use the servicing network. + Bind the port again since unconfigure_tenant_network() unbound it. + + :param task: A TaskManager instance. + :returns: a dictionary in the form {port.uuid: neutron_port['id']} + :raises: NetworkError, InvalidParameterValue + """ + LOG.info('Bind ports for servicing node %s', task.node.uuid) + self._bind_flat_ports(task) + + def remove_servicing_network(self, task): + """Remove the servicing network from a node. + + Flat network does not use the servicing network. + Unbind the port again since add_rescuing_network() bound it. + + :param task: A TaskManager instance. + :raises: NetworkError + """ + LOG.info('Unbind ports for servicing node %s', task.node.uuid) + self._unbind_flat_ports(task) diff --git a/ironic/drivers/modules/network/neutron.py b/ironic/drivers/modules/network/neutron.py index 2693b603e2..2503ea84b6 100644 --- a/ironic/drivers/modules/network/neutron.py +++ b/ironic/drivers/modules/network/neutron.py @@ -265,3 +265,33 @@ class NeutronNetwork(common.NeutronVIFPortIDMixin, """ return self._remove_network( task, self.get_inspection_network_uuid(task), 'inspection') + + def validate_servicing(self, task): + """Validates the network interface for servicing operation. + + :param task: a TaskManager instance. + :raises: InvalidParameterValue, if the network interface configuration + is invalid. + :raises: MissingParameterValue, if some parameters are missing. + """ + self.get_servicing_network_uuid(task) + + def add_servicing_network(self, task): + """Create neutron ports for each port to boot the servicing ramdisk. + + :param task: a TaskManager instance. 
+ :returns: a dictionary in the form {port.uuid: neutron_port['id']} + """ + return self._add_network( + task, self.get_servicing_network_uuid(task), + CONF.neutron.servicing_network_security_groups, + 'servicing') + + def remove_servicing_network(self, task): + """Deletes neutron port created for booting the servicing ramdisk. + + :param task: a TaskManager instance. + :raises: NetworkError + """ + return self._remove_network( + task, self.get_servicing_network_uuid(task), 'servicing') diff --git a/ironic/objects/node.py b/ironic/objects/node.py index f86b9f78a5..b00cc6b3c9 100644 --- a/ironic/objects/node.py +++ b/ironic/objects/node.py @@ -81,7 +81,8 @@ class Node(base.IronicObject, object_base.VersionedObjectDictCompat): # Version 1.37: Add shard field # Version 1.38: Add parent_node field # Version 1.39: Add firmware_interface field - VERSION = '1.39' + # Version 1.40: Add service_step field + VERSION = '1.40' dbapi = db_api.get_instance() @@ -107,6 +108,11 @@ class Node(base.IronicObject, object_base.VersionedObjectDictCompat): # or has not yet started. 'deploy_step': object_fields.FlexibleDictField(nullable=True), + # A service step dictionary, indicating the current step + # being executed, or None, indicating deployment is not in progress + # or has not yet started. 
+ 'service_step': object_fields.FlexibleDictField(nullable=True), + 'raid_config': object_fields.FlexibleDictField(nullable=True), 'target_raid_config': object_fields.FlexibleDictField(nullable=True), diff --git a/ironic/tests/unit/api/controllers/v1/test_node.py b/ironic/tests/unit/api/controllers/v1/test_node.py index ce2c069300..e08ba6f5fa 100644 --- a/ironic/tests/unit/api/controllers/v1/test_node.py +++ b/ironic/tests/unit/api/controllers/v1/test_node.py @@ -145,6 +145,7 @@ class TestListNodes(test_api_base.BaseApiTest): self.assertNotIn('retired_reason', data['nodes'][0]) self.assertNotIn('lessee', data['nodes'][0]) self.assertNotIn('network_data', data['nodes'][0]) + self.assertNotIn('service_steps', data['nodes'][0]) @mock.patch.object(policy, 'check', autospec=True) @mock.patch.object(policy, 'check_policy', autospec=True) @@ -223,6 +224,7 @@ class TestListNodes(test_api_base.BaseApiTest): self.assertIn('lessee', data) self.assertNotIn('allocation_id', data) self.assertIn('allocation_uuid', data) + self.assertIn('service_step', data) def test_get_one_configdrive_dict(self): fake_instance_info = { @@ -6489,6 +6491,54 @@ ORHMKeXMO8fcK0By7CiMKwHSXCoEQgfQhWwpMdSsO8LgHCjh87DQc= """ self.assertEqual(http_client.NOT_ACCEPTABLE, ret.status_code) mock_dpa.assert_not_called() + @mock.patch.object(rpcapi.ConductorAPI, 'do_provisioning_action', + autospec=True) + def test_unhold_servicehold(self, mock_dpa): + self.node.provision_state = states.SERVICEHOLD + self.node.save() + + ret = self.put_json('/nodes/%s/states/provision' % self.node.uuid, + {'target': states.VERBS['unhold']}, + headers={api_base.Version.string: "1.86"}) + self.assertEqual(http_client.ACCEPTED, ret.status_code) + self.assertEqual(b'', ret.body) + mock_dpa.assert_called_once_with(mock.ANY, mock.ANY, self.node.uuid, + states.VERBS['unhold'], + 'test-topic') + + @mock.patch.object(rpcapi.ConductorAPI, 'do_node_service', + autospec=True) + def test_service(self, mock_dns): + self.node.provision_state 
= states.SERVICEHOLD + self.node.save() + + ret = self.put_json('/nodes/%s/states/provision' % self.node.uuid, + {'target': states.VERBS['service'], + 'service_steps': [{ + 'interface': 'deploy', + 'step': 'meow'}]}, + headers={api_base.Version.string: "1.87"}) + self.assertEqual(http_client.ACCEPTED, ret.status_code) + self.assertEqual(b'', ret.body) + mock_dns.assert_called_once_with( + mock.ANY, mock.ANY, self.node.uuid, + [{'interface': 'deploy', 'step': 'meow'}], + None, topic='test-topic') + + @mock.patch.object(rpcapi.ConductorAPI, 'do_node_service', + autospec=True) + def test_service_args_required(self, mock_dns): + self.node.provision_state = states.SERVICEHOLD + self.node.save() + + ret = self.put_json('/nodes/%s/states/provision' % self.node.uuid, + {'target': states.VERBS['service']}, + headers={api_base.Version.string: "1.87"}, + expect_errors=True) + self.assertEqual(http_client.BAD_REQUEST, ret.status_code) + self.assertIn('error_message', ret.json) + mock_dns.assert_not_called() + def test_set_console_mode_enabled(self): with mock.patch.object(rpcapi.ConductorAPI, 'set_console_mode', @@ -6975,11 +7025,28 @@ class TestCheckCleanSteps(db_base.DbTestCase): step1 = {"step": "upgrade_firmware", "interface": "deploy", "args": {"arg1": "value1", "arg2": "value2"}} + # NOTE(TheJulia): _check_service_steps and _check_deploy_steps + # both route back to _check_steps which is what backs _check + # clean steps. It is needful duplication for cases, but it doesn't + # make a ton of sense to copy/paste everything over and over unless + # there is a specific case. In any case, do the needful here. 
class TestNeutronNetworkInterfaceMixin(db_base.DbTestCase):

    def setUp(self):
        super(TestNeutronNetworkInterfaceMixin, self).setUp()
        self.node = object_utils.create_test_node(self.context)

    def test_get_network_names_and_uuids(self):
        """A test to validate configured overrides work."""
        # Each get_<name>_network_uuid helper must honor a
        # '<name>_network' override in driver_info and validate it.
        for name in ['cleaning', 'provisioning', 'rescuing', 'inspection',
                     'servicing']:
            method_name = 'get_{}_network_uuid'.format(name)
            method_to_call = getattr(neutron.NeutronNetworkInterfaceMixin,
                                     method_name)
            network_uuid = uuidutils.generate_uuid()
            self.node.driver_info = {'%s_network' % name: network_uuid}
            self.node.save()
            with mock.patch.object(neutron, 'validate_network',
                                   autospec=True) as mock_validate:
                with task_manager.acquire(self.context,
                                          self.node.uuid) as task:
                    method_to_call(self, task)
                    mock_validate.assert_called_once_with(
                        network_uuid, mock.ANY, context=task.context)
self.assertRaises(messaging.rpc.ExpectedException, + self.service.do_node_service, + self.context, node.uuid, {'foo': 'bar'}) + # Compare true exception hidden by @messaging.expected_exceptions + self.assertEqual(exception.NodeInMaintenance, exc.exc_info[0]) + self.assertFalse(mock_validate.called) + + @mock.patch.object(task_manager.TaskManager, 'process_event', + autospec=True) + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakePower.validate', + autospec=True) + def test_do_node_service(self, mock_pv, mock_nv, mock_event): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.ACTIVE, + target_provision_state=states.NOSTATE) + self._start_service() + self.service.do_node_service(self.context, + node.uuid, {'foo': 'bar'}) + self.assertTrue(mock_pv.called) + self.assertTrue(mock_nv.called) + mock_event.assert_called_once_with( + mock.ANY, + 'service', + callback=mock.ANY, + call_args=(servicing.do_node_service, mock.ANY, + {'foo': 'bar'}, False), + err_handler=mock.ANY, target_state='active') diff --git a/ironic/tests/unit/conductor/test_rpcapi.py b/ironic/tests/unit/conductor/test_rpcapi.py index 56f67b2c2e..5791920b47 100644 --- a/ironic/tests/unit/conductor/test_rpcapi.py +++ b/ironic/tests/unit/conductor/test_rpcapi.py @@ -722,6 +722,14 @@ class RPCAPITestCase(db_base.DbTestCase): allocation='fake-allocation', version='1.48') + def test_do_node_service(self): + self._test_rpcapi('do_node_service', + 'call', + node_id='fake-node', + service_steps={'foo': 'bar'}, + disable_ramdisk=False, + version='1.57') + @mock.patch.object(rpc, 'GLOBAL_MANAGER', spec_set=conductor_manager.ConductorManager) def test_local_call(self, mock_manager): diff --git a/ironic/tests/unit/conductor/test_servicing.py b/ironic/tests/unit/conductor/test_servicing.py new file mode 100644 index 0000000000..2ec10cfb6d --- /dev/null +++ 
b/ironic/tests/unit/conductor/test_servicing.py @@ -0,0 +1,1184 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Tests for service bits.""" + +from unittest import mock + +from oslo_config import cfg +from oslo_utils import uuidutils + +from ironic.common import exception +from ironic.common import faults +from ironic.common import states +from ironic.conductor import servicing +from ironic.conductor import steps as conductor_steps +from ironic.conductor import task_manager +from ironic.conductor import utils as conductor_utils +from ironic.drivers.modules import fake +from ironic.drivers.modules.network import flat as n_flat +from ironic.tests.unit.db import base as db_base +from ironic.tests.unit.objects import utils as obj_utils + +CONF = cfg.CONF + +# NOTE(TheJulia): This file is based upon test_cleaning.py with logic +# for automated cleaning out and switched over for the service steps +# framework. It *largely* exists to ensure we have similar consistency +# between the frameworks, similar was done for deploy steps in the past. 
+ + +class DoNodeServiceTestCase(db_base.DbTestCase): + def setUp(self): + super(DoNodeServiceTestCase, self).setUp() + self.power_update = { + 'step': 'update_firmware', 'priority': 10, 'interface': 'power'} + self.deploy_update = { + 'step': 'update_firmware', 'priority': 10, 'interface': 'deploy'} + self.deploy_magic = { + 'step': 'magic_firmware', 'priority': 10, 'interface': 'deploy'} + self.next_service_step_index = 1 + self.deploy_raid = { + 'step': 'build_raid', 'priority': 0, 'interface': 'deploy'} + self.service_steps = [self.deploy_update, + self.power_update, + self.deploy_magic] + + def __do_node_service_validate_fail(self, mock_validate, + service_steps=None): + tgt_prov_state = states.ACTIVE + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state) + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_node_service(task, service_steps=service_steps) + node.refresh() + self.assertEqual(states.SERVICEFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertFalse(node.maintenance) + self.assertIsNone(node.fault) + mock_validate.assert_called_once_with(mock.ANY, mock.ANY) + + def __do_node_service_validate_fail_invalid(self, mock_validate, + service_steps=None): + # InvalidParameterValue should cause node to go to SERVICEFAIL + mock_validate.side_effect = exception.InvalidParameterValue('error') + self.__do_node_service_validate_fail(mock_validate, + service_steps=service_steps) + + @mock.patch('ironic.drivers.modules.fake.FakePower.validate', + autospec=True) + def test__do_node_service_automated_power_validate_fail(self, + mock_validate): + self.__do_node_service_validate_fail_invalid(mock_validate) + + @mock.patch('ironic.drivers.modules.fake.FakePower.validate', + autospec=True) + def test__do_node_service_manual_power_validate_fail(self, mock_validate): + 
self.__do_node_service_validate_fail_invalid(mock_validate, + service_steps=[]) + + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + def test__do_node_service_automated_network_validate_fail(self, + mock_validate): + self.__do_node_service_validate_fail_invalid(mock_validate) + + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + def test__do_node_service_manual_network_validate_fail(self, + mock_validate): + self.__do_node_service_validate_fail_invalid(mock_validate, + service_steps=[]) + + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + def test__do_node_service_network_error_fail(self, mock_validate): + # NetworkError should cause node to go to CLEANFAIL + mock_validate.side_effect = exception.NetworkError() + self.__do_node_service_validate_fail(mock_validate) + + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', + autospec=True) + def test__do_node_service_prepare_service_fail(self, mock_prep, + mock_validate, + service_steps=None): + # Exception from task.driver.deploy.prepare_cleaning should cause node + # to go to SERVICEFAIL + mock_prep.side_effect = exception.InvalidParameterValue('error') + tgt_prov_state = states.ACTIVE + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state) + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_node_service(task, service_steps=service_steps) + node.refresh() + self.assertEqual(states.SERVICEFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + mock_prep.assert_called_once_with(mock.ANY, task) + mock_validate.assert_called_once_with(mock.ANY, task) + self.assertFalse(node.maintenance) + 
self.assertIsNone(node.fault) + + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', + autospec=True) + def test__do_node_service_prepare_service_wait(self, mock_prep, + mock_validate): + service_steps = [ + {'step': 'trigger_servicewait', 'priority': 10, + 'interface': 'vendor'} + ] + + mock_prep.return_value = states.SERVICEWAIT + tgt_prov_state = states.ACTIVE + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + vendor_interface='fake') + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_node_service(task, service_steps=service_steps) + node.refresh() + self.assertEqual(states.SERVICEWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + mock_prep.assert_called_once_with(mock.ANY, mock.ANY) + mock_validate.assert_called_once_with(mock.ANY, mock.ANY) + + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', + autospec=True) + def test__do_node_service_prepare_service_active(self, mock_prep, + mock_validate): + service_steps = [ + {'step': 'log_passthrough', 'priority': 10, 'interface': 'vendor'} + ] + + mock_prep.return_value = states.SERVICEWAIT + tgt_prov_state = states.ACTIVE + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + vendor_interface='fake') + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_node_service(task, service_steps=service_steps, + disable_ramdisk=True) + # Validate we went back to active, and did not trigger a ramdisk. 
+ node.refresh() + self.assertEqual(states.ACTIVE, node.provision_state) + self.assertEqual(states.NOSTATE, node.target_provision_state) + mock_prep.assert_not_called() + mock_validate.assert_not_called() + + @mock.patch.object(n_flat.FlatNetwork, 'validate', autospec=True) + @mock.patch.object(conductor_steps, 'set_node_service_steps', + autospec=True) + def __do_node_service_steps_fail(self, mock_steps, mock_validate, + service_steps=None, invalid_exc=True): + if invalid_exc: + mock_steps.side_effect = exception.InvalidParameterValue('invalid') + else: + mock_steps.side_effect = exception.NodeCleaningFailure('failure') + tgt_prov_state = states.ACTIVE + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + uuid=uuidutils.generate_uuid(), + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state) + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_node_service(task, service_steps=service_steps) + mock_validate.assert_called_once_with(mock.ANY, task) + node.refresh() + self.assertEqual(states.SERVICEFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False) + self.assertFalse(node.maintenance) + self.assertIsNone(node.fault) + + @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state', + autospec=True) + @mock.patch.object(n_flat.FlatNetwork, 'validate', autospec=True) + @mock.patch.object(conductor_steps, 'set_node_service_steps', + autospec=True) + def test_do_node_service_steps_fail_poweroff(self, mock_steps, + mock_validate, + mock_power, + service_steps=None, + invalid_exc=True): + if invalid_exc: + mock_steps.side_effect = exception.InvalidParameterValue('invalid') + else: + mock_steps.side_effect = exception.NodeCleaningFailure('failure') + tgt_prov_state = states.ACTIVE + self.config(poweroff_in_cleanfail=True, group='conductor') + node = 
obj_utils.create_test_node( + self.context, driver='fake-hardware', + uuid=uuidutils.generate_uuid(), + provision_state=states.SERVICING, + power_state=states.POWER_ON, + target_provision_state=tgt_prov_state) + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_node_service(task, service_steps=service_steps) + mock_validate.assert_called_once_with(mock.ANY, task) + node.refresh() + self.assertEqual(states.SERVICEFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + mock_steps.assert_called_once_with(mock.ANY, disable_ramdisk=False) + self.assertFalse(mock_power.called) + + def test__do_node_service_steps_fail(self): + for invalid in (True, False): + self.__do_node_service_steps_fail(service_steps=[self.deploy_raid], + invalid_exc=invalid) + + @mock.patch.object(conductor_steps, 'set_node_service_steps', + autospec=True) + @mock.patch.object(servicing, 'do_next_service_step', autospec=True) + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakePower.validate', + autospec=True) + def __do_node_service(self, mock_power_valid, mock_network_valid, + mock_next_step, mock_steps, service_steps=None, + disable_ramdisk=False): + tgt_prov_state = states.ACTIVE + + if not service_steps: + service_steps = self.service_steps + + def set_steps(task, disable_ramdisk=None): + dii = task.node.driver_internal_info + dii['service_steps'] = service_steps + task.node.driver_internal_info = dii + task.node.save() + + mock_steps.side_effect = set_steps + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + power_state=states.POWER_OFF, + driver_internal_info={'agent_secret_token': 'old'}) + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_node_service(task, 
service_steps=service_steps, + disable_ramdisk=disable_ramdisk) + + node.refresh() + + mock_power_valid.assert_called_once_with(mock.ANY, task) + if disable_ramdisk: + mock_network_valid.assert_not_called() + else: + mock_network_valid.assert_called_once_with(mock.ANY, task) + + mock_next_step.assert_called_once_with( + task, 0, disable_ramdisk=disable_ramdisk) + mock_steps.assert_called_once_with( + task, disable_ramdisk=disable_ramdisk) + if service_steps: + self.assertEqual(service_steps, + node.driver_internal_info['service_steps']) + self.assertFalse(node.maintenance) + self.assertNotIn('agent_secret_token', node.driver_internal_info) + + # Check that state didn't change + self.assertEqual(states.SERVICING, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + + def test__do_node_service(self): + self.__do_node_service() + + def test__do_node_service_disable_ramdisk(self): + self.__do_node_service(service_steps=[self.deploy_raid], + disable_ramdisk=True) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + def _do_next_service_step_first_step_async(self, return_state, + mock_execute, + service_steps=None): + # Execute the first async clean step on a node + driver_internal_info = {'service_step_index': None} + tgt_prov_state = states.ACTIVE + if service_steps: + driver_internal_info['service_steps'] = service_steps + else: + driver_internal_info['service_steps'] = self.service_steps + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info=driver_internal_info, + clean_step={}) + mock_execute.return_value = return_state + expected_first_step = node.driver_internal_info['service_steps'][0] + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, 0) + + node.refresh() + 
self.assertEqual(states.SERVICEWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual(expected_first_step, node.service_step) + self.assertEqual(0, node.driver_internal_info['service_step_index']) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, expected_first_step) + + def test_do_next_service_step_automated_first_step_async(self): + self._do_next_service_step_first_step_async(states.SERVICEWAIT) + + def test_do_next_service_step_manual_first_step_async(self): + self._do_next_service_step_first_step_async( + states.SERVICEWAIT, service_steps=[self.deploy_raid]) + + @mock.patch('ironic.drivers.modules.fake.FakePower.execute_service_step', + autospec=True) + def _do_next_clean_step_continue_from_last_cleaning(self, return_state, + mock_execute, + manual=False): + # Resume an in-progress servicing after the first async step + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'service_steps': self.service_steps, + 'service_step_index': 0}, + service_step=self.service_steps[0]) + mock_execute.return_value = return_state + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, self.next_service_step_index) + + node.refresh() + + self.assertEqual(states.SERVICEWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual(self.service_steps[1], node.service_step) + self.assertEqual(1, node.driver_internal_info['service_step_index']) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.service_steps[1]) + + def test_do_next_clean_step_continue_from_last_cleaning(self): + self._do_next_clean_step_continue_from_last_cleaning( + states.SERVICEWAIT) + + def 
test_do_next_clean_step_manual_continue_from_last_cleaning(self): + self._do_next_clean_step_continue_from_last_cleaning( + states.SERVICEWAIT, manual=True) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + def _do_next_service_step_last_step_noop(self, mock_execute, + manual=False): + # Resume where last_step is the last service step, should be noop + tgt_prov_state = states.ACTIVE + info = {'service_steps': self.service_steps, + 'service_step_index': len(self.service_steps) - 1, + 'agent_url': 'test-url', + 'agent_secret_token': 'token'} + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info=info, + service_step=self.service_steps[-1]) + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, None) + + node.refresh() + + # Servicing should be complete without calling additional steps + self.assertEqual(tgt_prov_state, node.provision_state) + self.assertEqual(states.NOSTATE, node.target_provision_state) + self.assertEqual({}, node.clean_step) + self.assertNotIn('service_step_index', node.driver_internal_info) + self.assertIsNone(node.driver_internal_info['service_steps']) + self.assertFalse(mock_execute.called) + self.assertNotIn('agent_url', node.driver_internal_info) + self.assertNotIn('agent_secret_token', + node.driver_internal_info) + + def test__do_next_service_step_automated_last_step_noop(self): + self._do_next_service_step_last_step_noop() + + def test__do_next_service_step_manual_last_step_noop(self): + self._do_next_service_step_last_step_noop(manual=True) + + @mock.patch('ironic.drivers.utils.collect_ramdisk_logs', autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.tear_down_service', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakePower.execute_service_step', + 
autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + def _do_next_service_step_all(self, mock_deploy_execute, + mock_power_execute, mock_tear_down, + mock_collect_logs, + disable_ramdisk=False): + # Run all steps from start to finish (all synchronous) + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'service_steps': self.service_steps, + 'service_step_index': None}, + clean_step={}) + + def fake_deploy(conductor_obj, task, step): + driver_internal_info = task.node.driver_internal_info + driver_internal_info['goober'] = 'test' + task.node.driver_internal_info = driver_internal_info + task.node.save() + + mock_deploy_execute.side_effect = fake_deploy + mock_power_execute.return_value = None + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step( + task, 0, disable_ramdisk=disable_ramdisk) + + mock_power_execute.assert_called_once_with(task.driver.power, task, + self.service_steps[1]) + mock_deploy_execute.assert_has_calls( + [mock.call(task.driver.deploy, task, self.service_steps[0]), + mock.call(task.driver.deploy, task, self.service_steps[2])]) + if disable_ramdisk: + mock_tear_down.assert_not_called() + else: + mock_tear_down.assert_called_once_with( + task.driver.deploy, task) + + node.refresh() + + # Servicing should be complete + self.assertEqual(tgt_prov_state, node.provision_state) + self.assertEqual(states.NOSTATE, node.target_provision_state) + self.assertEqual({}, node.service_step) + self.assertNotIn('service_step_index', node.driver_internal_info) + self.assertEqual('test', node.driver_internal_info['goober']) + self.assertIsNone(node.driver_internal_info['service_steps']) + self.assertFalse(mock_collect_logs.called) + + def test_do_next_clean_step_all(self): 
+ self._do_next_service_step_all() + + def test_do_next_clean_step_all_disable_ramdisk(self): + self._do_next_service_step_all(disable_ramdisk=True) + + @mock.patch('ironic.drivers.utils.collect_ramdisk_logs', autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakePower.execute_service_step', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + def test_do_next_clean_step_collect_logs(self, mock_deploy_execute, + mock_power_execute, + mock_collect_logs): + CONF.set_override('deploy_logs_collect', 'always', group='agent') + # Run all steps from start to finish (all synchronous) + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'service_steps': self.service_steps, + 'service_step_index': None}, + clean_step={}) + + def fake_deploy(conductor_obj, task, step): + driver_internal_info = task.node.driver_internal_info + driver_internal_info['goober'] = 'test' + task.node.driver_internal_info = driver_internal_info + task.node.save() + + mock_deploy_execute.side_effect = fake_deploy + mock_power_execute.return_value = None + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, 0) + + node.refresh() + + # Cleaning should be complete + self.assertEqual(tgt_prov_state, node.provision_state) + self.assertEqual(states.NOSTATE, node.target_provision_state) + self.assertEqual({}, node.clean_step) + self.assertNotIn('service_step_index', node.driver_internal_info) + self.assertEqual('test', node.driver_internal_info['goober']) + self.assertIsNone(node.driver_internal_info['service_steps']) + mock_power_execute.assert_called_once_with(mock.ANY, mock.ANY, + self.service_steps[1]) + mock_deploy_execute.assert_has_calls( + [mock.call(mock.ANY, mock.ANY, self.service_steps[0]), 
+ mock.call(mock.ANY, mock.ANY, self.service_steps[2])]) + mock_collect_logs.assert_called_once_with(mock.ANY, label='service') + + @mock.patch('ironic.drivers.utils.collect_ramdisk_logs', autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + @mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def _do_next_service_step_execute_fail(self, tear_mock, mock_execute, + mock_collect_logs): + # When a service step fails, go to SERVICEFAIL + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'service_steps': self.service_steps, + 'service_step_index': None}, + clean_step={}) + mock_execute.side_effect = Exception() + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, 0) + tear_mock.assert_called_once_with(task.driver.deploy, task) + + node.refresh() + + # Make sure we go to SERVICEFAIL, clear service_steps + self.assertEqual(states.SERVICEFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual({}, node.service_step) + self.assertNotIn('service_step_index', node.driver_internal_info) + self.assertIsNotNone(node.last_error) + self.assertTrue(node.maintenance) + self.assertEqual(faults.SERVICE_FAILURE, node.fault) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.service_steps[0]) + mock_collect_logs.assert_called_once_with(mock.ANY, label='service') + + def test__do_next_clean_step_automated_execute_fail(self): + self._do_next_service_step_execute_fail() + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + def test_do_next_service_step_oob_reboot(self, mock_execute): + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + 
self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'service_steps': self.service_steps, + 'service_step_index': None, + 'service_reboot': True}, + service_step={}) + mock_execute.side_effect = exception.AgentConnectionFailed( + reason='failed') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, 0) + + node.refresh() + + # Make sure we go to SERVICEWAIT + self.assertEqual(states.SERVICEWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual(self.service_steps[0], node.service_step) + self.assertEqual(0, node.driver_internal_info['service_step_index']) + self.assertFalse( + node.driver_internal_info['skip_current_service_step']) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.service_steps[0]) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + def test_do_next_service_step_agent_busy(self, mock_execute): + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'service_steps': self.service_steps, + 'service_step_index': None, + 'service_reboot': True}, + service_step={}) + mock_execute.side_effect = exception.AgentInProgress( + reason='still meowing') + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, 0) + + node.refresh() + # Make sure we go to SERVICEWAIT + self.assertEqual(states.SERVICEWAIT, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual(self.service_steps[0], node.service_step) + self.assertEqual(0, node.driver_internal_info['service_step_index']) + self.assertFalse( + 
node.driver_internal_info['skip_current_service_step']) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.service_steps[0]) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + def test_do_next_service_step_oob_reboot_last_step(self, mock_execute): + # Resume where last_step is the last service step + tgt_prov_state = states.ACTIVE + info = {'service_steps': self.service_steps, + 'service_reboot': True, + 'service_step_index': len(self.service_steps) - 1} + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info=info, + service_step=self.service_steps[-1]) + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, None) + + node.refresh() + + # Servicing should be complete without calling additional steps + self.assertEqual(tgt_prov_state, node.provision_state) + self.assertEqual(states.NOSTATE, node.target_provision_state) + self.assertEqual({}, node.service_step) + self.assertNotIn('service_step_index', node.driver_internal_info) + self.assertNotIn('service_reboot', node.driver_internal_info) + self.assertIsNone(node.driver_internal_info['service_steps']) + self.assertFalse(mock_execute.called) + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + @mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def test_do_next_service_step_oob_reboot_fail(self, tear_mock, + mock_execute): + # When a service step fails with no reboot requested go to SERVICEFAIL + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'service_steps': self.service_steps, + 'service_step_index': 
None}, + service_step={}) + mock_execute.side_effect = exception.AgentConnectionFailed( + reason='failed') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, 0) + tear_mock.assert_called_once_with(task.driver.deploy, task) + + node.refresh() + + # Make sure we go to SERVICEFAIL, clear service_steps + self.assertEqual(states.SERVICEFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual({}, node.service_step) + self.assertNotIn('service_step_index', node.driver_internal_info) + self.assertNotIn('skip_current_service_step', + node.driver_internal_info) + self.assertIsNotNone(node.last_error) + self.assertTrue(node.maintenance) + mock_execute.assert_called_once_with( + mock.ANY, mock.ANY, self.service_steps[0]) + + @mock.patch.object(conductor_utils, 'LOG', autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakePower.execute_service_step', + autospec=True) + @mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def _do_next_service_step_fail_in_tear_down_service( + self, tear_mock, power_exec_mock, deploy_exec_mock, log_mock): + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'service_steps': self.service_steps, + 'service_step_index': None}, + service_step={}) + + deploy_exec_mock.return_value = None + power_exec_mock.return_value = None + tear_mock.side_effect = Exception('boom') + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, 0) + + node.refresh() + + # Make sure we go to SERVICEFAIL, clear service_steps + self.assertEqual(states.SERVICEFAIL, node.provision_state) + 
self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual({}, node.service_step) + self.assertNotIn('clean_step_index', node.driver_internal_info) + self.assertIsNotNone(node.last_error) + self.assertEqual(1, tear_mock.call_count) + self.assertFalse(node.maintenance) # no step is running + deploy_exec_calls = [ + mock.call(mock.ANY, mock.ANY, self.service_steps[0]), + mock.call(mock.ANY, mock.ANY, self.service_steps[2]), + ] + self.assertEqual(deploy_exec_calls, deploy_exec_mock.call_args_list) + + power_exec_calls = [ + mock.call(mock.ANY, mock.ANY, self.service_steps[1]), + ] + self.assertEqual(power_exec_calls, power_exec_mock.call_args_list) + log_mock.error.assert_called_once_with( + 'Failed to tear down from service for node {}, reason: boom' + .format(node.uuid), exc_info=True) + + def test__do_next_service_step_automated_fail_in_tear_down_service(self): + self._do_next_service_step_fail_in_tear_down_service() + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + def _do_next_service_step_no_steps(self, mock_execute, manual=False): + for info in ({'service_steps': None, 'service_step_index': None, + 'agent_url': 'test-url', 'agent_secret_token': 'magic'}, + {'service_steps': None, 'agent_url': 'test-url', + 'agent_secret_token': 'it_is_a_kind_of_magic'}): + # Resume where there are no steps, should be a noop + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + uuid=uuidutils.generate_uuid(), + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info=info, + service_step={}) + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, None) + + node.refresh() + + # Cleaning should be complete without calling additional steps + self.assertEqual(tgt_prov_state, node.provision_state) + self.assertEqual(states.NOSTATE, 
node.target_provision_state) + self.assertEqual({}, node.clean_step) + self.assertNotIn('service_step_index', node.driver_internal_info) + self.assertFalse(mock_execute.called) + self.assertNotIn('agent_url', node.driver_internal_info) + self.assertNotIn('agent_secret_token', + node.driver_internal_info) + mock_execute.reset_mock() + + def test__do_next_service_step_automated_no_steps(self): + self._do_next_service_step_no_steps() + + @mock.patch('ironic.drivers.modules.fake.FakePower.execute_service_step', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step', + autospec=True) + def test__do_next_service_step_bad_step_return_value( + self, deploy_exec_mock, power_exec_mock, manual=False): + # When a service step fails, go to SERVICEFAIL + tgt_prov_state = states.ACTIVE + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=tgt_prov_state, + last_error=None, + driver_internal_info={'service_steps': self.service_steps, + 'service_step_index': None}, + service_step={}) + deploy_exec_mock.return_value = "foo" + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, 0) + + node.refresh() + + # Make sure we go to SERVICEFAIL, clear service_steps + self.assertEqual(states.SERVICEFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertEqual({}, node.service_step) + self.assertNotIn('service_step_index', node.driver_internal_info) + self.assertIsNotNone(node.last_error) + self.assertTrue(node.maintenance) # the 1st service step was running + deploy_exec_mock.assert_called_once_with(mock.ANY, mock.ANY, + self.service_steps[0]) + # Make sure we don't execute any other step and return + self.assertFalse(power_exec_mock.called) + + def _test_do_next_service_step_handles_hold(self, start_state): + node = obj_utils.create_test_node( + self.context, 
driver='fake-hardware', + provision_state=start_state, + driver_internal_info={ + 'service_steps': [ + { + 'step': 'hold', + 'priority': 10, + 'interface': 'power' + } + ], + 'service_step_index': None}, + service_step=None) + + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + servicing.do_next_service_step(task, 0) + node.refresh() + self.assertEqual(states.SERVICEHOLD, node.provision_state) + + def test_do_next_service_step_handles_hold_from_active(self): + # Start is from the conductor + self._test_do_next_service_step_handles_hold(states.SERVICING) + + def test_do_next_service_step_handles_hold_from_wait(self): + # Start is the continuation from a heartbeat. + self._test_do_next_service_step_handles_hold(states.SERVICEWAIT) + + @mock.patch.object(servicing, 'do_next_service_step', autospec=True) + def _continue_node_service(self, mock_next_step, skip=True): + # test that skipping current step mechanism works + driver_info = {'service_steps': self.service_steps, + 'service_step_index': 0, + 'servicing_polling': 'value'} + if not skip: + driver_info['skip_current_service_step'] = skip + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICING, + target_provision_state=states.ACTIVE, + driver_internal_info=driver_info, + service_step=self.service_steps[0]) + with task_manager.acquire(self.context, node.uuid) as task: + servicing.continue_node_service(task) + expected_step_index = 1 if skip else 0 + self.assertNotIn( + 'skip_current_service_step', task.node.driver_internal_info) + self.assertNotIn( + 'cleaning_polling', task.node.driver_internal_info) + mock_next_step.assert_called_once_with(task, expected_step_index) + + def test_continue_node_service(self): + self._continue_node_service(skip=True) + + def test_continue_node_service_no_skip_step(self): + self._continue_node_service(skip=False) + + +class DoNodeServiceAbortTestCase(db_base.DbTestCase): + 
@mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def _test_do_node_service_abort(self, service_step, + tear_mock=None): + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEWAIT, + target_provision_state=states.AVAILABLE, + service_step=service_step, + driver_internal_info={ + 'agent_url': 'some url', + 'agent_secret_token': 'token', + 'service_step_index': 2, + 'service_reboot': True, + 'service_polling': True, + 'skip_current_service_step': True}) + + with task_manager.acquire(self.context, node.uuid) as task: + servicing.do_node_service_abort(task) + self.assertIsNotNone(task.node.last_error) + tear_mock.assert_called_once_with(task.driver.deploy, task) + task.node.refresh() + if service_step: + self.assertIn(service_step['step'], task.node.last_error) + # assert node's clean_step and metadata was cleaned up + self.assertEqual({}, task.node.service_step) + self.assertNotIn('service_step_index', + task.node.driver_internal_info) + self.assertNotIn('service_reboot', + task.node.driver_internal_info) + self.assertNotIn('service_polling', + task.node.driver_internal_info) + self.assertNotIn('skip_current_service_step', + task.node.driver_internal_info) + self.assertNotIn('agent_url', + task.node.driver_internal_info) + self.assertNotIn('agent_secret_token', + task.node.driver_internal_info) + + def test_do_node_service_abort_early(self): + self._test_do_node_service_abort(None) + + def test_do_node_service_abort_with_step(self): + self._test_do_node_service_abort({'step': 'foo', 'interface': 'deploy', + 'abortable': True}) + + @mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def test__do_node_service_abort_tear_down_fail(self, tear_mock): + tear_mock.side_effect = Exception('Surprise') + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEFAIL, + target_provision_state=states.ACTIVE, + 
service_step={'step': 'foo', 'abortable': True}) + + with task_manager.acquire(self.context, node.uuid) as task: + servicing.do_node_service_abort(task) + tear_mock.assert_called_once_with(task.driver.deploy, task) + self.assertIsNotNone(task.node.last_error) + self.assertIsNotNone(task.node.maintenance_reason) + self.assertTrue(task.node.maintenance) + self.assertEqual('service failure', task.node.fault) + + @mock.patch.object(fake.FakeDeploy, 'tear_down_service', autospec=True) + def test__do_node_cleanhold_abort_tear_down_fail(self, tear_mock): + tear_mock.side_effect = Exception('Surprise') + + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.SERVICEHOLD, + target_provision_state=states.ACTIVE, + service_step={'step': 'hold', 'abortable': True}) + + with task_manager.acquire(self.context, node.uuid) as task: + servicing.do_node_service_abort(task) + tear_mock.assert_called_once_with(task.driver.deploy, task) + self.assertIsNotNone(task.node.last_error) + self.assertIsNotNone(task.node.maintenance_reason) + self.assertTrue(task.node.maintenance) + self.assertEqual('service failure', task.node.fault) + + +class DoNodeCleanTestChildNodes(db_base.DbTestCase): + def setUp(self): + super(DoNodeCleanTestChildNodes, self).setUp() + self.power_off_parent = { + 'step': 'power_off', 'priority': 4, 'interface': 'power'} + self.power_on_children = { + 'step': 'power_on', 'priority': 5, 'interface': 'power', + 'execute_on_child_nodes': True} + self.update_firmware_on_children = { + 'step': 'update_firmware', 'priority': 10, + 'interface': 'management', 'execute_on_child_nodes': True} + self.reboot_children = { + 'step': 'reboot', 'priority': 5, 'interface': 'power', + 'execute_on_child_nodes': True} + self.power_on_parent = { + 'step': 'power_on', 'priority': 15, 'interface': 'power'} + self.service_steps = [ + self.power_off_parent, + self.power_on_children, + self.update_firmware_on_children, + self.reboot_children, + 
self.power_on_parent]
+        self.node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            provision_state=states.SERVICING,
+            target_provision_state=states.ACTIVE,
+            last_error=None,
+            power_state=states.POWER_ON,
+            driver_internal_info={'agent_secret_token': 'old',
+                                  'service_steps': self.service_steps})
+
+    @mock.patch('ironic.drivers.modules.fake.FakePower.reboot',
+                autospec=True)
+    @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state',
+                autospec=True)
+    @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
+                autospec=True)
+    @mock.patch('ironic.drivers.modules.fake.FakePower.validate',
+                autospec=True)
+    @mock.patch('ironic.drivers.modules.fake.FakePower.execute_service_step',
+                autospec=True)
+    @mock.patch('ironic.drivers.modules.fake.FakeManagement.'
+                'execute_service_step', autospec=True)
+    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step',
+                autospec=True)
+    def test_do_next_clean_step_with_children(
+            self, mock_deploy, mock_mgmt, mock_power, mock_pv, mock_nv,
+            mock_sps, mock_reboot):
+        child_node1 = obj_utils.create_test_node(
+            self.context,
+            uuid=uuidutils.generate_uuid(),
+            driver='fake-hardware',
+            last_error=None,
+            power_state=states.POWER_OFF,
+            parent_node=self.node.uuid)
+        child_node2 = obj_utils.create_test_node(
+            self.context,
+            uuid=uuidutils.generate_uuid(),
+            driver='fake-hardware',
+            last_error=None,
+            power_state=states.POWER_OFF,
+            parent_node=self.node.uuid)
+
+        mock_deploy.return_value = None
+        mock_mgmt.return_value = None
+        mock_power.return_value = None
+        child1_updated_at = str(child_node1.updated_at)
+        child2_updated_at = str(child_node2.updated_at)
+        with task_manager.acquire(
+            self.context, self.node.uuid, shared=False) as task:
+
+            servicing.do_next_service_step(task, 0,
+                                           disable_ramdisk=True)
+        self.node.refresh()
+        child_node1.refresh()
+        child_node2.refresh()
+
+        # Confirm the objects *did* receive locks.
+ self.assertNotEqual(child1_updated_at, child_node1.updated_at) + self.assertNotEqual(child2_updated_at, child_node2.updated_at) + + # Confirm the child nodes have no errors + self.assertFalse(child_node1.maintenance) + self.assertFalse(child_node2.maintenance) + self.assertIsNone(child_node1.last_error) + self.assertIsNone(child_node2.last_error) + self.assertIsNone(self.node.last_error) + + # Confirm the call counts expected + self.assertEqual(0, mock_deploy.call_count) + self.assertEqual(2, mock_mgmt.call_count) + self.assertEqual(0, mock_power.call_count) + self.assertEqual(0, mock_nv.call_count) + self.assertEqual(0, mock_pv.call_count) + self.assertEqual(4, mock_sps.call_count) + self.assertEqual(2, mock_reboot.call_count) + mock_sps.assert_has_calls([ + mock.call(mock.ANY, mock.ANY, 'power off', timeout=None), + mock.call(mock.ANY, mock.ANY, 'power on', timeout=None), + mock.call(mock.ANY, mock.ANY, 'power on', timeout=None)]) + + @mock.patch('ironic.drivers.modules.fake.FakePower.set_power_state', + autospec=True) + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakePower.validate', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakePower.execute_service_step', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeManagement.' 
'execute_service_step', autospec=True)
+    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_service_step',
+                autospec=True)
+    def test_do_next_clean_step_with_children_by_uuid(
+            self, mock_deploy, mock_mgmt, mock_power, mock_pv, mock_nv,
+            mock_sps):
+        child_node1 = obj_utils.create_test_node(
+            self.context,
+            uuid=uuidutils.generate_uuid(),
+            driver='fake-hardware',
+            last_error=None,
+            parent_node=self.node.uuid)
+        child_node2 = obj_utils.create_test_node(
+            self.context,
+            uuid=uuidutils.generate_uuid(),
+            driver='fake-hardware',
+            last_error=None,
+            parent_node=self.node.uuid)
+        power_on_children = {
+            'step': 'power_on', 'priority': 5, 'interface': 'power',
+            'execute_on_child_nodes': True,
+            'limit_child_node_execution': [child_node1.uuid]}
+        update_firmware_on_children = {
+            'step': 'update_firmware', 'priority': 10,
+            'interface': 'management',
+            'execute_on_child_nodes': True,
+            'limit_child_node_execution': [child_node1.uuid]}
+        power_on_parent = {
+            'step': 'not_power', 'priority': 15, 'interface': 'power'}
+        service_steps = [power_on_children, update_firmware_on_children,
+                         power_on_parent]
+        dii = self.node.driver_internal_info
+        dii['service_steps'] = service_steps
+        self.node.driver_internal_info = dii
+        self.node.save()
+
+        mock_deploy.return_value = None
+        mock_mgmt.return_value = None
+        mock_power.return_value = None
+        child1_updated_at = str(child_node1.updated_at)
+
+        with task_manager.acquire(
+            self.context, self.node.uuid, shared=False) as task:
+
+            servicing.do_next_service_step(task, 0,
+                                           disable_ramdisk=True)
+        self.node.refresh()
+        child_node1.refresh()
+        child_node2.refresh()
+
+        # Confirm the objects *did* receive locks.
+ self.assertNotEqual(child1_updated_at, child_node1.updated_at) + self.assertIsNone(child_node2.updated_at) + + # Confirm the child nodes have no errors + self.assertFalse(child_node1.maintenance) + self.assertFalse(child_node2.maintenance) + self.assertIsNone(child_node1.last_error) + self.assertIsNone(child_node2.last_error) + self.assertIsNone(self.node.last_error) + + # Confirm the call counts expected + self.assertEqual(0, mock_deploy.call_count) + self.assertEqual(1, mock_mgmt.call_count) + self.assertEqual(1, mock_power.call_count) + self.assertEqual(0, mock_nv.call_count) + self.assertEqual(0, mock_pv.call_count) + mock_sps.assert_has_calls([ + mock.call(mock.ANY, mock.ANY, 'power on', timeout=None)]) diff --git a/ironic/tests/unit/conductor/test_steps.py b/ironic/tests/unit/conductor/test_steps.py index 058a508f5b..09d267af0e 100644 --- a/ironic/tests/unit/conductor/test_steps.py +++ b/ironic/tests/unit/conductor/test_steps.py @@ -1059,8 +1059,8 @@ class GetValidatedStepsFromTemplatesTestCase(db_base.DbTestCase): mock_templates): mock_templates.return_value = [self.template] mock_validate.side_effect = exception.InstanceDeployFailure('foo') - with task_manager.acquire( - self.context, self.node.uuid, shared=False) as task: + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: self.assertRaises( exception.InstanceDeployFailure, conductor_steps._get_validated_steps_from_templates, task) @@ -1399,3 +1399,87 @@ class ReservedStepHandlerByNameTestCase(db_base.DbTestCase): def test_reserved_step_wait_time(self): self._test_reserved_step({'step': 'wait', 'args': {'seconds': 1}}) + + +class NodeServiceStepsTestCase(db_base.DbTestCase): + def setUp(self): + super(NodeServiceStepsTestCase, self).setUp() + + self.deploy_start = { + 'step': 'deploy_start', 'priority': 50, 'interface': 'deploy'} + self.power_one = { + 'step': 'power_one', 'priority': 40, 'interface': 'power'} + self.deploy_middle = { + 'step': 'deploy_middle', 'priority': 
40, 'interface': 'deploy'} + self.deploy_end = { + 'step': 'deploy_end', 'priority': 20, 'interface': 'deploy'} + self.power_disable = { + 'step': 'power_disable', 'priority': 0, 'interface': 'power'} + self.deploy_core = { + 'step': 'deploy', 'priority': 100, 'interface': 'deploy'} + # enabled steps + self.service_steps = [self.deploy_start, self.power_one, + self.deploy_middle, self.deploy_end] + # Deploy step with argsinfo. + self.deploy_raid = { + 'step': 'build_raid', 'priority': 0, 'interface': 'deploy', + 'argsinfo': {'arg1': {'description': 'desc1', 'required': True}, + 'arg2': {'description': 'desc2'}}} + self.node = obj_utils.create_test_node( + self.context, driver='fake-hardware') + + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.get_service_steps', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakePower.get_service_steps', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeManagement.get_service_steps', + autospec=True) + def test__get_service_steps(self, mock_mgt_steps, mock_power_steps, + mock_deploy_steps): + # Test getting deploy steps, with one driver returning None, two + # conflicting priorities, and asserting they are ordered properly. + + mock_power_steps.return_value = [self.power_disable, self.power_one] + mock_deploy_steps.return_value = [ + self.deploy_start, self.deploy_middle, self.deploy_end] + # These next steps are actually present on the FakeVendorB interface, + # and instead of just mock everything, we're actually exercising the + # rest of the way down including the decorator to get here. 
+ fake_log_passthrough = { + 'abortable': False, 'argsinfo': None, 'interface': 'vendor', + 'priority': 0, 'requires_ramdisk': False, + 'step': 'log_passthrough' + } + fake_trigger_servicewait = { + 'abortable': False, 'argsinfo': None, 'interface': 'vendor', + 'priority': 0, 'requires_ramdisk': True, + 'step': 'trigger_servicewait' + } + + expected = self.service_steps + [fake_log_passthrough, + fake_trigger_servicewait, + self.power_disable] + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + steps = conductor_steps._get_service_steps(task, enabled=False) + + self.assertEqual(expected, steps) + mock_mgt_steps.assert_called_once_with(mock.ANY, task) + mock_power_steps.assert_called_once_with(mock.ANY, task) + mock_deploy_steps.assert_called_once_with(mock.ANY, task) + + @mock.patch.object(conductor_steps, '_validate_user_service_steps', + autospec=True) + def test_set_node_service_steps(self, mock_steps): + mock_steps.return_value = self.service_steps + + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + conductor_steps.set_node_service_steps(task) + self.node.refresh() + self.assertEqual(self.service_steps, + self.node.driver_internal_info['service_steps']) + self.assertEqual({}, self.node.service_step) + self.assertIsNone( + self.node.driver_internal_info['service_step_index']) + mock_steps.assert_called_once_with(task, [], disable_ramdisk=False) diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 27c4bfa860..e99784bd11 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -1490,6 +1490,90 @@ class ErrorHandlersTestCase(db_base.DbTestCase): log_mock.assert_has_calls(log_calls) self.node.save.assert_called_once_with() + @mock.patch.object(conductor_utils.LOG, 'error', autospec=True) + def _test_servicing_error_handler(self, mock_log_error, + prov_state=states.SERVICING): + 
self.node.provision_state = prov_state + target = 'baz' + self.node.target_provision_state = target + self.node.service_step = {'key': 'val'} + self.node.set_driver_internal_info('service_reboot', True) + self.node.set_driver_internal_info('service_polling', True) + self.node.set_driver_internal_info('skip_current_service_step', True) + self.node.set_driver_internal_info('service_step_index', 0) + self.node.set_driver_internal_info('agent_url', 'url') + self.node.set_driver_internal_info('agent_secret_token', 'foo') + self.node.set_driver_internal_info('agent_secret_token_pregenerated', + False) + + msg = 'error bar' + last_error = "last error" + conductor_utils.servicing_error_handler(self.task, msg, + errmsg=last_error) + self.node.save.assert_called_once_with() + self.assertEqual({}, self.node.service_step) + self.assertNotIn('service_step_index', self.node.driver_internal_info) + self.assertNotIn('service_reboot', self.node.driver_internal_info) + self.assertNotIn('service_polling', self.node.driver_internal_info) + self.assertNotIn('skip_current_service_step', + self.node.driver_internal_info) + self.assertNotIn('agent_secret_token', self.node.driver_internal_info) + self.assertNotIn('agent_secret_token_pregenerated', + self.node.driver_internal_info) + self.assertEqual(last_error, self.node.last_error) + self.assertTrue(self.node.maintenance) + self.assertEqual(last_error, self.node.maintenance_reason) + self.assertEqual('service failure', self.node.fault) + driver = self.task.driver.deploy + driver.tear_down_service.assert_called_once_with(self.task) + if prov_state == states.SERVICEFAIL: + self.assertFalse(self.task.process_event.called) + else: + self.task.process_event.assert_called_once_with('fail') + self.assertNotIn('agent_url', self.node.driver_internal_info) + mock_log_error.assert_called_once_with(msg, exc_info=False) + + def test_servicing_error_handler(self): + self._test_servicing_error_handler() + + def 
test_servicing_error_handler_servicewait(self): + self._test_servicing_error_handler(prov_state=states.SERVICEWAIT) + + def test_servicing_error_handler_servicefail(self): + self._test_servicing_error_handler(prov_state=states.SERVICEFAIL) + + def test_servicing_error_handler_no_teardown(self): + target = states.MANAGEABLE + self.node.target_provision_state = target + conductor_utils.servicing_error_handler(self.task, 'foo', + tear_down_service=False) + self.assertFalse(self.task.driver.deploy.tear_down_service.called) + self.task.process_event.assert_called_once_with('fail') + + def test_servicing_error_handler_no_fail(self): + conductor_utils.servicing_error_handler(self.task, 'foo', + set_fail_state=False) + driver = self.task.driver.deploy + driver.tear_down_service.assert_called_once_with(self.task) + self.assertFalse(self.task.process_event.called) + + @mock.patch.object(conductor_utils, 'LOG', autospec=True) + def test_servicing_error_handler_tear_down_error(self, log_mock): + def _side_effect(task): + # simulate overwriting last error by another operation (e.g. 
power)
+            task.node.last_error = None
+            raise Exception('bar')
+
+        driver = self.task.driver.deploy
+        msg = 'foo'
+        driver.tear_down_service.side_effect = _side_effect
+        conductor_utils.servicing_error_handler(self.task, msg)
+        log_mock.error.assert_called_once_with(msg, exc_info=False)
+        self.assertTrue(log_mock.exception.called)
+        self.assertIn(msg, self.node.last_error)
+        self.assertIn(msg, self.node.maintenance_reason)
+        self.assertEqual('service failure', self.node.fault)
+
 class ValidatePortPhysnetTestCase(db_base.DbTestCase):
@@ -2738,3 +2822,46 @@ class GetTokenProjectFromRequestTestCase(db_base.DbTestCase):
         self.context.auth_token_info = self.auth_token_info
         res = conductor_utils.get_token_project_from_request(self.context)
         self.assertEqual('user-project', res)
+
+
+class ServiceUtilsTestCase(db_base.DbTestCase):
+
+    def setUp(self):
+        super(ServiceUtilsTestCase, self).setUp()
+        self.node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            uuid=uuidutils.generate_uuid(),
+            driver_internal_info={
+                'agent_last_heartbeat': str(timeutils.utcnow().isoformat()),
+                'agent_url': 'a_url'})
+        self.config(fast_track=True, group='deploy')
+
+    @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)
+    def test_wipe_service_internal_info(self, mock_power):
+        mock_power.return_value = False
+        self.node.driver_internal_info = {
+            'service_steps': {'foo': 'bar'},
+            'agent_cached_service_steps': {'more_foo': None},
+            'service_reboot': False,
+            'service_polling': 1,
+            'service_disable_ramdisk': False,
+            'skip_current_service_step': False,
+            'steps_validated': 'meow',
+            'agent_secret_token': 'sekret'}
+        self.node.save()
+        not_in_list = ['agent_cached_service_steps',
+                       'service_reboot',
+                       'service_polling',
+                       'service_disable_ramdisk',
+                       'skip_current_service_step',
+                       'steps_validated',
+                       'agent_secret_token']
+        with task_manager.acquire(self.context, self.node.id,
+                                  shared=True) as task:
+            conductor_utils.wipe_service_internal_info(task)
+
task.node.save() + self.assertIsNone( + task.node.driver_internal_info['service_steps'] + ) + for field in not_in_list: + self.assertNotIn(field, task.node.driver_internal_info) diff --git a/ironic/tests/unit/db/utils.py b/ironic/tests/unit/db/utils.py index a1cbad9306..8e72608cb8 100644 --- a/ironic/tests/unit/db/utils.py +++ b/ironic/tests/unit/db/utils.py @@ -238,7 +238,8 @@ def get_test_node(**kw): 'boot_mode': kw.get('boot_mode', None), 'secure_boot': kw.get('secure_boot', None), 'shard': kw.get('shard', None), - 'parent_node': kw.get('parent_node', None) + 'parent_node': kw.get('parent_node', None), + 'service_step': kw.get('service_step'), } for iface in drivers_base.ALL_INTERFACES: diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py index b9e09f324e..c589d520be 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base.py @@ -25,6 +25,7 @@ from ironic.common import exception from ironic.common import image_service from ironic.common import states from ironic.conductor import cleaning +from ironic.conductor import servicing from ironic.conductor import steps as conductor_steps from ironic.conductor import task_manager from ironic.conductor import utils as manager_utils @@ -159,7 +160,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): autospec=True) def test_heartbeat_in_maintenance(self, next_step_mock): # NOTE(pas-ha) checking only for states that are not noop - for state in (states.DEPLOYWAIT, states.CLEANWAIT): + for state in (states.DEPLOYWAIT, states.CLEANWAIT, + states.SERVICEWAIT): next_step_mock.reset_mock() self.node.provision_state = state self.node.maintenance = True @@ -186,7 +188,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): group='conductor') for state, expected in [(states.DEPLOYWAIT, states.DEPLOYFAIL), (states.CLEANWAIT, states.CLEANFAIL), - (states.RESCUEWAIT, states.RESCUEFAIL)]: + 
(states.RESCUEWAIT, states.RESCUEFAIL), + (states.SERVICEWAIT, states.SERVICEFAIL)]: next_step_mock.reset_mock() self.node.provision_state = state self.node.maintenance = True @@ -211,7 +214,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): autospec=True) def test_heartbeat_with_reservation(self, next_step_mock): # NOTE(pas-ha) checking only for states that are not noop - for state in (states.DEPLOYWAIT, states.CLEANWAIT): + for state in (states.DEPLOYWAIT, states.CLEANWAIT, + states.SERVICEWAIT): next_step_mock.reset_mock() self.node.provision_state = state self.node.reservation = 'localhost' @@ -232,7 +236,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): def test_heartbeat_noops_in_wrong_state(self, next_step_mock, log_mock): allowed = {states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT, states.DEPLOYING, states.CLEANING, states.RESCUING, - states.DEPLOYHOLD, states.CLEANHOLD} + states.DEPLOYHOLD, states.CLEANHOLD, states.SERVICEHOLD, + states.SERVICING, states.SERVICEWAIT} for state in set(states.machine.states) - allowed: for m in (next_step_mock, log_mock): m.reset_mock() @@ -253,7 +258,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): def test_heartbeat_noops_in_wrong_state2(self, next_step_mock): CONF.set_override('allow_provisioning_in_maintenance', False, group='conductor') - allowed = {states.DEPLOYWAIT, states.CLEANWAIT} + allowed = {states.DEPLOYWAIT, states.CLEANWAIT, + states.SERVICEWAIT} for state in set(states.machine.states) - allowed: next_step_mock.reset_mock() with task_manager.acquire(self.context, self.node.uuid, @@ -466,7 +472,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): @mock.patch.object(agent_base.LOG, 'error', autospec=True) def test_heartbeat_records_when_appropriate(self, log_mock): for provision_state in (states.CLEANING, states.DEPLOYING, - states.CLEANHOLD, states.DEPLOYHOLD): + states.CLEANHOLD, states.DEPLOYHOLD, + states.SERVICEHOLD, states.SERVICING): self.node.driver_internal_info = {} 
self.node.provision_state = provision_state self.node.save() @@ -521,6 +528,68 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): task.node.driver_internal_info['agent_last_heartbeat']) self.assertEqual(provision_state, task.node.provision_state) + @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'refresh_steps', autospec=True) + @mock.patch.object(conductor_steps, 'set_node_service_steps', + autospec=True) + @mock.patch.object(servicing, 'continue_node_service', autospec=True) + def test_heartbeat_resume_service(self, mock_service, mock_set_steps, + mock_refresh, mock_touch): + self.node.clean_step = {} + self.node.provision_state = states.SERVICEWAIT + self.node.save() + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0') + + mock_touch.assert_called_once_with(mock.ANY) + mock_refresh.assert_called_once_with(mock.ANY, task, 'service') + mock_service.assert_called_once_with(task) + mock_set_steps.assert_called_once_with(task) + + @mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'continue_servicing', autospec=True) + def test_heartbeat_continue_servicing(self, mock_continue, mock_touch): + self.node.service_step = { + 'priority': 10, + 'interface': 'deploy', + 'step': 'foo', + 'reboot_requested': False + } + self.node.provision_state = states.SERVICEWAIT + self.node.save() + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0') + + mock_touch.assert_called_once_with(mock.ANY) + mock_continue.assert_called_once_with(mock.ANY, task) + + @mock.patch.object(manager_utils, 'servicing_error_handler', autospec=True) + @mock.patch.object(agent_base.HeartbeatMixin, + 'continue_servicing', autospec=True) + def 
test_heartbeat_continue_servicing_fails(self, mock_continue, + mock_handler): + self.node.service_step = { + 'priority': 10, + 'interface': 'deploy', + 'step': 'foo', + 'reboot_requested': False + } + + mock_continue.side_effect = Exception() + + self.node.provision_state = states.SERVICEWAIT + self.node.save() + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0') + + mock_continue.assert_called_once_with(mock.ANY, task) + mock_handler.assert_called_once_with(task, mock.ANY, mock.ANY) + class AgentRescueTests(AgentDeployMixinBaseTest): @@ -1604,6 +1673,25 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest): self.assertNotIn('deployment_reboot', task.node.driver_internal_info) + @mock.patch.object(deploy_utils, 'build_agent_options', autospec=True) + @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk', spec_set=True, + autospec=True) + @mock.patch.object(manager_utils, 'servicing_error_handler', autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + def test__post_step_reboot_fail_servicing(self, mock_reboot, mock_handler, + mock_prepare, mock_build_opt): + mock_reboot.side_effect = RuntimeError("broken") + self.node.provision_state = states.SERVICEWAIT + self.node.save() + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + agent_base._post_step_reboot(task, 'service') + mock_reboot.assert_called_once_with(task, states.REBOOT) + mock_handler.assert_called_once_with(task, mock.ANY, + traceback=True) + self.assertNotIn('servicing_reboot', + task.node.driver_internal_info) + def _test_clean_step_hook(self): """Helper method for unit tests related to clean step hooks.""" some_function_mock = mock.MagicMock() @@ -1998,6 +2086,111 @@ class ContinueCleaningTest(AgentDeployMixinBaseTest): error_mock.assert_called_once_with(task, mock.ANY, traceback=False) +class ContinueServiceTest(AgentDeployMixinBaseTest): + + 
def setUp(self): + super().setUp() + self.node.provision_state = states.SERVICEWAIT + self.node.target_provision_state = states.ACTIVE + self.node.save() + + @mock.patch.object(servicing, 'continue_node_service', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + def test_continue_servicing(self, status_mock, service_mock): + # Test a successful execute clean step on the agent + self.node.service_step = { + 'priority': 10, + 'interface': 'deploy', + 'step': 'erase_devices', + 'reboot_requested': False + } + self.node.save() + status_mock.return_value = [{ + 'command_status': 'SUCCEEDED', + 'command_name': 'execute_service_step', + 'command_result': { + 'service_step': self.node.service_step + } + }] + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + self.deploy.continue_servicing(task) + service_mock.assert_called_once_with(task) + self.assertEqual(states.SERVICING, task.node.provision_state) + self.assertEqual(states.ACTIVE, + task.node.target_provision_state) + + @mock.patch.object(deploy_utils, 'build_agent_options', autospec=True) + @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk', spec_set=True, + autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + def test_continue_servicing_reboot( + self, status_mock, reboot_mock, mock_prepare, mock_build_opt): + # Test a successful execute clean step on the agent, with reboot + self.node.service_step = { + 'priority': 42, + 'interface': 'deploy', + 'step': 'reboot_me_afterwards', + 'reboot_requested': True + } + self.node.save() + status_mock.return_value = [{ + 'command_status': 'SUCCEEDED', + 'command_name': 'execute_service_step', + 'command_result': { + 'service_step': self.node.service_step + } + }] + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + 
self.deploy.continue_servicing(task) + reboot_mock.assert_called_once_with(task, states.REBOOT) + + @mock.patch.object(servicing, 'continue_node_service', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + def test_continue_servicing_after_reboot(self, status_mock, service_mock): + # Test a successful execute clean step on the agent, with reboot + self.node.service_step = { + 'priority': 42, + 'interface': 'deploy', + 'step': 'reboot_me_afterwards', + 'reboot_requested': True + } + driver_internal_info = self.node.driver_internal_info + driver_internal_info['servicing_reboot'] = True + self.node.driver_internal_info = driver_internal_info + self.node.save() + # Represents a freshly booted agent with no commands + status_mock.return_value = [] + + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + self.deploy.continue_servicing(task) + service_mock.assert_called_once_with(task) + self.assertEqual(states.SERVICING, task.node.provision_state) + self.assertNotIn('servicing_reboot', + task.node.driver_internal_info) + + @mock.patch.object(servicing, 'continue_node_service', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + def test_continue_servicing_running(self, status_mock, service_mock): + # Test that no action is taken while a clean step is executing + status_mock.return_value = [{ + 'command_status': 'RUNNING', + 'command_name': 'execute_service_step', + 'command_result': None + }] + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + self.deploy.continue_servicing(task) + self.assertFalse(service_mock.called) + + class TestRefreshCleanSteps(AgentDeployMixinBaseTest): def setUp(self): diff --git a/ironic/tests/unit/objects/test_objects.py b/ironic/tests/unit/objects/test_objects.py index ab9f64693c..3546f3721a 100644 --- a/ironic/tests/unit/objects/test_objects.py +++ 
b/ironic/tests/unit/objects/test_objects.py @@ -676,7 +676,7 @@ class TestObject(_LocalTest, _TestObject): # version bump. It is an MD5 hash of the object fields and remotable methods. # The fingerprint values should only be changed if there is a version bump. expected_object_fingerprints = { - 'Node': '1.39-ee3f5ff28b79f9fabf84a50e34a71684', + 'Node': '1.40-2182d4660bb5d5e4cc5670c37012ef71', 'MyObj': '1.5-9459d30d6954bffc7a9afd347a807ca6', 'Chassis': '1.3-d656e039fd8ae9f34efc232ab3980905', 'Port': '1.11-97bf15b61224f26c65e90f007d78bfd2', diff --git a/releasenotes/notes/add-service-steps-deb45c9a0e77a647.yaml b/releasenotes/notes/add-service-steps-deb45c9a0e77a647.yaml new file mode 100644 index 0000000000..74e2bb7981 --- /dev/null +++ b/releasenotes/notes/add-service-steps-deb45c9a0e77a647.yaml @@ -0,0 +1,19 @@ +--- +features: + - | + Adds a new Ironic capability called ``service_steps`` which allows a + deployed ``ACTIVE`` node to be modified utilizing a new API provision + state verb of ``service`` which can include a list of ``service_steps`` + to be performed. This work is inspired by ``clean_steps`` and + ``deploy_steps`` and similar to those efforts, this functionality will + continue to evolve as new features, functionality, and capabilities + are added. + - Adds a new driver method decorator ``base.service_step`` which operates + exactly like the existing ``base.clean_step`` and ``base.deploy_step`` + decorators. Driver methods which are decorated *can* be invoked utilizing + the service steps. +issues: + - | + The ``service_steps`` functionality does not understand how to poll and + communicate with the ``ironic-python-agent``. This is anticipated to be + addressed in a future release.