Adds service steps

A huge list of initial work for service steps

* Adds service_step verb
* Adds service_step db/object/API field on the node object for the
  status.
* Increments the API version to 1.87 for both changes.
* Increments the RPC API version to 1.57.
* Adds initial testing to help ensure that supplied steps
  are passed through and executed.

Does not:

* Have tests for starting the agent ramdisk, although this is
  relatively boilerplate.
* Have a collection of pre-decorated steps available for immediate
  consumption.

Change-Id: I5b9dd928f24dff7877a4ab8dc7b743058cace994
This commit is contained in:
Julia Kreger 2023-05-30 11:18:32 -07:00
parent 8e2aab8291
commit 2366a4b86e
35 changed files with 3109 additions and 65 deletions

View File

@ -2,6 +2,19 @@
REST API Version History REST API Version History
======================== ========================
1.87 (Bobcat)
-------------
Adds the ``service`` provision state verb to allow modifications
via the "steps" interface to occur with a baremetal node. With this
functionality comes a ``service_step`` field on the ``/v1/nodes``
based resources, which indicates the current step.
1.86 (Bobcat)
-------------
Adds a ``firmware_interface`` field to the ``/v1/nodes`` resources.
1.85 (Bobcat) 1.85 (Bobcat)
------------- -------------

View File

@ -56,9 +56,12 @@ from ironic import objects
CONF = ironic.conf.CONF CONF = ironic.conf.CONF
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
_CLEAN_STEPS_SCHEMA = {
# TODO(TheJulia): We *really* need to just have *one* schema.
_STEPS_SCHEMA = {
"$schema": "http://json-schema.org/schema#", "$schema": "http://json-schema.org/schema#",
"title": "Clean steps schema", "title": "Steps schema",
"type": "array", "type": "array",
# list of clean steps # list of clean steps
"items": { "items": {
@ -124,7 +127,8 @@ PROVISION_ACTION_STATES = (ir_states.VERBS['manage'],
ir_states.VERBS['provide'], ir_states.VERBS['provide'],
ir_states.VERBS['abort'], ir_states.VERBS['abort'],
ir_states.VERBS['adopt'], ir_states.VERBS['adopt'],
ir_states.VERBS['unhold']) ir_states.VERBS['unhold'],
ir_states.VERBS['service'])
_NODES_CONTROLLER_RESERVED_WORDS = None _NODES_CONTROLLER_RESERVED_WORDS = None
@ -950,7 +954,8 @@ class NodeStatesController(rest.RestController):
def _do_provision_action(self, rpc_node, target, configdrive=None, def _do_provision_action(self, rpc_node, target, configdrive=None,
clean_steps=None, deploy_steps=None, clean_steps=None, deploy_steps=None,
rescue_password=None, disable_ramdisk=None): rescue_password=None, disable_ramdisk=None,
service_steps=None):
topic = api.request.rpcapi.get_topic_for(rpc_node) topic = api.request.rpcapi.get_topic_for(rpc_node)
# Note that there is a race condition. The node state(s) could change # Note that there is a race condition. The node state(s) could change
# by the time the RPC call is made and the TaskManager manager gets a # by the time the RPC call is made and the TaskManager manager gets a
@ -993,6 +998,17 @@ class NodeStatesController(rest.RestController):
api.request.rpcapi.do_node_clean( api.request.rpcapi.do_node_clean(
api.request.context, rpc_node.uuid, clean_steps, api.request.context, rpc_node.uuid, clean_steps,
disable_ramdisk, topic=topic) disable_ramdisk, topic=topic)
elif target == ir_states.VERBS['service']:
if not service_steps:
msg = (_('"service_steps" is required when setting '
'target provision state to '
'%s') % ir_states.VERBS['service'])
raise exception.ClientSideError(
msg, status_code=http_client.BAD_REQUEST)
_check_service_steps(service_steps)
api.request.rpcapi.do_node_service(
api.request.context, rpc_node.uuid, service_steps,
disable_ramdisk, topic=topic)
elif target in PROVISION_ACTION_STATES: elif target in PROVISION_ACTION_STATES:
api.request.rpcapi.do_provisioning_action( api.request.rpcapi.do_provisioning_action(
api.request.context, rpc_node.uuid, target, topic) api.request.context, rpc_node.uuid, target, topic)
@ -1008,10 +1024,12 @@ class NodeStatesController(rest.RestController):
clean_steps=args.types(type(None), list), clean_steps=args.types(type(None), list),
deploy_steps=args.types(type(None), list), deploy_steps=args.types(type(None), list),
rescue_password=args.string, rescue_password=args.string,
disable_ramdisk=args.boolean) disable_ramdisk=args.boolean,
service_steps=args.types(type(None), list))
def provision(self, node_ident, target, configdrive=None, def provision(self, node_ident, target, configdrive=None,
clean_steps=None, deploy_steps=None, clean_steps=None, deploy_steps=None,
rescue_password=None, disable_ramdisk=None): rescue_password=None, disable_ramdisk=None,
service_steps=None):
"""Asynchronous trigger the provisioning of the node. """Asynchronous trigger the provisioning of the node.
This will set the target provision state of the node, and a This will set the target provision state of the node, and a
@ -1069,11 +1087,31 @@ class NodeStatesController(rest.RestController):
inside the rescue environment. This is required (and only valid), inside the rescue environment. This is required (and only valid),
when target is "rescue". when target is "rescue".
:param disable_ramdisk: Whether to skip booting ramdisk for cleaning. :param disable_ramdisk: Whether to skip booting ramdisk for cleaning.
:param service_steps: A list of service steps that will be performed on
the node. A service step is a dictionary with required keys
'interface', 'step', 'priority' and 'args'. If specified, the value
for 'args' is a keyword variable argument dictionary that is passed
to the service step method.::
{ 'interface': <driver_interface>,
'step': <name_of_service_step>,
'args': {<arg1>: <value1>, ..., <argn>: <valuen>}
'priority': <integer>}
For example (this isn't a real example, this service step doesn't
exist)::
{ 'interface': 'deploy',
'step': 'upgrade_firmware',
'args': {'force': True},
'priority': 90 }
:raises: NodeLocked (HTTP 409) if the node is currently locked. :raises: NodeLocked (HTTP 409) if the node is currently locked.
:raises: ClientSideError (HTTP 409) if the node is already being :raises: ClientSideError (HTTP 409) if the node is already being
provisioned. provisioned.
:raises: InvalidParameterValue (HTTP 400), if validation of :raises: InvalidParameterValue (HTTP 400), if validation of
clean_steps, deploy_steps or power driver interface fails. clean_steps, deploy_steps, service_steps or power driver
interface fails.
:raises: InvalidStateRequested (HTTP 400) if the requested transition :raises: InvalidStateRequested (HTTP 400) if the requested transition
is not possible from the current state. is not possible from the current state.
:raises: NodeInMaintenance (HTTP 400), if operation cannot be :raises: NodeInMaintenance (HTTP 400), if operation cannot be
@ -1140,9 +1178,13 @@ class NodeStatesController(rest.RestController):
if not api_utils.allow_unhold_verb(): if not api_utils.allow_unhold_verb():
raise exception.NotAcceptable() raise exception.NotAcceptable()
if target == ir_states.VERBS['service']:
if not api_utils.allow_service_verb():
raise exception.NotAcceptable()
self._do_provision_action(rpc_node, target, configdrive, clean_steps, self._do_provision_action(rpc_node, target, configdrive, clean_steps,
deploy_steps, rescue_password, deploy_steps, rescue_password,
disable_ramdisk) disable_ramdisk, service_steps)
# Set the HTTP Location Header # Set the HTTP Location Header
url_args = '/'.join([node_ident, 'states']) url_args = '/'.join([node_ident, 'states'])
@ -1156,7 +1198,7 @@ def _check_clean_steps(clean_steps):
clean_steps parameter of :func:`NodeStatesController.provision`. clean_steps parameter of :func:`NodeStatesController.provision`.
:raises: InvalidParameterValue if validation of steps fails. :raises: InvalidParameterValue if validation of steps fails.
""" """
_check_steps(clean_steps, 'clean', _CLEAN_STEPS_SCHEMA) _check_steps(clean_steps, 'clean', _STEPS_SCHEMA)
def _check_deploy_steps(deploy_steps): def _check_deploy_steps(deploy_steps):
@ -1169,6 +1211,16 @@ def _check_deploy_steps(deploy_steps):
_check_steps(deploy_steps, 'deploy', _DEPLOY_STEPS_SCHEMA) _check_steps(deploy_steps, 'deploy', _DEPLOY_STEPS_SCHEMA)
def _check_service_steps(service_steps):
    """Validate the keys and values of user-supplied service steps.

    Delegates to the generic step checker using the shared steps schema.

    :param service_steps: a list of steps. For more details, see the
        service_steps parameter of :func:`NodeStatesController.provision`.
    :raises: InvalidParameterValue if validation of steps fails.
    """
    step_type = 'service'
    _check_steps(service_steps, step_type, _STEPS_SCHEMA)
def _check_steps(steps, step_type, schema): def _check_steps(steps, step_type, schema):
"""Ensure all necessary keys are present and correct in steps. """Ensure all necessary keys are present and correct in steps.
@ -1429,6 +1481,7 @@ def _get_fields_for_node_query(fields=None):
'retired', 'retired',
'retired_reason', 'retired_reason',
'secure_boot', 'secure_boot',
'service_step',
'shard', 'shard',
'storage_interface', 'storage_interface',
'target_power_state', 'target_power_state',
@ -2105,7 +2158,7 @@ class NodesController(rest.RestController):
'instance_info', 'driver_internal_info', 'instance_info', 'driver_internal_info',
'clean_step', 'deploy_step', 'clean_step', 'deploy_step',
'raid_config', 'target_raid_config', 'raid_config', 'target_raid_config',
'traits', 'network_data'] 'traits', 'network_data', 'service_step']
_subcontroller_map = { _subcontroller_map = {
'ports': port.PortsController, 'ports': port.PortsController,

View File

@ -808,7 +808,8 @@ VERSIONED_FIELDS = {
'secure_boot': versions.MINOR_75_NODE_BOOT_MODE, 'secure_boot': versions.MINOR_75_NODE_BOOT_MODE,
'shard': versions.MINOR_82_NODE_SHARD, 'shard': versions.MINOR_82_NODE_SHARD,
'parent_node': versions.MINOR_83_PARENT_CHILD_NODES, 'parent_node': versions.MINOR_83_PARENT_CHILD_NODES,
'firmware_interface': versions.MINOR_86_FIRMWARE_INTERFACE 'firmware_interface': versions.MINOR_86_FIRMWARE_INTERFACE,
'service_step': versions.MINOR_87_SERVICE
} }
for field in V31_FIELDS: for field in V31_FIELDS:
@ -1957,6 +1958,11 @@ def allow_unhold_verb():
return api.request.version.minor >= versions.MINOR_85_UNHOLD_VERB return api.request.version.minor >= versions.MINOR_85_UNHOLD_VERB
def allow_service_verb():
    """Check if the service verb may be passed to the API.

    The verb is accepted once the requested API minor version is at
    least ``versions.MINOR_87_SERVICE``.

    :returns: the result of comparing the requested minor version with
        the version that introduced the service verb.
    """
    requested_minor = api.request.version.minor
    return requested_minor >= versions.MINOR_87_SERVICE
def check_allow_deploy_steps(target, deploy_steps): def check_allow_deploy_steps(target, deploy_steps):
"""Check if deploy steps are allowed""" """Check if deploy steps are allowed"""

View File

@ -123,6 +123,8 @@ BASE_VERSION = 1
# v1.83: Add child node modeling # v1.83: Add child node modeling
# v1.84: Add ramdisk callback to continue inspection. # v1.84: Add ramdisk callback to continue inspection.
# v1.85: Add unhold verb # v1.85: Add unhold verb
# v1.86: Add firmware interface
# v1.87: Add service verb
MINOR_0_JUNO = 0 MINOR_0_JUNO = 0
MINOR_1_INITIAL_VERSION = 1 MINOR_1_INITIAL_VERSION = 1
MINOR_2_AVAILABLE_STATE = 2 MINOR_2_AVAILABLE_STATE = 2
@ -210,6 +212,8 @@ MINOR_83_PARENT_CHILD_NODES = 83
MINOR_84_CONTINUE_INSPECTION = 84 MINOR_84_CONTINUE_INSPECTION = 84
MINOR_85_UNHOLD_VERB = 85 MINOR_85_UNHOLD_VERB = 85
MINOR_86_FIRMWARE_INTERFACE = 86 MINOR_86_FIRMWARE_INTERFACE = 86
MINOR_87_SERVICE = 87
# When adding another version, update: # When adding another version, update:
# - MINOR_MAX_VERSION # - MINOR_MAX_VERSION
@ -217,7 +221,7 @@ MINOR_86_FIRMWARE_INTERFACE = 86
# explanation of what changed in the new version # explanation of what changed in the new version
# - common/release_mappings.py, RELEASE_MAPPING['master']['api'] # - common/release_mappings.py, RELEASE_MAPPING['master']['api']
MINOR_MAX_VERSION = MINOR_86_FIRMWARE_INTERFACE MINOR_MAX_VERSION = MINOR_87_SERVICE
# String representations of the minor and maximum versions # String representations of the minor and maximum versions
_MIN_VERSION_STRING = '{}.{}'.format(BASE_VERSION, MINOR_1_INITIAL_VERSION) _MIN_VERSION_STRING = '{}.{}'.format(BASE_VERSION, MINOR_1_INITIAL_VERSION)

View File

@ -24,4 +24,8 @@ RESCUE_ABORT_FAILURE = 'rescue abort failure'
""" Node is moved to maintenance due to failure of cleaning up during """ Node is moved to maintenance due to failure of cleaning up during
rescue abort. """ rescue abort. """
VALID_FAULTS = (POWER_FAILURE, CLEAN_FAILURE, RESCUE_ABORT_FAILURE) SERVICE_FAILURE = 'service failure'
""" Node is moved to maintenance due to failure of a service operation. """
VALID_FAULTS = (POWER_FAILURE, CLEAN_FAILURE, RESCUE_ABORT_FAILURE,
SERVICE_FAILURE)

View File

@ -989,3 +989,12 @@ class NeutronNetworkInterfaceMixin(object):
# Fall back to non-managed in-band inspection # Fall back to non-managed in-band inspection
raise exception.UnsupportedDriverExtension( raise exception.UnsupportedDriverExtension(
driver=task.node.driver, extension='inspection') driver=task.node.driver, extension='inspection')
def get_servicing_network_uuid(self, task):
    """Return the validated network to use while servicing a node.

    The node's ``driver_info['servicing_network']`` takes precedence;
    ``CONF.neutron.servicing_network`` is used when that value is
    missing or empty.

    :param task: the task whose node and context are used.
    :returns: whatever ``validate_network`` returns for the selected
        network.
    """
    network = task.node.driver_info.get('servicing_network')
    if not network:
        # Fall back to the deployment-wide default.
        network = CONF.neutron.servicing_network
    return validate_network(
        network, _('servicing network'),
        context=task.context)

View File

@ -574,12 +574,12 @@ RELEASE_MAPPING = {
} }
}, },
'master': { 'master': {
'api': '1.86', 'api': '1.87',
'rpc': '1.56', 'rpc': '1.57',
'objects': { 'objects': {
'Allocation': ['1.1'], 'Allocation': ['1.1'],
'BIOSSetting': ['1.1'], 'BIOSSetting': ['1.1'],
'Node': ['1.39', '1.38', '1.37'], 'Node': ['1.40', '1.39', '1.38', '1.37'],
'NodeHistory': ['1.0'], 'NodeHistory': ['1.0'],
'NodeInventory': ['1.0'], 'NodeInventory': ['1.0'],
'Conductor': ['1.3'], 'Conductor': ['1.3'],

View File

@ -53,6 +53,7 @@ VERBS = {
'rescue': 'rescue', 'rescue': 'rescue',
'unrescue': 'unrescue', 'unrescue': 'unrescue',
'unhold': 'unhold', 'unhold': 'unhold',
'service': 'service',
} }
""" Mapping of state-changing events that are PUT to the REST API """ Mapping of state-changing events that are PUT to the REST API
@ -235,11 +236,27 @@ UNRESCUEFAIL = 'unrescue failed'
UNRESCUING = 'unrescuing' UNRESCUING = 'unrescuing'
""" Node is being restored from rescue mode (to active state). """ """ Node is being restored from rescue mode (to active state). """
SERVICE = 'service'
""" Node is being requested to be modified through a service step. """
SERVICING = 'servicing'
""" Node is actively being changed by a service step. """
SERVICEWAIT = 'service wait'
""" Node is waiting for an operation to complete. """
SERVICEFAIL = 'service failed'
""" Node has failed in a service step execution. """
SERVICEHOLD = 'service hold'
""" Node is being held for direct intervention from a service step. """
# NOTE(kaifeng): INSPECTING is allowed to keep backwards compatibility, # NOTE(kaifeng): INSPECTING is allowed to keep backwards compatibility,
# starting from API 1.39 node update is disallowed in this state. # starting from API 1.39 node update is disallowed in this state.
UPDATE_ALLOWED_STATES = (DEPLOYFAIL, INSPECTING, INSPECTFAIL, INSPECTWAIT, UPDATE_ALLOWED_STATES = (DEPLOYFAIL, INSPECTING, INSPECTFAIL, INSPECTWAIT,
CLEANFAIL, ERROR, VERIFYING, ADOPTFAIL, RESCUEFAIL, CLEANFAIL, ERROR, VERIFYING, ADOPTFAIL, RESCUEFAIL,
UNRESCUEFAIL) UNRESCUEFAIL, SERVICE, SERVICEHOLD, SERVICEFAIL)
"""Transitional states in which we allow updating a node.""" """Transitional states in which we allow updating a node."""
DELETE_ALLOWED_STATES = (MANAGEABLE, ENROLL, ADOPTFAIL) DELETE_ALLOWED_STATES = (MANAGEABLE, ENROLL, ADOPTFAIL)
@ -250,7 +267,7 @@ STABLE_STATES = (ENROLL, MANAGEABLE, AVAILABLE, ACTIVE, ERROR, RESCUE)
UNSTABLE_STATES = (DEPLOYING, DEPLOYWAIT, CLEANING, CLEANWAIT, VERIFYING, UNSTABLE_STATES = (DEPLOYING, DEPLOYWAIT, CLEANING, CLEANWAIT, VERIFYING,
DELETING, INSPECTING, INSPECTWAIT, ADOPTING, RESCUING, DELETING, INSPECTING, INSPECTWAIT, ADOPTING, RESCUING,
RESCUEWAIT, UNRESCUING) RESCUEWAIT, UNRESCUING, SERVICING, SERVICEWAIT)
"""States that can be changed without external request.""" """States that can be changed without external request."""
STUCK_STATES_TREATED_AS_FAIL = (DEPLOYING, CLEANING, VERIFYING, INSPECTING, STUCK_STATES_TREATED_AS_FAIL = (DEPLOYING, CLEANING, VERIFYING, INSPECTING,
@ -272,12 +289,15 @@ _FASTTRACK_LOOKUP_ALLOWED_STATES = (ENROLL, MANAGEABLE, AVAILABLE,
DEPLOYING, DEPLOYWAIT, DEPLOYING, DEPLOYWAIT,
CLEANING, CLEANWAIT, CLEANING, CLEANWAIT,
INSPECTING, INSPECTWAIT, INSPECTING, INSPECTWAIT,
RESCUING, RESCUEWAIT) RESCUING, RESCUEWAIT,
SERVICING, SERVICEWAIT,
SERVICEHOLD)
FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES) FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES)
"""States where API lookups are permitted with fast track enabled.""" """States where API lookups are permitted with fast track enabled."""
FAILURE_STATES = frozenset((DEPLOYFAIL, CLEANFAIL, INSPECTFAIL, FAILURE_STATES = frozenset((DEPLOYFAIL, CLEANFAIL, INSPECTFAIL,
RESCUEFAIL, UNRESCUEFAIL, ADOPTFAIL)) RESCUEFAIL, UNRESCUEFAIL, ADOPTFAIL,
SERVICEFAIL))
############## ##############
@ -594,3 +614,49 @@ machine.add_transition(ADOPTFAIL, ADOPTING, 'adopt')
# A node that failed adoption can be moved back to manageable # A node that failed adoption can be moved back to manageable
machine.add_transition(ADOPTFAIL, MANAGEABLE, 'manage') machine.add_transition(ADOPTFAIL, MANAGEABLE, 'manage')
# Add service* states. Every service state targets ACTIVE.
machine.add_state(SERVICING, target=ACTIVE, **watchers)
machine.add_state(SERVICEWAIT, target=ACTIVE, **watchers)
machine.add_state(SERVICEFAIL, target=ACTIVE, **watchers)
machine.add_state(SERVICEHOLD, target=ACTIVE, **watchers)
# A node in servicing can be returned to active
machine.add_transition(SERVICING, ACTIVE, 'done')
# A node in active can be serviced
machine.add_transition(ACTIVE, SERVICING, 'service')
# A node in servicing can be failed
machine.add_transition(SERVICING, SERVICEFAIL, 'fail')
# A node in servicing can enter a wait state
machine.add_transition(SERVICING, SERVICEWAIT, 'wait')
# A node in servicing or service wait can be held
machine.add_transition(SERVICING, SERVICEHOLD, 'hold')
machine.add_transition(SERVICEWAIT, SERVICEHOLD, 'hold')
# A held node in service can get more service steps to start over
machine.add_transition(SERVICEHOLD, SERVICING, 'service')
# A held node in service can be released from hold, back to service wait
machine.add_transition(SERVICEHOLD, SERVICEWAIT, 'unhold')
# A node in service wait can resume
machine.add_transition(SERVICEWAIT, SERVICING, 'resume')
# A node in service wait can fail
machine.add_transition(SERVICEWAIT, SERVICEFAIL, 'fail')
# A node in service wait can be aborted
machine.add_transition(SERVICEWAIT, SERVICEFAIL, 'abort')
# A node in service hold can be aborted
machine.add_transition(SERVICEHOLD, SERVICEFAIL, 'abort')
# A node in service fail can re-enter service
machine.add_transition(SERVICEFAIL, SERVICING, 'service')
# A node in service fail can be rescued
machine.add_transition(SERVICEFAIL, RESCUING, 'rescue')

View File

@ -66,6 +66,7 @@ from ironic.conductor import deployments
from ironic.conductor import inspection from ironic.conductor import inspection
from ironic.conductor import notification_utils as notify_utils from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import periodics from ironic.conductor import periodics
from ironic.conductor import servicing
from ironic.conductor import steps as conductor_steps from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager from ironic.conductor import task_manager
from ironic.conductor import utils from ironic.conductor import utils
@ -93,7 +94,7 @@ class ConductorManager(base_manager.BaseConductorManager):
# NOTE(rloo): This must be in sync with rpcapi.ConductorAPI's. # NOTE(rloo): This must be in sync with rpcapi.ConductorAPI's.
# NOTE(pas-ha): This also must be in sync with # NOTE(pas-ha): This also must be in sync with
# ironic.common.release_mappings.RELEASE_MAPPING['master'] # ironic.common.release_mappings.RELEASE_MAPPING['master']
RPC_API_VERSION = '1.56' RPC_API_VERSION = '1.57'
target = messaging.Target(version=RPC_API_VERSION) target = messaging.Target(version=RPC_API_VERSION)
@ -3710,6 +3711,81 @@ class ConductorManager(base_manager.BaseConductorManager):
task, inventory, plugin_data), task, inventory, plugin_data),
err_handler=utils.provisioning_error_handler) err_handler=utils.provisioning_error_handler)
@METRICS.timer('ConductorManager.do_node_service')
@messaging.expected_exceptions(exception.InvalidParameterValue,
                               exception.InvalidStateRequested,
                               exception.NodeInMaintenance,
                               exception.NodeLocked,
                               exception.NoFreeConductorWorker,
                               exception.ConcurrentActionLimit)
def do_node_service(self, context, node_id, service_steps,
                    disable_ramdisk=False):
    """RPC method to initiate node service.

    :param context: an admin context.
    :param node_id: the ID or UUID of a node.
    :param service_steps: an ordered list of steps that will be
        performed on the node. A step is a dictionary with required
        keys 'interface' and 'step', and optional key 'args'. If
        specified, the 'args' arguments are passed to the service step
        method.::

          { 'interface': <driver_interface>,
            'step': <name_of_step>,
            'args': {<arg1>: <value1>, ..., <argn>: <valuen>} }

        For example (this isn't a real example, this service step
        doesn't exist)::

          { 'interface': 'deploy',
            'step': 'upgrade_firmware',
            'args': {'force': True} }
    :param disable_ramdisk: Optional. Whether to disable the ramdisk boot.
    :raises: InvalidParameterValue if power or network validation fails.
    :raises: InvalidStateRequested if the 'service' event is not valid
        from the node's current provision state.
    :raises: NodeInMaintenance if the node is in maintenance mode.
    :raises: NodeLocked if node is locked by another conductor.
    :raises: NoFreeConductorWorker when there is no free worker to start
        async task.
    :raises: ConcurrentActionLimit If this action would exceed the
        configured limits of the deployment.
    """
    self._concurrent_action_limit(action='service')

    with task_manager.acquire(context, node_id, shared=False,
                              purpose='node service') as task:
        node = task.node
        if node.maintenance:
            raise exception.NodeInMaintenance(op=_('service'),
                                              node=node.uuid)

        # NOTE(TheJulia): servicing.do_node_service() will also make
        # similar calls to validate power & network, but we are doing it
        # again here so that the user gets immediate feedback of any
        # issues. This behaviour (of validating) is consistent with other
        # methods like self.do_node_deploy().
        try:
            task.driver.power.validate(task)
            task.driver.network.validate(task)
        except exception.InvalidParameterValue as exc:
            error = (_('Validation of node %(node)s for servicing '
                       'failed: %(msg)s') %
                     {'node': node.uuid, 'msg': exc})
            raise exception.InvalidParameterValue(error)

        # Arguments handed to the spawned worker once the state machine
        # accepts the 'service' event.
        worker_args = (servicing.do_node_service, task, service_steps,
                       disable_ramdisk)
        try:
            task.process_event(
                'service',
                callback=self._spawn_worker,
                call_args=worker_args,
                err_handler=utils.provisioning_error_handler,
                target_state=states.ACTIVE)
        except exception.InvalidState:
            raise exception.InvalidStateRequested(
                action='service', node=node.uuid,
                state=node.provision_state)
# NOTE(TheJulia): This is the end of the class definition for the
# conductor manager. Methods for RPC and stuffs should go above this
# point in the File. Everything below is a helper or periodic.
@METRICS.timer('get_vendor_passthru_metadata') @METRICS.timer('get_vendor_passthru_metadata')
def get_vendor_passthru_metadata(route_dict): def get_vendor_passthru_metadata(route_dict):

View File

@ -153,12 +153,13 @@ class ConductorAPI(object):
heartbeat heartbeat
| 1.55 - Added change_node_boot_mode | 1.55 - Added change_node_boot_mode
| 1.56 - Added continue_inspection | 1.56 - Added continue_inspection
| 1.57 - Added do_node_service
""" """
# NOTE(rloo): This must be in sync with manager.ConductorManager's. # NOTE(rloo): This must be in sync with manager.ConductorManager's.
# NOTE(pas-ha): This also must be in sync with # NOTE(pas-ha): This also must be in sync with
# ironic.common.release_mappings.RELEASE_MAPPING['master'] # ironic.common.release_mappings.RELEASE_MAPPING['master']
RPC_API_VERSION = '1.56' RPC_API_VERSION = '1.57'
def __init__(self, topic=None): def __init__(self, topic=None):
super(ConductorAPI, self).__init__() super(ConductorAPI, self).__init__()
@ -1409,3 +1410,27 @@ class ConductorAPI(object):
cctxt = self._prepare_call(topic=topic, version='1.56') cctxt = self._prepare_call(topic=topic, version='1.56')
return cctxt.call(context, 'continue_inspection', node_id=node_id, return cctxt.call(context, 'continue_inspection', node_id=node_id,
inventory=inventory, plugin_data=plugin_data) inventory=inventory, plugin_data=plugin_data)
def do_node_service(self, context, node_id, service_steps,
                    disable_ramdisk=None, topic=None):
    """Signal to conductor service to perform servicing on a node.

    :param context: request context.
    :param node_id: node ID or UUID.
    :param service_steps: a list of service step dictionaries.
    :param disable_ramdisk: Whether to skip booting ramdisk for service.
    :param topic: RPC topic. Defaults to self.topic.
    :raises: InvalidParameterValue if validation of power driver interface
        failed.
    :raises: InvalidStateRequested if servicing can not be performed.
    :raises: NodeInMaintenance if node is in maintenance mode.
    :raises: NodeLocked if node is locked by another conductor.
    :raises: NoFreeConductorWorker when there is no free worker to start
        async task.
    """
    rpc_kwargs = {
        'node_id': node_id,
        'service_steps': service_steps,
        'disable_ramdisk': disable_ramdisk,
    }
    # do_node_service was added in RPC API 1.57, so pin the call to it.
    call_context = self._prepare_call(topic=topic, version='1.57')
    return call_context.call(context, 'do_node_service', **rpc_kwargs)

View File

@ -0,0 +1,377 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Functionality related to servicing."""
from oslo_log import log
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import states
from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager
from ironic.conductor import utils
from ironic.conf import CONF
from ironic.drivers import utils as driver_utils
from ironic import objects
LOG = log.getLogger(__name__)
@task_manager.require_exclusive_lock
def do_node_service(task, service_steps=None, disable_ramdisk=False):
    """Internal RPC method to perform servicing of a node.

    Validates the node, records the requested steps in the node's
    driver_internal_info, prepares the node for service (unless the
    ramdisk is disabled) and then starts executing the service steps.
    Failures along the way are routed to ``utils.servicing_error_handler``.

    :param task: a TaskManager instance with an exclusive lock on its node
    :param service_steps: The list of service steps to perform. If none, step
                          validation will fail.
    :param disable_ramdisk: Whether to skip booting ramdisk for servicing.
    """
    node = task.node
    try:
        # NOTE(ghe): Valid power and network values are needed to perform
        # a service operation.
        task.driver.power.validate(task)
        if not disable_ramdisk:
            task.driver.network.validate(task)
    except (exception.InvalidParameterValue, exception.NetworkError) as e:
        msg = (_('Validation of node %(node)s for service failed: %(msg)s') %
               {'node': node.uuid, 'msg': e})
        return utils.servicing_error_handler(task, msg)

    utils.wipe_service_internal_info(task)
    node.set_driver_internal_info('service_steps', service_steps)
    node.set_driver_internal_info('service_disable_ramdisk',
                                  disable_ramdisk)
    task.node.save()

    # Allow the deploy driver to set up the ramdisk again (necessary for IPA)
    try:
        if not disable_ramdisk:
            prepare_result = task.driver.deploy.prepare_service(task)
        else:
            LOG.info('Skipping preparing for service in-band service since '
                     'out-of-band only service has been requested for node '
                     '%s', node.uuid)
            prepare_result = None
    except Exception as e:
        msg = (_('Failed to prepare node %(node)s for service: %(e)s')
               % {'node': node.uuid, 'e': e})
        return utils.servicing_error_handler(task, msg, traceback=True)

    if prepare_result == states.SERVICEWAIT:
        # Prepare is asynchronous, the deploy driver will need to
        # set node.driver_internal_info['service_steps'] and
        # node.service_step and then make an RPC call to
        # continue_node_service to start service operations.
        task.process_event('wait')
        return

    try:
        conductor_steps.set_node_service_steps(
            task, disable_ramdisk=disable_ramdisk)
    except Exception as e:
        # Catch all exceptions and follow the error handling
        # path so things are cleaned up properly.
        msg = (_('Cannot service node %(node)s: %(msg)s')
               % {'node': node.uuid, 'msg': e})
        return utils.servicing_error_handler(task, msg)

    steps = node.driver_internal_info.get('service_steps', [])
    if not steps:
        # Nothing to execute: tear down exactly once and stop here.
        # Falling through to do_next_service_step() with no steps would
        # run the tear down a second time and fire the 'done' event again.
        _tear_down_node_service(task, disable_ramdisk=disable_ramdisk)
        return

    # Steps are present, so execution starts from the first one.
    do_next_service_step(task, 0, disable_ramdisk=disable_ramdisk)
@utils.fail_on_error(utils.servicing_error_handler,
                     _("Unexpected error when processing next service step"),
                     traceback=True)
@task_manager.require_exclusive_lock
def do_next_service_step(task, step_index, disable_ramdisk=None):
    """Do service, starting from the specified service step.

    Steps are executed in order until one of them reports that it is
    running asynchronously (``states.SERVICEWAIT``), one of them fails, a
    reserved step handler asks to exit, or no steps remain. When no steps
    remain the node is torn down from service.

    :param task: a TaskManager instance with an exclusive lock
    :param step_index: The first service step in the list to execute. This
        is the index (from 0) into the list of service steps in the node's
        driver_internal_info['service_steps']. Is None if there are no steps
        to execute.
    :param disable_ramdisk: Whether to skip booting ramdisk for service.
        When None, the value stored in the node's
        driver_internal_info['service_disable_ramdisk'] is used (False if
        it is not set).
    """
    node = task.node
    # A step_index of None means there is nothing left to execute.
    if step_index is None:
        steps = []
    else:
        assert node.driver_internal_info.get('service_steps') is not None, \
            f"BUG: No steps for {node.uuid}, step index is {step_index}"
        steps = node.driver_internal_info['service_steps'][step_index:]

    if disable_ramdisk is None:
        disable_ramdisk = node.driver_internal_info.get(
            'service_disable_ramdisk', False)

    LOG.info('Executing service on node %(node)s, remaining steps: '
             '%(steps)s', {'node': node.uuid, 'steps': steps})

    # Execute each step until we hit an async step or run out of steps
    for ind, step in enumerate(steps):
        # Save which step we're about to start so we can restart
        # if necessary
        node.service_step = step
        node.set_driver_internal_info('service_step_index', step_index + ind)
        node.save()
        eocn = step.get('execute_on_child_nodes', False)
        result = None
        try:
            if not eocn:
                LOG.info('Executing %(step)s on node %(node)s',
                         {'step': step, 'node': node.uuid})
                use_step_handler = conductor_steps.use_reserved_step_handler(
                    task, step)
                if use_step_handler:
                    if use_step_handler == conductor_steps.EXIT_STEPS:
                        # Exit the step, i.e. hold step
                        return
                    # if use_step_handler == conductor_steps.USED_HANDLER
                    # Then we have completed the needful in the handler,
                    # but since there is no other value to check now,
                    # we know we just need to skip execute_service_step
                else:
                    interface = getattr(task.driver, step.get('interface'))
                    result = interface.execute_service_step(task, step)
            else:
                LOG.info('Executing %(step)s on child nodes for node '
                         '%(node)s.',
                         {'step': step, 'node': node.uuid})
                result = execute_step_on_child_nodes(task, step)

        except Exception as e:
            # The agent being unreachable after a service reboot is not
            # treated as a failure: move to the wait state so the step
            # can be attempted again later.
            if isinstance(e, exception.AgentConnectionFailed):
                if task.node.driver_internal_info.get('service_reboot'):
                    LOG.info('Agent is not yet running on node %(node)s '
                             'after service reboot, waiting for agent to '
                             'come up to run next service step %(step)s.',
                             {'node': node.uuid, 'step': step})
                    node.set_driver_internal_info('skip_current_service_step',
                                                  False)
                    task.process_event('wait')
                    return
            # Likewise, an agent that is busy with another command only
            # causes a wait, not a failure.
            if isinstance(e, exception.AgentInProgress):
                LOG.info('Conductor attempted to process service step for '
                         'node %(node)s. Agent indicated it is presently '
                         'executing a command. Error: %(error)s',
                         {'node': task.node.uuid,
                          'error': e})
                node.set_driver_internal_info(
                    'skip_current_service_step', False)
                task.process_event('wait')
                return

            # Any other exception fails the service operation.
            msg = (_('Node %(node)s failed step %(step)s: '
                     '%(exc)s') %
                   {'node': node.uuid, 'exc': e,
                    'step': node.service_step})
            if not disable_ramdisk:
                driver_utils.collect_ramdisk_logs(task.node, label='service')
            utils.servicing_error_handler(task, msg, traceback=True)
            return

        # Check if the step is done or not. The step should return
        # states.SERVICEWAIT if the step is still being executed, or
        # None if the step is done.
        if result == states.SERVICEWAIT:
            # Kill this worker, the async step will make an RPC call to
            # continue_node_service to continue service
            LOG.info('Service step %(step)s on node %(node)s being '
                     'executed asynchronously, waiting for driver.',
                     {'node': node.uuid, 'step': step})
            task.process_event('wait')
            return
        elif result is not None:
            msg = (_('While executing step %(step)s on node '
                     '%(node)s, step returned invalid value: %(val)s')
                   % {'step': step, 'node': node.uuid, 'val': result})
            return utils.servicing_error_handler(task, msg)
        LOG.info('Node %(node)s finished service step %(step)s',
                 {'node': node.uuid, 'step': step})

    # All steps have completed (or there were none): clean up and finish.
    utils.wipe_service_internal_info(task)
    if CONF.agent.deploy_logs_collect == 'always' and not disable_ramdisk:
        driver_utils.collect_ramdisk_logs(task.node, label='service')
    _tear_down_node_service(task, disable_ramdisk)
def _tear_down_node_service(task, disable_ramdisk):
    """Wrap up a finished service operation on a node.

    Clears the node's current service step, wipes the temporary servicing
    entries from ``driver_internal_info`` and saves the node. Unless the
    ramdisk was disabled, the deploy interface is then asked to tear down
    the service environment. If that fails, the failure is handed to
    ``utils.servicing_error_handler`` and the 'done' event is not processed.

    :param task: A TaskManager object.
    :param disable_ramdisk: If true, the deploy interface's
        ``tear_down_service`` is not called.
    :returns: None on success, otherwise the result of
        ``utils.servicing_error_handler``.
    """
    task.node.service_step = None
    utils.wipe_service_internal_info(task)
    task.node.save()

    ramdisk_used = not disable_ramdisk
    if ramdisk_used:
        try:
            task.driver.deploy.tear_down_service(task)
        except Exception as exc:
            # Report from inside the except block: the handler is asked to
            # log a traceback for the exception being handled.
            details = {'node': task.node.uuid, 'err': exc}
            failure = (_('Failed to tear down from service for node %(node)s, '
                         'reason: %(err)s')
                       % details)
            return utils.servicing_error_handler(
                task, failure, traceback=True, tear_down_service=False)

    LOG.info('Node %s service complete.', task.node.uuid)
    task.process_event('done')
def execute_step_on_child_nodes(task, step):
    """Run a requested step on the child nodes of the task's node.

    Child nodes are looked up by ``parent_node``. If the step sets
    ``execute_on_child_nodes`` and carries a non-empty
    ``limit_child_node_execution`` list, only the listed nodes are used.
    Each child node is locked in turn and the step is executed on it, unless
    the reserved step handler takes care of the step.

    :param task: The TaskManager object for the parent node.
    :param step: The requested step to be executed.
    :returns: None on success, the resulting error message if a
              failure has occurred.
    """
    # NOTE(TheJulia): We could just use nodeinfo list calls against
    # dbapi.
    # NOTE(TheJulia): We validate the data in advance in the API
    # with the original request context.
    run_on_children = step.get('execute_on_child_nodes')
    limit_to = step.get('limit_child_node_execution', [])
    node_filters = {'parent_node': task.node.uuid}
    if run_on_children and len(limit_to) > 0:
        node_filters['uuid_in'] = limit_to

    children = objects.Node.list(task.context,
                                 filters=node_filters,
                                 fields=['uuid'])
    for child in children:
        LOG.info('Executing step %(step)s on child node %(node)s for parent '
                 'node %(parent_node)s',
                 {'step': step,
                  'node': child.uuid,
                  'parent_node': task.node.uuid})
        with task_manager.acquire(task.context,
                                  child.uuid,
                                  purpose='execute step') as child_task:
            interface = getattr(child_task.driver, step.get('interface'))
            LOG.info('Executing %(step)s on node %(node)s',
                     {'step': step, 'node': child_task.node.uuid})
            if conductor_steps.use_reserved_step_handler(child_task, step):
                outcome = None
            else:
                outcome = interface.execute_service_step(child_task, step)

            if outcome is None:
                # Step finished synchronously; move on to the next child.
                continue

            if (outcome == states.SERVICEWAIT
                    and CONF.conductor.permit_child_node_step_async_result):
                # Operator has chosen to permit this due to some reason
                # NOTE(TheJulia): This is where we would likely wire agent
                # error handling if we ever implicitly allowed child node
                # deploys to take place with the agent from a parent node
                # being deployed.
                continue

            msg = (_('While executing step %(step)s on child node '
                     '%(node)s, step returned invalid value: %(val)s')
                   % {'step': step, 'node': child_task.node.uuid,
                      'val': outcome})
            LOG.error(msg)
            # Only None or states.SERVICEWAIT are possible paths forward
            # in the parent step execution code, so returning the message
            # means it will be logged.
            return msg
def get_last_error(node):
    """Build the ``last_error`` text for an aborted service operation.

    :param node: A node object; its ``service_step`` is consulted.
    :returns: The abort message, extended with the ID of the current
        service step when the node has one.
    """
    message = _('By request, the service operation was aborted')
    current_step = node.service_step
    if not current_step:
        return message

    suffix = (_(' during or after the completion of step "%s"')
              % conductor_steps.step_id(current_step))
    return message + suffix
@task_manager.require_exclusive_lock
def do_node_service_abort(task):
    """Internal method to abort an ongoing service operation.

    Asks the deploy interface to tear down the service environment. If that
    fails, the failure is passed to ``utils.servicing_error_handler``
    (without a second tear down attempt and without setting the fail state)
    and nothing else is changed. Otherwise the node's ``last_error`` is set,
    its service step and temporary servicing info are cleared, and the node
    is saved.

    :param task: a TaskManager instance with an exclusive lock
    """
    node = task.node
    try:
        task.driver.deploy.tear_down_service(task)
    except Exception as e:
        log_msg = (_('Failed to tear down service for node %(node)s '
                     'after aborting the operation. Error: %(err)s') %
                   {'node': node.uuid, 'err': e})
        error_msg = _('Failed to tear down service after aborting '
                      'the operation')
        utils.servicing_error_handler(task, log_msg,
                                      errmsg=error_msg,
                                      traceback=True,
                                      tear_down_service=False,
                                      set_fail_state=False)
        return

    # Both messages are built before the service step is cleared below.
    last_error = get_last_error(node)
    # This is a service operation, so do not report it as a clean operation.
    info_message = _('Service operation aborted for node %s') % node.uuid
    if node.service_step:
        info_message += (
            _(' during or after the completion of step "%s"')
            % node.service_step
        )

    node.last_error = last_error
    node.service_step = None
    utils.wipe_service_internal_info(task)
    node.save()
    LOG.info(info_message)
@utils.fail_on_error(utils.servicing_error_handler,
                     _("Unexpected error when processing next service step"),
                     traceback=True)
@task_manager.require_exclusive_lock
def continue_node_service(task):
    """Continue servicing after finishing an async service step.

    This function calculates which step has to run next and passes control
    into do_next_service_step. If the step that just finished is flagged
    ``abort_after`` and further steps remain, the operation is aborted
    instead.

    :param task: a TaskManager instance with an exclusive lock
    """
    node = task.node

    next_step_index = utils.update_next_step_index(task, 'service')

    # If this isn't the final service step in the service operation
    # and it is flagged to abort after the service step that just
    # finished, we abort the operation.
    if node.service_step.get('abort_after'):
        step_name = node.service_step['step']
        if next_step_index is not None:
            LOG.debug('The service operation for node %(node)s was '
                      'marked to be aborted after step "%(step)s" '
                      'completed. Aborting now that it has completed.',
                      {'node': task.node.uuid, 'step': step_name})

            task.process_event('fail')
            do_node_service_abort(task)
            return

        LOG.debug('The service operation for node %(node)s was '
                  'marked to be aborted after step "%(step)s" '
                  'completed. However, since there are no more '
                  'service steps after this, the abort is not going '
                  'to be done.', {'node': node.uuid,
                                  'step': step_name})

    do_next_service_step(task, next_step_index)

View File

@ -54,6 +54,8 @@ DEPLOYING_INTERFACE_PRIORITY = {
'raid': 1, 'raid': 1,
} }
SERVICING_INTERFACE_PRIORITY = DEPLOYING_INTERFACE_PRIORITY.copy()
VERIFYING_INTERFACE_PRIORITY = { VERIFYING_INTERFACE_PRIORITY = {
# When two verify steps have the same priority, their order is determined # When two verify steps have the same priority, their order is determined
# by which interface is implementing the verify step. The verifying step of # by which interface is implementing the verify step. The verifying step of
@ -127,6 +129,15 @@ def _sorted_steps(steps, sort_step_key):
return sorted(steps, key=sort_step_key, reverse=True) return sorted(steps, key=sort_step_key, reverse=True)
def _service_step_key(step):
    """Build the sort key for a service step.

    Steps are ordered by their priority first; the priority of the step's
    interface breaks ties.

    :param step: service step dict to get priority for.
    :returns: A ``(step priority, interface priority)`` tuple.
    """
    step_priority = step.get('priority')
    interface_priority = SERVICING_INTERFACE_PRIORITY[step.get('interface')]
    return (step_priority, interface_priority)
def is_equivalent(step1, step2): def is_equivalent(step1, step2):
"""Compare steps, ignoring their priority.""" """Compare steps, ignoring their priority."""
return (step1.get('interface') == step2.get('interface') return (step1.get('interface') == step2.get('interface')
@ -240,6 +251,26 @@ def _get_deployment_steps(task, enabled=False, sort=True):
enabled=enabled, sort_step_key=sort_key) enabled=enabled, sort_step_key=sort_key)
def _get_service_steps(task, enabled=False, sort=True):
    """Get service steps for task.node.

    Thin wrapper around ``_get_steps`` for the ``get_service_steps``
    interface method.

    :param task: A TaskManager object
    :param enabled: If True, returns only enabled (priority > 0) steps. If
        False, returns all service steps.
    :param sort: If True, the steps are sorted from highest priority to lowest
        priority. For steps having the same priority, they are sorted from
        highest interface priority to lowest.
    :raises: Whatever ``_get_steps`` raises when the steps cannot be
        obtained (NOTE(review): exact exception type not visible here).
    :returns: A list of service step dictionaries
    """
    if sort:
        key_func = _service_step_key
    else:
        key_func = None
    return _get_steps(task, SERVICING_INTERFACE_PRIORITY,
                      'get_service_steps', enabled=enabled,
                      sort_step_key=key_func)
def _get_verify_steps(task, enabled=False, sort=True): def _get_verify_steps(task, enabled=False, sort=True):
"""Get verify steps for task.node. """Get verify steps for task.node.
@ -455,6 +486,34 @@ def set_node_deployment_steps(task, reset_current=True, skip_missing=False):
node.save() node.save()
def set_node_service_steps(task, disable_ramdisk=False):
    """Set up the node with service step information for servicing.

    The requested steps are read from
    ``driver_internal_info['service_steps']``, validated against the
    driver's service steps and written back. The node's current
    ``service_step`` is reset to an empty dict and the step index to None,
    then the node is saved.

    :param task: A TaskManager object
    :param disable_ramdisk: If `True`, only steps with requires_ramdisk=False
        are accepted.
    :raises: InvalidParameterValue if there is a problem with the user's
        service steps.
    :raises: Whatever fetching the driver's service steps raises on failure
        (NOTE(review): exact exception type not visible here).
    """
    node = task.node
    requested_steps = node.driver_internal_info.get('service_steps', [])
    validated_steps = _validate_user_service_steps(
        task, requested_steps, disable_ramdisk=disable_ramdisk)

    LOG.debug('List of the steps for service of node %(node)s: '
              '%(steps)s', {'node': node.uuid,
                            'steps': validated_steps})

    # Start from a clean slate: no current step and no step index yet.
    node.service_step = {}
    node.set_driver_internal_info('service_steps', validated_steps)
    node.set_driver_internal_info('service_step_index', None)
    node.save()
def step_id(step): def step_id(step):
"""Return the 'ID' of a deploy step. """Return the 'ID' of a deploy step.
@ -705,7 +764,6 @@ def _validate_user_steps(task, user_steps, driver_steps, step_type,
err = error_prefix or '' err = error_prefix or ''
err += '; '.join(errors) err += '; '.join(errors)
raise exception.InvalidParameterValue(err=err) raise exception.InvalidParameterValue(err=err)
return result return result
@ -769,6 +827,36 @@ def _validate_user_deploy_steps(task, user_steps, error_prefix=None,
skip_missing=skip_missing) skip_missing=skip_missing)
def _validate_user_service_steps(task, user_steps, disable_ramdisk=False):
    """Validate the user-specified service steps.

    :param task: A TaskManager object
    :param user_steps: a list of service steps. A service step is a
        dictionary with required keys 'interface' and 'step', and optional
        key 'args'::

              { 'interface': <driver_interface>,
                'step': <name_of_service_step>,
                'args': {<arg1>: <value1>, ..., <argn>: <valuen>} }

            For example::

              { 'interface': 'deploy',
                'step': 'upgrade_firmware',
                'args': {'force': True} }
    :param disable_ramdisk: If `True`, only steps with requires_ramdisk=False
        are accepted.
    :raises: InvalidParameterValue if validation of service steps fails.
    :raises: Whatever fetching the driver's service steps raises on failure
        (NOTE(review): exact exception type not visible here).
    :return: validated service steps updated with information from the driver
    """
    # We call with enabled = False below so we pickup auto-disabled
    # steps, since service steps are not automagic like cleaning can be.
    available_steps = _get_service_steps(task, enabled=False, sort=False)

    validated = _validate_user_steps(task, user_steps, available_steps,
                                     'service',
                                     disable_ramdisk=disable_ramdisk)
    return validated
def _get_validated_user_deploy_steps(task, deploy_steps=None, def _get_validated_user_deploy_steps(task, deploy_steps=None,
skip_missing=False): skip_missing=False):
"""Validate the deploy steps for a node. """Validate the deploy steps for a node.

View File

@ -574,6 +574,20 @@ def wipe_cleaning_internal_info(task):
node.del_driver_internal_info('steps_validated') node.del_driver_internal_info('steps_validated')
def wipe_service_internal_info(task):
    """Remove temporary servicing fields from driver_internal_info.

    The agent token and URL are wiped first, ``service_steps`` is reset to
    None and the remaining servicing bookkeeping keys are deleted.

    :param task: a TaskManager instance; ``task.node`` is modified but not
        saved here.
    """
    wipe_token_and_url(task)
    node = task.node
    node.set_driver_internal_info('service_steps', None)
    # The reboot/polling flags appear under both a "service_" and a
    # "servicing_" prefix elsewhere in the servicing code, so clear both
    # spellings to avoid leaving a stale flag behind.
    stale_keys = (
        'agent_cached_service_steps',
        'service_step_index',
        'service_reboot',
        'servicing_reboot',
        'service_polling',
        'servicing_polling',
        'service_disable_ramdisk',
        'skip_current_service_step',
        'steps_validated',
    )
    for key in stale_keys:
        node.del_driver_internal_info(key)
def deploying_error_handler(task, logmsg, errmsg=None, traceback=False, def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
clean_up=True): clean_up=True):
"""Put a failed node in DEPLOYFAIL. """Put a failed node in DEPLOYFAIL.
@ -1209,7 +1223,7 @@ def _get_node_next_steps(task, step_type, skip_current_step=True):
:returns: index of the next step; None if there are none to execute. :returns: index of the next step; None if there are none to execute.
""" """
valid_types = set(['clean', 'deploy']) valid_types = set(['clean', 'deploy', 'service'])
if step_type not in valid_types: if step_type not in valid_types:
# NOTE(rloo): No need to i18n this, since this would be a # NOTE(rloo): No need to i18n this, since this would be a
# developer error; it isn't user-facing. # developer error; it isn't user-facing.
@ -1745,3 +1759,72 @@ def get_token_project_from_request(ctx):
except AttributeError: except AttributeError:
LOG.warning('Attempted to identify requestor project ID value, ' LOG.warning('Attempted to identify requestor project ID value, '
'however we were unable to do so. Possible older API?') 'however we were unable to do so. Possible older API?')
def servicing_error_handler(task, logmsg, errmsg=None, traceback=False,
                            tear_down_service=True, set_fail_state=True,
                            set_maintenance=None):
    """Put a failed node in SERVICEFAIL and maintenance (if needed).

    :param task: a TaskManager instance.
    :param logmsg: Message to be logged.
    :param errmsg: Message for the user. Optional, if not provided `logmsg` is
        used.
    :param traceback: Whether to log a traceback. Defaults to False.
    :param tear_down_service: Whether to ask the deploy interface to tear
        down the service environment. Default to True.
    :param set_fail_state: Whether to set node to failed state. Default to
        True.
    :param set_maintenance: Whether to set maintenance mode. If None,
        maintenance mode will be set if and only if a service step is being
        executed on a node.
    """
    if set_maintenance is None:
        set_maintenance = bool(task.node.service_step)

    errmsg = errmsg or logmsg
    LOG.error(logmsg, exc_info=traceback)
    node = task.node
    if set_maintenance:
        node.fault = faults.SERVICE_FAILURE
        node.maintenance = True

    if tear_down_service:
        try:
            task.driver.deploy.tear_down_service(task)
        except Exception as e:
            msg2 = ('Failed to tear down servicing on node %(uuid)s, '
                    'reason: %(err)s' % {'err': e, 'uuid': node.uuid})
            LOG.exception(msg2)
            errmsg = _('%s. Also failed to tear down servicing.') % errmsg

    if node.provision_state in (
            states.SERVICING,
            states.SERVICEWAIT,
            states.SERVICEFAIL):
        # Clear service step, msg should already include current step
        node.service_step = {}
        # Clear any leftover metadata about servicing. The reboot/polling
        # flags appear under both a "service_" and a "servicing_" prefix
        # elsewhere in the servicing code, so both spellings are removed.
        node.del_driver_internal_info('service_step_index')
        node.del_driver_internal_info('servicing_reboot')
        node.del_driver_internal_info('service_reboot')
        node.del_driver_internal_info('servicing_polling')
        node.del_driver_internal_info('service_polling')
        node.del_driver_internal_info('skip_current_service_step')
        # We don't need to keep the old agent URL, or token
        # as it should change upon the next servicing attempt.
        wipe_token_and_url(task)

    node_history_record(node, event=errmsg, event_type=states.SERVICING,
                        error=True)
    # NOTE(dtantsur): avoid overwriting existing maintenance_reason
    if not node.maintenance_reason and set_maintenance:
        node.maintenance_reason = errmsg

    if CONF.conductor.poweroff_in_servicefail:
        # NOTE(NobodyCam): Power off node in service fail
        node_power_action(task, states.POWER_OFF)

    node.save()

    # Only fire the 'fail' event when the node is not already in SERVICEFAIL.
    if set_fail_state and node.provision_state != states.SERVICEFAIL:
        task.process_event('fail')

View File

@ -356,6 +356,13 @@ opts = [
'when using Cleaning to perform ' 'when using Cleaning to perform '
'hardware-transformative actions such as ' 'hardware-transformative actions such as '
'firmware upgrade.')), 'firmware upgrade.')),
cfg.BoolOpt('poweroff_in_servicefail',
default=False,
help=_('If True power off nodes in the ``service failed`` '
'state. Default False. Option may be unsafe '
'when using service to perform '
'hardware-transformative actions such as '
'firmware upgrade.')),
cfg.BoolOpt('permit_child_node_step_async_result', cfg.BoolOpt('permit_child_node_step_async_result',
default=False, default=False,
mutable=True, mutable=True,

View File

@ -128,7 +128,24 @@ opts = [
'different CLID/IAID. Due to non-identical identifiers ' 'different CLID/IAID. Due to non-identical identifiers '
'multiple addresses must be reserved for the host to ' 'multiple addresses must be reserved for the host to '
'ensure each step of the boot process can successfully ' 'ensure each step of the boot process can successfully '
'lease addresses.')) 'lease addresses.')),
cfg.StrOpt('servicing_network',
mutable=True,
help=_('Neutron network UUID or name for booting the ramdisk '
'for service mode. Required for "neutron" '
'network interface, if service mode will be used. It '
'is not used for the "flat" or "noop" network '
'interfaces. If a name is provided, it must be unique '
'among all networks or service will fail.')),
cfg.ListOpt('servicing_network_security_groups',
default=[],
mutable=True,
help=_('List of Neutron Security Group UUIDs to be applied '
'during the node service process. Optional for the '
'"neutron" network interface and not used for the '
'"flat" or "noop" network interfaces. If not '
'specified, the default security group is used.')),
] ]

View File

@ -0,0 +1,35 @@
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Add service_steps
Revision ID: aa2384fee727
Revises: d163df1bab88
Create Date: 2023-05-25 11:50:05.285602
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = 'aa2384fee727'
down_revision = 'd163df1bab88'
def upgrade():
    """Add the nullable ``service_step`` text column to the nodes table."""
    service_step_column = sa.Column('service_step', sa.Text(), nullable=True)
    op.add_column('nodes', service_step_column)

View File

@ -217,6 +217,7 @@ class NodeBase(Base):
secure_boot = Column(Boolean, nullable=True) secure_boot = Column(Boolean, nullable=True)
shard = Column(String(255), nullable=True) shard = Column(String(255), nullable=True)
parent_node = Column(String(36), nullable=True) parent_node = Column(String(36), nullable=True)
service_step = Column(db_types.JsonEncodedDict)
class Node(NodeBase): class Node(NodeBase):

View File

@ -247,6 +247,7 @@ class BaseInterface(object, metaclass=abc.ABCMeta):
instance.clean_steps = [] instance.clean_steps = []
instance.deploy_steps = [] instance.deploy_steps = []
instance.verify_steps = [] instance.verify_steps = []
instance.service_steps = []
for n, method in inspect.getmembers(instance, inspect.ismethod): for n, method in inspect.getmembers(instance, inspect.ismethod):
if getattr(method, '_is_clean_step', False): if getattr(method, '_is_clean_step', False):
# Create a CleanStep to represent this method # Create a CleanStep to represent this method
@ -271,6 +272,15 @@ class BaseInterface(object, metaclass=abc.ABCMeta):
'priority': method._verify_step_priority, 'priority': method._verify_step_priority,
'interface': instance.interface_type} 'interface': instance.interface_type}
instance.verify_steps.append(step) instance.verify_steps.append(step)
if getattr(method, '_is_service_step', False):
step = {'step': method.__name__,
'priority': method._service_step_priority,
'abortable': method._service_step_abortable,
'argsinfo': method._service_step_argsinfo,
'interface': instance.interface_type,
'requires_ramdisk':
method._service_step_requires_ramdisk}
instance.service_steps.append(step)
if instance.clean_steps: if instance.clean_steps:
LOG.debug('Found clean steps %(steps)s for interface ' LOG.debug('Found clean steps %(steps)s for interface '
@ -287,6 +297,11 @@ class BaseInterface(object, metaclass=abc.ABCMeta):
'%(interface)s', '%(interface)s',
{'steps': instance.deploy_steps, {'steps': instance.deploy_steps,
'interface': instance.interface_type}) 'interface': instance.interface_type})
if instance.service_steps:
LOG.debug('Found service steps %(steps)s for interface '
'%(interface)s',
{'steps': instance.service_steps,
'interface': instance.interface_type})
return instance return instance
@ -411,6 +426,35 @@ class BaseInterface(object, metaclass=abc.ABCMeta):
""" """
return self._execute_step(task, step) return self._execute_step(task, step)
def get_service_steps(self, task):
    """Get a list of service steps for the interface.

    Returns the interface's ``service_steps`` attribute as is: all service
    steps (both enabled and disabled), in no particular order.

    :param task: A TaskManager object, useful for interfaces overriding
        this function
    :raises NodeServiceFailure: if an overriding implementation has a
        problem getting the steps from the driver (NOTE(review): this base
        implementation does not raise).
    :returns: A list of service step dictionaries
    """
    steps = self.service_steps
    return steps
def execute_service_step(self, task, step):
    """Execute the service step on task.node.

    Delegates to ``self._execute_step`` and hands back its result.

    :param task: A TaskManager object
    :param step: The service step dictionary representing the step to
        execute
    :returns: None if this method has completed synchronously
    """
    outcome = self._execute_step(task, step)
    return outcome
class DeployInterface(BaseInterface): class DeployInterface(BaseInterface):
"""Interface for deploy-related actions.""" """Interface for deploy-related actions."""
@ -545,6 +589,38 @@ class DeployInterface(BaseInterface):
'the driver %(driver)s does not support heartbeating', 'the driver %(driver)s does not support heartbeating',
{'node': task.node.uuid, 'driver': task.node.driver}) {'node': task.node.uuid, 'driver': task.node.driver})
def tear_down_service(self, task):
    """Tear down after servicing is completed.

    Given that servicing is complete, do all cleanup and tear
    down necessary to allow the node to be returned to an active
    state. This base implementation does nothing.

    :param task: A TaskManager instance containing the node to act on.
    """
def prepare_service(self, task):
    """Prepare the node for servicing tasks.

    For example, nodes that use the Ironic Python Agent will need to
    boot the ramdisk in order to do in-band service tasks.

    If the function is asynchronous, the driver will need to handle
    setting node.driver_internal_info['service_steps'] and
    node.service_step, as the conductor cannot set them for an
    asynchronous preparation. Afterwards, the interface should make an
    RPC call to continue_node_service to start servicing.

    This base implementation does nothing.

    :param task: A TaskManager instance containing the node to act on.
    :returns: If this function is going to be asynchronous, should return
        `states.SERVICEWAIT`. Otherwise, should return `None`.
    """
class BootInterface(BaseInterface): class BootInterface(BaseInterface):
"""Interface for boot-related actions.""" """Interface for boot-related actions."""
@ -1710,6 +1786,28 @@ class NetworkInterface(BaseInterface):
""" """
return task.node.network_data or {} return task.node.network_data or {}
def add_servicing_network(self, task):
    """Add the servicing network to the node.

    This base implementation adds nothing and reports an empty mapping.

    :param task: A TaskManager instance.
    :returns: a dictionary in the form {port.uuid: neutron_port['id']}
    :raises: NetworkError
    :raises: InvalidParameterValue, if the network interface configuration
        is invalid.
    """
    port_map = {}
    return port_map
def remove_servicing_network(self, task):
    """Removes the servicing network from a node.

    This base implementation does nothing.

    :param task: A TaskManager instance.
    :raises: NetworkError
    :raises: InvalidParameterValue, if the network interface configuration
        is invalid.
    :raises: MissingParameterValue, if some parameters are missing.
    """
class StorageInterface(BaseInterface, metaclass=abc.ABCMeta): class StorageInterface(BaseInterface, metaclass=abc.ABCMeta):
"""Base class for storage interfaces.""" """Base class for storage interfaces."""
@ -2023,3 +2121,82 @@ def verify_step(priority):
return func return func
return decorator return decorator
def service_step(priority=None, abortable=False, argsinfo=None,
                 requires_ramdisk=True):
    """Decorator for service steps.

    Service steps may be used in performing service upon a node.

    For service, the steps will be executed in a similar fashion
    to cleaning, but the steps and order of execution must be
    explicitly specified by the user when invoking the servicing API.

    Decorated service steps must take a single positional argument, a
    TaskManager object, in addition to a keyword arguments variable (as
    described in argsinfo).

    Service steps can be either synchronous or asynchronous. If the step is
    synchronous, it should return `None` when finished, and the conductor
    will continue on to the next step. While the service step is executing,
    the node will be in `states.SERVICING` provision state. If the step is
    asynchronous, the step should return `states.SERVICEWAIT` to the
    conductor before it starts the asynchronous work. When the step is
    complete, the step should make an RPC call to `continue_node_service` to
    move to the next step in servicing. The node will be in
    `states.SERVICEWAIT` provision state during the asynchronous work.

    Examples::

        class MyInterface(base.BaseInterface):
            @base.service_step()
            def example_service(self, task):
                # do some service actions

            @base.service_step(priority=0, abortable=True, argsinfo=
                               {'size': {'description': 'size of widget (MB)',
                                         'required': True}})
            def advanced_service(self, task, **kwargs):
                # do some advanced magical service

    :param priority: an integer priority, defaults to None which maps to 0.
                     Any non-integer value is also recorded as 0.
                     Priorities are not considered by default, but exist
                     should this functionality be adopted later on to align
                     with the steps framework.
    :param abortable: Boolean value. Whether the service step is abortable
                      or not; defaults to False.
    :param argsinfo: a dictionary of keyword arguments where key is the name of
                     the argument and value is a dictionary as follows::

                         'description': <description>. Required. This should
                                        include possible values.
                         'required': Boolean. Optional; default is False.
                                     True if this argument is required. If
                                     so, it must be specified in the service
                                     request; false if it is optional.
    :param requires_ramdisk: Whether this step requires the ramdisk
        to be running. Should be set to False for purely out-of-band steps.
    :raises InvalidParameterValue: if ``abortable`` is not a Boolean, or if
        ``_validate_argsinfo`` rejects ``argsinfo``.
    """
    def decorator(func):
        func._is_service_step = True
        # Service steps are only invoked by operators in a model
        # like manual cleaning, so there is no need to explicitly
        # require it on the decorator.
        func._service_step_priority = (
            priority if isinstance(priority, int) else 0)

        if not isinstance(abortable, bool):
            raise exception.InvalidParameterValue(
                _('"abortable" must be a Boolean value instead of "%s"')
                % abortable)
        func._service_step_abortable = abortable

        _validate_argsinfo(argsinfo)
        func._service_step_argsinfo = argsinfo
        func._service_step_requires_ramdisk = requires_ramdisk
        return func

    return decorator

View File

@ -32,6 +32,7 @@ from ironic.common import states
from ironic.common import utils from ironic.common import utils
from ironic.conductor import cleaning from ironic.conductor import cleaning
from ironic.conductor import deployments from ironic.conductor import deployments
from ironic.conductor import servicing
from ironic.conductor import steps as conductor_steps from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager from ironic.conductor import task_manager
from ironic.conductor import utils as manager_utils from ironic.conductor import utils as manager_utils
@ -85,21 +86,23 @@ VENDOR_PROPERTIES = {
__HEARTBEAT_RECORD_ONLY = (states.ENROLL, states.MANAGEABLE, states.AVAILABLE, __HEARTBEAT_RECORD_ONLY = (states.ENROLL, states.MANAGEABLE, states.AVAILABLE,
states.CLEANING, states.DEPLOYING, states.RESCUING, states.CLEANING, states.DEPLOYING, states.RESCUING,
states.DEPLOYHOLD, states.CLEANHOLD) states.DEPLOYHOLD, states.CLEANHOLD,
states.SERVICING, states.SERVICEHOLD)
_HEARTBEAT_RECORD_ONLY = frozenset(__HEARTBEAT_RECORD_ONLY) _HEARTBEAT_RECORD_ONLY = frozenset(__HEARTBEAT_RECORD_ONLY)
_HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT, _HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT,
# These are allowed but don't cause any actions since # These are allowed but don't cause any actions since
# they're also in HEARTBEAT_RECORD_ONLY. # they're also in HEARTBEAT_RECORD_ONLY.
states.DEPLOYING, states.CLEANING, states.RESCUING, states.DEPLOYING, states.CLEANING, states.RESCUING,
states.DEPLOYHOLD, states.CLEANHOLD) states.DEPLOYHOLD, states.CLEANHOLD, states.SERVICING,
states.SERVICEWAIT, states.SERVICEHOLD)
HEARTBEAT_ALLOWED = frozenset(_HEARTBEAT_ALLOWED) HEARTBEAT_ALLOWED = frozenset(_HEARTBEAT_ALLOWED)
_FASTTRACK_HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT, _FASTTRACK_HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT,
states.RESCUEWAIT, states.ENROLL, states.RESCUEWAIT, states.ENROLL,
states.MANAGEABLE, states.AVAILABLE, states.MANAGEABLE, states.AVAILABLE,
states.DEPLOYING, states.CLEANHOLD, states.DEPLOYING, states.CLEANHOLD,
states.DEPLOYHOLD) states.DEPLOYHOLD, states.SERVICEHOLD)
FASTTRACK_HEARTBEAT_ALLOWED = frozenset(_FASTTRACK_HEARTBEAT_ALLOWED) FASTTRACK_HEARTBEAT_ALLOWED = frozenset(_FASTTRACK_HEARTBEAT_ALLOWED)
@ -164,11 +167,12 @@ def _get_post_step_hook(node, step_type):
"""Get post clean/deploy step hook for the currently executing step. """Get post clean/deploy step hook for the currently executing step.
:param node: a node object :param node: a node object
:param step_type: 'clean' or 'deploy' :param step_type: 'clean' or 'deploy' or 'service'
:returns: a method if there is a post clean step hook for this clean :returns: a method if there is a post clean step hook for this clean
step; None otherwise step; None otherwise
""" """
step_obj = node.clean_step if step_type == 'clean' else node.deploy_step
step_obj = getattr(node, "%s_step" % step_type)
interface = step_obj.get('interface') interface = step_obj.get('interface')
step = step_obj.get('step') step = step_obj.get('step')
try: try:
@ -178,17 +182,16 @@ def _get_post_step_hook(node, step_type):
def _post_step_reboot(task, step_type): def _post_step_reboot(task, step_type):
"""Reboots a node out of band after a clean/deploy step that requires it. """Reboots a node out of band after a step that requires it.
If an agent step has 'reboot_requested': True, reboots the node when If an agent step has 'reboot_requested': True, reboots the node when
the step is completed. Will put the node in CLEANFAIL/DEPLOYFAIL if the step is completed. Will put the node in CLEANFAIL/DEPLOYFAIL if
the node cannot be rebooted. the node cannot be rebooted.
:param task: a TaskManager instance :param task: a TaskManager instance
:param step_type: 'clean' or 'deploy' :param step_type: 'clean' or 'deploy' or 'service'
""" """
current_step = (task.node.clean_step if step_type == 'clean' current_step = getattr(task.node, '%s_step' % step_type)
else task.node.deploy_step)
try: try:
# NOTE(fellypefca): ensure that the baremetal node boots back into # NOTE(fellypefca): ensure that the baremetal node boots back into
# the ramdisk after reboot. # the ramdisk after reboot.
@ -205,16 +208,22 @@ def _post_step_reboot(task, step_type):
if step_type == 'clean': if step_type == 'clean':
manager_utils.cleaning_error_handler(task, msg, manager_utils.cleaning_error_handler(task, msg,
traceback=traceback) traceback=traceback)
else: elif step_type == 'deploy':
manager_utils.deploying_error_handler(task, msg, manager_utils.deploying_error_handler(task, msg,
traceback=traceback) traceback=traceback)
elif step_type == 'service':
manager_utils.servicing_error_handler(task, msg,
traceback=traceback)
return return
# Signify that we've rebooted # Signify that we've rebooted
if step_type == 'clean': if step_type == 'clean':
task.node.set_driver_internal_info('cleaning_reboot', True) task.node.set_driver_internal_info('cleaning_reboot', True)
else: elif step_type == 'deploy':
task.node.set_driver_internal_info('deployment_reboot', True) task.node.set_driver_internal_info('deployment_reboot', True)
elif step_type == 'service':
task.node.set_driver_internal_info('servicing_reboot', True)
if not task.node.driver_internal_info.get( if not task.node.driver_internal_info.get(
'agent_secret_token_pregenerated', False): 'agent_secret_token_pregenerated', False):
# Wipes out the existing recorded token because the machine will # Wipes out the existing recorded token because the machine will
@ -261,8 +270,7 @@ def _get_completed_command(task, commands, step_type):
last_result = last_command.get('command_result') or {} last_result = last_command.get('command_result') or {}
last_step = last_result.get('%s_step' % step_type) last_step = last_result.get('%s_step' % step_type)
current_step = (task.node.clean_step if step_type == 'clean' current_step = getattr(task.node, '%s_step' % step_type)
else task.node.deploy_step)
if last_command['command_status'] == 'RUNNING': if last_command['command_status'] == 'RUNNING':
LOG.debug('%(type)s step still running for node %(node)s: %(step)s', LOG.debug('%(type)s step still running for node %(node)s: %(step)s',
{'step': last_step, 'node': task.node.uuid, {'step': last_step, 'node': task.node.uuid,
@ -410,7 +418,10 @@ def _continue_steps(task, step_type):
cleaning.continue_node_clean(task) cleaning.continue_node_clean(task)
else: else:
task.process_event('resume') task.process_event('resume')
if step_type == 'deploy':
deployments.continue_node_deploy(task) deployments.continue_node_deploy(task)
else:
servicing.continue_node_service(task)
class HeartbeatMixin(object): class HeartbeatMixin(object):
@ -439,11 +450,18 @@ class HeartbeatMixin(object):
""" """
return self.refresh_steps(task, 'clean') return self.refresh_steps(task, 'clean')
def process_next_step(self, task, step_type): def refresh_service_steps(self, task):
"""Start the next clean/deploy step if the previous one is complete. """Refresh the node's cached service steps
:param task: a TaskManager instance :param task: a TaskManager instance
:param step_type: "clean" or "deploy" """
return self.refresh_steps(task, 'service')
def process_next_step(self, task, step_type):
"""Start the next step if the previous one is complete.
:param task: a TaskManager instance
:param step_type: "clean", "deploy", "service"
""" """
def continue_cleaning(self, task): def continue_cleaning(self, task):
@ -453,6 +471,13 @@ class HeartbeatMixin(object):
""" """
return self.process_next_step(task, 'clean') return self.process_next_step(task, 'clean')
def continue_servicing(self, task):
    """Start the next service step if the previous one is complete.

    Thin wrapper that delegates to ``process_next_step`` with the
    'service' step type.

    :param task: a TaskManager instance
    :returns: whatever ``process_next_step`` returns.
    """
    step_type = 'service'
    return self.process_next_step(task, step_type)
def heartbeat_allowed(self, node): def heartbeat_allowed(self, node):
if utils.fast_track_enabled(node): if utils.fast_track_enabled(node):
return node.provision_state in FASTTRACK_HEARTBEAT_ALLOWED return node.provision_state in FASTTRACK_HEARTBEAT_ALLOWED
@ -480,6 +505,12 @@ class HeartbeatMixin(object):
'maintenance mode', node.uuid) 'maintenance mode', node.uuid)
last_error = _('Rescue aborted as node is in maintenance mode') last_error = _('Rescue aborted as node is in maintenance mode')
manager_utils.rescuing_error_handler(task, last_error) manager_utils.rescuing_error_handler(task, last_error)
elif (node.provision_state in (states.SERVICING, states.SERVICEWAIT)
and not CONF.conductor.allow_provisioning_in_maintenance):
LOG.error('Aborting service for node %s, as it is in '
'maintenance mode', node.uuid)
last_error = _('Service aborted as node is in maintenance mode')
manager_utils.servicing_error_handler(task, last_error)
else: else:
LOG.warning('Heartbeat from node %(node)s in ' LOG.warning('Heartbeat from node %(node)s in '
'maintenance mode; not taking any action.', 'maintenance mode; not taking any action.',
@ -559,6 +590,37 @@ class HeartbeatMixin(object):
states.RESCUEWAIT): states.RESCUEWAIT):
manager_utils.rescuing_error_handler(task, last_error) manager_utils.rescuing_error_handler(task, last_error)
def _heartbeat_service_wait(self, task):
    """Handle an agent heartbeat for a node waiting in servicing.

    If the node has no current service step, the ramdisk has just
    booted: the task is resumed, the service steps are cached and
    set on the node, and servicing is started. Otherwise the next
    step is processed, unless the driver indicated (via the
    'service_polling' flag in driver_internal_info) that it polls
    for step completion itself.

    Any exception is routed to the servicing error handler, provided
    the node is still in a servicing state.

    :param task: a TaskManager instance
    """
    node = task.node
    msg = _('Failed checking if service is done')
    try:
        node.touch_provisioning()
        if not node.service_step:
            # NOTE: the message takes exactly one argument; a second
            # placeholder here would make the log call fail to format.
            LOG.debug('Node %s just booted to start service',
                      node.uuid)
            msg = _('Node failed to start the first service step')
            task.process_event('resume')
            # First, cache the service steps
            self.refresh_service_steps(task)
            # Then set/verify node service steps and start service
            conductor_steps.set_node_service_steps(task)
            servicing.continue_node_service(task)
        else:
            msg = _('Node failed to check service progress')
            # Check if the driver is polling for completion of a step,
            # via the 'service_polling' flag.
            polling = node.driver_internal_info.get(
                'service_polling', False)
            if not polling:
                self.continue_servicing(task)
    except Exception as e:
        # Heartbeat handling is a top-level boundary: convert any
        # failure into a servicing failure instead of propagating it.
        last_error = _('%(msg)s: %(exc)s') % {'msg': msg, 'exc': e}
        log_msg = ('Asynchronous exception for node %(node)s: %(err)s' %
                   {'node': task.node.uuid, 'err': last_error})
        if node.provision_state in (states.SERVICING, states.SERVICEWAIT):
            manager_utils.servicing_error_handler(task, log_msg,
                                                  errmsg=last_error)
@METRICS.timer('HeartbeatMixin.heartbeat') @METRICS.timer('HeartbeatMixin.heartbeat')
def heartbeat(self, task, callback_url, agent_version, def heartbeat(self, task, callback_url, agent_version,
agent_verify_ca=None, agent_status=None, agent_verify_ca=None, agent_status=None,
@ -616,13 +678,14 @@ class HeartbeatMixin(object):
if node.maintenance: if node.maintenance:
return self._heartbeat_in_maintenance(task) return self._heartbeat_in_maintenance(task)
if node.provision_state == states.DEPLOYWAIT: if node.provision_state == states.DEPLOYWAIT:
self._heartbeat_deploy_wait(task) self._heartbeat_deploy_wait(task)
elif node.provision_state == states.CLEANWAIT: elif node.provision_state == states.CLEANWAIT:
self._heartbeat_clean_wait(task) self._heartbeat_clean_wait(task)
elif node.provision_state == states.RESCUEWAIT: elif node.provision_state == states.RESCUEWAIT:
self._heartbeat_rescue_wait(task) self._heartbeat_rescue_wait(task)
elif node.provision_state == states.SERVICEWAIT:
self._heartbeat_service_wait(task)
def _finalize_rescue(self, task): def _finalize_rescue(self, task):
"""Call ramdisk to prepare rescue mode and verify result. """Call ramdisk to prepare rescue mode and verify result.
@ -744,6 +807,35 @@ class AgentBaseMixin(object):
deploy_utils.tear_down_inband_cleaning( deploy_utils.tear_down_inband_cleaning(
task, manage_boot=self.should_manage_boot(task)) task, manage_boot=self.should_manage_boot(task))
@METRICS.timer('AgentBaseMixin.prepare_service')
def prepare_service(self, task):
    """Boot into the agent to prepare for servicing.

    :param task: a TaskManager object containing the node
    :raises: any exception raised by
        deploy_utils.prepare_inband_service (network, power and boot
        interface errors).
    :returns: the result of deploy_utils.prepare_inband_service,
        normally states.SERVICEWAIT to signify an asynchronous
        prepare. A None result is treated as fast-track, in which
        case the cached service steps are refreshed immediately.
    """
    result = deploy_utils.prepare_inband_service(
        task, manage_boot=self.should_manage_boot(task))
    if result is None:
        # Fast-track, ensure the steps are available.
        self.refresh_steps(task, 'service')
    return result
@METRICS.timer('AgentBaseMixin.tear_down_service')
def tear_down_service(self, task):
    """Tear down the in-band servicing environment for the node.

    Delegates to deploy_utils.tear_down_inband_service, letting this
    interface decide (via should_manage_boot) whether the boot
    environment is managed here.

    :param task: a TaskManager object containing the node
    :raises: NodeServiceFailure, NetworkError if the servicing ports
        cannot be removed
    """
    manage_boot = self.should_manage_boot(task)
    deploy_utils.tear_down_inband_service(task, manage_boot=manage_boot)
@METRICS.timer('AgentBaseMixin.get_clean_steps') @METRICS.timer('AgentBaseMixin.get_clean_steps')
def get_clean_steps(self, task): def get_clean_steps(self, task):
"""Get the list of clean steps from the agent. """Get the list of clean steps from the agent.
@ -785,7 +877,6 @@ class AgentBaseMixin(object):
'Previously cached steps: %(steps)s', 'Previously cached steps: %(steps)s',
{'node': node.uuid, 'type': step_type, {'node': node.uuid, 'type': step_type,
'steps': previous_steps}) 'steps': previous_steps})
client = agent_client.get_client(task) client = agent_client.get_client(task)
call = getattr(client, 'get_%s_steps' % step_type) call = getattr(client, 'get_%s_steps' % step_type)
try: try:
@ -936,32 +1027,34 @@ class AgentBaseMixin(object):
set to True, this method will coordinate the reboot once the step is set to True, this method will coordinate the reboot once the step is
completed. completed.
""" """
assert step_type in ('clean', 'deploy') assert step_type in ('clean', 'deploy', 'service')
node = task.node node = task.node
client = agent_client.get_client(task) client = agent_client.get_client(task)
agent_commands = client.get_commands_status(task.node) agent_commands = client.get_commands_status(task.node)
if _freshly_booted(agent_commands, step_type): if _freshly_booted(agent_commands, step_type):
field = ('cleaning_reboot' if step_type == 'clean' if step_type == 'clean':
else 'deployment_reboot') field = 'cleaning_reboot'
elif step_type == 'service':
field = 'servicing_reboot'
else:
# TODO(TheJulia): One day we should standardize the field
# names here, but we also need to balance human ability
# to understand what is going on so *shrug*.
field = 'deployment_reboot'
utils.pop_node_nested_field(node, 'driver_internal_info', field) utils.pop_node_nested_field(node, 'driver_internal_info', field)
node.save() node.save()
return _continue_steps(task, step_type) return _continue_steps(task, step_type)
current_step = getattr(node, '%s_step' % step_type)
current_step = (node.clean_step if step_type == 'clean'
else node.deploy_step)
command = _get_completed_command(task, agent_commands, step_type) command = _get_completed_command(task, agent_commands, step_type)
LOG.debug('%(type)s command status for node %(node)s on step %(step)s:' LOG.debug('%(type)s command status for node %(node)s on step %(step)s:'
' %(command)s', {'node': node.uuid, ' %(command)s', {'node': node.uuid,
'step': current_step, 'step': current_step,
'command': command, 'command': command,
'type': step_type}) 'type': step_type})
if not command: if not command:
# Agent command in progress # Agent command in progress
return return
if command.get('command_status') == 'FAILED': if command.get('command_status') == 'FAILED':
msg = (_('%(type)s step %(step)s failed on node %(node)s. ' msg = (_('%(type)s step %(step)s failed on node %(node)s. '
'%(err)s') % '%(err)s') %

View File

@ -771,6 +771,86 @@ def tear_down_inband_cleaning(task, manage_boot=True):
task, power_state_to_restore) task, power_state_to_restore)
def prepare_inband_service(task, manage_boot=True):
    """Boot a service ramdisk on the node.

    :param task: a TaskManager instance.
    :param manage_boot: If False, preparing the agent ramdisk boot is
        skipped. When True (the default) the boot is prepared only if
        the ``[agent]manage_agent_boot`` option is also enabled.
    :raises: NetworkError if the tenant ports cannot be removed.
    :raises: InvalidParameterValue when the wrong power state is specified
         or the wrong driver info is specified for power management.
    :raises: other exceptions by the node's power driver if something
         wrong occurred during the power action.
    :raises: any boot interface's prepare_ramdisk exceptions.
    :returns: Returns states.SERVICEWAIT
    """
    manager_utils.node_power_action(task, states.POWER_OFF)
    # NOTE(TheJulia): Revealing that the power is off at any time can
    # cause external power sync to decide that the node must be off.
    # This may result in an instance being turned off unexpectedly
    # after servicing has started.
    # TODO(TheJulia): Once we have power/state callbacks to nova,
    # the reset of the power_state can be removed.
    task.node.power_state = states.POWER_ON
    task.node.save()

    task.driver.boot.clean_up_instance(task)
    with manager_utils.power_state_for_network_configuration(task):
        task.driver.network.unconfigure_tenant_networks(task)
        # The network interfaces expose add_servicing_network();
        # there is no add_service_network() method.
        task.driver.network.add_servicing_network(task)
    if manage_boot and CONF.agent.manage_agent_boot:
        # prepare_ramdisk will set the boot device
        prepare_agent_boot(task)
    manager_utils.node_power_action(task, states.POWER_ON)

    return states.SERVICEWAIT
def tear_down_inband_service(task, manage_boot=True):
    """Tears down the environment setup for in-band service.

    This method does the following:

    1. Powers off the bare metal node (unless there was a service
       failure).
    2. If 'manage_boot' parameter is set to true, it also calls
       the 'clean_up_ramdisk' method of boot interface to clean
       up the environment that was set for booting agent ramdisk.
    3. Removes the servicing network from the node.
    4. Unless there was a service failure, restores the tenant
       networks, prepares the instance boot and fires the 'done'
       event on the task.

    :param task: a TaskManager object containing the node
    :param manage_boot: If this is set to True, this method calls the
        'clean_up_ramdisk' method of boot interface to clean up the
        agent ramdisk environment. If False, it skips this step.
    :raises: NetworkError, NodeServiceFailure if the servicing ports
        cannot be removed.
    """
    node = task.node
    service_failure = (node.fault == faults.SERVICE_FAILURE)

    if not service_failure:
        manager_utils.node_power_action(task, states.POWER_OFF)

    if manage_boot:
        task.driver.boot.clean_up_ramdisk(task)

    power_state_to_restore = manager_utils.power_on_node_if_needed(task)
    # The network interfaces expose remove_servicing_network();
    # there is no remove_service_network() method.
    task.driver.network.remove_servicing_network(task)
    if service_failure:
        # On failure the node is left as-is for the operator; the
        # caller is responsible for moving the state machine.
        return

    manager_utils.restore_power_state_if_needed(
        task, power_state_to_restore)

    with manager_utils.power_state_for_network_configuration(task):
        # NOTE(review): the servicing network was already removed
        # above; this second removal looks redundant - confirm before
        # dropping it.
        task.driver.network.remove_servicing_network(task)
        task.driver.network.configure_tenant_networks(task)
    task.driver.boot.prepare_instance(task)
    manager_utils.restore_power_state_if_needed(
        task, power_state_to_restore)

    # Change the task instead of return the state.
    task.process_event('done')
def get_image_instance_info(node): def get_image_instance_info(node):
"""Gets the image information from the node. """Gets the image information from the node.

View File

@ -224,6 +224,7 @@ class FakeVendorB(base.VendorInterface):
sleep(CONF.fake.vendor_delay) sleep(CONF.fake.vendor_delay)
return True if bar == 'woof' else False return True if bar == 'woof' else False
@base.service_step(requires_ramdisk=False)
@base.clean_step(priority=1) @base.clean_step(priority=1)
@base.passthru(['POST'], @base.passthru(['POST'],
description=_("Test pass-through to wait.")) description=_("Test pass-through to wait."))
@ -234,6 +235,10 @@ class FakeVendorB(base.VendorInterface):
# NOTE(TheJulia): Step methods invoked via an API *cannot* # NOTE(TheJulia): Step methods invoked via an API *cannot*
# have return values # have return values
@base.service_step()
def trigger_servicewait(self, task, **kwargs):
    """Fake service step which always returns states.SERVICEWAIT.

    :param task: a TaskManager instance (unused).
    :param kwargs: step arguments (ignored).
    :returns: states.SERVICEWAIT
    """
    wait_state = states.SERVICEWAIT
    return wait_state
class FakeConsole(base.ConsoleInterface): class FakeConsole(base.ConsoleInterface):
"""Example implementation of a simple console interface.""" """Example implementation of a simple console interface."""

View File

@ -217,3 +217,28 @@ class FlatNetwork(common.NeutronVIFPortIDMixin,
""" """
return self._remove_service_network( return self._remove_service_network(
task, self.get_inspection_network_uuid(task), 'inspection') task, self.get_inspection_network_uuid(task), 'inspection')
def add_servicing_network(self, task):
    """Bind the node's ports for servicing.

    Flat network does not use a separate servicing network; the flat
    ports are simply bound again.

    :param task: A TaskManager instance.
    :raises: NetworkError, InvalidParameterValue
    """
    node_uuid = task.node.uuid
    LOG.info('Bind ports for servicing node %s', node_uuid)
    self._bind_flat_ports(task)
def remove_servicing_network(self, task):
    """Unbind the node's ports after servicing.

    Flat network does not use a separate servicing network; the flat
    ports are simply unbound again.

    :param task: A TaskManager instance.
    :raises: NetworkError
    """
    node_uuid = task.node.uuid
    LOG.info('Unbind ports for servicing node %s', node_uuid)
    self._unbind_flat_ports(task)

View File

@ -265,3 +265,33 @@ class NeutronNetwork(common.NeutronVIFPortIDMixin,
""" """
return self._remove_network( return self._remove_network(
task, self.get_inspection_network_uuid(task), 'inspection') task, self.get_inspection_network_uuid(task), 'inspection')
def validate_servicing(self, task):
    """Validate the network interface for the servicing operation.

    :param task: a TaskManager instance.
    :raises: InvalidParameterValue, if the network interface configuration
        is invalid.
    :raises: MissingParameterValue, if some parameters are missing.
    """
    # Only the lookup matters here; the resolved UUID is discarded.
    self.get_servicing_network_uuid(task)
def add_servicing_network(self, task):
    """Create neutron ports for each port to boot the servicing ramdisk.

    :param task: a TaskManager instance.
    :returns: the result of ``_add_network``; a dictionary in the form
        {port.uuid: neutron_port['id']}
    """
    network_uuid = self.get_servicing_network_uuid(task)
    security_groups = CONF.neutron.servicing_network_security_groups
    return self._add_network(task, network_uuid, security_groups,
                             'servicing')
def remove_servicing_network(self, task):
    """Delete the neutron ports created for booting the servicing ramdisk.

    :param task: a TaskManager instance.
    :raises: NetworkError
    :returns: the result of ``_remove_network``.
    """
    network_uuid = self.get_servicing_network_uuid(task)
    return self._remove_network(task, network_uuid, 'servicing')

View File

@ -81,7 +81,8 @@ class Node(base.IronicObject, object_base.VersionedObjectDictCompat):
# Version 1.37: Add shard field # Version 1.37: Add shard field
# Version 1.38: Add parent_node field # Version 1.38: Add parent_node field
# Version 1.39: Add firmware_interface field # Version 1.39: Add firmware_interface field
VERSION = '1.39' # Version 1.40: Add service_step field
VERSION = '1.40'
dbapi = db_api.get_instance() dbapi = db_api.get_instance()
@ -107,6 +108,11 @@ class Node(base.IronicObject, object_base.VersionedObjectDictCompat):
# or has not yet started. # or has not yet started.
'deploy_step': object_fields.FlexibleDictField(nullable=True), 'deploy_step': object_fields.FlexibleDictField(nullable=True),
# A service step dictionary, indicating the current step
# being executed, or None, indicating servicing is not in progress
# or has not yet started.
'service_step': object_fields.FlexibleDictField(nullable=True),
'raid_config': object_fields.FlexibleDictField(nullable=True), 'raid_config': object_fields.FlexibleDictField(nullable=True),
'target_raid_config': object_fields.FlexibleDictField(nullable=True), 'target_raid_config': object_fields.FlexibleDictField(nullable=True),

View File

@ -145,6 +145,7 @@ class TestListNodes(test_api_base.BaseApiTest):
self.assertNotIn('retired_reason', data['nodes'][0]) self.assertNotIn('retired_reason', data['nodes'][0])
self.assertNotIn('lessee', data['nodes'][0]) self.assertNotIn('lessee', data['nodes'][0])
self.assertNotIn('network_data', data['nodes'][0]) self.assertNotIn('network_data', data['nodes'][0])
self.assertNotIn('service_steps', data['nodes'][0])
@mock.patch.object(policy, 'check', autospec=True) @mock.patch.object(policy, 'check', autospec=True)
@mock.patch.object(policy, 'check_policy', autospec=True) @mock.patch.object(policy, 'check_policy', autospec=True)
@ -223,6 +224,7 @@ class TestListNodes(test_api_base.BaseApiTest):
self.assertIn('lessee', data) self.assertIn('lessee', data)
self.assertNotIn('allocation_id', data) self.assertNotIn('allocation_id', data)
self.assertIn('allocation_uuid', data) self.assertIn('allocation_uuid', data)
self.assertIn('service_step', data)
def test_get_one_configdrive_dict(self): def test_get_one_configdrive_dict(self):
fake_instance_info = { fake_instance_info = {
@ -6489,6 +6491,54 @@ ORHMKeXMO8fcK0By7CiMKwHSXCoEQgfQhWwpMdSsO8LgHCjh87DQc= """
self.assertEqual(http_client.NOT_ACCEPTABLE, ret.status_code) self.assertEqual(http_client.NOT_ACCEPTABLE, ret.status_code)
mock_dpa.assert_not_called() mock_dpa.assert_not_called()
@mock.patch.object(rpcapi.ConductorAPI, 'do_provisioning_action',
                   autospec=True)
def test_unhold_servicehold(self, mock_dpa):
    """The unhold verb is accepted for a node in SERVICEHOLD."""
    # NOTE(review): the request pins API version 1.86 although the
    # service states arrive with 1.87 - confirm this is intended.
    self.node.provision_state = states.SERVICEHOLD
    self.node.save()

    unhold = states.VERBS['unhold']
    url = '/nodes/%s/states/provision' % self.node.uuid
    ret = self.put_json(url, {'target': unhold},
                        headers={api_base.Version.string: "1.86"})

    self.assertEqual(http_client.ACCEPTED, ret.status_code)
    self.assertEqual(b'', ret.body)
    mock_dpa.assert_called_once_with(
        mock.ANY, mock.ANY, self.node.uuid, unhold, 'test-topic')
@mock.patch.object(rpcapi.ConductorAPI, 'do_node_service',
                   autospec=True)
def test_service(self, mock_dns):
    """The service verb forwards the supplied steps to the conductor."""
    self.node.provision_state = states.SERVICEHOLD
    self.node.save()

    steps = [{'interface': 'deploy', 'step': 'meow'}]
    body = {'target': states.VERBS['service'], 'service_steps': steps}
    url = '/nodes/%s/states/provision' % self.node.uuid
    ret = self.put_json(url, body,
                        headers={api_base.Version.string: "1.87"})

    self.assertEqual(http_client.ACCEPTED, ret.status_code)
    self.assertEqual(b'', ret.body)
    mock_dns.assert_called_once_with(
        mock.ANY, mock.ANY, self.node.uuid,
        [{'interface': 'deploy', 'step': 'meow'}],
        None, topic='test-topic')
@mock.patch.object(rpcapi.ConductorAPI, 'do_node_service',
                   autospec=True)
def test_service_args_required(self, mock_dns):
    """The service verb without service_steps is a bad request."""
    self.node.provision_state = states.SERVICEHOLD
    self.node.save()

    url = '/nodes/%s/states/provision' % self.node.uuid
    body = {'target': states.VERBS['service']}
    ret = self.put_json(url, body,
                        headers={api_base.Version.string: "1.87"},
                        expect_errors=True)

    self.assertEqual(http_client.BAD_REQUEST, ret.status_code)
    self.assertIn('error_message', ret.json)
    mock_dns.assert_not_called()
def test_set_console_mode_enabled(self): def test_set_console_mode_enabled(self):
with mock.patch.object(rpcapi.ConductorAPI, with mock.patch.object(rpcapi.ConductorAPI,
'set_console_mode', 'set_console_mode',
@ -6975,11 +7025,28 @@ class TestCheckCleanSteps(db_base.DbTestCase):
step1 = {"step": "upgrade_firmware", "interface": "deploy", step1 = {"step": "upgrade_firmware", "interface": "deploy",
"args": {"arg1": "value1", "arg2": "value2"}} "args": {"arg1": "value1", "arg2": "value2"}}
# NOTE(TheJulia): _check_service_steps and _check_deploy_steps
# both route back to _check_steps, which is also what backs
# _check_clean_steps. Copying every test case for each wrapper
# would be needless duplication unless a wrapper has a specific
# case of its own, so all of them are exercised together here.
api_node._check_clean_steps([step1]) api_node._check_clean_steps([step1])
step2 = {"step": "configure raid", "interface": "raid"} step2 = {"step": "configure raid", "interface": "raid",
"args": {}}
api_node._check_clean_steps([step1, step2]) api_node._check_clean_steps([step1, step2])
api_node._check_service_steps([step1])
api_node._check_service_steps([step1, step2])
# Schema differences exist, cleaning doesn't have a schema for
# priority when validated.
step1['priority'] = 10
step2['priority'] = 12
api_node._check_deploy_steps([step1])
api_node._check_deploy_steps([step1, step2])
@mock.patch.object(api_utils, 'check_node_policy_and_retrieve', @mock.patch.object(api_utils, 'check_node_policy_and_retrieve',
autospec=True) autospec=True)
def test__check_clean_steps_child_node(self, mock_policy): def test__check_clean_steps_child_node(self, mock_policy):
@ -7029,6 +7096,15 @@ class TestCheckCleanSteps(db_base.DbTestCase):
mock.call('baremetal:node:set_provision_state', mock.call('baremetal:node:set_provision_state',
child_node_2.uuid)]) child_node_2.uuid)])
@mock.patch.object(api_node, '_check_steps', autospec=True)
def test_check__check_steps_wrappers(self, check_mock):
    """Each step-check wrapper calls _check_steps exactly once."""
    wrappers = (api_node._check_clean_steps,
                api_node._check_deploy_steps,
                api_node._check_service_steps)
    for expected_calls, wrapper in enumerate(wrappers, start=1):
        wrapper({})
        self.assertEqual(expected_calls, check_mock.call_count)
class TestAttachDetachVif(test_api_base.BaseApiTest): class TestAttachDetachVif(test_api_base.BaseApiTest):

View File

@ -1422,3 +1422,28 @@ class TestGetPhysnetsByPortUUID(base.TestCase):
self.client, port_uuid) self.client, port_uuid)
mock_gp.assert_called_once_with(self.client, port_uuid) mock_gp.assert_called_once_with(self.client, port_uuid)
mock_gn.assert_called_once_with(self.client, network_uuid) mock_gn.assert_called_once_with(self.client, network_uuid)
class TestNeutronNetworkInterfaceMixin(db_base.DbTestCase):
    """Tests for the per-purpose network UUID getters of the mixin."""

    def setUp(self):
        super(TestNeutronNetworkInterfaceMixin, self).setUp()
        self.node = object_utils.create_test_node(self.context)

    def test_get_network_names_and_uuids(self):
        """A test to validate configured overrides work."""
        mixin = neutron.NeutronNetworkInterfaceMixin
        network_names = ('cleaning', 'provisioning', 'rescuing',
                         'inspection', 'servicing')
        for name in network_names:
            getter = getattr(mixin, 'get_{}_network_uuid'.format(name))
            network_uuid = uuidutils.generate_uuid()
            # Override the network for this purpose on the node itself.
            self.node.driver_info = {'%s_network' % name: network_uuid}
            self.node.save()
            with mock.patch.object(neutron, 'validate_network',
                                   autospec=True) as mock_validate:
                with task_manager.acquire(self.context,
                                          self.node.uuid) as task:
                    getter(self, task)
                    mock_validate.assert_called_once_with(
                        network_uuid, mock.ANY, context=task.context)

View File

@ -49,6 +49,7 @@ from ironic.conductor import deployments
from ironic.conductor import inspection from ironic.conductor import inspection
from ironic.conductor import manager from ironic.conductor import manager
from ironic.conductor import notification_utils from ironic.conductor import notification_utils
from ironic.conductor import servicing
from ironic.conductor import steps as conductor_steps from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager from ironic.conductor import task_manager
from ironic.conductor import utils as conductor_utils from ironic.conductor import utils as conductor_utils
@ -8601,3 +8602,49 @@ class ContinueInspectionTestCase(mgr_utils.ServiceSetUpMixin,
self.assertEqual(exception.NotFound, exc.exc_info[0]) self.assertEqual(exception.NotFound, exc.exc_info[0])
node.refresh() node.refresh()
self.assertEqual(states.AVAILABLE, node.provision_state) self.assertEqual(states.AVAILABLE, node.provision_state)
@mgr_utils.mock_record_keepalive
class DoNodeServiceTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
    """Tests for the conductor manager's do_node_service entry point."""

    def setUp(self):
        super(DoNodeServiceTestCase, self).setUp()

    @mock.patch('ironic.drivers.modules.fake.FakePower.validate',
                autospec=True)
    def test_do_node_service_maintenance(self, mock_validate):
        """Servicing is refused for a node in maintenance mode."""
        node_attrs = {
            'driver': 'fake-hardware',
            'provision_state': states.ACTIVE,
            'target_provision_state': states.NOSTATE,
            'maintenance': True,
            'maintenance_reason': 'reason',
        }
        node = obj_utils.create_test_node(self.context, **node_attrs)
        self._start_service()

        exc = self.assertRaises(messaging.rpc.ExpectedException,
                                self.service.do_node_service,
                                self.context, node.uuid, {'foo': 'bar'})

        # Compare true exception hidden by @messaging.expected_exceptions
        self.assertEqual(exception.NodeInMaintenance, exc.exc_info[0])
        self.assertFalse(mock_validate.called)

    @mock.patch.object(task_manager.TaskManager, 'process_event',
                       autospec=True)
    @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
                autospec=True)
    @mock.patch('ironic.drivers.modules.fake.FakePower.validate',
                autospec=True)
    def test_do_node_service(self, mock_pv, mock_nv, mock_event):
        """Servicing validates power/network and fires the service event."""
        node_attrs = {
            'driver': 'fake-hardware',
            'provision_state': states.ACTIVE,
            'target_provision_state': states.NOSTATE,
        }
        node = obj_utils.create_test_node(self.context, **node_attrs)
        self._start_service()

        self.service.do_node_service(self.context,
                                     node.uuid, {'foo': 'bar'})

        self.assertTrue(mock_pv.called)
        self.assertTrue(mock_nv.called)
        expected_call_args = (servicing.do_node_service, mock.ANY,
                              {'foo': 'bar'}, False)
        mock_event.assert_called_once_with(
            mock.ANY,
            'service',
            callback=mock.ANY,
            call_args=expected_call_args,
            err_handler=mock.ANY, target_state='active')

View File

@ -722,6 +722,14 @@ class RPCAPITestCase(db_base.DbTestCase):
allocation='fake-allocation', allocation='fake-allocation',
version='1.48') version='1.48')
def test_do_node_service(self):
    """do_node_service is sent as an RPC 'call' at version 1.57."""
    rpc_kwargs = {
        'node_id': 'fake-node',
        'service_steps': {'foo': 'bar'},
        'disable_ramdisk': False,
        'version': '1.57',
    }
    self._test_rpcapi('do_node_service', 'call', **rpc_kwargs)
@mock.patch.object(rpc, 'GLOBAL_MANAGER', @mock.patch.object(rpc, 'GLOBAL_MANAGER',
spec_set=conductor_manager.ConductorManager) spec_set=conductor_manager.ConductorManager)
def test_local_call(self, mock_manager): def test_local_call(self, mock_manager):

File diff suppressed because it is too large Load Diff

View File

@ -1059,8 +1059,8 @@ class GetValidatedStepsFromTemplatesTestCase(db_base.DbTestCase):
mock_templates): mock_templates):
mock_templates.return_value = [self.template] mock_templates.return_value = [self.template]
mock_validate.side_effect = exception.InstanceDeployFailure('foo') mock_validate.side_effect = exception.InstanceDeployFailure('foo')
with task_manager.acquire( with task_manager.acquire(self.context, self.node.uuid,
self.context, self.node.uuid, shared=False) as task: shared=False) as task:
self.assertRaises( self.assertRaises(
exception.InstanceDeployFailure, exception.InstanceDeployFailure,
conductor_steps._get_validated_steps_from_templates, task) conductor_steps._get_validated_steps_from_templates, task)
@ -1399,3 +1399,87 @@ class ReservedStepHandlerByNameTestCase(db_base.DbTestCase):
def test_reserved_step_wait_time(self): def test_reserved_step_wait_time(self):
self._test_reserved_step({'step': 'wait', 'args': {'seconds': 1}}) self._test_reserved_step({'step': 'wait', 'args': {'seconds': 1}})
class NodeServiceStepsTestCase(db_base.DbTestCase):
    """Tests for collecting and recording a node's service steps."""

    def setUp(self):
        super(NodeServiceStepsTestCase, self).setUp()

        def make_step(name, priority, interface):
            # All fixture steps share the same three-key shape.
            return {'step': name, 'priority': priority,
                    'interface': interface}

        self.deploy_start = make_step('deploy_start', 50, 'deploy')
        self.power_one = make_step('power_one', 40, 'power')
        self.deploy_middle = make_step('deploy_middle', 40, 'deploy')
        self.deploy_end = make_step('deploy_end', 20, 'deploy')
        self.power_disable = make_step('power_disable', 0, 'power')
        self.deploy_core = make_step('deploy', 100, 'deploy')
        # enabled steps
        self.service_steps = [self.deploy_start, self.power_one,
                              self.deploy_middle, self.deploy_end]
        # Deploy step with argsinfo.
        self.deploy_raid = make_step('build_raid', 0, 'deploy')
        self.deploy_raid['argsinfo'] = {
            'arg1': {'description': 'desc1', 'required': True},
            'arg2': {'description': 'desc2'}}
        self.node = obj_utils.create_test_node(
            self.context, driver='fake-hardware')

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.get_service_steps',
                autospec=True)
    @mock.patch('ironic.drivers.modules.fake.FakePower.get_service_steps',
                autospec=True)
    @mock.patch('ironic.drivers.modules.fake.FakeManagement.get_service_steps',
                autospec=True)
    def test__get_service_steps(self, mock_mgt_steps, mock_power_steps,
                                mock_deploy_steps):
        """Steps from all interfaces are collected in the expected order."""
        # The management mock is left unconfigured, power and deploy
        # return steps with two conflicting priorities.
        mock_power_steps.return_value = [self.power_disable, self.power_one]
        mock_deploy_steps.return_value = [
            self.deploy_start, self.deploy_middle, self.deploy_end]
        # These next steps are actually present on the FakeVendorB
        # interface; rather than mocking everything, the rest of the
        # path including the decorator is exercised to get them.
        vendor_defaults = {'abortable': False, 'argsinfo': None,
                           'interface': 'vendor', 'priority': 0}
        fake_log_passthrough = dict(vendor_defaults,
                                    requires_ramdisk=False,
                                    step='log_passthrough')
        fake_trigger_servicewait = dict(vendor_defaults,
                                        requires_ramdisk=True,
                                        step='trigger_servicewait')
        expected = list(self.service_steps)
        expected.extend([fake_log_passthrough,
                         fake_trigger_servicewait,
                         self.power_disable])

        with task_manager.acquire(
                self.context, self.node.uuid, shared=False) as task:
            steps = conductor_steps._get_service_steps(task, enabled=False)

            self.assertEqual(expected, steps)
            mock_mgt_steps.assert_called_once_with(mock.ANY, task)
            mock_power_steps.assert_called_once_with(mock.ANY, task)
            mock_deploy_steps.assert_called_once_with(mock.ANY, task)

    @mock.patch.object(conductor_steps, '_validate_user_service_steps',
                       autospec=True)
    def test_set_node_service_steps(self, mock_steps):
        """Validated steps are stored in driver_internal_info."""
        mock_steps.return_value = self.service_steps

        with task_manager.acquire(
                self.context, self.node.uuid, shared=False) as task:
            conductor_steps.set_node_service_steps(task)
            self.node.refresh()
            internal_info = self.node.driver_internal_info
            self.assertEqual(self.service_steps,
                             internal_info['service_steps'])
            self.assertEqual({}, self.node.service_step)
            self.assertIsNone(internal_info['service_step_index'])
            mock_steps.assert_called_once_with(task, [],
                                               disable_ramdisk=False)

View File

@ -1490,6 +1490,90 @@ class ErrorHandlersTestCase(db_base.DbTestCase):
log_mock.assert_has_calls(log_calls) log_mock.assert_has_calls(log_calls)
self.node.save.assert_called_once_with() self.node.save.assert_called_once_with()
@mock.patch.object(conductor_utils.LOG, 'error', autospec=True)
def _test_servicing_error_handler(self, mock_log_error,
                                  prov_state=states.SERVICING):
    """Run ``servicing_error_handler`` from ``prov_state`` and check cleanup.

    :param mock_log_error: patched ``conductor_utils.LOG.error``.
    :param prov_state: provision state the node is placed in before the
        handler is invoked.
    """
    node = self.node
    node.provision_state = prov_state
    node.target_provision_state = 'baz'
    node.service_step = {'key': 'val'}
    # Seed the transient entries; the assertions below check that the
    # handler removes every one of them.
    seeded_info = (
        ('service_reboot', True),
        ('service_polling', True),
        ('skip_current_service_step', True),
        ('service_step_index', 0),
        ('agent_url', 'url'),
        ('agent_secret_token', 'foo'),
        ('agent_secret_token_pregenerated', False),
    )
    for key, value in seeded_info:
        node.set_driver_internal_info(key, value)
    log_text = 'error bar'
    reported_error = "last error"

    conductor_utils.servicing_error_handler(self.task, log_text,
                                            errmsg=reported_error)

    node.save.assert_called_once_with()
    self.assertEqual({}, node.service_step)
    wiped_keys = (
        'service_step_index',
        'service_reboot',
        'service_polling',
        'skip_current_service_step',
        'agent_secret_token',
        'agent_secret_token_pregenerated',
    )
    for key in wiped_keys:
        self.assertNotIn(key, node.driver_internal_info)
    # ``errmsg`` is what ends up on the node; ``msg`` is only logged.
    self.assertEqual(reported_error, node.last_error)
    self.assertTrue(node.maintenance)
    self.assertEqual(reported_error, node.maintenance_reason)
    self.assertEqual('service failure', node.fault)
    self.task.driver.deploy.tear_down_service.assert_called_once_with(
        self.task)
    if prov_state != states.SERVICEFAIL:
        self.task.process_event.assert_called_once_with('fail')
    else:
        # Already in the failed state: no further 'fail' event expected.
        self.assertFalse(self.task.process_event.called)
    self.assertNotIn('agent_url', node.driver_internal_info)
    mock_log_error.assert_called_once_with(log_text, exc_info=False)
def test_servicing_error_handler(self):
    """Exercise the error handler with the helper's default state."""
    # The helper defaults ``prov_state`` to states.SERVICING.
    self._test_servicing_error_handler()
def test_servicing_error_handler_servicewait(self):
    """Exercise the error handler starting from SERVICEWAIT."""
    starting_state = states.SERVICEWAIT
    self._test_servicing_error_handler(prov_state=starting_state)
def test_servicing_error_handler_servicefail(self):
    """Exercise the error handler starting from SERVICEFAIL."""
    starting_state = states.SERVICEFAIL
    self._test_servicing_error_handler(prov_state=starting_state)
def test_servicing_error_handler_no_teardown(self):
    """``tear_down_service=False`` skips teardown but still fails the node."""
    self.node.target_provision_state = states.MANAGEABLE

    conductor_utils.servicing_error_handler(
        self.task, 'foo', tear_down_service=False)

    teardown = self.task.driver.deploy.tear_down_service
    self.assertFalse(teardown.called)
    self.task.process_event.assert_called_once_with('fail')
def test_servicing_error_handler_no_fail(self):
    """``set_fail_state=False`` tears down without firing a state event."""
    conductor_utils.servicing_error_handler(
        self.task, 'foo', set_fail_state=False)

    self.task.driver.deploy.tear_down_service.assert_called_once_with(
        self.task)
    self.assertFalse(self.task.process_event.called)
@mock.patch.object(conductor_utils, 'LOG', autospec=True)
def test_servicing_error_handler_tear_down_error(self, log_mock):
    """A failing teardown must not lose the original error message."""
    msg = 'foo'

    def _clobber_and_fail(task):
        # simulate overwriting last error by another operation (e.g. power)
        task.node.last_error = None
        raise Exception('bar')

    self.task.driver.deploy.tear_down_service.side_effect = (
        _clobber_and_fail)

    conductor_utils.servicing_error_handler(self.task, msg)

    log_mock.error.assert_called_once_with(msg, exc_info=False)
    # The teardown failure itself is reported through LOG.exception.
    self.assertTrue(log_mock.exception.called)
    for recorded in (self.node.last_error, self.node.maintenance_reason):
        self.assertIn(msg, recorded)
    self.assertEqual('service failure', self.node.fault)
class ValidatePortPhysnetTestCase(db_base.DbTestCase): class ValidatePortPhysnetTestCase(db_base.DbTestCase):
@ -2738,3 +2822,46 @@ class GetTokenProjectFromRequestTestCase(db_base.DbTestCase):
self.context.auth_token_info = self.auth_token_info self.context.auth_token_info = self.auth_token_info
res = conductor_utils.get_token_project_from_request(self.context) res = conductor_utils.get_token_project_from_request(self.context)
self.assertEqual('user-project', res) self.assertEqual('user-project', res)
class ServiceUtilsTestCase(db_base.DbTestCase):
    """Tests for the servicing helpers in ``conductor_utils``."""

    def setUp(self):
        super(ServiceUtilsTestCase, self).setUp()
        self.node = obj_utils.create_test_node(
            self.context, driver='fake-hardware',
            uuid=uuidutils.generate_uuid(),
            driver_internal_info={
                'agent_last_heartbeat': str(timeutils.utcnow().isoformat()),
                'agent_url': 'a_url'})
        self.config(fast_track=True, group='deploy')

    @mock.patch.object(fake.FakePower, 'get_power_state', autospec=True)
    def test_wipe_service_internal_info(self, mock_power):
        """wipe_service_internal_info clears every transient service key."""
        mock_power.return_value = False
        # NOTE: each entry needs its own trailing comma. Without it,
        # adjacent string literals are silently concatenated and the
        # 'agent_secret_token' key is never seeded, which would make the
        # matching assertNotIn below pass vacuously.
        self.node.driver_internal_info = {
            'service_steps': {'foo': 'bar'},
            'agent_cached_service_steps': {'more_foo': None},
            'service_reboot': False,
            'service_polling': 1,
            'service_disable_ramdisk': False,
            'skip_current_service_step': False,
            'steps_validated': 'meow',
            'agent_secret_token': 'token'}
        self.node.save()
        # Keys that must be gone after the wipe; every one of them is
        # seeded above so each check is meaningful.
        not_in_list = ['agent_cached_service_steps',
                       'service_reboot',
                       'service_polling',
                       'service_disable_ramdisk',
                       'skip_current_service_step',
                       'steps_validated',
                       'agent_secret_token']
        with task_manager.acquire(self.context, self.node.id,
                                  shared=True) as task:
            conductor_utils.wipe_service_internal_info(task)
            task.node.save()
            # 'service_steps' is reset to None rather than removed.
            self.assertIsNone(
                task.node.driver_internal_info['service_steps']
            )
            for field in not_in_list:
                self.assertNotIn(field, task.node.driver_internal_info)

View File

@ -238,7 +238,8 @@ def get_test_node(**kw):
'boot_mode': kw.get('boot_mode', None), 'boot_mode': kw.get('boot_mode', None),
'secure_boot': kw.get('secure_boot', None), 'secure_boot': kw.get('secure_boot', None),
'shard': kw.get('shard', None), 'shard': kw.get('shard', None),
'parent_node': kw.get('parent_node', None) 'parent_node': kw.get('parent_node', None),
'service_step': kw.get('service_step'),
} }
for iface in drivers_base.ALL_INTERFACES: for iface in drivers_base.ALL_INTERFACES:

View File

@ -25,6 +25,7 @@ from ironic.common import exception
from ironic.common import image_service from ironic.common import image_service
from ironic.common import states from ironic.common import states
from ironic.conductor import cleaning from ironic.conductor import cleaning
from ironic.conductor import servicing
from ironic.conductor import steps as conductor_steps from ironic.conductor import steps as conductor_steps
from ironic.conductor import task_manager from ironic.conductor import task_manager
from ironic.conductor import utils as manager_utils from ironic.conductor import utils as manager_utils
@ -159,7 +160,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
autospec=True) autospec=True)
def test_heartbeat_in_maintenance(self, next_step_mock): def test_heartbeat_in_maintenance(self, next_step_mock):
# NOTE(pas-ha) checking only for states that are not noop # NOTE(pas-ha) checking only for states that are not noop
for state in (states.DEPLOYWAIT, states.CLEANWAIT): for state in (states.DEPLOYWAIT, states.CLEANWAIT,
states.SERVICEWAIT):
next_step_mock.reset_mock() next_step_mock.reset_mock()
self.node.provision_state = state self.node.provision_state = state
self.node.maintenance = True self.node.maintenance = True
@ -186,7 +188,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
group='conductor') group='conductor')
for state, expected in [(states.DEPLOYWAIT, states.DEPLOYFAIL), for state, expected in [(states.DEPLOYWAIT, states.DEPLOYFAIL),
(states.CLEANWAIT, states.CLEANFAIL), (states.CLEANWAIT, states.CLEANFAIL),
(states.RESCUEWAIT, states.RESCUEFAIL)]: (states.RESCUEWAIT, states.RESCUEFAIL),
(states.SERVICEWAIT, states.SERVICEFAIL)]:
next_step_mock.reset_mock() next_step_mock.reset_mock()
self.node.provision_state = state self.node.provision_state = state
self.node.maintenance = True self.node.maintenance = True
@ -211,7 +214,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
autospec=True) autospec=True)
def test_heartbeat_with_reservation(self, next_step_mock): def test_heartbeat_with_reservation(self, next_step_mock):
# NOTE(pas-ha) checking only for states that are not noop # NOTE(pas-ha) checking only for states that are not noop
for state in (states.DEPLOYWAIT, states.CLEANWAIT): for state in (states.DEPLOYWAIT, states.CLEANWAIT,
states.SERVICEWAIT):
next_step_mock.reset_mock() next_step_mock.reset_mock()
self.node.provision_state = state self.node.provision_state = state
self.node.reservation = 'localhost' self.node.reservation = 'localhost'
@ -232,7 +236,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
def test_heartbeat_noops_in_wrong_state(self, next_step_mock, log_mock): def test_heartbeat_noops_in_wrong_state(self, next_step_mock, log_mock):
allowed = {states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT, allowed = {states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT,
states.DEPLOYING, states.CLEANING, states.RESCUING, states.DEPLOYING, states.CLEANING, states.RESCUING,
states.DEPLOYHOLD, states.CLEANHOLD} states.DEPLOYHOLD, states.CLEANHOLD, states.SERVICEHOLD,
states.SERVICING, states.SERVICEWAIT}
for state in set(states.machine.states) - allowed: for state in set(states.machine.states) - allowed:
for m in (next_step_mock, log_mock): for m in (next_step_mock, log_mock):
m.reset_mock() m.reset_mock()
@ -253,7 +258,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
def test_heartbeat_noops_in_wrong_state2(self, next_step_mock): def test_heartbeat_noops_in_wrong_state2(self, next_step_mock):
CONF.set_override('allow_provisioning_in_maintenance', False, CONF.set_override('allow_provisioning_in_maintenance', False,
group='conductor') group='conductor')
allowed = {states.DEPLOYWAIT, states.CLEANWAIT} allowed = {states.DEPLOYWAIT, states.CLEANWAIT,
states.SERVICEWAIT}
for state in set(states.machine.states) - allowed: for state in set(states.machine.states) - allowed:
next_step_mock.reset_mock() next_step_mock.reset_mock()
with task_manager.acquire(self.context, self.node.uuid, with task_manager.acquire(self.context, self.node.uuid,
@ -466,7 +472,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
@mock.patch.object(agent_base.LOG, 'error', autospec=True) @mock.patch.object(agent_base.LOG, 'error', autospec=True)
def test_heartbeat_records_when_appropriate(self, log_mock): def test_heartbeat_records_when_appropriate(self, log_mock):
for provision_state in (states.CLEANING, states.DEPLOYING, for provision_state in (states.CLEANING, states.DEPLOYING,
states.CLEANHOLD, states.DEPLOYHOLD): states.CLEANHOLD, states.DEPLOYHOLD,
states.SERVICEHOLD, states.SERVICING):
self.node.driver_internal_info = {} self.node.driver_internal_info = {}
self.node.provision_state = provision_state self.node.provision_state = provision_state
self.node.save() self.node.save()
@ -521,6 +528,68 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
task.node.driver_internal_info['agent_last_heartbeat']) task.node.driver_internal_info['agent_last_heartbeat'])
self.assertEqual(provision_state, task.node.provision_state) self.assertEqual(provision_state, task.node.provision_state)
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
                   'refresh_steps', autospec=True)
@mock.patch.object(conductor_steps, 'set_node_service_steps',
                   autospec=True)
@mock.patch.object(servicing, 'continue_node_service', autospec=True)
def test_heartbeat_resume_service(self, mock_service, mock_set_steps,
                                  mock_refresh, mock_touch):
    """Heartbeat in SERVICEWAIT with no current step resumes servicing.

    Expects the heartbeat to refresh the 'service' steps, set the node's
    service steps again and hand off to ``continue_node_service``.
    """
    # Explicitly clear the *service* step (not the clean step) so the
    # test does not rely on the field's default value.
    self.node.service_step = {}
    self.node.provision_state = states.SERVICEWAIT
    self.node.save()
    with task_manager.acquire(
            self.context, self.node.uuid, shared=False) as task:
        self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0')
        mock_touch.assert_called_once_with(mock.ANY)
        mock_refresh.assert_called_once_with(mock.ANY, task, 'service')
        mock_service.assert_called_once_with(task)
        mock_set_steps.assert_called_once_with(task)
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
                   'continue_servicing', autospec=True)
def test_heartbeat_continue_servicing(self, mock_continue, mock_touch):
    """Heartbeat with a service step in progress calls continue_servicing."""
    in_progress_step = {
        'priority': 10,
        'interface': 'deploy',
        'step': 'foo',
        'reboot_requested': False,
    }
    self.node.service_step = in_progress_step
    self.node.provision_state = states.SERVICEWAIT
    self.node.save()

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=False) as task:
        self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0')
        mock_touch.assert_called_once_with(mock.ANY)
        mock_continue.assert_called_once_with(mock.ANY, task)
@mock.patch.object(manager_utils, 'servicing_error_handler', autospec=True)
@mock.patch.object(agent_base.HeartbeatMixin,
                   'continue_servicing', autospec=True)
def test_heartbeat_continue_servicing_fails(self, mock_continue,
                                            mock_handler):
    """An exception from continue_servicing reaches the error handler."""
    in_progress_step = {
        'priority': 10,
        'interface': 'deploy',
        'step': 'foo',
        'reboot_requested': False,
    }
    self.node.service_step = in_progress_step
    # Make the continuation blow up so the failure path is taken.
    mock_continue.side_effect = Exception()
    self.node.provision_state = states.SERVICEWAIT
    self.node.save()

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=False) as task:
        self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '1.0.0')
        mock_continue.assert_called_once_with(mock.ANY, task)
        mock_handler.assert_called_once_with(task, mock.ANY, mock.ANY)
class AgentRescueTests(AgentDeployMixinBaseTest): class AgentRescueTests(AgentDeployMixinBaseTest):
@ -1604,6 +1673,25 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
self.assertNotIn('deployment_reboot', self.assertNotIn('deployment_reboot',
task.node.driver_internal_info) task.node.driver_internal_info)
@mock.patch.object(deploy_utils, 'build_agent_options', autospec=True)
@mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk', spec_set=True,
                   autospec=True)
@mock.patch.object(manager_utils, 'servicing_error_handler', autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
def test__post_step_reboot_fail_servicing(self, mock_reboot, mock_handler,
                                          mock_prepare, mock_build_opt):
    """A failed reboot after a service step invokes the error handler."""
    reboot_failure = RuntimeError("broken")
    mock_reboot.side_effect = reboot_failure
    self.node.provision_state = states.SERVICEWAIT
    self.node.save()

    with task_manager.acquire(self.context, self.node['uuid'],
                              shared=False) as task:
        agent_base._post_step_reboot(task, 'service')
        mock_reboot.assert_called_once_with(task, states.REBOOT)
        mock_handler.assert_called_once_with(task, mock.ANY,
                                             traceback=True)
        # The reboot marker must not be recorded when the reboot failed.
        self.assertNotIn('servicing_reboot',
                         task.node.driver_internal_info)
def _test_clean_step_hook(self): def _test_clean_step_hook(self):
"""Helper method for unit tests related to clean step hooks.""" """Helper method for unit tests related to clean step hooks."""
some_function_mock = mock.MagicMock() some_function_mock = mock.MagicMock()
@ -1998,6 +2086,111 @@ class ContinueCleaningTest(AgentDeployMixinBaseTest):
error_mock.assert_called_once_with(task, mock.ANY, traceback=False) error_mock.assert_called_once_with(task, mock.ANY, traceback=False)
class ContinueServiceTest(AgentDeployMixinBaseTest):
    """Tests for ``continue_servicing`` on the agent deploy interface."""

    def setUp(self):
        super().setUp()
        # Every test starts from a node waiting on servicing whose target
        # provision state is ACTIVE.
        self.node.provision_state = states.SERVICEWAIT
        self.node.target_provision_state = states.ACTIVE
        self.node.save()

    def _agent_reports_success(self, status_mock):
        """Make the agent report the node's service step as SUCCEEDED."""
        finished_command = {
            'command_status': 'SUCCEEDED',
            'command_name': 'execute_service_step',
            'command_result': {
                'service_step': self.node.service_step
            }
        }
        status_mock.return_value = [finished_command]

    @mock.patch.object(servicing, 'continue_node_service', autospec=True)
    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
                       autospec=True)
    def test_continue_servicing(self, status_mock, service_mock):
        """A successfully executed service step resumes servicing."""
        self.node.service_step = {
            'priority': 10,
            'interface': 'deploy',
            'step': 'erase_devices',
            'reboot_requested': False
        }
        self.node.save()
        self._agent_reports_success(status_mock)

        with task_manager.acquire(self.context, self.node['uuid'],
                                  shared=False) as task:
            self.deploy.continue_servicing(task)
            service_mock.assert_called_once_with(task)
            self.assertEqual(states.SERVICING, task.node.provision_state)
            self.assertEqual(states.ACTIVE,
                             task.node.target_provision_state)

    @mock.patch.object(deploy_utils, 'build_agent_options', autospec=True)
    @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk', spec_set=True,
                       autospec=True)
    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
                       autospec=True)
    def test_continue_servicing_reboot(
            self, status_mock, reboot_mock, mock_prepare, mock_build_opt):
        """A successful service step that requests a reboot reboots."""
        self.node.service_step = {
            'priority': 42,
            'interface': 'deploy',
            'step': 'reboot_me_afterwards',
            'reboot_requested': True
        }
        self.node.save()
        self._agent_reports_success(status_mock)

        with task_manager.acquire(self.context, self.node['uuid'],
                                  shared=False) as task:
            self.deploy.continue_servicing(task)
            reboot_mock.assert_called_once_with(task, states.REBOOT)

    @mock.patch.object(servicing, 'continue_node_service', autospec=True)
    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
                       autospec=True)
    def test_continue_servicing_after_reboot(self, status_mock, service_mock):
        """Servicing resumes once the agent is back after the reboot."""
        self.node.service_step = {
            'priority': 42,
            'interface': 'deploy',
            'step': 'reboot_me_afterwards',
            'reboot_requested': True
        }
        # Flag the node as having been rebooted for the current step.
        info = self.node.driver_internal_info
        info['servicing_reboot'] = True
        self.node.driver_internal_info = info
        self.node.save()
        # Represents a freshly booted agent with no commands
        status_mock.return_value = []

        with task_manager.acquire(self.context, self.node['uuid'],
                                  shared=False) as task:
            self.deploy.continue_servicing(task)
            service_mock.assert_called_once_with(task)
            self.assertEqual(states.SERVICING, task.node.provision_state)
            # The reboot marker is consumed by the continuation.
            self.assertNotIn('servicing_reboot',
                             task.node.driver_internal_info)

    @mock.patch.object(servicing, 'continue_node_service', autospec=True)
    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
                       autospec=True)
    def test_continue_servicing_running(self, status_mock, service_mock):
        """Nothing happens while a service step is still executing."""
        still_running = {
            'command_status': 'RUNNING',
            'command_name': 'execute_service_step',
            'command_result': None
        }
        status_mock.return_value = [still_running]

        with task_manager.acquire(self.context, self.node['uuid'],
                                  shared=False) as task:
            self.deploy.continue_servicing(task)
            self.assertFalse(service_mock.called)
class TestRefreshCleanSteps(AgentDeployMixinBaseTest): class TestRefreshCleanSteps(AgentDeployMixinBaseTest):
def setUp(self): def setUp(self):

View File

@ -676,7 +676,7 @@ class TestObject(_LocalTest, _TestObject):
# version bump. It is an MD5 hash of the object fields and remotable methods. # version bump. It is an MD5 hash of the object fields and remotable methods.
# The fingerprint values should only be changed if there is a version bump. # The fingerprint values should only be changed if there is a version bump.
expected_object_fingerprints = { expected_object_fingerprints = {
'Node': '1.39-ee3f5ff28b79f9fabf84a50e34a71684', 'Node': '1.40-2182d4660bb5d5e4cc5670c37012ef71',
'MyObj': '1.5-9459d30d6954bffc7a9afd347a807ca6', 'MyObj': '1.5-9459d30d6954bffc7a9afd347a807ca6',
'Chassis': '1.3-d656e039fd8ae9f34efc232ab3980905', 'Chassis': '1.3-d656e039fd8ae9f34efc232ab3980905',
'Port': '1.11-97bf15b61224f26c65e90f007d78bfd2', 'Port': '1.11-97bf15b61224f26c65e90f007d78bfd2',

View File

@ -0,0 +1,19 @@
---
features:
- |
Adds a new Ironic capability called ``service_steps`` which allows a
deployed ``ACTIVE`` node to be modified utilizing a new API provision
state verb of ``service`` which can include a list of ``service_steps``
to be performed. This work is inspired by ``clean_steps`` and
``deploy_steps`` and, similar to those efforts, this functionality will
continue to evolve as new features, functionality, and capabilities
are added.
- Adds a new driver method decorator ``base.service_step`` which operates
exactly like the existing ``base.clean_step`` and ``base.deploy_step``
decorators. Driver methods which are decorated *can* be invoked utilizing
the service steps.
issues:
- |
The ``service_steps`` functionality does not understand how to poll and
communicate with the ``ironic-python-agent``. This is anticipated to be
addressed in a future release.