From 3f734c8728f0cf436921d57ce6b2389687684221 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Fri, 8 Feb 2019 20:26:41 -0800 Subject: [PATCH] fast tracked deployment support Provides a facility to minimize the power state changes of a baremetal node to save critical time during deployment operations. Story: #2004965 Task: #29408 Depends-On: https://review.openstack.org/636778 Change-Id: I7ebbaddb33b38c87246c10165339ac4bac0ac6fc --- devstack/lib/ironic | 7 ++ ironic/api/controllers/v1/ramdisk.py | 12 +-- ironic/common/pxe_utils.py | 8 +- ironic/common/states.py | 14 ++++ ironic/conductor/utils.py | 62 +++++++++++++++ ironic/conf/deploy.py | 21 +++++ ironic/drivers/modules/agent.py | 56 ++++++++++--- ironic/drivers/modules/agent_base_vendor.py | 34 +++++++- ironic/drivers/modules/deploy_utils.py | 31 ++++++-- ironic/drivers/modules/iscsi_deploy.py | 46 ++++++++--- .../unit/api/controllers/v1/test_ramdisk.py | 12 +++ ironic/tests/unit/conductor/test_utils.py | 51 ++++++++++++ .../tests/unit/drivers/modules/test_agent.py | 79 +++++++++++++++++++ .../drivers/modules/test_agent_base_vendor.py | 21 +++++ .../unit/drivers/modules/test_deploy_utils.py | 25 +++++- .../unit/drivers/modules/test_iscsi_deploy.py | 77 ++++++++++++++++++ ...ast-track-deployment-f09a8b921b3aae36.yaml | 16 ++++ zuul.d/ironic-jobs.yaml | 11 +++ 18 files changed, 540 insertions(+), 43 deletions(-) create mode 100644 releasenotes/notes/fast-track-deployment-f09a8b921b3aae36.yaml diff --git a/devstack/lib/ironic b/devstack/lib/ironic index 178129b184..690bc00e42 100644 --- a/devstack/lib/ironic +++ b/devstack/lib/ironic @@ -449,6 +449,8 @@ IRONIC_DEPLOY_LOGS_STORAGE_BACKEND=${IRONIC_DEPLOY_LOGS_STORAGE_BACKEND:-local} # The path to the directory where Ironic should put the logs when IRONIC_DEPLOY_LOGS_STORAGE_BACKEND is set to "local" IRONIC_DEPLOY_LOGS_LOCAL_PATH=${IRONIC_DEPLOY_LOGS_LOCAL_PATH:-$IRONIC_VM_LOG_DIR/deploy_logs} +# Fast track option +IRONIC_DEPLOY_FAST_TRACK=${IRONIC_DEPLOY_FAST_TRACK:-False} # Define baremetal min_microversion in tempest config. Default value None is picked from tempest. TEMPEST_BAREMETAL_MIN_MICROVERSION=${TEMPEST_BAREMETAL_MIN_MICROVERSION:-} @@ -1115,6 +1117,9 @@ function configure_ironic { iniset $IRONIC_CONF_FILE DEFAULT rpc_transport $IRONIC_RPC_TRANSPORT iniset $IRONIC_CONF_FILE json_rpc port $IRONIC_JSON_RPC_PORT + # Set fast track options + iniset $IRONIC_CONF_FILE deploy fast_track $IRONIC_DEPLOY_FAST_TRACK + # Configure Ironic conductor, if it was enabled. if is_service_enabled ir-cond; then configure_ironic_conductor @@ -2553,6 +2558,8 @@ function ironic_configure_tempest { # Enabled features iniset $TEMPEST_CONFIG baremetal_feature_enabled ipxe_enabled $IRONIC_IPXE_ENABLED + iniset $TEMPEST_CONFIG baremetal_feature_enabled fast_track_discovery $IRONIC_DEPLOY_FAST_TRACK + } function get_ironic_node_prefix { diff --git a/ironic/api/controllers/v1/ramdisk.py b/ironic/api/controllers/v1/ramdisk.py index a0fedbc92d..17c17ade56 100644 --- a/ironic/api/controllers/v1/ramdisk.py +++ b/ironic/api/controllers/v1/ramdisk.py @@ -37,10 +37,6 @@ LOG = log.getLogger(__name__) _LOOKUP_RETURN_FIELDS = ('uuid', 'properties', 'instance_info', 'driver_internal_info') -_LOOKUP_ALLOWED_STATES = {states.DEPLOYING, states.DEPLOYWAIT, - states.CLEANING, states.CLEANWAIT, - states.INSPECTING, - states.RESCUING, states.RESCUEWAIT} def config(): @@ -83,6 +79,12 @@ class LookupResult(base.APIBase): class LookupController(rest.RestController): """Controller handling node lookup for a deploy ramdisk.""" + @property + def lookup_allowed_states(self): + if CONF.deploy.fast_track: + return states.FASTTRACK_LOOKUP_ALLOWED_STATES + return states.LOOKUP_ALLOWED_STATES + @expose.expose(LookupResult, types.listtype, types.uuid) def get_all(self, addresses=None, node_uuid=None): """Look up a node by its MAC addresses and optionally UUID. @@ -144,7 +146,7 @@ class LookupController(rest.RestController): raise exception.NotFound() if (CONF.api.restrict_lookup - and node.provision_state not in _LOOKUP_ALLOWED_STATES): + and node.provision_state not in self.lookup_allowed_states): raise exception.NotFound() return LookupResult.convert_with_links(node) diff --git a/ironic/common/pxe_utils.py b/ironic/common/pxe_utils.py index e96b185b24..901d81d912 100644 --- a/ironic/common/pxe_utils.py +++ b/ironic/common/pxe_utils.py @@ -799,7 +799,12 @@ def build_service_pxe_config(task, instance_image_info, # NOTE(pas-ha) if it is takeover of ACTIVE node or node performing # unrescue operation, first ensure that basic PXE configs and links # are in place before switching pxe config - if (node.provision_state in [states.ACTIVE, states.UNRESCUING] + # NOTE(TheJulia): Also consider deploying a valid state to go ahead + # and check things before continuing, as otherwise deployments can + # fail if the agent was booted outside the direct actions of the + # boot interface. + if (node.provision_state in [states.ACTIVE, states.UNRESCUING, + states.DEPLOYING] and not os.path.isfile(pxe_config_path)): pxe_options = build_pxe_config_options(task, instance_image_info, service=True, @@ -808,6 +813,7 @@ def build_service_pxe_config(task, instance_image_info, create_pxe_config(task, pxe_options, pxe_config_template, ipxe_enabled=ipxe_enabled) iwdi = node.driver_internal_info.get('is_whole_disk_image') + deploy_utils.switch_pxe_config( pxe_config_path, root_uuid_or_disk_id, boot_mode_utils.get_boot_mode(node), diff --git a/ironic/common/states.py b/ironic/common/states.py index 3f550d6bc5..9f5e7ca6a1 100644 --- a/ironic/common/states.py +++ b/ironic/common/states.py @@ -243,6 +243,20 @@ If a node gets stuck with one of these states for some reason to fail state. """ +_LOOKUP_ALLOWED_STATES = (DEPLOYING, DEPLOYWAIT, CLEANING, CLEANWAIT, + INSPECTING, RESCUING, RESCUEWAIT) +LOOKUP_ALLOWED_STATES = frozenset(_LOOKUP_ALLOWED_STATES) + +"""States when API lookups are normally allowed for nodes.""" + +_FASTTRACK_LOOKUP_ALLOWED_STATES = (ENROLL, MANAGEABLE, AVAILABLE, + DEPLOYING, DEPLOYWAIT, CLEANING, + CLEANWAIT, INSPECTING, RESCUING, + RESCUEWAIT) +FASTTRACK_LOOKUP_ALLOWED_STATES = frozenset(_FASTTRACK_LOOKUP_ALLOWED_STATES) +"""States where API lookups are permitted with fast track enabled.""" + + ############## # Power states ############## diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index a683250467..cd16449f60 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -13,6 +13,7 @@ # under the License. import collections +import datetime import time from openstack.baremetal import configdrive as os_configdrive @@ -21,6 +22,7 @@ from oslo_log import log from oslo_serialization import jsonutils from oslo_service import loopingcall from oslo_utils import excutils +from oslo_utils import timeutils import six from ironic.common import boot_devices @@ -1408,3 +1410,63 @@ def build_configdrive(node, configdrive): LOG.debug('Building a configdrive for node %s', node.uuid) return os_configdrive.build(meta_data, user_data=user_data, network_data=configdrive.get('network_data')) + + +def fast_track_able(task): + """Checks if the operation can be a streamlined deployment sequence. + + This is mainly focused on ensuring that we are able to quickly sequence + through operations if we already have a ramdisk heartbeating through + external means. + + :param task: Taskmanager object + :returns: True if [deploy]fast_track is set to True, no iSCSI boot + configuration is present, and no last_error is present for + the node indicating that there was a recent failure. + """ + return (CONF.deploy.fast_track + # TODO(TheJulia): Network model aside, we should be able to + # fast-track through initial sequence to complete deployment. + # This needs to be validated. + # TODO(TheJulia): Do we need a secondary guard? To prevent + # driving through this we could query the API endpoint of + # the agent with a short timeout such as 10 seconds, which + # would help verify if the node is online. + # TODO(TheJulia): Should we check the provisioning/deployment + # networks match config wise? Do we care? #decisionsdecisions + and task.driver.storage.should_write_image(task) + and task.node.last_error is None) + + +def is_fast_track(task): + """Checks a fast track is available. + + This method first ensures that the node and conductor configuration + is valid to perform a fast track sequence meaning that we already + have a ramdisk running through another means like discovery. + If not valid, False is returned. + + The method then checks for the last agent heartbeat, and if it occured + within the timeout set by [deploy]fast_track_timeout and the power + state for the machine is POWER_ON, then fast track is permitted. + + :param node: A node object. + :returns: True if the last heartbeat that was recorded was within + the [deploy]fast_track_timeout setting. + """ + if not fast_track_able(task): + return False + # use native datetime objects for conversion and compare + # slightly odd because py2 compatability :( + last = datetime.datetime.strptime( + task.node.driver_internal_info.get( + 'agent_last_heartbeat', + '1970-01-01T00:00:00.000000'), + "%Y-%m-%dT%H:%M:%S.%f") + # If we found nothing, we assume that the time is essentially epoch. + time_delta = datetime.timedelta(seconds=CONF.deploy.fast_track_timeout) + last_valid = timeutils.utcnow() - time_delta + # Checking the power state, because if we find the machine off due to + # any action, we can't actually fast track the node. :( + return (last_valid <= last + and task.driver.power.get_power_state(task) == states.POWER_ON) diff --git a/ironic/conf/deploy.py b/ironic/conf/deploy.py index 248a5c6356..092238ec8c 100644 --- a/ironic/conf/deploy.py +++ b/ironic/conf/deploy.py @@ -113,6 +113,27 @@ opts = [ 'images for the direct deploy interface, when local ' 'HTTP service is incorporated to provide instance image ' 'instead of swift tempurls.')), + cfg.BoolOpt('fast_track', + default=False, + help=_('Whether to allow deployment agents to perform lookup, ' + 'heartbeat operations during initial states of a ' + 'machine lifecycle and by-pass the normal setup ' + 'procedures for a ramdisk. This feature also enables ' + 'power operations which are part of deployment ' + 'processes to be bypassed if the ramdisk has performed ' + 'a heartbeat operation using the fast_track_timeout ' + 'setting.')), + cfg.IntOpt('fast_track_timeout', + default=300, + min=0, + max=300, + help=_('Seconds for which the last heartbeat event is to be ' + 'considered valid for the purpose of a fast ' + 'track sequence. This setting should generally be ' + 'less than the number of seconds for "Power-On Self ' + 'Test" and typical ramdisk start-up. This value should ' + 'not exceed the [api]ramdisk_heartbeat_timeout ' + 'setting.')), ] diff --git a/ironic/drivers/modules/agent.py b/ironic/drivers/modules/agent.py index 851f6b652a..1d3d552e7d 100644 --- a/ironic/drivers/modules/agent.py +++ b/ironic/drivers/modules/agent.py @@ -455,7 +455,14 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface): :param task: a TaskManager instance. :returns: status of the deploy. One of ironic.common.states. """ - if task.driver.storage.should_write_image(task): + if manager_utils.is_fast_track(task): + LOG.debug('Performing a fast track deployment for %(node)s.', + {'node': task.node.uuid}) + # Update the database for the API and the task tracking resumes + # the state machine state going from DEPLOYWAIT -> DEPLOYING + task.process_event('wait') + self.continue_deploy(task) + elif task.driver.storage.should_write_image(task): manager_utils.node_power_action(task, states.REBOOT) return states.DEPLOYWAIT else: @@ -521,6 +528,12 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface): :raises: exception.InvalidParameterValue if network validation fails. :raises: any boot interface's prepare_ramdisk exceptions. """ + + def _update_instance_info(): + node.instance_info = ( + deploy_utils.build_instance_info_for_deploy(task)) + node.save() + node = task.node deploy_utils.populate_storage_driver_internal_info(task) if node.provision_state == states.DEPLOYING: @@ -547,25 +560,44 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface): task.driver.network.validate(task) else: ctx.reraise = True - - # Adding the node to provisioning network so that the dhcp - # options get added for the provisioning port. - manager_utils.node_power_action(task, states.POWER_OFF) + # Determine if this is a fast track sequence + fast_track_deploy = manager_utils.is_fast_track(task) + if fast_track_deploy: + # The agent has already recently checked in and we are + # configured to take that as an indicator that we can + # skip ahead. + LOG.debug('The agent for node %(node)s has recently checked ' + 'in, and the node power will remain unmodified.', + {'node': task.node.uuid}) + else: + # Powering off node to setup networking for port and + # ensure that the state is reset if it is inadvertently + # on for any unknown reason. + manager_utils.node_power_action(task, states.POWER_OFF) if task.driver.storage.should_write_image(task): # NOTE(vdrok): in case of rebuild, we have tenant network # already configured, unbind tenant ports if present - power_state_to_restore = ( - manager_utils.power_on_node_if_needed(task)) + if not fast_track_deploy: + power_state_to_restore = ( + manager_utils.power_on_node_if_needed(task)) + task.driver.network.unconfigure_tenant_networks(task) task.driver.network.add_provisioning_network(task) - manager_utils.restore_power_state_if_needed( - task, power_state_to_restore) + if not fast_track_deploy: + manager_utils.restore_power_state_if_needed( + task, power_state_to_restore) + else: + # Fast track sequence in progress + _update_instance_info() # Signal to storage driver to attach volumes task.driver.storage.attach_volumes(task) - if not task.driver.storage.should_write_image(task): + if (not task.driver.storage.should_write_image(task) + or fast_track_deploy): # We have nothing else to do as this is handled in the # backend storage system, and we can return to the caller # as we do not need to boot the agent to deploy. + # Alternatively, we could be in a fast track deployment + # and again, we should have nothing to do here. return if node.provision_state in (states.ACTIVE, states.UNRESCUING): # Call is due to conductor takeover @@ -573,9 +605,7 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface): elif node.provision_state != states.ADOPTING: if node.provision_state not in (states.RESCUING, states.RESCUEWAIT, states.RESCUE, states.RESCUEFAIL): - node.instance_info = ( - deploy_utils.build_instance_info_for_deploy(task)) - node.save() + _update_instance_info() if CONF.agent.manage_agent_boot: deploy_opts = deploy_utils.build_agent_options(node) task.driver.boot.prepare_ramdisk(task, deploy_opts) diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index 8ee24b0841..6ccf386b5e 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -66,6 +66,19 @@ VENDOR_PROPERTIES = { 'older deploy ramdisks. Defaults to False. Optional.') } +__HEARTBEAT_RECORD_ONLY = (states.ENROLL, states.MANAGEABLE, + states.AVAILABLE) +_HEARTBEAT_RECORD_ONLY = frozenset(__HEARTBEAT_RECORD_ONLY) + +_HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT) +HEARTBEAT_ALLOWED = frozenset(_HEARTBEAT_ALLOWED) + +_FASTTRACK_HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT, + states.RESCUEWAIT, states.ENROLL, + states.MANAGEABLE, states.AVAILABLE, + states.DEPLOYING) +FASTTRACK_HEARTBEAT_ALLOWED = frozenset(_FASTTRACK_HEARTBEAT_ALLOWED) + def _get_client(): client = agent_client.AgentClient() @@ -260,7 +273,9 @@ class HeartbeatMixin(object): @property def heartbeat_allowed_states(self): """Define node states where heartbeating is allowed""" - return (states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT) + if CONF.deploy.fast_track: + return FASTTRACK_HEARTBEAT_ALLOWED + return HEARTBEAT_ALLOWED @METRICS.timer('HeartbeatMixin.heartbeat') def heartbeat(self, task, callback_url, agent_version): @@ -271,7 +286,8 @@ class HeartbeatMixin(object): :param agent_version: The version of the agent that is heartbeating """ # NOTE(pas-ha) immediately skip the rest if nothing to do - if task.node.provision_state not in self.heartbeat_allowed_states: + if (task.node.provision_state not in self.heartbeat_allowed_states + and not manager_utils.fast_track_able(task)): LOG.debug('Heartbeat from node %(node)s in unsupported ' 'provision state %(state)s, not taking any action.', {'node': task.node.uuid, @@ -288,13 +304,25 @@ class HeartbeatMixin(object): node = task.node LOG.debug('Heartbeat from node %s', node.uuid) - driver_internal_info = node.driver_internal_info driver_internal_info['agent_url'] = callback_url driver_internal_info['agent_version'] = agent_version + # Record the last heartbeat event time in UTC, so we can make + # decisions about it later. Can be decoded to datetime object with: + # datetime.datetime.strptime(var, "%Y-%m-%d %H:%M:%S.%f") + driver_internal_info['agent_last_heartbeat'] = str( + timeutils.utcnow().isoformat()) node.driver_internal_info = driver_internal_info node.save() + if node.provision_state in _HEARTBEAT_RECORD_ONLY: + # We shouldn't take any additional action. The agent will + # silently continue to heartbeat to ironic until user initiated + # state change occurs causing it to match a state below. + LOG.debug('Heartbeat from %(node)s recorded to identify the ' + 'node as on-line.', {'node': task.node.uuid}) + return + # Async call backs don't set error state on their own # TODO(jimrollenhagen) improve error messages here msg = _('Failed checking if deploy is done.') diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index 9ff7fe3ce5..9da823ba66 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -899,10 +899,22 @@ def prepare_inband_cleaning(task, manage_boot=True): :raises: InvalidParameterValue if cleaning network UUID config option has an invalid value. """ - power_state_to_restore = manager_utils.power_on_node_if_needed(task) + fast_track = manager_utils.is_fast_track(task) + if not fast_track: + power_state_to_restore = manager_utils.power_on_node_if_needed(task) + + # WARNING(TheJulia): When fast track is available, trying to plug the + # cleaning network is problematic and in practice this may fail if + # cleaning/provisioning/discovery all take place on different + # networks when.. + # Translation: Here be a realistically unavoidable footgun + # fast track support. + # TODO(TheJulia): Lets improve this somehow such that the agent host + # gracefully handles these sorts of changes. task.driver.network.add_cleaning_network(task) - manager_utils.restore_power_state_if_needed( - task, power_state_to_restore) + if not fast_track: + manager_utils.restore_power_state_if_needed( + task, power_state_to_restore) # Append required config parameters to node's driver_internal_info # to pass to IPA. @@ -912,7 +924,8 @@ def prepare_inband_cleaning(task, manage_boot=True): ramdisk_opts = build_agent_options(task.node) task.driver.boot.prepare_ramdisk(task, ramdisk_opts) - manager_utils.node_power_action(task, states.REBOOT) + if not fast_track: + manager_utils.node_power_action(task, states.REBOOT) # Tell the conductor we are waiting for the agent to boot. return states.CLEANWAIT @@ -936,14 +949,18 @@ def tear_down_inband_cleaning(task, manage_boot=True): :raises: NetworkError, NodeCleaningFailure if the cleaning ports cannot be removed. """ - manager_utils.node_power_action(task, states.POWER_OFF) + fast_track = manager_utils.is_fast_track(task) + if not fast_track: + manager_utils.node_power_action(task, states.POWER_OFF) + if manage_boot: task.driver.boot.clean_up_ramdisk(task) power_state_to_restore = manager_utils.power_on_node_if_needed(task) task.driver.network.remove_cleaning_network(task) - manager_utils.restore_power_state_if_needed( - task, power_state_to_restore) + if not fast_track: + manager_utils.restore_power_state_if_needed( + task, power_state_to_restore) def get_image_instance_info(node): diff --git a/ironic/drivers/modules/iscsi_deploy.py b/ironic/drivers/modules/iscsi_deploy.py index 27a426d972..b663db6978 100644 --- a/ironic/drivers/modules/iscsi_deploy.py +++ b/ironic/drivers/modules/iscsi_deploy.py @@ -408,13 +408,24 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface): :returns: deploy state DEPLOYWAIT. """ node = task.node - if task.driver.storage.should_write_image(task): + if manager_utils.is_fast_track(task): + LOG.debug('Performing a fast track deployment for %(node)s.', + {'node': task.node.uuid}) + deploy_utils.cache_instance_image(task.context, node) + check_image_size(task) + # Update the database for the API and the task tracking resumes + # the state machine state going from DEPLOYWAIT -> DEPLOYING + task.process_event('wait') + self.continue_deploy(task) + elif task.driver.storage.should_write_image(task): + # Standard deploy process deploy_utils.cache_instance_image(task.context, node) check_image_size(task) manager_utils.node_power_action(task, states.REBOOT) - return states.DEPLOYWAIT else: + # Boot to an Storage Volume + # TODO(TheJulia): At some point, we should de-dupe this code # as it is nearly identical to the agent deploy interface. # This is not being done now as it is expected to be @@ -486,23 +497,38 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface): task.driver.boot.prepare_instance(task) else: if node.provision_state == states.DEPLOYING: - # Adding the node to provisioning network so that the dhcp - # options get added for the provisioning port. - manager_utils.node_power_action(task, states.POWER_OFF) + fast_track_deploy = manager_utils.is_fast_track(task) + if fast_track_deploy: + # The agent has already recently checked in and we are + # configured to take that as an indicator that we can + # skip ahead. + LOG.debug('The agent for node %(node)s has recently ' + 'checked in, and the node power will remain ' + 'unmodified.', + {'node': task.node.uuid}) + else: + # Adding the node to provisioning network so that the dhcp + # options get added for the provisioning port. + manager_utils.node_power_action(task, states.POWER_OFF) # NOTE(vdrok): in case of rebuild, we have tenant network # already configured, unbind tenant ports if present if task.driver.storage.should_write_image(task): - power_state_to_restore = ( - manager_utils.power_on_node_if_needed(task)) + if not fast_track_deploy: + power_state_to_restore = ( + manager_utils.power_on_node_if_needed(task)) task.driver.network.unconfigure_tenant_networks(task) task.driver.network.add_provisioning_network(task) - manager_utils.restore_power_state_if_needed( - task, power_state_to_restore) + if not fast_track_deploy: + manager_utils.restore_power_state_if_needed( + task, power_state_to_restore) task.driver.storage.attach_volumes(task) - if not task.driver.storage.should_write_image(task): + if (not task.driver.storage.should_write_image(task) + or fast_track_deploy): # We have nothing else to do as this is handled in the # backend storage system, and we can return to the caller # as we do not need to boot the agent to deploy. + # Alternatively, we are in a fast track deployment + # and have nothing else to do. return deploy_opts = deploy_utils.build_agent_options(node) diff --git a/ironic/tests/unit/api/controllers/v1/test_ramdisk.py b/ironic/tests/unit/api/controllers/v1/test_ramdisk.py index 466af77765..43162adc1d 100644 --- a/ironic/tests/unit/api/controllers/v1/test_ramdisk.py +++ b/ironic/tests/unit/api/controllers/v1/test_ramdisk.py @@ -24,6 +24,7 @@ from six.moves import http_client from ironic.api.controllers import base as api_base from ironic.api.controllers import v1 as api_v1 from ironic.api.controllers.v1 import ramdisk +from ironic.common import states from ironic.conductor import rpcapi from ironic.tests.unit.api import base as test_api_base from ironic.tests.unit.objects import utils as obj_utils @@ -160,6 +161,17 @@ class TestLookup(test_api_base.BaseApiTest): set(data['node'])) self._check_config(data) + def test_fast_deploy_lookup(self): + CONF.set_override('fast_track', True, 'deploy') + for provision_state in [states.ENROLL, states.MANAGEABLE, + states.AVAILABLE]: + self.node.provision_state = provision_state + data = self.get_json( + '/lookup?addresses=%s&node_uuid=%s' % + (','.join(self.addresses), self.node.uuid), + headers={api_base.Version.string: str(api_v1.max_version())}) + self.assertEqual(self.node.uuid, data['node']['uuid']) + @mock.patch.object(rpcapi.ConductorAPI, 'get_topic_for', lambda *n: 'test-topic') diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 3e39f94bb1..af06a0b3a0 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -13,6 +13,7 @@ import time import mock from oslo_config import cfg +from oslo_utils import timeutils from oslo_utils import uuidutils from ironic.common import boot_devices @@ -2537,3 +2538,53 @@ class ValidateDeployTemplatesTestCase(db_base.DbTestCase): self.assertRaises(exception.InvalidParameterValue, conductor_utils.validate_deploy_templates, task) mock_validated.assert_called_once_with(task) + + +@mock.patch.object(fake.FakePower, 'get_power_state', autospec=True) +class FastTrackTestCase(db_base.DbTestCase): + + def setUp(self): + super(FastTrackTestCase, self).setUp() + self.node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + uuid=uuidutils.generate_uuid(), + driver_internal_info={ + 'agent_last_heartbeat': str(timeutils.utcnow().isoformat())}) + self.config(fast_track=True, group='deploy') + + def test_is_fast_track(self, mock_get_power): + mock_get_power.return_value = states.POWER_ON + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.assertTrue(conductor_utils.is_fast_track(task)) + + def test_is_fast_track_config_false(self, mock_get_power): + self.config(fast_track=False, group='deploy') + mock_get_power.return_value = states.POWER_ON + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.assertFalse(conductor_utils.is_fast_track(task)) + + def test_is_fast_track_power_off_false(self, mock_get_power): + mock_get_power.return_value = states.POWER_OFF + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.assertFalse(conductor_utils.is_fast_track(task)) + + def test_is_fast_track_no_heartbeat(self, mock_get_power): + mock_get_power.return_value = states.POWER_ON + i_info = self.node.driver_internal_info + i_info.pop('agent_last_heartbeat') + self.node.driver_internal_info = i_info + self.node.save() + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.assertFalse(conductor_utils.is_fast_track(task)) + + def test_is_fast_track_error_blocks(self, mock_get_power): + mock_get_power.return_value = states.POWER_ON + self.node.last_error = "bad things happened" + self.node.save() + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.assertFalse(conductor_utils.is_fast_track(task)) diff --git a/ironic/tests/unit/drivers/modules/test_agent.py b/ironic/tests/unit/drivers/modules/test_agent.py index 667e20e09d..3f2353045e 100644 --- a/ironic/tests/unit/drivers/modules/test_agent.py +++ b/ironic/tests/unit/drivers/modules/test_agent.py @@ -337,6 +337,39 @@ class TestAgentDeploy(db_base.DbTestCase): self.assertIsNone(driver_return) self.assertTrue(mock_pxe_instance.called) + @mock.patch.object(agent_client.AgentClient, 'prepare_image', + autospec=True) + @mock.patch('ironic.conductor.utils.is_fast_track', autospec=True) + @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True) + @mock.patch('ironic.conductor.utils.node_power_action', autospec=True) + def test_deploy_fast_track(self, power_mock, mock_pxe_instance, + mock_is_fast_track, prepare_image_mock): + mock_is_fast_track.return_value = True + self.node.target_provision_state = states.ACTIVE + self.node.provision_state = states.DEPLOYING + test_temp_url = 'http://image' + expected_image_info = { + 'urls': [test_temp_url], + 'id': 'fake-image', + 'node_uuid': self.node.uuid, + 'checksum': 'checksum', + 'disk_format': 'qcow2', + 'container_format': 'bare', + 'stream_raw_images': CONF.agent.stream_raw_images, + } + self.node.save() + with task_manager.acquire( + self.context, self.node['uuid'], shared=False) as task: + self.driver.deploy(task) + self.assertFalse(power_mock.called) + self.assertFalse(mock_pxe_instance.called) + task.node.refresh() + prepare_image_mock.assert_called_with(mock.ANY, task.node, + expected_image_info) + self.assertEqual(states.DEPLOYWAIT, task.node.provision_state) + self.assertEqual(states.ACTIVE, + task.node.target_provision_state) + @mock.patch.object(noop_storage.NoopStorage, 'detach_volumes', autospec=True) @mock.patch.object(flat_network.FlatNetwork, @@ -778,6 +811,52 @@ class TestAgentDeploy(db_base.DbTestCase): build_options_mock.assert_not_called() pxe_prepare_ramdisk_mock.assert_not_called() + @mock.patch('ironic.conductor.utils.is_fast_track', autospec=True) + @mock.patch.object(noop_storage.NoopStorage, 'attach_volumes', + autospec=True) + @mock.patch.object(deploy_utils, 'populate_storage_driver_internal_info') + @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk') + @mock.patch.object(deploy_utils, 'build_agent_options') + @mock.patch.object(deploy_utils, 'build_instance_info_for_deploy') + @mock.patch.object(flat_network.FlatNetwork, 'add_provisioning_network', + spec_set=True, autospec=True) + @mock.patch.object(flat_network.FlatNetwork, + 'unconfigure_tenant_networks', + spec_set=True, autospec=True) + @mock.patch.object(flat_network.FlatNetwork, 'validate', + spec_set=True, autospec=True) + def test_prepare_fast_track( + self, validate_net_mock, + unconfigure_tenant_net_mock, add_provisioning_net_mock, + build_instance_info_mock, build_options_mock, + pxe_prepare_ramdisk_mock, storage_driver_info_mock, + storage_attach_volumes_mock, is_fast_track_mock): + # TODO(TheJulia): We should revisit this test. Smartnic + # support didn't wire in tightly on testing for power in + # these tests, and largely fast_track impacts power operations. + node = self.node + node.network_interface = 'flat' + node.save() + is_fast_track_mock.return_value = True + with task_manager.acquire( + self.context, self.node['uuid'], shared=False) as task: + task.node.provision_state = states.DEPLOYING + build_options_mock.return_value = {'a': 'b'} + self.driver.prepare(task) + storage_driver_info_mock.assert_called_once_with(task) + validate_net_mock.assert_called_once_with(mock.ANY, task) + add_provisioning_net_mock.assert_called_once_with(mock.ANY, task) + unconfigure_tenant_net_mock.assert_called_once_with(mock.ANY, task) + self.assertTrue(storage_attach_volumes_mock.called) + self.assertTrue(build_instance_info_mock.called) + # TODO(TheJulia): We should likely consider executing the + # next two methods at some point in order to facilitate + # continuity. While not explicitly required for this feature + # to work, reboots as part of deployment would need the ramdisk + # present and ready. + self.assertFalse(build_options_mock.called) + self.assertFalse(pxe_prepare_ramdisk_mock.called) + @mock.patch('ironic.common.dhcp_factory.DHCPFactory._set_dhcp_provider') @mock.patch('ironic.common.dhcp_factory.DHCPFactory.clean_dhcp') @mock.patch.object(pxe.PXEBoot, 'clean_up_instance') diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index 8de0c569fd..ff634a8f5c 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -355,8 +355,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): task.node.driver_internal_info['agent_url']) self.assertEqual('3.2.0', task.node.driver_internal_info['agent_version']) + self.assertIsNotNone( + task.node.driver_internal_info['agent_last_heartbeat']) mock_touch.assert_called_once_with(mock.ANY) + def test_heartbeat_records_fast_track(self): + self.config(fast_track=True, group='deploy') + for provision_state in [states.ENROLL, states.MANAGEABLE, + states.AVAILABLE]: + self.node.driver_internal_info = {} + self.node.provision_state = provision_state + self.node.save() + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + self.deploy.heartbeat(task, 'http://127.0.0.1:8080', '3.2.0') + self.assertEqual('http://127.0.0.1:8080', + task.node.driver_internal_info['agent_url']) + self.assertEqual('3.2.0', + task.node.driver_internal_info[ + 'agent_version']) + self.assertIsNotNone( + task.node.driver_internal_info['agent_last_heartbeat']) + self.assertEqual(provision_state, task.node.provision_state) + class AgentRescueTests(AgentDeployMixinBaseTest): diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index 179230290b..86cfd3f7f9 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -1699,6 +1699,7 @@ class AgentMethodsTestCase(db_base.DbTestCase): self.assertEqual(8, task.node.driver_internal_info[ 'disk_erasure_concurrency']) + @mock.patch('ironic.conductor.utils.is_fast_track', autospec=True) @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk', autospec=True) @mock.patch('ironic.conductor.utils.node_power_action', autospec=True) @mock.patch.object(utils, 'build_agent_options', autospec=True) @@ -1707,15 +1708,19 @@ class AgentMethodsTestCase(db_base.DbTestCase): def _test_prepare_inband_cleaning( self, add_cleaning_network_mock, build_options_mock, power_mock, prepare_ramdisk_mock, - manage_boot=True): + is_fast_track_mock, manage_boot=True, fast_track=False): build_options_mock.return_value = {'a': 'b'} + is_fast_track_mock.return_value = fast_track with task_manager.acquire( self.context, self.node.uuid, shared=False) as task: self.assertEqual( states.CLEANWAIT, utils.prepare_inband_cleaning(task, manage_boot=manage_boot)) add_cleaning_network_mock.assert_called_once_with(task) - power_mock.assert_called_once_with(task, states.REBOOT) + if not fast_track: + power_mock.assert_called_once_with(task, states.REBOOT) + else: + self.assertFalse(power_mock.called) self.assertEqual(1, task.node.driver_internal_info[ 'agent_erase_devices_iterations']) self.assertIs(True, task.node.driver_internal_info[ @@ -1734,17 +1739,26 @@ class AgentMethodsTestCase(db_base.DbTestCase): def test_prepare_inband_cleaning_manage_boot_false(self): self._test_prepare_inband_cleaning(manage_boot=False) + def test_prepare_inband_cleaning_fast_track(self): + self._test_prepare_inband_cleaning(fast_track=True) + + @mock.patch('ironic.conductor.utils.is_fast_track', autospec=True) @mock.patch.object(pxe.PXEBoot, 'clean_up_ramdisk', autospec=True) @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.' 'remove_cleaning_network') @mock.patch('ironic.conductor.utils.node_power_action', autospec=True) def _test_tear_down_inband_cleaning( self, power_mock, remove_cleaning_network_mock, - clean_up_ramdisk_mock, manage_boot=True): + clean_up_ramdisk_mock, is_fast_track_mock, + manage_boot=True, fast_track=False): + is_fast_track_mock.return_value = fast_track with task_manager.acquire( self.context, self.node.uuid, shared=False) as task: utils.tear_down_inband_cleaning(task, manage_boot=manage_boot) - power_mock.assert_called_once_with(task, states.POWER_OFF) + if not fast_track: + power_mock.assert_called_once_with(task, states.POWER_OFF) + else: + self.assertFalse(power_mock.called) remove_cleaning_network_mock.assert_called_once_with(task) if manage_boot: clean_up_ramdisk_mock.assert_called_once_with( @@ -1758,6 +1772,9 @@ class AgentMethodsTestCase(db_base.DbTestCase): def test_tear_down_inband_cleaning_manage_boot_false(self): self._test_tear_down_inband_cleaning(manage_boot=False) + def test_tear_down_inband_cleaning_fast_track(self): + self._test_tear_down_inband_cleaning(fast_track=True) + def test_build_agent_options_conf(self): self.config(api_url='https://api-url', group='conductor') options = utils.build_agent_options(self.node) diff --git a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py index 01cf03040f..054a81e50d 100644 --- a/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py +++ b/ironic/tests/unit/drivers/modules/test_iscsi_deploy.py @@ -734,6 +734,53 @@ class ISCSIDeployTestCase(db_base.DbTestCase): task.driver.storage, task) self.assertEqual(2, storage_should_write_mock.call_count) + @mock.patch('ironic.conductor.utils.is_fast_track', autospec=True) + @mock.patch.object(noop_storage.NoopStorage, 'attach_volumes', + autospec=True) + @mock.patch.object(deploy_utils, 'populate_storage_driver_internal_info') + @mock.patch.object(pxe.PXEBoot, 'prepare_ramdisk') + @mock.patch.object(deploy_utils, 'build_agent_options') + @mock.patch.object(deploy_utils, 'build_instance_info_for_deploy') + @mock.patch.object(flat_network.FlatNetwork, 'add_provisioning_network', + spec_set=True, autospec=True) + @mock.patch.object(flat_network.FlatNetwork, + 'unconfigure_tenant_networks', + spec_set=True, autospec=True) + @mock.patch.object(flat_network.FlatNetwork, 'validate', + spec_set=True, autospec=True) + def test_prepare_fast_track( + self, validate_net_mock, + unconfigure_tenant_net_mock, add_provisioning_net_mock, + build_instance_info_mock, build_options_mock, + pxe_prepare_ramdisk_mock, storage_driver_info_mock, + storage_attach_volumes_mock, is_fast_track_mock): + # TODO(TheJulia): We should revisit this test. Smartnic + # support didn't wire in tightly on testing for power in + # these tests, and largely fast_track impacts power operations. + node = self.node + node.network_interface = 'flat' + node.save() + is_fast_track_mock.return_value = True + with task_manager.acquire( + self.context, self.node['uuid'], shared=False) as task: + task.node.provision_state = states.DEPLOYING + build_options_mock.return_value = {'a': 'b'} + task.driver.deploy.prepare(task) + storage_driver_info_mock.assert_called_once_with(task) + # NOTE: Validate is the primary difference between agent/iscsi + self.assertFalse(validate_net_mock.called) + add_provisioning_net_mock.assert_called_once_with(mock.ANY, task) + unconfigure_tenant_net_mock.assert_called_once_with(mock.ANY, task) + self.assertTrue(storage_attach_volumes_mock.called) + self.assertFalse(build_instance_info_mock.called) + # TODO(TheJulia): We should likely consider executing the + # next two methods at some point in order to facilitate + # continuity. While not explicitly required for this feature + # to work, reboots as part of deployment would need the ramdisk + # present and ready. + self.assertFalse(build_options_mock.called) + self.assertFalse(pxe_prepare_ramdisk_mock.called) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) @mock.patch.object(iscsi_deploy, 'check_image_size', autospec=True) @mock.patch.object(deploy_utils, 'cache_instance_image', autospec=True) @@ -787,6 +834,36 @@ class ISCSIDeployTestCase(db_base.DbTestCase): self.assertEqual(2, mock_node_power_action.call_count) self.assertEqual(states.DEPLOYING, task.node.provision_state) + @mock.patch.object(iscsi_deploy, 'check_image_size', autospec=True) + @mock.patch.object(deploy_utils, 'cache_instance_image', autospec=True) + @mock.patch.object(iscsi_deploy.ISCSIDeploy, 'continue_deploy', + autospec=True) + @mock.patch('ironic.conductor.utils.is_fast_track', autospec=True) + @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True) + @mock.patch('ironic.conductor.utils.node_power_action', autospec=True) + def test_deploy_fast_track(self, power_mock, mock_pxe_instance, + mock_is_fast_track, continue_deploy_mock, + cache_image_mock, check_image_size_mock): + mock_is_fast_track.return_value = True + self.node.target_provision_state = states.ACTIVE + self.node.provision_state = states.DEPLOYING + i_info = self.node.driver_internal_info + i_info['agent_url'] = 'http://1.2.3.4:1234' + self.node.driver_internal_info = i_info + self.node.save() + with task_manager.acquire( + self.context, self.node['uuid'], shared=False) as task: + task.driver.deploy.deploy(task) + self.assertFalse(power_mock.called) + self.assertFalse(mock_pxe_instance.called) + task.node.refresh() + self.assertEqual(states.DEPLOYWAIT, task.node.provision_state) + self.assertEqual(states.ACTIVE, + task.node.target_provision_state) + cache_image_mock.assert_called_with(mock.ANY, task.node) + check_image_size_mock.assert_called_with(task) + continue_deploy_mock.assert_called_with(mock.ANY, task) + @mock.patch.object(noop_storage.NoopStorage, 'detach_volumes', autospec=True) @mock.patch.object(flat_network.FlatNetwork, diff --git a/releasenotes/notes/fast-track-deployment-f09a8b921b3aae36.yaml b/releasenotes/notes/fast-track-deployment-f09a8b921b3aae36.yaml new file mode 100644 index 0000000000..60ef8a3b0b --- /dev/null +++ b/releasenotes/notes/fast-track-deployment-f09a8b921b3aae36.yaml @@ -0,0 +1,16 @@ +--- +features: + - | + Adds a new feature called `fast-track` which allows an operator to + optionally configure the Bare Metal API Service and the Bare Metal + conductor service to permit lookup and heartbeat for nodes that are + in the process of being enrolled and created. + + These nodes can be left online, from a process such as discovery. + If ironic-python-agent has communicated with the Bare Metal Service + API endpoint with-in the last `300` seconds, then setup steps that + are normally involved with preparing to launch a ramdisk on the node, + are skipped along with power operations to enable a baremetal node to + undergo discovery through to deployment with a single power cycle. + Fast track functionality may be enabled through the ``[deploy]fast_track`` + option. diff --git a/zuul.d/ironic-jobs.yaml b/zuul.d/ironic-jobs.yaml index 24a0ebdd01..b719b24506 100644 --- a/zuul.d/ironic-jobs.yaml +++ b/zuul.d/ironic-jobs.yaml @@ -466,3 +466,14 @@ Run python 3 unit tests with driver dependencies installed. vars: tox_envlist: unit-with-driver-libs-python3 + +- job: + name: ironic-inspector-tempest-discovery-fast-track + description: ironic-inspector-tempest-discovery-fast-track + parent: ironic-inspector-tempest-discovery + vars: + tempest_test_regex: BareMetalFastTrackTest + devstack_localrc: + IRONIC_INSPECTOR_POWER_OFF: False + IRONIC_DEPLOY_FAST_TRACK: True + IRONIC_DEPLOY_FAST_TRACK_CLEANING: True