Merge "Agent vendor handles manual cleaning"
This commit is contained in:
commit
15ec41070e
ironic
conductor
drivers/modules
tests/unit
@ -646,10 +646,18 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
state=node.provision_state)
|
||||
self._do_node_clean(task)
|
||||
|
||||
def _get_node_next_clean_steps(self, task):
|
||||
def _get_node_next_clean_steps(self, task, skip_current_step=True):
|
||||
"""Get the task's node's next clean steps.
|
||||
|
||||
This returns the list of remaining (ordered) clean steps to be done
|
||||
on the node. If no clean steps have been started yet, all the clean
|
||||
steps are returned. Otherwise, the clean steps after the current
|
||||
clean step are returned. The current clean step is also returned if
|
||||
'skip_current_step' is False.
|
||||
|
||||
:param task: A TaskManager object
|
||||
:param skip_current_step: True to skip the current clean step; False to
|
||||
include it.
|
||||
:raises: NodeCleaningFailure if an internal error occurred when
|
||||
getting the next clean steps
|
||||
:returns: ordered list of clean step dictionaries
|
||||
@ -662,9 +670,12 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
return next_steps
|
||||
|
||||
try:
|
||||
# Trim off the last clean step (now finished) and
|
||||
# all previous steps
|
||||
next_steps = next_steps[next_steps.index(node.clean_step) + 1:]
|
||||
# Trim off all previous steps up to (and maybe including) the
|
||||
# current clean step.
|
||||
ind = next_steps.index(node.clean_step)
|
||||
if skip_current_step:
|
||||
ind += 1
|
||||
next_steps = next_steps[ind:]
|
||||
except ValueError:
|
||||
msg = (_('Node %(node)s got an invalid last step for '
|
||||
'%(state)s: %(step)s.') %
|
||||
@ -782,7 +793,17 @@ class ConductorManager(base_manager.BaseConductorManager):
|
||||
'state': node.provision_state,
|
||||
'clean_state': states.CLEANWAIT})
|
||||
|
||||
next_steps = self._get_node_next_clean_steps(task)
|
||||
info = node.driver_internal_info
|
||||
try:
|
||||
skip_current_step = info.pop('skip_current_clean_step')
|
||||
except KeyError:
|
||||
skip_current_step = True
|
||||
else:
|
||||
node.driver_internal_info = info
|
||||
node.save()
|
||||
|
||||
next_steps = self._get_node_next_clean_steps(
|
||||
task, skip_current_step=skip_current_step)
|
||||
|
||||
# If this isn't the final clean step in the cleaning operation
|
||||
# and it is flagged to abort after the clean step that just
|
||||
|
@ -218,14 +218,18 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
def continue_cleaning(self, task, **kwargs):
|
||||
"""Start the next cleaning step if the previous one is complete.
|
||||
|
||||
In order to avoid errors and make agent upgrades painless, cleaning
|
||||
will check the version of all hardware managers during get_clean_steps
|
||||
at the beginning of cleaning and before executing each step in the
|
||||
agent. If the version has changed between steps, the agent is unable
|
||||
to tell if an ordering change will cause a cleaning issue. Therefore,
|
||||
we restart cleaning.
|
||||
In order to avoid errors and make agent upgrades painless, the agent
|
||||
compares the version of all hardware managers at the start of the
|
||||
cleaning (the agent's get_clean_steps() call) and before executing
|
||||
each clean step. If the version has changed between steps, the agent is
|
||||
unable to tell if an ordering change will cause a cleaning issue so
|
||||
it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart
|
||||
the entire cleaning cycle. For manual cleaning, we don't.
|
||||
"""
|
||||
node = task.node
|
||||
# For manual clean, the target provision state is MANAGEABLE, whereas
|
||||
# for automated cleaning, it is (the default) AVAILABLE.
|
||||
manual_clean = node.target_provision_state == states.MANAGEABLE
|
||||
command = self._get_completed_cleaning_command(task)
|
||||
LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
|
||||
' %(command)s', {'node': node.uuid,
|
||||
@ -245,20 +249,57 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
LOG.error(msg)
|
||||
return manager_utils.cleaning_error_handler(task, msg)
|
||||
elif command.get('command_status') == 'CLEAN_VERSION_MISMATCH':
|
||||
# Restart cleaning, agent must have rebooted to new version
|
||||
LOG.info(_LI('Node %s detected a clean version mismatch, '
|
||||
'resetting clean steps and rebooting the node.'),
|
||||
node.uuid)
|
||||
try:
|
||||
manager_utils.set_node_cleaning_steps(task)
|
||||
except exception.NodeCleaningFailure:
|
||||
msg = (_('Could not restart cleaning on node %(node)s: '
|
||||
'%(err)s.') %
|
||||
{'node': node.uuid,
|
||||
'err': command.get('command_error'),
|
||||
'step': node.clean_step})
|
||||
LOG.exception(msg)
|
||||
return manager_utils.cleaning_error_handler(task, msg)
|
||||
if manual_clean:
|
||||
# Don't restart manual cleaning if agent reboots to a new
|
||||
# version. Both are operator actions, unlike automated
|
||||
# cleaning. Manual clean steps are not necessarily idempotent
|
||||
# like automated clean steps and can be even longer running.
|
||||
LOG.info(_LI('During manual cleaning, node %(node)s detected '
|
||||
'a clean version mismatch. Re-executing and '
|
||||
'continuing from current step %(step)s.'),
|
||||
{'node': node.uuid, 'step': node.clean_step})
|
||||
|
||||
result = self._client.get_clean_steps(
|
||||
task.node, task.ports).get('command_result')
|
||||
|
||||
required_keys = ('clean_steps', 'hardware_manager_version')
|
||||
missing_keys = [key for key in required_keys
|
||||
if key not in result]
|
||||
if missing_keys:
|
||||
msg = (_('Could not continue manual cleaning from step '
|
||||
'%(step)s on node %(node)s. get_clean_steps '
|
||||
'returned invalid result. The keys %(keys)s are '
|
||||
'missing from result %(result)s.') %
|
||||
{'node': node.uuid,
|
||||
'step': node.clean_step,
|
||||
'keys': missing_keys,
|
||||
'result': result})
|
||||
LOG.error(msg)
|
||||
return manager_utils.cleaning_error_handler(task, msg)
|
||||
|
||||
driver_internal_info = node.driver_internal_info
|
||||
driver_internal_info['hardware_manager_version'] = result[
|
||||
'hardware_manager_version']
|
||||
driver_internal_info['skip_current_clean_step'] = False
|
||||
node.driver_internal_info = driver_internal_info
|
||||
node.save()
|
||||
else:
|
||||
# Restart cleaning, agent must have rebooted to new version
|
||||
LOG.info(_LI('During automated cleaning, node %s detected a '
|
||||
'clean version mismatch. Resetting clean steps '
|
||||
'and rebooting the node.'),
|
||||
node.uuid)
|
||||
try:
|
||||
manager_utils.set_node_cleaning_steps(task)
|
||||
except exception.NodeCleaningFailure:
|
||||
msg = (_('Could not restart automated cleaning on node '
|
||||
'%(node)s: %(err)s.') %
|
||||
{'node': node.uuid,
|
||||
'err': command.get('command_error'),
|
||||
'step': node.clean_step})
|
||||
LOG.exception(msg)
|
||||
return manager_utils.cleaning_error_handler(task, msg)
|
||||
|
||||
self.notify_conductor_resume_clean(task)
|
||||
|
||||
elif command.get('command_status') == 'SUCCEEDED':
|
||||
@ -354,7 +395,8 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
if not node.clean_step:
|
||||
LOG.debug('Node %s just booted to start cleaning.',
|
||||
node.uuid)
|
||||
msg = _('Node failed to start the next cleaning step.')
|
||||
msg = _('Node failed to start the first cleaning '
|
||||
'step.')
|
||||
manager_utils.set_node_cleaning_steps(task)
|
||||
self.notify_conductor_resume_clean(task)
|
||||
else:
|
||||
@ -368,7 +410,7 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
except Exception as e:
|
||||
err_info = {'node': node.uuid, 'msg': msg, 'e': e}
|
||||
last_error = _('Asynchronous exception for node %(node)s: '
|
||||
'%(msg)s exception: %(e)s') % err_info
|
||||
'%(msg)s Exception: %(e)s') % err_info
|
||||
LOG.exception(last_error)
|
||||
if node.provision_state in (states.CLEANING, states.CLEANWAIT):
|
||||
manager_utils.cleaning_error_handler(task, last_error)
|
||||
|
@ -1543,6 +1543,35 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
def test_continue_node_clean_backward_compat(self):
|
||||
self._continue_node_clean(states.CLEANING)
|
||||
|
||||
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
|
||||
def _continue_node_clean_skip_step(self, mock_spawn, skip=True):
|
||||
# test that skipping current step mechanism works
|
||||
driver_info = {'clean_steps': self.clean_steps}
|
||||
if not skip:
|
||||
driver_info['skip_current_clean_step'] = skip
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake', provision_state=states.CLEANWAIT,
|
||||
target_provision_state=states.MANAGEABLE,
|
||||
driver_internal_info=driver_info, clean_step=self.clean_steps[0])
|
||||
self._start_service()
|
||||
self.service.continue_node_clean(self.context, node.uuid)
|
||||
self.service._worker_pool.waitall()
|
||||
node.refresh()
|
||||
if skip:
|
||||
expected_steps = self.next_clean_steps
|
||||
else:
|
||||
self.assertFalse(
|
||||
'skip_current_clean_step' in node.driver_internal_info)
|
||||
expected_steps = self.clean_steps
|
||||
mock_spawn.assert_called_with(self.service._do_next_clean_step,
|
||||
mock.ANY, expected_steps)
|
||||
|
||||
def test_continue_node_clean_skip_step(self):
|
||||
self._continue_node_clean_skip_step()
|
||||
|
||||
def test_continue_node_clean_no_skip_step(self):
|
||||
self._continue_node_clean_skip_step(skip=False)
|
||||
|
||||
def _continue_node_clean_abort(self, manual=False):
|
||||
last_clean_step = self.clean_steps[0]
|
||||
last_clean_step['abortable'] = False
|
||||
@ -2061,7 +2090,7 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
def test__do_next_clean_step_manual_bad_step_return_value(self):
|
||||
self._do_next_clean_step_bad_step_return_value(manual=True)
|
||||
|
||||
def test__get_node_next_clean_steps(self):
|
||||
def __get_node_next_clean_steps(self, skip=True):
|
||||
driver_internal_info = {'clean_steps': self.clean_steps}
|
||||
node = obj_utils.create_test_node(
|
||||
self.context, driver='fake',
|
||||
@ -2072,8 +2101,16 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin,
|
||||
clean_step=self.clean_steps[0])
|
||||
|
||||
with task_manager.acquire(self.context, node.uuid) as task:
|
||||
steps = self.service._get_node_next_clean_steps(task)
|
||||
self.assertEqual(self.next_clean_steps, steps)
|
||||
steps = self.service._get_node_next_clean_steps(
|
||||
task, skip_current_step=skip)
|
||||
expected = self.next_clean_steps if skip else self.clean_steps
|
||||
self.assertEqual(expected, steps)
|
||||
|
||||
def test__get_node_next_clean_steps(self):
|
||||
self.__get_node_next_clean_steps()
|
||||
|
||||
def test__get_node_next_clean_steps_no_skip(self):
|
||||
self.__get_node_next_clean_steps(skip=False)
|
||||
|
||||
def test__get_node_next_clean_steps_unset_clean_step(self):
|
||||
driver_internal_info = {'clean_steps': self.clean_steps}
|
||||
|
@ -319,7 +319,7 @@ class TestBaseAgentVendor(db_base.DbTestCase):
|
||||
log_mock.assert_called_once_with(
|
||||
'Asynchronous exception for node '
|
||||
'1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
|
||||
'is done. exception: LlamaException')
|
||||
'is done. Exception: LlamaException')
|
||||
|
||||
@mock.patch.object(agent_base_vendor.BaseAgentVendor, 'deploy_has_started',
|
||||
autospec=True)
|
||||
@ -354,7 +354,7 @@ class TestBaseAgentVendor(db_base.DbTestCase):
|
||||
log_mock.assert_called_once_with(
|
||||
'Asynchronous exception for node '
|
||||
'1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
|
||||
'is done. exception: LlamaException')
|
||||
'is done. Exception: LlamaException')
|
||||
|
||||
@mock.patch.object(objects.node.Node, 'touch_provisioning', autospec=True)
|
||||
@mock.patch.object(manager_utils, 'set_node_cleaning_steps', autospec=True)
|
||||
@ -897,21 +897,91 @@ class TestBaseAgentVendor(db_base.DbTestCase):
|
||||
@mock.patch.object(manager_utils, 'set_node_cleaning_steps', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
|
||||
'notify_conductor_resume_clean', autospec=True)
|
||||
@mock.patch.object(agent_client.AgentClient, 'get_clean_steps',
|
||||
autospec=True)
|
||||
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
|
||||
autospec=True)
|
||||
def test_continue_cleaning_clean_version_mismatch(
|
||||
self, status_mock, notify_mock, steps_mock):
|
||||
# Test that cleaning is restarted if there is a version mismatch
|
||||
def _test_continue_cleaning_clean_version_mismatch(
|
||||
self, status_mock, get_steps_mock, notify_mock, steps_mock,
|
||||
manual=False):
|
||||
get_steps_mock.return_value = {
|
||||
'command_status': 'CLEAN_VERSION_MISMATCH',
|
||||
'command_name': 'get_clean_step',
|
||||
'command_result': {
|
||||
'hardware_manager_version': {'Generic': '1'},
|
||||
'clean_steps': {
|
||||
'GenericHardwareManager': [
|
||||
{'interface': 'deploy',
|
||||
'step': 'erase_devices',
|
||||
'priority': 20}]}
|
||||
}
|
||||
}
|
||||
status_mock.return_value = [{
|
||||
'command_status': 'CLEAN_VERSION_MISMATCH',
|
||||
'command_name': 'execute_clean_step',
|
||||
'command_result': {}
|
||||
}]
|
||||
with task_manager.acquire(self.context, self.node['uuid'],
|
||||
tgt_prov_state = states.MANAGEABLE if manual else states.AVAILABLE
|
||||
self.node.provision_state = states.CLEANWAIT
|
||||
self.node.target_provision_state = tgt_prov_state
|
||||
self.node.save()
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=False) as task:
|
||||
self.passthru.continue_cleaning(task)
|
||||
steps_mock.assert_called_once_with(task)
|
||||
notify_mock.assert_called_once_with(mock.ANY, task)
|
||||
if manual:
|
||||
get_steps_mock.assert_called_once_with(mock.ANY, task.node,
|
||||
task.ports)
|
||||
self.assertFalse(
|
||||
task.node.driver_internal_info['skip_current_clean_step'])
|
||||
self.assertEqual(
|
||||
{'Generic': '1'},
|
||||
task.node.driver_internal_info['hardware_manager_version'])
|
||||
self.assertFalse(steps_mock.called)
|
||||
else:
|
||||
self.assertFalse(get_steps_mock.called)
|
||||
steps_mock.assert_called_once_with(task)
|
||||
self.assertFalse('skip_current_clean_step' in
|
||||
task.node.driver_internal_info)
|
||||
|
||||
def test_continue_cleaning_automated_clean_version_mismatch(self):
|
||||
self._test_continue_cleaning_clean_version_mismatch()
|
||||
|
||||
def test_continue_cleaning_manual_clean_version_mismatch(self):
|
||||
self._test_continue_cleaning_clean_version_mismatch(manual=True)
|
||||
|
||||
@mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
|
||||
'notify_conductor_resume_clean', autospec=True)
|
||||
@mock.patch.object(agent_client.AgentClient, 'get_clean_steps',
|
||||
autospec=True)
|
||||
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
|
||||
autospec=True)
|
||||
def test_continue_cleaning_manual_version_mismatch_bad(
|
||||
self, status_mock, get_steps_mock, notify_mock, error_mock):
|
||||
get_steps_mock.return_value = {
|
||||
'command_status': 'CLEAN_VERSION_MISMATCH',
|
||||
'command_name': 'get_clean_step',
|
||||
'command_result': {
|
||||
'hardware_manager_version': {'Generic': '1'}}
|
||||
}
|
||||
status_mock.return_value = [{
|
||||
'command_status': 'CLEAN_VERSION_MISMATCH',
|
||||
'command_name': 'execute_clean_step',
|
||||
}]
|
||||
tgt_prov_state = states.MANAGEABLE
|
||||
self.node.provision_state = states.CLEANWAIT
|
||||
self.node.target_provision_state = tgt_prov_state
|
||||
self.node.save()
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=False) as task:
|
||||
self.passthru.continue_cleaning(task)
|
||||
|
||||
get_steps_mock.assert_called_once_with(mock.ANY, task.node,
|
||||
task.ports)
|
||||
error_mock.assert_called_once_with(task, mock.ANY)
|
||||
self.assertFalse(notify_mock.called)
|
||||
self.assertFalse('skip_current_clean_step' in
|
||||
task.node.driver_internal_info)
|
||||
|
||||
@mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
|
||||
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
|
||||
|
Loading…
x
Reference in New Issue
Block a user