diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index 32427c6e71..fed6846707 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -137,6 +137,10 @@ class AgentClient(object): return result @METRICS.timer('AgentClient.get_commands_status') + @retrying.retry( + retry_on_exception=( + lambda e: isinstance(e, exception.AgentConnectionFailed)), + stop_max_attempt_number=CONF.agent.max_command_attempts) def get_commands_status(self, node): """Get command status from agent. @@ -166,7 +170,16 @@ class AgentClient(object): """ url = self._get_command_url(node) LOG.debug('Fetching status of agent commands for node %s', node.uuid) - resp = self.session.get(url, timeout=CONF.agent.command_timeout) + try: + resp = self.session.get(url, timeout=CONF.agent.command_timeout) + except (requests.ConnectionError, requests.Timeout) as e: + msg = (_('Failed to connect to the agent running on node %(node)s ' + 'to collect commands status. ' + 'Error: %(error)s') % + {'node': node.uuid, 'error': e}) + LOG.error(msg) + raise exception.AgentConnectionFailed(reason=msg) + result = resp.json()['commands'] status = '; '.join('%(cmd)s: result "%(res)s", error "%(err)s"' % {'cmd': r.get('command_name'), diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py index a39477f02b..78812b5a89 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_client.py +++ b/ironic/tests/unit/drivers/modules/test_agent_client.py @@ -191,6 +191,17 @@ class TestAgentClient(base.TestCase): 'api_version': CONF.agent.agent_api_version}, timeout=CONF.agent.command_timeout) + def test_get_commands_status_retries(self): + with mock.patch.object(self.client.session, 'get', + autospec=True) as mock_get: + res = mock.MagicMock(spec_set=['json']) + res.json.return_value = {'commands': []} + mock_get.side_effect = [ + requests.ConnectionError('boom'), + res] + self.assertEqual([], self.client.get_commands_status(self.node)) + self.assertEqual(2, mock_get.call_count) + def test_prepare_image(self): self.client._command = mock.MagicMock(spec_set=[]) image_info = {'image_id': 'image'} diff --git a/releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml b/releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml new file mode 100644 index 0000000000..df49202525 --- /dev/null +++ b/releasenotes/notes/agent-command-status-retry-f9b6f53a823c6b01.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixes an issue with the agent client code where checks of the agent + command status had no logic to prevent an intermittent or transient + connection failure from causing the entire operation to fail.