Remove too large configdrive for handling error

When configdrive is too large, a node object cannot be saved to DB. If
it happens, the node cannot moved to DEPLOYFAIL because saving the node
is prevented again by the large configdrive in the object. In this
case, the node gets stuck in DEPLOYING, which doesn't allow any state
transition.

This patch removes the configdrive from a node object when storing the
configdrive fails. This also catches ConfigInvalid exception, which is
mentioned in the docsting, and any unexpected exception from
_store_configdrive() to avoid getting a node stuck in DEPLOYING.

Change-Id: I83cf3e02622fc3ed8f5b5389f533e374c1b985f3
Closes-Bug: 1745630
This commit is contained in:
Hironori Shiina 2018-02-08 18:03:09 +09:00
parent 9ab04d0962
commit 927c487a0f
3 changed files with 90 additions and 5 deletions

View File

@ -50,6 +50,7 @@ import eventlet
from futurist import periodics
from futurist import waiters
from ironic_lib import metrics_utils
from oslo_db import exception as db_exception
from oslo_log import log
import oslo_messaging as messaging
from oslo_utils import excutils
@ -3171,6 +3172,7 @@ def _store_configdrive(node, configdrive):
i_info = node.instance_info
i_info['configdrive'] = configdrive
node.instance_info = i_info
node.save()
@METRICS.timer('do_node_deploy')
@ -3190,7 +3192,7 @@ def do_node_deploy(task, conductor_id, configdrive=None):
try:
if configdrive:
_store_configdrive(node, configdrive)
except exception.SwiftOperationError as e:
except (exception.SwiftOperationError, exception.ConfigInvalid) as e:
with excutils.save_and_reraise_exception():
handle_failure(
e, task,
@ -3198,6 +3200,26 @@ def do_node_deploy(task, conductor_id, configdrive=None):
'%(node)s to Swift'),
_('Failed to upload the configdrive to Swift. '
'Error: %s'))
except db_exception.DBDataError as e:
with excutils.save_and_reraise_exception():
# NOTE(hshiina): This error happens when the configdrive is
# too large. Remove the configdrive from the
# object to update DB successfully in handling
# the failure.
node.obj_reset_changes()
handle_failure(
e, task,
('Error while storing the configdrive for %(node)s into '
'the database: %(err)s'),
_("Failed to store the configdrive in the database. : %s"))
except Exception as e:
with excutils.save_and_reraise_exception():
handle_failure(
e, task,
('Unexpected error while preparing the configdrive for '
'node %(node)s'),
_("Failed to prepare the configdrive. Exception: %s"),
traceback=True)
try:
task.driver.deploy.prepare(task)

View File

@ -24,6 +24,7 @@ import datetime
import eventlet
import mock
from oslo_config import cfg
from oslo_db import exception as db_exception
import oslo_messaging as messaging
from oslo_utils import uuidutils
from oslo_versionedobjects import base as ovo_base
@ -49,7 +50,6 @@ from ironic.drivers.modules.network import flat as n_flat
from ironic import objects
from ironic.objects import base as obj_base
from ironic.objects import fields as obj_fields
from ironic.tests import base as tests_base
from ironic.tests.unit.conductor import mgr_utils
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.db import utils as db_utils
@ -1590,6 +1590,59 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
self.assertIsNotNone(node.last_error)
self.assertFalse(mock_deploy.called)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.deploy')
def test__do_node_deploy_configdrive_db_error(self, mock_deploy):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake',
provision_state=states.DEPLOYING,
target_provision_state=states.ACTIVE)
task = task_manager.TaskManager(self.context, node.uuid)
task.node.save()
expected_instance_info = dict(node.instance_info)
with mock.patch.object(dbapi.IMPL, 'update_node') as mock_db:
db_node = self.dbapi.get_node_by_uuid(node.uuid)
mock_db.side_effect = [db_exception.DBDataError('DB error'),
db_node, db_node]
self.assertRaises(db_exception.DBDataError,
manager.do_node_deploy, task,
self.service.conductor.id,
configdrive=b'fake config drive')
expected_instance_info.update(configdrive=b'fake config drive')
expected_calls = [
mock.call(node.uuid,
{'version': mock.ANY,
'instance_info': expected_instance_info}),
mock.call(node.uuid,
{'version': mock.ANY,
'provision_state': states.DEPLOYFAIL,
'target_provision_state': states.ACTIVE}),
mock.call(node.uuid,
{'version': mock.ANY,
'last_error': mock.ANY})]
self.assertEqual(expected_calls, mock_db.mock_calls)
self.assertFalse(mock_deploy.called)
@mock.patch.object(manager, '_store_configdrive')
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.deploy')
def test__do_node_deploy_configdrive_unexpected_error(self, mock_deploy,
mock_store):
self._start_service()
node = obj_utils.create_test_node(self.context, driver='fake',
provision_state=states.DEPLOYING,
target_provision_state=states.ACTIVE)
task = task_manager.TaskManager(self.context, node.uuid)
mock_store.side_effect = RuntimeError('unexpected')
self.assertRaises(RuntimeError,
manager.do_node_deploy, task,
self.service.conductor.id,
configdrive=b'fake config drive')
node.refresh()
self.assertEqual(states.DEPLOYFAIL, node.provision_state)
self.assertEqual(states.ACTIVE, node.target_provision_state)
self.assertIsNotNone(node.last_error)
self.assertFalse(mock_deploy.called)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.deploy')
def test__do_node_deploy_ok_2(self, mock_deploy):
# NOTE(rloo): a different way of testing for the same thing as in
@ -5717,16 +5770,17 @@ class ManagerSyncLocalStateTestCase(mgr_utils.CommonMixIn, db_base.DbTestCase):
@mock.patch.object(swift, 'SwiftAPI')
class StoreConfigDriveTestCase(tests_base.TestCase):
class StoreConfigDriveTestCase(db_base.DbTestCase):
def setUp(self):
super(StoreConfigDriveTestCase, self).setUp()
self.node = obj_utils.get_test_node(self.context, driver='fake',
instance_info=None)
self.node = obj_utils.create_test_node(self.context, driver='fake',
instance_info=None)
def test_store_configdrive(self, mock_swift):
manager._store_configdrive(self.node, 'foo')
expected_instance_info = {'configdrive': 'foo'}
self.node.refresh()
self.assertEqual(expected_instance_info, self.node.instance_info)
self.assertFalse(mock_swift.called)
@ -5754,6 +5808,7 @@ class StoreConfigDriveTestCase(tests_base.TestCase):
object_headers=expected_obj_header)
mock_swift.return_value.get_temp_url.assert_called_once_with(
container_name, expected_obj_name, timeout)
self.node.refresh()
self.assertEqual(expected_instance_info, self.node.instance_info)

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes a bug to get a node stuck in ``deploying`` state when the size of
the configdrive exceeds the limitation of the database. In MySQL, the
limitation is about 64KiB. With this fix, the provision state gets
``deploy failed`` in this case. See `bug 1745630
<https://bugs.launchpad.net/ironic/+bug/1745630>`_ for details.