diff --git a/releasenotes/notes/bug-2112187-763bae283e0b736d.yaml b/releasenotes/notes/bug-2112187-763bae283e0b736d.yaml
new file mode 100644
index 000000000..630fbcfc5
--- /dev/null
+++ b/releasenotes/notes/bug-2112187-763bae283e0b736d.yaml
@@ -0,0 +1,49 @@
+---
+security:
+  - |
+    Watcher no longer forges requests on behalf of a tenant when moving
+    volumes. Prior to this release Watcher had two implementations for
+    moving a volume: it could use Cinder's volume migrate API, or its own
+    internal implementation that directly called Nova's volume attachment
+    update API. The former is safe and is the recommended way to move
+    volumes between Cinder storage backends; the internal implementation
+    was insecure, fragile due to a lack of error handling, and capable of
+    deleting user data.
+
+    Insecure: the internal volume migration operation created a new
+    keystone user with a weak name and password and added it to the
+    tenant's project with the admin role. It then used that user to forge
+    requests on behalf of the tenant, with admin rights, to swap the
+    volume. If the applier was restarted during the execution of this
+    operation, the temporary user was never cleaned up.
+
+    Fragile: the error handling was minimal. The swap volume API is
+    asynchronous, so Watcher has to poll for completion, and there was no
+    support for resuming the operation if it was interrupted or the
+    timeout was exceeded.
+
+    Data loss: while the internal polling logic returned success or
+    failure, Watcher did not check the result; once the function returned,
+    it unconditionally deleted the source volume. For larger volumes this
+    could result in irretrievable data loss.
+
+    Finally, if a volume was swapped using the internal workflow it left
+    the Nova instance in an out-of-sync state. If the VM was live migrated
+    after the swap completed successfully, and before a hard reboot, the
+    migration would either fail, or succeed and break tenant isolation.
+
+    See https://bugs.launchpad.net/nova/+bug/2112187 for details.
+fixes:
+  - |
+    All code related to creating keystone users and granting roles has
+    been removed. The internal swap volume implementation has been removed
+    and replaced by Cinder's volume migrate API. Note that as part of this
+    change Watcher will no longer attempt volume migrations or retypes if
+    the instance is in the `VERIFY_RESIZE` state. This resolves several
+    issues related to volume migration in the zone migration and storage
+    capacity balance strategies. While efforts have been made to maintain
+    backward compatibility, these changes are required to address a
+    security weakness in Watcher's prior approach.
+
+    See https://bugs.launchpad.net/nova/+bug/2112187 for more context.
+
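For context on the replacement path described in the note above: the supported operation is Cinder's volume migrate API, which keeps the volume ID stable and, for in-use volumes, lets Cinder coordinate the data move with Nova itself. Below is a minimal sketch of that API using python-cinderclient; the auth URL, credentials, volume ID and destination backend are illustrative placeholders, not values taken from this change.

# Sketch only: the cinder volume migrate API that replaces the internal
# swap workflow. All literal values below are placeholder assumptions.
from cinderclient import client as cinder_client
from keystoneauth1 import loading
from keystoneauth1 import session

loader = loading.get_plugin_loader('password')
auth = loader.load_from_options(
    auth_url='http://controller:5000/v3',          # assumed endpoint
    username='watcher', password='secret',         # assumed service credentials
    project_name='service',
    user_domain_id='default', project_domain_id='default')
cinder = cinder_client.Client('3', session=session.Session(auth=auth))

volume = cinder.volumes.get('a-volume-uuid')       # placeholder volume ID
# Cinder performs the move under the service's own identity, so no
# tenant-scoped credentials are required.
cinder.volumes.migrate_volume(volume, 'dest-host@backend#pool',
                              force_host_copy=False, lock_volume=True)

This is the same operation the applier now reaches through its cinder_helper wrapper instead of creating temporary keystone users.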
diff --git a/watcher/applier/actions/volume_migration.py b/watcher/applier/actions/volume_migration.py
index aa2466f87..8575eea47 100644
--- a/watcher/applier/actions/volume_migration.py
+++ b/watcher/applier/actions/volume_migration.py
@@ -17,14 +17,11 @@ import jsonschema
 
 from oslo_log import log
 
-from cinderclient import client as cinder_client
 from watcher._i18n import _
 from watcher.applier.actions import base
 from watcher.common import cinder_helper
 from watcher.common import exception
-from watcher.common import keystone_helper
 from watcher.common import nova_helper
-from watcher.common import utils
 from watcher import conf
 
 CONF = conf.CONF
@@ -70,8 +67,6 @@ class VolumeMigrate(base.BaseAction):
 
     def __init__(self, config, osc=None):
         super(VolumeMigrate, self).__init__(config)
-        self.temp_username = utils.random_string(10)
-        self.temp_password = utils.random_string(10)
         self.cinder_util = cinder_helper.CinderHelper(osc=self.osc)
         self.nova_util = nova_helper.NovaHelper(osc=self.osc)
 
@@ -134,83 +129,42 @@ class VolumeMigrate(base.BaseAction):
 
     def _can_swap(self, volume):
        """Judge volume can be swapped"""
+        # TODO(sean-k-mooney): rename this to _can_migrate and update
+        # the tests to reflect that.
+        # Cinder volume migration can migrate volumes that are not
+        # attached to instances, or nova can migrate the data on cinder's
+        # behalf if the volume is in-use. If the volume has no attachments
+        # allow cinder to decide if it can be migrated.
         if not volume.attachments:
-            return False
-        instance_id = volume.attachments[0]['server_id']
-        instance_status = self.nova_util.find_instance(instance_id).status
-
-        if (volume.status == 'in-use' and
-                instance_status in ('ACTIVE', 'PAUSED', 'RESIZED')):
+            LOG.debug(f"volume: {volume.id} has no attachments")
             return True
-        return False
-
-    def _create_user(self, volume, user):
-        """Create user with volume attribute and user information"""
-        keystone_util = keystone_helper.KeystoneHelper(osc=self.osc)
-        project_id = getattr(volume, 'os-vol-tenant-attr:tenant_id')
-        user['project'] = project_id
-        user['domain'] = keystone_util.get_project(project_id).domain_id
-        user['roles'] = ['admin']
-        return keystone_util.create_user(user)
-
-    def _get_cinder_client(self, session):
-        """Get cinder client by session"""
-        return cinder_client.Client(
-            CONF.cinder_client.api_version,
-            session=session,
-            endpoint_type=CONF.cinder_client.endpoint_type)
-
-    def _swap_volume(self, volume, dest_type):
-        """Swap volume to dest_type
-
-        Limitation note: only for compute libvirt driver
-        """
-        if not dest_type:
-            raise exception.Invalid(
-                message=(_("destination type is required when "
-                           "migration type is swap")))
-
-        if not self._can_swap(volume):
-            raise exception.Invalid(
-                message=(_("Invalid state for swapping volume")))
-
-        user_info = {
-            'name': self.temp_username,
-            'password': self.temp_password}
-        user = self._create_user(volume, user_info)
-        keystone_util = keystone_helper.KeystoneHelper(osc=self.osc)
-        try:
-            session = keystone_util.create_session(
-                user.id, self.temp_password)
-            temp_cinder = self._get_cinder_client(session)
-
-            # swap volume
-            new_volume = self.cinder_util.create_volume(
-                temp_cinder, volume, dest_type)
-            self.nova_util.swap_volume(volume, new_volume)
-
-            # delete old volume
-            self.cinder_util.delete_volume(volume)
-
-        finally:
-            keystone_util.delete_user(user)
-
-        return True
+        # Since it has attachments we need to validate nova's constraints.
+        instance_id = volume.attachments[0]['server_id']
+        instance_status = self.nova_util.find_instance(instance_id).status
+        LOG.debug(
+            f"volume: {volume.id} is attached to instance: {instance_id} "
+            f"in instance status: {instance_status}")
+        # NOTE(sean-k-mooney): this used to allow RESIZED, which maps to
+        # the VERIFY_RESIZE state; that is not an acceptable time to
+        # migrate volumes, and if nova does not block this in the API
+        # today that is probably a bug. PAUSED is also questionable but
+        # it should generally be safe.
+        return (volume.status == 'in-use' and
                instance_status in ('ACTIVE', 'PAUSED'))
 
     def _migrate(self, volume_id, dest_node, dest_type):
-
         try:
             volume = self.cinder_util.get_volume(volume_id)
-            if self.migration_type == self.SWAP:
-                if dest_node:
-                    LOG.warning("dest_node is ignored")
-                return self._swap_volume(volume, dest_type)
+            # For backward compatibility, map swap to migrate.
+            if self.migration_type in (self.SWAP, self.MIGRATE):
+                if not self._can_swap(volume):
+                    raise exception.Invalid(
+                        message=(_("Invalid state for swapping volume")))
+                return self.cinder_util.migrate(volume, dest_node)
             elif self.migration_type == self.RETYPE:
                 return self.cinder_util.retype(volume, dest_type)
-            elif self.migration_type == self.MIGRATE:
-                return self.cinder_util.migrate(volume, dest_node)
             else:
                 raise exception.Invalid(
                     message=(_("Migration of type '%(migration_type)s' is not "
diff --git a/watcher/common/keystone_helper.py b/watcher/common/keystone_helper.py
index 53ff8e2bd..3733fd1d6 100644
--- a/watcher/common/keystone_helper.py
+++ b/watcher/common/keystone_helper.py
@@ -15,8 +15,6 @@
 from oslo_log import log
 
 from keystoneauth1.exceptions import http as ks_exceptions
-from keystoneauth1 import loading
-from keystoneauth1 import session
 from watcher._i18n import _
 from watcher.common import clients
 from watcher.common import exception
@@ -90,35 +88,3 @@ class KeystoneHelper(object):
             raise exception.Invalid(
                 message=(_("Domain name seems ambiguous: %s") % name_or_id))
         return domains[0]
-
-    def create_session(self, user_id, password):
-        user = self.get_user(user_id)
-        loader = loading.get_plugin_loader('password')
-        auth = loader.load_from_options(
-            auth_url=CONF.watcher_clients_auth.auth_url,
-            password=password,
-            user_id=user_id,
-            project_id=user.default_project_id)
-        return session.Session(auth=auth)
-
-    def create_user(self, user):
-        project = self.get_project(user['project'])
-        domain = self.get_domain(user['domain'])
-        _user = self.keystone.users.create(
-            user['name'],
-            password=user['password'],
-            domain=domain,
-            project=project,
-        )
-        for role in user['roles']:
-            role = self.get_role(role)
-            self.keystone.roles.grant(
-                role.id, user=_user.id, project=project.id)
-        return _user
-
-    def delete_user(self, user):
-        try:
-            user = self.get_user(user)
-            self.keystone.users.delete(user)
-        except exception.Invalid:
-            pass
diff --git a/watcher/common/utils.py b/watcher/common/utils.py
index 5780e81f3..0d5076185 100644
--- a/watcher/common/utils.py
+++ b/watcher/common/utils.py
@@ -19,9 +19,7 @@
 import asyncio
 import datetime
 import inspect
-import random
 import re
-import string
 
 from croniter import croniter
 import eventlet
@@ -160,14 +158,10 @@ def extend_with_strict_schema(validator_class):
 StrictDefaultValidatingDraft4Validator = extend_with_default(
     extend_with_strict_schema(validators.Draft4Validator))
 
+
 Draft4Validator = validators.Draft4Validator
 
 
-def random_string(n):
-    return ''.join([random.choice(
-        string.ascii_letters + string.digits) for i in range(n)])
-
-
 # Some clients (e.g. MAAS) use asyncio, which isn't compatible with Eventlet.
 # As a workaround, we're delegating such calls to a native thread.
 def async_compat_call(f, *args, **kwargs):
diff --git a/watcher/decision_engine/strategy/strategies/zone_migration.py b/watcher/decision_engine/strategy/strategies/zone_migration.py
index 74eee043b..e8876754a 100644
--- a/watcher/decision_engine/strategy/strategies/zone_migration.py
+++ b/watcher/decision_engine/strategy/strategies/zone_migration.py
@@ -56,8 +56,19 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
         self.planned_cold_count = 0
         self.volume_count = 0
         self.planned_volume_count = 0
-        self.volume_update_count = 0
-        self.planned_volume_update_count = 0
+
+    # TODO(sean-k-mooney): this property is kept for backward
+    # compatibility with the swap code paths. Swap is now an
+    # alias for migrate, so we should clean this up in a
+    # future cycle.
+    @property
+    def volume_update_count(self):
+        return self.volume_count
+
+    # Same as above, clean up later.
+    @property
+    def planned_volume_update_count(self):
+        return self.planned_volume_count
 
     @classmethod
     def get_name(cls):
@@ -311,8 +322,8 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             planned_cold_migrate_instance_count=self.planned_cold_count,
             volume_migrate_count=self.volume_count,
             planned_volume_migrate_count=self.planned_volume_count,
-            volume_update_count=self.volume_update_count,
-            planned_volume_update_count=self.planned_volume_update_count
+            volume_update_count=self.volume_count,
+            planned_volume_update_count=self.planned_volume_count
         )
 
     def set_migration_count(self, targets):
@@ -327,10 +338,7 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             elif self.is_cold(instance):
                 self.cold_count += 1
         for volume in targets.get('volume', []):
-            if self.is_available(volume):
-                self.volume_count += 1
-            elif self.is_in_use(volume):
-                self.volume_update_count += 1
+            self.volume_count += 1
 
     def is_live(self, instance):
         status = getattr(instance, 'status')
@@ -403,19 +411,16 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             LOG.debug(src_type)
             LOG.debug("%s %s", dst_pool, dst_type)
 
-            if self.is_available(volume):
-                if src_type == dst_type:
-                    self._volume_migrate(volume, dst_pool)
-                else:
-                    self._volume_retype(volume, dst_type)
-            elif self.is_in_use(volume):
-                self._volume_update(volume, dst_type)
+            if src_type == dst_type:
+                self._volume_migrate(volume, dst_pool)
+            else:
+                self._volume_retype(volume, dst_type)
 
-                # if with_attached_volume is True, migrate attaching instances
-                if self.with_attached_volume:
-                    instances = [self.nova.find_instance(dic.get('server_id'))
-                                 for dic in volume.attachments]
-                    self.instances_migration(instances, action_counter)
+            # if with_attached_volume is True, migrate attaching instances
+            if self.with_attached_volume:
+                instances = [self.nova.find_instance(dic.get('server_id'))
+                             for dic in volume.attachments]
+                self.instances_migration(instances, action_counter)
 
             action_counter.add_pool(pool)
 
@@ -469,16 +474,6 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             input_parameters=parameters)
         self.planned_cold_count += 1
 
-    def _volume_update(self, volume, dst_type):
-        parameters = {"migration_type": "swap",
-                      "destination_type": dst_type,
-                      "resource_name": volume.name}
-        self.solution.add_action(
-            action_type="volume_migrate",
-            resource_id=volume.id,
-            input_parameters=parameters)
-        self.planned_volume_update_count += 1
-
     def _volume_migrate(self, volume, dst_pool):
         parameters = {"migration_type": "migrate",
                       "destination_node": dst_pool,
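To make the strategy change above concrete: an attached, in-use volume that previously produced a volume_migrate action with migration_type "swap" (emitted by the removed _volume_update helper) now goes through _volume_migrate or _volume_retype like any other volume. A minimal sketch of the parameters _volume_migrate attaches to the action is shown below; the destination pool and volume name are placeholder assumptions.

# Sketch only: input parameters of the volume_migrate action the strategy
# now emits for an attached volume (placeholder values).
dst_pool = "dest-host@backend#pool"    # assumed destination storage pool
volume_name = "my-volume"              # assumed volume display name

parameters = {
    "migration_type": "migrate",       # "swap" is no longer generated
    "destination_node": dst_pool,
    "resource_name": volume_name,
}
print(parameters)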
diff --git a/watcher/tests/applier/actions/test_volume_migration.py b/watcher/tests/applier/actions/test_volume_migration.py
index ae77e5578..d97957cf4 100644
--- a/watcher/tests/applier/actions/test_volume_migration.py
+++ b/watcher/tests/applier/actions/test_volume_migration.py
@@ -22,7 +22,6 @@
 from watcher.common import cinder_helper
 from watcher.common import clients
 from watcher.common import keystone_helper
 from watcher.common import nova_helper
-from watcher.common import utils as w_utils
 
 from watcher.tests import base
@@ -102,12 +101,15 @@ class TestMigration(base.TestCase):
 
     @staticmethod
     def fake_volume(**kwargs):
+        # FIXME(sean-k-mooney): we should be using real objects in this
+        # test, or at least something more representative of the real data
         volume = mock.MagicMock()
         volume.id = kwargs.get('id', TestMigration.VOLUME_UUID)
         volume.size = kwargs.get('size', '1')
         volume.status = kwargs.get('status', 'available')
         volume.snapshot_id = kwargs.get('snapshot_id', None)
         volume.availability_zone = kwargs.get('availability_zone', 'nova')
+        volume.attachments = kwargs.get('attachments', [])
         return volume
 
     @staticmethod
@@ -175,42 +177,14 @@ class TestMigration(base.TestCase):
             "storage1-typename",
         )
 
-    def test_swap_success(self):
-        volume = self.fake_volume(
-            status='in-use', attachments=[{'server_id': 'server_id'}])
-        self.m_n_helper.find_instance.return_value = self.fake_instance()
-
-        new_volume = self.fake_volume(id=w_utils.generate_uuid())
-        user = mock.Mock()
-        session = mock.MagicMock()
-        self.m_k_helper.create_user.return_value = user
-        self.m_k_helper.create_session.return_value = session
-        self.m_c_helper.get_volume.return_value = volume
-        self.m_c_helper.create_volume.return_value = new_volume
-
-        result = self.action_swap.execute()
-        self.assertTrue(result)
-
-        self.m_n_helper.swap_volume.assert_called_once_with(
-            volume,
-            new_volume
-        )
-        self.m_k_helper.delete_user.assert_called_once_with(user)
-
-    def test_swap_fail(self):
-        # _can_swap fail
-        instance = self.fake_instance(status='STOPPED')
-        self.m_n_helper.find_instance.return_value = instance
-
-        result = self.action_swap.execute()
-        self.assertFalse(result)
-
     def test_can_swap_success(self):
         volume = self.fake_volume(
-            status='in-use', attachments=[{'server_id': 'server_id'}])
-        instance = self.fake_instance()
+            status='in-use', attachments=[
+                {'server_id': TestMigration.INSTANCE_UUID}])
+        instance = self.fake_instance()
         self.m_n_helper.find_instance.return_value = instance
+
         result = self.action_swap._can_swap(volume)
         self.assertTrue(result)
 
@@ -219,16 +193,33 @@ class TestMigration(base.TestCase):
         result = self.action_swap._can_swap(volume)
         self.assertTrue(result)
 
-        instance = self.fake_instance(status='RESIZED')
-        self.m_n_helper.find_instance.return_value = instance
-        result = self.action_swap._can_swap(volume)
-        self.assertTrue(result)
-
     def test_can_swap_fail(self):
         volume = self.fake_volume(
-            status='in-use', attachments=[{'server_id': 'server_id'}])
+            status='in-use', attachments=[
+                {'server_id': TestMigration.INSTANCE_UUID}])
         instance = self.fake_instance(status='STOPPED')
         self.m_n_helper.find_instance.return_value = instance
         result = self.action_swap._can_swap(volume)
         self.assertFalse(result)
+
+        instance = self.fake_instance(status='RESIZED')
+        self.m_n_helper.find_instance.return_value = instance
+        result = self.action_swap._can_swap(volume)
+        self.assertFalse(result)
+
+    def test_swap_success(self):
+        volume = self.fake_volume(
+            status='in-use', attachments=[
+                {'server_id': TestMigration.INSTANCE_UUID}])
+        self.m_c_helper.get_volume.return_value = volume
+
+        instance = self.fake_instance()
+        self.m_n_helper.find_instance.return_value = instance
+
+        result = self.action_swap.execute()
+        self.assertTrue(result)
+        self.m_c_helper.migrate.assert_called_once_with(
+            volume,
+            "storage1-poolname"
+        )
diff --git a/watcher/tests/decision_engine/strategy/strategies/test_zone_migration.py b/watcher/tests/decision_engine/strategy/strategies/test_zone_migration.py
index 5f9b46d85..c02b21cd1 100644
--- a/watcher/tests/decision_engine/strategy/strategies/test_zone_migration.py
+++ b/watcher/tests/decision_engine/strategy/strategies/test_zone_migration.py
@@ -399,7 +399,10 @@ class TestZoneMigration(TestBaseStrategy):
         migration_types = collections.Counter(
             [action.get('input_parameters')['migration_type']
              for action in solution.actions])
-        self.assertEqual(1, migration_types.get("swap", 0))
+        # Watcher no longer implements swap; it is now an
+        # alias for migrate.
+        self.assertEqual(0, migration_types.get("swap", 0))
+        self.assertEqual(1, migration_types.get("migrate", 0))
 
         global_efficacy_value = solution.global_efficacy[3].get('value', 0)
         self.assertEqual(100, global_efficacy_value)
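As a final hedged sketch, a unit test along the following lines could pin down the backward-compatibility behaviour exercised above: an action created with the legacy "swap" migration type should be serviced by the cinder migrate helper and must not touch Nova's swap-volume API. It assumes the existing TestMigration fixtures (m_c_helper, m_n_helper, action_swap, fake_volume, fake_instance); only the final assertion is new.

    # Sketch only: would sit alongside the tests above and reuse their mocks.
    def test_swap_does_not_call_nova_swap_volume(self):
        volume = self.fake_volume(
            status='in-use',
            attachments=[{'server_id': TestMigration.INSTANCE_UUID}])
        self.m_c_helper.get_volume.return_value = volume
        self.m_n_helper.find_instance.return_value = self.fake_instance()

        self.assertTrue(self.action_swap.execute())
        # The legacy "swap" request is mapped to a cinder migration ...
        self.m_c_helper.migrate.assert_called_once_with(
            volume, "storage1-poolname")
        # ... and the removed nova-based swap path is never used.
        self.m_n_helper.swap_volume.assert_not_called()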