use cinder migrate for swap volume

This change removes watcher's in-tree functionality
for swapping instance volumes and defines swap as an alias
of cinder volume migrate.

The watcher native implementation lacked error handling,
which could lead to irretrievable data loss.

The removed code also forged project user credentials to
perform admin requests as if they were made by a member of the
project. This was unsafe and posed a security risk due to how it
was implemented. That code has been removed without replacement.

While some effort has been made to keep existing
audit definitions working, any reduction of functionality
as a result of this security hardening is intentional.
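
For context, the cinder operation that a "swap" action now resolves to
is roughly the following. This is a minimal illustrative sketch using
python-cinderclient, not code from this change; the session object,
volume ID and destination backend host are placeholder assumptions:

    from cinderclient import client as cinder_client

    # 'keystone_session' is an assumed, already authenticated
    # keystoneauth1 session; '3' selects the block storage v3 API.
    cinder = cinder_client.Client('3', session=keystone_session)

    # Ask cinder to move the volume to another backend host. For an
    # in-use volume, cinder delegates the data copy to nova.
    cinder.volumes.migrate_volume(
        'VOLUME_UUID',              # placeholder volume ID
        'dest-host@backend#pool',   # placeholder cinder backend host
        force_host_copy=False,
        lock_volume=False)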

Closes-Bug: #2112187
Change-Id: Ic3b6bfd164e272d70fe86d7b182478dd962f8ac0
Signed-off-by: Sean Mooney <work@seanmooney.info>


@@ -0,0 +1,47 @@
---
security:
  - |
    Watcher no longer forges requests on behalf of a tenant when
    swapping volumes. Prior to this release watcher had two implementations
    for moving a volume: it could use cinder's volume migrate API or its own
    internal implementation that directly called nova's volume attachment
    update API. The former is safe and the recommended way to move volumes
    between cinder storage backends; the internal implementation was
    insecure, fragile due to a lack of error handling, and capable of
    deleting user data.
    Insecure: the internal volume migration operation created a new keystone
    user with a weak name and password and added it to the tenant's project
    with the admin role. It then used that user to forge requests on behalf
    of the tenant with admin rights to swap the volume. If the applier was
    restarted during the execution of this operation the user was never
    cleaned up.
    Fragile: the error handling was minimal. The swap volume API is async,
    so watcher has to poll for completion, but there was no support to
    resume polling if it was interrupted or the timeout was exceeded.
    Data loss: while the internal polling logic returned success or failure,
    watcher did not check the result; once the function returned it
    unconditionally deleted the source volume. For larger volumes this
    could result in irretrievable data loss.
    Finally, if a volume was swapped using the internal workflow it put
    the nova instance in an out-of-sync state. If the VM was live migrated
    after the swap volume completed successfully and prior to a hard reboot,
    the migration would either fail, or succeed and break tenant isolation.
    See https://bugs.launchpad.net/nova/+bug/2112187 for details.
fixes:
  - |
    All code related to creating keystone users and granting roles has been
    removed. The internal swap volume implementation has been removed and
    replaced by cinder's volume migrate API. Note that as part of this
    change Watcher will no longer attempt volume migrations or retypes if
    the instance is in the `Verify Resize` task state. This resolves
    several issues related to volume migration in the zone migration and
    storage capacity balance strategies. While efforts have been made
    to maintain backward compatibility, these changes are required to
    address a security weakness in watcher's prior approach.
    See https://bugs.launchpad.net/nova/+bug/2112187 for more context.
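
To make the polling concern above concrete: a careful consumer of the
async migrate API has to wait on the volume's migration status before
touching the source volume. The following is an illustrative sketch
only, not part of watcher; it assumes an authenticated python-cinderclient
Client and an admin caller that can see the migration_status field:

    import time

    def wait_for_volume_migration(cinder, volume_id,
                                  timeout=3600, interval=10):
        """Poll a volume until cinder reports the migration finished."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            vol = cinder.volumes.get(volume_id)
            # migration_status is an admin-only field; typical values
            # include 'migrating', 'success' and 'error'.
            status = getattr(vol, 'migration_status', None)
            if status == 'success':
                return vol
            if status == 'error':
                raise RuntimeError(
                    "migration of volume %s failed" % volume_id)
            time.sleep(interval)
        raise TimeoutError(
            "migration of volume %s did not complete in time" % volume_id)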


@@ -17,14 +17,11 @@ import jsonschema
 from oslo_log import log
 
-from cinderclient import client as cinder_client
 from watcher._i18n import _
 from watcher.applier.actions import base
 from watcher.common import cinder_helper
 from watcher.common import exception
-from watcher.common import keystone_helper
 from watcher.common import nova_helper
-from watcher.common import utils
 from watcher import conf
 
 
 CONF = conf.CONF
@@ -70,8 +67,6 @@ class VolumeMigrate(base.BaseAction):
     def __init__(self, config, osc=None):
         super(VolumeMigrate, self).__init__(config)
-        self.temp_username = utils.random_string(10)
-        self.temp_password = utils.random_string(10)
         self.cinder_util = cinder_helper.CinderHelper(osc=self.osc)
         self.nova_util = nova_helper.NovaHelper(osc=self.osc)
@@ -134,83 +129,42 @@ class VolumeMigrate(base.BaseAction):
     def _can_swap(self, volume):
         """Judge volume can be swapped"""
+        # TODO(sean-k-mooney): rename this to _can_migrate and update
+        # tests to reflect that.
+        # cinder volume migration can migrate volumes that are not
+        # attached to instances or nova can migrate the data for cinder
+        # if the volume is in-use. If the volume has no attachments
+        # allow cinder to decide if it can be migrated.
         if not volume.attachments:
-            return False
-        instance_id = volume.attachments[0]['server_id']
-        instance_status = self.nova_util.find_instance(instance_id).status
-        if (volume.status == 'in-use' and
-                instance_status in ('ACTIVE', 'PAUSED', 'RESIZED')):
-            return True
-        return False
-
-    def _create_user(self, volume, user):
-        """Create user with volume attribute and user information"""
-        keystone_util = keystone_helper.KeystoneHelper(osc=self.osc)
-        project_id = getattr(volume, 'os-vol-tenant-attr:tenant_id')
-        user['project'] = project_id
-        user['domain'] = keystone_util.get_project(project_id).domain_id
-        user['roles'] = ['admin']
-        return keystone_util.create_user(user)
-
-    def _get_cinder_client(self, session):
-        """Get cinder client by session"""
-        return cinder_client.Client(
-            CONF.cinder_client.api_version,
-            session=session,
-            endpoint_type=CONF.cinder_client.endpoint_type)
-
-    def _swap_volume(self, volume, dest_type):
-        """Swap volume to dest_type
-
-        Limitation note: only for compute libvirt driver
-        """
-        if not dest_type:
-            raise exception.Invalid(
-                message=(_("destination type is required when "
-                           "migration type is swap")))
-        if not self._can_swap(volume):
-            raise exception.Invalid(
-                message=(_("Invalid state for swapping volume")))
-        user_info = {
-            'name': self.temp_username,
-            'password': self.temp_password}
-        user = self._create_user(volume, user_info)
-        keystone_util = keystone_helper.KeystoneHelper(osc=self.osc)
-        try:
-            session = keystone_util.create_session(
-                user.id, self.temp_password)
-            temp_cinder = self._get_cinder_client(session)
-            # swap volume
-            new_volume = self.cinder_util.create_volume(
-                temp_cinder, volume, dest_type)
-            self.nova_util.swap_volume(volume, new_volume)
-            # delete old volume
-            self.cinder_util.delete_volume(volume)
-        finally:
-            keystone_util.delete_user(user)
-        return True
+            LOG.debug(f"volume: {volume.id} has no attachments")
+            return True
+        # since it has attachments we need to validate nova's constraints
+        instance_id = volume.attachments[0]['server_id']
+        instance_status = self.nova_util.find_instance(instance_id).status
+        LOG.debug(
+            f"volume: {volume.id} is attached to instance: {instance_id} "
+            f"in instance status: {instance_status}")
+        # NOTE(sean-k-mooney): This used to allow RESIZED which
+        # is the resize_verify task state, that is not an acceptable time
+        # to migrate volumes, if nova does not block this in the API
+        # today that is probably a bug. PAUSED is also questionable but
+        # it should generally be safe.
+        return (volume.status == 'in-use' and
+                instance_status in ('ACTIVE', 'PAUSED'))
 
     def _migrate(self, volume_id, dest_node, dest_type):
         try:
             volume = self.cinder_util.get_volume(volume_id)
-            if self.migration_type == self.SWAP:
-                if dest_node:
-                    LOG.warning("dest_node is ignored")
-                return self._swap_volume(volume, dest_type)
+            # for backward compatibility map swap to migrate.
+            if self.migration_type in (self.SWAP, self.MIGRATE):
+                if not self._can_swap(volume):
+                    raise exception.Invalid(
+                        message=(_("Invalid state for swapping volume")))
+                return self.cinder_util.migrate(volume, dest_node)
             elif self.migration_type == self.RETYPE:
                 return self.cinder_util.retype(volume, dest_type)
-            elif self.migration_type == self.MIGRATE:
-                return self.cinder_util.migrate(volume, dest_node)
             else:
                 raise exception.Invalid(
                     message=(_("Migration of type '%(migration_type)s' is not "


@@ -15,8 +15,6 @@
 from oslo_log import log
 
 from keystoneauth1.exceptions import http as ks_exceptions
-from keystoneauth1 import loading
-from keystoneauth1 import session
 
 from watcher._i18n import _
 from watcher.common import clients
 from watcher.common import exception
@@ -90,35 +88,3 @@ class KeystoneHelper(object):
message=(_("Domain name seems ambiguous: %s") % message=(_("Domain name seems ambiguous: %s") %
name_or_id)) name_or_id))
return domains[0] return domains[0]
def create_session(self, user_id, password):
user = self.get_user(user_id)
loader = loading.get_plugin_loader('password')
auth = loader.load_from_options(
auth_url=CONF.watcher_clients_auth.auth_url,
password=password,
user_id=user_id,
project_id=user.default_project_id)
return session.Session(auth=auth)
def create_user(self, user):
project = self.get_project(user['project'])
domain = self.get_domain(user['domain'])
_user = self.keystone.users.create(
user['name'],
password=user['password'],
domain=domain,
project=project,
)
for role in user['roles']:
role = self.get_role(role)
self.keystone.roles.grant(
role.id, user=_user.id, project=project.id)
return _user
def delete_user(self, user):
try:
user = self.get_user(user)
self.keystone.users.delete(user)
except exception.Invalid:
pass


@@ -19,9 +19,7 @@
 import asyncio
 import datetime
 import inspect
-import random
 import re
-import string
 
 from croniter import croniter
 import eventlet
@@ -160,14 +158,10 @@ def extend_with_strict_schema(validator_class):
 StrictDefaultValidatingDraft4Validator = extend_with_default(
     extend_with_strict_schema(validators.Draft4Validator))
 Draft4Validator = validators.Draft4Validator
 
 
-def random_string(n):
-    return ''.join([random.choice(
-        string.ascii_letters + string.digits) for i in range(n)])
-
-
 # Some clients (e.g. MAAS) use asyncio, which isn't compatible with Eventlet.
 # As a workaround, we're delegating such calls to a native thread.
 def async_compat_call(f, *args, **kwargs):


@@ -56,8 +56,19 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
         self.planned_cold_count = 0
         self.volume_count = 0
         self.planned_volume_count = 0
-        self.volume_update_count = 0
-        self.planned_volume_update_count = 0
+
+    # TODO(sean-k-mooney) This is backward compatibility
+    # for calling the swap code paths. Swap is now an alias
+    # for migrate, we should clean this up in a future
+    # cycle.
+    @property
+    def volume_update_count(self):
+        return self.volume_count
+
+    # same as above clean up later.
+    @property
+    def planned_volume_update_count(self):
+        return self.planned_volume_count
 
     @classmethod
     def get_name(cls):
@@ -311,8 +322,8 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             planned_cold_migrate_instance_count=self.planned_cold_count,
             volume_migrate_count=self.volume_count,
             planned_volume_migrate_count=self.planned_volume_count,
-            volume_update_count=self.volume_update_count,
-            planned_volume_update_count=self.planned_volume_update_count
+            volume_update_count=self.volume_count,
+            planned_volume_update_count=self.planned_volume_count
         )
 
     def set_migration_count(self, targets):
@@ -327,10 +338,7 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             elif self.is_cold(instance):
                 self.cold_count += 1
         for volume in targets.get('volume', []):
-            if self.is_available(volume):
-                self.volume_count += 1
-            elif self.is_in_use(volume):
-                self.volume_update_count += 1
+            self.volume_count += 1
 
     def is_live(self, instance):
         status = getattr(instance, 'status')
@@ -403,19 +411,16 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
             LOG.debug(src_type)
             LOG.debug("%s %s", dst_pool, dst_type)
-            if self.is_available(volume):
-                if src_type == dst_type:
-                    self._volume_migrate(volume, dst_pool)
-                else:
-                    self._volume_retype(volume, dst_type)
-            elif self.is_in_use(volume):
-                self._volume_update(volume, dst_type)
+            if src_type == dst_type:
+                self._volume_migrate(volume, dst_pool)
+            else:
+                self._volume_retype(volume, dst_type)
 
             # if with_attached_volume is True, migrate attaching instances
             if self.with_attached_volume:
                 instances = [self.nova.find_instance(dic.get('server_id'))
                              for dic in volume.attachments]
                 self.instances_migration(instances, action_counter)
 
             action_counter.add_pool(pool)
@@ -469,16 +474,6 @@ class ZoneMigration(base.ZoneMigrationBaseStrategy):
                 input_parameters=parameters)
             self.planned_cold_count += 1
 
-    def _volume_update(self, volume, dst_type):
-        parameters = {"migration_type": "swap",
-                      "destination_type": dst_type,
-                      "resource_name": volume.name}
-        self.solution.add_action(
-            action_type="volume_migrate",
-            resource_id=volume.id,
-            input_parameters=parameters)
-        self.planned_volume_update_count += 1
-
     def _volume_migrate(self, volume, dst_pool):
         parameters = {"migration_type": "migrate",
                       "destination_node": dst_pool,


@@ -22,7 +22,6 @@ from watcher.common import cinder_helper
 from watcher.common import clients
 from watcher.common import keystone_helper
 from watcher.common import nova_helper
-from watcher.common import utils as w_utils
 
 from watcher.tests import base
@@ -102,12 +101,15 @@ class TestMigration(base.TestCase):
     @staticmethod
     def fake_volume(**kwargs):
+        # FIXME(sean-k-mooney): we should be using real objects in this
+        # test or at least something more representative of the real data
         volume = mock.MagicMock()
         volume.id = kwargs.get('id', TestMigration.VOLUME_UUID)
         volume.size = kwargs.get('size', '1')
         volume.status = kwargs.get('status', 'available')
         volume.snapshot_id = kwargs.get('snapshot_id', None)
         volume.availability_zone = kwargs.get('availability_zone', 'nova')
+        volume.attachments = kwargs.get('attachments', [])
         return volume
 
     @staticmethod
@@ -175,42 +177,14 @@ class TestMigration(base.TestCase):
"storage1-typename", "storage1-typename",
) )
def test_swap_success(self):
volume = self.fake_volume(
status='in-use', attachments=[{'server_id': 'server_id'}])
self.m_n_helper.find_instance.return_value = self.fake_instance()
new_volume = self.fake_volume(id=w_utils.generate_uuid())
user = mock.Mock()
session = mock.MagicMock()
self.m_k_helper.create_user.return_value = user
self.m_k_helper.create_session.return_value = session
self.m_c_helper.get_volume.return_value = volume
self.m_c_helper.create_volume.return_value = new_volume
result = self.action_swap.execute()
self.assertTrue(result)
self.m_n_helper.swap_volume.assert_called_once_with(
volume,
new_volume
)
self.m_k_helper.delete_user.assert_called_once_with(user)
def test_swap_fail(self):
# _can_swap fail
instance = self.fake_instance(status='STOPPED')
self.m_n_helper.find_instance.return_value = instance
result = self.action_swap.execute()
self.assertFalse(result)
def test_can_swap_success(self): def test_can_swap_success(self):
volume = self.fake_volume( volume = self.fake_volume(
status='in-use', attachments=[{'server_id': 'server_id'}]) status='in-use', attachments=[
instance = self.fake_instance() {'server_id': TestMigration.INSTANCE_UUID}])
instance = self.fake_instance()
self.m_n_helper.find_instance.return_value = instance self.m_n_helper.find_instance.return_value = instance
result = self.action_swap._can_swap(volume) result = self.action_swap._can_swap(volume)
self.assertTrue(result) self.assertTrue(result)
@@ -219,16 +193,33 @@ class TestMigration(base.TestCase):
         result = self.action_swap._can_swap(volume)
         self.assertTrue(result)
 
-        instance = self.fake_instance(status='RESIZED')
-        self.m_n_helper.find_instance.return_value = instance
-        result = self.action_swap._can_swap(volume)
-        self.assertTrue(result)
-
     def test_can_swap_fail(self):
         volume = self.fake_volume(
-            status='in-use', attachments=[{'server_id': 'server_id'}])
+            status='in-use', attachments=[
+                {'server_id': TestMigration.INSTANCE_UUID}])
         instance = self.fake_instance(status='STOPPED')
         self.m_n_helper.find_instance.return_value = instance
         result = self.action_swap._can_swap(volume)
         self.assertFalse(result)
 
+        instance = self.fake_instance(status='RESIZED')
+        self.m_n_helper.find_instance.return_value = instance
+        result = self.action_swap._can_swap(volume)
+        self.assertFalse(result)
+
+    def test_swap_success(self):
+        volume = self.fake_volume(
+            status='in-use', attachments=[
+                {'server_id': TestMigration.INSTANCE_UUID}])
+        self.m_c_helper.get_volume.return_value = volume
+        instance = self.fake_instance()
+        self.m_n_helper.find_instance.return_value = instance
+        result = self.action_swap.execute()
+        self.assertTrue(result)
+        self.m_c_helper.migrate.assert_called_once_with(
+            volume,
+            "storage1-poolname"
+        )


@@ -399,7 +399,10 @@ class TestZoneMigration(TestBaseStrategy):
         migration_types = collections.Counter(
             [action.get('input_parameters')['migration_type']
              for action in solution.actions])
-        self.assertEqual(1, migration_types.get("swap", 0))
+        # watcher no longer implements swap. it is now an
+        # alias for migrate.
+        self.assertEqual(0, migration_types.get("swap", 0))
+        self.assertEqual(1, migration_types.get("migrate", 0))
         global_efficacy_value = solution.global_efficacy[3].get('value', 0)
         self.assertEqual(100, global_efficacy_value)