Add handoffs-only mode to DB replicators.
The object reconstructor has a handoffs-only mode that is very useful
when a cluster requires rapid rebalancing, such as when disks are
nearing fullness. The goal of this mode is to remove handoff partitions
from disks without spending effort on primary partitions. The object
replicator has a similar mode, though it differs in some details.

This commit adds a handoffs-only mode to the account and container
replicators.

Change-Id: I588b151ee65ae49d204bd6bf58555504c15edf9f
Closes-Bug: 1668399
parent 2bfd9c6a9b
commit 47fed6f2f9
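In ring terms, a database partition sitting on a device is a "handoff"
when that device is not among the partition's primary nodes. A minimal,
self-contained sketch of that check, with a stub ring and invented
replica assignments (the filt closure mirrors the handoffs_only_filter
added in the diff below):

class StubRing(object):
    # assignments maps partition -> ids of devices holding primary replicas
    def __init__(self, assignments):
        self.assignments = assignments

    def get_part_nodes(self, part):
        return [{'id': dev_id} for dev_id in self.assignments[part]]


def handoffs_only_filter(ring, device_id):
    def filt(partition_dir):
        partition = int(partition_dir)
        primary_node_ids = [
            d['id'] for d in ring.get_part_nodes(partition)]
        return device_id not in primary_node_ids
    return filt


ring = StubRing({0: [0, 1, 2], 1: [1, 2, 3]})
filt = handoffs_only_filter(ring, device_id=0)
assert filt('0') is False  # device 0 is primary for part 0; leave it alone
assert filt('1') is True   # part 1 is a handoff on device 0; move it off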
@@ -163,6 +163,25 @@ use = egg:swift#recon
 # Work only with ionice_class.
 # ionice_class =
 # ionice_priority =
+#
+# The handoffs_only mode option is for special-case emergency
+# situations such as full disks in the cluster. This option SHOULD NOT
+# BE ENABLED except in emergencies. When handoffs_only mode is enabled
+# the replicator will *only* replicate from handoff nodes to primary
+# nodes and will not sync primary nodes with other primary nodes.
+#
+# This has two main effects: first, the replicator becomes much more
+# effective at removing misplaced databases, thereby freeing up disk
+# space at a much faster pace than normal. Second, the replicator does
+# not sync data between primary nodes, so out-of-sync account and
+# container listings will not resolve while handoffs_only is enabled.
+#
+# This mode is intended to allow operators to temporarily sacrifice
+# consistency in order to gain faster rebalancing, such as during a
+# capacity addition with nearly-full disks. It is not intended for
+# long-term use.
+#
+# handoffs_only = no

 [account-auditor]
 # You can override the default log routing for this app here (don't use set!):
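For example, during a disk-full emergency an operator might temporarily
enable the option in the replicator section of the server config and
revert it once the handoffs have drained (a sketch following the stock
account-server.conf layout; not part of this change):

[account-replicator]
# Emergency use only; set back to no (or delete the line) when done.
handoffs_only = yes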
@@ -172,6 +172,25 @@ use = egg:swift#recon
 # Work only with ionice_class.
 # ionice_class =
 # ionice_priority =
+#
+# The handoffs_only mode option is for special-case emergency
+# situations such as full disks in the cluster. This option SHOULD NOT
+# BE ENABLED except in emergencies. When handoffs_only mode is enabled
+# the replicator will *only* replicate from handoff nodes to primary
+# nodes and will not sync primary nodes with other primary nodes.
+#
+# This has two main effects: first, the replicator becomes much more
+# effective at removing misplaced databases, thereby freeing up disk
+# space at a much faster pace than normal. Second, the replicator does
+# not sync data between primary nodes, so out-of-sync account and
+# container listings will not resolve while handoffs_only is enabled.
+#
+# This mode is intended to allow operators to temporarily sacrifice
+# consistency in order to gain faster rebalancing, such as during a
+# capacity addition with nearly-full disks. It is not intended for
+# long-term use.
+#
+# handoffs_only = no

 [container-updater]
 # You can override the default log routing for this app here (don't use set!):
@@ -87,13 +87,14 @@ def roundrobin_datadirs(datadirs):
     found (in their proper places). The partitions within each data
     dir are walked randomly, however.

-    :param datadirs: a list of (path, node_id) to walk
+    :param datadirs: a list of (path, node_id, partition_filter) to walk
     :returns: A generator of (partition, path_to_db_file, node_id)
     """

-    def walk_datadir(datadir, node_id):
+    def walk_datadir(datadir, node_id, part_filter):
         partitions = [pd for pd in os.listdir(datadir)
-                      if looks_like_partition(pd)]
+                      if looks_like_partition(pd)
+                      and (part_filter is None or part_filter(pd))]
         random.shuffle(partitions)
         for partition in partitions:
             part_dir = os.path.join(datadir, partition)
@@ -125,7 +126,8 @@ def roundrobin_datadirs(datadirs):
                 if e.errno != errno.ENOTEMPTY:
                     raise

-    its = [walk_datadir(datadir, node_id) for datadir, node_id in datadirs]
+    its = [walk_datadir(datadir, node_id, filt)
+           for datadir, node_id, filt in datadirs]
     while its:
         for it in its:
             try:
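Each datadirs entry now carries an optional per-partition filter;
passing None preserves the old walk-everything behavior. A hypothetical
call (the paths are invented, and the lambda stands in for a real
handoffs filter), assuming swift is importable:

from swift.common.db_replicator import roundrobin_datadirs

datadirs = [
    ('/srv/node/sda/containers', 0, None),  # walk every partition
    ('/srv/node/sdb/containers', 1, lambda pd: pd == '1234'),  # only part 1234
]
for partition, db_file, node_id in roundrobin_datadirs(datadirs):
    print(partition, db_file, node_id)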
@@ -206,6 +208,7 @@ class Replicator(Daemon):
                 self.recon_replicator)
         self.extract_device_re = re.compile('%s%s([^%s]+)' % (
             self.root, os.path.sep, os.path.sep))
+        self.handoffs_only = config_true_value(conf.get('handoffs_only', 'no'))

     def _zero_stats(self):
         """Zero out the stats."""
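config_true_value comes from swift.common.utils and accepts the usual
truthy spellings. A sketch of its semantics (not the exact Swift
source):

TRUE_VALUES = {'true', '1', 'yes', 'on', 't', 'y'}

def config_true_value(value):
    # True for booleans and truthy strings, case-insensitively.
    return value is True or (
        isinstance(value, str) and value.lower() in TRUE_VALUES)

assert config_true_value('yes')
assert not config_true_value('no')  # anything unrecognized is false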
@@ -631,6 +634,14 @@ class Replicator(Daemon):
             return match.groups()[0]
         return "UNKNOWN"

+    def handoffs_only_filter(self, device_id):
+        def filt(partition_dir):
+            partition = int(partition_dir)
+            primary_node_ids = [
+                d['id'] for d in self.ring.get_part_nodes(partition)]
+            return device_id not in primary_node_ids
+        return filt
+
     def report_up_to_date(self, full_info):
         return True
@@ -642,6 +653,13 @@ class Replicator(Daemon):
         if not ips:
             self.logger.error(_('ERROR Failed to get my own IPs?'))
             return

+        if self.handoffs_only:
+            self.logger.warning(
+                'Starting replication pass with handoffs_only enabled. '
+                'This mode is not intended for normal '
+                'operation; use handoffs_only with care.')
+
         self._local_device_ids = set()
         found_local = False
         for node in self.ring.devs:
@@ -664,7 +682,9 @@ class Replicator(Daemon):
             datadir = os.path.join(self.root, node['device'], self.datadir)
             if os.path.isdir(datadir):
                 self._local_device_ids.add(node['id'])
-                dirs.append((datadir, node['id']))
+                filt = (self.handoffs_only_filter(node['id'])
+                        if self.handoffs_only else None)
+                dirs.append((datadir, node['id'], filt))
         if not found_local:
             self.logger.error("Can't find itself %s with port %s in ring "
                               "file, not replicating",
@@ -675,6 +695,10 @@ class Replicator(Daemon):
                 self._replicate_object, part, object_file, node_id)
         self.cpool.waitall()
         self.logger.info(_('Replication run OVER'))
+        if self.handoffs_only:
+            self.logger.warning(
+                'Finished replication pass with handoffs_only enabled. '
+                'If handoffs_only is no longer required, disable it.')
         self._report_stats()

     def run_forever(self, *args, **kwargs):
@@ -1220,7 +1220,8 @@ class TestDBReplicator(unittest.TestCase):
         self.assertTrue(os.path.isdir(dirpath))

         node_id = 1
-        results = list(db_replicator.roundrobin_datadirs([(datadir, node_id)]))
+        results = list(db_replicator.roundrobin_datadirs(
+            [(datadir, node_id, None)]))
         expected = [
             ('450', os.path.join(datadir, db_path), node_id),
         ]
@@ -1241,12 +1242,14 @@ class TestDBReplicator(unittest.TestCase):
         self.assertEqual({'18', '1054', '1060', '450'},
                          set(os.listdir(datadir)))

-        results = list(db_replicator.roundrobin_datadirs([(datadir, node_id)]))
+        results = list(db_replicator.roundrobin_datadirs(
+            [(datadir, node_id, None)]))
         self.assertEqual(results, expected)
         self.assertEqual({'1054', '1060', '450'},
                          set(os.listdir(datadir)))

-        results = list(db_replicator.roundrobin_datadirs([(datadir, node_id)]))
+        results = list(db_replicator.roundrobin_datadirs(
+            [(datadir, node_id, None)]))
         self.assertEqual(results, expected)
         # non db file in '1060' dir is not deleted and exception is handled
         self.assertEqual({'1060', '450'},
@@ -1333,8 +1336,8 @@ class TestDBReplicator(unittest.TestCase):
                 mock.patch(base + 'random.shuffle', _shuffle), \
                 mock.patch(base + 'os.rmdir', _rmdir):

-            datadirs = [('/srv/node/sda/containers', 1),
-                        ('/srv/node/sdb/containers', 2)]
+            datadirs = [('/srv/node/sda/containers', 1, None),
+                        ('/srv/node/sdb/containers', 2, None)]
             results = list(db_replicator.roundrobin_datadirs(datadirs))
             # The results show that the .db files are returned, the devices
             # interleaved.
@@ -1438,6 +1441,150 @@ class TestDBReplicator(unittest.TestCase):
             replicator.logger)])


+class TestHandoffsOnly(unittest.TestCase):
+    class FakeRing3Nodes(object):
+        _replicas = 3
+
+        # Three nodes, two disks each
+        devs = [
+            dict(id=0, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.1', port=6201,
+                 replication_ip='10.0.0.1', replication_port=6201,
+                 device='sdp'),
+            dict(id=1, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.1', port=6201,
+                 replication_ip='10.0.0.1', replication_port=6201,
+                 device='sdq'),
+
+            dict(id=2, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.2', port=6201,
+                 replication_ip='10.0.0.2', replication_port=6201,
+                 device='sdp'),
+            dict(id=3, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.2', port=6201,
+                 replication_ip='10.0.0.2', replication_port=6201,
+                 device='sdq'),
+
+            dict(id=4, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.3', port=6201,
+                 replication_ip='10.0.0.3', replication_port=6201,
+                 device='sdp'),
+            dict(id=5, region=1, zone=1,
+                 meta='', weight=500.0, ip='10.0.0.3', port=6201,
+                 replication_ip='10.0.0.3', replication_port=6201,
+                 device='sdq'),
+        ]
+
+        def __init__(self, *a, **kw):
+            pass
+
+        def get_part(self, account, container=None, obj=None):
+            return 0
+
+        def get_part_nodes(self, part):
+            nodes = []
+            for offset in range(self._replicas):
+                i = (part + offset) % len(self.devs)
+                nodes.append(self.devs[i])
+            return nodes
+
+        def get_more_nodes(self, part):
+            for offset in range(self._replicas, len(self.devs)):
+                i = (part + offset) % len(self.devs)
+                yield self.devs[i]
+
+    def _make_fake_db(self, disk, partition, db_hash):
+        directories = [
+            os.path.join(self.root, disk),
+            os.path.join(self.root, disk, 'containers'),
+            os.path.join(self.root, disk, 'containers', str(partition)),
+            os.path.join(self.root, disk, 'containers', str(partition),
+                         db_hash[-3:]),
+            os.path.join(self.root, disk, 'containers', str(partition),
+                         db_hash[-3:], db_hash)]
+
+        for d in directories:
+            try:
+                os.mkdir(d)
+            except OSError as err:
+                if err.errno != errno.EEXIST:
+                    raise
+        file_path = os.path.join(directories[-1], db_hash + ".db")
+        with open(file_path, 'w'):
+            pass
+
+    def setUp(self):
+        self.root = mkdtemp()
+
+        # object disks; they're just here to make sure they don't trip us up
+        os.mkdir(os.path.join(self.root, 'sdc'))
+        os.mkdir(os.path.join(self.root, 'sdc', 'objects'))
+        os.mkdir(os.path.join(self.root, 'sdd'))
+        os.mkdir(os.path.join(self.root, 'sdd', 'objects'))
+
+        # part 0 belongs on sdp
+        self._make_fake_db('sdp', 0, '010101013cf2b7979af9eaa71cb67220')
+
+        # part 1 does not belong on sdp
+        self._make_fake_db('sdp', 1, 'abababab2b5368158355e799323b498d')
+
+        # part 1 belongs on sdq
+        self._make_fake_db('sdq', 1, '02020202e30f696a3cfa63d434a3c94e')
+
+        # part 2 does not belong on sdq
+        self._make_fake_db('sdq', 2, 'bcbcbcbc15d3835053d568c57e2c83b5')
+
+    def tearDown(self):
+        rmtree(self.root, ignore_errors=True)
+
+    def test_scary_warnings(self):
+        logger = unit.FakeLogger()
+        replicator = TestReplicator({
+            'handoffs_only': 'yes',
+            'devices': self.root,
+            'bind_port': 6201,
+            'mount_check': 'no',
+        }, logger=logger)
+
+        with patch.object(db_replicator, 'whataremyips',
+                          return_value=['10.0.0.1']), \
+                patch.object(replicator, '_replicate_object'), \
+                patch.object(replicator, 'ring', self.FakeRing3Nodes()):
+            replicator.run_once()
+
+        self.assertEqual(
+            logger.get_lines_for_level('warning'),
+            [('Starting replication pass with handoffs_only enabled. This '
+              'mode is not intended for normal operation; use '
+              'handoffs_only with care.'),
+             ('Finished replication pass with handoffs_only enabled. '
+              'If handoffs_only is no longer required, disable it.')])
+
+    def test_skips_primary_partitions(self):
+        replicator = TestReplicator({
+            'handoffs_only': 'yes',
+            'devices': self.root,
+            'bind_port': 6201,
+            'mount_check': 'no',
+        })
+
+        with patch.object(db_replicator, 'whataremyips',
+                          return_value=['10.0.0.1']), \
+                patch.object(replicator, '_replicate_object') as mock_repl, \
+                patch.object(replicator, 'ring', self.FakeRing3Nodes()):
+            replicator.run_once()
+
+        self.assertEqual(sorted(mock_repl.mock_calls), [
+            mock.call('1', os.path.join(
+                self.root, 'sdp', 'containers', '1', '98d',
+                'abababab2b5368158355e799323b498d',
+                'abababab2b5368158355e799323b498d.db'), 0),
+            mock.call('2', os.path.join(
+                self.root, 'sdq', 'containers', '2', '3b5',
+                'bcbcbcbc15d3835053d568c57e2c83b5',
+                'bcbcbcbc15d3835053d568c57e2c83b5.db'), 1)])
+
+
 class TestReplToNode(unittest.TestCase):
     def setUp(self):
         db_replicator.ring = FakeRing()
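For reference, _make_fake_db recreates Swift's on-disk layout for
account and container databases,
<devices>/<disk>/<datadir>/<partition>/<last three hash chars>/<hash>/<hash>.db,
so the part-1 handoff fixture above lands at:

<root>/sdp/containers/1/98d/abababab2b5368158355e799323b498d/abababab2b5368158355e799323b498d.db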