Fixed bug with container reclaim/report race

Before, a really lagged cluster might not get its final report for a
deleted container database sent to its corresponding account
database. In such a case, the container database file would be
permanently deleted while still leaving the container listed in the
account database, never to be updated since the actual container
database file was gone. The only way to fix such the situation before
was to recreate and redelete the container.

Now, the container database file will not be permanently deleted
until it has sent its final report successfully to its corresponding
account database.

Change-Id: I1f42202455e7ecb0533b84ce7f45fcc7b98aeaa3
This commit is contained in:
gholt 2012-06-03 03:06:01 +00:00
parent 783f16035a
commit 213f385348
3 changed files with 51 additions and 11 deletions

View File

@ -361,6 +361,7 @@ class Replicator(Daemon):
broker.reclaim(time.time() - self.reclaim_age, broker.reclaim(time.time() - self.reclaim_age,
time.time() - (self.reclaim_age * 2)) time.time() - (self.reclaim_age * 2))
info = broker.get_replication_info() info = broker.get_replication_info()
full_info = broker.get_info()
except (Exception, Timeout), e: except (Exception, Timeout), e:
if 'no such table' in str(e): if 'no such table' in str(e):
self.logger.error(_('Quarantining DB %s'), object_file) self.logger.error(_('Quarantining DB %s'), object_file)
@ -385,10 +386,11 @@ class Replicator(Daemon):
if delete_timestamp < (time.time() - self.reclaim_age) and \ if delete_timestamp < (time.time() - self.reclaim_age) and \
delete_timestamp > put_timestamp and \ delete_timestamp > put_timestamp and \
info['count'] in (None, '', 0, '0'): info['count'] in (None, '', 0, '0'):
with lock_parent_directory(object_file): if self.report_up_to_date(full_info):
shutil.rmtree(os.path.dirname(object_file), True) with lock_parent_directory(object_file):
self.stats['remove'] += 1 shutil.rmtree(os.path.dirname(object_file), True)
self.logger.increment('removes') self.stats['remove'] += 1
self.logger.increment('removes')
self.logger.timing_since('timing', start_time) self.logger.timing_since('timing', start_time)
return return
responses = [] responses = []
@ -422,6 +424,9 @@ class Replicator(Daemon):
self.logger.increment('removes') self.logger.increment('removes')
self.logger.timing_since('timing', start_time) self.logger.timing_since('timing', start_time)
def report_up_to_date(self, full_info):
return True
def roundrobin_datadirs(self, datadirs): def roundrobin_datadirs(self, datadirs):
""" """
Generator to walk the data dirs in a round robin manner, evenly Generator to walk the data dirs in a round robin manner, evenly

View File

@ -22,3 +22,10 @@ class ContainerReplicator(db_replicator.Replicator):
brokerclass = db.ContainerBroker brokerclass = db.ContainerBroker
datadir = container_server.DATADIR datadir = container_server.DATADIR
default_port = 6001 default_port = 6001
def report_up_to_date(self, full_info):
for key in ('put_timestamp', 'delete_timestamp', 'object_count',
'bytes_used'):
if full_info['reported_' + key] != full_info[key]:
return False
return True

View File

@ -15,17 +15,45 @@
import unittest import unittest
from swift.container import replicator from swift.container import replicator
from swift.common.utils import normalize_timestamp
class TestReplicator(unittest.TestCase): class TestReplicator(unittest.TestCase):
"""
swift.container.replicator is currently just a subclass with some class
variables overridden, but at least this test stub will ensure proper Python
syntax.
"""
def test_placeholder(self): def setUp(self):
pass self.orig_ring = replicator.db_replicator.ring.Ring
replicator.db_replicator.ring.Ring = lambda *args, **kwargs: None
def tearDown(self):
replicator.db_replicator.ring.Ring = self.orig_ring
def test_report_up_to_date(self):
repl = replicator.ContainerReplicator({})
info = {'put_timestamp': normalize_timestamp(1),
'delete_timestamp': normalize_timestamp(0),
'object_count': 0,
'bytes_used': 0,
'reported_put_timestamp': normalize_timestamp(1),
'reported_delete_timestamp': normalize_timestamp(0),
'reported_object_count': 0,
'reported_bytes_used': 0}
self.assertTrue(repl.report_up_to_date(info))
info['delete_timestamp'] = normalize_timestamp(2)
self.assertFalse(repl.report_up_to_date(info))
info['reported_delete_timestamp'] = normalize_timestamp(2)
self.assertTrue(repl.report_up_to_date(info))
info['object_count'] = 1
self.assertFalse(repl.report_up_to_date(info))
info['reported_object_count'] = 1
self.assertTrue(repl.report_up_to_date(info))
info['bytes_used'] = 1
self.assertFalse(repl.report_up_to_date(info))
info['reported_bytes_used'] = 1
self.assertTrue(repl.report_up_to_date(info))
info['put_timestamp'] = normalize_timestamp(3)
self.assertFalse(repl.report_up_to_date(info))
info['reported_put_timestamp'] = normalize_timestamp(3)
self.assertTrue(repl.report_up_to_date(info))
if __name__ == '__main__': if __name__ == '__main__':