From 213f38534818db96dce6cbcaede284510335f7e9 Mon Sep 17 00:00:00 2001 From: gholt Date: Sun, 3 Jun 2012 03:06:01 +0000 Subject: [PATCH] Fixed bug with container reclaim/report race Before, a really lagged cluster might not get its final report for a deleted container database sent to its corresponding account database. In such a case, the container database file would be permanently deleted while still leaving the container listed in the account database, never to be updated since the actual container database file was gone. The only way to fix such the situation before was to recreate and redelete the container. Now, the container database file will not be permanently deleted until it has sent its final report successfully to its corresponding account database. Change-Id: I1f42202455e7ecb0533b84ce7f45fcc7b98aeaa3 --- swift/common/db_replicator.py | 13 +++++--- swift/container/replicator.py | 7 +++++ test/unit/container/test_replicator.py | 42 +++++++++++++++++++++----- 3 files changed, 51 insertions(+), 11 deletions(-) diff --git a/swift/common/db_replicator.py b/swift/common/db_replicator.py index 796a862cfd..20f83922f3 100644 --- a/swift/common/db_replicator.py +++ b/swift/common/db_replicator.py @@ -361,6 +361,7 @@ class Replicator(Daemon): broker.reclaim(time.time() - self.reclaim_age, time.time() - (self.reclaim_age * 2)) info = broker.get_replication_info() + full_info = broker.get_info() except (Exception, Timeout), e: if 'no such table' in str(e): self.logger.error(_('Quarantining DB %s'), object_file) @@ -385,10 +386,11 @@ class Replicator(Daemon): if delete_timestamp < (time.time() - self.reclaim_age) and \ delete_timestamp > put_timestamp and \ info['count'] in (None, '', 0, '0'): - with lock_parent_directory(object_file): - shutil.rmtree(os.path.dirname(object_file), True) - self.stats['remove'] += 1 - self.logger.increment('removes') + if self.report_up_to_date(full_info): + with lock_parent_directory(object_file): + shutil.rmtree(os.path.dirname(object_file), True) + self.stats['remove'] += 1 + self.logger.increment('removes') self.logger.timing_since('timing', start_time) return responses = [] @@ -422,6 +424,9 @@ class Replicator(Daemon): self.logger.increment('removes') self.logger.timing_since('timing', start_time) + def report_up_to_date(self, full_info): + return True + def roundrobin_datadirs(self, datadirs): """ Generator to walk the data dirs in a round robin manner, evenly diff --git a/swift/container/replicator.py b/swift/container/replicator.py index 571588875f..3d5aee9b73 100644 --- a/swift/container/replicator.py +++ b/swift/container/replicator.py @@ -22,3 +22,10 @@ class ContainerReplicator(db_replicator.Replicator): brokerclass = db.ContainerBroker datadir = container_server.DATADIR default_port = 6001 + + def report_up_to_date(self, full_info): + for key in ('put_timestamp', 'delete_timestamp', 'object_count', + 'bytes_used'): + if full_info['reported_' + key] != full_info[key]: + return False + return True diff --git a/test/unit/container/test_replicator.py b/test/unit/container/test_replicator.py index 6f7e9bacdb..9f1b0259a8 100644 --- a/test/unit/container/test_replicator.py +++ b/test/unit/container/test_replicator.py @@ -15,17 +15,45 @@ import unittest from swift.container import replicator +from swift.common.utils import normalize_timestamp class TestReplicator(unittest.TestCase): - """ - swift.container.replicator is currently just a subclass with some class - variables overridden, but at least this test stub will ensure proper Python - syntax. - """ - def test_placeholder(self): - pass + def setUp(self): + self.orig_ring = replicator.db_replicator.ring.Ring + replicator.db_replicator.ring.Ring = lambda *args, **kwargs: None + + def tearDown(self): + replicator.db_replicator.ring.Ring = self.orig_ring + + def test_report_up_to_date(self): + repl = replicator.ContainerReplicator({}) + info = {'put_timestamp': normalize_timestamp(1), + 'delete_timestamp': normalize_timestamp(0), + 'object_count': 0, + 'bytes_used': 0, + 'reported_put_timestamp': normalize_timestamp(1), + 'reported_delete_timestamp': normalize_timestamp(0), + 'reported_object_count': 0, + 'reported_bytes_used': 0} + self.assertTrue(repl.report_up_to_date(info)) + info['delete_timestamp'] = normalize_timestamp(2) + self.assertFalse(repl.report_up_to_date(info)) + info['reported_delete_timestamp'] = normalize_timestamp(2) + self.assertTrue(repl.report_up_to_date(info)) + info['object_count'] = 1 + self.assertFalse(repl.report_up_to_date(info)) + info['reported_object_count'] = 1 + self.assertTrue(repl.report_up_to_date(info)) + info['bytes_used'] = 1 + self.assertFalse(repl.report_up_to_date(info)) + info['reported_bytes_used'] = 1 + self.assertTrue(repl.report_up_to_date(info)) + info['put_timestamp'] = normalize_timestamp(3) + self.assertFalse(repl.report_up_to_date(info)) + info['reported_put_timestamp'] = normalize_timestamp(3) + self.assertTrue(repl.report_up_to_date(info)) if __name__ == '__main__':