sharder/replicator: emit stats for DBs created or existing

The number of container partitions that are in handoff locations can
be used to track progress of a container ring rebalance. However,
container DBs are sometimes deliberately created in handoff locations:
the sharder creates shard container DBs while cleaving and also while
moving misplaced objects; the container-replicator creates DBs to feed
misplaced objects to the reconciler. These DBs can distorts insights
into rebalance progress. Emitting stats will help quantify any such
distortion.

New sharder stats are:
    cleaved_db_created
    cleaved_db_exists
    misplaced_db_created
    misplaced_db_exists

New container-replicator stats are:
    reconciler_db_created
    reconciler_db_exists

Change-Id: Ia43e91a545a822cace41a0e814ab6c3bd89f8402
This commit is contained in:
Alistair Coles 2022-07-25 15:09:41 +01:00
parent ec2bbc0e14
commit 517738ac9a
6 changed files with 117 additions and 20 deletions

View File

@ -377,7 +377,10 @@ class ContainerBroker(DatabaseBroker):
:param put_timestamp: initial timestamp if broker needs to be
initialized
:param storage_policy_index: the storage policy index
:return: a :class:`swift.container.backend.ContainerBroker` instance
:return: a tuple of (``broker``, ``initialized``) where ``broker`` is
an instance of :class:`swift.container.backend.ContainerBroker` and
``initialized`` is True if the db file was initialized, False
otherwise.
"""
hsh = hash_path(account, container)
db_dir = storage_directory(DATADIR, part, hsh)
@ -385,12 +388,14 @@ class ContainerBroker(DatabaseBroker):
os.path.join(device_path, db_dir, hsh + '.db'), epoch)
broker = ContainerBroker(db_path, account=account, container=container,
logger=logger)
initialized = False
if not os.path.exists(broker.db_file):
try:
broker.initialize(put_timestamp, storage_policy_index)
initialized = True
except DatabaseAlreadyExists:
pass
return broker
return broker, initialized
def get_db_state(self):
"""

View File

@ -191,10 +191,12 @@ class ContainerReplicator(db_replicator.Replicator):
account = MISPLACED_OBJECTS_ACCOUNT
part = self.ring.get_part(account, container)
node = self.find_local_handoff_for_part(part)
broker = ContainerBroker.create_broker(
broker, initialized = ContainerBroker.create_broker(
os.path.join(self.root, node['device']), part, account, container,
logger=self.logger, put_timestamp=timestamp,
storage_policy_index=0)
self.logger.increment('reconciler_db_created' if initialized
else 'reconciler_db_exists')
if self.reconciler_containers is not None:
self.reconciler_containers[container] = part, broker, node['id']
return broker

View File

@ -1104,7 +1104,7 @@ class ContainerSharder(ContainerSharderConf, ContainerReplicator):
node = self.find_local_handoff_for_part(part)
put_timestamp = Timestamp.now().internal
shard_broker = ContainerBroker.create_broker(
shard_broker, initialized = ContainerBroker.create_broker(
os.path.join(self.root, node['device']), part, shard_range.account,
shard_range.container, epoch=shard_range.epoch,
storage_policy_index=policy_index, put_timestamp=put_timestamp)
@ -1124,6 +1124,7 @@ class ContainerSharder(ContainerSharderConf, ContainerReplicator):
'X-Container-Sysmeta-Sharding':
('True', Timestamp.now().internal)})
put_timestamp = put_timestamp if initialized else None
return part, shard_broker, node['id'], put_timestamp
def _audit_root_container(self, broker):
@ -1466,8 +1467,11 @@ class ContainerSharder(ContainerSharderConf, ContainerReplicator):
continue
if dest_shard_range not in dest_brokers:
part, dest_broker, node_id, _junk = self._get_shard_broker(
part, dest_broker, node_id, put_timestamp = \
self._get_shard_broker(
dest_shard_range, src_broker.root_path, policy_index)
stat = 'db_exists' if put_timestamp is None else 'db_created'
self._increment_stat('misplaced', stat, statsd=True)
# save the broker info that was sampled prior to the *first*
# yielded objects for this destination
destination = {'part': part,
@ -1836,6 +1840,8 @@ class ContainerSharder(ContainerSharderConf, ContainerReplicator):
shard_part, shard_broker, node_id, put_timestamp = \
self._get_shard_broker(shard_range, broker.root_path,
policy_index)
stat = 'db_exists' if put_timestamp is None else 'db_created'
self._increment_stat('cleaved', stat, statsd=True)
return self._cleave_shard_broker(
broker, cleaving_context, shard_range, own_shard_range,
shard_broker, put_timestamp, shard_part, node_id)

View File

@ -3702,15 +3702,19 @@ class TestContainerBroker(unittest.TestCase):
@with_tempdir
def test_create_broker(self, tempdir):
broker = ContainerBroker.create_broker(tempdir, 0, 'a', 'c')
broker, init = ContainerBroker.create_broker(tempdir, 0, 'a', 'c')
hsh = hash_path('a', 'c')
expected_path = os.path.join(
tempdir, 'containers', '0', hsh[-3:], hsh, hsh + '.db')
self.assertEqual(expected_path, broker.db_file)
self.assertTrue(os.path.isfile(expected_path))
self.assertTrue(init)
broker, init = ContainerBroker.create_broker(tempdir, 0, 'a', 'c')
self.assertEqual(expected_path, broker.db_file)
self.assertFalse(init)
ts = Timestamp.now()
broker = ContainerBroker.create_broker(tempdir, 0, 'a', 'c1',
broker, init = ContainerBroker.create_broker(tempdir, 0, 'a', 'c1',
put_timestamp=ts.internal)
hsh = hash_path('a', 'c1')
expected_path = os.path.join(
@ -3719,15 +3723,17 @@ class TestContainerBroker(unittest.TestCase):
self.assertTrue(os.path.isfile(expected_path))
self.assertEqual(ts.internal, broker.get_info()['put_timestamp'])
self.assertEqual(0, broker.get_info()['storage_policy_index'])
self.assertTrue(init)
epoch = Timestamp.now()
broker = ContainerBroker.create_broker(tempdir, 0, 'a', 'c3',
broker, init = ContainerBroker.create_broker(tempdir, 0, 'a', 'c3',
epoch=epoch)
hsh = hash_path('a', 'c3')
expected_path = os.path.join(
tempdir, 'containers', '0', hsh[-3:],
hsh, '%s_%s.db' % (hsh, epoch.internal))
self.assertEqual(expected_path, broker.db_file)
self.assertTrue(init)
@with_tempdir
def test_pending_file_name(self, tempdir):

View File

@ -883,6 +883,10 @@ class TestReplicatorSync(test_db_replicator.TestReplicatorSync):
daemon = self._run_once(node)
# push to remote, and third node was missing (also maybe reconciler)
self.assertTrue(2 < daemon.stats['rsync'] <= 3, daemon.stats['rsync'])
self.assertEqual(
1, self.logger.get_stats_counts().get('reconciler_db_created'))
self.assertFalse(
self.logger.get_stats_counts().get('reconciler_db_exists'))
# grab the rsynced instance of remote_broker
remote_broker = self._get_broker('a', 'c', node_index=1)
@ -902,7 +906,12 @@ class TestReplicatorSync(test_db_replicator.TestReplicatorSync):
# and we should have also enqueued these rows in a single reconciler,
# since we forced the object timestamps to be in the same hour.
self.logger.clear()
reconciler = daemon.get_reconciler_broker(misplaced[0]['created_at'])
self.assertFalse(
self.logger.get_stats_counts().get('reconciler_db_created'))
self.assertEqual(
1, self.logger.get_stats_counts().get('reconciler_db_exists'))
# but it may not be on the same node as us anymore though...
reconciler = self._get_broker(reconciler.account,
reconciler.container, node_index=0)

View File

@ -1254,11 +1254,16 @@ class TestSharder(BaseTestSharder):
self.assertFalse(sharder._cleave(broker))
expected = {'attempted': 1, 'success': 1, 'failure': 0,
'min_time': mock.ANY, 'max_time': mock.ANY}
'min_time': mock.ANY, 'max_time': mock.ANY,
'db_created': 1, 'db_exists': 0}
stats = self._assert_stats(expected, sharder, 'cleaved')
self.assertIsInstance(stats['min_time'], float)
self.assertIsInstance(stats['max_time'], float)
self.assertLessEqual(stats['min_time'], stats['max_time'])
self.assertEqual(
1, sharder.logger.get_stats_counts().get('cleaved_db_created'))
self.assertFalse(
sharder.logger.get_stats_counts().get('cleaved_db_exists'))
self.assertEqual(SHARDING, broker.get_db_state())
sharder._replicate_object.assert_called_once_with(
0, expected_shard_dbs[0], 0)
@ -1316,6 +1321,15 @@ class TestSharder(BaseTestSharder):
sharder._replicate_object.assert_called_once_with(
0, expected_shard_dbs[1], 0)
expected = {'attempted': 1, 'success': 0, 'failure': 1,
'min_time': mock.ANY, 'max_time': mock.ANY,
'db_created': 1, 'db_exists': 0}
self._assert_stats(expected, sharder, 'cleaved')
self.assertEqual(
1, sharder.logger.get_stats_counts().get('cleaved_db_created'))
self.assertFalse(
sharder.logger.get_stats_counts().get('cleaved_db_exists'))
# cleaving state is unchanged
updated_shard_ranges = broker.get_shard_ranges()
self.assertEqual(4, len(updated_shard_ranges))
@ -1344,11 +1358,16 @@ class TestSharder(BaseTestSharder):
self.assertFalse(sharder._cleave(broker))
expected = {'attempted': 2, 'success': 2, 'failure': 0,
'min_time': mock.ANY, 'max_time': mock.ANY}
'min_time': mock.ANY, 'max_time': mock.ANY,
'db_created': 1, 'db_exists': 1}
stats = self._assert_stats(expected, sharder, 'cleaved')
self.assertIsInstance(stats['min_time'], float)
self.assertIsInstance(stats['max_time'], float)
self.assertLessEqual(stats['min_time'], stats['max_time'])
self.assertEqual(
1, sharder.logger.get_stats_counts().get('cleaved_db_created'))
self.assertEqual(
1, sharder.logger.get_stats_counts().get('cleaved_db_exists'))
self.assertEqual(SHARDING, broker.get_db_state())
sharder._replicate_object.assert_has_calls(
@ -1405,11 +1424,16 @@ class TestSharder(BaseTestSharder):
self.assertFalse(sharder._cleave(broker))
expected = {'attempted': 1, 'success': 1, 'failure': 0,
'min_time': mock.ANY, 'max_time': mock.ANY}
'min_time': mock.ANY, 'max_time': mock.ANY,
'db_created': 1, 'db_exists': 0}
stats = self._assert_stats(expected, sharder, 'cleaved')
self.assertIsInstance(stats['min_time'], float)
self.assertIsInstance(stats['max_time'], float)
self.assertLessEqual(stats['min_time'], stats['max_time'])
self.assertEqual(
1, sharder.logger.get_stats_counts().get('cleaved_db_created'))
self.assertFalse(
sharder.logger.get_stats_counts().get('cleaved_db_exists'))
self.assertEqual(SHARDING, broker.get_db_state())
sharder._replicate_object.assert_called_once_with(
@ -1473,11 +1497,16 @@ class TestSharder(BaseTestSharder):
self.assertTrue(sharder._cleave(broker))
expected = {'attempted': 1, 'success': 1, 'failure': 0,
'min_time': mock.ANY, 'max_time': mock.ANY}
'min_time': mock.ANY, 'max_time': mock.ANY,
'db_created': 1, 'db_exists': 0}
stats = self._assert_stats(expected, sharder, 'cleaved')
self.assertIsInstance(stats['min_time'], float)
self.assertIsInstance(stats['max_time'], float)
self.assertLessEqual(stats['min_time'], stats['max_time'])
self.assertEqual(
1, sharder.logger.get_stats_counts().get('cleaved_db_created'))
self.assertFalse(
sharder.logger.get_stats_counts().get('cleaved_db_exists'))
sharder._replicate_object.assert_called_once_with(
0, expected_shard_dbs[4], 0)
@ -3308,7 +3337,8 @@ class TestSharder(BaseTestSharder):
sharder._move_misplaced_objects(broker)
sharder._replicate_object.assert_not_called()
expected_stats = {'attempted': 1, 'success': 1, 'failure': 0,
'found': 0, 'placed': 0, 'unplaced': 0}
'found': 0, 'placed': 0, 'unplaced': 0,
'db_created': 0, 'db_exists': 0}
self._assert_stats(expected_stats, sharder, 'misplaced')
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_found'))
@ -3316,6 +3346,10 @@ class TestSharder(BaseTestSharder):
sharder.logger.get_stats_counts().get('misplaced_placed'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_unplaced'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_created'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_exists'))
# sharding - no misplaced objects
self.assertTrue(broker.set_sharding_state())
@ -3329,6 +3363,10 @@ class TestSharder(BaseTestSharder):
sharder.logger.get_stats_counts().get('misplaced_placed'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_unplaced'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_created'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_exists'))
# pretend we cleaved up to end of second shard range
context = CleavingContext.load(broker)
@ -3344,6 +3382,10 @@ class TestSharder(BaseTestSharder):
sharder.logger.get_stats_counts().get('misplaced_placed'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_unplaced'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_created'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_exists'))
# sharding - misplaced objects
for obj in objects:
@ -3361,6 +3403,10 @@ class TestSharder(BaseTestSharder):
sharder.logger.get_stats_counts().get('misplaced_placed'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_unplaced'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_created'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_exists'))
self.assertFalse(os.path.exists(expected_shard_dbs[0]))
self.assertFalse(os.path.exists(expected_shard_dbs[1]))
self.assertFalse(os.path.exists(expected_shard_dbs[2]))
@ -3376,12 +3422,17 @@ class TestSharder(BaseTestSharder):
sharder._replicate_object.assert_called_once_with(
0, expected_shard_dbs[1], 0)
expected_stats = {'attempted': 1, 'success': 1, 'failure': 0,
'found': 1, 'placed': 2, 'unplaced': 0}
'found': 1, 'placed': 2, 'unplaced': 0,
'db_created': 1, 'db_exists': 0}
self._assert_stats(expected_stats, sharder, 'misplaced')
self.assertEqual(
1, sharder.logger.get_stats_counts()['misplaced_found'])
self.assertEqual(
2, sharder.logger.get_stats_counts()['misplaced_placed'])
self.assertEqual(
1, sharder.logger.get_stats_counts()['misplaced_db_created'])
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_exists'))
# check misplaced objects were moved
self._check_objects(objects[:2], expected_shard_dbs[1])
@ -3409,7 +3460,8 @@ class TestSharder(BaseTestSharder):
with self._mock_sharder(conf={'cleave_row_batch_size': 2}) as sharder:
sharder._move_misplaced_objects(broker)
expected_stats = {'attempted': 1, 'success': 1, 'failure': 0,
'found': 1, 'placed': 4, 'unplaced': 0}
'found': 1, 'placed': 4, 'unplaced': 0,
'db_created': 3, 'db_exists': 0}
self._assert_stats(expected_stats, sharder, 'misplaced')
sharder._replicate_object.assert_has_calls(
[mock.call(0, db, 0) for db in expected_shard_dbs[2:4]],
@ -3420,6 +3472,10 @@ class TestSharder(BaseTestSharder):
1, sharder.logger.get_stats_counts()['misplaced_found'])
self.assertEqual(
4, sharder.logger.get_stats_counts()['misplaced_placed'])
self.assertEqual(
3, sharder.logger.get_stats_counts()['misplaced_db_created'])
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_exists'))
# check misplaced objects were moved
self._check_objects(new_objects, expected_shard_dbs[0])
@ -3436,12 +3492,17 @@ class TestSharder(BaseTestSharder):
sharder._move_misplaced_objects(broker)
sharder._replicate_object.assert_not_called()
expected_stats = {'attempted': 1, 'success': 1, 'failure': 0,
'found': 0, 'placed': 0, 'unplaced': 0}
'found': 0, 'placed': 0, 'unplaced': 0,
'db_created': 0, 'db_exists': 0}
self._assert_stats(expected_stats, sharder, 'misplaced')
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_found'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_placed'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_created'))
self.assertFalse(
sharder.logger.get_stats_counts().get('misplaced_db_exists'))
# and then more misplaced updates arrive
newer_objects = [
@ -3462,13 +3523,21 @@ class TestSharder(BaseTestSharder):
for db in (expected_shard_dbs[0], expected_shard_dbs[-1])],
any_order=True
)
# shard broker for first shard range was already created but not
# removed due to mocked _replicate_object so expect one created and one
# existed db stat...
expected_stats = {'attempted': 1, 'success': 1, 'failure': 0,
'found': 1, 'placed': 3, 'unplaced': 0}
'found': 1, 'placed': 3, 'unplaced': 0,
'db_created': 1, 'db_exists': 1}
self._assert_stats(expected_stats, sharder, 'misplaced')
self.assertEqual(
1, sharder.logger.get_stats_counts()['misplaced_found'])
self.assertEqual(
3, sharder.logger.get_stats_counts()['misplaced_placed'])
self.assertEqual(
1, sharder.logger.get_stats_counts()['misplaced_db_created'])
self.assertEqual(
1, sharder.logger.get_stats_counts()['misplaced_db_exists'])
# check new misplaced objects were moved
self._check_objects(newer_objects[:1] + new_objects,