Merge "Add databases_per_second to db daemons"

Zuul 2018-10-31 07:58:38 +00:00 committed by Gerrit Code Review
commit 35c5f666de
7 changed files with 253 additions and 178 deletions

View File

@@ -1173,94 +1173,98 @@ ionice_priority None I/O scheduling priority of ser

[container-replicator]
**********************

==================== =========================== =============================
Option               Default                     Description
-------------------- --------------------------- -----------------------------
log_name             container-replicator        Label used when logging
log_facility         LOG_LOCAL0                  Syslog log facility
log_level            INFO                        Logging level
log_address          /dev/log                    Logging directory
per_diff             1000                        Maximum number of database
                                                 rows that will be sync'd in a
                                                 single HTTP replication
                                                 request. Databases with less
                                                 than or equal to this number
                                                 of differing rows will always
                                                 be sync'd using an HTTP
                                                 replication request rather
                                                 than using rsync.
max_diffs            100                         Maximum number of HTTP
                                                 replication requests attempted
                                                 on each replication pass for
                                                 any one container. This caps
                                                 how long the replicator will
                                                 spend trying to sync a given
                                                 database per pass so the other
                                                 databases don't get starved.
concurrency          8                           Number of replication workers
                                                 to spawn
interval             30                          Time in seconds to wait
                                                 between replication passes
databases_per_second 50                          Maximum databases to process
                                                 per second. Should be tuned
                                                 according to individual
                                                 system specs. 0 is unlimited.
node_timeout         10                          Request timeout to external
                                                 services
conn_timeout         0.5                         Connection timeout to external
                                                 services
reclaim_age          604800                      Time elapsed in seconds before
                                                 a container can be reclaimed
rsync_module         {replication_ip}::container Format of the rsync module
                                                 where the replicator will send
                                                 data. The configuration value
                                                 can include some variables
                                                 that will be extracted from
                                                 the ring. Variables must
                                                 follow the format {NAME} where
                                                 NAME is one of: ip, port,
                                                 replication_ip,
                                                 replication_port, region,
                                                 zone, device, meta. See
                                                 etc/rsyncd.conf-sample for
                                                 some examples.
rsync_compress       no                          Allow rsync to compress data
                                                 which is transmitted to
                                                 destination node during sync.
                                                 However, this is applicable
                                                 only when destination node is
                                                 in a different region than the
                                                 local one. NOTE: Objects that
                                                 are already compressed (for
                                                 example: .tar.gz, mp3) might
                                                 slow down the syncing process.
recon_cache_path     /var/cache/swift            Path to recon cache
nice_priority        None                        Scheduling priority of server
                                                 processes. Niceness values
                                                 range from -20 (most favorable
                                                 to the process) to 19 (least
                                                 favorable to the process).
                                                 The default does not modify
                                                 priority.
ionice_class         None                        I/O scheduling class of server
                                                 processes. I/O niceness class
                                                 values are
                                                 IOPRIO_CLASS_RT (realtime),
                                                 IOPRIO_CLASS_BE (best-effort),
                                                 and IOPRIO_CLASS_IDLE (idle).
                                                 The default does not modify
                                                 class and priority. Linux
                                                 supports io scheduling
                                                 priorities and classes since
                                                 2.6.13 with the CFQ io
                                                 scheduler.
                                                 Work only with ionice_priority.
ionice_priority      None                        I/O scheduling priority of
                                                 server processes. I/O niceness
                                                 priority is a number which goes
                                                 from 0 to 7.
                                                 The higher the value, the lower
                                                 the I/O priority of the process.
                                                 Work only with ionice_class.
                                                 Ignored if IOPRIO_CLASS_IDLE
                                                 is set.
==================== =========================== =============================
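At the default of 50 databases per second, the ratelimit alone puts a floor under how long a replication pass can take. A back-of-envelope example (the database count here is an illustrative assumption, not a figure from the patch):

    # Rough lower bound the new option imposes on one replication pass.
    # n_databases is a made-up example figure for a single node.
    n_databases = 100000
    databases_per_second = 50
    min_pass_seconds = n_databases / float(databases_per_second)
    print(min_pass_seconds)  # 2000.0 seconds, i.e. about 33 minutes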
*******************
[container-updater]
@@ -1524,89 +1528,93 @@ ionice_priority None I/O scheduling priority of server

[account-replicator]
********************

==================== ========================= ===============================
Option               Default                   Description
-------------------- ------------------------- -------------------------------
log_name             account-replicator        Label used when logging
log_facility         LOG_LOCAL0                Syslog log facility
log_level            INFO                      Logging level
log_address          /dev/log                  Logging directory
per_diff             1000                      Maximum number of database rows
                                               that will be sync'd in a single
                                               HTTP replication request.
                                               Databases with less than or
                                               equal to this number of
                                               differing rows will always be
                                               sync'd using an HTTP replication
                                               request rather than using rsync.
max_diffs            100                       Maximum number of HTTP
                                               replication requests attempted
                                               on each replication pass for any
                                               one container. This caps how
                                               long the replicator will spend
                                               trying to sync a given database
                                               per pass so the other databases
                                               don't get starved.
concurrency          8                         Number of replication workers
                                               to spawn
interval             30                        Time in seconds to wait between
                                               replication passes
databases_per_second 50                        Maximum databases to process
                                               per second. Should be tuned
                                               according to individual
                                               system specs. 0 is unlimited.
node_timeout         10                        Request timeout to external
                                               services
conn_timeout         0.5                       Connection timeout to external
                                               services
reclaim_age          604800                    Time elapsed in seconds before
                                               an account can be reclaimed
rsync_module         {replication_ip}::account Format of the rsync module where
                                               the replicator will send data.
                                               The configuration value can
                                               include some variables that will
                                               be extracted from the ring.
                                               Variables must follow the format
                                               {NAME} where NAME is one of: ip,
                                               port, replication_ip,
                                               replication_port, region, zone,
                                               device, meta. See
                                               etc/rsyncd.conf-sample for some
                                               examples.
rsync_compress       no                        Allow rsync to compress data
                                               which is transmitted to
                                               destination node during sync.
                                               However, this is applicable only
                                               when destination node is in a
                                               different region than the local
                                               one. NOTE: Objects that are
                                               already compressed (for example:
                                               .tar.gz, mp3) might slow down
                                               the syncing process.
recon_cache_path     /var/cache/swift          Path to recon cache
nice_priority        None                      Scheduling priority of server
                                               processes. Niceness values
                                               range from -20 (most favorable
                                               to the process) to 19 (least
                                               favorable to the process).
                                               The default does not modify
                                               priority.
ionice_class         None                      I/O scheduling class of server
                                               processes. I/O niceness class
                                               values are IOPRIO_CLASS_RT
                                               (realtime), IOPRIO_CLASS_BE
                                               (best-effort), and IOPRIO_CLASS_IDLE
                                               (idle).
                                               The default does not modify
                                               class and priority. Linux supports
                                               io scheduling priorities and classes
                                               since 2.6.13 with the CFQ io scheduler.
                                               Work only with ionice_priority.
ionice_priority      None                      I/O scheduling priority of server
                                               processes. I/O niceness priority
                                               is a number which goes from 0 to 7.
                                               The higher the value, the lower
                                               the I/O priority of the process.
                                               Work only with ionice_class.
                                               Ignored if IOPRIO_CLASS_IDLE
                                               is set.
==================== ========================= ===============================
*****************
[account-auditor]

View File

@@ -143,6 +143,9 @@ use = egg:swift#recon
 # run_pause is deprecated, use interval instead
 # run_pause = 30
 #
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # node_timeout = 10
 # conn_timeout = 0.5
 #

View File

@@ -156,6 +156,9 @@ use = egg:swift#recon
 # run_pause is deprecated, use interval instead
 # run_pause = 30
 #
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # node_timeout = 10
 # conn_timeout = 0.5
 #
@@ -436,6 +439,9 @@ use = egg:swift#xprofile
 # Time in seconds to wait between sharder cycles
 # interval = 30
 #
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # The container-sharder accepts the following configuration options as defined
 # in the container-replicator section:
 #

View File

@@ -33,7 +33,7 @@ from swift.common.utils import get_logger, whataremyips, storage_directory, \
     renamer, mkdirs, lock_parent_directory, config_true_value, \
     unlink_older_than, dump_recon_cache, rsync_module_interpolation, \
     json, parse_override_options, round_robin_iter, Everything, get_db_files, \
-    parse_db_filename, quote
+    parse_db_filename, quote, RateLimitedIterator
 from swift.common import ring
 from swift.common.ring.utils import is_local_device
 from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \
@@ -204,6 +204,8 @@ class Replicator(Daemon):
                                 ' to use option %(type)s-replicator/'
                                 'interval.'
                                 % {'type': self.server_type})
+        self.databases_per_second = int(
+            conf.get('databases_per_second', 50))
         self.node_timeout = float(conf.get('node_timeout', 10))
         self.conn_timeout = float(conf.get('conn_timeout', 0.5))
         self.rsync_compress = config_true_value(
@@ -733,6 +735,11 @@ class Replicator(Daemon):
     def report_up_to_date(self, full_info):
         return True
 
+    def roundrobin_datadirs(self, dirs):
+        return RateLimitedIterator(
+            roundrobin_datadirs(dirs),
+            elements_per_second=self.databases_per_second)
+
     def run_once(self, *args, **kwargs):
         """Run a replication pass once."""
         override_options = parse_override_options(once=True, **kwargs)

@@ -789,7 +796,7 @@ class Replicator(Daemon):
                               "file, not replicating",
                               ", ".join(ips), self.port)
         self.logger.info(_('Beginning replication run'))
-        for part, object_file, node_id in roundrobin_datadirs(dirs):
+        for part, object_file, node_id in self.roundrobin_datadirs(dirs):
             self.cpool.spawn_n(
                 self._replicate_object, part, object_file, node_id)
         self.cpool.waitall()
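The override delegates to swift.common.utils.RateLimitedIterator, which paces an iterator to a target rate before its items reach the worker pool. A minimal sketch of the pacing idea (an illustration only, not Swift's implementation, which cooperates with eventlet's event loop rather than blocking):

    import time

    def rate_limited(iterable, elements_per_second):
        # Yield items at no more than elements_per_second. A rate of 0 is
        # treated as unlimited here, matching the documented behaviour of
        # databases_per_second.
        pause = 1.0 / elements_per_second if elements_per_second > 0 else 0.0
        for item in iterable:
            yield item
            if pause:
                time.sleep(pause)

Because the throttle wraps the iterator that feeds self.cpool, a single knob caps overall database throughput regardless of how many workers the concurrency option spawns.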

View File

@@ -23,7 +23,7 @@ import os
 import six
 from eventlet import Timeout
 
-from swift.common import internal_client, db_replicator
+from swift.common import internal_client
 from swift.common.constraints import check_drive
 from swift.common.direct_client import (direct_put_container,
                                         DirectClientException)

@@ -1500,7 +1500,7 @@ class ContainerSharder(ContainerReplicator):
                 dirs.append((datadir, node, part_filt))
         if not dirs:
             self.logger.warning('Found no data dirs!')
-        for part, path, node in db_replicator.roundrobin_datadirs(dirs):
+        for part, path, node in self.roundrobin_datadirs(dirs):
             # NB: get_part_nodes always provides an 'index' key;
             # this will be used in leader selection
             for primary in self.ring.get_part_nodes(int(part)):
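Dropping the db_replicator import works because ContainerSharder subclasses ContainerReplicator, which subclasses Replicator, so self.roundrobin_datadirs resolves to the rate-limited wrapper added above. A stub sketch of that chain (class names from the patch; the bodies are illustrative placeholders, not Swift's code):

    class Replicator(object):
        def roundrobin_datadirs(self, dirs):
            # in the patch this wraps the module-level iterator
            # in a RateLimitedIterator
            return iter(())

    class ContainerReplicator(Replicator):
        pass

    class ContainerSharder(ContainerReplicator):
        pass  # sharding cycles inherit the same ratelimit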

View File

@@ -321,6 +321,7 @@ class TestDBReplicator(unittest.TestCase):
         # later config should be extended to assert more config options
         replicator = TestReplicator({'node_timeout': '3.5'})
         self.assertEqual(replicator.node_timeout, 3.5)
+        self.assertEqual(replicator.databases_per_second, 50)
 
     def test_repl_connection(self):
         node = {'replication_ip': '127.0.0.1', 'replication_port': 80,

View File

@@ -128,6 +128,7 @@ class TestSharder(BaseTestSharder):
         expected = {
             'mount_check': True, 'bind_ip': '0.0.0.0', 'port': 6201,
             'per_diff': 1000, 'max_diffs': 100, 'interval': 30,
+            'databases_per_second': 50,
             'cleave_row_batch_size': 10000,
             'node_timeout': 10, 'conn_timeout': 5,
             'rsync_compress': False,

@@ -154,6 +155,7 @@ class TestSharder(BaseTestSharder):
         conf = {
             'mount_check': False, 'bind_ip': '10.11.12.13', 'bind_port': 62010,
             'per_diff': 2000, 'max_diffs': 200, 'interval': 60,
+            'databases_per_second': 5,
             'cleave_row_batch_size': 3000,
             'node_timeout': 20, 'conn_timeout': 1,
             'rsync_compress': True,

@@ -176,6 +178,7 @@ class TestSharder(BaseTestSharder):
         expected = {
             'mount_check': False, 'bind_ip': '10.11.12.13', 'port': 62010,
             'per_diff': 2000, 'max_diffs': 200, 'interval': 60,
+            'databases_per_second': 5,
             'cleave_row_batch_size': 3000,
             'node_timeout': 20, 'conn_timeout': 1,
             'rsync_compress': True,
@@ -485,7 +488,7 @@ class TestSharder(BaseTestSharder):
                                0, 'text/plain', 'etag', 0)
 
         # check only sharding enabled containers are processed
-        with mock.patch.object(
+        with mock.patch('eventlet.sleep'), mock.patch.object(
                 sharder, '_process_broker'
         ) as mock_process_broker:
             sharder._local_device_ids = {'stale_node_id'}

@@ -539,7 +542,7 @@ class TestSharder(BaseTestSharder):
                             "for %s" % broker.path)
 
         # check exceptions are handled
-        with mock.patch.object(
+        with mock.patch('eventlet.sleep'), mock.patch.object(
                 sharder, '_process_broker', side_effect=mock_processing
         ) as mock_process_broker:
             sharder._local_device_ids = {'stale_node_id'}

@@ -593,7 +596,7 @@ class TestSharder(BaseTestSharder):
         for i in range(10):
             brokers[1].delete_object(
                 'o%s' % i, next(self.ts_iter).internal)
-        with mock.patch.object(
+        with mock.patch('eventlet.sleep'), mock.patch.object(
                 sharder, '_process_broker'
         ) as mock_process_broker:
             sharder._local_device_ids = {999}
@@ -612,6 +615,53 @@ class TestSharder(BaseTestSharder):
             expected_candidate_stats, sharder, 'sharding_candidates')
         self._assert_recon_stats(None, sharder, 'sharding_progress')
 
+    def test_ratelimited_roundrobin(self):
+        n_databases = 100
+
+        def stub_iter(dirs):
+            for i in range(n_databases):
+                yield i, '/srv/node/sda/path/to/container.db', {}
+
+        now = time.time()
+        clock = {
+            'sleeps': [],
+            'now': now,
+        }
+
+        def fake_sleep(t):
+            clock['sleeps'].append(t)
+            clock['now'] += t
+
+        def fake_time():
+            return clock['now']
+
+        with self._mock_sharder({'databases_per_second': 1}) as sharder, \
+                mock.patch('swift.common.db_replicator.roundrobin_datadirs',
+                           stub_iter), \
+                mock.patch('time.time', fake_time), \
+                mock.patch('eventlet.sleep', fake_sleep):
+            list(sharder.roundrobin_datadirs(None))
+        # 100 db at 1/s should take ~100s
+        run_time = sum(clock['sleeps'])
+        self.assertTrue(97 <= run_time < 100, 'took %s' % run_time)
+
+        n_databases = 1000
+        now = time.time()
+        clock = {
+            'sleeps': [],
+            'now': now,
+        }
+        with self._mock_sharder({'databases_per_second': 50}) as sharder, \
+                mock.patch('swift.common.db_replicator.roundrobin_datadirs',
+                           stub_iter), \
+                mock.patch('time.time', fake_time), \
+                mock.patch('eventlet.sleep', fake_sleep):
+            list(sharder.roundrobin_datadirs(None))
+        # 1000 db at 50/s
+        run_time = sum(clock['sleeps'])
+        self.assertTrue(18 <= run_time < 20, 'took %s' % run_time)
+
     @contextmanager
     def _mock_sharder(self, conf=None, replicas=3):
         conf = conf or {}
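The asserted windows in the new test follow from the pacing arithmetic: n databases at r per second cost roughly (n - 1) / r seconds of sleep, since no pause is needed before the first element. A quick check of both cases using the test's own numbers:

    # ~99s for 100 db at 1/s; ~19.98s for 1000 db at 50/s --
    # inside the asserted [97, 100) and [18, 20) windows.
    for n, rate, lo, hi in [(100, 1, 97, 100), (1000, 50, 18, 20)]:
        expected = (n - 1) / float(rate)
        assert lo <= expected < hi, (n, rate, expected)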