Merge "Add databases_per_second to db daemons"

Zuul 2018-10-31 07:58:38 +00:00 committed by Gerrit Code Review
commit 35c5f666de
7 changed files with 253 additions and 178 deletions

View File

@@ -1173,94 +1173,98 @@ ionice_priority None I/O scheduling priority of ser

[container-replicator]
**********************

==================== =========================== =============================
Option               Default                     Description
-------------------- --------------------------- -----------------------------
log_name             container-replicator        Label used when logging
log_facility         LOG_LOCAL0                  Syslog log facility
log_level            INFO                        Logging level
log_address          /dev/log                    Logging directory
per_diff             1000                        Maximum number of database
                                                 rows that will be sync'd in a
                                                 single HTTP replication
                                                 request. Databases with less
                                                 than or equal to this number
                                                 of differing rows will always
                                                 be sync'd using an HTTP
                                                 replication request rather
                                                 than using rsync.
max_diffs            100                         Maximum number of HTTP
                                                 replication requests attempted
                                                 on each replication pass for
                                                 any one container. This caps
                                                 how long the replicator will
                                                 spend trying to sync a given
                                                 database per pass so the other
                                                 databases don't get starved.
concurrency          8                           Number of replication workers
                                                 to spawn
interval             30                          Time in seconds to wait
                                                 between replication passes
databases_per_second 50                          Maximum databases to process
                                                 per second. Should be tuned
                                                 according to individual
                                                 system specs. 0 is unlimited.
node_timeout         10                          Request timeout to external
                                                 services
conn_timeout         0.5                         Connection timeout to external
                                                 services
reclaim_age          604800                      Time elapsed in seconds before
                                                 a container can be reclaimed
rsync_module         {replication_ip}::container Format of the rsync module
                                                 where the replicator will send
                                                 data. The configuration value
                                                 can include some variables
                                                 that will be extracted from
                                                 the ring. Variables must
                                                 follow the format {NAME} where
                                                 NAME is one of: ip, port,
                                                 replication_ip,
                                                 replication_port, region,
                                                 zone, device, meta. See
                                                 etc/rsyncd.conf-sample for
                                                 some examples.
rsync_compress       no                          Allow rsync to compress data
                                                 which is transmitted to
                                                 destination node during sync.
                                                 However, this is applicable
                                                 only when destination node is
                                                 in a different region than the
                                                 local one. NOTE: Objects that
                                                 are already compressed (for
                                                 example: .tar.gz, mp3) might
                                                 slow down the syncing process.
recon_cache_path     /var/cache/swift            Path to recon cache
nice_priority        None                        Scheduling priority of server
                                                 processes. Niceness values
                                                 range from -20 (most favorable
                                                 to the process) to 19 (least
                                                 favorable to the process).
                                                 The default does not modify
                                                 priority.
ionice_class         None                        I/O scheduling class of server
                                                 processes. I/O niceness class
                                                 values are
                                                 IOPRIO_CLASS_RT (realtime),
                                                 IOPRIO_CLASS_BE (best-effort),
                                                 and IOPRIO_CLASS_IDLE (idle).
                                                 The default does not modify
                                                 class and priority. Linux
                                                 supports io scheduling
                                                 priorities and classes since
                                                 2.6.13 with the CFQ io
                                                 scheduler.
                                                 Work only with ionice_priority.
ionice_priority      None                        I/O scheduling priority of
                                                 server processes. I/O niceness
                                                 priority is a number which goes
                                                 from 0 to 7.
                                                 The higher the value, the lower
                                                 the I/O priority of the process.
                                                 Work only with ionice_class.
                                                 Ignored if IOPRIO_CLASS_IDLE
                                                 is set.
==================== =========================== =============================
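At the default of 50 databases per second, the ratelimit alone puts a floor under how long a replication pass can take. A back-of-envelope example (the database count here is an illustrative assumption, not a figure from the patch):

    # Rough lower bound the new option imposes on one replication pass.
    # n_databases is a made-up example figure for a single node.
    n_databases = 100000
    databases_per_second = 50
    min_pass_seconds = n_databases / float(databases_per_second)
    print(min_pass_seconds)  # 2000.0 seconds, i.e. about 33 minutes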
*******************
[container-updater]
@@ -1524,89 +1528,93 @@ ionice_priority None I/O scheduling priority of server

[account-replicator]
********************

==================== ========================= ===============================
Option               Default                   Description
-------------------- ------------------------- -------------------------------
log_name             account-replicator        Label used when logging
log_facility         LOG_LOCAL0                Syslog log facility
log_level            INFO                      Logging level
log_address          /dev/log                  Logging directory
per_diff             1000                      Maximum number of database rows
                                               that will be sync'd in a single
                                               HTTP replication request.
                                               Databases with less than or
                                               equal to this number of
                                               differing rows will always be
                                               sync'd using an HTTP replication
                                               request rather than using rsync.
max_diffs            100                       Maximum number of HTTP
                                               replication requests attempted
                                               on each replication pass for any
                                               one container. This caps how
                                               long the replicator will spend
                                               trying to sync a given database
                                               per pass so the other databases
                                               don't get starved.
concurrency          8                         Number of replication workers
                                               to spawn
interval             30                        Time in seconds to wait between
                                               replication passes
databases_per_second 50                        Maximum databases to process
                                               per second. Should be tuned
                                               according to individual
                                               system specs. 0 is unlimited.
node_timeout         10                        Request timeout to external
                                               services
conn_timeout         0.5                       Connection timeout to external
                                               services
reclaim_age          604800                    Time elapsed in seconds before
                                               an account can be reclaimed
rsync_module         {replication_ip}::account Format of the rsync module where
                                               the replicator will send data.
                                               The configuration value can
                                               include some variables that will
                                               be extracted from the ring.
                                               Variables must follow the format
                                               {NAME} where NAME is one of: ip,
                                               port, replication_ip,
                                               replication_port, region, zone,
                                               device, meta. See
                                               etc/rsyncd.conf-sample for some
                                               examples.
rsync_compress       no                        Allow rsync to compress data
                                               which is transmitted to
                                               destination node during sync.
                                               However, this is applicable only
                                               when destination node is in a
                                               different region than the local
                                               one. NOTE: Objects that are
                                               already compressed (for example:
                                               .tar.gz, mp3) might slow down
                                               the syncing process.
recon_cache_path     /var/cache/swift          Path to recon cache
nice_priority        None                      Scheduling priority of server
                                               processes. Niceness values
                                               range from -20 (most favorable
                                               to the process) to 19 (least
                                               favorable to the process).
                                               The default does not modify
                                               priority.
ionice_class         None                      I/O scheduling class of server
                                               processes. I/O niceness class
                                               values are IOPRIO_CLASS_RT
                                               (realtime), IOPRIO_CLASS_BE
                                               (best-effort), and IOPRIO_CLASS_IDLE
                                               (idle).
                                               The default does not modify
                                               class and priority. Linux supports
                                               io scheduling priorities and classes
                                               since 2.6.13 with the CFQ io scheduler.
                                               Work only with ionice_priority.
ionice_priority      None                      I/O scheduling priority of server
                                               processes. I/O niceness priority
                                               is a number which goes from 0 to 7.
                                               The higher the value, the lower
                                               the I/O priority of the process.
                                               Work only with ionice_class.
                                               Ignored if IOPRIO_CLASS_IDLE
                                               is set.
==================== ========================= ===============================
*****************
[account-auditor]

View File

@@ -143,6 +143,9 @@ use = egg:swift#recon
 # run_pause is deprecated, use interval instead
 # run_pause = 30
 #
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # node_timeout = 10
 # conn_timeout = 0.5
 #

View File

@@ -156,6 +156,9 @@ use = egg:swift#recon
 # run_pause is deprecated, use interval instead
 # run_pause = 30
 #
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # node_timeout = 10
 # conn_timeout = 0.5
 #
@@ -436,6 +439,9 @@ use = egg:swift#xprofile
 # Time in seconds to wait between sharder cycles
 # interval = 30
 #
+# Process at most this many databases per second
+# databases_per_second = 50
+#
 # The container-sharder accepts the following configuration options as defined
 # in the container-replicator section:
 #

View File

@@ -33,7 +33,7 @@ from swift.common.utils import get_logger, whataremyips, storage_directory, \
     renamer, mkdirs, lock_parent_directory, config_true_value, \
     unlink_older_than, dump_recon_cache, rsync_module_interpolation, \
     json, parse_override_options, round_robin_iter, Everything, get_db_files, \
-    parse_db_filename, quote
+    parse_db_filename, quote, RateLimitedIterator
 from swift.common import ring
 from swift.common.ring.utils import is_local_device
 from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \
@@ -204,6 +204,8 @@ class Replicator(Daemon):
                                 ' to use option %(type)s-replicator/'
                                 'interval.'
                                 % {'type': self.server_type})
+        self.databases_per_second = int(
+            conf.get('databases_per_second', 50))
         self.node_timeout = float(conf.get('node_timeout', 10))
         self.conn_timeout = float(conf.get('conn_timeout', 0.5))
         self.rsync_compress = config_true_value(
@@ -733,6 +735,11 @@ class Replicator(Daemon):
     def report_up_to_date(self, full_info):
         return True
 
+    def roundrobin_datadirs(self, dirs):
+        return RateLimitedIterator(
+            roundrobin_datadirs(dirs),
+            elements_per_second=self.databases_per_second)
+
     def run_once(self, *args, **kwargs):
         """Run a replication pass once."""
         override_options = parse_override_options(once=True, **kwargs)

@@ -789,7 +796,7 @@ class Replicator(Daemon):
                               "file, not replicating",
                               ", ".join(ips), self.port)
         self.logger.info(_('Beginning replication run'))
-        for part, object_file, node_id in roundrobin_datadirs(dirs):
+        for part, object_file, node_id in self.roundrobin_datadirs(dirs):
             self.cpool.spawn_n(
                 self._replicate_object, part, object_file, node_id)
         self.cpool.waitall()
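The override delegates to swift.common.utils.RateLimitedIterator, which paces an iterator to a target rate before its items reach the worker pool. A minimal sketch of the pacing idea (an illustration only, not Swift's implementation, which cooperates with eventlet's event loop rather than blocking):

    import time

    def rate_limited(iterable, elements_per_second):
        # Yield items at no more than elements_per_second. A rate of 0 is
        # treated as unlimited here, matching the documented behaviour of
        # databases_per_second.
        pause = 1.0 / elements_per_second if elements_per_second > 0 else 0.0
        for item in iterable:
            yield item
            if pause:
                time.sleep(pause)

Because the throttle wraps the iterator that feeds self.cpool, a single knob caps overall database throughput regardless of how many workers the concurrency option spawns.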

View File

@@ -23,7 +23,7 @@ import os
 import six
 from eventlet import Timeout
 
-from swift.common import internal_client, db_replicator
+from swift.common import internal_client
 from swift.common.constraints import check_drive
 from swift.common.direct_client import (direct_put_container,
                                         DirectClientException)

@@ -1500,7 +1500,7 @@ class ContainerSharder(ContainerReplicator):
                 dirs.append((datadir, node, part_filt))
         if not dirs:
             self.logger.warning('Found no data dirs!')
-        for part, path, node in db_replicator.roundrobin_datadirs(dirs):
+        for part, path, node in self.roundrobin_datadirs(dirs):
             # NB: get_part_nodes always provides an 'index' key;
             # this will be used in leader selection
             for primary in self.ring.get_part_nodes(int(part)):
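Dropping the db_replicator import works because ContainerSharder subclasses ContainerReplicator, which subclasses Replicator, so self.roundrobin_datadirs resolves to the rate-limited wrapper added above. A stub sketch of that chain (class names from the patch; the bodies are illustrative placeholders, not Swift's code):

    class Replicator(object):
        def roundrobin_datadirs(self, dirs):
            # in the patch this wraps the module-level iterator
            # in a RateLimitedIterator
            return iter(())

    class ContainerReplicator(Replicator):
        pass

    class ContainerSharder(ContainerReplicator):
        pass  # sharding cycles inherit the same ratelimit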

View File

@@ -321,6 +321,7 @@ class TestDBReplicator(unittest.TestCase):
         # later config should be extended to assert more config options
         replicator = TestReplicator({'node_timeout': '3.5'})
         self.assertEqual(replicator.node_timeout, 3.5)
+        self.assertEqual(replicator.databases_per_second, 50)
 
     def test_repl_connection(self):
         node = {'replication_ip': '127.0.0.1', 'replication_port': 80,

View File

@@ -128,6 +128,7 @@ class TestSharder(BaseTestSharder):
         expected = {
             'mount_check': True, 'bind_ip': '0.0.0.0', 'port': 6201,
             'per_diff': 1000, 'max_diffs': 100, 'interval': 30,
+            'databases_per_second': 50,
             'cleave_row_batch_size': 10000,
             'node_timeout': 10, 'conn_timeout': 5,
             'rsync_compress': False,

@@ -154,6 +155,7 @@ class TestSharder(BaseTestSharder):
         conf = {
             'mount_check': False, 'bind_ip': '10.11.12.13', 'bind_port': 62010,
             'per_diff': 2000, 'max_diffs': 200, 'interval': 60,
+            'databases_per_second': 5,
             'cleave_row_batch_size': 3000,
             'node_timeout': 20, 'conn_timeout': 1,
             'rsync_compress': True,

@@ -176,6 +178,7 @@ class TestSharder(BaseTestSharder):
         expected = {
             'mount_check': False, 'bind_ip': '10.11.12.13', 'port': 62010,
             'per_diff': 2000, 'max_diffs': 200, 'interval': 60,
+            'databases_per_second': 5,
             'cleave_row_batch_size': 3000,
             'node_timeout': 20, 'conn_timeout': 1,
             'rsync_compress': True,
@@ -485,7 +488,7 @@ class TestSharder(BaseTestSharder):
                                0, 'text/plain', 'etag', 0)
 
         # check only sharding enabled containers are processed
-        with mock.patch.object(
+        with mock.patch('eventlet.sleep'), mock.patch.object(
                 sharder, '_process_broker'
         ) as mock_process_broker:
             sharder._local_device_ids = {'stale_node_id'}

@@ -539,7 +542,7 @@ class TestSharder(BaseTestSharder):
                             "for %s" % broker.path)
 
         # check exceptions are handled
-        with mock.patch.object(
+        with mock.patch('eventlet.sleep'), mock.patch.object(
                 sharder, '_process_broker', side_effect=mock_processing
         ) as mock_process_broker:
             sharder._local_device_ids = {'stale_node_id'}

@@ -593,7 +596,7 @@ class TestSharder(BaseTestSharder):
         for i in range(10):
             brokers[1].delete_object(
                 'o%s' % i, next(self.ts_iter).internal)
-        with mock.patch.object(
+        with mock.patch('eventlet.sleep'), mock.patch.object(
                 sharder, '_process_broker'
         ) as mock_process_broker:
             sharder._local_device_ids = {999}
@@ -612,6 +615,53 @@ class TestSharder(BaseTestSharder):
             expected_candidate_stats, sharder, 'sharding_candidates')
         self._assert_recon_stats(None, sharder, 'sharding_progress')
 
+    def test_ratelimited_roundrobin(self):
+        n_databases = 100
+
+        def stub_iter(dirs):
+            for i in range(n_databases):
+                yield i, '/srv/node/sda/path/to/container.db', {}
+
+        now = time.time()
+        clock = {
+            'sleeps': [],
+            'now': now,
+        }
+
+        def fake_sleep(t):
+            clock['sleeps'].append(t)
+            clock['now'] += t
+
+        def fake_time():
+            return clock['now']
+
+        with self._mock_sharder({'databases_per_second': 1}) as sharder, \
+                mock.patch('swift.common.db_replicator.roundrobin_datadirs',
+                           stub_iter), \
+                mock.patch('time.time', fake_time), \
+                mock.patch('eventlet.sleep', fake_sleep):
+            list(sharder.roundrobin_datadirs(None))
+        # 100 db at 1/s should take ~100s
+        run_time = sum(clock['sleeps'])
+        self.assertTrue(97 <= run_time < 100, 'took %s' % run_time)
+
+        n_databases = 1000
+        now = time.time()
+        clock = {
+            'sleeps': [],
+            'now': now,
+        }
+        with self._mock_sharder({'databases_per_second': 50}) as sharder, \
+                mock.patch('swift.common.db_replicator.roundrobin_datadirs',
+                           stub_iter), \
+                mock.patch('time.time', fake_time), \
+                mock.patch('eventlet.sleep', fake_sleep):
+            list(sharder.roundrobin_datadirs(None))
+        # 1000 db at 50/s
+        run_time = sum(clock['sleeps'])
+        self.assertTrue(18 <= run_time < 20, 'took %s' % run_time)
+
     @contextmanager
     def _mock_sharder(self, conf=None, replicas=3):
         conf = conf or {}
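The asserted windows in the new test follow from the pacing arithmetic: n databases at r per second cost roughly (n - 1) / r seconds of sleep, since no pause is needed before the first element. A quick check of both cases using the test's own numbers:

    # ~99s for 100 db at 1/s; ~19.98s for 1000 db at 50/s --
    # inside the asserted [97, 100) and [18, 20) windows.
    for n, rate, lo, hi in [(100, 1, 97, 100), (1000, 50, 18, 20)]:
        expected = (n - 1) / float(rate)
        assert lo <= expected < hi, (n, rate, expected)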