diff --git a/bin/swift-container-sharder b/bin/swift-container-sharder new file mode 100755 index 0000000000..3e6551319b --- /dev/null +++ b/bin/swift-container-sharder @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# Copyright (c) 2010-2015 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from swift.container.sharder import ContainerSharder +from swift.common.utils import parse_options +from swift.common.daemon import run_daemon +from optparse import OptionParser + +if __name__ == '__main__': + parser = OptionParser("%prog CONFIG [options]") + parser.add_option('-d', '--devices', + help='Shard containers only on given devices. ' + 'Comma-separated list. ' + 'Only has effect if --once is used.') + parser.add_option('-p', '--partitions', + help='Shard containers only in given partitions. ' + 'Comma-separated list. ' + 'Only has effect if --once is used.') + conf_file, options = parse_options(parser=parser, once=True) + run_daemon(ContainerSharder, conf_file, **options) diff --git a/doc/saio/swift/container-server/1.conf b/doc/saio/swift/container-server/1.conf index 5bf3c0f28c..e71a5b6683 100644 --- a/doc/saio/swift/container-server/1.conf +++ b/doc/saio/swift/container-server/1.conf @@ -27,3 +27,13 @@ rsync_module = {replication_ip}::container{replication_port} [container-auditor] [container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/container-server/2.conf b/doc/saio/swift/container-server/2.conf index 0b29ada029..86e58a9fde 100644 --- a/doc/saio/swift/container-server/2.conf +++ b/doc/saio/swift/container-server/2.conf @@ -27,3 +27,13 @@ rsync_module = {replication_ip}::container{replication_port} [container-auditor] [container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/container-server/3.conf b/doc/saio/swift/container-server/3.conf index 9f340d07e6..73e760af15 100644 --- a/doc/saio/swift/container-server/3.conf +++ b/doc/saio/swift/container-server/3.conf @@ -27,3 +27,13 @@ rsync_module = {replication_ip}::container{replication_port} [container-auditor] [container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make 
explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/container-server/4.conf b/doc/saio/swift/container-server/4.conf index 5e95e9c57c..c254191b8f 100644 --- a/doc/saio/swift/container-server/4.conf +++ b/doc/saio/swift/container-server/4.conf @@ -27,3 +27,13 @@ rsync_module = {replication_ip}::container{replication_port} [container-auditor] [container-sync] + +[container-sharder] +auto_shard = true +rsync_module = {replication_ip}::container{replication_port} +# This is intentionally much smaller than the default of 1,000,000 so tests +# can run in a reasonable amount of time +shard_container_threshold = 100 +# The probe tests make explicit assumptions about the batch sizes +shard_scanner_batch_size = 10 +cleave_batch_size = 2 diff --git a/doc/saio/swift/internal-client.conf b/doc/saio/swift/internal-client.conf new file mode 100644 index 0000000000..052d1e7549 --- /dev/null +++ b/doc/saio/swift/internal-client.conf @@ -0,0 +1,24 @@ +[DEFAULT] + +[pipeline:main] +pipeline = catch_errors proxy-logging cache symlink proxy-server + +[app:proxy-server] +use = egg:swift#proxy +account_autocreate = true +# See proxy-server.conf-sample for options + +[filter:symlink] +use = egg:swift#symlink +# See proxy-server.conf-sample for options + +[filter:cache] +use = egg:swift#memcache +# See proxy-server.conf-sample for options + +[filter:proxy-logging] +use = egg:swift#proxy_logging + +[filter:catch_errors] +use = egg:swift#catch_errors +# See proxy-server.conf-sample for options diff --git a/etc/container-server.conf-sample b/etc/container-server.conf-sample index 4059e39418..7d38deb0c5 100644 --- a/etc/container-server.conf-sample +++ b/etc/container-server.conf-sample @@ -69,6 +69,10 @@ bind_port = 6201 # Work only with ionice_class. # ionice_class = # ionice_priority = +# +# The prefix used for hidden auto-created accounts, for example accounts in +# which shard containers are created. Defaults to '.'. +# auto_create_account_prefix = . [pipeline:main] pipeline = healthcheck recon container-server @@ -323,3 +327,117 @@ use = egg:swift#xprofile # # unwind the iterator of applications # unwind = false + +[container-sharder] +# You can override the default log routing for this app here (don't use set!): +# log_name = container-sharder +# log_facility = LOG_LOCAL0 +# log_level = INFO +# log_address = /dev/log +# +# Container sharder specific settings +# +# If the auto_shard option is true then the sharder will automatically select +# containers to shard, scan for shard ranges, and select shards to shrink. +# The default is false. +# Warning: auto-sharding is still under development and should not be used in +# production; do not set this option to true in a production cluster. +# auto_shard = false +# +# When auto-sharding is enabled shard_container_threshold defines the object +# count at which a container with container-sharding enabled will start to +# shard. shard_container_threshold also indirectly determines the initial +# nominal size of shard containers, which is shard_container_threshold // 2, as +# well as determining the thresholds for shrinking and merging shard +# containers. +# shard_container_threshold = 1000000 +# +# When auto-sharding is enabled shard_shrink_point defines the object count +# below which a 'donor' shard container will be considered for shrinking into +# another 'acceptor' shard container. shard_shrink_point is a percentage of +# shard_container_threshold e.g. 
the default value of 5 means 5% of the
+# shard_container_threshold.
+# shard_shrink_point = 5
+#
+# When auto-sharding is enabled shard_shrink_merge_point defines the maximum
+# allowed size of an acceptor shard container after having a donor merged into
+# it. shard_shrink_merge_point is a percentage of shard_container_threshold,
+# e.g. the default value of 75 means that the projected sum of a donor object
+# count and acceptor count must be less than 75% of shard_container_threshold
+# for the donor to be allowed to merge into the acceptor.
+#
+# For example, if the shard_container_threshold is 1 million,
+# shard_shrink_point is 5, and shard_shrink_merge_point is 75 then a shard will
+# be considered for shrinking if it has less than or equal to 50 thousand
+# objects but will only merge into an acceptor if the combined object count
+# would be less than or equal to 750 thousand objects.
+# shard_shrink_merge_point = 75
+#
+# When auto-sharding is enabled shard_scanner_batch_size defines the maximum
+# number of shard ranges that will be found each time the sharder daemon visits
+# a sharding container. If necessary the sharder daemon will continue to search
+# for more shard ranges each time it visits the container.
+# shard_scanner_batch_size = 10
+#
+# cleave_batch_size defines the number of shard ranges that will be cleaved
+# each time the sharder daemon visits a sharding container.
+# cleave_batch_size = 2
+#
+# cleave_row_batch_size defines the size of batches of object rows read from a
+# sharding container and merged to a shard container during cleaving.
+# cleave_row_batch_size = 10000
+#
+# Defines the number of successfully replicated shard dbs required when
+# cleaving a previously uncleaved shard range before the sharder will progress
+# to the next shard range. The value should be less than or equal to the
+# container ring replica count. The default of 'auto' causes the container
+# ring quorum value to be used. This option only applies to the
+# container-sharder replication and does not affect the number of shard
+# container replicas that will eventually be replicated by the
+# container-replicator.
+# shard_replication_quorum = auto
+#
+# Defines the number of successfully replicated shard dbs required when
+# cleaving a shard range that has been previously cleaved on another node
+# before the sharder will progress to the next shard range. The value should
+# be less than or equal to the container ring replica count. The default of
+# 'auto' causes the shard_replication_quorum value to be used. This option
+# only applies to the container-sharder replication and does not affect the
+# number of shard container replicas that will eventually be replicated by
+# the container-replicator.
+# existing_shard_replication_quorum = auto
+#
+# The sharder uses an internal client to create and make requests to
+# containers. The absolute path to the client config file can be configured.
+# internal_client_conf_path = /etc/swift/internal-client.conf
+#
+# The number of times the internal client will retry requests.
+# request_tries = 3
+#
+# Each time the sharder dumps stats to the recon cache file it includes a list
+# of containers that appear to need sharding but are not yet sharding. By
+# default this list is limited to the top 5 containers, ordered by object
+# count. The limit may be changed by setting recon_candidates_limit to an
+# integer value. A negative value implies no limit.
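+# Each recorded candidate is a dict containing the db file path, node index,
+# account, container, root container path, object count, meta timestamp and
+# db file size.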
+# recon_candidates_limit = 5 +# +# Large databases tend to take a while to work with, but we want to make sure +# we write down our progress. Use a larger-than-normal broker timeout to make +# us less likely to bomb out on a LockTimeout. +# broker_timeout = 60 +# +# Time in seconds to wait between sharder cycles +# interval = 30 +# +# The container-sharder accepts the following configuration options as defined +# in the container-replicator section: +# +# per_diff = 1000 +# max_diffs = 100 +# concurrency = 8 +# node_timeout = 10 +# conn_timeout = 0.5 +# reclaim_age = 604800 +# rsync_compress = no +# rsync_module = {replication_ip}::container +# recon_cache_path = /var/cache/swift +# diff --git a/setup.cfg b/setup.cfg index 7ed7f1ec17..bc6b1a07c0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,6 +36,7 @@ scripts = bin/swift-container-info bin/swift-container-replicator bin/swift-container-server + bin/swift-container-sharder bin/swift-container-sync bin/swift-container-updater bin/swift-container-reconciler @@ -71,6 +72,9 @@ keystone = keystonemiddleware>=4.17.0 [entry_points] +console_scripts = + swift-manage-shard-ranges = swift.cli.manage_shard_ranges:main + paste.app_factory = proxy = swift.proxy.server:app_factory object = swift.obj.server:app_factory diff --git a/swift/cli/manage_shard_ranges.py b/swift/cli/manage_shard_ranges.py new file mode 100644 index 0000000000..acbc364968 --- /dev/null +++ b/swift/cli/manage_shard_ranges.py @@ -0,0 +1,370 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from __future__ import print_function +import argparse +import json +import sys +import time + +from six.moves import input + +from swift.common.utils import Timestamp, get_logger, ShardRange +from swift.container.backend import ContainerBroker, UNSHARDED +from swift.container.sharder import make_shard_ranges, sharding_enabled, \ + CleavingContext + + +def _load_and_validate_shard_data(args): + try: + with open(args.input, 'rb') as fd: + try: + data = json.load(fd) + if not isinstance(data, list): + raise ValueError('Shard data must be a list of dicts') + for k in ('lower', 'upper', 'index', 'object_count'): + for shard in data: + shard[k] + return data + except (TypeError, ValueError, KeyError) as err: + print('Failed to load valid shard range data: %r' % err, + file=sys.stderr) + exit(2) + except IOError as err: + print('Failed to open file %s: %s' % (args.input, err), + file=sys.stderr) + exit(2) + + +def _check_shard_ranges(own_shard_range, shard_ranges): + reasons = [] + + def reason(x, y): + if x != y: + reasons.append('%s != %s' % (x, y)) + + if not shard_ranges: + reasons.append('No shard ranges.') + else: + reason(own_shard_range.lower, shard_ranges[0].lower) + reason(own_shard_range.upper, shard_ranges[-1].upper) + for x, y in zip(shard_ranges, shard_ranges[1:]): + reason(x.upper, y.lower) + + if reasons: + print('WARNING: invalid shard ranges: %s.' 
% reasons) + print('Aborting.') + exit(2) + + +def _check_own_shard_range(broker, args): + # TODO: this check is weak - if the shards prefix changes then we may not + # identify a shard container. The goal is to not inadvertently create an + # entire namespace default shard range for a shard container. + is_shard = broker.account.startswith(args.shards_account_prefix) + own_shard_range = broker.get_own_shard_range(no_default=is_shard) + if not own_shard_range: + print('WARNING: shard container missing own shard range.') + print('Aborting.') + exit(2) + return own_shard_range + + +def _find_ranges(broker, args, status_file=None): + start = last_report = time.time() + limit = 5 if status_file else -1 + shard_data, last_found = broker.find_shard_ranges( + args.rows_per_shard, limit=limit) + if shard_data: + while not last_found: + if last_report + 10 < time.time(): + print('Found %d ranges in %gs; looking for more...' % ( + len(shard_data), time.time() - start), file=status_file) + last_report = time.time() + # prefix doesn't matter since we aren't persisting it + found_ranges = make_shard_ranges(broker, shard_data, '.shards_') + more_shard_data, last_found = broker.find_shard_ranges( + args.rows_per_shard, existing_ranges=found_ranges, limit=5) + shard_data.extend(more_shard_data) + return shard_data, time.time() - start + + +def find_ranges(broker, args): + shard_data, delta_t = _find_ranges(broker, args, sys.stderr) + print(json.dumps(shard_data, sort_keys=True, indent=2)) + print('Found %d ranges in %gs (total object count %s)' % + (len(shard_data), delta_t, + sum(r['object_count'] for r in shard_data)), + file=sys.stderr) + return 0 + + +def show_shard_ranges(broker, args): + shard_ranges = broker.get_shard_ranges( + include_deleted=getattr(args, 'include_deleted', False)) + shard_data = [dict(sr, state=sr.state_text) + for sr in shard_ranges] + + if not shard_data: + print("No shard data found.", file=sys.stderr) + elif getattr(args, 'brief', False): + print("Existing shard ranges:", file=sys.stderr) + print(json.dumps([(sd['lower'], sd['upper']) for sd in shard_data], + sort_keys=True, indent=2)) + else: + print("Existing shard ranges:", file=sys.stderr) + print(json.dumps(shard_data, sort_keys=True, indent=2)) + return 0 + + +def db_info(broker, args): + print('Sharding enabled = %s' % sharding_enabled(broker)) + own_sr = broker.get_own_shard_range(no_default=True) + print('Own shard range: %s' % + (json.dumps(dict(own_sr, state=own_sr.state_text), + sort_keys=True, indent=2) + if own_sr else None)) + db_state = broker.get_db_state() + print('db_state = %s' % db_state) + if db_state == 'sharding': + print('Retiring db id: %s' % broker.get_brokers()[0].get_info()['id']) + print('Cleaving context: %s' % + json.dumps(dict(CleavingContext.load(broker)), + sort_keys=True, indent=2)) + print('Metadata:') + for k, (v, t) in broker.metadata.items(): + print(' %s = %s' % (k, v)) + + +def delete_shard_ranges(broker, args): + shard_ranges = broker.get_shard_ranges() + if not shard_ranges: + print("No shard ranges found to delete.") + return 0 + + while not args.force: + print('This will delete existing %d shard ranges.' % len(shard_ranges)) + if broker.get_db_state() != UNSHARDED: + print('WARNING: Be very cautious about deleting existing shard ' + 'ranges. 
Deleting all ranges in this db does not guarantee ' + 'deletion of all ranges on all replicas of the db.') + print(' - this db is in state %s' % broker.get_db_state()) + print(' - %d existing shard ranges have started sharding' % + [sr.state != ShardRange.FOUND + for sr in shard_ranges].count(True)) + choice = input('Do you want to show the existing ranges [s], ' + 'delete the existing ranges [yes] ' + 'or quit without deleting [q]? ') + if choice == 's': + show_shard_ranges(broker, args) + continue + elif choice == 'q': + return 1 + elif choice == 'yes': + break + else: + print('Please make a valid choice.') + print() + + now = Timestamp.now() + for sr in shard_ranges: + sr.deleted = 1 + sr.timestamp = now + broker.merge_shard_ranges(shard_ranges) + print('Deleted %s existing shard ranges.' % len(shard_ranges)) + return 0 + + +def _replace_shard_ranges(broker, args, shard_data, timeout=None): + own_shard_range = _check_own_shard_range(broker, args) + shard_ranges = make_shard_ranges( + broker, shard_data, args.shards_account_prefix) + _check_shard_ranges(own_shard_range, shard_ranges) + + if args.verbose > 0: + print('New shard ranges to be injected:') + print(json.dumps([dict(sr) for sr in shard_ranges], + sort_keys=True, indent=2)) + + # Crank up the timeout in an effort to *make sure* this succeeds + with broker.updated_timeout(max(timeout, args.replace_timeout)): + delete_shard_ranges(broker, args) + broker.merge_shard_ranges(shard_ranges) + + print('Injected %d shard ranges.' % len(shard_ranges)) + print('Run container-replicator to replicate them to other nodes.') + if args.enable: + return enable_sharding(broker, args) + else: + print('Use the enable sub-command to enable sharding.') + return 0 + + +def replace_shard_ranges(broker, args): + shard_data = _load_and_validate_shard_data(args) + return _replace_shard_ranges(broker, args, shard_data) + + +def find_replace_shard_ranges(broker, args): + shard_data, delta_t = _find_ranges(broker, args, sys.stdout) + # Since we're trying to one-shot this, and the previous step probably + # took a while, make the timeout for writing *at least* that long + return _replace_shard_ranges(broker, args, shard_data, timeout=delta_t) + + +def _enable_sharding(broker, own_shard_range, args): + if own_shard_range.update_state(ShardRange.SHARDING): + own_shard_range.epoch = Timestamp.now() + own_shard_range.state_timestamp = own_shard_range.epoch + + with broker.updated_timeout(args.enable_timeout): + broker.merge_shard_ranges([own_shard_range]) + broker.update_metadata({'X-Container-Sysmeta-Sharding': + ('True', Timestamp.now().normal)}) + return own_shard_range + + +def enable_sharding(broker, args): + own_shard_range = _check_own_shard_range(broker, args) + _check_shard_ranges(own_shard_range, broker.get_shard_ranges()) + + if own_shard_range.state == ShardRange.ACTIVE: + own_shard_range = _enable_sharding(broker, own_shard_range, args) + print('Container moved to state %r with epoch %s.' % + (own_shard_range.state_text, own_shard_range.epoch.internal)) + elif own_shard_range.state == ShardRange.SHARDING: + if own_shard_range.epoch: + print('Container already in state %r with epoch %s.' % + (own_shard_range.state_text, own_shard_range.epoch.internal)) + print('No action required.') + else: + print('Container already in state %r but missing epoch.' % + own_shard_range.state_text) + own_shard_range = _enable_sharding(broker, own_shard_range, args) + print('Container in state %r given epoch %s.' 
% + (own_shard_range.state_text, own_shard_range.epoch.internal)) + else: + print('WARNING: container in state %s (should be active or sharding).' + % own_shard_range.state_text) + print('Aborting.') + return 2 + + print('Run container-sharder on all nodes to shard the container.') + return 0 + + +def _add_find_args(parser): + parser.add_argument('rows_per_shard', nargs='?', type=int, default=500000) + + +def _add_replace_args(parser): + parser.add_argument( + '--shards_account_prefix', metavar='shards_account_prefix', type=str, + required=False, help='Prefix for shards account', default='.shards_') + parser.add_argument( + '--replace-timeout', type=int, default=600, + help='Minimum DB timeout to use when replacing shard ranges.') + parser.add_argument( + '--force', '-f', action='store_true', default=False, + help='Delete existing shard ranges; no questions asked.') + parser.add_argument( + '--enable', action='store_true', default=False, + help='Enable sharding after adding shard ranges.') + + +def _add_enable_args(parser): + parser.add_argument( + '--enable-timeout', type=int, default=300, + help='DB timeout to use when enabling sharding.') + + +def _make_parser(): + parser = argparse.ArgumentParser(description='Manage shard ranges') + parser.add_argument('container_db') + parser.add_argument('--verbose', '-v', action='count', + help='Increase output verbosity') + subparsers = parser.add_subparsers( + help='Sub-command help', title='Sub-commands') + + # find + find_parser = subparsers.add_parser( + 'find', help='Find and display shard ranges') + _add_find_args(find_parser) + find_parser.set_defaults(func=find_ranges) + + # delete + delete_parser = subparsers.add_parser( + 'delete', help='Delete all existing shard ranges from db') + delete_parser.add_argument( + '--force', '-f', action='store_true', default=False, + help='Delete existing shard ranges; no questions asked.') + delete_parser.set_defaults(func=delete_shard_ranges) + + # show + show_parser = subparsers.add_parser( + 'show', help='Print shard range data') + show_parser.add_argument( + '--include_deleted', '-d', action='store_true', default=False, + help='Include deleted shard ranges in output.') + show_parser.add_argument( + '--brief', '-b', action='store_true', default=False, + help='Show only shard range bounds in output.') + show_parser.set_defaults(func=show_shard_ranges) + + # info + info_parser = subparsers.add_parser( + 'info', help='Print container db info') + info_parser.set_defaults(func=db_info) + + # replace + replace_parser = subparsers.add_parser( + 'replace', + help='Replace existing shard ranges. User will be prompted before ' + 'deleting any existing shard ranges.') + replace_parser.add_argument('input', metavar='input_file', + type=str, help='Name of file') + _add_replace_args(replace_parser) + replace_parser.set_defaults(func=replace_shard_ranges) + + # find_and_replace + find_replace_parser = subparsers.add_parser( + 'find_and_replace', + help='Find new shard ranges and replace existing shard ranges. ' + 'User will be prompted before deleting any existing shard ranges.' 
+ ) + _add_find_args(find_replace_parser) + _add_replace_args(find_replace_parser) + _add_enable_args(find_replace_parser) + find_replace_parser.set_defaults(func=find_replace_shard_ranges) + + # enable + enable_parser = subparsers.add_parser( + 'enable', help='Enable sharding and move db to sharding state.') + _add_enable_args(enable_parser) + enable_parser.set_defaults(func=enable_sharding) + _add_replace_args(enable_parser) + return parser + + +def main(args=None): + parser = _make_parser() + args = parser.parse_args(args) + logger = get_logger({}, name='ContainerBroker', log_to_console=True) + broker = ContainerBroker(args.container_db, logger=logger, + skip_commits=True) + broker.get_info() + print('Loaded db broker for %s.' % broker.path, file=sys.stderr) + return args.func(broker, args) + + +if __name__ == '__main__': + exit(main()) diff --git a/swift/cli/shard-info.py b/swift/cli/shard-info.py new file mode 100644 index 0000000000..01223787f7 --- /dev/null +++ b/swift/cli/shard-info.py @@ -0,0 +1,195 @@ +# Copyright (c) 2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from collections import defaultdict + +from swift.common import utils +from swift.common.db_replicator import roundrobin_datadirs +from swift.common.ring import ring +from swift.common.utils import Timestamp +from swift.container.backend import ContainerBroker, DATADIR + +TAB = ' ' + + +def broker_key(broker): + broker.get_info() + return broker.path + + +def container_type(broker): + return 'ROOT' if broker.is_root_container() else 'SHARD' + + +def collect_brokers(conf_path, names2nodes): + conf = utils.readconf(conf_path, 'container-replicator') + root = conf.get('devices', '/srv/node') + swift_dir = conf.get('swift_dir', '/etc/swift') + c_ring = ring.Ring(swift_dir, ring_name='container') + dirs = [] + brokers = defaultdict(dict) + for node in c_ring.devs: + if node is None: + continue + datadir = os.path.join(root, node['device'], DATADIR) + if os.path.isdir(datadir): + dirs.append((datadir, node['id'], lambda *args: True)) + for part, object_file, node_id in roundrobin_datadirs(dirs): + broker = ContainerBroker(object_file) + for node in c_ring.get_part_nodes(int(part)): + if node['id'] == node_id: + node_index = str(node['index']) + break + else: + node_index = 'handoff' + names2nodes[broker_key(broker)][(node_id, node_index)] = broker + return brokers + + +def print_broker_info(node, broker, indent_level=0): + indent = indent_level * TAB + info = broker.get_info() + raw_info = broker._get_info() + deleted_at = float(info['delete_timestamp']) + if deleted_at: + deleted_at = Timestamp(info['delete_timestamp']).isoformat + else: + deleted_at = ' - ' + print('%s(%s) %s, objs: %s, bytes: %s, actual_objs: %s, put: %s, ' + 'deleted: %s' % + (indent, node[1][0], broker.get_db_state(), + info['object_count'], info['bytes_used'], raw_info['object_count'], + Timestamp(info['put_timestamp']).isoformat, deleted_at)) + + +def print_db(node, broker, expect_type='ROOT', indent_level=0): + 
indent = indent_level * TAB + print('%s(%s) %s node id: %s, node index: %s' % + (indent, node[1][0], broker.db_file, node[0], node[1])) + actual_type = container_type(broker) + if actual_type != expect_type: + print('%s ERROR expected %s but found %s' % + (indent, expect_type, actual_type)) + + +def print_own_shard_range(node, sr, indent_level): + indent = indent_level * TAB + range = '%r - %r' % (sr.lower, sr.upper) + print('%s(%s) %23s, objs: %3s, bytes: %3s, timestamp: %s (%s), ' + 'modified: %s (%s), %7s: %s (%s), deleted: %s epoch: %s' % + (indent, node[1][0], range, sr.object_count, sr.bytes_used, + sr.timestamp.isoformat, sr.timestamp.internal, + sr.meta_timestamp.isoformat, sr.meta_timestamp.internal, + sr.state_text, sr.state_timestamp.isoformat, + sr.state_timestamp.internal, sr.deleted, + sr.epoch.internal if sr.epoch else None)) + + +def print_own_shard_range_info(node, shard_ranges, indent_level=0): + shard_ranges.sort(key=lambda x: x.deleted) + for sr in shard_ranges: + print_own_shard_range(node, sr, indent_level) + + +def print_shard_range(node, sr, indent_level): + indent = indent_level * TAB + range = '%r - %r' % (sr.lower, sr.upper) + print('%s(%s) %23s, objs: %3s, bytes: %3s, timestamp: %s (%s), ' + 'modified: %s (%s), %7s: %s (%s), deleted: %s %s' % + (indent, node[1][0], range, sr.object_count, sr.bytes_used, + sr.timestamp.isoformat, sr.timestamp.internal, + sr.meta_timestamp.isoformat, sr.meta_timestamp.internal, + sr.state_text, sr.state_timestamp.isoformat, + sr.state_timestamp.internal, sr.deleted, sr.name)) + + +def print_shard_range_info(node, shard_ranges, indent_level=0): + shard_ranges.sort(key=lambda x: x.deleted) + for sr in shard_ranges: + print_shard_range(node, sr, indent_level) + + +def print_sharding_info(node, broker, indent_level=0): + indent = indent_level * TAB + print('%s(%s) %s' % (indent, node[1][0], broker.get_sharding_sysmeta())) + + +def print_container(name, name2nodes2brokers, expect_type='ROOT', + indent_level=0, used_names=None): + used_names = used_names or set() + indent = indent_level * TAB + node2broker = name2nodes2brokers[name] + ordered_by_index = sorted(node2broker.keys(), key=lambda x: x[1]) + brokers = [(node, node2broker[node]) for node in ordered_by_index] + + print('%sName: %s' % (indent, name)) + if name in used_names: + print('%s (Details already listed)\n' % indent) + return + + used_names.add(name) + print(indent + 'DB files:') + for node, broker in brokers: + print_db(node, broker, expect_type, indent_level=indent_level + 1) + + print(indent + 'Info:') + for node, broker in brokers: + print_broker_info(node, broker, indent_level=indent_level + 1) + + print(indent + 'Sharding info:') + for node, broker in brokers: + print_sharding_info(node, broker, indent_level=indent_level + 1) + print(indent + 'Own shard range:') + for node, broker in brokers: + shard_ranges = broker.get_shard_ranges( + include_deleted=True, include_own=True, exclude_others=True) + print_own_shard_range_info(node, shard_ranges, + indent_level=indent_level + 1) + print(indent + 'Shard ranges:') + shard_names = set() + for node, broker in brokers: + shard_ranges = broker.get_shard_ranges(include_deleted=True) + for sr_name in shard_ranges: + shard_names.add(sr_name.name) + print_shard_range_info(node, shard_ranges, + indent_level=indent_level + 1) + print(indent + 'Shards:') + for sr_name in shard_names: + print_container(sr_name, name2nodes2brokers, expect_type='SHARD', + indent_level=indent_level + 1, used_names=used_names) + print('\n') + + +def 
run(conf_paths): + # container_name -> (node id, node index) -> broker + name2nodes2brokers = defaultdict(dict) + for conf_path in conf_paths: + collect_brokers(conf_path, name2nodes2brokers) + + print('First column on each line is (node index)\n') + for name, node2broker in name2nodes2brokers.items(): + expect_root = False + for node, broker in node2broker.items(): + expect_root = broker.is_root_container() or expect_root + if expect_root: + print_container(name, name2nodes2brokers) + + +if __name__ == '__main__': + conf_dir = '/etc/swift/container-server' + conf_paths = [os.path.join(conf_dir, p) for p in os.listdir(conf_dir) + if p.endswith(('conf', 'conf.d'))] + run(conf_paths) diff --git a/swift/common/manager.py b/swift/common/manager.py index 330f8310f4..71f9e689b3 100644 --- a/swift/common/manager.py +++ b/swift/common/manager.py @@ -34,7 +34,7 @@ PROC_DIR = '/proc' ALL_SERVERS = ['account-auditor', 'account-server', 'container-auditor', 'container-replicator', 'container-reconciler', - 'container-server', 'container-sync', + 'container-server', 'container-sharder', 'container-sync', 'container-updater', 'object-auditor', 'object-server', 'object-expirer', 'object-replicator', 'object-reconstructor', 'object-updater', @@ -637,13 +637,16 @@ class Server(object): {'server': self.server, 'pid': pid, 'conf': conf_file}) return 0 - def spawn(self, conf_file, once=False, wait=True, daemon=True, **kwargs): + def spawn(self, conf_file, once=False, wait=True, daemon=True, + additional_args=None, **kwargs): """Launch a subprocess for this server. :param conf_file: path to conf_file to use as first arg :param once: boolean, add once argument to command :param wait: boolean, if true capture stdout with a pipe :param daemon: boolean, if false ask server to log to console + :param additional_args: list of additional arguments to pass + on the command line :returns: the pid of the spawned process """ @@ -653,6 +656,10 @@ class Server(object): if not daemon: # ask the server to log to console args.append('verbose') + if additional_args: + if isinstance(additional_args, str): + additional_args = [additional_args] + args.extend(additional_args) # figure out what we're going to do with stdio if not daemon: diff --git a/swift/common/utils.py b/swift/common/utils.py index 4a1c6e3911..048e64d65d 100644 --- a/swift/common/utils.py +++ b/swift/common/utils.py @@ -412,6 +412,21 @@ def config_positive_int_value(value): return result +def config_float_value(value, minimum=None, maximum=None): + try: + val = float(value) + if minimum is not None and val < minimum: + raise ValueError() + if maximum is not None and val > maximum: + raise ValueError() + return val + except (TypeError, ValueError): + min_ = ', greater than %s' % minimum if minimum is not None else '' + max_ = ', less than %s' % maximum if maximum is not None else '' + raise ValueError('Config option must be a number%s%s, not "%s".' % + (min_, max_, value)) + + def config_auto_int_value(value, default): """ Returns default if value is None or 'auto'. diff --git a/swift/container/backend.py b/swift/container/backend.py index 9d75d0f680..040b79ad0b 100644 --- a/swift/container/backend.py +++ b/swift/container/backend.py @@ -746,6 +746,43 @@ class ContainerBroker(DatabaseBroker): 'meta_timestamp': meta_timestamp} self.put_record(record) + def remove_objects(self, lower, upper, max_row=None): + """ + Removes object records in the given namespace range from the object + table. + + Note that objects are removed regardless of their storage_policy_index. 
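+
+        An illustrative call (hypothetical variable names) that removes the
+        rows covering a shard range, up to a previously sampled max row,
+        might look like::
+
+            broker.remove_objects(shard_range.lower_str,
+                                  shard_range.upper_str,
+                                  max_row=sampled_max_row)
+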
+ + :param lower: defines the lower bound of object names that will be + removed; names greater than this value will be removed; names less + than or equal to this value will not be removed. + :param upper: defines the upper bound of object names that will be + removed; names less than or equal to this value will be removed; + names greater than this value will not be removed. The empty string + is interpreted as there being no upper bound. + :param max_row: if specified only rows less than or equal to max_row + will be removed + """ + query_conditions = [] + query_args = [] + if max_row is not None: + query_conditions.append('ROWID <= ?') + query_args.append(str(max_row)) + if lower: + query_conditions.append('name > ?') + query_args.append(lower) + if upper: + query_conditions.append('name <= ?') + query_args.append(upper) + + query = 'DELETE FROM object WHERE deleted in (0, 1)' + if query_conditions: + query += ' AND ' + ' AND '.join(query_conditions) + + with self.get() as conn: + conn.execute(query, query_args) + conn.commit() + def _is_deleted_info(self, object_count, put_timestamp, delete_timestamp, **kwargs): """ diff --git a/swift/container/sharder.py b/swift/container/sharder.py new file mode 100644 index 0000000000..06c2b6d9db --- /dev/null +++ b/swift/container/sharder.py @@ -0,0 +1,1568 @@ +# Copyright (c) 2015 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import errno +import json +import time +from collections import defaultdict +from random import random + +import os +import six +from eventlet import Timeout + +from swift.common import internal_client, db_replicator +from swift.common.constraints import check_drive +from swift.common.direct_client import (direct_put_container, + DirectClientException) +from swift.common.exceptions import DeviceUnavailable +from swift.common.ring.utils import is_local_device +from swift.common.utils import get_logger, config_true_value, \ + dump_recon_cache, whataremyips, Timestamp, ShardRange, GreenAsyncPile, \ + config_float_value, config_positive_int_value, \ + quorum_size, parse_override_options, Everything, config_auto_int_value +from swift.container.backend import ContainerBroker, \ + RECORD_TYPE_SHARD, UNSHARDED, SHARDING, SHARDED, COLLAPSED, \ + SHARD_UPDATE_STATES +from swift.container.replicator import ContainerReplicator + + +def sharding_enabled(broker): + # NB all shards will by default have been created with + # X-Container-Sysmeta-Sharding set and will therefore be candidates for + # sharding, along with explicitly configured root containers. 
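+    # (For example, the swift-manage-shard-ranges 'enable' sub-command sets
+    # this sysmeta on a root container, and the sharder sets it on each shard
+    # broker it creates.)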
+ sharding = broker.metadata.get('X-Container-Sysmeta-Sharding') + if sharding and config_true_value(sharding[0]): + return True + # if broker has been marked deleted it will have lost sysmeta, but we still + # need to process the broker (for example, to shrink any shard ranges) so + # fallback to checking if it has any shard ranges + if broker.get_shard_ranges(): + return True + return False + + +def make_shard_ranges(broker, shard_data, shards_account_prefix): + timestamp = Timestamp.now() + shard_ranges = [] + for data in shard_data: + # Make a copy so we don't mutate the original + kwargs = data.copy() + path = ShardRange.make_path( + shards_account_prefix + broker.root_account, + broker.root_container, broker.container, + timestamp, kwargs.pop('index')) + + shard_ranges.append(ShardRange(path, timestamp, **kwargs)) + return shard_ranges + + +def find_missing_ranges(shard_ranges): + """ + Find any ranges in the entire object namespace that are not covered by any + shard range in the given list. + + :param shard_ranges: A list of :class:`~swift.utils.ShardRange` + :return: a list of missing ranges + """ + gaps = [] + if not shard_ranges: + return ((ShardRange.MIN, ShardRange.MAX),) + if shard_ranges[0].lower > ShardRange.MIN: + gaps.append((ShardRange.MIN, shard_ranges[0].lower)) + for first, second in zip(shard_ranges, shard_ranges[1:]): + if first.upper < second.lower: + gaps.append((first.upper, second.lower)) + if shard_ranges[-1].upper < ShardRange.MAX: + gaps.append((shard_ranges[-1].upper, ShardRange.MAX)) + return gaps + + +def find_overlapping_ranges(shard_ranges): + """ + Find all pairs of overlapping ranges in the given list. + + :param shard_ranges: A list of :class:`~swift.utils.ShardRange` + :return: a set of tuples, each tuple containing ranges that overlap with + each other. + """ + result = set() + for shard_range in shard_ranges: + overlapping = [sr for sr in shard_ranges + if shard_range != sr and shard_range.overlaps(sr)] + if overlapping: + overlapping.append(shard_range) + overlapping.sort() + result.add(tuple(overlapping)) + + return result + + +def is_sharding_candidate(shard_range, threshold): + return (shard_range.state == ShardRange.ACTIVE and + shard_range.object_count >= threshold) + + +def find_sharding_candidates(broker, threshold, shard_ranges=None): + # this should only execute on root containers; the goal is to find + # large shard containers that should be sharded. + # First cut is simple: assume root container shard usage stats are good + # enough to make decision. + # TODO: object counts may well not be the appropriate metric for + # deciding to shrink because a shard with low object_count may have a + # large number of deleted object rows that will need to be merged with + # a neighbour. We may need to expose row count as well as object count. + if shard_ranges is None: + shard_ranges = broker.get_shard_ranges(states=[ShardRange.ACTIVE]) + candidates = [] + for shard_range in shard_ranges: + if not is_sharding_candidate(shard_range, threshold): + continue + shard_range.update_state(ShardRange.SHARDING, + state_timestamp=Timestamp.now()) + shard_range.epoch = shard_range.state_timestamp + candidates.append(shard_range) + return candidates + + +def find_shrinking_candidates(broker, shrink_threshold, merge_size): + # this should only execute on root containers that have sharded; the + # goal is to find small shard containers that could be retired by + # merging with a neighbour. 
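+    # (Illustrative numbers: with shard_container_threshold = 1000000,
+    # shard_shrink_point = 5 and shard_shrink_merge_point = 75, the
+    # shrink_threshold passed here is 50000 and merge_size is 750000, so a
+    # donor with 40000 objects may only merge into an upper neighbour holding
+    # fewer than 710000 objects.)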
+ # First cut is simple: assume root container shard usage stats are good + # enough to make decision; only merge with upper neighbour so that + # upper bounds never change (shard names include upper bound). + # TODO: object counts may well not be the appropriate metric for + # deciding to shrink because a shard with low object_count may have a + # large number of deleted object rows that will need to be merged with + # a neighbour. We may need to expose row count as well as object count. + shard_ranges = broker.get_shard_ranges() + own_shard_range = broker.get_own_shard_range() + if len(shard_ranges) == 1: + # special case to enable final shard to shrink into root + shard_ranges.append(own_shard_range) + + merge_pairs = {} + for donor, acceptor in zip(shard_ranges, shard_ranges[1:]): + if donor in merge_pairs: + # this range may already have been made an acceptor; if so then + # move on. In principle it might be that even after expansion + # this range and its donor(s) could all be merged with the next + # range. In practice it is much easier to reason about a single + # donor merging into a single acceptor. Don't fret - eventually + # all the small ranges will be retired. + continue + if (acceptor.name != own_shard_range.name and + acceptor.state != ShardRange.ACTIVE): + # don't shrink into a range that is not yet ACTIVE + continue + if donor.state not in (ShardRange.ACTIVE, ShardRange.SHRINKING): + # found? created? sharded? don't touch it + continue + + proposed_object_count = donor.object_count + acceptor.object_count + if (donor.state == ShardRange.SHRINKING or + (donor.object_count < shrink_threshold and + proposed_object_count < merge_size)): + # include previously identified merge pairs on presumption that + # following shrink procedure is idempotent + merge_pairs[acceptor] = donor + if donor.update_state(ShardRange.SHRINKING): + # Set donor state to shrinking so that next cycle won't use + # it as an acceptor; state_timestamp defines new epoch for + # donor and new timestamp for the expanded acceptor below. + donor.epoch = donor.state_timestamp = Timestamp.now() + if acceptor.lower != donor.lower: + # Update the acceptor container with its expanding state to + # prevent it treating objects cleaved from the donor + # as misplaced. 
+ acceptor.lower = donor.lower + acceptor.timestamp = donor.state_timestamp + return merge_pairs + + +class CleavingContext(object): + def __init__(self, ref, cursor='', max_row=None, cleave_to_row=None, + last_cleave_to_row=None, cleaving_done=False, + misplaced_done=False, ranges_done=0, ranges_todo=0): + self.ref = ref + self._cursor = None + self.cursor = cursor + self.max_row = max_row + self.cleave_to_row = cleave_to_row + self.last_cleave_to_row = last_cleave_to_row + self.cleaving_done = cleaving_done + self.misplaced_done = misplaced_done + self.ranges_done = ranges_done + self.ranges_todo = ranges_todo + + def __iter__(self): + yield 'ref', self.ref + yield 'cursor', self.cursor + yield 'max_row', self.max_row + yield 'cleave_to_row', self.cleave_to_row + yield 'last_cleave_to_row', self.last_cleave_to_row + yield 'cleaving_done', self.cleaving_done + yield 'misplaced_done', self.misplaced_done + yield 'ranges_done', self.ranges_done + yield 'ranges_todo', self.ranges_todo + + def _encode(cls, value): + if value is not None and six.PY2 and isinstance(value, six.text_type): + return value.encode('utf-8') + return value + + @property + def cursor(self): + return self._cursor + + @cursor.setter + def cursor(self, value): + self._cursor = self._encode(value) + + @property + def marker(self): + return self.cursor + '\x00' + + @classmethod + def _make_ref(cls, broker): + return broker.get_info()['id'] + + @classmethod + def load(cls, broker): + """ + Returns a context dict for tracking the progress of cleaving this + broker's retiring DB. The context is persisted in sysmeta using a key + that is based off the retiring db id and max row. This form of + key ensures that a cleaving context is only loaded for a db that + matches the id and max row when the context was created; if a db is + modified such that its max row changes then a different context, or no + context, will be loaded. + + :return: A dict to which cleave progress metadata may be added. The + dict initially has a key ``ref`` which should not be modified by + any caller. 
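+
+        A rough usage sketch (illustrative only)::
+
+            context = CleavingContext.load(broker)
+            # ... cleave shard ranges, advancing context.cursor ...
+            context.store(broker)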
+ """ + brokers = broker.get_brokers() + ref = cls._make_ref(brokers[0]) + data = brokers[-1].get_sharding_sysmeta('Context-' + ref) + data = json.loads(data) if data else {} + data['ref'] = ref + data['max_row'] = brokers[0].get_max_row() + return cls(**data) + + def store(self, broker): + broker.set_sharding_sysmeta('Context-' + self.ref, + json.dumps(dict(self))) + + def reset(self): + self.cursor = '' + self.ranges_done = 0 + self.ranges_todo = 0 + self.cleaving_done = False + self.misplaced_done = False + self.last_cleave_to_row = self.cleave_to_row + + def start(self): + self.cursor = '' + self.ranges_done = 0 + self.ranges_todo = 0 + self.cleaving_done = False + self.cleave_to_row = self.max_row + + def done(self): + return all((self.misplaced_done, self.cleaving_done, + self.max_row == self.cleave_to_row)) + + +DEFAULT_SHARD_CONTAINER_THRESHOLD = 10000000 +DEFAULT_SHARD_SHRINK_POINT = 25 +DEFAULT_SHARD_MERGE_POINT = 75 + + +class ContainerSharder(ContainerReplicator): + """Shards containers.""" + + def __init__(self, conf, logger=None): + logger = logger or get_logger(conf, log_route='container-sharder') + super(ContainerSharder, self).__init__(conf, logger=logger) + self.shards_account_prefix = ( + (conf.get('auto_create_account_prefix') or '.') + 'shards_') + + def percent_value(key, default): + try: + value = conf.get(key, default) + return config_float_value(value, 0, 100) / 100.0 + except ValueError as err: + raise ValueError("%s: %s" % (str(err), key)) + + self.shard_shrink_point = percent_value('shard_shrink_point', + DEFAULT_SHARD_SHRINK_POINT) + self.shrink_merge_point = percent_value('shard_shrink_merge_point', + DEFAULT_SHARD_MERGE_POINT) + self.shard_container_threshold = config_positive_int_value( + conf.get('shard_container_threshold', + DEFAULT_SHARD_CONTAINER_THRESHOLD)) + self.shrink_size = (self.shard_container_threshold * + self.shard_shrink_point) + self.merge_size = (self.shard_container_threshold * + self.shrink_merge_point) + self.split_size = self.shard_container_threshold // 2 + self.scanner_batch_size = config_positive_int_value( + conf.get('shard_scanner_batch_size', 10)) + self.cleave_batch_size = config_positive_int_value( + conf.get('cleave_batch_size', 2)) + self.cleave_row_batch_size = config_positive_int_value( + conf.get('cleave_row_batch_size', 10000)) + self.auto_shard = config_true_value(conf.get('auto_shard', False)) + self.sharding_candidates = [] + self.recon_candidates_limit = int( + conf.get('recon_candidates_limit', 5)) + self.broker_timeout = config_positive_int_value( + conf.get('broker_timeout', 60)) + replica_count = self.ring.replica_count + quorum = quorum_size(replica_count) + self.shard_replication_quorum = config_auto_int_value( + conf.get('shard_replication_quorum'), quorum) + if self.shard_replication_quorum > replica_count: + self.logger.warning( + 'shard_replication_quorum of %s exceeds replica count %s' + ', reducing to %s', self.shard_replication_quorum, + replica_count, replica_count) + self.shard_replication_quorum = replica_count + self.existing_shard_replication_quorum = config_auto_int_value( + conf.get('existing_shard_replication_quorum'), + self.shard_replication_quorum) + if self.existing_shard_replication_quorum > replica_count: + self.logger.warning( + 'existing_shard_replication_quorum of %s exceeds replica count' + ' %s, reducing to %s', self.existing_shard_replication_quorum, + replica_count, replica_count) + self.existing_shard_replication_quorum = replica_count + + # internal client + self.conn_timeout = 
float(conf.get('conn_timeout', 5)) + request_tries = config_positive_int_value( + conf.get('request_tries', 3)) + internal_client_conf_path = conf.get('internal_client_conf_path', + '/etc/swift/internal-client.conf') + try: + self.int_client = internal_client.InternalClient( + internal_client_conf_path, + 'Swift Container Sharder', + request_tries, + allow_modify_pipeline=False) + except IOError as err: + if err.errno != errno.ENOENT: + raise + raise SystemExit( + 'Unable to load internal client from config: %r (%s)' % + (internal_client_conf_path, err)) + self.reported = 0 + + def _zero_stats(self): + """Zero out the stats.""" + super(ContainerSharder, self)._zero_stats() + # all sharding stats that are additional to the inherited replicator + # stats are maintained under the 'sharding' key in self.stats + self.stats['sharding'] = defaultdict(lambda: defaultdict(int)) + self.sharding_candidates = [] + + def _append_stat(self, category, key, value): + if not self.stats['sharding'][category][key]: + self.stats['sharding'][category][key] = list() + self.stats['sharding'][category][key].append(value) + + def _min_stat(self, category, key, value): + current = self.stats['sharding'][category][key] + if not current: + self.stats['sharding'][category][key] = value + else: + self.stats['sharding'][category][key] = min(current, value) + + def _max_stat(self, category, key, value): + current = self.stats['sharding'][category][key] + if not current: + self.stats['sharding'][category][key] = value + else: + self.stats['sharding'][category][key] = max(current, value) + + def _increment_stat(self, category, key, step=1, statsd=False): + self.stats['sharding'][category][key] += step + if statsd: + statsd_key = '%s_%s' % (category, key) + self.logger.increment(statsd_key) + + def _make_stats_info(self, broker, node, own_shard_range): + try: + file_size = os.stat(broker.db_file).st_size + except OSError: + file_size = None + + return {'path': broker.db_file, + 'node_index': node.get('index'), + 'account': broker.account, + 'container': broker.container, + 'root': broker.root_path, + 'object_count': own_shard_range.object_count, + 'meta_timestamp': own_shard_range.meta_timestamp.internal, + 'file_size': file_size} + + def _identify_sharding_candidate(self, broker, node): + own_shard_range = broker.get_own_shard_range() + if is_sharding_candidate( + own_shard_range, self.shard_container_threshold): + self.sharding_candidates.append( + self._make_stats_info(broker, node, own_shard_range)) + + def _transform_sharding_candidate_stats(self): + category = self.stats['sharding']['sharding_candidates'] + candidates = self.sharding_candidates + category['found'] = len(candidates) + candidates.sort(key=lambda c: c['object_count'], reverse=True) + if self.recon_candidates_limit >= 0: + category['top'] = candidates[:self.recon_candidates_limit] + else: + category['top'] = candidates + + def _record_sharding_progress(self, broker, node, error): + own_shard_range = broker.get_own_shard_range() + if (broker.get_db_state() in (UNSHARDED, SHARDING) and + own_shard_range.state in (ShardRange.SHARDING, + ShardRange.SHARDED)): + info = self._make_stats_info(broker, node, own_shard_range) + info['state'] = own_shard_range.state_text + info['db_state'] = broker.get_db_state() + states = [ShardRange.FOUND, ShardRange.CREATED, + ShardRange.CLEAVED, ShardRange.ACTIVE] + shard_ranges = broker.get_shard_ranges(states=states) + state_count = {} + for state in states: + state_count[ShardRange.STATES[state]] = 0 + for shard_range in 
shard_ranges: + state_count[shard_range.state_text] += 1 + info.update(state_count) + info['error'] = error and str(error) + self._append_stat('sharding_in_progress', 'all', info) + + def _report_stats(self): + # report accumulated stats since start of one sharder cycle + default_stats = ('attempted', 'success', 'failure') + category_keys = ( + ('visited', default_stats + ('skipped', 'completed')), + ('scanned', default_stats + ('found', 'min_time', 'max_time')), + ('created', default_stats), + ('cleaved', default_stats + ('min_time', 'max_time',)), + ('misplaced', default_stats + ('found', 'placed', 'unplaced')), + ('audit_root', default_stats), + ('audit_shard', default_stats), + ) + + now = time.time() + last_report = time.ctime(self.stats['start']) + elapsed = now - self.stats['start'] + sharding_stats = self.stats['sharding'] + for category, keys in category_keys: + stats = sharding_stats[category] + msg = ' '.join(['%s:%s' % (k, str(stats[k])) for k in keys]) + self.logger.info('Since %s %s - %s', last_report, category, msg) + + self._transform_sharding_candidate_stats() + + dump_recon_cache( + {'sharding_stats': self.stats, + 'sharding_time': elapsed, + 'sharding_last': now}, + self.rcache, self.logger) + self.reported = now + + def _periodic_report_stats(self): + if (time.time() - self.reported) >= 3600: # once an hour + self._report_stats() + + def _check_node(self, node): + if not node: + return False + if not is_local_device(self.ips, self.port, + node['replication_ip'], + node['replication_port']): + return False + if not check_drive(self.root, node['device'], + self.mount_check): + self.logger.warning( + 'Skipping %(device)s as it is not mounted' % node) + return False + return True + + def _fetch_shard_ranges(self, broker, newest=False, params=None, + include_deleted=False): + path = self.int_client.make_path(broker.root_account, + broker.root_container) + params = params or {} + params.setdefault('format', 'json') + headers = {'X-Backend-Record-Type': 'shard', + 'X-Backend-Override-Deleted': 'true', + 'X-Backend-Include-Deleted': str(include_deleted)} + if newest: + headers['X-Newest'] = 'true' + try: + try: + resp = self.int_client.make_request( + 'GET', path, headers, acceptable_statuses=(2,), + params=params) + except internal_client.UnexpectedResponse as err: + self.logger.warning("Failed to get shard ranges from %s: %s", + broker.root_path, err) + return None + record_type = resp.headers.get('x-backend-record-type') + if record_type != 'shard': + err = 'unexpected record type %r' % record_type + self.logger.error("Failed to get shard ranges from %s: %s", + broker.root_path, err) + return None + + try: + data = json.loads(resp.body) + if not isinstance(data, list): + raise ValueError('not a list') + return [ShardRange.from_dict(shard_range) + for shard_range in data] + except (ValueError, TypeError, KeyError) as err: + self.logger.error( + "Failed to get shard ranges from %s: invalid data: %r", + broker.root_path, err) + return None + finally: + self.logger.txn_id = None + + def _put_container(self, node, part, account, container, headers, body): + try: + direct_put_container(node, part, account, container, + conn_timeout=self.conn_timeout, + response_timeout=self.node_timeout, + headers=headers, contents=body) + except DirectClientException as err: + self.logger.warning( + 'Failed to put shard ranges to %s:%s/%s: %s', + node['ip'], node['port'], node['device'], err.http_status) + except (Exception, Timeout) as err: + self.logger.exception( + 'Failed to put shard ranges 
to %s:%s/%s: %s', + node['ip'], node['port'], node['device'], err) + else: + return True + return False + + def _send_shard_ranges(self, account, container, shard_ranges, + headers=None): + body = json.dumps([dict(sr) for sr in shard_ranges]) + part, nodes = self.ring.get_nodes(account, container) + headers = headers or {} + headers.update({'X-Backend-Record-Type': RECORD_TYPE_SHARD, + 'User-Agent': 'container-sharder %s' % os.getpid(), + 'X-Timestamp': Timestamp.now().normal, + 'Content-Length': len(body), + 'Content-Type': 'application/json'}) + + pool = GreenAsyncPile(len(nodes)) + for node in nodes: + pool.spawn(self._put_container, node, part, account, + container, headers, body) + + results = pool.waitall(None) + return results.count(True) >= quorum_size(self.ring.replica_count) + + def _get_shard_broker(self, shard_range, root_path, policy_index): + """ + Get a broker for a container db for the given shard range. If one of + the shard container's primary nodes is a local device then that will be + chosen for the db, otherwise the first of the shard container's handoff + nodes that is local will be chosen. + + :param shard_range: a :class:`~swift.common.utils.ShardRange` + :param root_path: the path of the shard's root container + :param policy_index: the storage policy index + :returns: a tuple of ``(part, broker, node_id)`` where ``part`` is the + shard container's partition, ``broker`` is an instance of + :class:`~swift.container.backend.ContainerBroker`, + ``node_id`` is the id of the selected node. + """ + part = self.ring.get_part(shard_range.account, shard_range.container) + node = self.find_local_handoff_for_part(part) + if not node: + raise DeviceUnavailable( + 'No mounted devices found suitable for creating shard broker' + 'for %s in partition %s' % (shard_range.name, part)) + + shard_broker = ContainerBroker.create_broker( + os.path.join(self.root, node['device']), part, shard_range.account, + shard_range.container, epoch=shard_range.epoch, + storage_policy_index=policy_index) + + # Get the valid info into the broker.container, etc + shard_broker.get_info() + shard_broker.merge_shard_ranges(shard_range) + shard_broker.set_sharding_sysmeta('Root', root_path) + shard_broker.update_metadata({ + 'X-Container-Sysmeta-Sharding': + ('True', Timestamp.now().internal)}) + + return part, shard_broker, node['id'] + + def _audit_root_container(self, broker): + # This is the root container, and therefore the tome of knowledge, + # all we can do is check there is nothing screwy with the ranges + self._increment_stat('audit_root', 'attempted') + warnings = [] + own_shard_range = broker.get_own_shard_range() + + if own_shard_range.state in (ShardRange.SHARDING, ShardRange.SHARDED): + shard_ranges = broker.get_shard_ranges() + missing_ranges = find_missing_ranges(shard_ranges) + if missing_ranges: + warnings.append( + 'missing range(s): %s' % + ' '.join(['%s-%s' % (lower, upper) + for lower, upper in missing_ranges])) + + for state in ShardRange.STATES: + shard_ranges = broker.get_shard_ranges(states=state) + overlaps = find_overlapping_ranges(shard_ranges) + for overlapping_ranges in overlaps: + warnings.append( + 'overlapping ranges in state %s: %s' % + (ShardRange.STATES[state], + ' '.join(['%s-%s' % (sr.lower, sr.upper) + for sr in overlapping_ranges]))) + + if warnings: + self.logger.warning( + 'Audit failed for root %s (%s): %s' % + (broker.db_file, broker.path, ', '.join(warnings))) + self._increment_stat('audit_root', 'failure', statsd=True) + return False + + 
self._increment_stat('audit_root', 'success', statsd=True) + return True + + def _audit_shard_container(self, broker): + # Get the root view of the world. + self._increment_stat('audit_shard', 'attempted') + warnings = [] + errors = [] + if not broker.account.startswith(self.shards_account_prefix): + warnings.append('account not in shards namespace %r' % + self.shards_account_prefix) + + own_shard_range = broker.get_own_shard_range(no_default=True) + + shard_range = None + if own_shard_range: + shard_ranges = self._fetch_shard_ranges( + broker, newest=True, + params={'marker': own_shard_range.lower, + 'end_marker': own_shard_range.upper}, + include_deleted=True) + if shard_ranges: + for shard_range in shard_ranges: + if (shard_range.lower == own_shard_range.lower and + shard_range.upper == own_shard_range.upper and + shard_range.name == own_shard_range.name): + break + else: + # this is not necessarily an error - some replicas of the + # root may not yet know about this shard container + warnings.append('root has no matching shard range') + shard_range = None + else: + warnings.append('unable to get shard ranges from root') + else: + errors.append('missing own shard range') + + if warnings: + self.logger.warning( + 'Audit warnings for shard %s (%s): %s' % + (broker.db_file, broker.path, ', '.join(warnings))) + + if errors: + self.logger.warning( + 'Audit failed for shard %s (%s) - skipping: %s' % + (broker.db_file, broker.path, ', '.join(errors))) + self._increment_stat('audit_shard', 'failure', statsd=True) + return False + + if shard_range: + self.logger.debug('Updating shard from root %s', dict(shard_range)) + broker.merge_shard_ranges(shard_range) + own_shard_range = broker.get_own_shard_range() + delete_age = time.time() - self.reclaim_age + if (own_shard_range.state == ShardRange.SHARDED and + own_shard_range.deleted and + own_shard_range.timestamp < delete_age and + broker.empty()): + broker.delete_db(Timestamp.now().internal) + self.logger.debug('Deleted shard container %s (%s)', + broker.db_file, broker.path) + self._increment_stat('audit_shard', 'success', statsd=True) + return True + + def _audit_container(self, broker): + if broker.is_deleted(): + # if the container has been marked as deleted, all metadata will + # have been erased so no point auditing. But we want it to pass, in + # case any objects exist inside it. + return True + if broker.is_root_container(): + return self._audit_root_container(broker) + return self._audit_shard_container(broker) + + def yield_objects(self, broker, src_shard_range, since_row=None): + """ + Iterates through all objects in ``src_shard_range`` in name order + yielding them in lists of up to CONTAINER_LISTING_LIMIT length. + + :param broker: A :class:`~swift.container.backend.ContainerBroker`. + :param src_shard_range: A :class:`~swift.common.utils.ShardRange` + describing the source range. + :param since_row: include only items whose ROWID is greater than + the given row id; by default all rows are included. 
+ :return: a generator of tuples of (list of objects, broker info dict) + """ + for include_deleted in (False, True): + marker = src_shard_range.lower_str + while True: + info = broker.get_info() + info['max_row'] = broker.get_max_row() + start = time.time() + objects = broker.get_objects( + self.cleave_row_batch_size, + marker=marker, + end_marker=src_shard_range.end_marker, + include_deleted=include_deleted, + since_row=since_row) + if objects: + self.logger.debug('got %s objects from %s in %ss', + len(objects), broker.db_file, + time.time() - start) + yield objects, info + + if len(objects) < self.cleave_row_batch_size: + break + marker = objects[-1]['name'] + + def yield_objects_to_shard_range(self, broker, src_shard_range, + dest_shard_ranges): + """ + Iterates through all objects in ``src_shard_range`` to place them in + destination shard ranges provided by the ``next_shard_range`` function. + Yields tuples of (object list, destination shard range in which those + objects belong). Note that the same destination shard range may be + referenced in more than one yielded tuple. + + :param broker: A :class:`~swift.container.backend.ContainerBroker`. + :param src_shard_range: A :class:`~swift.common.utils.ShardRange` + describing the source range. + :param dest_shard_ranges: A function which should return a list of + destination shard ranges in name order. + :return: a generator of tuples of + (object list, shard range, broker info dict) + """ + dest_shard_range_iter = dest_shard_range = None + for objs, info in self.yield_objects(broker, src_shard_range): + if not objs: + return + + def next_or_none(it): + try: + return next(it) + except StopIteration: + return None + + if dest_shard_range_iter is None: + dest_shard_range_iter = iter(dest_shard_ranges()) + dest_shard_range = next_or_none(dest_shard_range_iter) + + unplaced = False + last_index = next_index = 0 + for obj in objs: + if dest_shard_range is None: + # no more destinations: yield remainder of batch and return + # NB there may be more batches of objects but none of them + # will be placed so no point fetching them + yield objs[last_index:], None, info + return + if obj['name'] <= dest_shard_range.lower: + unplaced = True + elif unplaced: + # end of run of unplaced objects, yield them + yield objs[last_index:next_index], None, info + last_index = next_index + unplaced = False + while (dest_shard_range and + obj['name'] > dest_shard_range.upper): + if next_index != last_index: + # yield the objects in current dest_shard_range + yield (objs[last_index:next_index], + dest_shard_range, + info) + last_index = next_index + dest_shard_range = next_or_none(dest_shard_range_iter) + next_index += 1 + + if next_index != last_index: + # yield tail of current batch of objects + # NB there may be more objects for the current + # dest_shard_range in the next batch from yield_objects + yield (objs[last_index:next_index], + None if unplaced else dest_shard_range, + info) + + def _post_replicate_hook(self, broker, info, responses): + # override superclass behaviour + pass + + def _replicate_and_delete(self, broker, dest_shard_range, part, + dest_broker, node_id, info): + success, responses = self._replicate_object( + part, dest_broker.db_file, node_id) + quorum = quorum_size(self.ring.replica_count) + if not success and responses.count(True) < quorum: + self.logger.warning( + 'Failed to sufficiently replicate misplaced objects: %s in %s ' + '(not removing)', dest_shard_range, broker.path) + return False + + if broker.get_info()['id'] != 
info['id']: + # the db changed - don't remove any objects + success = False + else: + # remove objects up to the max row of the db sampled prior to + # the first object yielded for this destination; objects added + # after that point may not have been yielded and replicated so + # it is not safe to remove them yet + broker.remove_objects( + dest_shard_range.lower_str, + dest_shard_range.upper_str, + max_row=info['max_row']) + success = True + + if not success: + self.logger.warning( + 'Refused to remove misplaced objects: %s in %s', + dest_shard_range, broker.path) + return success + + def _move_objects(self, src_broker, src_shard_range, policy_index, + shard_range_fetcher): + # move objects from src_shard_range in src_broker to destination shard + # ranges provided by shard_range_fetcher + dest_brokers = {} # map shard range -> broker + placed = unplaced = 0 + success = True + for objs, dest_shard_range, info in self.yield_objects_to_shard_range( + src_broker, src_shard_range, shard_range_fetcher): + if not dest_shard_range: + unplaced += len(objs) + success = False + continue + + if dest_shard_range.name == src_broker.path: + self.logger.debug( + 'Skipping source as misplaced objects destination') + # in shrinking context, the misplaced objects might actually be + # correctly placed if the root has expanded this shard but this + # broker has not yet been updated + continue + + if dest_shard_range not in dest_brokers: + part, dest_broker, node_id = self._get_shard_broker( + dest_shard_range, src_broker.root_path, policy_index) + # save the broker info that was sampled prior to the *first* + # yielded objects for this destination + destination = {'part': part, + 'dest_broker': dest_broker, + 'node_id': node_id, + 'info': info} + dest_brokers[dest_shard_range] = destination + else: + destination = dest_brokers[dest_shard_range] + destination['dest_broker'].merge_items(objs) + placed += len(objs) + + if unplaced: + self.logger.warning( + 'Failed to find destination for at least %s misplaced objects ' + 'in %s' % (unplaced, src_broker.path)) + + # TODO: consider executing the replication jobs concurrently + for dest_shard_range, dest_args in dest_brokers.items(): + self.logger.debug('moving misplaced objects found in range %s' % + dest_shard_range) + success &= self._replicate_and_delete( + src_broker, dest_shard_range, **dest_args) + + self._increment_stat('misplaced', 'placed', step=placed) + self._increment_stat('misplaced', 'unplaced', step=unplaced) + return success, placed + unplaced + + def _make_shard_range_fetcher(self, broker, src_shard_range): + # returns a function that will lazy load shard ranges on demand; + # this means only one lookup is made for all misplaced ranges. + outer = {} + + def shard_range_fetcher(): + if not outer: + if broker.is_root_container(): + ranges = broker.get_shard_ranges( + marker=src_shard_range.lower_str, + end_marker=src_shard_range.end_marker, + states=SHARD_UPDATE_STATES) + else: + # TODO: the root may not yet know about shard ranges to + # which a shard is sharding, but those could come from + # the broker + ranges = self._fetch_shard_ranges( + broker, newest=True, + params={'states': 'updating', + 'marker': src_shard_range.lower_str, + 'end_marker': src_shard_range.end_marker}) + outer['ranges'] = iter(ranges) + return outer['ranges'] + return shard_range_fetcher + + def _make_default_misplaced_object_bounds(self, broker): + # Objects outside of this container's own range are misplaced. 
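+ # For example, an own shard range spanning ('d', 'x') gives default + # bounds of ('', 'd') and ('x', ''), i.e. the namespaces below and + # above the container's own range.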
+ own_shard_range = broker.get_own_shard_range() + bounds = [] + if own_shard_range.lower: + bounds.append(('', own_shard_range.lower)) + if own_shard_range.upper: + bounds.append((own_shard_range.upper, '')) + return bounds + + def _make_misplaced_object_bounds(self, broker): + bounds = [] + state = broker.get_db_state() + if state == SHARDED: + # Anything in the object table is treated as a misplaced object. + bounds.append(('', '')) + + if not bounds and state == SHARDING: + # Objects outside of this container's own range are misplaced. + # Objects in already cleaved shard ranges are also misplaced. + cleave_context = CleavingContext.load(broker) + if cleave_context.cursor: + bounds.append(('', cleave_context.cursor)) + own_shard_range = broker.get_own_shard_range() + if own_shard_range.upper: + bounds.append((own_shard_range.upper, '')) + + return bounds or self._make_default_misplaced_object_bounds(broker) + + def _move_misplaced_objects(self, broker, src_broker=None, + src_bounds=None): + """ + Search for objects in the given broker that do not belong in that + broker's namespace and move those objects to their correct shard + container. + + :param broker: An instance of :class:`swift.container.ContainerBroker`. + :param src_broker: optional alternative broker to use as the source + of misplaced objects; if not specified then ``broker`` is used as + the source. + :param src_bounds: optional list of (lower, upper) namespace bounds to + use when searching for misplaced objects + :return: True if all misplaced objects were sufficiently replicated to + their correct shard containers, False otherwise + """ + self.logger.debug('Looking for misplaced objects in %s (%s)', + broker.path.decode('utf-8'), broker.db_file) + self._increment_stat('misplaced', 'attempted') + src_broker = src_broker or broker + if src_bounds is None: + src_bounds = self._make_misplaced_object_bounds(broker) + # (ab)use ShardRange instances to encapsulate source namespaces + src_ranges = [ShardRange('dont/care', Timestamp.now(), lower, upper) + for lower, upper in src_bounds] + self.logger.debug('misplaced object source bounds %s' % src_bounds) + policy_index = broker.storage_policy_index + success = True + num_found = 0 + for src_shard_range in src_ranges: + part_success, part_num_found = self._move_objects( + src_broker, src_shard_range, policy_index, + self._make_shard_range_fetcher(broker, src_shard_range)) + success &= part_success + num_found += part_num_found + + if num_found: + self._increment_stat('misplaced', 'found', statsd=True) + self.logger.debug('Moved %s misplaced objects' % num_found) + self._increment_stat('misplaced', 'success' if success else 'failure') + self.logger.debug('Finished handling misplaced objects') + return success + + def _find_shard_ranges(self, broker): + """ + Scans the container to find shard ranges and adds them to the shard + ranges table. If there are existing shard ranges then scanning starts + from the upper bound of the uppermost existing shard range. + + :param broker: An instance of :class:`swift.container.ContainerBroker` + :return: a tuple of (success, num of shard ranges found) where success + is True if the last shard range has been found, False otherwise. 
+ """ + own_shard_range = broker.get_own_shard_range() + shard_ranges = broker.get_shard_ranges() + if shard_ranges and shard_ranges[-1].upper >= own_shard_range.upper: + self.logger.debug('Scan already completed for %s', broker.path) + return 0 + + self.logger.info('Starting scan for shard ranges on %s', broker.path) + self._increment_stat('scanned', 'attempted') + + start = time.time() + shard_data, last_found = broker.find_shard_ranges( + self.split_size, limit=self.scanner_batch_size, + existing_ranges=shard_ranges) + elapsed = time.time() - start + + if not shard_data: + if last_found: + self.logger.info("Already found all shard ranges") + self._increment_stat('scanned', 'success', statsd=True) + else: + # we didn't find anything + self.logger.warning("No shard ranges found") + self._increment_stat('scanned', 'failure', statsd=True) + return 0 + + shard_ranges = make_shard_ranges( + broker, shard_data, self.shards_account_prefix) + broker.merge_shard_ranges(shard_ranges) + num_found = len(shard_ranges) + self.logger.info( + "Completed scan for shard ranges: %d found", num_found) + self._increment_stat('scanned', 'found', step=num_found) + self._min_stat('scanned', 'min_time', round(elapsed / num_found, 3)) + self._max_stat('scanned', 'max_time', round(elapsed / num_found, 3)) + + if last_found: + self.logger.info("Final shard range reached.") + self._increment_stat('scanned', 'success', statsd=True) + return num_found + + def _create_shard_containers(self, broker): + # Create shard containers that are ready to receive redirected object + # updates. Do this now, so that redirection can begin immediately + # without waiting for cleaving to complete. + found_ranges = broker.get_shard_ranges(states=ShardRange.FOUND) + created_ranges = [] + for shard_range in found_ranges: + self._increment_stat('created', 'attempted') + shard_range.update_state(ShardRange.CREATED) + headers = { + 'X-Backend-Storage-Policy-Index': broker.storage_policy_index, + 'X-Container-Sysmeta-Shard-Root': broker.root_path, + 'X-Container-Sysmeta-Sharding': True} + success = self._send_shard_ranges( + shard_range.account, shard_range.container, + [shard_range], headers=headers) + if success: + self.logger.debug('PUT new shard range container for %s', + shard_range) + self._increment_stat('created', 'success', statsd=True) + else: + self.logger.error( + 'PUT of new shard container %r failed for %s.', + shard_range, broker.path) + self._increment_stat('created', 'failure', statsd=True) + # break, not continue, because elsewhere it is assumed that + # finding and cleaving shard ranges progresses linearly, so we + # do not want any subsequent shard ranges to be in created + # state while this one is still in found state + break + created_ranges.append(shard_range) + + if created_ranges: + broker.merge_shard_ranges(created_ranges) + if not broker.is_root_container(): + self._send_shard_ranges( + broker.root_account, broker.root_container, created_ranges) + self.logger.info( + "Completed creating shard range containers: %d created.", + len(created_ranges)) + return len(created_ranges) + + def _cleave_shard_range(self, broker, cleaving_context, shard_range): + self.logger.info("Cleaving '%s' from row %s into %s for %r", + broker.path, cleaving_context.last_cleave_to_row, + shard_range.name, shard_range) + self._increment_stat('cleaved', 'attempted') + start = time.time() + policy_index = broker.storage_policy_index + try: + shard_part, shard_broker, node_id = self._get_shard_broker( + shard_range, broker.root_path, 
policy_index) + except DeviceUnavailable as duex: + self.logger.warning(str(duex)) + self._increment_stat('cleaved', 'failure', statsd=True) + return False + + # only cleave from the retiring db - misplaced objects handler will + # deal with any objects in the fresh db + source_broker = broker.get_brokers()[0] + # if this range has been cleaved before but replication + # failed then the shard db may still exist and it may not be + # necessary to merge all the rows again + source_db_id = source_broker.get_info()['id'] + source_max_row = source_broker.get_max_row() + sync_point = shard_broker.get_sync(source_db_id) + if sync_point < source_max_row: + sync_from_row = max(cleaving_context.last_cleave_to_row, + sync_point) + for objects, info in self.yield_objects( + source_broker, shard_range, + since_row=sync_from_row): + shard_broker.merge_items(objects) + # Note: the max row stored as a sync point is sampled *before* + # objects are yielded to ensure that is less than or equal to + # the last yielded row. Other sync points are also copied from the + # source broker to the shards; if another replica of the source + # happens to subsequently cleave into a primary replica of the + # shard then it will only need to cleave rows after its last sync + # point with this replica of the source broker. + shard_broker.merge_syncs( + [{'sync_point': source_max_row, 'remote_id': source_db_id}] + + source_broker.get_syncs()) + else: + self.logger.debug("Cleaving '%s': %r - shard db already in sync", + broker.path, shard_range) + + own_shard_range = broker.get_own_shard_range() + + replication_quorum = self.existing_shard_replication_quorum + if shard_range.includes(own_shard_range): + # When shrinking, include deleted own (donor) shard range in + # the replicated db so that when acceptor next updates root it + # will atomically update its namespace *and* delete the donor. + # Don't do this when sharding a shard because the donor + # namespace should not be deleted until all shards are cleaved. + if own_shard_range.update_state(ShardRange.SHARDED): + own_shard_range.set_deleted() + broker.merge_shard_ranges(own_shard_range) + shard_broker.merge_shard_ranges(own_shard_range) + elif shard_range.state == ShardRange.CREATED: + # The shard range object stats may have changed since the shard + # range was found, so update with stats of objects actually + # copied to the shard broker. Only do this the first time each + # shard range is cleaved. 
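+ # A first-time cleave also uses shard_replication_quorum (set below) + # rather than the existing_shard_replication_quorum applied above.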
+ info = shard_broker.get_info() + shard_range.update_meta( + info['object_count'], info['bytes_used']) + shard_range.update_state(ShardRange.CLEAVED) + shard_broker.merge_shard_ranges(shard_range) + replication_quorum = self.shard_replication_quorum + + self.logger.info( + 'Replicating new shard container %s for %s', + shard_broker.path, shard_broker.get_own_shard_range()) + + success, responses = self._replicate_object( + shard_part, shard_broker.db_file, node_id) + + replication_successes = responses.count(True) + if (not success and (not responses or + replication_successes < replication_quorum)): + # insufficient replication or replication not even attempted; + # break because we don't want to progress the cleave cursor + # until each shard range has been successfully cleaved + self.logger.warning( + 'Failed to sufficiently replicate cleaved shard %s for %s: ' + '%s successes, %s required.', shard_range, broker.path, + replication_successes, replication_quorum) + self._increment_stat('cleaved', 'failure', statsd=True) + return False + + elapsed = round(time.time() - start, 3) + self._min_stat('cleaved', 'min_time', elapsed) + self._max_stat('cleaved', 'max_time', elapsed) + broker.merge_shard_ranges(shard_range) + cleaving_context.cursor = shard_range.upper_str + cleaving_context.ranges_done += 1 + cleaving_context.ranges_todo -= 1 + if shard_range.upper >= own_shard_range.upper: + # cleaving complete + cleaving_context.cleaving_done = True + cleaving_context.store(broker) + self.logger.info( + 'Cleaved %s for shard range %s in %gs.', + broker.path, shard_range, elapsed) + self._increment_stat('cleaved', 'success', statsd=True) + return True + + def _cleave(self, broker): + # Returns True if misplaced objects have been moved and the entire + # container namespace has been successfully cleaved, False otherwise + if broker.is_sharded(): + self.logger.debug('Passing over already sharded container %s/%s', + broker.account, broker.container) + return True + + cleaving_context = CleavingContext.load(broker) + if not cleaving_context.misplaced_done: + # ensure any misplaced objects in the source broker are moved; note + # that this invocation of _move_misplaced_objects is targetted at + # the *retiring* db. 
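+ # broker.get_brokers()[0] is the retiring db; the misplaced objects + # handler deals with any objects in the fresh db.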
+ self.logger.debug( + 'Moving any misplaced objects from sharding container: %s', + broker.path) + bounds = self._make_default_misplaced_object_bounds(broker) + cleaving_context.misplaced_done = self._move_misplaced_objects( + broker, src_broker=broker.get_brokers()[0], + src_bounds=bounds) + cleaving_context.store(broker) + + if cleaving_context.cleaving_done: + self.logger.debug('Cleaving already complete for container %s', + broker.path) + return cleaving_context.misplaced_done + + ranges_todo = broker.get_shard_ranges(marker=cleaving_context.marker) + if cleaving_context.cursor: + # always update ranges_todo in case more ranges have been found + # since last visit + cleaving_context.ranges_todo = len(ranges_todo) + self.logger.debug('Continuing to cleave (%s done, %s todo): %s', + cleaving_context.ranges_done, + cleaving_context.ranges_todo, + broker.path) + else: + cleaving_context.start() + cleaving_context.ranges_todo = len(ranges_todo) + self.logger.debug('Starting to cleave (%s todo): %s', + cleaving_context.ranges_todo, broker.path) + + ranges_done = [] + for shard_range in ranges_todo[:self.cleave_batch_size]: + if shard_range.state == ShardRange.FOUND: + break + elif shard_range.state in (ShardRange.CREATED, + ShardRange.CLEAVED, + ShardRange.ACTIVE): + if self._cleave_shard_range( + broker, cleaving_context, shard_range): + ranges_done.append(shard_range) + else: + break + else: + self.logger.warning('Unexpected shard range state for cleave', + shard_range.state) + break + + if not ranges_done: + cleaving_context.store(broker) + self.logger.debug( + 'Cleaved %s shard ranges for %s', len(ranges_done), broker.path) + return (cleaving_context.misplaced_done and + cleaving_context.cleaving_done) + + def _complete_sharding(self, broker): + cleaving_context = CleavingContext.load(broker) + if cleaving_context.done(): + # Move all CLEAVED shards to ACTIVE state and if a shard then + # delete own shard range; these changes will be simultaneously + # reported in the next update to the root container. 
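+ # (only a shard, not the root container, marks its own shard range + # as deleted here).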
+ modified_shard_ranges = broker.get_shard_ranges( + states=ShardRange.CLEAVED) + for sr in modified_shard_ranges: + sr.update_state(ShardRange.ACTIVE) + own_shard_range = broker.get_own_shard_range() + own_shard_range.update_state(ShardRange.SHARDED) + own_shard_range.update_meta(0, 0) + if (not broker.is_root_container() and not + own_shard_range.deleted): + own_shard_range = own_shard_range.copy( + timestamp=Timestamp.now(), deleted=1) + modified_shard_ranges.append(own_shard_range) + broker.merge_shard_ranges(modified_shard_ranges) + if broker.set_sharded_state(): + return True + else: + self.logger.warning( + 'Failed to remove retiring db file for %s', + broker.path) + else: + self.logger.warning( + 'Repeat cleaving required for %r with context: %s' + % (broker.db_files[0], dict(cleaving_context))) + cleaving_context.reset() + cleaving_context.store(broker) + + return False + + def _find_and_enable_sharding_candidates(self, broker, shard_ranges=None): + candidates = find_sharding_candidates( + broker, self.shard_container_threshold, shard_ranges) + if candidates: + self.logger.debug('Identified %s sharding candidates' + % len(candidates)) + broker.merge_shard_ranges(candidates) + + def _find_and_enable_shrinking_candidates(self, broker): + if not broker.is_sharded(): + self.logger.warning('Cannot shrink a not yet sharded container %s', + broker.path) + return + + merge_pairs = find_shrinking_candidates( + broker, self.shrink_size, self.merge_size) + self.logger.debug('Found %s shrinking candidates' % len(merge_pairs)) + own_shard_range = broker.get_own_shard_range() + for acceptor, donor in merge_pairs.items(): + self.logger.debug('shrinking shard range %s into %s in %s' % + (donor, acceptor, broker.db_file)) + broker.merge_shard_ranges([acceptor, donor]) + if acceptor.name != own_shard_range.name: + self._send_shard_ranges( + acceptor.account, acceptor.container, [acceptor]) + acceptor.increment_meta(donor.object_count, donor.bytes_used) + else: + # no need to change namespace or stats + acceptor.update_state(ShardRange.ACTIVE, + state_timestamp=Timestamp.now()) + # Now send a copy of the expanded acceptor, with an updated + # timestamp, to the donor container. This forces the donor to + # asynchronously cleave its entire contents to the acceptor and + # delete itself. The donor will pass its own deleted shard range to + # the acceptor when cleaving. Subsequent updates from the donor or + # the acceptor will then update the root to have the deleted donor + # shard range. + self._send_shard_ranges( + donor.account, donor.container, [donor, acceptor]) + + def _update_root_container(self, broker): + own_shard_range = broker.get_own_shard_range(no_default=True) + if not own_shard_range: + return + + # persist the reported shard metadata + broker.merge_shard_ranges(own_shard_range) + # now get a consistent list of own and other shard ranges + shard_ranges = broker.get_shard_ranges( + include_own=True, + include_deleted=True) + # send everything + self._send_shard_ranges( + broker.root_account, broker.root_container, + shard_ranges) + + def _process_broker(self, broker, node, part): + broker.get_info() # make sure account/container are populated + state = broker.get_db_state() + self.logger.debug('Starting processing %s state %s', + broker.path, state) + + if not self._audit_container(broker): + return + + # now look and deal with misplaced objects. + self._move_misplaced_objects(broker) + + if broker.is_deleted(): + # This container is deleted so we can skip it. 
We still want + # deleted containers to go via misplaced items because they may + # have new objects sitting in them that may need to move. + return + + is_leader = node['index'] == 0 and self.auto_shard + if state in (UNSHARDED, COLLAPSED): + if is_leader and broker.is_root_container(): + # bootstrap sharding of root container + self._find_and_enable_sharding_candidates( + broker, shard_ranges=[broker.get_own_shard_range()]) + + own_shard_range = broker.get_own_shard_range() + if own_shard_range.state in (ShardRange.SHARDING, + ShardRange.SHRINKING, + ShardRange.SHARDED): + if broker.get_shard_ranges(): + # container has been given shard ranges rather than + # found them e.g. via replication or a shrink event + if broker.set_sharding_state(): + state = SHARDING + elif is_leader: + if broker.set_sharding_state(): + state = SHARDING + else: + self.logger.debug( + 'Own shard range in state %r but no shard ranges ' + 'and not leader; remaining unsharded: %s' + % (own_shard_range.state_text, broker.path)) + + if state == SHARDING: + if is_leader: + num_found = self._find_shard_ranges(broker) + else: + num_found = 0 + + # create shard containers for newly found ranges + num_created = self._create_shard_containers(broker) + + if num_found or num_created: + # share updated shard range state with other nodes + self._replicate_object(part, broker.db_file, node['id']) + + # always try to cleave any pending shard ranges + cleave_complete = self._cleave(broker) + + if cleave_complete: + self.logger.info('Completed cleaving of %s', broker.path) + if self._complete_sharding(broker): + state = SHARDED + self._increment_stat('visited', 'completed', statsd=True) + else: + self.logger.debug('Remaining in sharding state %s', + broker.path) + + if state == SHARDED and broker.is_root_container(): + if is_leader: + self._find_and_enable_shrinking_candidates(broker) + self._find_and_enable_sharding_candidates(broker) + for shard_range in broker.get_shard_ranges( + states=[ShardRange.SHARDING]): + self._send_shard_ranges( + shard_range.account, shard_range.container, + [shard_range]) + + if not broker.is_root_container(): + # Update the root container with this container's shard range + # info; do this even when sharded in case previous attempts + # failed; don't do this if there is no own shard range. When + # sharding a shard, this is when the root will see the new + # shards move to ACTIVE state and the sharded shard + # simultaneously become deleted. + self._update_root_container(broker) + + self.logger.debug('Finished processing %s/%s state %s', + broker.account, broker.container, + broker.get_db_state()) + + def _one_shard_cycle(self, devices_to_shard, partitions_to_shard): + """ + The main function, everything the sharder does forks from this method. 
+ + The sharder loops through each container with sharding enabled and each + sharded container on the server, on each container it: + - audits the container + - checks and deals with misplaced items + - cleaves any shard ranges as required + - if not a root container, reports shard range stats to the root + container + """ + self.logger.info('Container sharder cycle starting, auto-sharding %s', + self.auto_shard) + if isinstance(devices_to_shard, (list, tuple)): + self.logger.info('(Override devices: %s)', + ', '.join(str(d) for d in devices_to_shard)) + if isinstance(partitions_to_shard, (list, tuple)): + self.logger.info('(Override partitions: %s)', + ', '.join(str(p) for p in partitions_to_shard)) + self._zero_stats() + self._local_device_ids = set() + dirs = [] + self.ips = whataremyips(bind_ip=self.bind_ip) + for node in self.ring.devs: + if not self._check_node(node): + continue + datadir = os.path.join(self.root, node['device'], self.datadir) + if os.path.isdir(datadir): + # Populate self._local_device_ids so we can find devices for + # shard containers later + self._local_device_ids.add(node['id']) + if node['device'] not in devices_to_shard: + continue + part_filt = self._partition_dir_filter( + node['id'], + partitions_to_shard) + dirs.append((datadir, node, part_filt)) + if not dirs: + self.logger.warning('Found no data dirs!') + for part, path, node in db_replicator.roundrobin_datadirs(dirs): + # NB: get_part_nodes always provides an 'index' key; + # this will be used in leader selection + for primary in self.ring.get_part_nodes(int(part)): + if node['id'] == primary['id']: + node = primary + break + else: + # Set index such that we'll *never* be selected as a leader + node['index'] = 'handoff' + + broker = ContainerBroker(path, logger=self.logger, + timeout=self.broker_timeout) + error = None + try: + self._identify_sharding_candidate(broker, node) + if sharding_enabled(broker): + self._increment_stat('visited', 'attempted') + self._process_broker(broker, node, part) + self._increment_stat('visited', 'success', statsd=True) + else: + self._increment_stat('visited', 'skipped') + except (Exception, Timeout) as error: + self._increment_stat('visited', 'failure', statsd=True) + self.logger.exception( + 'Unhandled exception while processing %s: %s', path, error) + try: + self._record_sharding_progress(broker, node, error) + except (Exception, Timeout) as error: + self.logger.exception( + 'Unhandled exception while dumping progress for %s: %s', + path, error) + self._periodic_report_stats() + + self._report_stats() + + def run_forever(self, *args, **kwargs): + """Run the container sharder until stopped.""" + self.reported = time.time() + time.sleep(random() * self.interval) + while True: + begin = time.time() + try: + self._one_shard_cycle(devices_to_shard=Everything(), + partitions_to_shard=Everything()) + except (Exception, Timeout): + self.logger.increment('errors') + self.logger.exception('Exception in sharder') + elapsed = time.time() - begin + self.logger.info( + 'Container sharder cycle completed: %.02fs', elapsed) + if elapsed < self.interval: + time.sleep(self.interval - elapsed) + + def run_once(self, *args, **kwargs): + """Run the container sharder once.""" + self.logger.info('Begin container sharder "once" mode') + override_options = parse_override_options(once=True, **kwargs) + devices_to_shard = override_options.devices or Everything() + partitions_to_shard = override_options.partitions or Everything() + begin = self.reported = time.time() + 
self._one_shard_cycle(devices_to_shard=devices_to_shard, + partitions_to_shard=partitions_to_shard) + elapsed = time.time() - begin + self.logger.info( + 'Container sharder "once" mode completed: %.02fs', elapsed) diff --git a/swift/proxy/controllers/base.py b/swift/proxy/controllers/base.py index cca8f6cc14..4822b01729 100644 --- a/swift/proxy/controllers/base.py +++ b/swift/proxy/controllers/base.py @@ -2007,7 +2007,7 @@ class Controller(object): :param req: original Request instance. :param account: account in which `container` is stored. - :param container: container from which listing should be fetched. + :param container: container from listing should be fetched. :param headers: headers to be included with the request :param params: query string parameters to be used. :return: a tuple of (deserialized json data structure, swob Response) diff --git a/swift/proxy/controllers/container.py b/swift/proxy/controllers/container.py index f95a31f35a..e90632a294 100644 --- a/swift/proxy/controllers/container.py +++ b/swift/proxy/controllers/container.py @@ -21,6 +21,7 @@ from swift.common.utils import public, csv_append, Timestamp, \ config_true_value, ShardRange from swift.common.constraints import check_metadata, CONTAINER_LISTING_LIMIT from swift.common.http import HTTP_ACCEPTED, is_success +from swift.common.request_helpers import get_sys_meta_prefix from swift.proxy.controllers.base import Controller, delay_denial, \ cors_validation, set_info_cache, clear_info_cache from swift.common.storage_policy import POLICIES @@ -136,6 +137,11 @@ class ContainerController(Controller): for key in self.app.swift_owner_headers: if key in resp.headers: del resp.headers[key] + # Expose sharding state in reseller requests + if req.environ.get('reseller_request', False): + resp.headers['X-Container-Sharding'] = config_true_value( + resp.headers.get(get_sys_meta_prefix('container') + 'Sharding', + 'False')) return resp def _get_from_shards(self, req, resp): @@ -257,6 +263,10 @@ class ContainerController(Controller): if not req.environ.get('swift_owner'): for key in self.app.swift_owner_headers: req.headers.pop(key, None) + if req.environ.get('reseller_request', False) and \ + 'X-Container-Sharding' in req.headers: + req.headers[get_sys_meta_prefix('container') + 'Sharding'] = \ + str(config_true_value(req.headers['X-Container-Sharding'])) length_limit = self.get_name_length_limit() if len(self.container_name) > length_limit: resp = HTTPBadRequest(request=req) @@ -305,6 +315,10 @@ class ContainerController(Controller): if not req.environ.get('swift_owner'): for key in self.app.swift_owner_headers: req.headers.pop(key, None) + if req.environ.get('reseller_request', False) and \ + 'X-Container-Sharding' in req.headers: + req.headers[get_sys_meta_prefix('container') + 'Sharding'] = \ + str(config_true_value(req.headers['X-Container-Sharding'])) account_partition, accounts, container_count = \ self.account_info(self.account_name, req) if not accounts: diff --git a/test/probe/test_sharder.py b/test/probe/test_sharder.py new file mode 100644 index 0000000000..77ee3dd35b --- /dev/null +++ b/test/probe/test_sharder.py @@ -0,0 +1,2025 @@ +# Copyright (c) 2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import hashlib +import json +import os +import shutil +import uuid + +from nose import SkipTest + +from swift.common import direct_client +from swift.common.direct_client import DirectClientException +from swift.common.utils import ShardRange, parse_db_filename, get_db_files, \ + quorum_size, config_true_value, Timestamp +from swift.container.backend import ContainerBroker, UNSHARDED, SHARDING +from swift.common import utils +from swift.common.manager import Manager +from swiftclient import client, get_auth, ClientException + +from swift.proxy.controllers.obj import num_container_updates +from test import annotate_failure +from test.probe.brain import BrainSplitter +from test.probe.common import ReplProbeTest, get_server_number, \ + wait_for_server_to_hangup + + +MIN_SHARD_CONTAINER_THRESHOLD = 4 +MAX_SHARD_CONTAINER_THRESHOLD = 100 + + +class ShardCollector(object): + """ + Returns map of node to tuples of (headers, shard ranges) returned from node + """ + def __init__(self): + self.ranges = {} + + def __call__(self, cnode, cpart, account, container): + self.ranges[cnode['id']] = direct_client.direct_get_container( + cnode, cpart, account, container, + headers={'X-Backend-Record-Type': 'shard'}) + + +class BaseTestContainerSharding(ReplProbeTest): + + def _maybe_skip_test(self): + try: + cont_configs = [utils.readconf(p, 'container-sharder') + for p in self.configs['container-server'].values()] + except ValueError: + raise SkipTest('No [container-sharder] section found in ' + 'container-server configs') + + skip_reasons = [] + auto_shard = all([config_true_value(c.get('auto_shard', False)) + for c in cont_configs]) + if not auto_shard: + skip_reasons.append( + 'auto_shard must be true in all container_sharder configs') + + self.max_shard_size = max( + int(c.get('shard_container_threshold', '1000000')) + for c in cont_configs) + + if not (MIN_SHARD_CONTAINER_THRESHOLD <= self.max_shard_size + <= MAX_SHARD_CONTAINER_THRESHOLD): + skip_reasons.append( + 'shard_container_threshold %d must be between %d and %d' % + (self.max_shard_size, MIN_SHARD_CONTAINER_THRESHOLD, + MAX_SHARD_CONTAINER_THRESHOLD)) + + def skip_check(reason_list, option, required): + values = set([int(c.get(option, required)) for c in cont_configs]) + if values != {required}: + reason_list.append('%s must be %s' % (option, required)) + + skip_check(skip_reasons, 'shard_scanner_batch_size', 10) + skip_check(skip_reasons, 'shard_batch_size', 2) + + if skip_reasons: + raise SkipTest(', '.join(skip_reasons)) + + def _load_rings_and_configs(self): + super(BaseTestContainerSharding, self)._load_rings_and_configs() + # perform checks for skipping test before starting services + self._maybe_skip_test() + + def _make_object_names(self, number): + return ['obj-%04d' % x for x in range(number)] + + def _setup_container_name(self): + self.container_name = 'container-%s' % uuid.uuid4() + + def setUp(self): + client.logger.setLevel(client.logging.WARNING) + client.requests.logging.getLogger().setLevel( + client.requests.logging.WARNING) + super(BaseTestContainerSharding, self).setUp() + _, self.admin_token = get_auth( + 
'http://127.0.0.1:8080/auth/v1.0', 'admin:admin', 'admin') + self._setup_container_name() + self.brain = BrainSplitter(self.url, self.token, self.container_name, + None, 'container') + self.brain.put_container(policy_index=int(self.policy)) + self.sharders = Manager(['container-sharder']) + self.internal_client = self.make_internal_client() + + def stop_container_servers(self, node_numbers=None): + if node_numbers: + ipports = [] + server2ipport = {v: k for k, v in self.ipport2server.items()} + for number in self.brain.node_numbers[node_numbers]: + self.brain.servers.stop(number=number) + server = 'container%d' % number + ipports.append(server2ipport[server]) + else: + ipports = [k for k, v in self.ipport2server.items() + if v.startswith('container')] + self.brain.servers.stop() + for ipport in ipports: + wait_for_server_to_hangup(ipport) + + def put_objects(self, obj_names): + for obj in obj_names: + client.put_object(self.url, self.token, self.container_name, obj) + + def delete_objects(self, obj_names): + for obj in obj_names: + client.delete_object( + self.url, self.token, self.container_name, obj) + + def get_container_shard_ranges(self, account=None, container=None): + account = account if account else self.account + container = container if container else self.container_name + path = self.internal_client.make_path(account, container) + resp = self.internal_client.make_request( + 'GET', path + '?format=json', {'X-Backend-Record-Type': 'shard'}, + [200]) + return [ShardRange.from_dict(sr) for sr in json.loads(resp.body)] + + def direct_container_op(self, func, account=None, container=None, + expect_failure=False): + account = account if account else self.account + container = container if container else self.container_name + cpart, cnodes = self.container_ring.get_nodes(account, container) + unexpected_responses = [] + results = {} + for cnode in cnodes: + try: + results[cnode['id']] = func(cnode, cpart, account, container) + except DirectClientException as err: + if not expect_failure: + unexpected_responses.append((cnode, err)) + else: + if expect_failure: + unexpected_responses.append((cnode, 'success')) + if unexpected_responses: + self.fail('Unexpected responses: %s' % unexpected_responses) + return results + + def direct_get_container_shard_ranges(self, account=None, container=None, + expect_failure=False): + collector = ShardCollector() + self.direct_container_op( + collector, account, container, expect_failure) + return collector.ranges + + def direct_delete_container(self, account=None, container=None, + expect_failure=False): + self.direct_container_op(direct_client.direct_delete_container, + account, container, expect_failure) + + def direct_head_container(self, account=None, container=None, + expect_failure=False): + return self.direct_container_op(direct_client.direct_head_container, + account, container, expect_failure) + + def get_storage_dir(self, part, node, account=None, container=None): + account = account or self.brain.account + container = container or self.container_name + server_type, config_number = get_server_number( + (node['ip'], node['port']), self.ipport2server) + assert server_type == 'container' + repl_server = '%s-replicator' % server_type + conf = utils.readconf(self.configs[repl_server][config_number], + section_name=repl_server) + datadir = os.path.join(conf['devices'], node['device'], 'containers') + container_hash = utils.hash_path(account, container) + return (utils.storage_directory(datadir, part, container_hash), + container_hash) + + def 
get_broker(self, part, node, account=None, container=None): + container_dir, container_hash = self.get_storage_dir( + part, node, account=account, container=container) + db_file = os.path.join(container_dir, container_hash + '.db') + self.assertTrue(get_db_files(db_file)) # sanity check + return ContainerBroker(db_file) + + def categorize_container_dir_content(self, account=None, container=None): + account = account or self.brain.account + container = container or self.container_name + part, nodes = self.brain.ring.get_nodes(account, container) + storage_dirs = [ + self.get_storage_dir(part, node, account=account, + container=container)[0] + for node in nodes] + result = { + 'shard_dbs': [], + 'normal_dbs': [], + 'pendings': [], + 'locks': [], + 'other': [], + } + for storage_dir in storage_dirs: + for f in os.listdir(storage_dir): + path = os.path.join(storage_dir, f) + if path.endswith('.db'): + hash_, epoch, ext = parse_db_filename(path) + if epoch: + result['shard_dbs'].append(path) + else: + result['normal_dbs'].append(path) + elif path.endswith('.db.pending'): + result['pendings'].append(path) + elif path.endswith('/.lock'): + result['locks'].append(path) + else: + result['other'].append(path) + if result['other']: + self.fail('Found unexpected files in storage directory:\n %s' % + '\n '.join(result['other'])) + return result + + def assertLengthEqual(self, obj, length): + obj_len = len(obj) + self.assertEqual(obj_len, length, 'len(%r) == %d, not %d' % ( + obj, obj_len, length)) + + def assert_dict_contains(self, expected_items, actual_dict): + ignored = set(expected_items) ^ set(actual_dict) + filtered_actual = dict((k, actual_dict[k]) + for k in actual_dict if k not in ignored) + self.assertEqual(expected_items, filtered_actual) + + def assert_shard_ranges_contiguous(self, expected_number, shard_ranges, + first_lower='', last_upper=''): + if shard_ranges and isinstance(shard_ranges[0], ShardRange): + actual_shard_ranges = sorted(shard_ranges) + else: + actual_shard_ranges = sorted([ShardRange.from_dict(d) + for d in shard_ranges]) + self.assertLengthEqual(actual_shard_ranges, expected_number) + if expected_number: + with annotate_failure('Ranges %s.' 
% actual_shard_ranges): + self.assertEqual(first_lower, actual_shard_ranges[0].lower_str) + for x, y in zip(actual_shard_ranges, actual_shard_ranges[1:]): + self.assertEqual(x.upper, y.lower) + self.assertEqual(last_upper, actual_shard_ranges[-1].upper_str) + + def assert_shard_range_equal(self, expected, actual, excludes=None): + excludes = excludes or [] + expected_dict = dict(expected) + actual_dict = dict(actual) + for k in excludes: + expected_dict.pop(k, None) + actual_dict.pop(k, None) + self.assertEqual(expected_dict, actual_dict) + + def assert_shard_range_lists_equal(self, expected, actual, excludes=None): + self.assertEqual(len(expected), len(actual)) + for expected, actual in zip(expected, actual): + self.assert_shard_range_equal(expected, actual, excludes=excludes) + + def assert_shard_range_state(self, expected_state, shard_ranges): + if shard_ranges and not isinstance(shard_ranges[0], ShardRange): + shard_ranges = [ShardRange.from_dict(data) + for data in shard_ranges] + self.assertEqual([expected_state] * len(shard_ranges), + [sr.state for sr in shard_ranges]) + + def assert_total_object_count(self, expected_object_count, shard_ranges): + actual = sum([sr['object_count'] for sr in shard_ranges]) + self.assertEqual(expected_object_count, actual) + + def assert_container_listing(self, expected_listing): + headers, actual_listing = client.get_container( + self.url, self.token, self.container_name) + self.assertIn('x-container-object-count', headers) + expected_obj_count = len(expected_listing) + self.assertEqual(expected_listing, [ + x['name'].encode('utf-8') for x in actual_listing]) + self.assertEqual(str(expected_obj_count), + headers['x-container-object-count']) + return headers, actual_listing + + def assert_container_object_count(self, expected_obj_count): + headers = client.head_container( + self.url, self.token, self.container_name) + self.assertIn('x-container-object-count', headers) + self.assertEqual(str(expected_obj_count), + headers['x-container-object-count']) + + def assert_container_post_ok(self, meta_value): + key = 'X-Container-Meta-Assert-Post-Works' + headers = {key: meta_value} + client.post_container( + self.url, self.token, self.container_name, headers=headers) + resp_headers = client.head_container( + self.url, self.token, self.container_name) + self.assertEqual(meta_value, resp_headers.get(key.lower())) + + def assert_container_post_fails(self, meta_value): + key = 'X-Container-Meta-Assert-Post-Works' + headers = {key: meta_value} + with self.assertRaises(ClientException) as cm: + client.post_container( + self.url, self.token, self.container_name, headers=headers) + self.assertEqual(404, cm.exception.http_status) + + def assert_container_delete_fails(self): + with self.assertRaises(ClientException) as cm: + client.delete_container(self.url, self.token, self.container_name) + self.assertEqual(409, cm.exception.http_status) + + def assert_container_not_found(self): + with self.assertRaises(ClientException) as cm: + client.get_container(self.url, self.token, self.container_name) + self.assertEqual(404, cm.exception.http_status) + # check for headers leaking out while deleted + resp_headers = cm.exception.http_response_headers + self.assertNotIn('X-Container-Object-Count', resp_headers) + self.assertNotIn('X-Container-Bytes-Used', resp_headers) + self.assertNotIn('X-Timestamp', resp_headers) + self.assertNotIn('X-PUT-Timestamp', resp_headers) + + def assert_container_has_shard_sysmeta(self): + node_headers = self.direct_head_container() + for node_id, 
headers in node_headers.items(): + with annotate_failure('%s in %s' % (node_id, node_headers.keys())): + for k, v in headers.items(): + if k.lower().startswith('x-container-sysmeta-shard'): + break + else: + self.fail('No shard sysmeta found in %s' % headers) + + def assert_container_state(self, node, expected_state, num_shard_ranges): + headers, shard_ranges = direct_client.direct_get_container( + node, self.brain.part, self.account, self.container_name, + headers={'X-Backend-Record-Type': 'shard'}) + self.assertEqual(num_shard_ranges, len(shard_ranges)) + self.assertIn('X-Backend-Sharding-State', headers) + self.assertEqual( + expected_state, headers['X-Backend-Sharding-State']) + return [ShardRange.from_dict(sr) for sr in shard_ranges] + + def get_part_and_node_numbers(self, shard_range): + """Return the partition and node numbers for a shard range.""" + part, nodes = self.brain.ring.get_nodes( + shard_range.account, shard_range.container) + return part, [n['id'] + 1 for n in nodes] + + def run_sharders(self, shard_ranges): + """Run the sharder on partitions for given shard ranges.""" + if not isinstance(shard_ranges, (list, tuple, set)): + shard_ranges = (shard_ranges,) + partitions = ','.join(str(self.get_part_and_node_numbers(sr)[0]) + for sr in shard_ranges) + self.sharders.once(additional_args='--partitions=%s' % partitions) + + def run_sharder_sequentially(self, shard_range=None): + """Run sharder node by node on partition for given shard range.""" + if shard_range: + part, node_numbers = self.get_part_and_node_numbers(shard_range) + else: + part, node_numbers = self.brain.part, self.brain.node_numbers + for node_number in node_numbers: + self.sharders.once(number=node_number, + additional_args='--partitions=%s' % part) + + +class TestContainerShardingNonUTF8(BaseTestContainerSharding): + def test_sharding_listing(self): + # verify parameterised listing of a container during sharding + all_obj_names = self._make_object_names(4 * self.max_shard_size) + obj_names = all_obj_names[::2] + self.put_objects(obj_names) + # choose some names approx in middle of each expected shard range + markers = [ + obj_names[i] for i in range(self.max_shard_size / 4, + 2 * self.max_shard_size, + self.max_shard_size / 2)] + + def check_listing(objects, **params): + qs = '&'.join(['%s=%s' % param for param in params.items()]) + headers, listing = client.get_container( + self.url, self.token, self.container_name, query_string=qs) + listing = [x['name'].encode('utf-8') for x in listing] + if params.get('reverse'): + marker = params.get('marker', ShardRange.MAX) + end_marker = params.get('end_marker', ShardRange.MIN) + expected = [o for o in objects if end_marker < o < marker] + expected.reverse() + else: + marker = params.get('marker', ShardRange.MIN) + end_marker = params.get('end_marker', ShardRange.MAX) + expected = [o for o in objects if marker < o < end_marker] + if 'limit' in params: + expected = expected[:params['limit']] + self.assertEqual(expected, listing) + + def check_listing_precondition_fails(**params): + qs = '&'.join(['%s=%s' % param for param in params.items()]) + with self.assertRaises(ClientException) as cm: + client.get_container( + self.url, self.token, self.container_name, query_string=qs) + self.assertEqual(412, cm.exception.http_status) + return cm.exception + + def do_listing_checks(objects): + check_listing(objects) + check_listing(objects, marker=markers[0], end_marker=markers[1]) + check_listing(objects, marker=markers[0], end_marker=markers[2]) + check_listing(objects, 
marker=markers[1], end_marker=markers[3]) + check_listing(objects, marker=markers[1], end_marker=markers[3], + limit=self.max_shard_size / 4) + check_listing(objects, marker=markers[1], end_marker=markers[3], + limit=self.max_shard_size / 4) + check_listing(objects, marker=markers[1], end_marker=markers[2], + limit=self.max_shard_size / 2) + check_listing(objects, marker=markers[1], end_marker=markers[1]) + check_listing(objects, reverse=True) + check_listing(objects, reverse=True, end_marker=markers[1]) + check_listing(objects, reverse=True, marker=markers[3], + end_marker=markers[1], limit=self.max_shard_size / 4) + check_listing(objects, reverse=True, marker=markers[3], + end_marker=markers[1], limit=0) + check_listing([], marker=markers[0], end_marker=markers[0]) + check_listing([], marker=markers[0], end_marker=markers[1], + reverse=True) + check_listing(objects, prefix='obj') + check_listing([], prefix='zzz') + # delimiter + headers, listing = client.get_container( + self.url, self.token, self.container_name, + query_string='delimiter=-') + self.assertEqual([{'subdir': 'obj-'}], listing) + + limit = self.cluster_info['swift']['container_listing_limit'] + exc = check_listing_precondition_fails(limit=limit + 1) + self.assertIn('Maximum limit', exc.http_response_content) + exc = check_listing_precondition_fails(delimiter='ab') + self.assertIn('Bad delimiter', exc.http_response_content) + + # sanity checks + do_listing_checks(obj_names) + + # Shard the container + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + # First run the 'leader' in charge of scanning, which finds all shard + # ranges and cleaves first two + self.sharders.once(number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + # Then run sharder on other nodes which will also cleave first two + # shard ranges + for n in self.brain.node_numbers[1:]: + self.sharders.once( + number=n, additional_args='--partitions=%s' % self.brain.part) + + # sanity check shard range states + for node in self.brain.nodes: + self.assert_container_state(node, 'sharding', 4) + shard_ranges = self.get_container_shard_ranges() + self.assertLengthEqual(shard_ranges, 4) + self.assert_shard_range_state(ShardRange.CLEAVED, shard_ranges[:2]) + self.assert_shard_range_state(ShardRange.CREATED, shard_ranges[2:]) + + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() # confirm no sysmeta deleted + self.assert_container_post_ok('sharding') + do_listing_checks(obj_names) + + # put some new objects spread through entire namespace + new_obj_names = all_obj_names[1::4] + self.put_objects(new_obj_names) + + # new objects that fell into the first two cleaved shard ranges are + # reported in listing, new objects in the yet-to-be-cleaved shard + # ranges are not yet included in listing + exp_obj_names = [o for o in obj_names + new_obj_names + if o <= shard_ranges[1].upper] + exp_obj_names += [o for o in obj_names + if o > shard_ranges[1].upper] + exp_obj_names.sort() + do_listing_checks(exp_obj_names) + + # run all the sharders again and the last two shard ranges get cleaved + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 4) + shard_ranges = self.get_container_shard_ranges() + self.assert_shard_range_state(ShardRange.ACTIVE, shard_ranges) + + exp_obj_names = obj_names + new_obj_names + exp_obj_names.sort() + 
do_listing_checks(exp_obj_names) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + + # delete original objects + self.delete_objects(obj_names) + do_listing_checks(new_obj_names) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + + +class TestContainerShardingUTF8(TestContainerShardingNonUTF8): + def _make_object_names(self, number): + # override default with names that include non-ascii chars + name_length = self.cluster_info['swift']['max_object_name_length'] + obj_names = [] + for x in range(number): + name = (u'obj-\u00e4\u00ea\u00ec\u00f2\u00fb-%04d' % x) + name = name.encode('utf8').ljust(name_length, 'o') + obj_names.append(name) + return obj_names + + def _setup_container_name(self): + # override default with max length name that includes non-ascii chars + super(TestContainerShardingUTF8, self)._setup_container_name() + name_length = self.cluster_info['swift']['max_container_name_length'] + cont_name = self.container_name + u'-\u00e4\u00ea\u00ec\u00f2\u00fb' + self.conainer_name = cont_name.encode('utf8').ljust(name_length, 'x') + + +class TestContainerSharding(BaseTestContainerSharding): + def _test_sharded_listing(self, run_replicators=False): + obj_names = self._make_object_names(self.max_shard_size) + self.put_objects(obj_names) + + # Verify that we start out with normal DBs, no shards + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['normal_dbs'], 3) + self.assertLengthEqual(found['shard_dbs'], 0) + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + self.assertEqual('unsharded', broker.get_db_state()) + self.assertLengthEqual(broker.get_shard_ranges(), 0) + + headers, pre_sharding_listing = client.get_container( + self.url, self.token, self.container_name) + self.assertEqual(obj_names, [x['name'].encode('utf-8') + for x in pre_sharding_listing]) # sanity + + # Shard it + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + pre_sharding_headers = client.head_container( + self.url, self.admin_token, self.container_name) + self.assertEqual('True', + pre_sharding_headers.get('x-container-sharding')) + + # Only run the one in charge of scanning + self.sharders.once(number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + + # Verify that we have one sharded db -- though the other normal DBs + # received the shard ranges that got defined + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 1) + broker = ContainerBroker(found['shard_dbs'][0]) + # TODO: assert the shard db is on replica 0 + self.assertIs(True, broker.is_root_container()) + self.assertEqual('sharded', broker.get_db_state()) + orig_root_shard_ranges = [dict(sr) for sr in broker.get_shard_ranges()] + self.assertLengthEqual(orig_root_shard_ranges, 2) + self.assert_total_object_count(len(obj_names), orig_root_shard_ranges) + self.assert_shard_ranges_contiguous(2, orig_root_shard_ranges) + self.assertEqual([ShardRange.ACTIVE, ShardRange.ACTIVE], + [sr['state'] for sr in orig_root_shard_ranges]) + self.direct_delete_container(expect_failure=True) + + self.assertLengthEqual(found['normal_dbs'], 2) + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + 
self.assertEqual('unsharded', broker.get_db_state()) + # the sharded db had shard range meta_timestamps and state updated + # during cleaving, so we do not expect those to be equal on other + # nodes + self.assert_shard_range_lists_equal( + orig_root_shard_ranges, broker.get_shard_ranges(), + excludes=['meta_timestamp', 'state', 'state_timestamp']) + + if run_replicators: + Manager(['container-replicator']).once() + # replication doesn't change the db file names + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 1) + self.assertLengthEqual(found['normal_dbs'], 2) + + # Now that everyone has shard ranges, run *everyone* + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + + # Verify that we only have shard dbs now + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 3) + self.assertLengthEqual(found['normal_dbs'], 0) + # Shards stayed the same + for db_file in found['shard_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + self.assertEqual('sharded', broker.get_db_state()) + # Well, except for meta_timestamps, since the shards each reported + self.assert_shard_range_lists_equal( + orig_root_shard_ranges, broker.get_shard_ranges(), + excludes=['meta_timestamp', 'state_timestamp']) + for orig, updated in zip(orig_root_shard_ranges, + broker.get_shard_ranges()): + self.assertGreaterEqual(updated.state_timestamp, + orig['state_timestamp']) + self.assertGreaterEqual(updated.meta_timestamp, + orig['meta_timestamp']) + + # Check that entire listing is available + headers, actual_listing = self.assert_container_listing(obj_names) + # ... and check some other container properties + self.assertEqual(headers['last-modified'], + pre_sharding_headers['last-modified']) + + # It even works in reverse! + headers, listing = client.get_container(self.url, self.token, + self.container_name, + query_string='reverse=on') + self.assertEqual(pre_sharding_listing[::-1], listing) + + # Now put some new objects into first shard, taking its count to + # 3 shard ranges' worth + more_obj_names = [ + 'beta%03d' % x for x in range(self.max_shard_size)] + self.put_objects(more_obj_names) + + # The listing includes new objects... + headers, listing = self.assert_container_listing( + more_obj_names + obj_names) + self.assertEqual(pre_sharding_listing, listing[len(more_obj_names):]) + + # ...but root object count is out of date until the sharders run and + # update the root + self.assert_container_object_count(len(obj_names)) + + # run sharders on the shard to get root updated + shard_1 = ShardRange.from_dict(orig_root_shard_ranges[0]) + self.run_sharders(shard_1) + self.assert_container_object_count(len(more_obj_names + obj_names)) + + # we've added objects enough that we need to shard the first shard + # *again* into three new sub-shards, but nothing happens until the root + # leader identifies shard candidate... + root_shard_ranges = self.direct_get_container_shard_ranges() + for node, (hdrs, root_shards) in root_shard_ranges.items(): + self.assertLengthEqual(root_shards, 2) + with annotate_failure('node %s. 
' % node): + self.assertEqual( + [ShardRange.ACTIVE] * 2, + [sr['state'] for sr in root_shards]) + # orig shards 0, 1 should be contiguous + self.assert_shard_ranges_contiguous(2, root_shards) + + # Now run the root leader to identify shard candidate...while one of + # the shard container servers is down + shard_1_part, shard_1_nodes = self.get_part_and_node_numbers(shard_1) + self.brain.servers.stop(number=shard_1_nodes[2]) + self.sharders.once(number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + + # ... so third replica of first shard state is not moved to sharding + found_for_shard = self.categorize_container_dir_content( + shard_1.account, shard_1.container) + self.assertLengthEqual(found_for_shard['normal_dbs'], 3) + self.assertEqual( + [ShardRange.SHARDING, ShardRange.SHARDING, ShardRange.ACTIVE], + [ContainerBroker(db_file).get_own_shard_range().state + for db_file in found_for_shard['normal_dbs']]) + + # ...then run first cycle of first shard sharders in order, leader + # first, to get to predictable state where all nodes have cleaved 2 out + # of 3 ranges...starting with first two nodes + for node_number in shard_1_nodes[:2]: + self.sharders.once( + number=node_number, + additional_args='--partitions=%s' % shard_1_part) + + # ... first two replicas start sharding to sub-shards + found_for_shard = self.categorize_container_dir_content( + shard_1.account, shard_1.container) + self.assertLengthEqual(found_for_shard['shard_dbs'], 2) + for db_file in found_for_shard['shard_dbs'][:2]: + broker = ContainerBroker(db_file) + with annotate_failure('shard db file %s. ' % db_file): + self.assertIs(False, broker.is_root_container()) + self.assertEqual('sharding', broker.get_db_state()) + self.assertEqual( + ShardRange.SHARDING, broker.get_own_shard_range().state) + shard_shards = broker.get_shard_ranges() + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, + ShardRange.CREATED], + [sr.state for sr in shard_shards]) + self.assert_shard_ranges_contiguous( + 3, shard_shards, + first_lower=orig_root_shard_ranges[0]['lower'], + last_upper=orig_root_shard_ranges[0]['upper']) + + # but third replica still has no idea it should be sharding + self.assertLengthEqual(found_for_shard['normal_dbs'], 3) + self.assertEqual( + ShardRange.ACTIVE, + ContainerBroker( + found_for_shard['normal_dbs'][2]).get_own_shard_range().state) + + # ...but once sharder runs on third replica it will learn its state; + # note that any root replica on the stopped container server also won't + # know about the shards being in sharding state, so leave that server + # stopped for now so that shard fetches its state from an up-to-date + # root replica + self.sharders.once( + number=shard_1_nodes[2], + additional_args='--partitions=%s' % shard_1_part) + + # third replica is sharding but has no sub-shard ranges yet... 
+ found_for_shard = self.categorize_container_dir_content( + shard_1.account, shard_1.container) + self.assertLengthEqual(found_for_shard['shard_dbs'], 2) + self.assertLengthEqual(found_for_shard['normal_dbs'], 3) + broker = ContainerBroker(found_for_shard['normal_dbs'][2]) + self.assertEqual('unsharded', broker.get_db_state()) + self.assertEqual( + ShardRange.SHARDING, broker.get_own_shard_range().state) + self.assertFalse(broker.get_shard_ranges()) + + # ...until sub-shard ranges are replicated from another shard replica; + # there may also be a sub-shard replica missing so run replicators on + # all nodes to fix that if necessary + self.brain.servers.start(number=shard_1_nodes[2]) + self.replicators.once() + + # now run sharder again on third replica + self.sharders.once( + number=shard_1_nodes[2], + additional_args='--partitions=%s' % shard_1_part) + + # check original first shard range state and sub-shards - all replicas + # should now be in consistent state + found_for_shard = self.categorize_container_dir_content( + shard_1.account, shard_1.container) + self.assertLengthEqual(found_for_shard['shard_dbs'], 3) + self.assertLengthEqual(found_for_shard['normal_dbs'], 3) + for db_file in found_for_shard['shard_dbs']: + broker = ContainerBroker(db_file) + with annotate_failure('shard db file %s. ' % db_file): + self.assertIs(False, broker.is_root_container()) + self.assertEqual('sharding', broker.get_db_state()) + self.assertEqual( + ShardRange.SHARDING, broker.get_own_shard_range().state) + shard_shards = broker.get_shard_ranges() + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, + ShardRange.CREATED], + [sr.state for sr in shard_shards]) + self.assert_shard_ranges_contiguous( + 3, shard_shards, + first_lower=orig_root_shard_ranges[0]['lower'], + last_upper=orig_root_shard_ranges[0]['upper']) + + # check third sub-shard is in created state + sub_shard = shard_shards[2] + found_for_sub_shard = self.categorize_container_dir_content( + sub_shard.account, sub_shard.container) + self.assertFalse(found_for_sub_shard['shard_dbs']) + self.assertLengthEqual(found_for_sub_shard['normal_dbs'], 3) + for db_file in found_for_sub_shard['normal_dbs']: + broker = ContainerBroker(db_file) + with annotate_failure('sub shard db file %s. ' % db_file): + self.assertIs(False, broker.is_root_container()) + self.assertEqual('unsharded', broker.get_db_state()) + self.assertEqual( + ShardRange.CREATED, broker.get_own_shard_range().state) + self.assertFalse(broker.get_shard_ranges()) + + # check root shard ranges + root_shard_ranges = self.direct_get_container_shard_ranges() + for node, (hdrs, root_shards) in root_shard_ranges.items(): + self.assertLengthEqual(root_shards, 5) + with annotate_failure('node %s. 
' % node): + # shard ranges are sorted by upper, state, lower, so expect: + # sub-shards, orig shard 0, orig shard 1 + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, + ShardRange.CREATED, ShardRange.SHARDING, + ShardRange.ACTIVE], + [sr['state'] for sr in root_shards]) + # sub-shards 0, 1, 2, orig shard 1 should be contiguous + self.assert_shard_ranges_contiguous( + 4, root_shards[:3] + root_shards[4:]) + # orig shards 0, 1 should be contiguous + self.assert_shard_ranges_contiguous(2, root_shards[3:]) + + self.assert_container_listing(more_obj_names + obj_names) + self.assert_container_object_count(len(more_obj_names + obj_names)) + + # add another object that lands in the first of the new sub-shards + self.put_objects(['alpha']) + + # TODO: assert that alpha is in the first new shard + self.assert_container_listing(['alpha'] + more_obj_names + obj_names) + # Run sharders again so things settle. + self.run_sharders(shard_1) + + # check original first shard range shards + for db_file in found_for_shard['shard_dbs']: + broker = ContainerBroker(db_file) + with annotate_failure('shard db file %s. ' % db_file): + self.assertIs(False, broker.is_root_container()) + self.assertEqual('sharded', broker.get_db_state()) + self.assertEqual( + [ShardRange.ACTIVE] * 3, + [sr.state for sr in broker.get_shard_ranges()]) + # check root shard ranges + root_shard_ranges = self.direct_get_container_shard_ranges() + for node, (hdrs, root_shards) in root_shard_ranges.items(): + # old first shard range should have been deleted + self.assertLengthEqual(root_shards, 4) + with annotate_failure('node %s. ' % node): + self.assertEqual( + [ShardRange.ACTIVE] * 4, + [sr['state'] for sr in root_shards]) + self.assert_shard_ranges_contiguous(4, root_shards) + + headers, final_listing = self.assert_container_listing( + ['alpha'] + more_obj_names + obj_names) + + # check root + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 3) + self.assertLengthEqual(found['normal_dbs'], 0) + new_shard_ranges = None + for db_file in found['shard_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + self.assertEqual('sharded', broker.get_db_state()) + if new_shard_ranges is None: + new_shard_ranges = broker.get_shard_ranges( + include_deleted=True) + self.assertLengthEqual(new_shard_ranges, 5) + # Second half is still there, and unchanged + self.assertIn( + dict(orig_root_shard_ranges[1], meta_timestamp=None, + state_timestamp=None), + [dict(sr, meta_timestamp=None, state_timestamp=None) + for sr in new_shard_ranges]) + # But the first half split in three, then deleted + by_name = {sr.name: sr for sr in new_shard_ranges} + self.assertIn(orig_root_shard_ranges[0]['name'], by_name) + old_shard_range = by_name.pop( + orig_root_shard_ranges[0]['name']) + self.assertTrue(old_shard_range.deleted) + self.assert_shard_ranges_contiguous(4, by_name.values()) + else: + # Everyone's on the same page. 
Well, except for + # meta_timestamps, since the shards each reported + other_shard_ranges = broker.get_shard_ranges( + include_deleted=True) + self.assert_shard_range_lists_equal( + new_shard_ranges, other_shard_ranges, + excludes=['meta_timestamp', 'state_timestamp']) + for orig, updated in zip(orig_root_shard_ranges, + other_shard_ranges): + self.assertGreaterEqual(updated.meta_timestamp, + orig['meta_timestamp']) + + self.assert_container_delete_fails() + + for obj in final_listing: + client.delete_object( + self.url, self.token, self.container_name, obj['name']) + + # the objects won't be listed anymore + self.assert_container_listing([]) + # but root container stats will not yet be aware of the deletions + self.assert_container_delete_fails() + + # One server was down while the shard sharded its first two sub-shards, + # so there may be undeleted handoff db(s) for sub-shard(s) that were + # not fully replicated; run replicators now to clean up so they no + # longer report bogus stats to root. + self.replicators.once() + + # Run sharder so that shard containers update the root. Do not run + # sharder on root container because that triggers shrinks which can + # cause root object count to temporarily be non-zero and prevent the + # final delete. + self.run_sharders(self.get_container_shard_ranges()) + # then root is empty and can be deleted + self.assert_container_listing([]) + self.assert_container_object_count(0) + client.delete_container(self.url, self.token, self.container_name) + + def test_sharded_listing_no_replicators(self): + self._test_sharded_listing() + + def test_sharded_listing_with_replicators(self): + self._test_sharded_listing(run_replicators=True) + + def test_async_pendings(self): + obj_names = self._make_object_names(self.max_shard_size * 2) + + # There are some updates *everyone* gets + self.put_objects(obj_names[::5]) + # But roll some outages so each container only get ~2/5 more object + # records i.e. total of 3/5 updates per container; and async pendings + # pile up + for i, n in enumerate(self.brain.node_numbers, start=1): + self.brain.servers.stop(number=n) + self.put_objects(obj_names[i::5]) + self.brain.servers.start(number=n) + + # But there are also 1/5 updates *no one* gets + self.brain.servers.stop() + self.put_objects(obj_names[4::5]) + self.brain.servers.start() + + # Shard it + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + headers = client.head_container(self.url, self.admin_token, + self.container_name) + self.assertEqual('True', headers.get('x-container-sharding')) + + # sanity check + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 0) + self.assertLengthEqual(found['normal_dbs'], 3) + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + self.assertEqual(len(obj_names) * 3 // 5, + broker.get_info()['object_count']) + + # Only run the 'leader' in charge of scanning. + # Each container has ~2 * max * 3/5 objects + # which are distributed from obj000 to obj<2 * max - 1>, + # so expect 3 shard ranges to be found: the first two will be complete + # shards with max/2 objects and lower/upper bounds spaced by approx: + # (2 * max - 1)/(2 * max * 3/5) * (max/2) =~ 5/6 * max + # + # Note that during this shard cycle the leader replicates to other + # nodes so they will end up with ~2 * max * 4/5 objects. 
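# Working the estimate above through with an illustrative threshold (an
# editorial sketch; these numbers are not taken from any cluster config):
max_shard_size = 100                           # illustrative value only
rows_in_db = 2 * max_shard_size * 3 // 5       # ~3/5 of all updates -> 120
rows_per_shard = max_shard_size // 2           # nominal shard size -> 50
num_ranges = -(-rows_in_db // rows_per_shard)  # ceil(120 / 50) -> 3 ranges
bound_spacing = (2 * max_shard_size - 1) * rows_per_shard // rows_in_db
# bound_spacing -> 82, i.e. bounds roughly 5/6 * max apart, as estimated above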
+ self.sharders.once(number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + + # Verify that we have one shard db -- though the other normal DBs + # received the shard ranges that got defined + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 1) + node_index_zero_db = found['shard_dbs'][0] + broker = ContainerBroker(node_index_zero_db) + self.assertIs(True, broker.is_root_container()) + self.assertEqual(SHARDING, broker.get_db_state()) + expected_shard_ranges = broker.get_shard_ranges() + self.assertLengthEqual(expected_shard_ranges, 3) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED], + [sr.state for sr in expected_shard_ranges]) + + # Still have all three big DBs -- we've only cleaved 2 of the 3 shard + # ranges that got defined + self.assertLengthEqual(found['normal_dbs'], 3) + db_states = [] + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertIs(True, broker.is_root_container()) + db_states.append(broker.get_db_state()) + # the sharded db had shard range meta_timestamps updated during + # cleaving, so we do not expect those to be equal on other nodes + self.assert_shard_range_lists_equal( + expected_shard_ranges, broker.get_shard_ranges(), + excludes=['meta_timestamp', 'state_timestamp', 'state']) + self.assertEqual(len(obj_names) * 3 // 5, + broker.get_info()['object_count']) + self.assertEqual([SHARDING, UNSHARDED, UNSHARDED], sorted(db_states)) + + # Run the other sharders so we're all in (roughly) the same state + for n in self.brain.node_numbers[1:]: + self.sharders.once( + number=n, + additional_args='--partitions=%s' % self.brain.part) + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 3) + self.assertLengthEqual(found['normal_dbs'], 3) + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertEqual(SHARDING, broker.get_db_state()) + # no new rows + self.assertEqual(len(obj_names) * 3 // 5, + broker.get_info()['object_count']) + + # Run updaters to clear the async pendings + Manager(['object-updater']).once() + + # Our "big" dbs didn't take updates + for db_file in found['normal_dbs']: + broker = ContainerBroker(db_file) + self.assertEqual(len(obj_names) * 3 // 5, + broker.get_info()['object_count']) + + # TODO: confirm that the updates got redirected to the shards + + # The entire listing is not yet available - we have two cleaved shard + # ranges, complete with async updates, but for the remainder of the + # namespace only what landed in the original container + headers, listing = client.get_container(self.url, self.token, + self.container_name) + start_listing = [ + o for o in obj_names if o <= expected_shard_ranges[1].upper] + self.assertEqual( + [x['name'].encode('utf-8') for x in listing[:len(start_listing)]], + start_listing) + # we can't assert much about the remaining listing, other than that + # there should be something + self.assertTrue( + [x['name'].encode('utf-8') for x in listing[len(start_listing):]]) + # Object count is hard to reason about though! + # TODO: nail down what this *should* be and make sure all containers + # respond with it! Depending on what you're looking at, this + # could be 0, 1/2, 7/12 (!?), 3/5, 2/3, or 4/5 or all objects! + # Apparently, it may not even be present at all! 
+ # self.assertIn('x-container-object-count', headers) + # self.assertEqual(headers['x-container-object-count'], + # str(len(obj_names) - len(obj_names) // 6)) + + # TODO: Doesn't work in reverse, yet + # headers, listing = client.get_container(self.url, self.token, + # self.container_name, + # query_string='reverse=on') + # self.assertEqual([x['name'].encode('utf-8') for x in listing], + # obj_names[::-1]) + + # Run the sharders again to get everything to settle + self.sharders.once() + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 3) + self.assertLengthEqual(found['normal_dbs'], 0) + # now all shards have been cleaved we should get the complete listing + headers, listing = client.get_container(self.url, self.token, + self.container_name) + self.assertEqual([x['name'].encode('utf-8') for x in listing], + obj_names) + + def test_shrinking(self): + int_client = self.make_internal_client() + + def check_node_data(node_data, exp_hdrs, exp_obj_count, exp_shards): + hdrs, range_data = node_data + self.assert_dict_contains(exp_hdrs, hdrs) + self.assert_shard_ranges_contiguous(exp_shards, range_data) + self.assert_total_object_count(exp_obj_count, range_data) + + def check_shard_nodes_data(node_data, expected_state='unsharded', + expected_shards=0, exp_obj_count=0): + # checks that shard range is consistent on all nodes + root_path = '%s/%s' % (self.account, self.container_name) + exp_shard_hdrs = {'X-Container-Sysmeta-Shard-Root': root_path, + 'X-Backend-Sharding-State': expected_state} + object_counts = [] + bytes_used = [] + for node_id, node_data in node_data.items(): + with annotate_failure('Node id %s.' % node_id): + check_node_data( + node_data, exp_shard_hdrs, exp_obj_count, + expected_shards) + hdrs = node_data[0] + object_counts.append(int(hdrs['X-Container-Object-Count'])) + bytes_used.append(int(hdrs['X-Container-Bytes-Used'])) + if len(set(object_counts)) != 1: + self.fail('Inconsistent object counts: %s' % object_counts) + if len(set(bytes_used)) != 1: + self.fail('Inconsistent bytes used: %s' % bytes_used) + return object_counts[0], bytes_used[0] + + repeat = [0] + + def do_shard_then_shrink(): + repeat[0] += 1 + obj_names = ['obj-%s-%03d' % (repeat[0], x) + for x in range(self.max_shard_size)] + self.put_objects(obj_names) + # these two object names will fall at start of first shard range... 
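# Illustrative aside (not from the patch): they land there because shard
# range boundaries follow object name order, and on the first pass both names
# sort before every 'obj-1-NNN' name used to populate the container:
assert 'alpha-1' < 'beta-1' < 'obj-1-000'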
+ alpha = 'alpha-%s' % repeat[0] + beta = 'beta-%s' % repeat[0] + + # Enable sharding + client.post_container( + self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + + # sanity check + self.assert_container_listing(obj_names) + + # Only run the one in charge of scanning + self.sharders.once( + number=self.brain.node_numbers[0], + additional_args='--partitions=%s' % self.brain.part) + + # check root container + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + + # nodes on which sharder has not run are still in unsharded state + # but have had shard ranges replicated to them + exp_obj_count = len(obj_names) + exp_hdrs = {'X-Backend-Sharding-State': 'unsharded', + 'X-Container-Object-Count': str(exp_obj_count)} + node_id = self.brain.node_numbers[1] - 1 + check_node_data( + root_nodes_data[node_id], exp_hdrs, exp_obj_count, 2) + node_id = self.brain.node_numbers[2] - 1 + check_node_data( + root_nodes_data[node_id], exp_hdrs, exp_obj_count, 2) + + # only one that ran sharder is in sharded state + exp_hdrs['X-Backend-Sharding-State'] = 'sharded' + node_id = self.brain.node_numbers[0] - 1 + check_node_data( + root_nodes_data[node_id], exp_hdrs, exp_obj_count, 2) + + orig_range_data = root_nodes_data[node_id][1] + orig_shard_ranges = [ShardRange.from_dict(r) + for r in orig_range_data] + + # check first shard + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[0].account, orig_shard_ranges[0].container) + obj_count, bytes_used = check_shard_nodes_data(shard_nodes_data) + total_shard_object_count = obj_count + + # check second shard + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[1].account, orig_shard_ranges[1].container) + obj_count, bytes_used = check_shard_nodes_data(shard_nodes_data) + total_shard_object_count += obj_count + self.assertEqual(exp_obj_count, total_shard_object_count) + + # Now that everyone has shard ranges, run *everyone* + self.sharders.once( + additional_args='--partitions=%s' % self.brain.part) + + # all root container nodes should now be in sharded state + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + for node_id, node_data in root_nodes_data.items(): + with annotate_failure('Node id %s.' 
% node_id): + check_node_data(node_data, exp_hdrs, exp_obj_count, 2) + + # run updaters to update .sharded account; shard containers have + # not updated account since having objects replicated to them + self.updaters.once() + shard_cont_count, shard_obj_count = int_client.get_account_info( + orig_shard_ranges[0].account, [204]) + self.assertEqual(2 * repeat[0], shard_cont_count) + self.assertEqual(len(obj_names), shard_obj_count) + + # checking the listing also refreshes proxy container info cache so + # that the proxy becomes aware that container is sharded and will + # now look up the shard target for subsequent updates + self.assert_container_listing(obj_names) + + # delete objects from first shard range + first_shard_objects = [obj_name for obj_name in obj_names + if obj_name <= orig_shard_ranges[0].upper] + for obj in first_shard_objects: + client.delete_object( + self.url, self.token, self.container_name, obj) + with self.assertRaises(ClientException): + client.get_object( + self.url, self.token, self.container_name, obj) + + second_shard_objects = [obj_name for obj_name in obj_names + if obj_name > orig_shard_ranges[1].lower] + self.assert_container_listing(second_shard_objects) + + self.put_objects([alpha]) + second_shard_objects = [obj_name for obj_name in obj_names + if obj_name > orig_shard_ranges[1].lower] + self.assert_container_listing([alpha] + second_shard_objects) + + # while container servers are down, but proxy has container info in + # cache from recent listing, put another object; this update will + # lurk in async pending until the updaters run again + # TODO: because all the root container servers are down and + # therefore cannot respond to a GET for a redirect target, the + # object update will default to being targeted at the root + # container; can we provoke an object update that does get targeted + # to the shard, but fails to update shard, so that the async + # pending will first be directed to the shard when the updaters + # run? + self.stop_container_servers() + self.put_objects([beta]) + self.brain.servers.start() + async_pendings = self.gather_async_pendings( + self.get_all_object_nodes()) + num_container_replicas = len(self.brain.nodes) + num_obj_replicas = self.policy.object_ring.replica_count + expected_num_updates = num_container_updates( + num_container_replicas, quorum_size(num_container_replicas), + num_obj_replicas, self.policy.quorum) + expected_num_pendings = min(expected_num_updates, num_obj_replicas) + # sanity check + with annotate_failure('policy %s. ' % self.policy): + self.assertLengthEqual(async_pendings, expected_num_pendings) + + # root object count is not updated... + self.assert_container_object_count(len(obj_names)) + self.assert_container_listing([alpha] + second_shard_objects) + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + for node_id, node_data in root_nodes_data.items(): + with annotate_failure('Node id %s.' 
% node_id): + check_node_data(node_data, exp_hdrs, exp_obj_count, 2) + range_data = node_data[1] + self.assert_shard_range_lists_equal( + orig_range_data, range_data, + excludes=['meta_timestamp', 'state_timestamp']) + + # ...until the sharders run and update root + self.run_sharders(orig_shard_ranges[0]) + exp_obj_count = len(second_shard_objects) + 1 + self.assert_container_object_count(exp_obj_count) + self.assert_container_listing([alpha] + second_shard_objects) + + # root sharder finds donor, acceptor pair and pushes changes + self.sharders.once( + additional_args='--partitions=%s' % self.brain.part) + self.assert_container_listing([alpha] + second_shard_objects) + # run sharder on donor to shrink and replicate to acceptor + self.run_sharders(orig_shard_ranges[0]) + self.assert_container_listing([alpha] + second_shard_objects) + # run sharder on acceptor to update root with stats + self.run_sharders(orig_shard_ranges[1]) + self.assert_container_listing([alpha] + second_shard_objects) + self.assert_container_object_count(len(second_shard_objects) + 1) + + # check root container + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + exp_hdrs['X-Container-Object-Count'] = str(exp_obj_count) + for node_id, node_data in root_nodes_data.items(): + with annotate_failure('Node id %s.' % node_id): + # NB now only *one* shard range in root + check_node_data(node_data, exp_hdrs, exp_obj_count, 1) + + # the acceptor shard is intact.. + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[1].account, orig_shard_ranges[1].container) + obj_count, bytes_used = check_shard_nodes_data(shard_nodes_data) + # all objects should now be in this shard + self.assertEqual(exp_obj_count, obj_count) + + # the donor shard is also still intact + # TODO: once we have figured out when these redundant donors are + # deleted, test for deletion/clean up + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[0].account, orig_shard_ranges[0].container) + # the donor's shard range will have the acceptor's projected stats + obj_count, bytes_used = check_shard_nodes_data( + shard_nodes_data, expected_state='sharded', expected_shards=1, + exp_obj_count=len(second_shard_objects) + 1) + # but the donor is empty and so reports zero stats + self.assertEqual(0, obj_count) + self.assertEqual(0, bytes_used) + + # delete all the second shard's object apart from 'alpha' + for obj in second_shard_objects: + client.delete_object( + self.url, self.token, self.container_name, obj) + + self.assert_container_listing([alpha]) + + # runs sharders so second range shrinks away, requires up to 3 + # cycles + self.sharders.once() # shard updates root stats + self.assert_container_listing([alpha]) + self.sharders.once() # root finds shrinkable shard + self.assert_container_listing([alpha]) + self.sharders.once() # shards shrink themselves + self.assert_container_listing([alpha]) + + # the second shard range has sharded and is empty + shard_nodes_data = self.direct_get_container_shard_ranges( + orig_shard_ranges[1].account, orig_shard_ranges[1].container) + check_shard_nodes_data( + shard_nodes_data, expected_state='sharded', expected_shards=1, + exp_obj_count=1) + + # check root container + root_nodes_data = self.direct_get_container_shard_ranges() + self.assertEqual(3, len(root_nodes_data)) + exp_hdrs = {'X-Backend-Sharding-State': 'collapsed', + # just the alpha object + 'X-Container-Object-Count': '1'} + for node_id, node_data in 
root_nodes_data.items(): + with annotate_failure('Node id %s.' % node_id): + # NB now no shard ranges in root + check_node_data(node_data, exp_hdrs, 0, 0) + + # delete the alpha object + client.delete_object( + self.url, self.token, self.container_name, alpha) + # should now be able to delete the *apparently* empty container + client.delete_container(self.url, self.token, self.container_name) + self.assert_container_not_found() + self.direct_head_container(expect_failure=True) + + # and the container stays deleted even after sharders run and shard + # send updates + self.sharders.once() + self.assert_container_not_found() + self.direct_head_container(expect_failure=True) + + # now run updaters to deal with the async pending for the beta + # object + self.updaters.once() + # and the container is revived! + self.assert_container_listing([beta]) + + # finally, clear out the container + client.delete_object( + self.url, self.token, self.container_name, beta) + + do_shard_then_shrink() + # repeat from starting point of a collapsed and previously deleted + # container + do_shard_then_shrink() + + def _setup_replication_scenario(self, num_shards, extra_objs=('alpha',)): + # Get cluster to state where 2 replicas are sharding or sharded but 3rd + # replica is unsharded and has an object that the first 2 are missing. + + # put objects while all servers are up + obj_names = self._make_object_names( + num_shards * self.max_shard_size / 2) + self.put_objects(obj_names) + + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + node_numbers = self.brain.node_numbers + + # run replicators first time to get sync points set + self.replicators.once() + + # stop the leader node and one other server + self.stop_container_servers(slice(0, 2)) + + # ...then put one more object in first shard range namespace + self.put_objects(extra_objs) + + # start leader and first other server, stop third server + for number in node_numbers[:2]: + self.brain.servers.start(number=number) + self.brain.servers.stop(number=node_numbers[2]) + self.assert_container_listing(obj_names) # sanity check + + # shard the container - first two shard ranges are cleaved + for number in node_numbers[:2]: + self.sharders.once( + number=number, + additional_args='--partitions=%s' % self.brain.part) + + self.assert_container_listing(obj_names) # sanity check + return obj_names + + def test_replication_to_sharding_container(self): + # verify that replication from an unsharded replica to a sharding + # replica does not replicate rows but does replicate shard ranges + obj_names = self._setup_replication_scenario(3) + for node in self.brain.nodes[:2]: + self.assert_container_state(node, 'sharding', 3) + + # bring third server back up, run replicator + node_numbers = self.brain.node_numbers + self.brain.servers.start(number=node_numbers[2]) + # sanity check... 
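# Illustrative sketch (not from the patch) of the property the replicator
# pass below should demonstrate: sharding replicas merge shard ranges but not
# object rows, so the extra 'alpha' object stays only on the unsharded third
# node while that node picks up its peers' shard ranges, i.e. for each of the
# first two nodes roughly:
#     broker = self.get_broker(self.brain.part, node)
#     assert broker.get_shard_ranges() and not broker.get_objects()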
+ self.assert_container_state(self.brain.nodes[2], 'unsharded', 0) + self.replicators.once(number=node_numbers[2]) + # check db files unchanged + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 2) + self.assertLengthEqual(found['normal_dbs'], 3) + + # the 'alpha' object is NOT replicated to the two sharded nodes + for node in self.brain.nodes[:2]: + broker = self.get_broker(self.brain.part, node) + with annotate_failure( + 'Node id %s in %s' % (node['id'], self.brain.nodes[:2])): + self.assertFalse(broker.get_objects()) + self.assert_container_state(node, 'sharding', 3) + self.brain.servers.stop(number=node_numbers[2]) + self.assert_container_listing(obj_names) + + # all nodes now have shard ranges + self.brain.servers.start(number=node_numbers[2]) + node_data = self.direct_get_container_shard_ranges() + for node, (hdrs, shard_ranges) in node_data.items(): + with annotate_failure(node): + self.assert_shard_ranges_contiguous(3, shard_ranges) + + # complete cleaving third shard range on first two nodes + self.brain.servers.stop(number=node_numbers[2]) + for number in node_numbers[:2]: + self.sharders.once( + number=number, + additional_args='--partitions=%s' % self.brain.part) + # ...and now they are in sharded state + self.assert_container_state(self.brain.nodes[0], 'sharded', 3) + self.assert_container_state(self.brain.nodes[1], 'sharded', 3) + # ...still no 'alpha' object in listing + self.assert_container_listing(obj_names) + + # run the sharder on the third server, alpha object is included in + # shards that it cleaves + self.brain.servers.start(number=node_numbers[2]) + self.assert_container_state(self.brain.nodes[2], 'unsharded', 3) + self.sharders.once(number=node_numbers[2], + additional_args='--partitions=%s' % self.brain.part) + self.assert_container_state(self.brain.nodes[2], 'sharding', 3) + self.sharders.once(number=node_numbers[2], + additional_args='--partitions=%s' % self.brain.part) + self.assert_container_state(self.brain.nodes[2], 'sharded', 3) + self.assert_container_listing(['alpha'] + obj_names) + + def test_replication_to_sharded_container(self): + # verify that replication from an unsharded replica to a sharded + # replica does not replicate rows but does replicate shard ranges + obj_names = self._setup_replication_scenario(2) + for node in self.brain.nodes[:2]: + self.assert_container_state(node, 'sharded', 2) + + # sanity check + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 2) + self.assertLengthEqual(found['normal_dbs'], 1) + for node in self.brain.nodes[:2]: + broker = self.get_broker(self.brain.part, node) + info = broker.get_info() + with annotate_failure( + 'Node id %s in %s' % (node['id'], self.brain.nodes[:2])): + self.assertEqual(len(obj_names), info['object_count']) + self.assertFalse(broker.get_objects()) + + # bring third server back up, run replicator + node_numbers = self.brain.node_numbers + self.brain.servers.start(number=node_numbers[2]) + # sanity check... 
+ self.assert_container_state(self.brain.nodes[2], 'unsharded', 0) + self.replicators.once(number=node_numbers[2]) + # check db files unchanged + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['shard_dbs'], 2) + self.assertLengthEqual(found['normal_dbs'], 1) + + # the 'alpha' object is NOT replicated to the two sharded nodes + for node in self.brain.nodes[:2]: + broker = self.get_broker(self.brain.part, node) + with annotate_failure( + 'Node id %s in %s' % (node['id'], self.brain.nodes[:2])): + self.assertFalse(broker.get_objects()) + self.assert_container_state(node, 'sharded', 2) + self.brain.servers.stop(number=node_numbers[2]) + self.assert_container_listing(obj_names) + + # all nodes now have shard ranges + self.brain.servers.start(number=node_numbers[2]) + node_data = self.direct_get_container_shard_ranges() + for node, (hdrs, shard_ranges) in node_data.items(): + with annotate_failure(node): + self.assert_shard_ranges_contiguous(2, shard_ranges) + + # run the sharder on the third server, alpha object is included in + # shards that it cleaves + self.assert_container_state(self.brain.nodes[2], 'unsharded', 2) + self.sharders.once(number=node_numbers[2], + additional_args='--partitions=%s' % self.brain.part) + self.assert_container_state(self.brain.nodes[2], 'sharded', 2) + self.assert_container_listing(['alpha'] + obj_names) + + def test_sharding_requires_sufficient_replication(self): + # verify that cleaving only progresses if each cleaved shard range is + # sufficiently replicated + + # put enough objects for 4 shard ranges + obj_names = self._make_object_names(2 * self.max_shard_size) + self.put_objects(obj_names) + + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + node_numbers = self.brain.node_numbers + leader_node = self.brain.nodes[0] + leader_num = node_numbers[0] + + # run replicators first time to get sync points set + self.replicators.once() + + # start sharding on the leader node + self.sharders.once(number=leader_num, + additional_args='--partitions=%s' % self.brain.part) + shard_ranges = self.assert_container_state(leader_node, 'sharding', 4) + self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 2, + [sr.state for sr in shard_ranges]) + + # stop *all* container servers for third shard range + sr_part, sr_node_nums = self.get_part_and_node_numbers(shard_ranges[2]) + for node_num in sr_node_nums: + self.brain.servers.stop(number=node_num) + + # attempt to continue sharding on the leader node + self.sharders.once(number=leader_num, + additional_args='--partitions=%s' % self.brain.part) + + # no cleaving progress was made + for node_num in sr_node_nums: + self.brain.servers.start(number=node_num) + shard_ranges = self.assert_container_state(leader_node, 'sharding', 4) + self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 2, + [sr.state for sr in shard_ranges]) + + # stop two of the servers for third shard range, not including any + # server that happens to be the leader node + stopped = [] + for node_num in sr_node_nums: + if node_num != leader_num: + self.brain.servers.stop(number=node_num) + stopped.append(node_num) + if len(stopped) >= 2: + break + self.assertLengthEqual(stopped, 2) # sanity check + + # attempt to continue sharding on the leader node + self.sharders.once(number=leader_num, + additional_args='--partitions=%s' % self.brain.part) + + # no cleaving progress was made + for node_num in stopped: + 
self.brain.servers.start(number=node_num) + shard_ranges = self.assert_container_state(leader_node, 'sharding', 4) + self.assertEqual([ShardRange.CLEAVED] * 2 + [ShardRange.CREATED] * 2, + [sr.state for sr in shard_ranges]) + + # stop just one of the servers for third shard range + stopped = [] + for node_num in sr_node_nums: + if node_num != leader_num: + self.brain.servers.stop(number=node_num) + stopped.append(node_num) + break + self.assertLengthEqual(stopped, 1) # sanity check + + # attempt to continue sharding the container + self.sharders.once(number=leader_num, + additional_args='--partitions=%s' % self.brain.part) + + # this time cleaving completed + self.brain.servers.start(number=stopped[0]) + shard_ranges = self.assert_container_state(leader_node, 'sharded', 4) + self.assertEqual([ShardRange.ACTIVE] * 4, + [sr.state for sr in shard_ranges]) + + def test_sharded_delete(self): + all_obj_names = self._make_object_names(self.max_shard_size) + self.put_objects(all_obj_names) + # Shard the container + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + for n in self.brain.node_numbers: + self.sharders.once( + number=n, additional_args='--partitions=%s' % self.brain.part) + # sanity checks + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 2) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + self.assert_container_listing(all_obj_names) + + # delete all objects - updates redirected to shards + self.delete_objects(all_obj_names) + self.assert_container_listing([]) + self.assert_container_post_ok('has objects') + # root not yet updated with shard stats + self.assert_container_object_count(len(all_obj_names)) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + + # run sharder on shard containers to update root stats + shard_ranges = self.get_container_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + self.run_sharders(shard_ranges) + self.assert_container_listing([]) + self.assert_container_post_ok('empty') + self.assert_container_object_count(0) + + # put a new object - update redirected to shard + self.put_objects(['alpha']) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(0) + + # before root learns about new object in shard, delete the container + client.delete_container(self.url, self.token, self.container_name) + self.assert_container_post_fails('deleted') + self.assert_container_not_found() + + # run the sharders to update root with shard stats + self.run_sharders(shard_ranges) + + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + self.assert_container_delete_fails() + self.assert_container_post_ok('revived') + + def test_object_update_redirection(self): + all_obj_names = self._make_object_names(self.max_shard_size) + self.put_objects(all_obj_names) + # Shard the container + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + for n in self.brain.node_numbers: + self.sharders.once( + number=n, additional_args='--partitions=%s' % self.brain.part) + # sanity checks + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 2) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + self.assert_container_listing(all_obj_names) + + # delete all objects - updates 
redirected to shards + self.delete_objects(all_obj_names) + self.assert_container_listing([]) + self.assert_container_post_ok('has objects') + + # run sharder on shard containers to update root stats + shard_ranges = self.get_container_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + self.run_sharders(shard_ranges) + self.assert_container_object_count(0) + + # First, test a misplaced object moving from one shard to another. + # with one shard server down, put a new 'alpha' object... + shard_part, shard_nodes = self.get_part_and_node_numbers( + shard_ranges[0]) + self.brain.servers.stop(number=shard_nodes[2]) + self.put_objects(['alpha']) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(0) + self.assertLengthEqual( + self.gather_async_pendings(self.get_all_object_nodes()), 1) + self.brain.servers.start(number=shard_nodes[2]) + + # run sharder on root to discover first shrink candidate + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + # then run sharder on the shard node without the alpha object + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=shard_nodes[2]) + # root sees first shard has shrunk, only second shard range used for + # listing so alpha object not in listing + self.assertLengthEqual(self.get_container_shard_ranges(), 1) + self.assert_container_listing([]) + self.assert_container_object_count(0) + + # run the updaters: the async pending update will be redirected from + # shrunk shard to second shard + self.updaters.once() + self.assert_container_listing(['alpha']) + self.assert_container_object_count(0) # root not yet updated + + # then run sharder on other shard nodes to complete shrinking + for number in shard_nodes[:2]: + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=number) + # and get root updated + self.run_sharders(shard_ranges[1]) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + self.assertLengthEqual(self.get_container_shard_ranges(), 1) + + # Now we have just one active shard, test a misplaced object moving + # from that shard to the root. + # with one shard server down, delete 'alpha' and put a 'beta' object... 
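# Illustrative elaboration (not from the patch): with one of the shard's
# container replicas stopped, neither the DELETE of 'alpha' nor the PUT of
# 'beta' can update every replica, so each operation leaves one async pending
# update parked on the object servers -- the count of 2 asserted below. Those
# pendings are what the updaters later replay, by which time the shard has
# shrunk into the root, so the replayed updates are redirected there.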
+ shard_part, shard_nodes = self.get_part_and_node_numbers( + shard_ranges[1]) + self.brain.servers.stop(number=shard_nodes[2]) + self.delete_objects(['alpha']) + self.put_objects(['beta']) + self.assert_container_listing(['beta']) + self.assert_container_object_count(1) + self.assertLengthEqual( + self.gather_async_pendings(self.get_all_object_nodes()), 2) + self.brain.servers.start(number=shard_nodes[2]) + + # run sharder on root to discover second shrink candidate - root is not + # yet aware of the beta object + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + # then run sharder on the shard node without the beta object, to shrink + # it to root - note this moves stale copy of alpha to the root db + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=shard_nodes[2]) + # now there are no active shards + self.assertFalse(self.get_container_shard_ranges()) + + # with other two shard servers down, listing won't find beta object + for number in shard_nodes[:2]: + self.brain.servers.stop(number=number) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + + # run the updaters: the async pending update will be redirected from + # shrunk shard to the root + self.updaters.once() + self.assert_container_listing(['beta']) + self.assert_container_object_count(1) + + def test_misplaced_object_movement(self): + def merge_object(shard_range, name, deleted=0): + # it's hard to get a test to put a misplaced object into a shard, + # so this hack is used force an object record directly into a shard + # container db. Note: the actual object won't exist, we're just + # using this to test object records in container dbs. + shard_part, shard_nodes = self.brain.ring.get_nodes( + shard_range.account, shard_range.container) + shard_broker = self.get_broker( + shard_part, shard_nodes[0], shard_range.account, + shard_range.container) + shard_broker.merge_items( + [{'name': name, 'created_at': Timestamp.now().internal, + 'size': 0, 'content_type': 'text/plain', + 'etag': hashlib.md5().hexdigest(), 'deleted': deleted}]) + return shard_nodes[0] + + all_obj_names = self._make_object_names(self.max_shard_size) + self.put_objects(all_obj_names) + # Shard the container + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + for n in self.brain.node_numbers: + self.sharders.once( + number=n, additional_args='--partitions=%s' % self.brain.part) + # sanity checks + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 2) + self.assert_container_delete_fails() + self.assert_container_has_shard_sysmeta() + self.assert_container_post_ok('sharded') + self.assert_container_listing(all_obj_names) + + # delete all objects - updates redirected to shards + self.delete_objects(all_obj_names) + self.assert_container_listing([]) + self.assert_container_post_ok('has objects') + + # run sharder on shard containers to update root stats + shard_ranges = self.get_container_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + self.run_sharders(shard_ranges) + self.assert_container_object_count(0) + + # First, test a misplaced object moving from one shard to another. 
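# Illustrative sketch (not from the patch): merge_object(), defined above,
# fakes the misplacement by merging a bare object record straight into one
# replica of the shard's DB; the same record shape with deleted=1 acts as a
# tombstone, which is how a stale 'alpha' row gets injected further down:
#     {'name': 'alpha', 'created_at': Timestamp.now().internal, 'size': 0,
#      'content_type': 'text/plain', 'etag': hashlib.md5().hexdigest(),
#      'deleted': 1}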
+ # run sharder on root to discover first shrink candidate + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + # then run sharder on first shard range to shrink it + shard_part, shard_nodes_numbers = self.get_part_and_node_numbers( + shard_ranges[0]) + self.sharders.once(additional_args='--partitions=%s' % shard_part) + # force a misplaced object into the shrunken shard range to simulate + # a client put that was in flight when it started to shrink + misplaced_node = merge_object(shard_ranges[0], 'alpha', deleted=0) + # root sees first shard has shrunk, only second shard range used for + # listing so alpha object not in listing + self.assertLengthEqual(self.get_container_shard_ranges(), 1) + self.assert_container_listing([]) + self.assert_container_object_count(0) + # until sharder runs on that node to move the misplaced object to the + # second shard range + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=misplaced_node['id'] + 1) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(0) # root not yet updated + + # run sharder to get root updated + self.run_sharders(shard_ranges[1]) + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + self.assertLengthEqual(self.get_container_shard_ranges(), 1) + + # Now we have just one active shard, test a misplaced object moving + # from that shard to the root. + # run sharder on root to discover second shrink candidate + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + # then run sharder on the shard node to shrink it to root - note this + # moves alpha to the root db + shard_part, shard_nodes_numbers = self.get_part_and_node_numbers( + shard_ranges[1]) + self.sharders.once(additional_args='--partitions=%s' % shard_part) + # now there are no active shards + self.assertFalse(self.get_container_shard_ranges()) + + # force some misplaced object updates into second shrunk shard range + merge_object(shard_ranges[1], 'alpha', deleted=1) + misplaced_node = merge_object(shard_ranges[1], 'beta', deleted=0) + # root is not yet aware of them + self.assert_container_listing(['alpha']) + self.assert_container_object_count(1) + # until sharder runs on that node to move the misplaced object + self.sharders.once(additional_args='--partitions=%s' % shard_part, + number=misplaced_node['id'] + 1) + self.assert_container_listing(['beta']) + self.assert_container_object_count(1) + self.assert_container_delete_fails() + + def test_replication_to_sharded_container_from_unsharded_old_primary(self): + primary_ids = [n['id'] for n in self.brain.nodes] + handoff_node = next(n for n in self.brain.ring.devs + if n['id'] not in primary_ids) + + # start with two sharded replicas and one unsharded with extra object + obj_names = self._setup_replication_scenario(2) + for node in self.brain.nodes[:2]: + self.assert_container_state(node, 'sharded', 2) + + # Fake a ring change - copy unsharded db which has no shard ranges to a + # handoff to create illusion of a new unpopulated primary node + node_numbers = self.brain.node_numbers + new_primary_node = self.brain.nodes[2] + new_primary_node_number = node_numbers[2] + new_primary_dir, container_hash = self.get_storage_dir( + self.brain.part, new_primary_node) + old_primary_dir, container_hash = self.get_storage_dir( + self.brain.part, handoff_node) + utils.mkdirs(os.path.dirname(old_primary_dir)) + os.rename(new_primary_dir, old_primary_dir) + + # make the cluster more or less "healthy" again + 
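# Illustrative note (not from the patch): at this point the fake ring change
# above has left the only populated container DB under the handoff node's
# storage directory; the "new primary" path stays empty until its server is
# started and the client PUT below recreates a fresh, row-less DB there.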
self.brain.servers.start(number=new_primary_node_number) + + # get a db on every node... + client.put_container(self.url, self.token, self.container_name) + self.assertTrue(os.path.exists(os.path.join( + new_primary_dir, container_hash + '.db'))) + found = self.categorize_container_dir_content() + self.assertLengthEqual(found['normal_dbs'], 1) # "new" primary + self.assertLengthEqual(found['shard_dbs'], 2) # existing primaries + + # catastrophic failure! drive dies and is replaced on unchanged primary + failed_node = self.brain.nodes[0] + failed_dir, _container_hash = self.get_storage_dir( + self.brain.part, failed_node) + shutil.rmtree(failed_dir) + + # replicate the "old primary" to everybody except the "new primary" + self.brain.servers.stop(number=new_primary_node_number) + self.replicators.once(number=handoff_node['id'] + 1) + + # We're willing to rsync the retiring db to the failed primary. + # This may or may not have shard ranges, depending on the order in + # which we hit the primaries, but it definitely *doesn't* have an + # epoch in its name yet. All objects are replicated. + self.assertTrue(os.path.exists(os.path.join( + failed_dir, container_hash + '.db'))) + self.assertLengthEqual(os.listdir(failed_dir), 1) + broker = self.get_broker(self.brain.part, failed_node) + self.assertLengthEqual(broker.get_objects(), len(obj_names) + 1) + + # The other out-of-date primary is within usync range but objects are + # not replicated to it because the handoff db learns about shard ranges + broker = self.get_broker(self.brain.part, self.brain.nodes[1]) + self.assertLengthEqual(broker.get_objects(), 0) + + # Handoff db still exists and now has shard ranges! + self.assertTrue(os.path.exists(os.path.join( + old_primary_dir, container_hash + '.db'))) + broker = self.get_broker(self.brain.part, handoff_node) + shard_ranges = broker.get_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + self.assert_container_state(handoff_node, 'unsharded', 2) + + # Replicate again, this time *including* "new primary" + self.brain.servers.start(number=new_primary_node_number) + self.replicators.once(number=handoff_node['id'] + 1) + + # Ordinarily, we would have rsync_then_merge'd to "new primary" + # but instead we wait + broker = self.get_broker(self.brain.part, new_primary_node) + self.assertLengthEqual(broker.get_objects(), 0) + shard_ranges = broker.get_shard_ranges() + self.assertLengthEqual(shard_ranges, 2) + + # so the next time the sharder comes along, it can push rows out + # and delete the big db + self.sharders.once(number=handoff_node['id'] + 1, + additional_args='--partitions=%s' % self.brain.part) + self.assert_container_state(handoff_node, 'sharded', 2) + self.assertFalse(os.path.exists(os.path.join( + old_primary_dir, container_hash + '.db'))) + # the sharded db hangs around until replication confirms durability + # first attempt is not sufficiently successful + self.brain.servers.stop(number=node_numbers[0]) + self.replicators.once(number=handoff_node['id'] + 1) + self.assertTrue(os.path.exists(old_primary_dir)) + self.assert_container_state(handoff_node, 'sharded', 2) + # second attempt is successful and handoff db is deleted + self.brain.servers.start(number=node_numbers[0]) + self.replicators.once(number=handoff_node['id'] + 1) + self.assertFalse(os.path.exists(old_primary_dir)) + + # run all the sharders, get us into a consistent state + self.sharders.once(additional_args='--partitions=%s' % self.brain.part) + self.assert_container_listing(['alpha'] + obj_names) + + def 
test_replication_to_empty_new_primary_from_sharding_old_primary(self): + primary_ids = [n['id'] for n in self.brain.nodes] + handoff_node = next(n for n in self.brain.ring.devs + if n['id'] not in primary_ids) + num_shards = 3 + obj_names = self._make_object_names( + num_shards * self.max_shard_size / 2) + self.put_objects(obj_names) + client.post_container(self.url, self.admin_token, self.container_name, + headers={'X-Container-Sharding': 'on'}) + + # run replicators first time to get sync points set + self.replicators.once() + # start sharding on only the leader node + leader_node = self.brain.nodes[0] + leader_node_number = self.brain.node_numbers[0] + self.sharders.once(number=leader_node_number) + self.assert_container_state(leader_node, 'sharding', 3) + for node in self.brain.nodes[1:]: + self.assert_container_state(node, 'unsharded', 3) + + # Fake a ring change - copy leader node db to a handoff to create + # illusion of a new unpopulated primary leader node + new_primary_dir, container_hash = self.get_storage_dir( + self.brain.part, leader_node) + old_primary_dir, container_hash = self.get_storage_dir( + self.brain.part, handoff_node) + utils.mkdirs(os.path.dirname(old_primary_dir)) + os.rename(new_primary_dir, old_primary_dir) + self.assert_container_state(handoff_node, 'sharding', 3) + + # run replicator on handoff node to create a fresh db on new primary + self.assertFalse(os.path.exists(new_primary_dir)) + self.replicators.once(number=handoff_node['id'] + 1) + self.assertTrue(os.path.exists(new_primary_dir)) + self.assert_container_state(leader_node, 'sharded', 3) + broker = self.get_broker(self.brain.part, leader_node) + shard_ranges = broker.get_shard_ranges() + self.assertLengthEqual(shard_ranges, 3) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED], + [sr.state for sr in shard_ranges]) + + # db still exists on handoff + self.assertTrue(os.path.exists(old_primary_dir)) + self.assert_container_state(handoff_node, 'sharding', 3) + # continue sharding it... 
+ self.sharders.once(number=handoff_node['id'] + 1) + self.assert_container_state(leader_node, 'sharded', 3) + # now handoff is fully sharded the replicator will delete it + self.replicators.once(number=handoff_node['id'] + 1) + self.assertFalse(os.path.exists(old_primary_dir)) + + # all primaries now have active shard ranges but only one is in sharded + # state + self.assert_container_state(leader_node, 'sharded', 3) + for node in self.brain.nodes[1:]: + self.assert_container_state(node, 'unsharded', 3) + node_data = self.direct_get_container_shard_ranges() + for node_id, (hdrs, shard_ranges) in node_data.items(): + with annotate_failure( + 'node id %s from %s' % (node_id, node_data.keys)): + self.assert_shard_range_state(ShardRange.ACTIVE, shard_ranges) + + # check handoff cleaved all objects before it was deleted - stop all + # but leader node so that listing is fetched from shards + for number in self.brain.node_numbers[1:3]: + self.brain.servers.stop(number=number) + + self.assert_container_listing(obj_names) + + for number in self.brain.node_numbers[1:3]: + self.brain.servers.start(number=number) + + self.sharders.once() + self.assert_container_state(leader_node, 'sharded', 3) + for node in self.brain.nodes[1:]: + self.assert_container_state(node, 'sharding', 3) + self.sharders.once() + for node in self.brain.nodes: + self.assert_container_state(node, 'sharded', 3) + + self.assert_container_listing(obj_names) diff --git a/test/unit/__init__.py b/test/unit/__init__.py index 2e611806a4..278c55a4ca 100644 --- a/test/unit/__init__.py +++ b/test/unit/__init__.py @@ -1343,3 +1343,46 @@ def unlink_files(paths): except OSError as err: if err.errno != errno.ENOENT: raise + + +class FakeHTTPResponse(object): + + def __init__(self, resp): + self.resp = resp + + @property + def status(self): + return self.resp.status_int + + @property + def data(self): + return self.resp.body + + +def attach_fake_replication_rpc(rpc, replicate_hook=None, errors=None): + class FakeReplConnection(object): + + def __init__(self, node, partition, hash_, logger): + self.logger = logger + self.node = node + self.partition = partition + self.path = '/%s/%s/%s' % (node['device'], partition, hash_) + self.host = node['replication_ip'] + + def replicate(self, op, *sync_args): + print('REPLICATE: %s, %s, %r' % (self.path, op, sync_args)) + resp = None + if errors and op in errors and errors[op]: + resp = errors[op].pop(0) + if not resp: + replicate_args = self.path.lstrip('/').split('/') + args = [op] + copy.deepcopy(list(sync_args)) + with mock_check_drive(isdir=not rpc.mount_check, + ismount=rpc.mount_check): + swob_response = rpc.dispatch(replicate_args, args) + resp = FakeHTTPResponse(swob_response) + if replicate_hook: + replicate_hook(op, *sync_args) + return resp + + return FakeReplConnection diff --git a/test/unit/cli/test_manage_shard_ranges.py b/test/unit/cli/test_manage_shard_ranges.py new file mode 100644 index 0000000000..8cefa5b19c --- /dev/null +++ b/test/unit/cli/test_manage_shard_ranges.py @@ -0,0 +1,362 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy +# of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations +# under the License. + +from __future__ import unicode_literals + +import json +import os +import unittest +import mock +from shutil import rmtree +from tempfile import mkdtemp + +from six.moves import cStringIO as StringIO + +from swift.cli.manage_shard_ranges import main +from swift.common import utils +from swift.common.utils import Timestamp, ShardRange +from swift.container.backend import ContainerBroker +from test.unit import mock_timestamp_now + + +class TestManageShardRanges(unittest.TestCase): + def setUp(self): + self.testdir = os.path.join(mkdtemp(), 'tmp_test_cli_find_shards') + utils.mkdirs(self.testdir) + rmtree(self.testdir) + self.shard_data = [ + {'index': 0, 'lower': '', 'upper': 'obj09', 'object_count': 10}, + {'index': 1, 'lower': 'obj09', 'upper': 'obj19', + 'object_count': 10}, + {'index': 2, 'lower': 'obj19', 'upper': 'obj29', + 'object_count': 10}, + {'index': 3, 'lower': 'obj29', 'upper': 'obj39', + 'object_count': 10}, + {'index': 4, 'lower': 'obj39', 'upper': 'obj49', + 'object_count': 10}, + {'index': 5, 'lower': 'obj49', 'upper': 'obj59', + 'object_count': 10}, + {'index': 6, 'lower': 'obj59', 'upper': 'obj69', + 'object_count': 10}, + {'index': 7, 'lower': 'obj69', 'upper': 'obj79', + 'object_count': 10}, + {'index': 8, 'lower': 'obj79', 'upper': 'obj89', + 'object_count': 10}, + {'index': 9, 'lower': 'obj89', 'upper': '', 'object_count': 10}, + ] + + def tearDown(self): + rmtree(os.path.dirname(self.testdir)) + + def assert_starts_with(self, value, prefix): + self.assertTrue(value.startswith(prefix), + "%r does not start with %r" % (value, prefix)) + + def assert_formatted_json(self, output, expected): + try: + loaded = json.loads(output) + except ValueError as err: + self.fail('Invalid JSON: %s\n%r' % (err, output)) + # Check this one first, for a prettier diff + self.assertEqual(loaded, expected) + formatted = json.dumps(expected, sort_keys=True, indent=2) + '\n' + self.assertEqual(output, formatted) + + def _make_broker(self, account='a', container='c', + device='sda', part=0): + datadir = os.path.join( + self.testdir, device, 'containers', str(part), 'ash', 'hash') + db_file = os.path.join(datadir, 'hash.db') + broker = ContainerBroker( + db_file, account=account, container=container) + broker.initialize() + return broker + + def test_find_shard_ranges(self): + db_file = os.path.join(self.testdir, 'hash.db') + broker = ContainerBroker(db_file) + broker.account = 'a' + broker.container = 'c' + broker.initialize() + ts = utils.Timestamp.now() + broker.merge_items([ + {'name': 'obj%02d' % i, 'created_at': ts.internal, 'size': 0, + 'content_type': 'application/octet-stream', 'etag': 'not-really', + 'deleted': 0, 'storage_policy_index': 0, + 'ctype_timestamp': ts.internal, 'meta_timestamp': ts.internal} + for i in range(100)]) + + # Default uses a large enough value that sharding isn't required + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([db_file, 'find']) + self.assert_formatted_json(out.getvalue(), []) + err_lines = err.getvalue().split('\n') + self.assert_starts_with(err_lines[0], 'Loaded db broker for ') + self.assert_starts_with(err_lines[1], 'Found 0 ranges in ') + + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([db_file, 'find', '100']) + self.assert_formatted_json(out.getvalue(), []) + err_lines = err.getvalue().split('\n') + 
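(Aside, not part of the patch: the 'find' sub-command exercised in this test is driven through the CLI entry point main(), so the same behaviour can be reproduced outside the test harness. A minimal sketch, assuming db_path points at a container DB with roughly 100 object rows as set up above; the optional second argument is the row count per shard range.)
# sketch only, not part of the patch
from swift.cli.manage_shard_ranges import main
# prints the candidate shard ranges as formatted JSON on stdout, and progress
# messages ('Loaded db broker for ...', 'Found N ranges in ...') on stderr
main([db_path, 'find', '10'])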
self.assert_starts_with(err_lines[0], 'Loaded db broker for ') + self.assert_starts_with(err_lines[1], 'Found 0 ranges in ') + + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([db_file, 'find', '99']) + self.assert_formatted_json(out.getvalue(), [ + {'index': 0, 'lower': '', 'upper': 'obj98', 'object_count': 99}, + {'index': 1, 'lower': 'obj98', 'upper': '', 'object_count': 1}, + ]) + err_lines = err.getvalue().split('\n') + self.assert_starts_with(err_lines[0], 'Loaded db broker for ') + self.assert_starts_with(err_lines[1], 'Found 2 ranges in ') + + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([db_file, 'find', '10']) + self.assert_formatted_json(out.getvalue(), [ + {'index': 0, 'lower': '', 'upper': 'obj09', 'object_count': 10}, + {'index': 1, 'lower': 'obj09', 'upper': 'obj19', + 'object_count': 10}, + {'index': 2, 'lower': 'obj19', 'upper': 'obj29', + 'object_count': 10}, + {'index': 3, 'lower': 'obj29', 'upper': 'obj39', + 'object_count': 10}, + {'index': 4, 'lower': 'obj39', 'upper': 'obj49', + 'object_count': 10}, + {'index': 5, 'lower': 'obj49', 'upper': 'obj59', + 'object_count': 10}, + {'index': 6, 'lower': 'obj59', 'upper': 'obj69', + 'object_count': 10}, + {'index': 7, 'lower': 'obj69', 'upper': 'obj79', + 'object_count': 10}, + {'index': 8, 'lower': 'obj79', 'upper': 'obj89', + 'object_count': 10}, + {'index': 9, 'lower': 'obj89', 'upper': '', 'object_count': 10}, + ]) + err_lines = err.getvalue().split('\n') + self.assert_starts_with(err_lines[0], 'Loaded db broker for ') + self.assert_starts_with(err_lines[1], 'Found 10 ranges in ') + + def test_info(self): + broker = self._make_broker() + broker.update_metadata({'X-Container-Sysmeta-Sharding': + (True, Timestamp.now().internal)}) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([broker.db_file, 'info']) + expected = ['Sharding enabled = True', + 'Own shard range: None', + 'db_state = unsharded', + 'Metadata:', + ' X-Container-Sysmeta-Sharding = True'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + + retiring_db_id = broker.get_info()['id'] + broker.merge_shard_ranges(ShardRange('.shards/cc', Timestamp.now())) + epoch = Timestamp.now() + with mock_timestamp_now(epoch) as now: + broker.enable_sharding(epoch) + self.assertTrue(broker.set_sharding_state()) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + with mock_timestamp_now(now): + main([broker.db_file, 'info']) + expected = ['Sharding enabled = True', + 'Own shard range: {', + ' "bytes_used": 0, ', + ' "deleted": 0, ', + ' "epoch": "%s", ' % epoch.internal, + ' "lower": "", ', + ' "meta_timestamp": "%s", ' % now.internal, + ' "name": "a/c", ', + ' "object_count": 0, ', + ' "state": "sharding", ', + ' "state_timestamp": "%s", ' % now.internal, + ' "timestamp": "%s", ' % now.internal, + ' "upper": ""', + '}', + 'db_state = sharding', + 'Retiring db id: %s' % retiring_db_id, + 'Cleaving context: {', + ' "cleave_to_row": null, ', + ' "cleaving_done": false, ', + ' "cursor": "", ', + ' "last_cleave_to_row": null, ', + ' "max_row": -1, ', + ' "misplaced_done": false, ', + ' "ranges_done": 0, ', + ' "ranges_todo": 0, ', + ' "ref": "%s"' % retiring_db_id, + '}', + 'Metadata:', + ' X-Container-Sysmeta-Sharding = True'] + 
self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + + self.assertTrue(broker.set_sharded_state()) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + with mock_timestamp_now(now): + main([broker.db_file, 'info']) + expected = ['Sharding enabled = True', + 'Own shard range: {', + ' "bytes_used": 0, ', + ' "deleted": 0, ', + ' "epoch": "%s", ' % epoch.internal, + ' "lower": "", ', + ' "meta_timestamp": "%s", ' % now.internal, + ' "name": "a/c", ', + ' "object_count": 0, ', + ' "state": "sharding", ', + ' "state_timestamp": "%s", ' % now.internal, + ' "timestamp": "%s", ' % now.internal, + ' "upper": ""', + '}', + 'db_state = sharded', + 'Metadata:', + ' X-Container-Sysmeta-Sharding = True'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + + def test_replace(self): + broker = self._make_broker() + broker.update_metadata({'X-Container-Sysmeta-Sharding': + (True, Timestamp.now().internal)}) + input_file = os.path.join(self.testdir, 'shards') + with open(input_file, 'wb') as fd: + json.dump(self.shard_data, fd) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([broker.db_file, 'replace', input_file]) + expected = [ + 'No shard ranges found to delete.', + 'Injected 10 shard ranges.', + 'Run container-replicator to replicate them to other nodes.', + 'Use the enable sub-command to enable sharding.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + self.assertEqual( + [(data['lower'], data['upper']) for data in self.shard_data], + [(sr.lower_str, sr.upper_str) for sr in broker.get_shard_ranges()]) + + def _assert_enabled(self, broker, epoch): + own_sr = broker.get_own_shard_range() + self.assertEqual(ShardRange.SHARDING, own_sr.state) + self.assertEqual(epoch, own_sr.epoch) + self.assertEqual(ShardRange.MIN, own_sr.lower) + self.assertEqual(ShardRange.MAX, own_sr.upper) + self.assertEqual( + 'True', broker.metadata['X-Container-Sysmeta-Sharding'][0]) + + def test_enable(self): + broker = self._make_broker() + broker.update_metadata({'X-Container-Sysmeta-Sharding': + (True, Timestamp.now().internal)}) + # no shard ranges + out = StringIO() + err = StringIO() + with self.assertRaises(SystemExit): + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([broker.db_file, 'enable']) + expected = ["WARNING: invalid shard ranges: ['No shard ranges.'].", + 'Aborting.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + + # success + shard_ranges = [] + for data in self.shard_data: + path = ShardRange.make_path( + '.shards_a', 'c', 'c', Timestamp.now(), data['index']) + shard_ranges.append( + ShardRange(path, Timestamp.now(), data['lower'], + data['upper'], data['object_count'])) + broker.merge_shard_ranges(shard_ranges) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + with mock_timestamp_now() as now: + main([broker.db_file, 'enable']) + expected = [ + "Container moved to state 'sharding' with epoch %s." 
% + now.internal, + 'Run container-sharder on all nodes to shard the container.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + self._assert_enabled(broker, now) + + # already enabled + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + main([broker.db_file, 'enable']) + expected = [ + "Container already in state 'sharding' with epoch %s." % + now.internal, + 'No action required.', + 'Run container-sharder on all nodes to shard the container.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + self._assert_enabled(broker, now) + + def test_find_replace_enable(self): + db_file = os.path.join(self.testdir, 'hash.db') + broker = ContainerBroker(db_file) + broker.account = 'a' + broker.container = 'c' + broker.initialize() + ts = utils.Timestamp.now() + broker.merge_items([ + {'name': 'obj%02d' % i, 'created_at': ts.internal, 'size': 0, + 'content_type': 'application/octet-stream', 'etag': 'not-really', + 'deleted': 0, 'storage_policy_index': 0, + 'ctype_timestamp': ts.internal, 'meta_timestamp': ts.internal} + for i in range(100)]) + out = StringIO() + err = StringIO() + with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err): + with mock_timestamp_now() as now: + main([broker.db_file, 'find_and_replace', '10', '--enable']) + expected = [ + 'No shard ranges found to delete.', + 'Injected 10 shard ranges.', + 'Run container-replicator to replicate them to other nodes.', + "Container moved to state 'sharding' with epoch %s." % + now.internal, + 'Run container-sharder on all nodes to shard the container.'] + self.assertEqual(expected, out.getvalue().splitlines()) + self.assertEqual(['Loaded db broker for a/c.'], + err.getvalue().splitlines()) + self._assert_enabled(broker, now) + self.assertEqual( + [(data['lower'], data['upper']) for data in self.shard_data], + [(sr.lower_str, sr.upper_str) for sr in broker.get_shard_ranges()]) diff --git a/test/unit/common/test_db_replicator.py b/test/unit/common/test_db_replicator.py index e4fdce8e91..21eedb9b7d 100644 --- a/test/unit/common/test_db_replicator.py +++ b/test/unit/common/test_db_replicator.py @@ -28,7 +28,6 @@ from tempfile import mkdtemp, NamedTemporaryFile import json import mock -from copy import deepcopy from mock import patch, call from six.moves import reload_module @@ -40,7 +39,7 @@ from swift.common.exceptions import DriveNotMounted from swift.common.swob import HTTPException from test import unit -from test.unit import FakeLogger +from test.unit import FakeLogger, attach_fake_replication_rpc from test.unit.common.test_db import ExampleBroker @@ -2054,49 +2053,6 @@ class TestReplToNode(unittest.TestCase): ]) -class FakeHTTPResponse(object): - - def __init__(self, resp): - self.resp = resp - - @property - def status(self): - return self.resp.status_int - - @property - def data(self): - return self.resp.body - - -def attach_fake_replication_rpc(rpc, replicate_hook=None, errors=None): - class FakeReplConnection(object): - - def __init__(self, node, partition, hash_, logger): - self.logger = logger - self.node = node - self.partition = partition - self.path = '/%s/%s/%s' % (node['device'], partition, hash_) - self.host = node['replication_ip'] - - def replicate(self, op, *sync_args): - print('REPLICATE: %s, %s, %r' % (self.path, op, sync_args)) - resp = None - if errors and op in errors and 
errors[op]: - resp = errors[op].pop(0) - if not resp: - replicate_args = self.path.lstrip('/').split('/') - args = [op] + deepcopy(list(sync_args)) - with unit.mock_check_drive(isdir=not rpc.mount_check, - ismount=rpc.mount_check): - swob_response = rpc.dispatch(replicate_args, args) - resp = FakeHTTPResponse(swob_response) - if replicate_hook: - replicate_hook(op, *sync_args) - return resp - - return FakeReplConnection - - class ExampleReplicator(db_replicator.Replicator): server_type = 'fake' brokerclass = ExampleBroker diff --git a/test/unit/common/test_utils.py b/test/unit/common/test_utils.py index bfb83bf871..7abad33ec2 100644 --- a/test/unit/common/test_utils.py +++ b/test/unit/common/test_utils.py @@ -2766,6 +2766,53 @@ cluster_dfw1 = http://dfw1.host/v1/ else: self.assertEqual(expected, rv) + def test_config_float_value(self): + for args, expected in ( + ((99, None, None), 99.0), + ((99.01, None, None), 99.01), + (('99', None, None), 99.0), + (('99.01', None, None), 99.01), + ((99, 99, None), 99.0), + ((99.01, 99.01, None), 99.01), + (('99', 99, None), 99.0), + (('99.01', 99.01, None), 99.01), + ((99, None, 99), 99.0), + ((99.01, None, 99.01), 99.01), + (('99', None, 99), 99.0), + (('99.01', None, 99.01), 99.01), + ((-99, -99, -99), -99.0), + ((-99.01, -99.01, -99.01), -99.01), + (('-99', -99, -99), -99.0), + (('-99.01', -99.01, -99.01), -99.01),): + actual = utils.config_float_value(*args) + self.assertEqual(expected, actual) + + for val, minimum in ((99, 100), + ('99', 100), + (-99, -98), + ('-98.01', -98)): + with self.assertRaises(ValueError) as cm: + utils.config_float_value(val, minimum=minimum) + self.assertIn('greater than %s' % minimum, cm.exception.args[0]) + self.assertNotIn('less than', cm.exception.args[0]) + + for val, maximum in ((99, 98), + ('99', 98), + (-99, -100), + ('-97.9', -98)): + with self.assertRaises(ValueError) as cm: + utils.config_float_value(val, maximum=maximum) + self.assertIn('less than %s' % maximum, cm.exception.args[0]) + self.assertNotIn('greater than', cm.exception.args[0]) + + for val, minimum, maximum in ((99, 99, 98), + ('99', 100, 100), + (99, 98, 98),): + with self.assertRaises(ValueError) as cm: + utils.config_float_value(val, minimum=minimum, maximum=maximum) + self.assertIn('greater than %s' % minimum, cm.exception.args[0]) + self.assertIn('less than %s' % maximum, cm.exception.args[0]) + def test_config_auto_int_value(self): expectations = { # (value, default) : expected, diff --git a/test/unit/container/test_backend.py b/test/unit/container/test_backend.py index 0069f812e1..79ede02901 100644 --- a/test/unit/container/test_backend.py +++ b/test/unit/container/test_backend.py @@ -2013,6 +2013,75 @@ class TestContainerBroker(unittest.TestCase): self.assertEqual(info['reported_object_count'], 2) self.assertEqual(info['reported_bytes_used'], 1123) + @with_tempdir + def test_remove_objects(self, tempdir): + objects = (('undeleted', Timestamp.now().internal, 0, 'text/plain', + EMPTY_ETAG, 0, 0), + ('other_policy', Timestamp.now().internal, 0, 'text/plain', + EMPTY_ETAG, 0, 1), + ('deleted', Timestamp.now().internal, 0, 'text/plain', + EMPTY_ETAG, 1, 0)) + object_names = [o[0] for o in objects] + + def get_rows(broker): + with broker.get() as conn: + cursor = conn.execute("SELECT * FROM object") + return [r[1] for r in cursor] + + def do_setup(): + db_path = os.path.join( + tempdir, 'part', 'suffix', 'hash', '%s.db' % uuid4()) + broker = ContainerBroker(db_path, account='a', container='c') + broker.initialize(Timestamp.now().internal, 0) + 
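(Aside, not part of the patch: the test_config_float_value cases added above pin down that config_float_value() accepts ints, floats and numeric strings, that minimum/maximum are inclusive bounds, and that violations raise ValueError. A minimal sketch of the same behaviour, using values taken from the test cases:)
# sketch only, not part of the patch
from swift.common.utils import config_float_value
config_float_value('99.01')              # -> 99.01 (numeric strings accepted)
config_float_value('99', minimum=99)     # -> 99.0 (bounds are inclusive)
config_float_value(99, maximum=99)       # -> 99.0
config_float_value(99, maximum=98)       # raises ValueError ('... less than 98 ...')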
for obj in objects: + # ensure row order matches put order + broker.put_object(*obj) + broker._commit_puts() + + self.assertEqual(3, broker.get_max_row()) # sanity check + self.assertEqual(object_names, get_rows(broker)) # sanity check + return broker + + broker = do_setup() + broker.remove_objects('', '') + self.assertFalse(get_rows(broker)) + + broker = do_setup() + broker.remove_objects('deleted', '') + self.assertEqual([object_names[2]], get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', 'deleted', max_row=2) + self.assertEqual(object_names, get_rows(broker)) + + broker = do_setup() + broker.remove_objects('deleted', 'un') + self.assertEqual([object_names[0], object_names[2]], get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=-1) + self.assertEqual(object_names, get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=0) + self.assertEqual(object_names, get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=1) + self.assertEqual(object_names[1:], get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=2) + self.assertEqual(object_names[2:], get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=3) + self.assertFalse(get_rows(broker)) + + broker = do_setup() + broker.remove_objects('', '', max_row=99) + self.assertFalse(get_rows(broker)) + def test_get_objects(self): broker = ContainerBroker(':memory:', account='a', container='c') broker.initialize(Timestamp('1').internal, 0) diff --git a/test/unit/container/test_sharder.py b/test/unit/container/test_sharder.py new file mode 100644 index 0000000000..353d980bbf --- /dev/null +++ b/test/unit/container/test_sharder.py @@ -0,0 +1,4580 @@ +# Copyright (c) 2010-2017 OpenStack Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
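(Aside, not part of the patch: test_init below asserts a mapping from [container-sharder] conf values onto ContainerSharder attributes. A hedged summary of that mapping, using the non-default values from the test; the derived attributes on the right are what the assertions expect, not independent documentation of the defaults.)
# sketch only, not part of the patch
# shard_container_threshold = 20000000   -> sharder.shard_container_threshold == 20000000
#                                            sharder.split_size == 10000000 (half the threshold)
# shard_shrink_point = 35                -> sharder.shard_shrink_point == 0.35 (percent -> fraction)
# shard_shrink_merge_point = 85          -> sharder.shrink_merge_point == 0.85
# shard_scanner_batch_size = 8           -> sharder.scanner_batch_size == 8
# rsync_module = {replication_ip}::container_sda/  -> trailing '/' stripped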
+import hashlib +import json +import random + +import eventlet +import os +import shutil +from contextlib import contextmanager +from tempfile import mkdtemp + +import mock +import unittest + +from collections import defaultdict + +import time + +from copy import deepcopy + +from swift.common import internal_client +from swift.container import replicator +from swift.container.backend import ContainerBroker, UNSHARDED, SHARDING, \ + SHARDED, DATADIR +from swift.container.sharder import ContainerSharder, sharding_enabled, \ + CleavingContext, DEFAULT_SHARD_SHRINK_POINT, \ + DEFAULT_SHARD_CONTAINER_THRESHOLD +from swift.common.utils import ShardRange, Timestamp, hash_path, \ + encode_timestamps, parse_db_filename, quorum_size, Everything +from test import annotate_failure + +from test.unit import FakeLogger, debug_logger, FakeRing, \ + make_timestamp_iter, unlink_files, mocked_http_conn, mock_timestamp_now, \ + attach_fake_replication_rpc + + +class BaseTestSharder(unittest.TestCase): + def setUp(self): + self.tempdir = mkdtemp() + self.ts_iter = make_timestamp_iter() + + def tearDown(self): + shutil.rmtree(self.tempdir, ignore_errors=True) + + def _assert_shard_ranges_equal(self, expected, actual): + self.assertEqual([dict(sr) for sr in expected], + [dict(sr) for sr in actual]) + + def _make_broker(self, account='a', container='c', epoch=None, + device='sda', part=0, hash_=None): + hash_ = hash_ or hashlib.md5(container).hexdigest() + datadir = os.path.join( + self.tempdir, device, 'containers', str(part), hash_[-3:], hash_) + if epoch: + filename = '%s_%s.db' % (hash, epoch) + else: + filename = hash_ + '.db' + db_file = os.path.join(datadir, filename) + broker = ContainerBroker( + db_file, account=account, container=container, + logger=debug_logger()) + broker.initialize() + return broker + + def _make_sharding_broker(self, account='a', container='c', + shard_bounds=(('', 'middle'), ('middle', ''))): + broker = self._make_broker(account=account, container=container) + broker.set_sharding_sysmeta('Root', 'a/c') + old_db_id = broker.get_info()['id'] + broker.enable_sharding(next(self.ts_iter)) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CLEAVED) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + broker = ContainerBroker(broker.db_file, account='a', container='c') + self.assertNotEqual(old_db_id, broker.get_info()['id']) # sanity check + return broker + + def _make_shard_ranges(self, bounds, state=None, object_count=0): + return [ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), + lower, upper, state=state, + object_count=object_count) + for lower, upper in bounds] + + def ts_encoded(self): + # make a unique timestamp string with multiple timestamps encoded; + # use different deltas between component timestamps + timestamps = [next(self.ts_iter) for i in range(4)] + return encode_timestamps( + timestamps[0], timestamps[1], timestamps[3]) + + +class TestSharder(BaseTestSharder): + def test_init(self): + def do_test(conf, expected): + with mock.patch( + 'swift.container.sharder.internal_client.InternalClient') \ + as mock_ic: + with mock.patch('swift.common.db_replicator.ring.Ring') \ + as mock_ring: + mock_ring.return_value = mock.MagicMock() + mock_ring.return_value.replica_count = 3 + sharder = ContainerSharder(conf) + mock_ring.assert_called_once_with( + '/etc/swift', ring_name='container') + self.assertEqual( + 'container-sharder', sharder.logger.logger.name) + for k, v in expected.items(): + 
self.assertTrue(hasattr(sharder, k), 'Missing attr %s' % k) + self.assertEqual(v, getattr(sharder, k), + 'Incorrect value: expected %s=%s but got %s' % + (k, v, getattr(sharder, k))) + return mock_ic + + expected = { + 'mount_check': True, 'bind_ip': '0.0.0.0', 'port': 6201, + 'per_diff': 1000, 'max_diffs': 100, 'interval': 30, + 'cleave_row_batch_size': 10000, + 'node_timeout': 10, 'conn_timeout': 5, + 'rsync_compress': False, + 'rsync_module': '{replication_ip}::container', + 'reclaim_age': 86400 * 7, + 'shard_shrink_point': 0.25, + 'shrink_merge_point': 0.75, + 'shard_container_threshold': 10000000, + 'split_size': 5000000, + 'cleave_batch_size': 2, + 'scanner_batch_size': 10, + 'rcache': '/var/cache/swift/container.recon', + 'shards_account_prefix': '.shards_', + 'auto_shard': False, + 'recon_candidates_limit': 5, + 'shard_replication_quorum': 2, + 'existing_shard_replication_quorum': 2 + } + mock_ic = do_test({}, expected) + mock_ic.assert_called_once_with( + '/etc/swift/internal-client.conf', 'Swift Container Sharder', 3, + allow_modify_pipeline=False) + + conf = { + 'mount_check': False, 'bind_ip': '10.11.12.13', 'bind_port': 62010, + 'per_diff': 2000, 'max_diffs': 200, 'interval': 60, + 'cleave_row_batch_size': 3000, + 'node_timeout': 20, 'conn_timeout': 1, + 'rsync_compress': True, + 'rsync_module': '{replication_ip}::container_sda/', + 'reclaim_age': 86400 * 14, + 'shard_shrink_point': 35, + 'shard_shrink_merge_point': 85, + 'shard_container_threshold': 20000000, + 'cleave_batch_size': 4, + 'shard_scanner_batch_size': 8, + 'request_tries': 2, + 'internal_client_conf_path': '/etc/swift/my-sharder-ic.conf', + 'recon_cache_path': '/var/cache/swift-alt', + 'auto_create_account_prefix': '...', + 'auto_shard': 'yes', + 'recon_candidates_limit': 10, + 'shard_replication_quorum': 1, + 'existing_shard_replication_quorum': 0 + } + expected = { + 'mount_check': False, 'bind_ip': '10.11.12.13', 'port': 62010, + 'per_diff': 2000, 'max_diffs': 200, 'interval': 60, + 'cleave_row_batch_size': 3000, + 'node_timeout': 20, 'conn_timeout': 1, + 'rsync_compress': True, + 'rsync_module': '{replication_ip}::container_sda', + 'reclaim_age': 86400 * 14, + 'shard_shrink_point': 0.35, + 'shrink_merge_point': 0.85, + 'shard_container_threshold': 20000000, + 'split_size': 10000000, + 'cleave_batch_size': 4, + 'scanner_batch_size': 8, + 'rcache': '/var/cache/swift-alt/container.recon', + 'shards_account_prefix': '...shards_', + 'auto_shard': True, + 'recon_candidates_limit': 10, + 'shard_replication_quorum': 1, + 'existing_shard_replication_quorum': 0 + } + mock_ic = do_test(conf, expected) + mock_ic.assert_called_once_with( + '/etc/swift/my-sharder-ic.conf', 'Swift Container Sharder', 2, + allow_modify_pipeline=False) + + expected.update({'shard_replication_quorum': 3, + 'existing_shard_replication_quorum': 3}) + conf.update({'shard_replication_quorum': 4, + 'existing_shard_replication_quorum': 4}) + do_test(conf, expected) + + with self.assertRaises(ValueError) as cm: + do_test({'shard_shrink_point': 101}, {}) + self.assertIn( + 'greater than 0, less than 100, not "101"', cm.exception.message) + self.assertIn('shard_shrink_point', cm.exception.message) + + with self.assertRaises(ValueError) as cm: + do_test({'shard_shrink_merge_point': 101}, {}) + self.assertIn( + 'greater than 0, less than 100, not "101"', cm.exception.message) + self.assertIn('shard_shrink_merge_point', cm.exception.message) + + def test_init_internal_client_conf_loading_error(self): + with 
mock.patch('swift.common.db_replicator.ring.Ring') \ + as mock_ring: + mock_ring.return_value = mock.MagicMock() + mock_ring.return_value.replica_count = 3 + with self.assertRaises(SystemExit) as cm: + ContainerSharder( + {'internal_client_conf_path': + os.path.join(self.tempdir, 'nonexistent')}) + self.assertIn('Unable to load internal client', str(cm.exception)) + + with mock.patch('swift.common.db_replicator.ring.Ring') \ + as mock_ring: + mock_ring.return_value = mock.MagicMock() + mock_ring.return_value.replica_count = 3 + with mock.patch( + 'swift.container.sharder.internal_client.InternalClient', + side_effect=Exception('kaboom')): + with self.assertRaises(Exception) as cm: + ContainerSharder({}) + self.assertIn('kaboom', str(cm.exception)) + + def _assert_stats(self, expected, sharder, category): + # assertEqual doesn't work with a defaultdict + stats = sharder.stats['sharding'][category] + for k, v in expected.items(): + actual = stats[k] + self.assertEqual( + v, actual, 'Expected %s but got %s for %s in %s' % + (v, actual, k, stats)) + return stats + + def _assert_recon_stats(self, expected, sharder, category): + with open(sharder.rcache, 'rb') as fd: + recon = json.load(fd) + stats = recon['sharding_stats']['sharding'].get(category) + self.assertEqual(expected, stats) + + def test_increment_stats(self): + with self._mock_sharder() as sharder: + sharder._increment_stat('visited', 'success') + sharder._increment_stat('visited', 'success') + sharder._increment_stat('visited', 'failure') + sharder._increment_stat('visited', 'completed') + sharder._increment_stat('cleaved', 'success') + sharder._increment_stat('scanned', 'found', step=4) + expected = {'success': 2, + 'failure': 1, + 'completed': 1} + self._assert_stats(expected, sharder, 'visited') + self._assert_stats({'success': 1}, sharder, 'cleaved') + self._assert_stats({'found': 4}, sharder, 'scanned') + + def test_increment_stats_with_statsd(self): + with self._mock_sharder() as sharder: + sharder._increment_stat('visited', 'success', statsd=True) + sharder._increment_stat('visited', 'success', statsd=True) + sharder._increment_stat('visited', 'failure', statsd=True) + sharder._increment_stat('visited', 'failure', statsd=False) + sharder._increment_stat('visited', 'completed') + expected = {'success': 2, + 'failure': 2, + 'completed': 1} + self._assert_stats(expected, sharder, 'visited') + counts = sharder.logger.get_increment_counts() + self.assertEqual(2, counts.get('visited_success')) + self.assertEqual(1, counts.get('visited_failure')) + self.assertIsNone(counts.get('visited_completed')) + + def test_run_forever(self): + conf = {'recon_cache_path': self.tempdir, + 'devices': self.tempdir} + with self._mock_sharder(conf) as sharder: + sharder._check_node = lambda *args: True + sharder.logger.clear() + brokers = [] + for container in ('c1', 'c2'): + broker = self._make_broker( + container=container, hash_=container + 'hash', + device=sharder.ring.devs[0]['device'], part=0) + broker.update_metadata({'X-Container-Sysmeta-Sharding': + ('true', next(self.ts_iter).internal)}) + brokers.append(broker) + + fake_stats = { + 'scanned': {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 2, 'min_time': 99, 'max_time': 123}, + 'created': {'attempted': 1, 'success': 1, 'failure': 1}, + 'cleaved': {'attempted': 1, 'success': 1, 'failure': 0, + 'min_time': 0.01, 'max_time': 1.3}, + 'misplaced': {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 1, 'unplaced': 0}, + 'audit_root': {'attempted': 5, 'success': 4, 
'failure': 1}, + 'audit_shard': {'attempted': 2, 'success': 2, 'failure': 0}, + } + # NB these are time increments not absolute times... + fake_periods = [1, 2, 3, 3600, 4, 15, 15, 0] + fake_periods_iter = iter(fake_periods) + recon_data = [] + fake_process_broker_calls = [] + + def mock_dump_recon_cache(data, *args): + recon_data.append(deepcopy(data)) + + with mock.patch('swift.container.sharder.time.time') as fake_time: + def fake_process_broker(broker, *args, **kwargs): + # increment time and inject some fake stats + fake_process_broker_calls.append((broker, args, kwargs)) + try: + fake_time.return_value += next(fake_periods_iter) + except StopIteration: + # bail out + fake_time.side_effect = Exception('Test over') + sharder.stats['sharding'].update(fake_stats) + + with mock.patch( + 'swift.container.sharder.time.sleep') as mock_sleep: + with mock.patch( + 'swift.container.sharder.is_sharding_candidate', + return_value=True): + with mock.patch( + 'swift.container.sharder.dump_recon_cache', + mock_dump_recon_cache): + fake_time.return_value = next(fake_periods_iter) + sharder._is_sharding_candidate = lambda x: True + sharder._process_broker = fake_process_broker + with self.assertRaises(Exception) as cm: + sharder.run_forever() + + self.assertEqual('Test over', cm.exception.message) + # four cycles are started, two brokers visited per cycle, but + # fourth never completes + self.assertEqual(8, len(fake_process_broker_calls)) + # expect initial random sleep then one sleep between first and + # second pass + self.assertEqual(2, mock_sleep.call_count) + self.assertLessEqual(mock_sleep.call_args_list[0][0][0], 30) + self.assertLessEqual(mock_sleep.call_args_list[1][0][0], + 30 - fake_periods[0]) + + lines = sharder.logger.get_lines_for_level('info') + categories = ('visited', 'scanned', 'created', 'cleaved', + 'misplaced', 'audit_root', 'audit_shard') + + def check_categories(start_time): + for category in categories: + line = lines.pop(0) + self.assertIn('Since %s' % time.ctime(start_time), line) + self.assertIn(category, line) + for k, v in fake_stats.get(category, {}).items(): + self.assertIn('%s:%s' % (k, v), line) + + def check_logs(cycle_time, start_time, + expect_periodic_stats=False): + self.assertIn('Container sharder cycle starting', lines.pop(0)) + check_categories(start_time) + if expect_periodic_stats: + check_categories(start_time) + self.assertIn('Container sharder cycle completed: %.02fs' % + cycle_time, lines.pop(0)) + + check_logs(sum(fake_periods[1:3]), fake_periods[0]) + check_logs(sum(fake_periods[3:5]), sum(fake_periods[:3]), + expect_periodic_stats=True) + check_logs(sum(fake_periods[5:7]), sum(fake_periods[:5])) + # final cycle start but then exception pops to terminate test + self.assertIn('Container sharder cycle starting', lines.pop(0)) + self.assertFalse(lines) + lines = sharder.logger.get_lines_for_level('error') + self.assertIn( + 'Unhandled exception while dumping progress', lines[0]) + self.assertIn('Test over', lines[0]) + + def check_recon(data, time, last, expected_stats): + self.assertEqual(time, data['sharding_time']) + self.assertEqual(last, data['sharding_last']) + self.assertEqual( + expected_stats, dict(data['sharding_stats']['sharding'])) + + def stats_for_candidate(broker): + return {'object_count': 0, + 'account': broker.account, + 'meta_timestamp': mock.ANY, + 'container': broker.container, + 'file_size': os.stat(broker.db_file).st_size, + 'path': broker.db_file, + 'root': broker.path, + 'node_index': 0} + + self.assertEqual(4, len(recon_data)) + 
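(Aside, not part of the patch: for orientation while reading the recon checks that follow, each entry captured by mock_dump_recon_cache is expected to look roughly like the sketch below; the shape is inferred from check_recon and _assert_recon_stats, not an exhaustive schema.)
# sketch only, not part of the patch
# {
#     'sharding_time': <duration of the cycle, in seconds>,
#     'sharding_last': <time at which the cycle completed>,
#     'sharding_stats': {
#         'sharding': {'visited': {...}, 'scanned': {...}, 'created': {...},
#                      'cleaved': {...}, 'misplaced': {...},
#                      'audit_root': {...}, 'audit_shard': {...},
#                      'sharding_candidates': {'found': N, 'top': [...]}}
#     }
# }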
# stats report at end of first cycle + fake_stats.update({'visited': {'attempted': 2, 'skipped': 0, + 'success': 2, 'failure': 0, + 'completed': 0}}) + fake_stats.update({ + 'sharding_candidates': { + 'found': 2, + 'top': [stats_for_candidate(call[0]) + for call in fake_process_broker_calls[:2]] + } + }) + check_recon(recon_data[0], sum(fake_periods[1:3]), + sum(fake_periods[:3]), fake_stats) + # periodic stats report after first broker has been visited during + # second cycle - one candidate identified so far this cycle + fake_stats.update({'visited': {'attempted': 1, 'skipped': 0, + 'success': 1, 'failure': 0, + 'completed': 0}}) + fake_stats.update({ + 'sharding_candidates': { + 'found': 1, + 'top': [stats_for_candidate(call[0]) + for call in fake_process_broker_calls[2:3]] + } + }) + check_recon(recon_data[1], fake_periods[3], + sum(fake_periods[:4]), fake_stats) + # stats report at end of second cycle - both candidates reported + fake_stats.update({'visited': {'attempted': 2, 'skipped': 0, + 'success': 2, 'failure': 0, + 'completed': 0}}) + fake_stats.update({ + 'sharding_candidates': { + 'found': 2, + 'top': [stats_for_candidate(call[0]) + for call in fake_process_broker_calls[2:4]] + } + }) + check_recon(recon_data[2], sum(fake_periods[3:5]), + sum(fake_periods[:5]), fake_stats) + # stats report at end of third cycle + fake_stats.update({'visited': {'attempted': 2, 'skipped': 0, + 'success': 2, 'failure': 0, + 'completed': 0}}) + fake_stats.update({ + 'sharding_candidates': { + 'found': 2, + 'top': [stats_for_candidate(call[0]) + for call in fake_process_broker_calls[4:6]] + } + }) + check_recon(recon_data[3], sum(fake_periods[5:7]), + sum(fake_periods[:7]), fake_stats) + + def test_one_shard_cycle(self): + conf = {'recon_cache_path': self.tempdir, + 'devices': self.tempdir, + 'shard_container_threshold': 9} + with self._mock_sharder(conf) as sharder: + sharder._check_node = lambda *args: True + sharder.reported = time.time() + sharder.logger = debug_logger() + brokers = [] + device_ids = set(range(3)) + for device_id in device_ids: + brokers.append(self._make_broker( + container='c%s' % device_id, hash_='c%shash' % device_id, + device=sharder.ring.devs[device_id]['device'], part=0)) + # enable a/c2 and a/c3 for sharding + for broker in brokers[1:]: + broker.update_metadata({'X-Container-Sysmeta-Sharding': + ('true', next(self.ts_iter).internal)}) + # make a/c2 a candidate for sharding + for i in range(10): + brokers[1].put_object('o%s' % i, next(self.ts_iter).internal, + 0, 'text/plain', 'etag', 0) + + # check only sharding enabled containers are processed + with mock.patch.object( + sharder, '_process_broker' + ) as mock_process_broker: + sharder._local_device_ids = {'stale_node_id'} + sharder._one_shard_cycle(Everything(), Everything()) + + self.assertEqual(device_ids, sharder._local_device_ids) + self.assertEqual(2, mock_process_broker.call_count) + processed_paths = [call[0][0].path + for call in mock_process_broker.call_args_list] + self.assertEqual({'a/c1', 'a/c2'}, set(processed_paths)) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + expected_stats = {'attempted': 2, 'success': 2, 'failure': 0, + 'skipped': 1, 'completed': 0} + self._assert_recon_stats(expected_stats, sharder, 'visited') + expected_candidate_stats = { + 'found': 1, + 'top': [{'object_count': 10, 'account': 'a', 'container': 'c1', + 'meta_timestamp': mock.ANY, + 'file_size': os.stat(brokers[1].db_file).st_size, + 'path': brokers[1].db_file, 'root': 'a/c1', + 'node_index': 1}]} + 
self._assert_recon_stats( + expected_candidate_stats, sharder, 'sharding_candidates') + self._assert_recon_stats(None, sharder, 'sharding_progress') + + # enable and progress container a/c1 by giving it shard ranges + now = next(self.ts_iter) + brokers[0].merge_shard_ranges( + [ShardRange('a/c0', now, '', '', state=ShardRange.SHARDING), + ShardRange('.s_a/1', now, '', 'b', state=ShardRange.ACTIVE), + ShardRange('.s_a/2', now, 'b', 'c', state=ShardRange.CLEAVED), + ShardRange('.s_a/3', now, 'c', 'd', state=ShardRange.CREATED), + ShardRange('.s_a/4', now, 'd', 'e', state=ShardRange.CREATED), + ShardRange('.s_a/5', now, 'e', '', state=ShardRange.FOUND)]) + brokers[1].merge_shard_ranges( + [ShardRange('a/c1', now, '', '', state=ShardRange.SHARDING), + ShardRange('.s_a/6', now, '', 'b', state=ShardRange.ACTIVE), + ShardRange('.s_a/7', now, 'b', 'c', state=ShardRange.ACTIVE), + ShardRange('.s_a/8', now, 'c', 'd', state=ShardRange.CLEAVED), + ShardRange('.s_a/9', now, 'd', 'e', state=ShardRange.CREATED), + ShardRange('.s_a/0', now, 'e', '', state=ShardRange.CREATED)]) + for i in range(11): + brokers[2].put_object('o%s' % i, next(self.ts_iter).internal, + 0, 'text/plain', 'etag', 0) + + def mock_processing(broker, node, part): + if broker.path == 'a/c1': + raise Exception('kapow!') + elif broker.path not in ('a/c0', 'a/c2'): + raise BaseException("I don't know how to handle a broker " + "for %s" % broker.path) + + # check exceptions are handled + with mock.patch.object( + sharder, '_process_broker', side_effect=mock_processing + ) as mock_process_broker: + sharder._local_device_ids = {'stale_node_id'} + sharder._one_shard_cycle(Everything(), Everything()) + + self.assertEqual(device_ids, sharder._local_device_ids) + self.assertEqual(3, mock_process_broker.call_count) + processed_paths = [call[0][0].path + for call in mock_process_broker.call_args_list] + self.assertEqual({'a/c0', 'a/c1', 'a/c2'}, set(processed_paths)) + lines = sharder.logger.get_lines_for_level('error') + self.assertIn('Unhandled exception while processing', lines[0]) + self.assertFalse(lines[1:]) + sharder.logger.clear() + expected_stats = {'attempted': 3, 'success': 2, 'failure': 1, + 'skipped': 0, 'completed': 0} + self._assert_recon_stats(expected_stats, sharder, 'visited') + expected_candidate_stats = { + 'found': 1, + 'top': [{'object_count': 11, 'account': 'a', 'container': 'c2', + 'meta_timestamp': mock.ANY, + 'file_size': os.stat(brokers[1].db_file).st_size, + 'path': brokers[2].db_file, 'root': 'a/c2', + 'node_index': 2}]} + self._assert_recon_stats( + expected_candidate_stats, sharder, 'sharding_candidates') + expected_in_progress_stats = { + 'all': [{'object_count': 0, 'account': 'a', 'container': 'c0', + 'meta_timestamp': mock.ANY, + 'file_size': os.stat(brokers[0].db_file).st_size, + 'path': brokers[0].db_file, 'root': 'a/c0', + 'node_index': 0, + 'found': 1, 'created': 2, 'cleaved': 1, 'active': 1, + 'state': 'sharding', 'db_state': 'unsharded', + 'error': None}, + {'object_count': 10, 'account': 'a', 'container': 'c1', + 'meta_timestamp': mock.ANY, + 'file_size': os.stat(brokers[1].db_file).st_size, + 'path': brokers[1].db_file, 'root': 'a/c1', + 'node_index': 1, + 'found': 0, 'created': 2, 'cleaved': 1, 'active': 2, + 'state': 'sharding', 'db_state': 'unsharded', + 'error': 'kapow!'}]} + self._assert_stats( + expected_in_progress_stats, sharder, 'sharding_in_progress') + + # check that candidates and in progress stats don't stick in recon + own_shard_range = brokers[0].get_own_shard_range() + 
own_shard_range.state = ShardRange.ACTIVE + brokers[0].merge_shard_ranges([own_shard_range]) + for i in range(10): + brokers[1].delete_object( + 'o%s' % i, next(self.ts_iter).internal) + with mock.patch.object( + sharder, '_process_broker' + ) as mock_process_broker: + sharder._local_device_ids = {999} + sharder._one_shard_cycle(Everything(), Everything()) + + self.assertEqual(device_ids, sharder._local_device_ids) + self.assertEqual(3, mock_process_broker.call_count) + processed_paths = [call[0][0].path + for call in mock_process_broker.call_args_list] + self.assertEqual({'a/c0', 'a/c1', 'a/c2'}, set(processed_paths)) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + expected_stats = {'attempted': 3, 'success': 3, 'failure': 0, + 'skipped': 0, 'completed': 0} + self._assert_recon_stats(expected_stats, sharder, 'visited') + self._assert_recon_stats( + expected_candidate_stats, sharder, 'sharding_candidates') + self._assert_recon_stats(None, sharder, 'sharding_progress') + + @contextmanager + def _mock_sharder(self, conf=None, replicas=3): + conf = conf or {} + conf['devices'] = self.tempdir + with mock.patch( + 'swift.container.sharder.internal_client.InternalClient'): + with mock.patch( + 'swift.common.db_replicator.ring.Ring', + lambda *args, **kwargs: FakeRing(replicas=replicas)): + sharder = ContainerSharder(conf, logger=FakeLogger()) + sharder._local_device_ids = {0, 1, 2} + sharder._replicate_object = mock.MagicMock( + return_value=(True, [True] * sharder.ring.replica_count)) + yield sharder + + def _get_raw_object_records(self, broker): + # use list_objects_iter with no-op transform_func to get back actual + # un-transformed rows with encoded timestamps + return [list(obj) for obj in broker.list_objects_iter( + 10, '', '', '', '', include_deleted=None, all_policies=True, + transform_func=lambda record: record)] + + def _check_objects(self, expected_objs, shard_db): + shard_broker = ContainerBroker(shard_db) + shard_objs = self._get_raw_object_records(shard_broker) + expected_objs = [list(obj) for obj in expected_objs] + self.assertEqual(expected_objs, shard_objs) + + def _check_shard_range(self, expected, actual): + expected_dict = dict(expected) + actual_dict = dict(actual) + self.assertGreater(actual_dict.pop('meta_timestamp'), + expected_dict.pop('meta_timestamp')) + self.assertEqual(expected_dict, actual_dict) + + def test_fetch_shard_ranges_unexpected_response(self): + broker = self._make_broker() + exc = internal_client.UnexpectedResponse( + 'Unexpected response: 404', None) + with self._mock_sharder() as sharder: + sharder.int_client.make_request.side_effect = exc + self.assertIsNone(sharder._fetch_shard_ranges(broker)) + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Unexpected response: 404', lines[0]) + self.assertFalse(lines[1:]) + + def test_fetch_shard_ranges_bad_record_type(self): + def do_test(mock_resp_headers): + with self._mock_sharder() as sharder: + mock_make_request = mock.MagicMock( + return_value=mock.MagicMock(headers=mock_resp_headers)) + sharder.int_client.make_request = mock_make_request + self.assertIsNone(sharder._fetch_shard_ranges(broker)) + lines = sharder.logger.get_lines_for_level('error') + self.assertIn('unexpected record type', lines[0]) + self.assertFalse(lines[1:]) + + broker = self._make_broker() + do_test({}) + do_test({'x-backend-record-type': 'object'}) + do_test({'x-backend-record-type': 'disco'}) + + def test_fetch_shard_ranges_bad_data(self): + def do_test(mock_resp_body): + mock_resp_headers = 
{'x-backend-record-type': 'shard'} + with self._mock_sharder() as sharder: + mock_make_request = mock.MagicMock( + return_value=mock.MagicMock(headers=mock_resp_headers, + body=mock_resp_body)) + sharder.int_client.make_request = mock_make_request + self.assertIsNone(sharder._fetch_shard_ranges(broker)) + lines = sharder.logger.get_lines_for_level('error') + self.assertIn('invalid data', lines[0]) + self.assertFalse(lines[1:]) + + broker = self._make_broker() + do_test({}) + do_test('') + do_test(json.dumps({})) + do_test(json.dumps([{'account': 'a', 'container': 'c'}])) + + def test_fetch_shard_ranges_ok(self): + def do_test(mock_resp_body, params): + mock_resp_headers = {'x-backend-record-type': 'shard'} + with self._mock_sharder() as sharder: + mock_make_request = mock.MagicMock( + return_value=mock.MagicMock(headers=mock_resp_headers, + body=mock_resp_body)) + sharder.int_client.make_request = mock_make_request + mock_make_path = mock.MagicMock(return_value='/v1/a/c') + sharder.int_client.make_path = mock_make_path + actual = sharder._fetch_shard_ranges(broker, params=params) + sharder.int_client.make_path.assert_called_once_with('a', 'c') + self.assertFalse(sharder.logger.get_lines_for_level('error')) + return actual, mock_make_request + + expected_headers = {'X-Backend-Record-Type': 'shard', + 'X-Backend-Include-Deleted': 'False', + 'X-Backend-Override-Deleted': 'true'} + broker = self._make_broker() + shard_ranges = self._make_shard_ranges((('', 'm'), ('m', ''))) + + params = {'format': 'json'} + actual, mock_call = do_test(json.dumps([dict(shard_ranges[0])]), + params={}) + mock_call.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + self._assert_shard_ranges_equal([shard_ranges[0]], actual) + + params = {'format': 'json', 'includes': 'thing'} + actual, mock_call = do_test( + json.dumps([dict(sr) for sr in shard_ranges]), params=params) + self._assert_shard_ranges_equal(shard_ranges, actual) + mock_call.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + + params = {'format': 'json', 'end_marker': 'there', 'marker': 'here'} + actual, mock_call = do_test(json.dumps([]), params=params) + self._assert_shard_ranges_equal([], actual) + mock_call.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + + def _check_cleave_root(self, conf=None): + broker = self._make_broker() + objects = [ + # shard 0 + ('a', self.ts_encoded(), 10, 'text/plain', 'etag_a', 0, 0), + ('here', self.ts_encoded(), 10, 'text/plain', 'etag_here', 0, 0), + # shard 1 + ('m', self.ts_encoded(), 1, 'text/plain', 'etag_m', 0, 0), + ('n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0), + ('there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0), + # shard 2 + ('where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0), + # shard 3 + ('x', self.ts_encoded(), 0, '', '', 1, 0), # deleted + ('y', self.ts_encoded(), 1000, 'text/plain', 'etag_y', 0, 0), + # shard 4 + ('yyyy', self.ts_encoded(), 14, 'text/plain', 'etag_yyyy', 0, 0), + ] + for obj in objects: + broker.put_object(*obj) + initial_root_info = broker.get_info() + broker.enable_sharding(Timestamp.now()) + + shard_bounds = (('', 'here'), ('here', 'there'), + ('there', 'where'), ('where', 'yonder'), + ('yonder', '')) + shard_ranges = self._make_shard_ranges(shard_bounds) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, 
shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + # used to accumulate stats from sharded dbs + total_shard_stats = {'object_count': 0, 'bytes_used': 0} + # run cleave - no shard ranges, nothing happens + with self._mock_sharder(conf=conf) as sharder: + self.assertFalse(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(0, context.ranges_done) + self.assertEqual(0, context.ranges_todo) + + self.assertEqual(UNSHARDED, broker.get_db_state()) + sharder._replicate_object.assert_not_called() + for db in expected_shard_dbs: + with annotate_failure(db): + self.assertFalse(os.path.exists(db)) + + # run cleave - all shard ranges in found state, nothing happens + broker.merge_shard_ranges(shard_ranges[:4]) + self.assertTrue(broker.set_sharding_state()) + + with self._mock_sharder(conf=conf) as sharder: + self.assertFalse(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(0, context.ranges_done) + self.assertEqual(4, context.ranges_todo) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_not_called() + for db in expected_shard_dbs: + with annotate_failure(db): + self.assertFalse(os.path.exists(db)) + for shard_range in broker.get_shard_ranges(): + with annotate_failure(shard_range): + self.assertEqual(ShardRange.FOUND, shard_range.state) + + # move first shard range to created state, first shard range is cleaved + shard_ranges[0].update_state(ShardRange.CREATED) + broker.merge_shard_ranges(shard_ranges[:1]) + with self._mock_sharder(conf=conf) as sharder: + self.assertFalse(sharder._cleave(broker)) + + expected = {'attempted': 1, 'success': 1, 'failure': 0, + 'min_time': mock.ANY, 'max_time': mock.ANY} + stats = self._assert_stats(expected, sharder, 'cleaved') + self.assertIsInstance(stats['min_time'], float) + self.assertIsInstance(stats['max_time'], float) + self.assertLessEqual(stats['min_time'], stats['max_time']) + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[0], 0) + shard_broker = ContainerBroker(expected_shard_dbs[0]) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + shard_info = shard_broker.get_info() + total_shard_stats['object_count'] += shard_info['object_count'] + total_shard_stats['bytes_used'] += shard_info['bytes_used'] + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(4, len(updated_shard_ranges)) + # update expected state and metadata, check cleaved shard range + shard_ranges[0].bytes_used = 20 + shard_ranges[0].object_count = 2 + shard_ranges[0].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + self._check_objects(objects[:2], expected_shard_dbs[0]) + # other shard ranges should be unchanged + for i in range(1, len(shard_ranges)): + with annotate_failure(i): + self.assertFalse(os.path.exists(expected_shard_dbs[i])) + for i in range(1, 
len(updated_shard_ranges)): + with annotate_failure(i): + self.assertEqual(dict(shard_ranges[i]), + dict(updated_shard_ranges[i])) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('here', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(1, context.ranges_done) + self.assertEqual(3, context.ranges_todo) + + unlink_files(expected_shard_dbs) + + # move more shard ranges to created state + for i in range(1, 4): + shard_ranges[i].update_state(ShardRange.CREATED) + broker.merge_shard_ranges(shard_ranges[1:4]) + + # replication of next shard range is not sufficiently successful + with self._mock_sharder(conf=conf) as sharder: + quorum = quorum_size(sharder.ring.replica_count) + successes = [True] * (quorum - 1) + fails = [False] * (sharder.ring.replica_count - len(successes)) + responses = successes + fails + random.shuffle(responses) + sharder._replicate_object = mock.MagicMock( + side_effect=((False, responses),)) + self.assertFalse(sharder._cleave(broker)) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[1], 0) + + # cleaving state is unchanged + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(4, len(updated_shard_ranges)) + for i in range(1, len(updated_shard_ranges)): + with annotate_failure(i): + self.assertEqual(dict(shard_ranges[i]), + dict(updated_shard_ranges[i])) + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('here', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(1, context.ranges_done) + self.assertEqual(3, context.ranges_todo) + + # try again, this time replication is sufficiently successful + with self._mock_sharder(conf=conf) as sharder: + successes = [True] * quorum + fails = [False] * (sharder.ring.replica_count - len(successes)) + responses1 = successes + fails + responses2 = fails + successes + sharder._replicate_object = mock.MagicMock( + side_effect=((False, responses1), (False, responses2))) + self.assertFalse(sharder._cleave(broker)) + + expected = {'attempted': 2, 'success': 2, 'failure': 0, + 'min_time': mock.ANY, 'max_time': mock.ANY} + stats = self._assert_stats(expected, sharder, 'cleaved') + self.assertIsInstance(stats['min_time'], float) + self.assertIsInstance(stats['max_time'], float) + self.assertLessEqual(stats['min_time'], stats['max_time']) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in expected_shard_dbs[1:3]] + ) + for db in expected_shard_dbs[1:3]: + shard_broker = ContainerBroker(db) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + shard_info = shard_broker.get_info() + total_shard_stats['object_count'] += shard_info['object_count'] + total_shard_stats['bytes_used'] += shard_info['bytes_used'] + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(4, len(updated_shard_ranges)) + + # only 2 are cleaved per batch + # update expected state and metadata, check cleaved shard ranges + shard_ranges[1].bytes_used = 6 + shard_ranges[1].object_count = 3 + shard_ranges[1].state = ShardRange.CLEAVED + shard_ranges[2].bytes_used = 100 + shard_ranges[2].object_count = 1 + shard_ranges[2].state = ShardRange.CLEAVED + for i in 
range(0, 3): + with annotate_failure(i): + self._check_shard_range( + shard_ranges[i], updated_shard_ranges[i]) + self._check_objects(objects[2:5], expected_shard_dbs[1]) + self._check_objects(objects[5:6], expected_shard_dbs[2]) + # other shard ranges should be unchanged + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + for i, db in enumerate(expected_shard_dbs[3:], 3): + with annotate_failure(i): + self.assertFalse(os.path.exists(db)) + for i, updated_shard_range in enumerate(updated_shard_ranges[3:], 3): + with annotate_failure(i): + self.assertEqual(dict(shard_ranges[i]), + dict(updated_shard_range)) + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('where', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(3, context.ranges_done) + self.assertEqual(1, context.ranges_todo) + + unlink_files(expected_shard_dbs) + + # run cleave again - should process the fourth range + with self._mock_sharder(conf=conf) as sharder: + sharder.logger = debug_logger() + self.assertFalse(sharder._cleave(broker)) + + expected = {'attempted': 1, 'success': 1, 'failure': 0, + 'min_time': mock.ANY, 'max_time': mock.ANY} + stats = self._assert_stats(expected, sharder, 'cleaved') + self.assertIsInstance(stats['min_time'], float) + self.assertIsInstance(stats['max_time'], float) + self.assertLessEqual(stats['min_time'], stats['max_time']) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[3], 0) + shard_broker = ContainerBroker(expected_shard_dbs[3]) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + shard_info = shard_broker.get_info() + total_shard_stats['object_count'] += shard_info['object_count'] + total_shard_stats['bytes_used'] += shard_info['bytes_used'] + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(4, len(updated_shard_ranges)) + + shard_ranges[3].bytes_used = 1000 + shard_ranges[3].object_count = 1 + shard_ranges[3].state = ShardRange.CLEAVED + for i in range(0, 4): + with annotate_failure(i): + self._check_shard_range( + shard_ranges[i], updated_shard_ranges[i]) + # NB includes the deleted object + self._check_objects(objects[6:8], expected_shard_dbs[3]) + # other shard ranges should be unchanged + for i, db in enumerate(expected_shard_dbs[:3]): + with annotate_failure(i): + self.assertFalse(os.path.exists(db)) + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + for i, updated_shard_range in enumerate(updated_shard_ranges[4:], 4): + with annotate_failure(i): + self.assertEqual(dict(shard_ranges[i]), + dict(updated_shard_range)) + + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('yonder', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(4, context.ranges_done) + self.assertEqual(0, context.ranges_todo) + + unlink_files(expected_shard_dbs) + + # run cleave - should be a no-op, all existing ranges have been cleaved + with self._mock_sharder(conf=conf) as sharder: + self.assertFalse(sharder._cleave(broker)) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_not_called() + + # add final shard range - move this 
to ACTIVE state and update stats to + # simulate another replica having cleaved it and replicated its state + shard_ranges[4].update_state(ShardRange.ACTIVE) + shard_ranges[4].update_meta(2, 15) + broker.merge_shard_ranges(shard_ranges[4:]) + + with self._mock_sharder(conf=conf) as sharder: + self.assertTrue(sharder._cleave(broker)) + + expected = {'attempted': 1, 'success': 1, 'failure': 0, + 'min_time': mock.ANY, 'max_time': mock.ANY} + stats = self._assert_stats(expected, sharder, 'cleaved') + self.assertIsInstance(stats['min_time'], float) + self.assertIsInstance(stats['max_time'], float) + self.assertLessEqual(stats['min_time'], stats['max_time']) + + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[4], 0) + shard_broker = ContainerBroker(expected_shard_dbs[4]) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.ACTIVE, shard_own_sr.state) + shard_info = shard_broker.get_info() + total_shard_stats['object_count'] += shard_info['object_count'] + total_shard_stats['bytes_used'] += shard_info['bytes_used'] + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(5, len(updated_shard_ranges)) + # NB stats of the ACTIVE shard range should not be reset by cleaving + for i in range(0, 4): + with annotate_failure(i): + self._check_shard_range( + shard_ranges[i], updated_shard_ranges[i]) + self.assertEqual(dict(shard_ranges[4]), dict(updated_shard_ranges[4])) + + # object copied to shard + self._check_objects(objects[8:], expected_shard_dbs[4]) + # other shard ranges should be unchanged + for i, db in enumerate(expected_shard_dbs[:4]): + with annotate_failure(i): + self.assertFalse(os.path.exists(db)) + + self.assertEqual(initial_root_info['object_count'], + total_shard_stats['object_count']) + self.assertEqual(initial_root_info['bytes_used'], + total_shard_stats['bytes_used']) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertTrue(context.cleaving_done) + self.assertEqual('', context.cursor) + self.assertEqual(9, context.cleave_to_row) + self.assertEqual(9, context.max_row) + self.assertEqual(5, context.ranges_done) + self.assertEqual(0, context.ranges_todo) + + with self._mock_sharder(conf=conf) as sharder: + self.assertTrue(sharder._cleave(broker)) + sharder._replicate_object.assert_not_called() + + self.assertTrue(broker.set_sharded_state()) + # run cleave - should be a no-op + with self._mock_sharder(conf=conf) as sharder: + self.assertTrue(sharder._cleave(broker)) + + sharder._replicate_object.assert_not_called() + + def test_cleave_root(self): + self._check_cleave_root() + + def test_cleave_root_listing_limit_one(self): + # force yield_objects to update its marker and call to the broker's + # get_objects() for each shard range, to check the marker moves on + self._check_cleave_root(conf={'cleave_row_batch_size': 1}) + + def test_cleave_root_ranges_change(self): + # verify that objects are not missed if shard ranges change between + # cleaving batches + broker = self._make_broker() + objects = [ + ('a', self.ts_encoded(), 10, 'text/plain', 'etag_a', 0, 0), + ('b', self.ts_encoded(), 10, 'text/plain', 'etag_b', 0, 0), + ('c', self.ts_encoded(), 1, 'text/plain', 'etag_c', 0, 0), + ('d', self.ts_encoded(), 2, 'text/plain', 'etag_d', 0, 0), + ('e', self.ts_encoded(), 3, 'text/plain', 'etag_e', 0, 0), + ('f', self.ts_encoded(), 100, 'text/plain', 'etag_f', 0, 0), + ('x', self.ts_encoded(), 0, '', '', 1, 0), # deleted + ('z', self.ts_encoded(), 1000, 'text/plain', 'etag_z', 0, 
0) + ] + for obj in objects: + broker.put_object(*obj) + broker.enable_sharding(Timestamp.now()) + + shard_bounds = (('', 'd'), ('d', 'x'), ('x', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + broker.merge_shard_ranges(shard_ranges[:3]) + self.assertTrue(broker.set_sharding_state()) + + # run cleave - first batch is cleaved + with self._mock_sharder() as sharder: + self.assertFalse(sharder._cleave(broker)) + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual(str(shard_ranges[1].upper), context.cursor) + self.assertEqual(8, context.cleave_to_row) + self.assertEqual(8, context.max_row) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in expected_shard_dbs[:2]] + ) + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(3, len(updated_shard_ranges)) + + # first 2 shard ranges should have updated object count, bytes used and + # meta_timestamp + shard_ranges[0].bytes_used = 23 + shard_ranges[0].object_count = 4 + shard_ranges[0].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + shard_ranges[1].bytes_used = 103 + shard_ranges[1].object_count = 2 + shard_ranges[1].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[1], updated_shard_ranges[1]) + self._check_objects(objects[:4], expected_shard_dbs[0]) + self._check_objects(objects[4:7], expected_shard_dbs[1]) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + # third shard range should be unchanged - not yet cleaved + self.assertEqual(dict(shard_ranges[2]), + dict(updated_shard_ranges[2])) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual(str(shard_ranges[1].upper), context.cursor) + self.assertEqual(8, context.cleave_to_row) + self.assertEqual(8, context.max_row) + + # now change the shard ranges so that third consumes second + shard_ranges[1].set_deleted() + shard_ranges[2].lower = 'd' + shard_ranges[2].timestamp = Timestamp.now() + + broker.merge_shard_ranges(shard_ranges[1:3]) + + # run cleave - should process the extended third (final) range + with self._mock_sharder() as sharder: + self.assertTrue(sharder._cleave(broker)) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[2], 0) + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(2, len(updated_shard_ranges)) + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + # third shard range should now have updated object count, bytes used, + # including objects previously in the second shard range + shard_ranges[2].bytes_used = 1103 + shard_ranges[2].object_count = 3 + shard_ranges[2].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[2], updated_shard_ranges[1]) + self._check_objects(objects[4:8], expected_shard_dbs[2]) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertTrue(context.cleaving_done) + self.assertEqual(str(shard_ranges[2].upper), context.cursor) + self.assertEqual(8, 
context.cleave_to_row) + self.assertEqual(8, context.max_row) + + def test_cleave_shard(self): + broker = self._make_broker(account='.shards_a', container='shard_c') + own_shard_range = ShardRange( + broker.path, Timestamp.now(), 'here', 'where', + state=ShardRange.SHARDING, epoch=Timestamp.now()) + broker.merge_shard_ranges([own_shard_range]) + broker.set_sharding_sysmeta('Root', 'a/c') + self.assertFalse(broker.is_root_container()) # sanity check + + objects = [ + ('m', self.ts_encoded(), 1, 'text/plain', 'etag_m', 0, 0), + ('n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0), + ('there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0), + ('where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0), + ] + misplaced_objects = [ + ('a', self.ts_encoded(), 1, 'text/plain', 'etag_a', 0, 0), + ('z', self.ts_encoded(), 100, 'text/plain', 'etag_z', 1, 0), + ] + for obj in objects + misplaced_objects: + broker.put_object(*obj) + + shard_bounds = (('here', 'there'), + ('there', 'where')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + misplaced_bounds = (('', 'here'), + ('where', '')) + misplaced_ranges = self._make_shard_ranges( + misplaced_bounds, state=ShardRange.ACTIVE) + misplaced_dbs = [] + for shard_range in misplaced_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + misplaced_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + + # run cleave - first range is cleaved but move of misplaced objects is + # not successful + sharder_conf = {'cleave_batch_size': 1} + with self._mock_sharder(sharder_conf) as sharder: + with mock.patch.object( + sharder, '_make_shard_range_fetcher', + return_value=lambda: iter(misplaced_ranges)): + # cause misplaced objects replication to not succeed + quorum = quorum_size(sharder.ring.replica_count) + successes = [True] * (quorum - 1) + fails = [False] * (sharder.ring.replica_count - len(successes)) + responses = successes + fails + random.shuffle(responses) + bad_result = (False, responses) + ok_result = (True, [True] * sharder.ring.replica_count) + sharder._replicate_object = mock.MagicMock( + # result for misplaced, misplaced, cleave + side_effect=(bad_result, ok_result, ok_result)) + self.assertFalse(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertFalse(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual(str(shard_ranges[0].upper), context.cursor) + self.assertEqual(6, context.cleave_to_row) + self.assertEqual(6, context.max_row) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, misplaced_dbs[0], 0), + mock.call(0, misplaced_dbs[1], 0), + mock.call(0, expected_shard_dbs[0], 0)]) + shard_broker = ContainerBroker(expected_shard_dbs[0]) + # NB cleaving a shard, state goes to CLEAVED not ACTIVE + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(2, len(updated_shard_ranges)) + + # first shard range should have updated 
object count, bytes used and + # meta_timestamp + shard_ranges[0].bytes_used = 6 + shard_ranges[0].object_count = 3 + shard_ranges[0].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + self._check_objects(objects[:3], expected_shard_dbs[0]) + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self._check_objects(misplaced_objects[:1], misplaced_dbs[0]) + self._check_objects(misplaced_objects[1:], misplaced_dbs[1]) + unlink_files(expected_shard_dbs) + unlink_files(misplaced_dbs) + + # run cleave - second (final) range is cleaved; move this range to + # CLEAVED state and update stats to simulate another replica having + # cleaved it and replicated its state + shard_ranges[1].update_state(ShardRange.CLEAVED) + shard_ranges[1].update_meta(2, 15) + broker.merge_shard_ranges(shard_ranges[1:2]) + with self._mock_sharder(sharder_conf) as sharder: + with mock.patch.object( + sharder, '_make_shard_range_fetcher', + return_value=lambda: iter(misplaced_ranges)): + self.assertTrue(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertTrue(context.cleaving_done) + self.assertEqual(str(shard_ranges[1].upper), context.cursor) + self.assertEqual(6, context.cleave_to_row) + self.assertEqual(6, context.max_row) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, misplaced_dbs[0], 0), + mock.call(0, expected_shard_dbs[1], 0)]) + shard_broker = ContainerBroker(expected_shard_dbs[1]) + shard_own_sr = shard_broker.get_own_shard_range() + self.assertEqual(ShardRange.CLEAVED, shard_own_sr.state) + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(2, len(updated_shard_ranges)) + + # second shard range should have updated object count, bytes used and + # meta_timestamp + self.assertEqual(dict(shard_ranges[1]), dict(updated_shard_ranges[1])) + self._check_objects(objects[3:], expected_shard_dbs[1]) + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + self._check_objects(misplaced_objects[:1], misplaced_dbs[0]) + self.assertFalse(os.path.exists(misplaced_dbs[1])) + + def test_cleave_shard_shrinking(self): + broker = self._make_broker(account='.shards_a', container='shard_c') + own_shard_range = ShardRange( + broker.path, next(self.ts_iter), 'here', 'where', + state=ShardRange.SHRINKING, epoch=next(self.ts_iter)) + broker.merge_shard_ranges([own_shard_range]) + broker.set_sharding_sysmeta('Root', 'a/c') + self.assertFalse(broker.is_root_container()) # sanity check + + objects = [ + ('there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0), + ('where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0), + ] + for obj in objects: + broker.put_object(*obj) + acceptor_epoch = next(self.ts_iter) + acceptor = ShardRange('.shards_a/acceptor', Timestamp.now(), + 'here', 'yonder', '1000', '11111', + state=ShardRange.ACTIVE, epoch=acceptor_epoch) + db_hash = hash_path(acceptor.account, acceptor.container) + # NB expected cleave db includes acceptor epoch + expected_shard_db = os.path.join( + self.tempdir, 'sda', 'containers', '0', db_hash[-3:], db_hash, + '%s_%s.db' % (db_hash, acceptor_epoch.internal)) + + broker.merge_shard_ranges([acceptor]) + broker.set_sharding_state() + + # run cleave + with self._mock_sharder() as sharder: + self.assertTrue(sharder._cleave(broker)) + + context = CleavingContext.load(broker) + self.assertTrue(context.misplaced_done) + self.assertTrue(context.cleaving_done) + 
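# An illustrative aside, not part of the patch: the expected_shard_db built
# above follows the layout these tests use everywhere - the shard
# container's account/container are hashed with hash_path() and, when an
# epoch is present, the epoch is appended to the DB filename. A minimal
# sketch of that construction (the helper name, devices root and partition
# arguments are invented for illustration; hash_path needs the usual swift
# hash path prefix/suffix configuration, as in the test fixtures):
import os
from swift.common.utils import hash_path

def _example_shard_db_path(devices, device, part, account, container,
                           epoch=None):
    db_hash = hash_path(account, container)
    # an epoch-bearing DB gets '<hash>_<epoch>.db', otherwise '<hash>.db'
    filename = ('%s_%s.db' % (db_hash, epoch)) if epoch else db_hash + '.db'
    return os.path.join(devices, device, 'containers', str(part),
                        db_hash[-3:], db_hash, filename)

# e.g. _example_shard_db_path(self.tempdir, 'sda', 0, '.shards_a',
#                             'acceptor', acceptor_epoch.internal)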
self.assertEqual(str(acceptor.upper), context.cursor) + self.assertEqual(2, context.cleave_to_row) + self.assertEqual(2, context.max_row) + + self.assertEqual(SHARDING, broker.get_db_state()) + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_db, 0)]) + shard_broker = ContainerBroker(expected_shard_db) + # NB when cleaving a shard container to a larger acceptor namespace + # then expect the shard broker's own shard range to reflect that of the + # acceptor shard range rather than being set to CLEAVED. + self.assertEqual( + ShardRange.ACTIVE, shard_broker.get_own_shard_range().state) + + updated_shard_ranges = broker.get_shard_ranges() + self.assertEqual(1, len(updated_shard_ranges)) + self.assertEqual(dict(acceptor), dict(updated_shard_ranges[0])) + + # shard range should have unmodified acceptor, bytes used and + # meta_timestamp + self._check_objects(objects, expected_shard_db) + + def test_cleave_repeated(self): + # verify that if new objects are merged into retiring db after cleaving + # started then cleaving will repeat but only new objects are cleaved + # in the repeated cleaving pass + broker = self._make_broker() + objects = [ + ('obj%03d' % i, next(self.ts_iter), 1, 'text/plain', 'etag', 0, 0) + for i in range(10) + ] + new_objects = [ + (name, next(self.ts_iter), 1, 'text/plain', 'etag', 0, 0) + for name in ('alpha', 'zeta') + ] + for obj in objects: + broker.put_object(*obj) + broker._commit_puts() + broker.enable_sharding(Timestamp.now()) + shard_bounds = (('', 'obj004'), ('obj004', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + old_broker = broker.get_brokers()[0] + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + + calls = [] + key = ('name', 'created_at', 'size', 'content_type', 'etag', 'deleted') + + def mock_replicate_object(part, db, node_id): + # merge new objects between cleave of first and second shard ranges + if not calls: + old_broker.merge_items( + [dict(zip(key, obj)) for obj in new_objects]) + calls.append((part, db, node_id)) + return True, [True, True, True] + + with self._mock_sharder() as sharder: + sharder._audit_container = mock.MagicMock() + sharder._replicate_object = mock_replicate_object + sharder._process_broker(broker, node, 99) + + # sanity check - the new objects merged into the old db + self.assertFalse(broker.get_objects()) + self.assertEqual(12, len(old_broker.get_objects())) + + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + self.assertEqual([(0, expected_shard_dbs[0], 0), + (0, expected_shard_dbs[1], 0)], calls) + + # check shard ranges were updated to CLEAVED + updated_shard_ranges = broker.get_shard_ranges() + # 'alpha' was not in table when first shard was cleaved + shard_ranges[0].bytes_used = 5 + shard_ranges[0].object_count = 5 + shard_ranges[0].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + self._check_objects(objects[:5], expected_shard_dbs[0]) + # 'zeta' was in table when second shard was cleaved + shard_ranges[1].bytes_used = 6 + shard_ranges[1].object_count = 6 + 
shard_ranges[1].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[1], updated_shard_ranges[1]) + self._check_objects(objects[5:] + new_objects[1:], + expected_shard_dbs[1]) + + context = CleavingContext.load(broker) + self.assertFalse(context.misplaced_done) + self.assertFalse(context.cleaving_done) + self.assertEqual('', context.cursor) + self.assertEqual(10, context.cleave_to_row) + self.assertEqual(12, context.max_row) # note that max row increased + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Repeat cleaving required', lines[0]) + self.assertFalse(lines[1:]) + unlink_files(expected_shard_dbs) + + # repeat the cleaving - the newer objects get cleaved + with self._mock_sharder() as sharder: + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + + # this time the sharding completed + self.assertEqual(SHARDED, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDED, + broker.get_own_shard_range().state) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[0], 0), + mock.call(0, expected_shard_dbs[1], 0)]) + + # shard ranges are now ACTIVE - stats not updated by cleaving + updated_shard_ranges = broker.get_shard_ranges() + shard_ranges[0].state = ShardRange.ACTIVE + self._check_shard_range(shard_ranges[0], updated_shard_ranges[0]) + self._check_objects(new_objects[:1], expected_shard_dbs[0]) + # both new objects are included in repeat cleaving but no older objects + shard_ranges[1].state = ShardRange.ACTIVE + self._check_shard_range(shard_ranges[1], updated_shard_ranges[1]) + self._check_objects(new_objects[1:], expected_shard_dbs[1]) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + def test_cleave_multiple_storage_policies(self): + # verify that objects in all storage policies are cleaved + broker = self._make_broker() + # add objects in multiple policies + objects = [{'name': 'obj_%03d' % i, + 'created_at': Timestamp.now().normal, + 'content_type': 'text/plain', + 'etag': 'etag_%d' % i, + 'size': 1024 * i, + 'deleted': i % 2, + 'storage_policy_index': i % 2, + } for i in range(1, 8)] + # merge_items mutates items + broker.merge_items([dict(obj) for obj in objects]) + broker.enable_sharding(Timestamp.now()) + shard_ranges = self._make_shard_ranges( + (('', 'obj_004'), ('obj_004', '')), state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + + with self._mock_sharder() as sharder: + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + + # check shard ranges were updated to ACTIVE + self.assertEqual([ShardRange.ACTIVE] * 2, + [sr.state for sr in broker.get_shard_ranges()]) + shard_broker = ContainerBroker(expected_shard_dbs[0]) + actual_objects = shard_broker.get_objects() + self.assertEqual(objects[:4], actual_objects) + + shard_broker = ContainerBroker(expected_shard_dbs[1]) + actual_objects = shard_broker.get_objects() + self.assertEqual(objects[4:], actual_objects) + + def test_cleave_insufficient_replication(self): + # verify that if replication of a cleaved shard range fails then rows + # are not merged again to the existing 
shard db + broker = self._make_broker() + retiring_db_id = broker.get_info()['id'] + objects = [ + {'name': 'obj%03d' % i, 'created_at': next(self.ts_iter), + 'size': 1, 'content_type': 'text/plain', 'etag': 'etag', + 'deleted': 0, 'storage_policy_index': 0} + for i in range(10) + ] + broker.merge_items([dict(obj) for obj in objects]) + broker._commit_puts() + broker.enable_sharding(Timestamp.now()) + shard_bounds = (('', 'obj004'), ('obj004', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + new_object = {'name': 'alpha', 'created_at': next(self.ts_iter), + 'size': 0, 'content_type': 'text/plain', 'etag': 'etag', + 'deleted': 0, 'storage_policy_index': 0} + broker.merge_items([dict(new_object)]) + + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + orig_merge_items = ContainerBroker.merge_items + + def mock_merge_items(broker, items): + merge_items_calls.append((broker.path, + # merge mutates item so make a copy + [dict(item) for item in items])) + orig_merge_items(broker, items) + + # first shard range cleaved but fails to replicate + merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items): + with self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + return_value=(False, [False, False, True])) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + self._assert_shard_ranges_equal(shard_ranges, + broker.get_shard_ranges()) + # first shard range cleaved to shard broker + self.assertEqual([(shard_ranges[0].name, objects[:5])], + merge_items_calls) + # replication of first shard range fails - no more shards attempted + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[0], 0) + # shard broker has sync points + shard_broker = ContainerBroker(expected_shard_dbs[0]) + self.assertEqual( + [{'remote_id': retiring_db_id, 'sync_point': len(objects)}], + shard_broker.get_syncs()) + self.assertEqual(objects[:5], shard_broker.get_objects()) + + # first shard range replicates ok, no new merges required, second is + # cleaved but fails to replicate + merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items), self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, True, True]), + (False, [False, False, True])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + + broker_shard_ranges = broker.get_shard_ranges() + shard_ranges[0].object_count = 5 + shard_ranges[0].bytes_used = sum(obj['size'] for obj in objects[:5]) + shard_ranges[0].state = ShardRange.CLEAVED + self._check_shard_range(shard_ranges[0], broker_shard_ranges[0]) + # second shard range still in created state + self._assert_shard_ranges_equal([shard_ranges[1]], + 
[broker_shard_ranges[1]]) + # only second shard range rows were merged to shard db + self.assertEqual([(shard_ranges[1].name, objects[5:])], + merge_items_calls) + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[0], 0), + mock.call(0, expected_shard_dbs[1], 0)]) + # shard broker has sync points + shard_broker = ContainerBroker(expected_shard_dbs[1]) + self.assertEqual( + [{'remote_id': retiring_db_id, 'sync_point': len(objects)}], + shard_broker.get_syncs()) + self.assertEqual(objects[5:], shard_broker.get_objects()) + + # repeat - second shard range cleaves fully because its previously + # cleaved shard db no longer exists + unlink_files(expected_shard_dbs) + merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items): + with self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(True, [True, True, True]), # misplaced obj + (False, [False, True, True])]) + sharder._audit_container = mock.MagicMock() + sharder.logger = debug_logger() + sharder._process_broker(broker, node, 99) + + self.assertEqual(SHARDED, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDED, + broker.get_own_shard_range().state) + + broker_shard_ranges = broker.get_shard_ranges() + shard_ranges[1].object_count = 5 + shard_ranges[1].bytes_used = sum(obj['size'] for obj in objects[5:]) + shard_ranges[1].state = ShardRange.ACTIVE + self._check_shard_range(shard_ranges[1], broker_shard_ranges[1]) + # second shard range rows were merged to shard db again + self.assertEqual([(shard_ranges[0].name, [new_object]), + (shard_ranges[1].name, objects[5:])], + merge_items_calls) + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[0], 0), + mock.call(0, expected_shard_dbs[1], 0)]) + # first shard broker was created by misplaced object - no sync point + shard_broker = ContainerBroker(expected_shard_dbs[0]) + self.assertFalse(shard_broker.get_syncs()) + self.assertEqual([new_object], shard_broker.get_objects()) + # second shard broker has sync points + shard_broker = ContainerBroker(expected_shard_dbs[1]) + self.assertEqual( + [{'remote_id': retiring_db_id, 'sync_point': len(objects)}], + shard_broker.get_syncs()) + self.assertEqual(objects[5:], shard_broker.get_objects()) + + def test_shard_replication_quorum_failures(self): + broker = self._make_broker() + objects = [ + {'name': 'obj%03d' % i, 'created_at': next(self.ts_iter), + 'size': 1, 'content_type': 'text/plain', 'etag': 'etag', + 'deleted': 0, 'storage_policy_index': 0} + for i in range(10) + ] + broker.merge_items([dict(obj) for obj in objects]) + broker._commit_puts() + shard_bounds = (('', 'obj002'), ('obj002', 'obj004'), + ('obj004', 'obj006'), ('obj006', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CREATED) + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.enable_sharding(Timestamp.now()) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + with self._mock_sharder({'shard_replication_quorum': 3}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, True, True]), + (False, [False, False, True])]) + 
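# For illustration only: the mocked _replicate_object calls in these tests
# return a (success, responses) pair with one boolean per replica. A rough
# sketch of reducing such per-node results to a quorum decision, using the
# same quorum_size() helper the earlier cases build their success/fail
# lists with; this is a sketch, not the sharder's actual code path:
from swift.common.utils import quorum_size

def _example_reached_quorum(responses, replica_count):
    # count replicas that accepted the shard DB and compare against a
    # simple majority of the ring's replica count
    return sum(1 for ok in responses if ok) >= quorum_size(replica_count)

# with 3 replicas, two successes suffice but one does not:
assert _example_reached_quorum([False, True, True], 3)
assert not _example_reached_quorum([False, False, True], 3)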
sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + # replication of first shard range fails - no more shards attempted + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[0], 0) + self.assertEqual([ShardRange.CREATED] * 4, + [sr.state for sr in broker.get_shard_ranges()]) + + # and again with a chilled out quorum, so cleaving moves on to second + # shard range which fails to reach even chilled quorum + with self._mock_sharder({'shard_replication_quorum': 1}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, False, True]), + (False, [False, False, False])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + self.assertEqual(sharder._replicate_object.call_args_list, [ + mock.call(0, expected_shard_dbs[0], 0), + mock.call(0, expected_shard_dbs[1], 0), + ]) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CREATED, ShardRange.CREATED, + ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + + # now pretend another node successfully cleaved the second shard range, + # but this node still fails to replicate so still cannot move on + shard_ranges[1].update_state(ShardRange.CLEAVED) + broker.merge_shard_ranges(shard_ranges[1]) + with self._mock_sharder({'shard_replication_quorum': 1}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, False, False])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[1], 0) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED, + ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + + # until a super-chilled quorum is used - but even then there must have + # been an attempt to replicate + with self._mock_sharder( + {'shard_replication_quorum': 1, + 'existing_shard_replication_quorum': 0}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [])]) # maybe shard db was deleted + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[1], 0) + self.assertEqual( + [ShardRange.CLEAVED, ShardRange.CLEAVED, ShardRange.CREATED, + ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + + # next pass - the second shard replication is attempted and fails, but + # that's ok because another node has cleaved it and + # existing_shard_replication_quorum is zero + with self._mock_sharder( + {'shard_replication_quorum': 1, + 'existing_shard_replication_quorum': 0}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, False, False]), + (False, [False, True, False])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) +
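# Sketch for illustration: this test exercises two separate thresholds.
# shard_replication_quorum applies to a shard range this node is cleaving
# for the first time, while existing_shard_replication_quorum (typically
# lower, and 0 below) applies once another replica has already recorded the
# range as cleaved. A small sketch of that selection under those
# assumptions; the helper name is invented for the example:
def _example_required_quorum(already_cleaved_elsewhere,
                             shard_replication_quorum,
                             existing_shard_replication_quorum):
    # a range some other node already cleaved only needs the 'existing'
    # quorum; a freshly cleaved range needs the full quorum
    if already_cleaved_elsewhere:
        return existing_shard_replication_quorum
    return shard_replication_quorum

# with the config used below, a previously-cleaved range can progress even
# when this node's own replication attempt finds no peers to confirm:
assert _example_required_quorum(True, 1, 0) == 0
assert _example_required_quorum(False, 1, 0) == 1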
self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + self.assertEqual(sharder._replicate_object.call_args_list, [ + mock.call(0, expected_shard_dbs[1], 0), + mock.call(0, expected_shard_dbs[2], 0), + ]) + self.assertEqual([ShardRange.CLEAVED] * 3 + [ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + self.assertEqual(1, sharder.shard_replication_quorum) + self.assertEqual(0, sharder.existing_shard_replication_quorum) + + # crazy replication quorums will be capped to replica_count + with self._mock_sharder( + {'shard_replication_quorum': 99, + 'existing_shard_replication_quorum': 99}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(False, [False, True, True])]) + sharder._audit_container = mock.MagicMock() + sharder.logger = debug_logger() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[3], 0) + self.assertEqual([ShardRange.CLEAVED] * 3 + [ShardRange.CREATED], + [sr.state for sr in broker.get_shard_ranges()]) + self.assertEqual(3, sharder.shard_replication_quorum) + self.assertEqual(3, sharder.existing_shard_replication_quorum) + + # ...and progress is still made if replication fully succeeds + with self._mock_sharder( + {'shard_replication_quorum': 99, + 'existing_shard_replication_quorum': 99}) as sharder: + sharder._replicate_object = mock.MagicMock( + side_effect=[(True, [True, True, True])]) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + self.assertEqual(SHARDED, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDED, + broker.get_own_shard_range().state) + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[3], 0) + self.assertEqual([ShardRange.ACTIVE] * 4, + [sr.state for sr in broker.get_shard_ranges()]) + warnings = sharder.logger.get_lines_for_level('warning') + self.assertIn( + 'shard_replication_quorum of 99 exceeds replica count', + warnings[0]) + self.assertIn( + 'existing_shard_replication_quorum of 99 exceeds replica count', + warnings[1]) + self.assertEqual(3, sharder.shard_replication_quorum) + self.assertEqual(3, sharder.existing_shard_replication_quorum) + + def test_cleave_to_existing_shard_db(self): + # verify that when cleaving to an already existing shard db + def replicate(node, from_broker, part): + # short circuit replication + rpc = replicator.ContainerReplicatorRpc( + self.tempdir, DATADIR, ContainerBroker, mount_check=False) + + fake_repl_connection = attach_fake_replication_rpc(rpc) + with mock.patch('swift.common.db_replicator.ReplConnection', + fake_repl_connection): + with mock.patch('swift.common.db_replicator.ring.Ring', + lambda *args, **kwargs: FakeRing()): + daemon = replicator.ContainerReplicator({}) + info = from_broker.get_replication_info() + success = daemon._repl_to_node( + node, from_broker, part, info) + self.assertTrue(success) + + orig_merge_items = ContainerBroker.merge_items + + def mock_merge_items(broker, items): + # capture merge_items calls + merge_items_calls.append((broker.path, + # merge mutates item so make a copy + [dict(item) for item in items])) + orig_merge_items(broker, items) + + objects = [ + {'name': 'obj%03d' % i, 'created_at': next(self.ts_iter), + 'size': 1, 'content_type': 'text/plain', 'etag': 'etag', + 'deleted': 0, 'storage_policy_index': 0} + for i in range(10) + ] + # local 
db gets 4 objects + local_broker = self._make_broker() + local_broker.merge_items([dict(obj) for obj in objects[2:6]]) + local_broker._commit_puts() + local_retiring_db_id = local_broker.get_info()['id'] + + # remote db gets 5 objects + remote_broker = self._make_broker(device='sdb') + remote_broker.merge_items([dict(obj) for obj in objects[2:7]]) + remote_broker._commit_puts() + remote_retiring_db_id = remote_broker.get_info()['id'] + + local_node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda', + 'id': '2', 'index': 0, 'replication_ip': '1.2.3.4', + 'replication_port': 6040} + remote_node = {'ip': '1.2.3.5', 'port': 6040, 'device': 'sdb', + 'id': '3', 'index': 1, 'replication_ip': '1.2.3.5', + 'replication_port': 6040} + + # remote db replicates to local, bringing local db's total to 5 objects + self.assertNotEqual(local_broker.get_objects(), + remote_broker.get_objects()) + replicate(local_node, remote_broker, 0) + self.assertEqual(local_broker.get_objects(), + remote_broker.get_objects()) + + # local db gets 2 new objects, bringing its total to 7 + local_broker.merge_items([dict(obj) for obj in objects[1:2]]) + local_broker.merge_items([dict(obj) for obj in objects[7:8]]) + + # local db gets shard ranges + own_shard_range = local_broker.get_own_shard_range() + now = Timestamp.now() + own_shard_range.update_state(ShardRange.SHARDING, state_timestamp=now) + own_shard_range.epoch = now + shard_ranges = self._make_shard_ranges( + (('', 'obj004'), ('obj004', '')), state=ShardRange.CREATED) + local_broker.merge_shard_ranges([own_shard_range] + shard_ranges) + self.assertTrue(local_broker.set_sharding_state()) + + # local db shards + merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items): + with self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + return_value=(True, [True, True, True])) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(local_broker, local_node, 0) + + # all objects merged from local to shard ranges + self.assertEqual([(shard_ranges[0].name, objects[1:5]), + (shard_ranges[1].name, objects[5:8])], + merge_items_calls) + + # shard brokers have sync points + expected_shard_dbs = [] + for shard_range in shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + shard_broker = ContainerBroker(expected_shard_dbs[0]) + self.assertEqual( + [{'remote_id': local_retiring_db_id, 'sync_point': 7}, + {'remote_id': remote_retiring_db_id, 'sync_point': 5}], + shard_broker.get_syncs()) + self.assertEqual(objects[1:5], shard_broker.get_objects()) + shard_broker = ContainerBroker(expected_shard_dbs[1]) + self.assertEqual( + [{'remote_id': local_retiring_db_id, 'sync_point': 7}, + {'remote_id': remote_retiring_db_id, 'sync_point': 5}], + shard_broker.get_syncs()) + self.assertEqual(objects[5:8], shard_broker.get_objects()) + + # local db replicates to remote, so remote now has shard ranges + # note: no objects replicated because local is sharded + self.assertFalse(remote_broker.get_shard_ranges()) + replicate(remote_node, local_broker, 0) + self._assert_shard_ranges_equal(local_broker.get_shard_ranges(), + remote_broker.get_shard_ranges()) + + # remote db gets 3 new objects, bringing its total to 8 + remote_broker.merge_items([dict(obj) for obj in objects[:1]]) + remote_broker.merge_items([dict(obj) for obj in objects[8:]]) + + 
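# A simplified illustration, not the broker's implementation: the pass
# below merges only the new objects because the shard brokers already hold
# sync points for the retiring DBs' ids (see the get_syncs() assertions
# above). A sketch of that bookkeeping, independent of the real
# ContainerBroker sync tables:
def _example_rows_to_merge(source_db_id, source_rows, sync_points):
    # source_rows: (rowid, record) pairs from the retiring DB
    # sync_points: {remote_id: sync_point}, as reported by get_syncs()
    seen_up_to = sync_points.get(source_db_id, -1)
    # only rows beyond the recorded sync point need merging again
    return [rec for rowid, rec in source_rows if rowid > seen_up_to]

# a shard DB holding a sync point of 5 for this source merges only rows 6-8:
_rows = [(i, 'obj%03d' % i) for i in range(1, 9)]
assert _example_rows_to_merge('remote-id', _rows, {'remote-id': 5}) == \
    ['obj006', 'obj007', 'obj008']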
merge_items_calls = [] + with mock.patch('swift.container.backend.ContainerBroker.merge_items', + mock_merge_items): + with self._mock_sharder() as sharder: + sharder._replicate_object = mock.MagicMock( + return_value=(True, [True, True, True])) + sharder._audit_container = mock.MagicMock() + sharder._process_broker(remote_broker, remote_node, 0) + + # shard brokers have sync points for the remote db so only new objects + # are merged from remote broker to shard brokers + self.assertEqual([(shard_ranges[0].name, objects[:1]), + (shard_ranges[1].name, objects[8:])], + merge_items_calls) + # sync points are updated + shard_broker = ContainerBroker(expected_shard_dbs[0]) + self.assertEqual( + [{'remote_id': local_retiring_db_id, 'sync_point': 7}, + {'remote_id': remote_retiring_db_id, 'sync_point': 8}], + shard_broker.get_syncs()) + self.assertEqual(objects[:5], shard_broker.get_objects()) + shard_broker = ContainerBroker(expected_shard_dbs[1]) + self.assertEqual( + [{'remote_id': local_retiring_db_id, 'sync_point': 7}, + {'remote_id': remote_retiring_db_id, 'sync_point': 8}], + shard_broker.get_syncs()) + self.assertEqual(objects[5:], shard_broker.get_objects()) + + def _check_complete_sharding(self, account, container, shard_bounds): + broker = self._make_sharding_broker( + account=account, container=container, shard_bounds=shard_bounds) + obj = {'name': 'obj', 'created_at': next(self.ts_iter).internal, + 'size': 14, 'content_type': 'text/plain', 'etag': 'an etag', + 'deleted': 0} + broker.get_brokers()[0].merge_items([obj]) + self.assertEqual(2, len(broker.db_files)) # sanity check + + def check_not_complete(): + with self._mock_sharder() as sharder: + self.assertFalse(sharder._complete_sharding(broker)) + warning_lines = sharder.logger.get_lines_for_level('warning') + self.assertIn( + 'Repeat cleaving required for %r' % broker.db_files[0], + warning_lines[0]) + self.assertFalse(warning_lines[1:]) + sharder.logger.clear() + context = CleavingContext.load(broker) + self.assertFalse(context.cleaving_done) + self.assertFalse(context.misplaced_done) + self.assertEqual('', context.cursor) + self.assertEqual(ShardRange.SHARDING, + broker.get_own_shard_range().state) + for shard_range in broker.get_shard_ranges(): + self.assertEqual(ShardRange.CLEAVED, shard_range.state) + self.assertEqual(SHARDING, broker.get_db_state()) + + # no cleave context progress + check_not_complete() + + # cleaving_done is False + context = CleavingContext.load(broker) + self.assertEqual(1, context.max_row) + context.cleave_to_row = 1 # pretend all rows have been cleaved + context.cleaving_done = False + context.misplaced_done = True + context.store(broker) + check_not_complete() + + # misplaced_done is False + context.misplaced_done = False + context.cleaving_done = True + context.store(broker) + check_not_complete() + + # modified db max row + old_broker = broker.get_brokers()[0] + obj = {'name': 'obj', 'created_at': next(self.ts_iter).internal, + 'size': 14, 'content_type': 'text/plain', 'etag': 'an etag', + 'deleted': 1} + old_broker.merge_items([obj]) + self.assertGreater(old_broker.get_max_row(), context.max_row) + context.misplaced_done = True + context.cleaving_done = True + context.store(broker) + check_not_complete() + + # db id changes + broker.get_brokers()[0].newid('fake_remote_id') + context.cleave_to_row = 2 # pretend all rows have been cleaved, again + context.store(broker) + check_not_complete() + + # context ok + context = CleavingContext.load(broker) + context.cleave_to_row = context.max_row + 
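# For illustration: the check_not_complete cases above enumerate the
# conditions that must all hold before sharding is declared complete -
# cleaving and misplaced-object handling done, the retiring DB id
# unchanged, and no rows beyond the row cleaving covered. A compact
# restatement using a plain namespace (the db_id attribute is invented for
# this sketch; the real CleavingContext tracks the retiring DB differently):
from types import SimpleNamespace

def _example_cleaving_complete(ctx, current_max_row, current_db_id):
    return (ctx.cleaving_done and ctx.misplaced_done and
            ctx.db_id == current_db_id and           # retiring DB unchanged
            current_max_row <= ctx.cleave_to_row)    # no new rows missed

_ctx = SimpleNamespace(cleaving_done=True, misplaced_done=True,
                       db_id='abc', cleave_to_row=1)
assert _example_cleaving_complete(_ctx, current_max_row=1,
                                  current_db_id='abc')
assert not _example_cleaving_complete(_ctx, current_max_row=2,
                                      current_db_id='abc')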
context.misplaced_done = True + context.cleaving_done = True + context.store(broker) + with self._mock_sharder() as sharder: + self.assertTrue(sharder._complete_sharding(broker)) + self.assertEqual(SHARDED, broker.get_db_state()) + self.assertEqual(ShardRange.SHARDED, + broker.get_own_shard_range().state) + for shard_range in broker.get_shard_ranges(): + self.assertEqual(ShardRange.ACTIVE, shard_range.state) + warning_lines = sharder.logger.get_lines_for_level('warning') + self.assertFalse(warning_lines) + sharder.logger.clear() + return broker + + def test_complete_sharding_root(self): + broker = self._check_complete_sharding( + 'a', 'c', (('', 'mid'), ('mid', ''))) + self.assertEqual(0, broker.get_own_shard_range().deleted) + + def test_complete_sharding_shard(self): + broker = self._check_complete_sharding( + '.shards_', 'shard_c', (('l', 'mid'), ('mid', 'u'))) + self.assertEqual(1, broker.get_own_shard_range().deleted) + + def test_identify_sharding_candidate(self): + brokers = [self._make_broker(container='c%03d' % i) for i in range(6)] + for broker in brokers: + broker.set_sharding_sysmeta('Root', 'a/c') + node = {'index': 2} + # containers are all empty + with self._mock_sharder() as sharder: + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + expected_stats = {} + self._assert_stats(expected_stats, sharder, 'sharding_candidates') + + objects = [ + ['obj%3d' % i, next(self.ts_iter).internal, i, 'text/plain', + 'etag%s' % i, 0] for i in range(160)] + + # one container has 100 objects, which is below the sharding threshold + for obj in objects[:100]: + brokers[0].put_object(*obj) + conf = {'recon_cache_path': self.tempdir} + with self._mock_sharder(conf=conf) as sharder: + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + self.assertFalse(sharder.sharding_candidates) + expected_recon = { + 'found': 0, + 'top': []} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # reduce the sharding threshold and the container is reported + conf = {'shard_container_threshold': 100, + 'recon_cache_path': self.tempdir} + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now() as now: + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + stats_0 = {'path': brokers[0].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c000', + 'root': 'a/c', + 'object_count': 100, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[0].db_file).st_size} + self.assertEqual([stats_0], sharder.sharding_candidates) + expected_recon = { + 'found': 1, + 'top': [stats_0]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # repeat with handoff node and db_file error + with self._mock_sharder(conf=conf) as sharder: + with mock.patch('os.stat', side_effect=OSError('test error')): + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, {}) + stats_0_b = {'path': brokers[0].db_file, + 'node_index': None, + 'account': 'a', + 'container': 'c000', + 'root': 'a/c', + 'object_count': 100, + 'meta_timestamp': now.internal, + 'file_size': None} + self.assertEqual([stats_0_b], sharder.sharding_candidates) + self._assert_stats(expected_stats, sharder, 'sharding_candidates') + expected_recon = { + 'found': 1, + 'top': [stats_0_b]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # load up another 
container, but not to threshold for sharding, and + # verify it is never a candidate for sharding + for obj in objects[:50]: + brokers[2].put_object(*obj) + own_sr = brokers[2].get_own_shard_range() + for state in ShardRange.STATES: + own_sr.update_state(state, state_timestamp=Timestamp.now()) + brokers[2].merge_shard_ranges([own_sr]) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + with annotate_failure(state): + self.assertEqual([stats_0], sharder.sharding_candidates) + + # reduce the threshold and the second container is included + conf = {'shard_container_threshold': 50, + 'recon_cache_path': self.tempdir} + own_sr.update_state(ShardRange.ACTIVE, state_timestamp=Timestamp.now()) + brokers[2].merge_shard_ranges([own_sr]) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + stats_2 = {'path': brokers[2].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c002', + 'root': 'a/c', + 'object_count': 50, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[2].db_file).st_size} + self.assertEqual([stats_0, stats_2], sharder.sharding_candidates) + expected_recon = { + 'found': 2, + 'top': [stats_0, stats_2]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # a broker not in active state is not included + own_sr = brokers[0].get_own_shard_range() + for state in ShardRange.STATES: + if state == ShardRange.ACTIVE: + continue + own_sr.update_state(state, state_timestamp=Timestamp.now()) + brokers[0].merge_shard_ranges([own_sr]) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + with annotate_failure(state): + self.assertEqual([stats_2], sharder.sharding_candidates) + + own_sr.update_state(ShardRange.ACTIVE, state_timestamp=Timestamp.now()) + brokers[0].merge_shard_ranges([own_sr]) + + # load up a third container with 150 objects + for obj in objects[:150]: + brokers[5].put_object(*obj) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + stats_5 = {'path': brokers[5].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c005', + 'root': 'a/c', + 'object_count': 150, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[5].db_file).st_size} + self.assertEqual([stats_0, stats_2, stats_5], + sharder.sharding_candidates) + # note recon top list is sorted by size + expected_recon = { + 'found': 3, + 'top': [stats_5, stats_0, stats_2]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # restrict the number of reported candidates + conf = {'shard_container_threshold': 50, + 'recon_cache_path': self.tempdir, + 'recon_candidates_limit': 2} + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + self.assertEqual([stats_0, stats_2, stats_5], + sharder.sharding_candidates) + expected_recon = { + 'found': 3, + 'top': [stats_5, stats_0]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + # unrestrict the number of reported candidates + conf = 
{'shard_container_threshold': 50, + 'recon_cache_path': self.tempdir, + 'recon_candidates_limit': -1} + for i, broker in enumerate([brokers[1]] + brokers[3:5]): + for obj in objects[:(151 + i)]: + broker.put_object(*obj) + with self._mock_sharder(conf=conf) as sharder: + with mock_timestamp_now(now): + for broker in brokers: + sharder._identify_sharding_candidate(broker, node) + + stats_4 = {'path': brokers[4].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c004', + 'root': 'a/c', + 'object_count': 153, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[4].db_file).st_size} + stats_3 = {'path': brokers[3].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c003', + 'root': 'a/c', + 'object_count': 152, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[3].db_file).st_size} + stats_1 = {'path': brokers[1].db_file, + 'node_index': 2, + 'account': 'a', + 'container': 'c001', + 'root': 'a/c', + 'object_count': 151, + 'meta_timestamp': now.internal, + 'file_size': os.stat(brokers[1].db_file).st_size} + + self.assertEqual( + [stats_0, stats_1, stats_2, stats_3, stats_4, stats_5], + sharder.sharding_candidates) + self._assert_stats(expected_stats, sharder, 'sharding_candidates') + expected_recon = { + 'found': 6, + 'top': [stats_4, stats_3, stats_1, stats_5, stats_0, stats_2]} + sharder._report_stats() + self._assert_recon_stats( + expected_recon, sharder, 'sharding_candidates') + + def test_misplaced_objects_root_container(self): + broker = self._make_broker() + broker.enable_sharding(next(self.ts_iter)) + + objects = [ + # misplaced objects in second and third shard ranges + ['n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0], + ['there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 1], + ['where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0], + # deleted + ['x', self.ts_encoded(), 0, '', '', 1, 1], + ] + + shard_bounds = (('', 'here'), ('here', 'there'), + ('there', 'where'), ('where', 'yonder'), + ('yonder', '')) + initial_shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE) + expected_shard_dbs = [] + for shard_range in initial_shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(initial_shard_ranges) + + # unsharded + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 0, 'placed': 0, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + + # sharding - no misplaced objects + self.assertTrue(broker.set_sharding_state()) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + + # pretend we cleaved up to end of second shard range + context = CleavingContext.load(broker) + context.cursor = 'there' + context.store(broker) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + 
sharder.logger.get_increment_counts().get('misplaced_found')) + + # sharding - misplaced objects + for obj in objects: + broker.put_object(*obj) + # pretend we have not cleaved any ranges + context.cursor = '' + context.store(broker) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + self.assertFalse(os.path.exists(expected_shard_dbs[3])) + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + + # pretend we cleaved up to end of second shard range + context.cursor = 'there' + context.store(broker) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_called_once_with( + 0, expected_shard_dbs[1], 0) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # check misplaced objects were moved + self._check_objects(objects[:2], expected_shard_dbs[1]) + # ... and removed from the source db + self._check_objects(objects[2:], broker.db_file) + # ... and nothing else moved + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + self.assertFalse(os.path.exists(expected_shard_dbs[3])) + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + + # pretend we cleaved up to end of fourth shard range + context.cursor = 'yonder' + context.store(broker) + # and some new misplaced updates arrived in the first shard range + new_objects = [ + ['b', self.ts_encoded(), 10, 'text/plain', 'etag_b', 0, 0], + ['c', self.ts_encoded(), 20, 'text/plain', 'etag_c', 0, 0], + ] + for obj in new_objects: + broker.put_object(*obj) + + # check that *all* misplaced objects are moved despite exceeding + # the listing limit + with self._mock_sharder(conf={'cleave_row_batch_size': 2}) as sharder: + sharder._move_misplaced_objects(broker) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in expected_shard_dbs[2:4]], + any_order=True + ) + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check misplaced objects were moved + self._check_objects(new_objects, expected_shard_dbs[0]) + self._check_objects(objects[:2], expected_shard_dbs[1]) + self._check_objects(objects[2:3], expected_shard_dbs[2]) + self._check_objects(objects[3:], expected_shard_dbs[3]) + # ... 
and removed from the source db + self._check_objects([], broker.db_file) + self.assertFalse(os.path.exists(expected_shard_dbs[4])) + + # pretend we cleaved all ranges - sharded state + self.assertTrue(broker.set_sharded_state()) + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_not_called() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 0, 'placed': 0, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + + # and then more misplaced updates arrive + newer_objects = [ + ['a', self.ts_encoded(), 51, 'text/plain', 'etag_a', 0, 0], + ['z', self.ts_encoded(), 52, 'text/plain', 'etag_z', 0, 0], + ] + for obj in newer_objects: + broker.put_object(*obj) + broker.get_info() # force updates to be committed + # sanity check the puts landed in sharded broker + self._check_objects(newer_objects, broker.db_file) + + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) + for db in (expected_shard_dbs[0], expected_shard_dbs[-1])], + any_order=True + ) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check new misplaced objects were moved + self._check_objects(newer_objects[:1] + new_objects, + expected_shard_dbs[0]) + self._check_objects(newer_objects[1:], expected_shard_dbs[4]) + # ... and removed from the source db + self._check_objects([], broker.db_file) + # ... and other shard dbs were unchanged + self._check_objects(objects[:2], expected_shard_dbs[1]) + self._check_objects(objects[2:3], expected_shard_dbs[2]) + self._check_objects(objects[3:], expected_shard_dbs[3]) + + def _setup_misplaced_objects(self): + # make a broker with shard ranges, move it to sharded state and then + # put some misplaced objects in it + broker = self._make_broker() + shard_bounds = (('', 'here'), ('here', 'there'), + ('there', 'where'), ('where', 'yonder'), + ('yonder', '')) + initial_shard_ranges = [ + ShardRange('.shards_a/%s-%s' % (lower, upper), + Timestamp.now(), lower, upper, state=ShardRange.ACTIVE) + for lower, upper in shard_bounds + ] + expected_dbs = [] + for shard_range in initial_shard_ranges: + db_hash = hash_path(shard_range.account, shard_range.container) + expected_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(initial_shard_ranges) + objects = [ + # misplaced objects in second, third and fourth shard ranges + ['n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0], + ['there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0], + ['where', self.ts_encoded(), 100, 'text/plain', 'etag_where', 0, + 0], + # deleted + ['x', self.ts_encoded(), 0, '', '', 1, 0], + ] + broker.enable_sharding(Timestamp.now()) + self.assertTrue(broker.set_sharding_state()) + self.assertTrue(broker.set_sharded_state()) + for obj in objects: + broker.put_object(*obj) + self.assertEqual(SHARDED, broker.get_db_state()) + return broker, objects, expected_dbs + + def test_misplaced_objects_newer_objects(self): + # verify that objects merged to the db after misplaced objects have + # been identified are not removed from the db + broker, objects, 
expected_dbs = self._setup_misplaced_objects() + newer_objects = [ + ['j', self.ts_encoded(), 51, 'text/plain', 'etag_j', 0, 0], + ['k', self.ts_encoded(), 52, 'text/plain', 'etag_k', 1, 0], + ] + + calls = [] + pre_removal_objects = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + if db == expected_dbs[1]: + # put some new objects in the shard range that is being + # replicated before misplaced objects are removed from that + # range in the source db + for obj in newer_objects: + broker.put_object(*obj) + # grab a snapshot of the db contents - a side effect is + # that the newer objects are now committed to the db + pre_removal_objects.extend( + broker.get_objects()) + return True, [True, True, True] + + with self._mock_sharder(replicas=3) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + # sanity check - the newer objects were in the db before the misplaced + # object were removed + for obj in newer_objects: + self.assertIn(obj[0], [o['name'] for o in pre_removal_objects]) + for obj in objects[:2]: + self.assertIn(obj[0], [o['name'] for o in pre_removal_objects]) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... but newer objects were not removed from the source db + self._check_objects(newer_objects, broker.db_file) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + + # they will be moved on next cycle + unlink_files(expected_dbs) + with self._mock_sharder(replicas=3) as sharder: + sharder._move_misplaced_objects(broker) + + self._check_objects(newer_objects, expected_dbs[1]) + self._check_objects([], broker.db_file) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + + def test_misplaced_objects_db_id_changed(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + pre_info = broker.get_info() + calls = [] + expected_retained_objects = [] + expected_retained_objects_dbs = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + if len(calls) == 2: + broker.newid('fake_remote_id') + # grab snapshot of the objects in the broker when it changed id + expected_retained_objects.extend( + self._get_raw_object_records(broker)) + if len(calls) >= 2: + expected_retained_objects_dbs.append(db) + return True, [True, True, True] + + with self._mock_sharder(replicas=3) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + # sanity checks + self.assertNotEqual(pre_info['id'], broker.get_info()['id']) + self.assertTrue(expected_retained_objects) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... 
but objects were not removed after the source db id changed + self._check_objects(expected_retained_objects, broker.db_file) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 1, 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Refused to remove misplaced objects', lines[0]) + self.assertIn('Refused to remove misplaced objects', lines[1]) + self.assertFalse(lines[2:]) + + # they will be moved again on next cycle + unlink_files(expected_dbs) + sharder.logger.clear() + with self._mock_sharder(replicas=3) as sharder: + sharder._move_misplaced_objects(broker) + + self.assertEqual(2, len(set(expected_retained_objects_dbs))) + for db in expected_retained_objects_dbs: + if db == expected_dbs[1]: + self._check_objects(objects[:2], expected_dbs[1]) + if db == expected_dbs[2]: + self._check_objects(objects[2:3], expected_dbs[2]) + if db == expected_dbs[3]: + self._check_objects(objects[3:], expected_dbs[3]) + self._check_objects([], broker.db_file) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': len(expected_retained_objects), + 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + + def test_misplaced_objects_sufficient_replication(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + with self._mock_sharder(replicas=3) as sharder: + sharder._replicate_object.return_value = (True, [True, True, True]) + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_dbs[2:4])], + any_order=True) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # check misplaced objects were moved + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... and removed from the source db + self._check_objects([], broker.db_file) + # ... and nothing else moved + self.assertFalse(os.path.exists(expected_dbs[0])) + self.assertFalse(os.path.exists(expected_dbs[4])) + + def test_misplaced_objects_insufficient_replication_3_replicas(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + returns = {expected_dbs[1]: (True, [True, True, True]), # ok + expected_dbs[2]: (False, [True, False, False]), # < quorum + expected_dbs[3]: (False, [False, True, True])} # ok + calls = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + return returns[db] + + with self._mock_sharder(replicas=3) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # check misplaced objects were moved to shard dbs + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... 
but only removed from the source db if sufficiently replicated + self._check_objects(objects[2:3], broker.db_file) + # ... and nothing else moved + self.assertFalse(os.path.exists(expected_dbs[0])) + self.assertFalse(os.path.exists(expected_dbs[4])) + + def test_misplaced_objects_insufficient_replication_2_replicas(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + returns = {expected_dbs[1]: (True, [True, True]), # ok + expected_dbs[2]: (False, [True, False]), # ok + expected_dbs[3]: (False, [False, False])} # < quorum + calls = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + return returns[db] + + with self._mock_sharder(replicas=2) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # check misplaced objects were moved to shard dbs + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... but only removed from the source db if sufficiently replicated + self._check_objects(objects[3:], broker.db_file) + # ... and nothing else moved + self.assertFalse(os.path.exists(expected_dbs[0])) + self.assertFalse(os.path.exists(expected_dbs[4])) + + def test_misplaced_objects_insufficient_replication_4_replicas(self): + broker, objects, expected_dbs = self._setup_misplaced_objects() + + returns = {expected_dbs[1]: (False, [True, False, False, False]), + expected_dbs[2]: (True, [True, False, False, True]), + expected_dbs[3]: (False, [False, False, False, False])} + calls = [] + + def mock_replicate_object(part, db, node_id): + calls.append((part, db, node_id)) + return returns[db] + + with self._mock_sharder(replicas=4) as sharder: + sharder._replicate_object = mock_replicate_object + sharder._move_misplaced_objects(broker) + + self.assertEqual( + set([(0, db, 0) for db in (expected_dbs[1:4])]), set(calls)) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'placed': 4, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # check misplaced objects were moved to shard dbs + self._check_objects(objects[:2], expected_dbs[1]) + self._check_objects(objects[2:3], expected_dbs[2]) + self._check_objects(objects[3:], expected_dbs[3]) + # ... but only removed from the source db if sufficiently replicated + self._check_objects(objects[:2] + objects[3:], broker.db_file) + # ...
and nothing else moved + self.assertFalse(os.path.exists(expected_dbs[0])) + self.assertFalse(os.path.exists(expected_dbs[4])) + + def _check_misplaced_objects_shard_container_unsharded(self, conf=None): + broker = self._make_broker(account='.shards_a', container='.shard_c') + ts_shard = next(self.ts_iter) + own_sr = ShardRange(broker.path, ts_shard, 'here', 'where') + broker.merge_shard_ranges([own_sr]) + broker.set_sharding_sysmeta('Root', 'a/c') + self.assertEqual(own_sr, broker.get_own_shard_range()) # sanity check + self.assertEqual(UNSHARDED, broker.get_db_state()) + + objects = [ + # some of these are misplaced objects + ['b', self.ts_encoded(), 2, 'text/plain', 'etag_b', 0, 0], + ['here', self.ts_encoded(), 2, 'text/plain', 'etag_here', 0, 0], + ['n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0], + ['there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0], + ['x', self.ts_encoded(), 0, '', '', 1, 0], # deleted + ['y', self.ts_encoded(), 10, 'text/plain', 'etag_y', 0, 0], + ] + + shard_bounds = (('', 'here'), ('here', 'there'), + ('there', 'where'), ('where', '')) + root_shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE) + expected_shard_dbs = [] + for sr in root_shard_ranges: + db_hash = hash_path(sr.account, sr.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + # no objects + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_not_called() + + sharder._replicate_object.assert_not_called() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 0, 'placed': 0, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # now put objects + for obj in objects: + broker.put_object(*obj) + self._check_objects(objects, broker.db_file) # sanity check + + # NB final shard range not available + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges[:-1]) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': '', + 'end_marker': 'here\x00'}), + mock.call(broker, newest=True, params={'states': 'updating', + 'marker': 'where', + 'end_marker': ''})]) + sharder._replicate_object.assert_called_with( + 0, expected_shard_dbs[0], 0), + + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 1, 'placed': 2, 'unplaced': 2} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + # some misplaced objects could not be moved... + warning_lines = sharder.logger.get_lines_for_level('warning') + self.assertIn( + 'Failed to find destination for at least 2 misplaced objects', + warning_lines[0]) + self.assertFalse(warning_lines[1:]) + sharder.logger.clear() + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_shard_dbs[0]) + # ... and removed from the source db + self._check_objects(objects[2:], broker.db_file) + # ... 
and nothing else moved + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + self.assertFalse(os.path.exists(expected_shard_dbs[3])) + + # repeat with final shard range available + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': 'where', + 'end_marker': ''})]) + + sharder._replicate_object.assert_called_with( + 0, expected_shard_dbs[-1], 0), + + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_shard_dbs[0]) + self._check_objects(objects[4:], expected_shard_dbs[3]) + # ... and removed from the source db + self._check_objects(objects[2:4], broker.db_file) + # ... and nothing else moved + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + # repeat - no work remaining + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_not_called() + sharder._replicate_object.assert_not_called() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 0, 'placed': 0, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertFalse( + sharder.logger.get_increment_counts().get('misplaced_found')) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # and then more misplaced updates arrive + new_objects = [ + ['a', self.ts_encoded(), 51, 'text/plain', 'etag_a', 0, 0], + ['z', self.ts_encoded(), 52, 'text/plain', 'etag_z', 0, 0], + ] + for obj in new_objects: + broker.put_object(*obj) + # sanity check the puts landed in sharded broker + self._check_objects(new_objects[:1] + objects[2:4] + new_objects[1:], + broker.db_file) + + with self._mock_sharder(conf=conf) as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': '', + 'end_marker': 'here\x00'}), + mock.call(broker, newest=True, params={'states': 'updating', + 'marker': 'where', + 'end_marker': ''})]) + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) + for db in (expected_shard_dbs[0], expected_shard_dbs[3])], + any_order=True + ) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # check new misplaced objects were moved + self._check_objects(new_objects[:1] + objects[:2], + expected_shard_dbs[0]) + self._check_objects(objects[4:] + new_objects[1:], + expected_shard_dbs[3]) + # ... 
and removed from the source db + self._check_objects(objects[2:4], broker.db_file) + # ... and nothing else moved + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + def test_misplaced_objects_shard_container_unsharded(self): + self._check_misplaced_objects_shard_container_unsharded() + + def test_misplaced_objects_shard_container_unsharded_limit_two(self): + self._check_misplaced_objects_shard_container_unsharded( + conf={'cleave_row_batch_size': 2}) + + def test_misplaced_objects_shard_container_unsharded_limit_one(self): + self._check_misplaced_objects_shard_container_unsharded( + conf={'cleave_row_batch_size': 1}) + + def test_misplaced_objects_shard_container_sharding(self): + broker = self._make_broker(account='.shards_a', container='shard_c') + ts_shard = next(self.ts_iter) + # note that own_sr spans two root shard ranges + own_sr = ShardRange(broker.path, ts_shard, 'here', 'where') + own_sr.update_state(ShardRange.SHARDING) + own_sr.epoch = next(self.ts_iter) + broker.merge_shard_ranges([own_sr]) + broker.set_sharding_sysmeta('Root', 'a/c') + self.assertEqual(own_sr, broker.get_own_shard_range()) # sanity check + self.assertEqual(UNSHARDED, broker.get_db_state()) + + objects = [ + # some of these are misplaced objects + ['b', self.ts_encoded(), 2, 'text/plain', 'etag_b', 0, 0], + ['here', self.ts_encoded(), 2, 'text/plain', 'etag_here', 0, 0], + ['n', self.ts_encoded(), 2, 'text/plain', 'etag_n', 0, 0], + ['there', self.ts_encoded(), 3, 'text/plain', 'etag_there', 0, 0], + ['v', self.ts_encoded(), 10, 'text/plain', 'etag_v', 0, 0], + ['y', self.ts_encoded(), 10, 'text/plain', 'etag_y', 0, 0], + ] + + shard_bounds = (('', 'here'), ('here', 'there'), + ('there', 'where'), ('where', '')) + root_shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE) + expected_shard_dbs = [] + for sr in root_shard_ranges: + db_hash = hash_path(sr.account, sr.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + + # pretend broker is sharding but not yet cleaved a shard + self.assertTrue(broker.set_sharding_state()) + broker.merge_shard_ranges([dict(sr) for sr in root_shard_ranges[1:3]]) + # then some updates arrive + for obj in objects: + broker.put_object(*obj) + broker.get_info() + self._check_objects(objects, broker.db_file) # sanity check + + # first destination is not available + with self._mock_sharder() as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges[1:]) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': '', + 'end_marker': 'here\x00'}), + mock.call(broker, newest=True, params={'states': 'updating', + 'marker': 'where', + 'end_marker': ''})]) + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[-1], 0)], + ) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 1, 'placed': 1, 'unplaced': 2} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + warning_lines = sharder.logger.get_lines_for_level('warning') + self.assertIn( + 'Failed to find destination for at least 2 misplaced objects', + warning_lines[0]) + self.assertFalse(warning_lines[1:]) + sharder.logger.clear() + + # check some misplaced objects were moved + 
self._check_objects(objects[5:], expected_shard_dbs[3]) + # ... and removed from the source db + self._check_objects(objects[:5], broker.db_file) + self.assertFalse(os.path.exists(expected_shard_dbs[0])) + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + # normality resumes and all destinations are available + with self._mock_sharder() as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, params={'states': 'updating', + 'marker': '', + 'end_marker': 'here\x00'})] + ) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, expected_shard_dbs[0], 0)], + ) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # check misplaced objects were moved + self._check_objects(objects[:2], expected_shard_dbs[0]) + self._check_objects(objects[5:], expected_shard_dbs[3]) + # ... and removed from the source db + self._check_objects(objects[2:5], broker.db_file) + self.assertFalse(os.path.exists(expected_shard_dbs[1])) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + # pretend first shard has been cleaved + context = CleavingContext.load(broker) + context.cursor = 'there' + context.store(broker) + # and then more misplaced updates arrive + new_objects = [ + ['a', self.ts_encoded(), 51, 'text/plain', 'etag_a', 0, 0], + # this one is in the now cleaved shard range... + ['k', self.ts_encoded(), 52, 'text/plain', 'etag_k', 0, 0], + ['z', self.ts_encoded(), 53, 'text/plain', 'etag_z', 0, 0], + ] + for obj in new_objects: + broker.put_object(*obj) + broker.get_info() # force updates to be committed + # sanity check the puts landed in sharded broker + self._check_objects(sorted(new_objects + objects[2:5]), broker.db_file) + with self._mock_sharder() as sharder: + sharder._fetch_shard_ranges = mock.MagicMock( + return_value=root_shard_ranges) + sharder._move_misplaced_objects(broker) + + sharder._fetch_shard_ranges.assert_has_calls( + [mock.call(broker, newest=True, + params={'states': 'updating', 'marker': '', + 'end_marker': 'there\x00'}), + mock.call(broker, newest=True, + params={'states': 'updating', 'marker': 'where', + 'end_marker': ''})]) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_shard_dbs[0], + expected_shard_dbs[1], + expected_shard_dbs[-1])], + any_order=True + ) + + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 5, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + + # check *all* the misplaced objects were moved + self._check_objects(new_objects[:1] + objects[:2], + expected_shard_dbs[0]) + self._check_objects(new_objects[1:2] + objects[2:4], + expected_shard_dbs[1]) + self._check_objects(objects[5:] + new_objects[2:], + expected_shard_dbs[3]) + # ... 
and removed from the source db + self._check_objects(objects[4:5], broker.db_file) + self.assertFalse(os.path.exists(expected_shard_dbs[2])) + + def test_misplaced_objects_deleted_and_updated(self): + # setup + broker = self._make_broker() + broker.enable_sharding(next(self.ts_iter)) + + shard_bounds = (('', 'here'), ('here', '')) + root_shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE) + expected_shard_dbs = [] + for sr in root_shard_ranges: + db_hash = hash_path(sr.account, sr.container) + expected_shard_dbs.append( + os.path.join(self.tempdir, 'sda', 'containers', '0', + db_hash[-3:], db_hash, db_hash + '.db')) + broker.merge_shard_ranges(root_shard_ranges) + self.assertTrue(broker.set_sharding_state()) + + ts_older_internal = self.ts_encoded() # used later + # put deleted objects into source + objects = [ + ['b', self.ts_encoded(), 0, '', '', 1, 0], + ['x', self.ts_encoded(), 0, '', '', 1, 0] + ] + for obj in objects: + broker.put_object(*obj) + broker.get_info() + self._check_objects(objects, broker.db_file) # sanity check + # pretend we cleaved all ranges - sharded state + self.assertTrue(broker.set_sharded_state()) + + with self._mock_sharder() as sharder: + sharder.logger = debug_logger() + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_shard_dbs[0], + expected_shard_dbs[1])], + any_order=True + ) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'placed': 2, 'unplaced': 0} + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check new misplaced objects were moved + self._check_objects(objects[:1], expected_shard_dbs[0]) + self._check_objects(objects[1:], expected_shard_dbs[1]) + # ... and removed from the source db + self._check_objects([], broker.db_file) + + # update source db with older undeleted versions of same objects + old_objects = [ + ['b', ts_older_internal, 2, 'text/plain', 'etag_b', 0, 0], + ['x', ts_older_internal, 4, 'text/plain', 'etag_x', 0, 0] + ] + for obj in old_objects: + broker.put_object(*obj) + broker.get_info() + self._check_objects(old_objects, broker.db_file) # sanity check + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_shard_dbs[0], + expected_shard_dbs[1])], + any_order=True + ) + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check older misplaced objects were not merged to shard brokers + self._check_objects(objects[:1], expected_shard_dbs[0]) + self._check_objects(objects[1:], expected_shard_dbs[1]) + # ... 
and removed from the source db + self._check_objects([], broker.db_file) + + # the destination shard dbs for misplaced objects may already exist so + # check they are updated correctly when overwriting objects + # update source db with newer deleted versions of same objects + new_objects = [ + ['b', self.ts_encoded(), 0, '', '', 1, 0], + ['x', self.ts_encoded(), 0, '', '', 1, 0] + ] + for obj in new_objects: + broker.put_object(*obj) + broker.get_info() + self._check_objects(new_objects, broker.db_file) # sanity check + shard_broker = ContainerBroker( + expected_shard_dbs[0], account=root_shard_ranges[0].account, + container=root_shard_ranges[0].container) + # update one shard container with even newer version of object + timestamps = [next(self.ts_iter) for i in range(7)] + ts_newer = encode_timestamps( + timestamps[1], timestamps[3], timestamps[5]) + newer_object = ('b', ts_newer, 10, 'text/plain', 'etag_b', 0, 0) + shard_broker.put_object(*newer_object) + + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + sharder._replicate_object.assert_has_calls( + [mock.call(0, db, 0) for db in (expected_shard_dbs[0], + expected_shard_dbs[1])], + any_order=True + ) + self._assert_stats(expected_stats, sharder, 'misplaced') + self.assertEqual( + 1, sharder.logger.get_increment_counts()['misplaced_found']) + + # check only the newer misplaced object was moved + self._check_objects([newer_object], expected_shard_dbs[0]) + self._check_objects(new_objects[1:], expected_shard_dbs[1]) + # ... and removed from the source db + self._check_objects([], broker.db_file) + + # update source with a version of 'b' that has newer data + # but older content-type and metadata relative to shard object + ts_update = encode_timestamps( + timestamps[2], timestamps[3], timestamps[4]) + update_object = ('b', ts_update, 20, 'text/ignored', 'etag_newer', 0, + 0) + broker.put_object(*update_object) + + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + ts_expected = encode_timestamps( + timestamps[2], timestamps[3], timestamps[5]) + expected = ('b', ts_expected, 20, 'text/plain', 'etag_newer', 0, 0) + self._check_objects([expected], expected_shard_dbs[0]) + self._check_objects([], broker.db_file) + + # update source with a version of 'b' that has older data + # and content-type but newer metadata relative to shard object + ts_update = encode_timestamps( + timestamps[1], timestamps[3], timestamps[6]) + update_object = ('b', ts_update, 999, 'text/ignored', 'etag_b', 0, 0) + broker.put_object(*update_object) + + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + ts_expected = encode_timestamps( + timestamps[2], timestamps[3], timestamps[6]) + expected = ('b', ts_expected, 20, 'text/plain', 'etag_newer', 0, 0) + self._check_objects([expected], expected_shard_dbs[0]) + self._check_objects([], broker.db_file) + + # update source with a version of 'b' that has older data + # but newer content-type and metadata + ts_update = encode_timestamps( + timestamps[2], timestamps[6], timestamps[6]) + update_object = ('b', ts_update, 999, 'text/newer', 'etag_b', 0, 0) + broker.put_object(*update_object) + + with self._mock_sharder() as sharder: + sharder._move_misplaced_objects(broker) + + ts_expected = encode_timestamps( + timestamps[2], timestamps[6], timestamps[6]) + expected = ('b', ts_expected, 20, 'text/newer', 'etag_newer', 0, 0) + self._check_objects([expected], expected_shard_dbs[0]) + self._check_objects([], broker.db_file) + + def 
_setup_find_ranges(self, account, cont, lower, upper): + broker = self._make_broker(account=account, container=cont) + own_sr = ShardRange('%s/%s' % (account, cont), Timestamp.now(), + lower, upper) + broker.merge_shard_ranges([own_sr]) + broker.set_sharding_sysmeta('Root', 'a/c') + objects = [ + # some of these are misplaced objects + ['obj%3d' % i, self.ts_encoded(), i, 'text/plain', 'etag%s' % i, 0] + for i in range(100)] + for obj in objects: + broker.put_object(*obj) + return broker, objects + + def _check_find_shard_ranges_none_found(self, broker, objects): + with self._mock_sharder() as sharder: + num_found = sharder._find_shard_ranges(broker) + self.assertGreater(sharder.split_size, len(objects)) + self.assertEqual(0, num_found) + self.assertFalse(broker.get_shard_ranges()) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 0, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + with self._mock_sharder( + conf={'shard_container_threshold': 200}) as sharder: + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(sharder.split_size, len(objects)) + self.assertEqual(0, num_found) + self.assertFalse(broker.get_shard_ranges()) + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1, + 'found': 0, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + def test_find_shard_ranges_none_found_root(self): + broker, objects = self._setup_find_ranges('a', 'c', '', '') + self._check_find_shard_ranges_none_found(broker, objects) + + def test_find_shard_ranges_none_found_shard(self): + broker, objects = self._setup_find_ranges( + '.shards_a', 'c', 'lower', 'upper') + self._check_find_shard_ranges_none_found(broker, objects) + + def _check_find_shard_ranges_finds_two(self, account, cont, lower, upper): + def check_ranges(): + self.assertEqual(2, len(broker.get_shard_ranges())) + expected_ranges = [ + ShardRange( + ShardRange.make_path('.int_shards_a', 'c', cont, now, 0), + now, lower, objects[98][0], 99), + ShardRange( + ShardRange.make_path('.int_shards_a', 'c', cont, now, 1), + now, objects[98][0], upper, 1), + ] + self._assert_shard_ranges_equal(expected_ranges, + broker.get_shard_ranges()) + + # first invocation finds both ranges + broker, objects = self._setup_find_ranges( + account, cont, lower, upper) + with self._mock_sharder(conf={'shard_container_threshold': 199, + 'auto_create_account_prefix': '.int_'} + ) as sharder: + with mock_timestamp_now() as now: + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(99, sharder.split_size) + self.assertEqual(2, num_found) + check_ranges() + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 2, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + # second invocation finds none + with self._mock_sharder(conf={'shard_container_threshold': 199, + 'auto_create_account_prefix': '.int_'} + ) as sharder: + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(0, num_found) + self.assertEqual(2, len(broker.get_shard_ranges())) + check_ranges() + expected_stats = {'attempted': 0, 'success': 0, 'failure': 0, + 'found': 0, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = 
self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + def test_find_shard_ranges_finds_two_root(self): + self._check_find_shard_ranges_finds_two('a', 'c', '', '') + + def test_find_shard_ranges_finds_two_shard(self): + self._check_find_shard_ranges_finds_two('.shards_a', 'c_', 'l', 'u') + + def _check_find_shard_ranges_finds_three(self, account, cont, lower, + upper): + broker, objects = self._setup_find_ranges( + account, cont, lower, upper) + now = Timestamp.now() + expected_ranges = [ + ShardRange( + ShardRange.make_path('.shards_a', 'c', cont, now, 0), + now, lower, objects[44][0], 45), + ShardRange( + ShardRange.make_path('.shards_a', 'c', cont, now, 1), + now, objects[44][0], objects[89][0], 45), + ShardRange( + ShardRange.make_path('.shards_a', 'c', cont, now, 2), + now, objects[89][0], upper, 10), + ] + # first invocation finds 2 ranges + with self._mock_sharder( + conf={'shard_container_threshold': 90, + 'shard_scanner_batch_size': 2}) as sharder: + with mock_timestamp_now(now): + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(45, sharder.split_size) + self.assertEqual(2, num_found) + self.assertEqual(2, len(broker.get_shard_ranges())) + self._assert_shard_ranges_equal(expected_ranges[:2], + broker.get_shard_ranges()) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 2, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + # second invocation finds third shard range + with self._mock_sharder(conf={'shard_container_threshold': 199, + 'shard_scanner_batch_size': 2} + ) as sharder: + with mock_timestamp_now(now): + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(1, num_found) + self.assertEqual(3, len(broker.get_shard_ranges())) + self._assert_shard_ranges_equal(expected_ranges, + broker.get_shard_ranges()) + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0, + 'found': 1, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + # third invocation finds none + with self._mock_sharder(conf={'shard_container_threshold': 199, + 'shard_scanner_batch_size': 2} + ) as sharder: + sharder._send_shard_ranges = mock.MagicMock(return_value=True) + num_found = sharder._find_shard_ranges(broker) + self.assertEqual(0, num_found) + self.assertEqual(3, len(broker.get_shard_ranges())) + self._assert_shard_ranges_equal(expected_ranges, + broker.get_shard_ranges()) + expected_stats = {'attempted': 0, 'success': 0, 'failure': 0, + 'found': 0, 'min_time': mock.ANY, + 'max_time': mock.ANY} + stats = self._assert_stats(expected_stats, sharder, 'scanned') + self.assertGreaterEqual(stats['max_time'], stats['min_time']) + + def test_find_shard_ranges_finds_three_root(self): + self._check_find_shard_ranges_finds_three('a', 'c', '', '') + + def test_find_shard_ranges_finds_three_shard(self): + self._check_find_shard_ranges_finds_three('.shards_a', 'c_', 'l', 'u') + + def test_sharding_enabled(self): + broker = self._make_broker() + self.assertFalse(sharding_enabled(broker)) + broker.update_metadata( + {'X-Container-Sysmeta-Sharding': + ('yes', Timestamp.now().internal)}) + self.assertTrue(sharding_enabled(broker)) + # deleting broker clears sharding sysmeta + broker.delete_db(Timestamp.now().internal) + 
self.assertFalse(sharding_enabled(broker)) + # but if broker has a shard range then sharding is enabled + broker.merge_shard_ranges( + ShardRange('acc/a_shard', Timestamp.now(), 'l', 'u')) + self.assertTrue(sharding_enabled(broker)) + + def test_send_shard_ranges(self): + shard_ranges = self._make_shard_ranges((('', 'h'), ('h', ''))) + + def do_test(replicas, *resp_codes): + sent_data = defaultdict(str) + + def on_send(fake_conn, data): + sent_data[fake_conn] += data + + with self._mock_sharder(replicas=replicas) as sharder: + with mocked_http_conn(*resp_codes, give_send=on_send) as conn: + with mock_timestamp_now() as now: + res = sharder._send_shard_ranges( + 'a', 'c', shard_ranges) + + self.assertEqual(sharder.ring.replica_count, len(conn.requests)) + expected_body = json.dumps([dict(sr) for sr in shard_ranges]) + expected_headers = {'Content-Type': 'application/json', + 'Content-Length': str(len(expected_body)), + 'X-Timestamp': now.internal, + 'X-Backend-Record-Type': 'shard', + 'User-Agent': mock.ANY} + for data in sent_data.values(): + self.assertEqual(expected_body, data) + hosts = set() + for req in conn.requests: + path_parts = req['path'].split('/')[1:] + hosts.add('%s:%s/%s' % (req['ip'], req['port'], path_parts[0])) + # FakeRing only has one partition + self.assertEqual('0', path_parts[1]) + self.assertEqual('PUT', req['method']) + self.assertEqual(['a', 'c'], path_parts[-2:]) + req_headers = req['headers'] + for k, v in expected_headers.items(): + self.assertEqual(v, req_headers[k]) + self.assertTrue( + req_headers['User-Agent'].startswith('container-sharder')) + self.assertEqual(sharder.ring.replica_count, len(hosts)) + return res, sharder + + replicas = 3 + res, sharder = do_test(replicas, 202, 202, 202) + self.assertTrue(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, 202, 404) + self.assertTrue(res) + self.assertEqual([True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, 202, Exception) + self.assertTrue(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, 202, 404, 404) + self.assertFalse(res) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 500, 500, 500) + self.assertFalse(res) + self.assertEqual([True, True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, Exception, Exception, 202) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, Exception, eventlet.Timeout(), 202) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + + replicas = 2 + res, sharder = do_test(replicas, 202, 202) + self.assertTrue(res) + 
self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, 404) + self.assertTrue(res) + self.assertEqual([True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, Exception) + self.assertTrue(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, 404, 404) + self.assertFalse(res) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, Exception, Exception) + self.assertFalse(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, eventlet.Timeout(), Exception) + self.assertFalse(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + + replicas = 4 + res, sharder = do_test(replicas, 202, 202, 202, 202) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self.assertTrue(res) + res, sharder = do_test(replicas, 202, 202, 404, 404) + self.assertTrue(res) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 202, 202, Exception, Exception) + self.assertTrue(res) + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test(replicas, 202, 404, 404, 404) + self.assertFalse(res) + self.assertEqual([True, True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, 500, 500, 500, 202) + self.assertFalse(res) + self.assertEqual([True, True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('warning')]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + res, sharder = do_test(replicas, Exception, Exception, 202, 404) + self.assertFalse(res) + self.assertEqual([True], [ + all(msg in line for msg in ('Failed to put shard ranges', '404')) + for line in sharder.logger.get_lines_for_level('warning')]) + self.assertEqual([True, True], [ + 'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + res, sharder = do_test( + replicas, eventlet.Timeout(), eventlet.Timeout(), 202, 404) + self.assertFalse(res) + self.assertEqual([True], [ + all(msg in line for msg in ('Failed to put shard ranges', '404')) + for line in sharder.logger.get_lines_for_level('warning')]) + self.assertEqual([True, True], [ + 
'Failed to put shard ranges' in line for line in + sharder.logger.get_lines_for_level('error')]) + + def test_process_broker_not_sharding_no_others(self): + # verify that sharding process will not start when own shard range is + # missing or in wrong state or there are no other shard ranges + broker = self._make_broker() + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + # sanity check + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + + # no own shard range + with self._mock_sharder() as sharder: + sharder._process_broker(broker, node, 99) + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + broker.logger.clear() + + # now add own shard range + for state in sorted(ShardRange.STATES): + own_sr = broker.get_own_shard_range() # returns the default + own_sr.update_state(state) + broker.merge_shard_ranges([own_sr]) + with mock.patch.object( + broker, 'set_sharding_state') as mock_set_sharding_state: + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + with mock.patch.object(sharder, '_audit_container'): + sharder.logger = debug_logger() + sharder._process_broker(broker, node, 99) + own_shard_range = broker.get_own_shard_range( + no_default=True) + mock_set_sharding_state.assert_not_called() + self.assertEqual(dict(own_sr, meta_timestamp=now), + dict(own_shard_range)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + broker.logger.clear() + + def _check_process_broker_sharding_no_others(self, state): + # verify that when existing own_shard_range has given state and there + # are other shard ranges then the sharding process will begin + broker = self._make_broker(hash_='hash%s' % state) + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + own_sr = broker.get_own_shard_range() + self.assertTrue(own_sr.update_state(state)) + epoch = Timestamp.now() + own_sr.epoch = epoch + shard_ranges = self._make_shard_ranges((('', 'm'), ('m', ''))) + broker.merge_shard_ranges([own_sr] + shard_ranges) + + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_create_shard_containers', return_value=0): + with mock_timestamp_now() as now: + sharder._audit_container = mock.MagicMock() + sharder._process_broker(broker, node, 99) + final_own_sr = broker.get_own_shard_range(no_default=True) + + self.assertEqual(dict(own_sr, meta_timestamp=now), + dict(final_own_sr)) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(epoch.normal, parse_db_filename(broker.db_file)[1]) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + + def test_process_broker_sharding_with_own_shard_range_no_others(self): + self._check_process_broker_sharding_no_others(ShardRange.SHARDING) + self._check_process_broker_sharding_no_others(ShardRange.SHRINKING) + + def test_process_broker_not_sharding_others(self): + # verify that sharding process will not start when own shard range is + # missing or in wrong state even when other shard ranges are in the db + broker = self._make_broker() + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': 
'2', + 'index': 0} + # sanity check + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + + # add shard ranges - but not own + shard_ranges = self._make_shard_ranges((('', 'h'), ('h', ''))) + broker.merge_shard_ranges(shard_ranges) + + with self._mock_sharder() as sharder: + sharder._process_broker(broker, node, 99) + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + broker.logger.clear() + + # now add own shard range + for state in sorted(ShardRange.STATES): + if state in (ShardRange.SHARDING, + ShardRange.SHRINKING, + ShardRange.SHARDED): + epoch = None + else: + epoch = Timestamp.now() + + own_sr = broker.get_own_shard_range() # returns the default + own_sr.update_state(state) + own_sr.epoch = epoch + broker.merge_shard_ranges([own_sr]) + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._process_broker(broker, node, 99) + own_shard_range = broker.get_own_shard_range( + no_default=True) + self.assertEqual(dict(own_sr, meta_timestamp=now), + dict(own_shard_range)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + if epoch: + self.assertFalse(broker.logger.get_lines_for_level('warning')) + else: + self.assertIn('missing epoch', + broker.logger.get_lines_for_level('warning')[0]) + self.assertFalse(broker.logger.get_lines_for_level('error')) + broker.logger.clear() + + def _check_process_broker_sharding_others(self, state): + # verify states in which own_shard_range will cause sharding + # process to start when other shard ranges are in the db + broker = self._make_broker(hash_='hash%s' % state) + node = {'ip': '1.2.3.4', 'port': 6040, 'device': 'sda5', 'id': '2', + 'index': 0} + # add shard ranges - but not own + shard_ranges = self._make_shard_ranges((('', 'h'), ('h', ''))) + broker.merge_shard_ranges(shard_ranges) + # sanity check + self.assertIsNone(broker.get_own_shard_range(no_default=True)) + self.assertEqual(UNSHARDED, broker.get_db_state()) + + # now set own shard range to given state and persist it + own_sr = broker.get_own_shard_range() # returns the default + self.assertTrue(own_sr.update_state(state)) + epoch = Timestamp.now() + own_sr.epoch = epoch + broker.merge_shard_ranges([own_sr]) + with self._mock_sharder() as sharder: + + sharder.logger = debug_logger() + with mock_timestamp_now() as now: + # we're not testing rest of the process here so prevent any + # attempt to progress shard range states + sharder._create_shard_containers = lambda *args: 0 + sharder._process_broker(broker, node, 99) + own_shard_range = broker.get_own_shard_range(no_default=True) + + self.assertEqual(dict(own_sr, meta_timestamp=now), + dict(own_shard_range)) + self.assertEqual(SHARDING, broker.get_db_state()) + self.assertEqual(epoch.normal, parse_db_filename(broker.db_file)[1]) + self.assertFalse(broker.logger.get_lines_for_level('warning')) + self.assertFalse(broker.logger.get_lines_for_level('error')) + + def test_process_broker_sharding_with_own_shard_range_and_others(self): + self._check_process_broker_sharding_others(ShardRange.SHARDING) + self._check_process_broker_sharding_others(ShardRange.SHRINKING) + self._check_process_broker_sharding_others(ShardRange.SHARDED) + + def check_shard_ranges_sent(self, broker, expected_sent): + bodies = [] + + def capture_send(conn, data): + bodies.append(data) + 
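The epoch assertions in _check_process_broker_sharding_no_others and _check_process_broker_sharding_others above compare epoch.normal with parse_db_filename(broker.db_file)[1]: once sharding begins, the broker's database file name carries the epoch. A rough sketch of parsing that <hash>_<epoch>.db convention (a simplified stand-in, not Swift's parse_db_filename):

import os

# Simplified stand-in: split '<hash>_<epoch>.db' into its parts; a file
# named plain '<hash>.db' (no epoch yet) yields an epoch of None.
def parse_db_filename_sketch(db_file):
    name, ext = os.path.splitext(os.path.basename(db_file))
    db_hash, sep, epoch = name.partition('_')
    return db_hash, (epoch if sep else None), ext

print(parse_db_filename_sketch('/sda/containers/0/abc/hashabc_1234567890.12345.db'))
# ('hashabc', '1234567890.12345', '.db')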
+ with self._mock_sharder() as sharder: + with mocked_http_conn(204, 204, 204, + give_send=capture_send) as mock_conn: + sharder._update_root_container(broker) + + for req in mock_conn.requests: + self.assertEqual('PUT', req['method']) + self.assertEqual([expected_sent] * 3, + [json.loads(b) for b in bodies]) + + def test_update_root_container_own_range(self): + broker = self._make_broker() + + # nothing to send + with self._mock_sharder() as sharder: + with mocked_http_conn() as mock_conn: + sharder._update_root_container(broker) + self.assertFalse(mock_conn.requests) + + def check_only_own_shard_range_sent(state): + own_shard_range = broker.get_own_shard_range() + self.assertTrue(own_shard_range.update_state( + state, state_timestamp=next(self.ts_iter))) + broker.merge_shard_ranges([own_shard_range]) + # add an object, expect to see it reflected in the own shard range + # that is sent + broker.put_object(str(own_shard_range.object_count + 1), + next(self.ts_iter).internal, 1, '', '') + with mock_timestamp_now() as now: + # force own shard range meta updates to be at fixed timestamp + expected_sent = [ + dict(own_shard_range, + meta_timestamp=now.internal, + object_count=own_shard_range.object_count + 1, + bytes_used=own_shard_range.bytes_used + 1)] + self.check_shard_ranges_sent(broker, expected_sent) + + for state in ShardRange.STATES: + with annotate_failure(state): + check_only_own_shard_range_sent(state) + + def test_update_root_container_all_ranges(self): + broker = self._make_broker() + other_shard_ranges = self._make_shard_ranges((('', 'h'), ('h', ''))) + self.assertTrue(other_shard_ranges[0].set_deleted()) + broker.merge_shard_ranges(other_shard_ranges) + + # own range missing - send nothing + with self._mock_sharder() as sharder: + with mocked_http_conn() as mock_conn: + sharder._update_root_container(broker) + self.assertFalse(mock_conn.requests) + + def check_all_shard_ranges_sent(state): + own_shard_range = broker.get_own_shard_range() + self.assertTrue(own_shard_range.update_state( + state, state_timestamp=next(self.ts_iter))) + broker.merge_shard_ranges([own_shard_range]) + # add an object, expect to see it reflected in the own shard range + # that is sent + broker.put_object(str(own_shard_range.object_count + 1), + next(self.ts_iter).internal, 1, '', '') + with mock_timestamp_now() as now: + shard_ranges = broker.get_shard_ranges(include_deleted=True) + expected_sent = sorted([ + own_shard_range.copy( + meta_timestamp=now.internal, + object_count=own_shard_range.object_count + 1, + bytes_used=own_shard_range.bytes_used + 1)] + + shard_ranges, + key=lambda sr: (sr.upper, sr.state, sr.lower)) + self.check_shard_ranges_sent( + broker, [dict(sr) for sr in expected_sent]) + + for state in ShardRange.STATES.keys(): + with annotate_failure(state): + check_all_shard_ranges_sent(state) + + def test_audit_root_container(self): + broker = self._make_broker() + + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0} + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_audit_shard_container') as mocked: + sharder._audit_container(broker) + self._assert_stats(expected_stats, sharder, 'audit_root') + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + mocked.assert_not_called() + + def assert_overlap_warning(line, state_text): + self.assertIn( + 'Audit failed for root %s' % broker.db_file, line) + self.assertIn( + 'overlapping ranges in state %s: k-t s-z' % state_text, + line) 
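The warning that assert_overlap_warning checks for names the overlapping pair k-t and s-z from the shard_bounds merged below. A minimal sketch of detecting such an overlap from (lower, upper) bounds sorted by lower bound (illustrative only, not the sharder's audit code):

# Illustrative only: neighbouring ranges overlap when the next range's
# lower bound sorts before the previous range's upper bound.
def find_overlaps(bounds):
    overlaps = []
    ordered = sorted(bounds)
    for (lower1, upper1), (lower2, upper2) in zip(ordered, ordered[1:]):
        if lower2 < upper1:
            overlaps.append(((lower1, upper1), (lower2, upper2)))
    return overlaps

print(find_overlaps([('a', 'j'), ('k', 't'), ('s', 'z')]))
# [(('k', 't'), ('s', 'z'))]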
+ + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1} + shard_bounds = (('a', 'j'), ('k', 't'), ('s', 'z')) + for state, state_text in ShardRange.STATES.items(): + shard_ranges = self._make_shard_ranges(shard_bounds, state) + broker.merge_shard_ranges(shard_ranges) + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_audit_shard_container') as mocked: + sharder._audit_container(broker) + lines = sharder.logger.get_lines_for_level('warning') + assert_overlap_warning(lines[0], state_text) + self.assertFalse(lines[1:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self._assert_stats(expected_stats, sharder, 'audit_root') + mocked.assert_not_called() + + def assert_missing_warning(line): + self.assertIn( + 'Audit failed for root %s' % broker.db_file, line) + self.assertIn('missing range(s): -a j-k z-', line) + + own_shard_range = broker.get_own_shard_range() + states = (ShardRange.SHARDING, ShardRange.SHARDED) + for state in states: + own_shard_range.update_state( + state, state_timestamp=next(self.ts_iter)) + broker.merge_shard_ranges([own_shard_range]) + with self._mock_sharder() as sharder: + with mock.patch.object( + sharder, '_audit_shard_container') as mocked: + sharder._audit_container(broker) + lines = sharder.logger.get_lines_for_level('warning') + assert_missing_warning(lines[0]) + assert_overlap_warning(lines[0], state_text) + self.assertFalse(lines[1:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self._assert_stats(expected_stats, sharder, 'audit_root') + mocked.assert_not_called() + + def test_audit_shard_container(self): + broker = self._make_broker(account='.shards_a', container='shard_c') + broker.set_sharding_sysmeta('Root', 'a/c') + # include overlaps to verify correct match for updating own shard range + shard_bounds = ( + ('a', 'j'), ('k', 't'), ('k', 's'), ('l', 's'), ('s', 'z')) + shard_ranges = self._make_shard_ranges(shard_bounds, ShardRange.ACTIVE) + shard_ranges[1].name = broker.path + expected_stats = {'attempted': 1, 'success': 0, 'failure': 1} + + def call_audit_container(exc=None): + with self._mock_sharder() as sharder: + sharder.logger = debug_logger() + with mock.patch.object(sharder, '_audit_root_container') \ + as mocked, mock.patch.object( + sharder, 'int_client') as mock_swift: + mock_response = mock.MagicMock() + mock_response.headers = {'x-backend-record-type': + 'shard'} + mock_response.body = json.dumps( + [dict(sr) for sr in shard_ranges]) + mock_swift.make_request.return_value = mock_response + mock_swift.make_request.side_effect = exc + mock_swift.make_path = (lambda a, c: + '/v1/%s/%s' % (a, c)) + sharder.reclaim_age = 0 + sharder._audit_container(broker) + mocked.assert_not_called() + return sharder, mock_swift + + # bad account name + broker.account = 'bad_account' + sharder, mock_swift = call_audit_container() + lines = sharder.logger.get_lines_for_level('warning') + self._assert_stats(expected_stats, sharder, 'audit_shard') + self.assertIn('Audit warnings for shard %s' % broker.db_file, lines[0]) + self.assertIn('account not in shards namespace', lines[0]) + self.assertNotIn('root has no matching shard range', lines[0]) + self.assertNotIn('unable to get shard ranges from root', lines[0]) + self.assertIn('Audit failed for shard %s' % broker.db_file, lines[1]) + self.assertIn('missing own shard range', lines[1]) + self.assertFalse(lines[2:]) + self.assertFalse(broker.is_deleted()) + + # missing own shard range + broker.get_info() + sharder, mock_swift = 
call_audit_container() + lines = sharder.logger.get_lines_for_level('warning') + self._assert_stats(expected_stats, sharder, 'audit_shard') + self.assertIn('Audit failed for shard %s' % broker.db_file, lines[0]) + self.assertIn('missing own shard range', lines[0]) + self.assertNotIn('unable to get shard ranges from root', lines[0]) + self.assertFalse(lines[1:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self.assertFalse(broker.is_deleted()) + + # create own shard range, no match in root + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0} + own_shard_range = broker.get_own_shard_range() # get the default + own_shard_range.lower = 'j' + own_shard_range.upper = 'k' + broker.merge_shard_ranges([own_shard_range]) + sharder, mock_swift = call_audit_container() + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Audit warnings for shard %s' % broker.db_file, lines[0]) + self.assertNotIn('account not in shards namespace', lines[0]) + self.assertNotIn('missing own shard range', lines[0]) + self.assertIn('root has no matching shard range', lines[0]) + self.assertNotIn('unable to get shard ranges from root', lines[0]) + self._assert_stats(expected_stats, sharder, 'audit_shard') + self.assertFalse(lines[1:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self.assertFalse(broker.is_deleted()) + expected_headers = {'X-Backend-Record-Type': 'shard', + 'X-Newest': 'true', + 'X-Backend-Include-Deleted': 'True', + 'X-Backend-Override-Deleted': 'true'} + params = {'format': 'json', 'marker': 'j', 'end_marker': 'k'} + mock_swift.make_request.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + + # create own shard range, failed response from root + expected_stats = {'attempted': 1, 'success': 1, 'failure': 0} + own_shard_range = broker.get_own_shard_range() # get the default + own_shard_range.lower = 'j' + own_shard_range.upper = 'k' + broker.merge_shard_ranges([own_shard_range]) + sharder, mock_swift = call_audit_container( + exc=internal_client.UnexpectedResponse('bad', 'resp')) + lines = sharder.logger.get_lines_for_level('warning') + self.assertIn('Failed to get shard ranges', lines[0]) + self.assertIn('Audit warnings for shard %s' % broker.db_file, lines[1]) + self.assertNotIn('account not in shards namespace', lines[1]) + self.assertNotIn('missing own shard range', lines[1]) + self.assertNotIn('root has no matching shard range', lines[1]) + self.assertIn('unable to get shard ranges from root', lines[1]) + self._assert_stats(expected_stats, sharder, 'audit_shard') + self.assertFalse(lines[2:]) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self.assertFalse(broker.is_deleted()) + mock_swift.make_request.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + + def assert_ok(): + sharder, mock_swift = call_audit_container() + self.assertFalse(sharder.logger.get_lines_for_level('warning')) + self.assertFalse(sharder.logger.get_lines_for_level('error')) + self._assert_stats(expected_stats, sharder, 'audit_shard') + params = {'format': 'json', 'marker': 'k', 'end_marker': 't'} + mock_swift.make_request.assert_called_once_with( + 'GET', '/v1/a/c', expected_headers, acceptable_statuses=(2,), + params=params) + + # make own shard range match one in root, but different state + shard_ranges[1].timestamp = Timestamp.now() + broker.merge_shard_ranges([shard_ranges[1]]) + now = Timestamp.now() + 
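Illustrative aside (not part of the patch): the expected_headers and params asserted just above describe how a shard's audit asks the root container for shard ranges covering the shard's own namespace (newest copy, deleted rows included). A hedged sketch of the equivalent internal-client call; the helper name and argument plumbing are invented, while the headers, params and the make_path/make_request usage are taken from the assertions above.

import json

def get_own_ranges_from_root(int_client, root_account, root_container,
                             own_shard_range):
    # int_client is assumed to be a swift.common.internal_client.InternalClient
    headers = {'X-Backend-Record-Type': 'shard',
               'X-Newest': 'true',
               'X-Backend-Include-Deleted': 'True',
               'X-Backend-Override-Deleted': 'true'}
    params = {'format': 'json',
              'marker': str(own_shard_range.lower),
              'end_marker': str(own_shard_range.upper)}
    path = int_client.make_path(root_account, root_container)
    resp = int_client.make_request('GET', path, headers,
                                   acceptable_statuses=(2,), params=params)
    return json.loads(resp.body)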
shard_ranges[1].update_state(ShardRange.SHARDING, state_timestamp=now) + assert_ok() + self.assertFalse(broker.is_deleted()) + # own shard range state is updated from root version + own_shard_range = broker.get_own_shard_range() + self.assertEqual(ShardRange.SHARDING, own_shard_range.state) + self.assertEqual(now, own_shard_range.state_timestamp) + + own_shard_range.update_state(ShardRange.SHARDED, + state_timestamp=Timestamp.now()) + broker.merge_shard_ranges([own_shard_range]) + assert_ok() + + own_shard_range.deleted = 1 + own_shard_range.timestamp = Timestamp.now() + broker.merge_shard_ranges([own_shard_range]) + assert_ok() + self.assertTrue(broker.is_deleted()) + + def test_find_and_enable_sharding_candidates(self): + broker = self._make_broker() + broker.enable_sharding(next(self.ts_iter)) + shard_bounds = (('', 'here'), ('here', 'there'), ('there', '')) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.CLEAVED) + shard_ranges[0].state = ShardRange.ACTIVE + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + self.assertTrue(broker.set_sharded_state()) + with self._mock_sharder() as sharder: + sharder._find_and_enable_sharding_candidates(broker) + + # one range just below threshold + shard_ranges[0].update_meta(sharder.shard_container_threshold - 1, 0) + broker.merge_shard_ranges(shard_ranges[0]) + with self._mock_sharder() as sharder: + sharder._find_and_enable_sharding_candidates(broker) + self._assert_shard_ranges_equal(shard_ranges, + broker.get_shard_ranges()) + + # two ranges above threshold, only one ACTIVE + shard_ranges[0].update_meta(sharder.shard_container_threshold, 0) + shard_ranges[2].update_meta(sharder.shard_container_threshold + 1, 0) + broker.merge_shard_ranges([shard_ranges[0], shard_ranges[2]]) + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates(broker) + expected = shard_ranges[0].copy(state=ShardRange.SHARDING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal([expected] + shard_ranges[1:], + broker.get_shard_ranges()) + + # check idempotency + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates(broker) + self._assert_shard_ranges_equal([expected] + shard_ranges[1:], + broker.get_shard_ranges()) + + # two ranges above threshold, both ACTIVE + shard_ranges[2].update_state(ShardRange.ACTIVE) + broker.merge_shard_ranges(shard_ranges[2]) + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates(broker) + expected_2 = shard_ranges[2].copy(state=ShardRange.SHARDING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal( + [expected, shard_ranges[1], expected_2], broker.get_shard_ranges()) + + # check idempotency + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates(broker) + self._assert_shard_ranges_equal( + [expected, shard_ranges[1], expected_2], broker.get_shard_ranges()) + + def test_find_and_enable_sharding_candidates_bootstrap(self): + broker = self._make_broker() + with self._mock_sharder( + conf={'shard_container_threshold': 1}) as sharder: + sharder._find_and_enable_sharding_candidates(broker) + self.assertEqual(ShardRange.ACTIVE, broker.get_own_shard_range().state) + broker.put_object('obj', next(self.ts_iter).internal, 1, '', '') + self.assertEqual(1, broker.get_info()['object_count']) + with 
self._mock_sharder( + conf={'shard_container_threshold': 1}) as sharder: + with mock_timestamp_now() as now: + sharder._find_and_enable_sharding_candidates( + broker, [broker.get_own_shard_range()]) + own_sr = broker.get_own_shard_range() + self.assertEqual(ShardRange.SHARDING, own_sr.state) + self.assertEqual(now, own_sr.state_timestamp) + self.assertEqual(now, own_sr.epoch) + + # check idempotency + with self._mock_sharder( + conf={'shard_container_threshold': 1}) as sharder: + with mock_timestamp_now(): + sharder._find_and_enable_sharding_candidates( + broker, [broker.get_own_shard_range()]) + own_sr = broker.get_own_shard_range() + self.assertEqual(ShardRange.SHARDING, own_sr.state) + self.assertEqual(now, own_sr.state_timestamp) + self.assertEqual(now, own_sr.epoch) + + def test_find_and_enable_shrinking_candidates(self): + broker = self._make_broker() + broker.enable_sharding(next(self.ts_iter)) + shard_bounds = (('', 'here'), ('here', 'there'), ('there', '')) + size = (DEFAULT_SHARD_SHRINK_POINT * + DEFAULT_SHARD_CONTAINER_THRESHOLD / 100) + shard_ranges = self._make_shard_ranges( + shard_bounds, state=ShardRange.ACTIVE, object_count=size) + broker.merge_shard_ranges(shard_ranges) + self.assertTrue(broker.set_sharding_state()) + self.assertTrue(broker.set_sharded_state()) + with self._mock_sharder() as sharder: + sharder._find_and_enable_shrinking_candidates(broker) + self._assert_shard_ranges_equal(shard_ranges, + broker.get_shard_ranges()) + + # one range just below threshold + shard_ranges[0].update_meta(size - 1, 0) + broker.merge_shard_ranges(shard_ranges[0]) + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._send_shard_ranges = mock.MagicMock() + sharder._find_and_enable_shrinking_candidates(broker) + acceptor = shard_ranges[1].copy(lower=shard_ranges[0].lower) + acceptor.timestamp = now + donor = shard_ranges[0].copy(state=ShardRange.SHRINKING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal([donor, acceptor, shard_ranges[2]], + broker.get_shard_ranges()) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(acceptor.account, acceptor.container, [acceptor]), + mock.call(donor.account, donor.container, [donor, acceptor])] + ) + + # check idempotency + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + sharder._send_shard_ranges = mock.MagicMock() + sharder._find_and_enable_shrinking_candidates(broker) + self._assert_shard_ranges_equal([donor, acceptor, shard_ranges[2]], + broker.get_shard_ranges()) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(acceptor.account, acceptor.container, [acceptor]), + mock.call(donor.account, donor.container, [donor, acceptor])] + ) + + # acceptor falls below threshold - not a candidate + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + acceptor.update_meta(0, 0, meta_timestamp=now) + broker.merge_shard_ranges(acceptor) + sharder._send_shard_ranges = mock.MagicMock() + sharder._find_and_enable_shrinking_candidates(broker) + self._assert_shard_ranges_equal([donor, acceptor, shard_ranges[2]], + broker.get_shard_ranges()) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(acceptor.account, acceptor.container, [acceptor]), + mock.call(donor.account, donor.container, [donor, acceptor])] + ) + + # ...until donor has shrunk + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + donor.update_state(ShardRange.SHARDED, state_timestamp=now) + donor.set_deleted(timestamp=now) + 
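Worked example (not part of the patch): the size computed at the top of test_find_and_enable_shrinking_candidates is shard_shrink_point percent of shard_container_threshold, and a shard only becomes a shrinking donor once its object count drops below that figure. The absolute values below are assumed for illustration; the test relies only on the relationship.

DEFAULT_SHARD_CONTAINER_THRESHOLD = 1000000  # assumed value
DEFAULT_SHARD_SHRINK_POINT = 10              # assumed value, a percentage

size = DEFAULT_SHARD_SHRINK_POINT * DEFAULT_SHARD_CONTAINER_THRESHOLD // 100
# size == 100000 under these assumptions: a shard holding fewer objects than
# this is a candidate donor, and its neighbouring acceptor range is stretched
# to cover the donor's namespace (acceptor.lower = donor.lower), as the
# assertions above verify.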
broker.merge_shard_ranges(donor) + sharder._send_shard_ranges = mock.MagicMock() + sharder._find_and_enable_shrinking_candidates(broker) + new_acceptor = shard_ranges[2].copy(lower=acceptor.lower) + new_acceptor.timestamp = now + new_donor = acceptor.copy(state=ShardRange.SHRINKING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal( + [donor, new_donor, new_acceptor], + broker.get_shard_ranges(include_deleted=True)) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(new_acceptor.account, new_acceptor.container, + [new_acceptor]), + mock.call(new_donor.account, new_donor.container, + [new_donor, new_acceptor])] + ) + + # ..finally last shard shrinks to root + with self._mock_sharder() as sharder: + with mock_timestamp_now() as now: + new_donor.update_state(ShardRange.SHARDED, state_timestamp=now) + new_donor.set_deleted(timestamp=now) + new_acceptor.update_meta(0, 0, meta_timestamp=now) + broker.merge_shard_ranges([new_donor, new_acceptor]) + sharder._send_shard_ranges = mock.MagicMock() + sharder._find_and_enable_shrinking_candidates(broker) + final_donor = new_acceptor.copy(state=ShardRange.SHRINKING, + state_timestamp=now, epoch=now) + self._assert_shard_ranges_equal( + [donor, new_donor, final_donor], + broker.get_shard_ranges(include_deleted=True)) + sharder._send_shard_ranges.assert_has_calls( + [mock.call(final_donor.account, final_donor.container, + [final_donor, broker.get_own_shard_range()])] + ) + + def test_partition_and_device_filters(self): + # verify partitions and devices kwargs result in filtering of processed + # containers but not of the local device ids. + ring = FakeRing() + dev_ids = set() + container_data = [] + for dev in ring.devs: + dev_ids.add(dev['id']) + part = str(dev['id']) + broker = self._make_broker( + container='c%s' % dev['id'], hash_='c%shash' % dev['id'], + device=dev['device'], part=part) + broker.update_metadata({'X-Container-Sysmeta-Sharding': + ('true', next(self.ts_iter).internal)}) + container_data.append((broker.path, dev['id'], part)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + sharder.run_once() + self.assertEqual(dev_ids, set(sharder._local_device_ids)) + self.assertEqual(set(container_data), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + sharder.run_once(partitions='0') + self.assertEqual(dev_ids, set(sharder._local_device_ids)) + self.assertEqual(set([container_data[0]]), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + sharder.run_once(partitions='2,0') + self.assertEqual(dev_ids, set(sharder._local_device_ids)) + self.assertEqual(set([container_data[0], container_data[2]]), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + 
sharder.run_once(partitions='2,0', devices='sdc') + self.assertEqual(dev_ids, set(sharder._local_device_ids)) + self.assertEqual(set([container_data[2]]), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + with self._mock_sharder() as sharder: + sharder.ring = ring + sharder._check_node = lambda *args: True + with mock.patch.object( + sharder, '_process_broker') as mock_process_broker: + sharder.run_once(devices='sdb,sdc') + self.assertEqual(dev_ids, set(sharder._local_device_ids)) + self.assertEqual(set(container_data[1:]), + set((call[0][0].path, call[0][1]['id'], call[0][2]) + for call in mock_process_broker.call_args_list)) + + +class TestCleavingContext(BaseTestSharder): + def test_init(self): + ctx = CleavingContext(ref='test') + self.assertEqual('test', ctx.ref) + self.assertEqual('', ctx.cursor) + self.assertIsNone(ctx.max_row) + self.assertIsNone(ctx.cleave_to_row) + self.assertIsNone(ctx.last_cleave_to_row) + self.assertFalse(ctx.misplaced_done) + self.assertFalse(ctx.cleaving_done) + + def test_iter(self): + ctx = CleavingContext('test', 'curs', 12, 11, 10, False, True, 0, 4) + expected = {'ref': 'test', + 'cursor': 'curs', + 'max_row': 12, + 'cleave_to_row': 11, + 'last_cleave_to_row': 10, + 'cleaving_done': False, + 'misplaced_done': True, + 'ranges_done': 0, + 'ranges_todo': 4} + self.assertEqual(expected, dict(ctx)) + + def test_cursor(self): + broker = self._make_broker() + ref = CleavingContext._make_ref(broker) + + for curs in ('curs', u'curs\u00e4\u00fb'): + with annotate_failure('%r' % curs): + ctx = CleavingContext(ref, curs, 12, 11, 10, False, True) + self.assertEqual(curs.encode('utf8'), ctx.cursor) + ctx.store(broker) + ctx = CleavingContext.load(broker) + self.assertEqual(curs.encode('utf8'), ctx.cursor) + + def test_load(self): + broker = self._make_broker() + for i in range(6): + broker.put_object('o%s' % i, next(self.ts_iter).internal, 10, + 'text/plain', 'etag_a', 0) + + db_id = broker.get_info()['id'] + params = {'ref': db_id, + 'cursor': 'curs', + 'max_row': 2, + 'cleave_to_row': 2, + 'last_cleave_to_row': 1, + 'cleaving_done': False, + 'misplaced_done': True, + 'ranges_done': 2, + 'ranges_todo': 4} + key = 'X-Container-Sysmeta-Shard-Context-%s' % db_id + broker.update_metadata( + {key: (json.dumps(params), Timestamp.now().internal)}) + ctx = CleavingContext.load(broker) + self.assertEqual(db_id, ctx.ref) + self.assertEqual('curs', ctx.cursor) + # note max_row is dynamically updated during load + self.assertEqual(6, ctx.max_row) + self.assertEqual(2, ctx.cleave_to_row) + self.assertEqual(1, ctx.last_cleave_to_row) + self.assertTrue(ctx.misplaced_done) + self.assertFalse(ctx.cleaving_done) + self.assertEqual(2, ctx.ranges_done) + self.assertEqual(4, ctx.ranges_todo) + + def test_store(self): + broker = self._make_sharding_broker() + old_db_id = broker.get_brokers()[0].get_info()['id'] + ctx = CleavingContext(old_db_id, 'curs', 12, 11, 2, True, True, 2, 4) + ctx.store(broker) + key = 'X-Container-Sysmeta-Shard-Context-%s' % old_db_id + data = json.loads(broker.metadata[key][0]) + expected = {'ref': old_db_id, + 'cursor': 'curs', + 'max_row': 12, + 'cleave_to_row': 11, + 'last_cleave_to_row': 2, + 'cleaving_done': True, + 'misplaced_done': True, + 'ranges_done': 2, + 'ranges_todo': 4} + self.assertEqual(expected, data) + + def test_store_add_row_load(self): + # adding row to older db changes only max_row in the context + broker = self._make_sharding_broker() + old_broker = broker.get_brokers()[0] + 
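Illustrative aside (not part of the patch): test_load and test_store above fix the persistence format of a cleaving context: a JSON dict stored in the retiring DB's sysmeta under a key derived from that DB's id, which is why a changed id yields a fresh context and why max_row is refreshed from the broker at load time. A minimal sketch of seeding such a context by hand, exactly as test_load does; only the helper name is invented.

import json
from swift.common.utils import Timestamp

def seed_cleaving_context(broker, db_id, ctx_dict):
    # store the serialised context against the given DB id, timestamped now
    key = 'X-Container-Sysmeta-Shard-Context-%s' % db_id
    broker.update_metadata(
        {key: (json.dumps(ctx_dict), Timestamp.now().internal)})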
old_db_id = old_broker.get_info()['id'] + old_broker.merge_items([old_broker._record_to_dict( + ('obj', next(self.ts_iter).internal, 0, 'text/plain', 'etag', 1))]) + old_max_row = old_broker.get_max_row() + self.assertEqual(1, old_max_row) # sanity check + ctx = CleavingContext(old_db_id, 'curs', 1, 1, 0, True, True) + ctx.store(broker) + + # adding a row changes max row + old_broker.merge_items([old_broker._record_to_dict( + ('obj', next(self.ts_iter).internal, 0, 'text/plain', 'etag', 1))]) + + new_ctx = CleavingContext.load(broker) + self.assertEqual(old_db_id, new_ctx.ref) + self.assertEqual('curs', new_ctx.cursor) + self.assertEqual(2, new_ctx.max_row) + self.assertEqual(1, new_ctx.cleave_to_row) + self.assertEqual(0, new_ctx.last_cleave_to_row) + self.assertTrue(new_ctx.misplaced_done) + self.assertTrue(new_ctx.cleaving_done) + + def test_store_reclaim_load(self): + # reclaiming rows from older db does not change context + broker = self._make_sharding_broker() + old_broker = broker.get_brokers()[0] + old_db_id = old_broker.get_info()['id'] + old_broker.merge_items([old_broker._record_to_dict( + ('obj', next(self.ts_iter).internal, 0, 'text/plain', 'etag', 1))]) + old_max_row = old_broker.get_max_row() + self.assertEqual(1, old_max_row) # sanity check + ctx = CleavingContext(old_db_id, 'curs', 1, 1, 0, True, True) + ctx.store(broker) + + self.assertEqual( + 1, len(old_broker.get_objects())) + now = next(self.ts_iter).internal + broker.get_brokers()[0].reclaim(now, now) + self.assertFalse(old_broker.get_objects()) + + new_ctx = CleavingContext.load(broker) + self.assertEqual(old_db_id, new_ctx.ref) + self.assertEqual('curs', new_ctx.cursor) + self.assertEqual(1, new_ctx.max_row) + self.assertEqual(1, new_ctx.cleave_to_row) + self.assertEqual(0, new_ctx.last_cleave_to_row) + self.assertTrue(new_ctx.misplaced_done) + self.assertTrue(new_ctx.cleaving_done) + + def test_store_modify_db_id_load(self): + # changing id changes ref, so results in a fresh context + broker = self._make_sharding_broker() + old_broker = broker.get_brokers()[0] + old_db_id = old_broker.get_info()['id'] + ctx = CleavingContext(old_db_id, 'curs', 12, 11, 2, True, True) + ctx.store(broker) + + old_broker.newid('fake_remote_id') + new_db_id = old_broker.get_info()['id'] + self.assertNotEqual(old_db_id, new_db_id) + + new_ctx = CleavingContext.load(broker) + self.assertEqual(new_db_id, new_ctx.ref) + self.assertEqual('', new_ctx.cursor) + # note max_row is dynamically updated during load + self.assertEqual(-1, new_ctx.max_row) + self.assertEqual(None, new_ctx.cleave_to_row) + self.assertEqual(None, new_ctx.last_cleave_to_row) + self.assertFalse(new_ctx.misplaced_done) + self.assertFalse(new_ctx.cleaving_done) + + def test_load_modify_store_load(self): + broker = self._make_sharding_broker() + old_db_id = broker.get_brokers()[0].get_info()['id'] + ctx = CleavingContext.load(broker) + self.assertEqual(old_db_id, ctx.ref) + self.assertEqual('', ctx.cursor) # sanity check + ctx.cursor = 'curs' + ctx.misplaced_done = True + ctx.store(broker) + ctx = CleavingContext.load(broker) + self.assertEqual(old_db_id, ctx.ref) + self.assertEqual('curs', ctx.cursor) + self.assertTrue(ctx.misplaced_done) + + def test_reset(self): + ctx = CleavingContext('test', 'curs', 12, 11, 2, True, True) + + def check_context(): + self.assertEqual('test', ctx.ref) + self.assertEqual('', ctx.cursor) + self.assertEqual(12, ctx.max_row) + self.assertEqual(11, ctx.cleave_to_row) + self.assertEqual(11, ctx.last_cleave_to_row) + 
self.assertFalse(ctx.misplaced_done) + self.assertFalse(ctx.cleaving_done) + self.assertEqual(0, ctx.ranges_done) + self.assertEqual(0, ctx.ranges_todo) + ctx.reset() + check_context() + # check idempotency + ctx.reset() + check_context() + + def test_start(self): + ctx = CleavingContext('test', 'curs', 12, 11, 2, True, True) + + def check_context(): + self.assertEqual('test', ctx.ref) + self.assertEqual('', ctx.cursor) + self.assertEqual(12, ctx.max_row) + self.assertEqual(12, ctx.cleave_to_row) + self.assertEqual(2, ctx.last_cleave_to_row) + self.assertTrue(ctx.misplaced_done) # *not* reset here + self.assertFalse(ctx.cleaving_done) + self.assertEqual(0, ctx.ranges_done) + self.assertEqual(0, ctx.ranges_todo) + ctx.start() + check_context() + # check idempotency + ctx.start() + check_context() diff --git a/test/unit/proxy/controllers/test_container.py b/test/unit/proxy/controllers/test_container.py index e85e50362a..ae44f8b001 100644 --- a/test/unit/proxy/controllers/test_container.py +++ b/test/unit/proxy/controllers/test_container.py @@ -159,6 +159,91 @@ class TestContainerController(TestRingBase): for key in owner_headers: self.assertIn(key, resp.headers) + def test_reseller_admin(self): + reseller_internal_headers = { + get_sys_meta_prefix('container') + 'sharding': 'True'} + reseller_external_headers = {'x-container-sharding': 'on'} + controller = proxy_server.ContainerController(self.app, 'a', 'c') + + # Normal users, even swift owners, can't set it + req = Request.blank('/v1/a/c', method='PUT', + headers=reseller_external_headers, + environ={'swift_owner': True}) + with mocked_http_conn(*[201] * self.CONTAINER_REPLICAS) as mock_conn: + resp = req.get_response(self.app) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_internal_headers: + for captured in mock_conn.requests: + self.assertNotIn(key.title(), captured['headers']) + + req = Request.blank('/v1/a/c', method='POST', + headers=reseller_external_headers, + environ={'swift_owner': True}) + with mocked_http_conn(*[204] * self.CONTAINER_REPLICAS) as mock_conn: + resp = req.get_response(self.app) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_internal_headers: + for captured in mock_conn.requests: + self.assertNotIn(key.title(), captured['headers']) + + req = Request.blank('/v1/a/c', environ={'swift_owner': True}) + # Heck, they don't even get to know + with mock.patch('swift.proxy.controllers.base.http_connect', + fake_http_connect(200, 200, + headers=reseller_internal_headers)): + resp = controller.HEAD(req) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_external_headers: + self.assertNotIn(key, resp.headers) + + with mock.patch('swift.proxy.controllers.base.http_connect', + fake_http_connect(200, 200, + headers=reseller_internal_headers)): + resp = controller.GET(req) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_external_headers: + self.assertNotIn(key, resp.headers) + + # But reseller admins can set it + req = Request.blank('/v1/a/c', method='PUT', + headers=reseller_external_headers, + environ={'reseller_request': True}) + with mocked_http_conn(*[201] * self.CONTAINER_REPLICAS) as mock_conn: + resp = req.get_response(self.app) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_internal_headers: + for captured in mock_conn.requests: + self.assertIn(key.title(), captured['headers']) + + req = Request.blank('/v1/a/c', method='POST', + headers=reseller_external_headers, + environ={'reseller_request': True}) + with mocked_http_conn(*[204] * self.CONTAINER_REPLICAS) as mock_conn: + resp = 
req.get_response(self.app) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_internal_headers: + for captured in mock_conn.requests: + self.assertIn(key.title(), captured['headers']) + + # And see that they have + req = Request.blank('/v1/a/c', environ={'reseller_request': True}) + with mock.patch('swift.proxy.controllers.base.http_connect', + fake_http_connect(200, 200, + headers=reseller_internal_headers)): + resp = controller.HEAD(req) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_external_headers: + self.assertIn(key, resp.headers) + self.assertEqual(resp.headers[key], 'True') + + with mock.patch('swift.proxy.controllers.base.http_connect', + fake_http_connect(200, 200, + headers=reseller_internal_headers)): + resp = controller.GET(req) + self.assertEqual(2, resp.status_int // 100) + for key in reseller_external_headers: + self.assertEqual(resp.headers[key], 'True') + def test_sys_meta_headers_PUT(self): # check that headers in sys meta namespace make it through # the container controller
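Illustrative aside (not part of the patch): test_reseller_admin above pins down the behaviour around the user-facing 'x-container-sharding' header: only requests flagged as reseller_request get it translated into container sysmeta (and translated back on GET/HEAD), while swift_owner alone is not enough. A hedged sketch of that translation, not the proxy's actual code; only the header names and the sysmeta prefix helper are taken from the test.

from swift.common.request_helpers import get_sys_meta_prefix
from swift.common.utils import config_true_value

def translate_sharding_header(req):
    # illustrative only: mirrors the expectations asserted in
    # test_reseller_admin
    if not req.environ.get('reseller_request'):
        return  # header is ignored, and never echoed back, for other users
    value = req.headers.get('x-container-sharding')
    if value is not None:
        sysmeta = get_sys_meta_prefix('container') + 'sharding'
        req.headers[sysmeta] = str(config_true_value(value))  # 'on' -> 'True'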