Merge container sharding into master

Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Co-Authored-By: John Dickinson <me@not.mn>
Co-Authored-By: Kazuhiro MIYAHARA <miyahara.kazuhiro@lab.ntt.co.jp>
Co-Authored-By: Matthew Oliver <matt@oliver.net.au>
Co-Authored-By: Samuel Merritt <sam@swiftstack.com>
Co-Authored-By: Tim Burke <tim.burke@gmail.com>

Change-Id: I964666d2c1ce893326c6aa2bbe9e1dd0312e7a9e
Alistair Coles, 2018-05-18 18:24:30 -07:00 (committed by John Dickinson)
commit ea92e49980
66 changed files with 35616 additions and 806 deletions

bin/swift-container-sharder (new executable file)

@ -0,0 +1,33 @@
#!/usr/bin/env python
# Copyright (c) 2010-2015 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from swift.container.sharder import ContainerSharder
from swift.common.utils import parse_options
from swift.common.daemon import run_daemon
from optparse import OptionParser
if __name__ == '__main__':
    parser = OptionParser("%prog CONFIG [options]")
    parser.add_option('-d', '--devices',
                      help='Shard containers only on given devices. '
                           'Comma-separated list. '
                           'Only has effect if --once is used.')
    parser.add_option('-p', '--partitions',
                      help='Shard containers only in given partitions. '
                           'Comma-separated list. '
                           'Only has effect if --once is used.')
    conf_file, options = parse_options(parser=parser, once=True)
    run_daemon(ContainerSharder, conf_file, **options)
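
For illustration, the daemon can then be invoked against a container-server config, either continuously or as a single pass; the config path and device names below are hypothetical, and the --once flag is supplied by parse_options(once=True):

    swift-container-sharder /etc/swift/container-server.conf
    swift-container-sharder /etc/swift/container-server.conf --once --devices sdb1,sdb2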


@ -27,3 +27,13 @@ rsync_module = {replication_ip}::container{replication_port}
[container-auditor]
[container-sync]
[container-sharder]
auto_shard = true
rsync_module = {replication_ip}::container{replication_port}
# This is intentionally much smaller than the default of 1,000,000 so tests
# can run in a reasonable amount of time
shard_container_threshold = 100
# The probe tests make explicit assumptions about the batch sizes
shard_scanner_batch_size = 10
cleave_batch_size = 2
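
These probe-test values scale sharding down so it completes quickly. A rough sketch of how the settings interact (illustrative only; deriving the per-shard row target by halving the threshold is an assumption about the sharder's defaults, not taken from this config):

    # Illustrative relationships between the settings above, not Swift code.
    shard_container_threshold = 100                  # object count at which a container
                                                     # becomes a sharding candidate
    rows_per_shard = shard_container_threshold // 2  # assumed target rows per shard range
    shard_scanner_batch_size = 10                    # shard ranges found per scanner pass
    cleave_batch_size = 2                            # shard ranges cleaved per sharder pass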

(The same [container-sharder] section is added to each of the other three probe-test container-server configs.)

@ -0,0 +1,24 @@
[DEFAULT]
[pipeline:main]
pipeline = catch_errors proxy-logging cache symlink proxy-server
[app:proxy-server]
use = egg:swift#proxy
account_autocreate = true
# See proxy-server.conf-sample for options
[filter:symlink]
use = egg:swift#symlink
# See proxy-server.conf-sample for options
[filter:cache]
use = egg:swift#memcache
# See proxy-server.conf-sample for options
[filter:proxy-logging]
use = egg:swift#proxy_logging
[filter:catch_errors]
use = egg:swift#catch_errors
# See proxy-server.conf-sample for options
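
For orientation, a request traverses the pipeline above from left to right before reaching the app; this is a sketch of the flow, not additional configuration:

    # client
    #   -> catch_errors     turn unhandled exceptions into error responses
    #   -> proxy-logging    access logging
    #   -> cache            memcache client used by other components
    #   -> symlink          resolve symlink objects
    #   -> proxy-server     the proxy application itself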


@ -24,6 +24,16 @@ Container Backend
    :undoc-members:
    :show-inheritance:

.. _container-replicator:

Container Replicator
====================

.. automodule:: swift.container.replicator
    :members:
    :undoc-members:
    :show-inheritance:

.. _container-server:

Container Server
@ -44,12 +54,12 @@ Container Reconciler
    :undoc-members:
    :show-inheritance:

-.. _container-replicator:
+.. _container-sharder:

-Container Replicator
-====================
+Container Sharder
+=================

-.. automodule:: swift.container.replicator
+.. automodule:: swift.container.sharder
    :members:
    :undoc-members:
    :show-inheritance:

[Four image diffs suppressed (files too large): new images of 80 KiB, 86 KiB, 67 KiB and 71 KiB.]

@ -0,0 +1,649 @@
[New SVG diagram "sharding_snip5.svg" (630×235, 29 KiB): the account "/acct" holding container "cont", above the hidden account "/.shards_acct" holding shard containers "cont-568d8e-<ts>-0" and "cont-750ed3-<ts>-1", which are linked by arrows to "cont".]

[Image diff suppressed (lines too long): new image of 72 KiB.]

@ -0,0 +1,259 @@
[New SVG diagram "sharding_snip2.svg" (630×120, 13 KiB): the account "/acct" holding container "cont", with red dashed lines marking split points at the object names "cat" and "giraffe".]

[Two image diffs suppressed (files too large): new images of 66 KiB and 65 KiB.]

@ -0,0 +1,199 @@
[New SVG diagram "sharding_snip1.svg" (630×120, 10 KiB): the account "/acct" holding a single container "cont".]

@ -0,0 +1,219 @@
[New SVG diagram "sharding_lock1.svg" (642×139, 11 KiB): the account "/acct" holding container "cont", with two arrows drawn between "cont" and points above the account box.]

View File

@ -62,6 +62,7 @@ Overview and Concepts
overview_erasure_code
overview_encryption
overview_backing_store
overview_container_sharding
ring_background
ring_partpower
associated_projects

View File

@ -105,6 +105,7 @@ RL :ref:`ratelimit`
VW :ref:`versioned_writes`
SSC :ref:`copy`
SYM :ref:`symlink`
SH :ref:`sharding_doc`
======================= =============================

View File

@ -172,6 +172,8 @@ replicator for Replication type policies. See :doc:`overview_erasure_code`
for complete information on both Erasure Code support as well as the
reconstructor.
.. _architecture_updaters:
--------
Updaters
--------

View File

@ -0,0 +1,784 @@
.. _sharding_doc:
==================
Container Sharding
==================
Container sharding is an operator controlled feature that may be used to shard
very large container databases into a number of smaller shard containers.
.. note::
Container sharding is currently an experimental feature. It is strongly
recommended that operators gain experience of sharding containers in a
non-production cluster before using it in production.
The sharding process involves moving all sharding container database
records via the container replication engine; the time taken to complete
sharding is dependent upon the existing cluster load and the performance of
the container database being sharded.
There is currently no documented process for reversing the sharding
process once sharding has been enabled.
----------
Background
----------
The metadata for each container in Swift is stored in an SQLite database. This
metadata includes: information about the container such as its name,
modification time and current object count; user metadata that may have been
written
to the container by clients; a record of every object in the container. The
container database object records are used to generate container listings in
response to container GET requests; each object record stores the object's
name, size, hash and content-type as well as associated timestamps.
As the number of objects in a container increases, so does the number of object
records in the container database. Eventually the container database
performance starts to degrade and the time taken to update an object record
increases. This can result in object updates timing out, with a corresponding
increase in the backlog of pending :ref:`asynchronous updates
<architecture_updaters>` on object servers. Container databases are typically
replicated on several nodes and any database performance degradation can also
result in longer :doc:`container replication <overview_replication>` times.
The point at which container database performance starts to degrade depends
upon the choice of hardware in the container ring. Anecdotal evidence suggests
that containers with tens of millions of object records have noticeably
degraded performance.
This performance degradation can be avoided by ensuring that clients use an
object naming scheme that disperses objects across a number of containers
thereby distributing load across a number of container databases. However, that
is not always desirable nor is it under the control of the cluster operator.
Swift's container sharding feature provides the operator with a mechanism to
distribute the load on a single client-visible container across multiple,
hidden, shard containers, each of which stores a subset of the container's
object records. Clients are unaware of container sharding; clients continue to
use the same API to access a container that, if sharded, maps to a number of
shard containers within the Swift cluster.
------------------------
Deployment and operation
------------------------
Upgrade Considerations
----------------------
It is essential that all servers in a Swift cluster have been upgraded to
support the container sharding feature before attempting to shard a container.
Identifying containers in need of sharding
------------------------------------------
Container sharding is currently initiated by the ``swift-manage-shard-ranges``
CLI tool :ref:`described below <swift-manage-shard-ranges>`. Operators must
first identify containers that are candidates for sharding. To assist with
this, the :ref:`sharder_daemon` inspects the size of containers that it visits
and writes a list of sharding candidates to recon cache. For example::
"sharding_candidates": {
"found": 1,
"top": [
{
"account": "AUTH_test",
"container": "c1",
"file_size": 497763328,
"meta_timestamp": "1525346445.31161",
"node_index": 2,
"object_count": 3349028,
"path": <path_to_db>,
"root": "AUTH_test/c1"
}
]
}
A container is considered to be a sharding candidate if its object count is
greater than or equal to the ``shard_container_threshold`` option.
The number of candidates reported is limited to a number configured by the
``recon_candidates_limit`` option such that only the largest candidate
containers are included in the ``sharding_candidates`` data.
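The recon cache is a JSON file that can be read directly. As a minimal sketch,
assuming the default ``recon_cache_path`` of ``/var/cache/swift``::
import json
with open('/var/cache/swift/container.recon') as fd:
    recon = json.load(fd)
candidates = recon.get('sharding_candidates', {})
print('found: %d' % candidates.get('found', 0))
for entry in candidates.get('top', []):
    print('%(root)s: %(object_count)d objects' % entry)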
.. _swift-manage-shard-ranges:
``swift-manage-shard-ranges`` CLI tool
--------------------------------------
The ``swift-manage-shard-ranges`` tool provides commands for initiating
sharding of a container. ``swift-manage-shard-ranges`` operates directly on a
container database file.
.. note::
``swift-manage-shard-ranges`` must only be used on one replica of a
container database to avoid inconsistent results. The modifications made by
``swift-manage-shard-ranges`` will be automatically copied to other
replicas of the container database via normal replication processes.
There are three steps in the process of initiating sharding, each of which may
be performed in isolation or, as shown below, using a single command.
#. The ``find`` sub-command scans the container database to identify how many
shard containers will be required and which objects they will manage. Each
shard container manages a range of the object namespace defined by a
``lower`` and ``upper`` bound. The maximum number of objects to be allocated
to each shard container is specified on the command line. For example::
$ swift-manage-shard-ranges <path_to_db> find 500000
Loaded db broker for AUTH_test/c1.
[
{
"index": 0,
"lower": "",
"object_count": 500000,
"upper": "o_01086834"
},
{
"index": 1,
"lower": "o_01086834",
"object_count": 500000,
"upper": "o_01586834"
},
{
"index": 2,
"lower": "o_01586834",
"object_count": 500000,
"upper": "o_02087570"
},
{
"index": 3,
"lower": "o_02087570",
"object_count": 500000,
"upper": "o_02587572"
},
{
"index": 4,
"lower": "o_02587572",
"object_count": 500000,
"upper": "o_03087572"
},
{
"index": 5,
"lower": "o_03087572",
"object_count": 500000,
"upper": "o_03587572"
},
{
"index": 6,
"lower": "o_03587572",
"object_count": 349194,
"upper": ""
}
]
Found 7 ranges in 4.37222s (total object count 3349194)
This command returns a list of shard ranges each of which describes the
namespace to be managed by a shard container. No other action is taken by
this command and the container database is unchanged. The output may be
redirected to a file for subsequent retrieval by the ``replace`` command.
For example::
$ swift-manage-shard-ranges <path_to_db> find 500000 > my_shard_ranges
Loaded db broker for AUTH_test/c1.
Found 7 ranges in 2.448s (total object count 3349194)
#. The ``replace`` sub-command deletes any shard ranges that might already be
in the container database and inserts shard ranges from a given file. The
file contents should be in the format generated by the ``find`` sub-command.
For example::
$ swift-manage-shard-ranges <path_to_db> replace my_shard_ranges
Loaded db broker for AUTH_test/c1.
No shard ranges found to delete.
Injected 7 shard ranges.
Run container-replicator to replicate them to other nodes.
Use the enable sub-command to enable sharding.
The container database is modified to store the shard ranges, but the
container will not start sharding until sharding is enabled. The ``info``
sub-command may be used to inspect the state of the container database at
any point, and the ``show`` sub-command may be used to display the inserted
shard ranges.
Shard ranges stored in the container database may be replaced using the
``replace`` sub-command. This will first delete all existing shard ranges
before storing new shard ranges. Shard ranges may also be deleted from the
container database using the ``delete`` sub-command.
Shard ranges should not be replaced or deleted using
``swift-manage-shard-ranges`` once the next step of enabling sharding has
been taken.
#. The ``enable`` sub-command enables the container for sharding. The sharder
daemon and/or container replicator daemon will replicate shard ranges to
other replicas of the container db and the sharder daemon will proceed to
shard the container. This process may take some time depending on the size
of the container, the number of shard ranges and the underlying hardware.
.. note::
Once the ``enable`` sub-command has been used there is no supported
mechanism to revert sharding. Do not use ``swift-manage-shard-ranges`` to
make any further changes to the shard ranges in the container db.
For example::
$ swift-manage-shard-ranges <path_to_db> enable
Loaded db broker for AUTH_test/c1.
Container moved to state 'sharding' with epoch 1525345093.22908.
Run container-sharder on all nodes to shard the container.
This does not shard the container - sharding is performed by the
:ref:`sharder_daemon` - but sets the necessary state in the database for the
daemon to subsequently start the sharding process.
The ``epoch`` value displayed in the output is the time at which sharding
was enabled. When the :ref:`sharder_daemon` starts sharding this container
it creates a new container database file using the epoch in the filename to
distinguish it from the retiring DB that is being sharded.
All three steps may be performed with one sub-command::
$ swift-manage-shard-ranges <path_to_db> find_and_replace 500000 --enable --force
Loaded db broker for AUTH_test/c1.
No shard ranges found to delete.
Injected 7 shard ranges.
Run container-replicator to replicate them to other nodes.
Container moved to state 'sharding' with epoch 1525345669.46153.
Run container-sharder on all nodes to shard the container.
.. _sharder_daemon:
``container-sharder`` daemon
----------------------------
Once sharding has been enabled for a container, the act of sharding is
performed by the :ref:`container-sharder`. The :ref:`container-sharder` daemon
must be running on all container servers. The ``container-sharder`` daemon
periodically visits each container database to perform any container sharding
tasks that are required.
The ``container-sharder`` daemon requires a ``[container-sharder]`` config
section to exist in the container server configuration file; a sample config
section is shown in the `container-server.conf-sample` file.
.. note::
Several of the ``[container-sharder]`` config options are only significant
when the ``auto_shard`` option is enabled. This option enables the
``container-sharder`` daemon to automatically identify containers that are
candidates for sharding and initiate the sharding process, instead of using
the ``swift-manage-shard-ranges`` tool. The ``auto_shard`` option is
currently NOT recommended for production systems and should be set to
``false`` (the default value).
The container sharder uses an internal client and therefore requires an
internal client configuration file to exist. By default the internal-client
configuration file is expected to be found at
`/etc/swift/internal-client.conf`. An alternative location for the
configuration file may be specified using the ``internal_client_conf_path``
option in the ``[container-sharder]`` config section.
The content of the internal-client configuration file should be the same as the
`internal-client.conf-sample` file. In particular, the internal-client
configuration should have::
account_autocreate = True
in the ``[proxy-server]`` section.
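As a minimal sketch, an internal-client configuration might look like the
following; the `internal-client.conf-sample` file should be treated as the
authoritative reference::
[DEFAULT]
[pipeline:main]
pipeline = catch_errors proxy-logging cache proxy-server
[app:proxy-server]
use = egg:swift#proxy
account_autocreate = true
[filter:cache]
use = egg:swift#memcache
[filter:proxy-logging]
use = egg:swift#proxy_logging
[filter:catch_errors]
use = egg:swift#catch_errors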
A container database may require several visits by the ``container-sharder``
daemon before it is fully sharded. On each visit the ``container-sharder``
daemon will move a subset of object records to new shard containers by cleaving
new shard container databases from the original. By default, two shards are
processed per visit; this number may be configured by the ``cleave_batch_size``
option.
The ``container-sharder`` daemon periodically writes progress data for
containers that are being sharded to recon cache. For example::
"sharding_in_progress": {
"all": [
{
"account": "AUTH_test",
"active": 0,
"cleaved": 2,
"container": "c1",
"created": 5,
"db_state": "sharding",
"error": null,
"file_size": 26624,
"found": 0,
"meta_timestamp": "1525349617.46235",
"node_index": 1,
"object_count": 3349030,
"path": <path_to_db>,
"root": "AUTH_test/c1",
"state": "sharding"
}
]
}
This example indicates that from a total of 7 shard ranges, 2 have been cleaved
whereas 5 remain in created state waiting to be cleaved.
Shard containers are created in an internal account and not visible to clients.
By default, shard containers for an account ``AUTH_test`` are created in the
internal account ``.shards_AUTH_test``.
Once a container has started sharding, object updates to that container may be
redirected to the shard container. The ``container-sharder`` daemon is also
responsible for sending updates of a shard's object count and bytes_used to the
original container so that aggregate object count and bytes used values can be
returned in responses to client requests.
.. note::
The ``container-sharder`` daemon must continue to run on all container
servers in order for shard object stats updates to be generated.
--------------
Under the hood
--------------
Terminology
-----------
================== ==================================================
Name Description
================== ==================================================
Root container The original container that lives in the
user's account. It holds references to its
shard containers.
Retiring DB The original database file that is to be sharded.
Fresh DB A database file that will replace the retiring
database.
Shard range A range of the object namespace defined by a lower
bound and an upper bound.
Shard container A container that holds object records for a shard
range. Shard containers exist in a hidden account
mirroring the user's account.
Misplaced objects Items that don't belong in a container's shard
range. These will be moved to their correct
location by the container-sharder.
Cleaving The act of moving object records within a shard
range to a shard container database.
Shrinking The act of merging a small shard container into
another shard container in order to delete the
small shard container.
Donor The shard range that is shrinking away.
Acceptor The shard range into which a donor is merged.
================== ==================================================
Finding shard ranges
--------------------
The end goal of sharding a container is to replace the original container
database which has grown very large with a number of shard container databases,
each of which is responsible for storing a range of the entire object
namespace. The first step towards achieving this is to identify an appropriate
set of contiguous object namespaces, known as shard ranges, each of which
contains a similar sized portion of the container's current object content.
Shard ranges cannot simply be selected by sharding the namespace uniformly,
because object names are not guaranteed to be distributed uniformly. If the
container were naively sharded into two shard ranges, one containing all
object names up to `m` and the other containing all object names beyond `m`,
then if all object names actually start with `o` the outcome would be an
extremely unbalanced pair of shard containers.
It is also too simplistic to assume that every container that requires sharding
can be sharded into two. This might be the goal in the ideal world, but in
practice there will be containers that have grown very large and should be
sharded into many shards. Furthermore, the time required to find the exact
mid-point of the existing object names in a large SQLite database would
increase with container size.
For these reasons, shard ranges of size `N` are found by searching for the
`Nth` object in the database table, sorted by object name, and then searching
for the `(2 * N)th` object, and so on until all objects have been searched. For
a container that has exactly `2N` objects, the end result is the same as
sharding the container at the midpoint of its object names. In practice
sharding would typically be enabled for containers with greater than `2N`
objects, in which case more than two shard ranges will be found, the last one
probably containing fewer than `N` objects. With containers having large
multiples of `N` objects, shard ranges can be identified in batches, which
enables a more scalable solution.
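The following is a simplified sketch of that search using plain SQL against a
container DB's object table; the real implementation lives in the container
broker and additionally accounts for existing shard ranges and batching::
import sqlite3
def find_shard_bounds(db_path, rows_per_shard):
    # Select every Nth undeleted object name, in sorted order, to act as
    # a shard range boundary; the final range is open-ended (upper '').
    conn = sqlite3.connect(db_path)
    bounds, offset = [], rows_per_shard - 1
    while True:
        row = conn.execute(
            'SELECT name FROM object WHERE deleted = 0 '
            'ORDER BY name LIMIT 1 OFFSET ?', (offset,)).fetchone()
        if row is None:
            break
        bounds.append(row[0])
        offset += rows_per_shard
    return bounds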
To illustrate this process, consider a very large container in a user account
``acct`` that is a candidate for sharding:
.. image:: images/sharding_unsharded.svg
The :ref:`swift-manage-shard-ranges` tool ``find`` sub-command searches the
object table for the `Nth` object whose name will become the upper bound of the
first shard range, and the lower bound of the second shard range. The lower
bound of the first shard range is the empty string.
For the purposes of this example the first upper bound is `cat`:
.. image:: images/sharding_scan_basic.svg
:ref:`swift-manage-shard-ranges` continues to search the container to find
further shard ranges, with the final upper bound also being the empty string.
Enabling sharding
-----------------
Once shard ranges have been found the :ref:`swift-manage-shard-ranges`
``replace`` sub-command is used to insert them into the `shard_ranges` table
of the container database. In addition to its lower and upper bounds, each
shard range is given a name. The name takes the form ``a/c`` where ``a`` is an
account name formed by prefixing the user account with the string
``.shards_``, and ``c`` is a container name that is derived from the original
container and includes the index of the shard range. The final container name
for the shard range uses the pattern of ``{original container name}-{hash of
parent container}-{timestamp}-{shard index}``.
The ``enable`` sub-command then creates some final state required to initiate
sharding the container, including a special shard range record referred to as
the container's `own_shard_range` whose name is equal to the container's path.
This is used to keep a record of the object namespace that the container
covers, which for user containers is always the entire namespace.
The :class:`~swift.common.utils.ShardRange` class
-------------------------------------------------
The :class:`~swift.common.utils.ShardRange` class provides methods for
interacting with the attributes and state of a shard range. The class
encapsulates the following properties:
* The name of the shard range which is also the name of the shard container
used to hold object records in its namespace.
* Lower and upper bounds which define the object namespace of the shard range.
* A deleted flag.
* A timestamp at which the bounds and deleted flag were last modified.
* The object stats for the shard range i.e. object count and bytes used.
* A timestamp at which the object stats were last modified.
* The state of the shard range, and an epoch, which is the timestamp used in
the shard container's database file name.
* A timestamp at which the state and epoch were last modified.
A shard range progresses through the following states:
* FOUND: the shard range has been identified in the container that is to be
sharded but no resources have been created for it.
* CREATED: A shard container has been created to store the contents of the
shard range.
* CLEAVED: the sharding container's contents for the shard range have been
copied to the shard container from *at least one replica* of the sharding
container.
* ACTIVE: shard ranges move to this state when all shard ranges in a sharding
container have been cleaved.
* SHRINKING: the shard range has been enabled for shrinking; or
* SHARDING: the shard range has been enabled for sharding.
* SHARDED: the shard range has completed sharding or shrinking.
.. note::
Shard range state represents the most advanced state of the shard range on
any replica of the container. For example, a shard range in CLEAVED state
may not have completed cleaving on all replicas but has cleaved on at least
one replica.
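As a hedged sketch, a shard range might be constructed and updated as follows;
the name and bounds here are hypothetical::
from swift.common.utils import ShardRange, Timestamp
sr = ShardRange('.shards_AUTH_test/c1-0-1525345093.22908-1',
                Timestamp.now(), lower='cat', upper='giraffe')
sr.update_meta(500000, 2 ** 30)      # object count and bytes used
sr.update_state(ShardRange.CREATED)  # advance the lifecycle state
# bounds are (lower, upper]: 'cat' is excluded, 'giraffe' is included
print('cat' in sr, 'dog' in sr, 'hippo' in sr)  # False True False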
Fresh and retiring database files
---------------------------------
As alluded to earlier, writing to a large container causes increased latency
for the container servers. Once sharding has been initiated on a container it
is desirable to stop writing to the large database; ultimately it will be
unlinked. This is primarily achieved by redirecting object updates to new shard
containers as they are created (see :ref:`redirecting_updates` below), but some
object updates may still need to be accepted by the root container and other
container metadata must still be modifiable.
To render the large `retiring` database effectively read-only, when the
:ref:`sharder_daemon` finds a container with a set of shard range records,
including an `own_shard_range`, it first creates a fresh database file which
will ultimately replace the existing `retiring` database. For a retiring db
whose filename is::
<hash>.db
the fresh database file name is of the form::
<hash>_<epoch>.db
where epoch is a timestamp stored in the container's `own_shard_range`.
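For example, a retiring DB named ``dbhash.db`` whose `own_shard_range` has
epoch ``1525345093.22908`` would be joined by a fresh DB named
``dbhash_1525345093.22908.db`` (the hash shown here is a placeholder).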
The fresh DB has a copy of the shard ranges table from the retiring DB and all
other container metadata apart from the object records. Once a fresh DB file
has been created it is used to store any new object updates and no more object
records are written to the retiring DB file.
Once the sharding process has completed, the retiring DB file will be unlinked
leaving only the fresh DB file in the container's directory. There are
therefore three states that the container DB directory may be in during the
sharding process: UNSHARDED, SHARDING and SHARDED.
.. image:: images/sharding_db_states.svg
If the container ever shrinks to the point that it has no shards then the fresh
DB starts to store object records, behaving the same as an unsharded container.
This is known as the COLLAPSED state.
In summary, the DB states that any container replica may be in are:
- UNSHARDED - In this state there is just one standard container database. All
containers are originally in this state.
- SHARDING - There are now two databases, the retiring database and a fresh
database. The fresh database stores any metadata, container level stats,
an object holding table, and a table that stores shard ranges.
- SHARDED - There is only one database, the fresh database, which has one or
more shard ranges in addition to its own shard range. The retiring database
has been unlinked.
- COLLAPSED - There is only one database, the fresh database, which has only
its own shard range and stores object records.
.. note::
DB state is unique to each replica of a container and is not necessarily
synchronised with shard range state.
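The DB state of an individual replica can be inspected with the ``info``
sub-command of :ref:`swift-manage-shard-ranges`, or programmatically, as in
this minimal sketch (the DB path is hypothetical)::
from swift.container.backend import ContainerBroker
broker = ContainerBroker('/srv/node/sda1/containers/.../dbhash.db')
# returns 'unsharded', 'sharding', 'sharded' or 'collapsed'
print(broker.get_db_state())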
Creating shard containers
-------------------------
The :ref:`sharder_daemon` next creates a shard container for each shard range
using the shard range name as the name of the shard container:
.. image:: /images/sharding_cleave_basic.svg
Shard containers now exist with a unique name and are placed in a hidden account
that maps to the user account (`.shards_acct`). This avoids namespace
collisions and also keeps all the shard containers out of view from users of
the account. Each shard container has an `own_shard_range` record which has the
lower and upper bounds of the object namespace for which it is responsible, and
a reference to the sharding user container, which is referred to as the
`root_container`. Unlike the `root_container`, the shard container's
`own_shard_range` does not cover the entire namespace.
Cleaving shard containers
-------------------------
Having created empty shard containers the sharder daemon will proceed to cleave
objects from the retiring database to each shard range. Cleaving occurs in
batches of two (by default) shard ranges, so if a container has more than two
shard ranges then the daemon must visit it multiple times to complete cleaving.
To cleave a shard range the daemon creates a shard database for the shard
container on a local device. This device may be one of the shard container's
primary nodes but often it will not. Object records from the corresponding
shard range namespace are then copied from the retiring DB to this shard DB.
Swift's container replication mechanism is then used to replicate the shard DB
to its primary nodes. Checks are made to ensure that the new shard container DB
has been replicated to a sufficient number of its primary nodes before it is
considered to have been successfully cleaved. By default the daemon requires
successful replication of a new shard broker to at least a quorum of the
container ring's replica count, but this requirement can be tuned using the
``shard_replication_quorum`` option.
Once a shard range has been successfully cleaved from a retiring database the
daemon transitions its state to ``CLEAVED``. It should be noted that this state
transition occurs as soon as any one of the retiring DB replicas has cleaved
the shard range, and therefore does not imply that all retiring DB replicas
have cleaved that range. The significance of the state transition is that the
shard container is now considered suitable for contributing to object listings,
since its contents are present on a quorum of its primary nodes and are the
same as at least one of the retiring DBs for that namespace.
Once a shard range is in the ``CLEAVED`` state, the requirement for
'successful' cleaving of other instances of the retiring DB may optionally be
relaxed since it is not so imperative that their contents are replicated
*immediately* to their primary nodes. The ``existing_shard_replication_quorum``
option can be used to reduce the quorum required for a cleaved shard range to
be considered successfully replicated by the sharder daemon.
.. note::
Once cleaved, shard container DBs will continue to be replicated by the
normal `container-replicator` daemon so that they will eventually be fully
replicated to all primary nodes regardless of any replication quorum options
used by the sharder daemon.
The cleaving progress of each replica of a retiring DB must be
tracked independently of the shard range state. This is done using a per-DB
CleavingContext object that maintains a cleaving cursor for the retiring DB
that it is associated with. The cleaving cursor is simply the upper bound of
the last shard range to have been cleaved *from that particular retiring DB*.
Each CleavingContext is stored in the sharding container's sysmeta under a key
that is the ``id`` of the retiring DB. Since all container DB files have unique
``id``s, this guarantees that each retiring DB will have a unique
CleavingContext. Furthermore, if the retiring DB file is changed, for example
by an rsync_then_merge replication operation which might change the contents of
the DB's object table, then it will get a new unique CleavingContext.
A CleavingContext maintains other state that is used to ensure that a retiring
DB is only considered to be fully cleaved, and ready to be deleted, if *all* of
its object rows have been cleaved to a shard range.
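As a hedged sketch, the context for a sharding container can be inspected
programmatically, assuming ``broker`` is an open ``ContainerBroker`` for that
container::
from swift.container.sharder import CleavingContext
ctx = CleavingContext.load(broker)
print(ctx.cursor)  # upper bound of the last range cleaved from this DB
print(dict(ctx))   # the full context as stored in sysmeta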
Once all shard ranges have been cleaved from the retiring DB it is deleted. The
container is now represented by the fresh DB which has a table of shard range
records that point to the shard containers that store the container's object
records.
.. _redirecting_updates:
Redirecting object updates
--------------------------
Once a shard container exists, object updates arising from new client requests
and async pending files are directed to the shard container instead of the root
container. This takes load off of the root container.
For a sharded (or partially sharded) container, when the proxy receives a new
object request it issues a GET request to the container for data describing a
shard container to which the object update should be sent. The proxy then
annotates the object request with the shard container location so that the
object server will forward object updates to the shard container. If those
updates fail then the async pending file that is written on the object server
contains the shard container location.
When the object updater processes async pending files for previously failed
object updates, it may not find a shard container location. In this case the
updater sends the update to the `root container`, which returns a redirection
response with the shard container location.
.. note::
Object updates are directed to shard containers as soon as they exist, even
if the retiring DB object records have not yet been cleaved to the shard
container. This prevents further writes to the retiring DB and also avoids
the fresh DB being polluted by new object updates. The goal is to
ultimately have all object records in the shard containers and none in the
root container.
Building container listings
---------------------------
Listing requests for a sharded container are handled by querying the shard
containers for components of the listing. The proxy forwards the client listing
request to the root container, as it would for an unsharded container, but the
container server responds with a list of shard ranges rather than objects. The
proxy then queries each shard container in namespace order for their listing,
until either the listing length limit is reached or all shard ranges have been
listed.
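A much simplified sketch of that assembly logic follows; the real proxy code
also handles markers, reversed listings and failed shard queries, and
``get_shard_listing`` is a hypothetical helper that performs a GET to one
shard container::
def build_listing(shard_ranges, get_shard_listing, limit=10000):
    listing = []
    for sr in shard_ranges:  # shard ranges sorted in namespace order
        remaining = limit - len(listing)
        if remaining <= 0:
            break
        listing.extend(get_shard_listing(sr, limit=remaining))
    return listing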
While a container is still in the process of sharding, only *cleaved* shard
ranges are used when building a container listing. Shard ranges that have not
yet cleaved will not have any object records from the root container. The root
container continues to provide listings for the uncleaved part of its
namespace.
.. note::
New object updates are redirected to shard containers that have not yet been
cleaved. These updates will therefore not be included in container listings
until their shard range has been cleaved.
Example request redirection
---------------------------
As an example, consider a sharding container in which 3 shard ranges have been
found ending in cat, giraffe and igloo. Their respective shard containers have
been created so update requests for objects up to "igloo" are redirected to the
appropriate shard container. The root DB continues to handle listing requests
and update requests for any object name beyond "igloo".
.. image:: images/sharding_scan_load.svg
The sharder daemon cleaves objects from the retiring DB to the shard range DBs;
it also moves any misplaced objects from the root container's fresh DB to the
shard DB. Cleaving progress is represented by the blue line. Once the first
shard range has been cleaved listing requests for that namespace are directed
to the shard container. The root container still provides listings for the
remainder of the namespace.
.. image:: images/sharding_cleave1_load.svg
The process continues: the sharder cleaves the next range and a new range is
found with upper bound of "linux". Now the root container only needs to handle
listing requests up to "giraffe" and update requests for objects whose name is
greater than "linux". Load will continue to diminish on the root DB and be
dispersed across the shard DBs.
.. image:: images/sharding_cleave2_load.svg
Container replication
---------------------
Shard range records are replicated between container DB replicas in much the
same way as object records are for unsharded containers. However, the usual
replication of object records between replicas of a container is halted as soon
as a container is capable of being sharded. Instead, object records are moved
to their new locations in shard containers. This avoids unnecessary replication
traffic between container replicas.
To facilitate this, shard ranges are both 'pushed' and 'pulled' during
replication, prior to any attempt to replicate objects. This means that the
node initiating replication learns about shard ranges from the destination node
early during the replication process and is able to skip object replication if
it discovers that it has shard ranges and is able to shard.
.. note::
When the destination DB for container replication is missing then the
'complete_rsync' replication mechanism is still used and, in this case only,
both object records and shard range records are copied to the destination
node.
Container deletion
------------------
Sharded containers may be deleted by a ``DELETE`` request just like an
unsharded container. A sharded container must be empty before it can be deleted,
which implies that all of its shard containers must have reported that they are
empty.
Shard containers are *not* immediately deleted when their root container is
deleted; the shard containers remain undeleted so that they are able to
continue to receive object updates that might arrive after the root container
has been deleted. Shard containers continue to update their deleted root
container with their object stats. If a shard container does receive object
updates that cause it to no longer be empty then the root container will no
longer be considered deleted once that shard container sends an object stats
update.
Sharding a shard container
--------------------------
A shard container may grow to a size that requires it to be sharded.
``swift-manage-shard-ranges`` may be used to identify shard ranges within a
shard container and enable sharding in the same way as for a root container.
When a shard is sharding it notifies the root of its shard ranges so that the
root can start to redirect object updates to the new 'sub-shards'. When the
shard has completed sharding, it deletes its own shard range record from the
root container's shard ranges table. At this point the root is aware of all the
new sub-shards, which collectively cover the namespace of the now-deleted
shard.
There is no hierarchy of shards beyond the root and its immediate shards. When
a shard shards, its sub-shards are effectively re-parented with the root
container.
Shrinking a shard container
---------------------------
A shard's contents may reduce to a point where the shard is no longer required.
If this happens then the shard may be shrunk into another shard range.
Shrinking is achieved in a similar way to sharding: an 'acceptor' shard range
is written to the shrinking shard container's shard ranges table; unlike
sharding, where shard ranges each cover a subset of the sharding container's
namespace, the acceptor shard range is a superset of the shrinking shard range.
Once given an acceptor shard range the shrinking shard will cleave itself to
its acceptor, and then delete itself from the root container shard ranges
table.

View File

@ -69,6 +69,10 @@ bind_port = 6201
# Work only with ionice_class.
# ionice_class =
# ionice_priority =
#
# The prefix used for hidden auto-created accounts, for example accounts in
# which shard containers are created. Defaults to '.'.
# auto_create_account_prefix = .
[pipeline:main]
pipeline = healthcheck recon container-server
@ -323,3 +327,117 @@ use = egg:swift#xprofile
#
# unwind the iterator of applications
# unwind = false
[container-sharder]
# You can override the default log routing for this app here (don't use set!):
# log_name = container-sharder
# log_facility = LOG_LOCAL0
# log_level = INFO
# log_address = /dev/log
#
# Container sharder specific settings
#
# If the auto_shard option is true then the sharder will automatically select
# containers to shard, scan for shard ranges, and select shards to shrink.
# The default is false.
# Warning: auto-sharding is still under development and should not be used in
# production; do not set this option to true in a production cluster.
# auto_shard = false
#
# When auto-sharding is enabled shard_container_threshold defines the object
# count at which a container with container-sharding enabled will start to
# shard. shard_container_threshold also indirectly determines the initial
# nominal size of shard containers, which is shard_container_threshold // 2, as
# well as determining the thresholds for shrinking and merging shard
# containers.
# shard_container_threshold = 1000000
#
# When auto-sharding is enabled shard_shrink_point defines the object count
# below which a 'donor' shard container will be considered for shrinking into
# another 'acceptor' shard container. shard_shrink_point is a percentage of
# shard_container_threshold e.g. the default value of 5 means 5% of the
# shard_container_threshold.
# shard_shrink_point = 5
#
# When auto-sharding is enabled shard_shrink_merge_point defines the maximum
# allowed size of an acceptor shard container after having a donor merged into
# it. Shard_shrink_merge_point is a percentage of shard_container_threshold.
# e.g. the default value of 75 means that the projected sum of a donor object
# count and acceptor count must be less than 75% of shard_container_threshold
# for the donor to be allowed to merge into the acceptor.
#
# For example, if the shard_container_threshold is 1 million,
# shard_shrink_point is 5, and shard_shrink_merge_point is 75 then a shard will
# be considered for shrinking if it has less than or equal to 50 thousand
# objects but will only merge into an acceptor if the combined object count
# would be less than or equal to 750 thousand objects.
# shard_shrink_merge_point = 75
#
# When auto-sharding is enabled shard_scanner_batch_size defines the maximum
# number of shard ranges that will be found each time the sharder daemon visits
# a sharding container. If necessary the sharder daemon will continue to search
# for more shard ranges each time it visits the container.
# shard_scanner_batch_size = 10
#
# cleave_batch_size defines the number of shard ranges that will be cleaved
# each time the sharder daemon visits a sharding container.
# cleave_batch_size = 2
#
# cleave_row_batch_size defines the size of batches of object rows read from a
# sharding container and merged to a shard container during cleaving.
# cleave_row_batch_size = 10000
#
# Defines the number of successfully replicated shard dbs required when
# cleaving a previously uncleaved shard range before the sharder will progress
# to the next shard range. The value should be less than or equal to the
# container ring replica count. The default of 'auto' causes the container ring
# quorum value to be used. This option only applies to the container-sharder
# replication and does not affect the number of shard container replicas that
# will eventually be replicated by the container-replicator.
# shard_replication_quorum = auto
#
# Defines the number of successfully replicated shard dbs required when
# cleaving a shard range that has been previously cleaved on another node
# before the sharder will progress to the next shard range. The value should be
# less than or equal to the container ring replica count. The default of 'auto'
# causes the shard_replication_quorum value to be used. This option only
# applies to the container-sharder replication and does not affect the number
# of shard container replicas that will eventually be replicated by the
# container-replicator.
# existing_shard_replication_quorum = auto
#
# The sharder uses an internal client to create and make requests to
# containers. The absolute path to the client config file can be configured.
# internal_client_conf_path = /etc/swift/internal-client.conf
#
# The number of times the internal client will retry requests.
# request_tries = 3
#
# Each time the sharder dumps stats to the recon cache file it includes a list
# of containers that appear to need sharding but are not yet sharding. By
# default this list is limited to the top 5 containers, ordered by object
# count. The limit may be changed by setting recon_candidates_limit to an
# integer value. A negative value implies no limit.
# recon_candidates_limit = 5
#
# Large databases tend to take a while to work with, but we want to make sure
# we write down our progress. Use a larger-than-normal broker timeout to make
# us less likely to bomb out on a LockTimeout.
# broker_timeout = 60
#
# Time in seconds to wait between sharder cycles
# interval = 30
#
# The container-sharder accepts the following configuration options as defined
# in the container-replicator section:
#
# per_diff = 1000
# max_diffs = 100
# concurrency = 8
# node_timeout = 10
# conn_timeout = 0.5
# reclaim_age = 604800
# rsync_compress = no
# rsync_module = {replication_ip}::container
# recon_cache_path = /var/cache/swift
#

View File

@ -36,6 +36,7 @@ scripts =
bin/swift-container-info
bin/swift-container-replicator
bin/swift-container-server
bin/swift-container-sharder
bin/swift-container-sync
bin/swift-container-updater
bin/swift-container-reconciler
@ -71,6 +72,9 @@ keystone =
keystonemiddleware>=4.17.0
[entry_points]
console_scripts =
swift-manage-shard-ranges = swift.cli.manage_shard_ranges:main
paste.app_factory =
proxy = swift.proxy.server:app_factory
object = swift.obj.server:app_factory

View File

@ -22,7 +22,7 @@ import six.moves.cPickle as pickle
import sqlite3
from swift.common.utils import Timestamp
from swift.common.db import DatabaseBroker, utf8encode
from swift.common.db import DatabaseBroker, utf8encode, zero_like
DATADIR = 'accounts'
@ -233,7 +233,7 @@ class AccountBroker(DatabaseBroker):
with self.get() as conn:
row = conn.execute(
'SELECT container_count from account_stat').fetchone()
return (row[0] == 0)
return zero_like(row[0])
def make_tuple_for_pickle(self, record):
return (record['name'], record['put_timestamp'],
@ -254,7 +254,7 @@ class AccountBroker(DatabaseBroker):
:param storage_policy_index: the storage policy for this container
"""
if Timestamp(delete_timestamp) > Timestamp(put_timestamp) and \
object_count in (None, '', 0, '0'):
zero_like(object_count):
deleted = 1
else:
deleted = 0
@ -273,8 +273,7 @@ class AccountBroker(DatabaseBroker):
:returns: True if the DB is considered to be deleted, False otherwise
"""
return status == 'DELETED' or (
container_count in (None, '', 0, '0') and
return status == 'DELETED' or zero_like(container_count) and (
Timestamp(delete_timestamp) > Timestamp(put_timestamp))
def _is_deleted(self, conn):
@ -509,7 +508,7 @@ class AccountBroker(DatabaseBroker):
record[2] = row[2]
# If deleted, mark as such
if Timestamp(record[2]) > Timestamp(record[1]) and \
record[3] in (None, '', 0, '0'):
zero_like(record[3]):
record[5] = 1
else:
record[5] = 0

View File

@ -298,6 +298,27 @@ def print_db_info_metadata(db_type, info, metadata, drop_prefixes=False):
else:
print('No user metadata found in db file')
if db_type == 'container':
print('Sharding Metadata:')
shard_type = 'root' if info['is_root'] else 'shard'
print(' Type: %s' % shard_type)
print(' State: %s' % info['db_state'])
if info.get('shard_ranges'):
print('Shard Ranges (%d):' % len(info['shard_ranges']))
for srange in info['shard_ranges']:
srange = dict(srange, state_text=srange.state_text)
print(' Name: %(name)s' % srange)
print(' lower: %(lower)r, upper: %(upper)r' % srange)
print(' Object Count: %(object_count)d, Bytes Used: '
'%(bytes_used)d, State: %(state_text)s (%(state)d)'
% srange)
print(' Created at: %s (%s)'
% (Timestamp(srange['timestamp']).isoformat,
srange['timestamp']))
print(' Meta Timestamp: %s (%s)'
% (Timestamp(srange['meta_timestamp']).isoformat,
srange['meta_timestamp']))
def print_obj_metadata(metadata, drop_prefixes=False):
"""
@ -406,7 +427,13 @@ def print_info(db_type, db_file, swift_dir='/etc/swift', stale_reads_ok=False,
raise InfoSystemExit()
raise
account = info['account']
container = info['container'] if db_type == 'container' else None
container = None
if db_type == 'container':
container = info['container']
info['is_root'] = broker.is_root_container()
sranges = broker.get_shard_ranges()
if sranges:
info['shard_ranges'] = sranges
print_db_info_metadata(db_type, info, broker.metadata, drop_prefixes)
try:
ring = Ring(swift_dir, ring_name=db_type)

View File

@ -0,0 +1,370 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from __future__ import print_function
import argparse
import json
import sys
import time
from six.moves import input
from swift.common.utils import Timestamp, get_logger, ShardRange
from swift.container.backend import ContainerBroker, UNSHARDED
from swift.container.sharder import make_shard_ranges, sharding_enabled, \
CleavingContext
def _load_and_validate_shard_data(args):
try:
with open(args.input, 'rb') as fd:
try:
data = json.load(fd)
if not isinstance(data, list):
raise ValueError('Shard data must be a list of dicts')
for k in ('lower', 'upper', 'index', 'object_count'):
for shard in data:
shard[k]
return data
except (TypeError, ValueError, KeyError) as err:
print('Failed to load valid shard range data: %r' % err,
file=sys.stderr)
exit(2)
except IOError as err:
print('Failed to open file %s: %s' % (args.input, err),
file=sys.stderr)
exit(2)
def _check_shard_ranges(own_shard_range, shard_ranges):
reasons = []
def reason(x, y):
if x != y:
reasons.append('%s != %s' % (x, y))
if not shard_ranges:
reasons.append('No shard ranges.')
else:
reason(own_shard_range.lower, shard_ranges[0].lower)
reason(own_shard_range.upper, shard_ranges[-1].upper)
for x, y in zip(shard_ranges, shard_ranges[1:]):
reason(x.upper, y.lower)
if reasons:
print('WARNING: invalid shard ranges: %s.' % reasons)
print('Aborting.')
exit(2)
def _check_own_shard_range(broker, args):
# TODO: this check is weak - if the shards prefix changes then we may not
# identify a shard container. The goal is to not inadvertently create an
# entire namespace default shard range for a shard container.
is_shard = broker.account.startswith(args.shards_account_prefix)
own_shard_range = broker.get_own_shard_range(no_default=is_shard)
if not own_shard_range:
print('WARNING: shard container missing own shard range.')
print('Aborting.')
exit(2)
return own_shard_range
def _find_ranges(broker, args, status_file=None):
start = last_report = time.time()
limit = 5 if status_file else -1
shard_data, last_found = broker.find_shard_ranges(
args.rows_per_shard, limit=limit)
if shard_data:
while not last_found:
if last_report + 10 < time.time():
print('Found %d ranges in %gs; looking for more...' % (
len(shard_data), time.time() - start), file=status_file)
last_report = time.time()
# prefix doesn't matter since we aren't persisting it
found_ranges = make_shard_ranges(broker, shard_data, '.shards_')
more_shard_data, last_found = broker.find_shard_ranges(
args.rows_per_shard, existing_ranges=found_ranges, limit=5)
shard_data.extend(more_shard_data)
return shard_data, time.time() - start
def find_ranges(broker, args):
shard_data, delta_t = _find_ranges(broker, args, sys.stderr)
print(json.dumps(shard_data, sort_keys=True, indent=2))
print('Found %d ranges in %gs (total object count %s)' %
(len(shard_data), delta_t,
sum(r['object_count'] for r in shard_data)),
file=sys.stderr)
return 0
def show_shard_ranges(broker, args):
shard_ranges = broker.get_shard_ranges(
include_deleted=getattr(args, 'include_deleted', False))
shard_data = [dict(sr, state=sr.state_text)
for sr in shard_ranges]
if not shard_data:
print("No shard data found.", file=sys.stderr)
elif getattr(args, 'brief', False):
print("Existing shard ranges:", file=sys.stderr)
print(json.dumps([(sd['lower'], sd['upper']) for sd in shard_data],
sort_keys=True, indent=2))
else:
print("Existing shard ranges:", file=sys.stderr)
print(json.dumps(shard_data, sort_keys=True, indent=2))
return 0
def db_info(broker, args):
print('Sharding enabled = %s' % sharding_enabled(broker))
own_sr = broker.get_own_shard_range(no_default=True)
print('Own shard range: %s' %
(json.dumps(dict(own_sr, state=own_sr.state_text),
sort_keys=True, indent=2)
if own_sr else None))
db_state = broker.get_db_state()
print('db_state = %s' % db_state)
if db_state == 'sharding':
print('Retiring db id: %s' % broker.get_brokers()[0].get_info()['id'])
print('Cleaving context: %s' %
json.dumps(dict(CleavingContext.load(broker)),
sort_keys=True, indent=2))
print('Metadata:')
for k, (v, t) in broker.metadata.items():
print(' %s = %s' % (k, v))
def delete_shard_ranges(broker, args):
shard_ranges = broker.get_shard_ranges()
if not shard_ranges:
print("No shard ranges found to delete.")
return 0
while not args.force:
print('This will delete existing %d shard ranges.' % len(shard_ranges))
if broker.get_db_state() != UNSHARDED:
print('WARNING: Be very cautious about deleting existing shard '
'ranges. Deleting all ranges in this db does not guarantee '
'deletion of all ranges on all replicas of the db.')
print(' - this db is in state %s' % broker.get_db_state())
print(' - %d existing shard ranges have started sharding' %
[sr.state != ShardRange.FOUND
for sr in shard_ranges].count(True))
choice = input('Do you want to show the existing ranges [s], '
'delete the existing ranges [yes] '
'or quit without deleting [q]? ')
if choice == 's':
show_shard_ranges(broker, args)
continue
elif choice == 'q':
return 1
elif choice == 'yes':
break
else:
print('Please make a valid choice.')
print()
now = Timestamp.now()
for sr in shard_ranges:
sr.deleted = 1
sr.timestamp = now
broker.merge_shard_ranges(shard_ranges)
print('Deleted %s existing shard ranges.' % len(shard_ranges))
return 0
def _replace_shard_ranges(broker, args, shard_data, timeout=None):
own_shard_range = _check_own_shard_range(broker, args)
shard_ranges = make_shard_ranges(
broker, shard_data, args.shards_account_prefix)
_check_shard_ranges(own_shard_range, shard_ranges)
if args.verbose > 0:
print('New shard ranges to be injected:')
print(json.dumps([dict(sr) for sr in shard_ranges],
sort_keys=True, indent=2))
# Crank up the timeout in an effort to *make sure* this succeeds
with broker.updated_timeout(max(timeout, args.replace_timeout)):
delete_shard_ranges(broker, args)
broker.merge_shard_ranges(shard_ranges)
print('Injected %d shard ranges.' % len(shard_ranges))
print('Run container-replicator to replicate them to other nodes.')
if args.enable:
return enable_sharding(broker, args)
else:
print('Use the enable sub-command to enable sharding.')
return 0
def replace_shard_ranges(broker, args):
shard_data = _load_and_validate_shard_data(args)
return _replace_shard_ranges(broker, args, shard_data)
def find_replace_shard_ranges(broker, args):
shard_data, delta_t = _find_ranges(broker, args, sys.stdout)
# Since we're trying to one-shot this, and the previous step probably
# took a while, make the timeout for writing *at least* that long
return _replace_shard_ranges(broker, args, shard_data, timeout=delta_t)
def _enable_sharding(broker, own_shard_range, args):
if own_shard_range.update_state(ShardRange.SHARDING):
own_shard_range.epoch = Timestamp.now()
own_shard_range.state_timestamp = own_shard_range.epoch
with broker.updated_timeout(args.enable_timeout):
broker.merge_shard_ranges([own_shard_range])
broker.update_metadata({'X-Container-Sysmeta-Sharding':
('True', Timestamp.now().normal)})
return own_shard_range
def enable_sharding(broker, args):
own_shard_range = _check_own_shard_range(broker, args)
_check_shard_ranges(own_shard_range, broker.get_shard_ranges())
if own_shard_range.state == ShardRange.ACTIVE:
own_shard_range = _enable_sharding(broker, own_shard_range, args)
print('Container moved to state %r with epoch %s.' %
(own_shard_range.state_text, own_shard_range.epoch.internal))
elif own_shard_range.state == ShardRange.SHARDING:
if own_shard_range.epoch:
print('Container already in state %r with epoch %s.' %
(own_shard_range.state_text, own_shard_range.epoch.internal))
print('No action required.')
else:
print('Container already in state %r but missing epoch.' %
own_shard_range.state_text)
own_shard_range = _enable_sharding(broker, own_shard_range, args)
print('Container in state %r given epoch %s.' %
(own_shard_range.state_text, own_shard_range.epoch.internal))
else:
print('WARNING: container in state %s (should be active or sharding).'
% own_shard_range.state_text)
print('Aborting.')
return 2
print('Run container-sharder on all nodes to shard the container.')
return 0
def _add_find_args(parser):
parser.add_argument('rows_per_shard', nargs='?', type=int, default=500000)
def _add_replace_args(parser):
parser.add_argument(
'--shards_account_prefix', metavar='shards_account_prefix', type=str,
required=False, help='Prefix for shards account', default='.shards_')
parser.add_argument(
'--replace-timeout', type=int, default=600,
help='Minimum DB timeout to use when replacing shard ranges.')
parser.add_argument(
'--force', '-f', action='store_true', default=False,
help='Delete existing shard ranges; no questions asked.')
parser.add_argument(
'--enable', action='store_true', default=False,
help='Enable sharding after adding shard ranges.')
def _add_enable_args(parser):
parser.add_argument(
'--enable-timeout', type=int, default=300,
help='DB timeout to use when enabling sharding.')
def _make_parser():
parser = argparse.ArgumentParser(description='Manage shard ranges')
parser.add_argument('container_db')
parser.add_argument('--verbose', '-v', action='count',
help='Increase output verbosity')
subparsers = parser.add_subparsers(
help='Sub-command help', title='Sub-commands')
# find
find_parser = subparsers.add_parser(
'find', help='Find and display shard ranges')
_add_find_args(find_parser)
find_parser.set_defaults(func=find_ranges)
# delete
delete_parser = subparsers.add_parser(
'delete', help='Delete all existing shard ranges from db')
delete_parser.add_argument(
'--force', '-f', action='store_true', default=False,
help='Delete existing shard ranges; no questions asked.')
delete_parser.set_defaults(func=delete_shard_ranges)
# show
show_parser = subparsers.add_parser(
'show', help='Print shard range data')
show_parser.add_argument(
'--include_deleted', '-d', action='store_true', default=False,
help='Include deleted shard ranges in output.')
show_parser.add_argument(
'--brief', '-b', action='store_true', default=False,
help='Show only shard range bounds in output.')
show_parser.set_defaults(func=show_shard_ranges)
# info
info_parser = subparsers.add_parser(
'info', help='Print container db info')
info_parser.set_defaults(func=db_info)
# replace
replace_parser = subparsers.add_parser(
'replace',
help='Replace existing shard ranges. User will be prompted before '
'deleting any existing shard ranges.')
replace_parser.add_argument('input', metavar='input_file',
type=str, help='Name of file')
_add_replace_args(replace_parser)
replace_parser.set_defaults(func=replace_shard_ranges)
# find_and_replace
find_replace_parser = subparsers.add_parser(
'find_and_replace',
help='Find new shard ranges and replace existing shard ranges. '
'User will be prompted before deleting any existing shard ranges.'
)
_add_find_args(find_replace_parser)
_add_replace_args(find_replace_parser)
_add_enable_args(find_replace_parser)
find_replace_parser.set_defaults(func=find_replace_shard_ranges)
# enable
enable_parser = subparsers.add_parser(
'enable', help='Enable sharding and move db to sharding state.')
_add_enable_args(enable_parser)
enable_parser.set_defaults(func=enable_sharding)
_add_replace_args(enable_parser)
return parser
def main(args=None):
parser = _make_parser()
args = parser.parse_args(args)
logger = get_logger({}, name='ContainerBroker', log_to_console=True)
broker = ContainerBroker(args.container_db, logger=logger,
skip_commits=True)
broker.get_info()
print('Loaded db broker for %s.' % broker.path, file=sys.stderr)
return args.func(broker, args)
if __name__ == '__main__':
exit(main())
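Because main() accepts an argument vector, the tool can be exercised from
Python as well as from the command line. A hypothetical one-shot session (the
DB path is illustrative):

    # find candidate ranges of ~500000 rows each, replace any existing
    # ranges after prompting, then move the DB to the sharding state
    main(['/srv/node/sdb1/containers/part/suffix/hash/hash.db',
          'find_and_replace', '500000', '--enable'])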

195
swift/cli/shard-info.py Normal file

@ -0,0 +1,195 @@
# Copyright (c) 2017 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from collections import defaultdict
from swift.common import utils
from swift.common.db_replicator import roundrobin_datadirs
from swift.common.ring import ring
from swift.common.utils import Timestamp
from swift.container.backend import ContainerBroker, DATADIR
TAB = ' '
def broker_key(broker):
broker.get_info()
return broker.path
def container_type(broker):
return 'ROOT' if broker.is_root_container() else 'SHARD'
def collect_brokers(conf_path, names2nodes):
conf = utils.readconf(conf_path, 'container-replicator')
root = conf.get('devices', '/srv/node')
swift_dir = conf.get('swift_dir', '/etc/swift')
c_ring = ring.Ring(swift_dir, ring_name='container')
dirs = []
for node in c_ring.devs:
if node is None:
continue
datadir = os.path.join(root, node['device'], DATADIR)
if os.path.isdir(datadir):
dirs.append((datadir, node['id'], lambda *args: True))
for part, object_file, node_id in roundrobin_datadirs(dirs):
broker = ContainerBroker(object_file)
for node in c_ring.get_part_nodes(int(part)):
if node['id'] == node_id:
node_index = str(node['index'])
break
else:
node_index = 'handoff'
names2nodes[broker_key(broker)][(node_id, node_index)] = broker
return names2nodes
def print_broker_info(node, broker, indent_level=0):
indent = indent_level * TAB
info = broker.get_info()
raw_info = broker._get_info()
deleted_at = float(info['delete_timestamp'])
if deleted_at:
deleted_at = Timestamp(info['delete_timestamp']).isoformat
else:
deleted_at = ' - '
print('%s(%s) %s, objs: %s, bytes: %s, actual_objs: %s, put: %s, '
'deleted: %s' %
(indent, node[1][0], broker.get_db_state(),
info['object_count'], info['bytes_used'], raw_info['object_count'],
Timestamp(info['put_timestamp']).isoformat, deleted_at))
def print_db(node, broker, expect_type='ROOT', indent_level=0):
indent = indent_level * TAB
print('%s(%s) %s node id: %s, node index: %s' %
(indent, node[1][0], broker.db_file, node[0], node[1]))
actual_type = container_type(broker)
if actual_type != expect_type:
print('%s ERROR expected %s but found %s' %
(indent, expect_type, actual_type))
def print_own_shard_range(node, sr, indent_level):
indent = indent_level * TAB
bounds = '%r - %r' % (sr.lower, sr.upper)
print('%s(%s) %23s, objs: %3s, bytes: %3s, timestamp: %s (%s), '
'modified: %s (%s), %7s: %s (%s), deleted: %s epoch: %s' %
(indent, node[1][0], bounds, sr.object_count, sr.bytes_used,
sr.timestamp.isoformat, sr.timestamp.internal,
sr.meta_timestamp.isoformat, sr.meta_timestamp.internal,
sr.state_text, sr.state_timestamp.isoformat,
sr.state_timestamp.internal, sr.deleted,
sr.epoch.internal if sr.epoch else None))
def print_own_shard_range_info(node, shard_ranges, indent_level=0):
shard_ranges.sort(key=lambda x: x.deleted)
for sr in shard_ranges:
print_own_shard_range(node, sr, indent_level)
def print_shard_range(node, sr, indent_level):
indent = indent_level * TAB
bounds = '%r - %r' % (sr.lower, sr.upper)
print('%s(%s) %23s, objs: %3s, bytes: %3s, timestamp: %s (%s), '
'modified: %s (%s), %7s: %s (%s), deleted: %s %s' %
(indent, node[1][0], bounds, sr.object_count, sr.bytes_used,
sr.timestamp.isoformat, sr.timestamp.internal,
sr.meta_timestamp.isoformat, sr.meta_timestamp.internal,
sr.state_text, sr.state_timestamp.isoformat,
sr.state_timestamp.internal, sr.deleted, sr.name))
def print_shard_range_info(node, shard_ranges, indent_level=0):
shard_ranges.sort(key=lambda x: x.deleted)
for sr in shard_ranges:
print_shard_range(node, sr, indent_level)
def print_sharding_info(node, broker, indent_level=0):
indent = indent_level * TAB
print('%s(%s) %s' % (indent, node[1][0], broker.get_sharding_sysmeta()))
def print_container(name, name2nodes2brokers, expect_type='ROOT',
indent_level=0, used_names=None):
used_names = used_names or set()
indent = indent_level * TAB
node2broker = name2nodes2brokers[name]
ordered_by_index = sorted(node2broker.keys(), key=lambda x: x[1])
brokers = [(node, node2broker[node]) for node in ordered_by_index]
print('%sName: %s' % (indent, name))
if name in used_names:
print('%s (Details already listed)\n' % indent)
return
used_names.add(name)
print(indent + 'DB files:')
for node, broker in brokers:
print_db(node, broker, expect_type, indent_level=indent_level + 1)
print(indent + 'Info:')
for node, broker in brokers:
print_broker_info(node, broker, indent_level=indent_level + 1)
print(indent + 'Sharding info:')
for node, broker in brokers:
print_sharding_info(node, broker, indent_level=indent_level + 1)
print(indent + 'Own shard range:')
for node, broker in brokers:
shard_ranges = broker.get_shard_ranges(
include_deleted=True, include_own=True, exclude_others=True)
print_own_shard_range_info(node, shard_ranges,
indent_level=indent_level + 1)
print(indent + 'Shard ranges:')
shard_names = set()
for node, broker in brokers:
shard_ranges = broker.get_shard_ranges(include_deleted=True)
for sr in shard_ranges:
shard_names.add(sr.name)
print_shard_range_info(node, shard_ranges,
indent_level=indent_level + 1)
print(indent + 'Shards:')
for sr_name in shard_names:
print_container(sr_name, name2nodes2brokers, expect_type='SHARD',
indent_level=indent_level + 1, used_names=used_names)
print('\n')
def run(conf_paths):
# container_name -> (node id, node index) -> broker
name2nodes2brokers = defaultdict(dict)
for conf_path in conf_paths:
collect_brokers(conf_path, name2nodes2brokers)
print('First column on each line is (node index)\n')
for name, node2broker in name2nodes2brokers.items():
expect_root = False
for node, broker in node2broker.items():
expect_root = broker.is_root_container() or expect_root
if expect_root:
print_container(name, name2nodes2brokers)
if __name__ == '__main__':
conf_dir = '/etc/swift/container-server'
conf_paths = [os.path.join(conf_dir, p) for p in os.listdir(conf_dir)
if p.endswith(('conf', 'conf.d'))]
run(conf_paths)
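This script is a development aid rather than an operator tool: run() takes a
list of container-server conf paths, walks every container DB found on the
configured devices, and prints each root container followed by its shards. A
minimal sketch for a SAIO-style layout (the path is illustrative):

    run(['/etc/swift/container-server/1.conf'])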

swift/common/db.py

@ -71,6 +71,18 @@ def native_str_keys(metadata):
metadata[k.decode('utf-8')] = sv
ZERO_LIKE_VALUES = {None, '', 0, '0'}
def zero_like(count):
"""
We've cargo culted our consumers to be tolerant of various expressions of
zero in our databases for backwards compatibility with less disciplined
producers.
"""
return count in ZERO_LIKE_VALUES
def _db_timeout(timeout, db_file, call):
with LockTimeout(timeout, db_file):
retry_wait = 0.001
@ -208,11 +220,27 @@ class DatabaseBroker(object):
def __init__(self, db_file, timeout=BROKER_TIMEOUT, logger=None,
account=None, container=None, pending_timeout=None,
stale_reads_ok=False):
"""Encapsulates working with a database."""
stale_reads_ok=False, skip_commits=False):
"""Encapsulates working with a database.
:param db_file: path to a database file.
:param timeout: timeout used for database operations.
:param logger: a logger instance.
:param account: name of account.
:param container: name of container.
:param pending_timeout: timeout used when attempting to take a lock to
write to pending file.
:param stale_reads_ok: if True then no error is raised if pending
commits cannot be committed before the database is read, otherwise
an error is raised.
:param skip_commits: if True then this broker instance will never
commit records from the pending file to the database;
:meth:`~swift.common.db.DatabaseBroker.put_record` should not be
called on brokers with skip_commits True.
"""
self.conn = None
self.db_file = db_file
self.pending_file = self.db_file + '.pending'
self._db_file = db_file
self.pending_file = self._db_file + '.pending'
self.pending_timeout = pending_timeout or 10
self.stale_reads_ok = stale_reads_ok
self.db_dir = os.path.dirname(db_file)
@ -221,6 +249,7 @@ class DatabaseBroker(object):
self.account = account
self.container = container
self._db_version = -1
self.skip_commits = skip_commits
def __str__(self):
"""
@ -240,9 +269,9 @@ class DatabaseBroker(object):
:param put_timestamp: internalized timestamp of initial PUT request
:param storage_policy_index: only required for containers
"""
if self.db_file == ':memory:':
if self._db_file == ':memory:':
tmp_db_file = None
conn = get_db_connection(self.db_file, self.timeout)
conn = get_db_connection(self._db_file, self.timeout)
else:
mkdirs(self.db_dir)
fd, tmp_db_file = mkstemp(suffix='.tmp', dir=self.db_dir)
@ -329,15 +358,22 @@ class DatabaseBroker(object):
self._delete_db(conn, timestamp)
conn.commit()
@property
def db_file(self):
return self._db_file
def get_device_path(self):
suffix_path = os.path.dirname(self.db_dir)
partition_path = os.path.dirname(suffix_path)
dbs_path = os.path.dirname(partition_path)
return os.path.dirname(dbs_path)
def quarantine(self, reason):
"""
The database will be quarantined and a
sqlite3.DatabaseError will be raised indicating the action taken.
"""
prefix_path = os.path.dirname(self.db_dir)
partition_path = os.path.dirname(prefix_path)
dbs_path = os.path.dirname(partition_path)
device_path = os.path.dirname(dbs_path)
device_path = self.get_device_path()
quar_path = os.path.join(device_path, 'quarantined',
self.db_type + 's',
os.path.basename(self.db_dir))
@ -377,6 +413,20 @@ class DatabaseBroker(object):
self.quarantine(exc_hint)
@contextmanager
def updated_timeout(self, new_timeout):
"""Use with "with" statement; updates ``timeout`` within the block."""
old_timeout = self.timeout
try:
self.timeout = new_timeout
if self.conn:
self.conn.timeout = new_timeout
yield old_timeout
finally:
self.timeout = old_timeout
if self.conn:
self.conn.timeout = old_timeout
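A short usage sketch of the new context manager (the broker instance, timeout
value and merge call are illustrative; merge_shard_ranges is the container
broker method used elsewhere in this change):

    # temporarily raise the DB timeout for a large merge; the previous
    # timeout is restored even if the block raises
    with broker.updated_timeout(600):
        broker.merge_shard_ranges(shard_ranges)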
@contextmanager
def get(self):
"""Use with the "with" statement; returns a database connection."""
@ -477,6 +527,23 @@ class DatabaseBroker(object):
with self.get() as conn:
return self._is_deleted(conn)
def empty(self):
"""
Check if the broker abstraction contains any undeleted records.
"""
raise NotImplementedError()
def is_reclaimable(self, now, reclaim_age):
"""
Check if the broker abstraction is empty, and has been marked deleted
for at least a reclaim age.
"""
info = self.get_replication_info()
return (zero_like(info['count']) and
(Timestamp(now - reclaim_age) >
Timestamp(info['delete_timestamp']) >
Timestamp(info['put_timestamp'])))
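Reading the condition: the DB must look empty, and (now - reclaim_age) must be
newer than delete_timestamp, which in turn must be newer than put_timestamp.
A hypothetical caller (the broker and replicator instances are illustrative):

    import time

    # reclaim DBs that were emptied and deleted more than a week ago
    if broker.is_reclaimable(time.time(), reclaim_age=7 * 86400):
        replicator.delete_db(broker)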
def merge_timestamps(self, created_at, put_timestamp, delete_timestamp):
"""
Used in replication to handle updating timestamps.
@ -548,13 +615,15 @@ class DatabaseBroker(object):
result.append({'remote_id': row[0], 'sync_point': row[1]})
return result
def get_max_row(self):
def get_max_row(self, table=None):
if not table:
table = self.db_contains_type
query = '''
SELECT SQLITE_SEQUENCE.seq
FROM SQLITE_SEQUENCE
WHERE SQLITE_SEQUENCE.name == '%s'
LIMIT 1
''' % (self.db_contains_type)
''' % (table, )
with self.get() as conn:
row = conn.execute(query).fetchone()
return row[0] if row else -1
@ -582,11 +651,26 @@ class DatabaseBroker(object):
return curs.fetchone()
def put_record(self, record):
if self.db_file == ':memory:':
"""
Put a record into the DB. If the DB has an associated pending file with
space then the record is appended to that file and a commit to the DB
is deferred. If the DB is in-memory or its pending file is full then
the record will be committed immediately.
:param record: a record to be added to the DB.
:raises DatabaseConnectionError: if the DB file does not exist or if
``skip_commits`` is True.
:raises LockTimeout: if a timeout occurs while waiting to take a lock
to write to the pending file.
"""
if self._db_file == ':memory:':
self.merge_items([record])
return
if not os.path.exists(self.db_file):
raise DatabaseConnectionError(self.db_file, "DB doesn't exist")
if self.skip_commits:
raise DatabaseConnectionError(self.db_file,
'commits not accepted')
with lock_parent_directory(self.pending_file, self.pending_timeout):
pending_size = 0
try:
@ -606,6 +690,10 @@ class DatabaseBroker(object):
protocol=PICKLE_PROTOCOL).encode('base64'))
fp.flush()
def _skip_commit_puts(self):
return (self._db_file == ':memory:' or self.skip_commits or not
os.path.exists(self.pending_file))
def _commit_puts(self, item_list=None):
"""
Scan for .pending files and commit the found records by feeding them
@ -614,7 +702,13 @@ class DatabaseBroker(object):
:param item_list: A list of items to commit in addition to .pending
"""
if self.db_file == ':memory:' or not os.path.exists(self.pending_file):
if self._skip_commit_puts():
if item_list:
# this broker instance should not be used to commit records,
# but if it is then raise an error rather than quietly
# discarding the records in item_list.
raise DatabaseConnectionError(self.db_file,
'commits not accepted')
return
if item_list is None:
item_list = []
@ -645,7 +739,7 @@ class DatabaseBroker(object):
Catch failures of _commit_puts() if broker is intended for
reading of stats, and thus does not care for pending updates.
"""
if self.db_file == ':memory:' or not os.path.exists(self.pending_file):
if self._skip_commit_puts():
return
try:
with lock_parent_directory(self.pending_file,
@ -663,6 +757,12 @@ class DatabaseBroker(object):
"""
raise NotImplementedError
def merge_items(self, item_list, source=None):
"""
Save the given ``item_list`` to the database.
"""
raise NotImplementedError
def make_tuple_for_pickle(self, record):
"""
Turn this db record dict into the format this service uses for
@ -701,7 +801,7 @@ class DatabaseBroker(object):
within 512k of a boundary, it allocates to the next boundary.
Boundaries are 2m, 5m, 10m, 25m, 50m, then every 50m after.
"""
if not DB_PREALLOCATION or self.db_file == ':memory:':
if not DB_PREALLOCATION or self._db_file == ':memory:':
return
MB = (1024 * 1024)
@ -830,40 +930,46 @@ class DatabaseBroker(object):
def reclaim(self, age_timestamp, sync_timestamp):
"""
Delete rows from the db_contains_type table that are marked deleted
and whose created_at timestamp is < age_timestamp. Also deletes rows
from incoming_sync and outgoing_sync where the updated_at timestamp is
< sync_timestamp.
Delete reclaimable rows and metadata from the db.
In addition, this calls the DatabaseBroker's :func:`_reclaim` method.
By default this method will delete rows from the db_contains_type table
that are marked deleted and whose created_at timestamp is <
age_timestamp, and deletes rows from incoming_sync and outgoing_sync
where the updated_at timestamp is < sync_timestamp. In addition, this
calls the :meth:`_reclaim_metadata` method.
Subclasses may reclaim other items by overriding :meth:`_reclaim`.
:param age_timestamp: max created_at timestamp of object rows to delete
:param sync_timestamp: max update_at timestamp of sync rows to delete
"""
if self.db_file != ':memory:' and os.path.exists(self.pending_file):
if not self._skip_commit_puts():
with lock_parent_directory(self.pending_file,
self.pending_timeout):
self._commit_puts()
with self.get() as conn:
conn.execute('''
DELETE FROM %s WHERE deleted = 1 AND %s < ?
''' % (self.db_contains_type, self.db_reclaim_timestamp),
(age_timestamp,))
try:
conn.execute('''
DELETE FROM outgoing_sync WHERE updated_at < ?
''', (sync_timestamp,))
conn.execute('''
DELETE FROM incoming_sync WHERE updated_at < ?
''', (sync_timestamp,))
except sqlite3.OperationalError as err:
# Old dbs didn't have updated_at in the _sync tables.
if 'no such column: updated_at' not in str(err):
raise
DatabaseBroker._reclaim(self, conn, age_timestamp)
self._reclaim(conn, age_timestamp, sync_timestamp)
self._reclaim_metadata(conn, age_timestamp)
conn.commit()
def _reclaim(self, conn, timestamp):
def _reclaim(self, conn, age_timestamp, sync_timestamp):
conn.execute('''
DELETE FROM %s WHERE deleted = 1 AND %s < ?
''' % (self.db_contains_type, self.db_reclaim_timestamp),
(age_timestamp,))
try:
conn.execute('''
DELETE FROM outgoing_sync WHERE updated_at < ?
''', (sync_timestamp,))
conn.execute('''
DELETE FROM incoming_sync WHERE updated_at < ?
''', (sync_timestamp,))
except sqlite3.OperationalError as err:
# Old dbs didn't have updated_at in the _sync tables.
if 'no such column: updated_at' not in str(err):
raise
def _reclaim_metadata(self, conn, timestamp):
"""
Removes any empty metadata values older than the timestamp using the
given database connection. This function will not call commit on the

swift/common/db_replicator.py

@ -33,10 +33,12 @@ from swift.common.direct_client import quote
from swift.common.utils import get_logger, whataremyips, storage_directory, \
renamer, mkdirs, lock_parent_directory, config_true_value, \
unlink_older_than, dump_recon_cache, rsync_module_interpolation, \
json, Timestamp, parse_override_options, round_robin_iter, Everything
json, parse_override_options, round_robin_iter, Everything, get_db_files, \
parse_db_filename
from swift.common import ring
from swift.common.ring.utils import is_local_device
from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE
from swift.common.http import HTTP_NOT_FOUND, HTTP_INSUFFICIENT_STORAGE, \
is_success
from swift.common.bufferedhttp import BufferedHTTPConnection
from swift.common.exceptions import DriveNotMounted
from swift.common.daemon import Daemon
@ -87,11 +89,14 @@ def roundrobin_datadirs(datadirs):
found (in their proper places). The partitions within each data
dir are walked randomly, however.
:param datadirs: a list of (path, node_id, partition_filter) to walk
:returns: A generator of (partition, path_to_db_file, node_id)
:param datadirs: a list of tuples of (path, context, partition_filter) to
walk. The context may be any object; the context is not
used by this function but is included with each yielded
tuple.
:returns: A generator of (partition, path_to_db_file, context)
"""
def walk_datadir(datadir, node_id, part_filter):
def walk_datadir(datadir, context, part_filter):
partitions = [pd for pd in os.listdir(datadir)
if looks_like_partition(pd) and part_filter(pd)]
random.shuffle(partitions)
@ -116,17 +121,23 @@ def roundrobin_datadirs(datadirs):
if not os.path.isdir(hash_dir):
continue
object_file = os.path.join(hash_dir, hsh + '.db')
# common case
if os.path.exists(object_file):
yield (partition, object_file, node_id)
else:
try:
os.rmdir(hash_dir)
except OSError as e:
if e.errno != errno.ENOTEMPTY:
raise
yield (partition, object_file, context)
continue
# look for any alternate db filenames
db_files = get_db_files(object_file)
if db_files:
yield (partition, db_files[-1], context)
continue
try:
os.rmdir(hash_dir)
except OSError as e:
if e.errno != errno.ENOTEMPTY:
raise
its = [walk_datadir(datadir, node_id, filt)
for datadir, node_id, filt in datadirs]
its = [walk_datadir(datadir, context, filt)
for datadir, context, filt in datadirs]
rr_its = round_robin_iter(its)
for datadir in rr_its:
@ -212,7 +223,7 @@ class Replicator(Daemon):
self.stats = {'attempted': 0, 'success': 0, 'failure': 0, 'ts_repl': 0,
'no_change': 0, 'hashmatch': 0, 'rsync': 0, 'diff': 0,
'remove': 0, 'empty': 0, 'remote_merge': 0,
'start': time.time(), 'diff_capped': 0,
'start': time.time(), 'diff_capped': 0, 'deferred': 0,
'failure_nodes': {}}
def _report_stats(self):
@ -309,9 +320,20 @@ class Replicator(Daemon):
different_region=different_region):
return False
with Timeout(replicate_timeout or self.node_timeout):
response = http.replicate(replicate_method, local_id)
response = http.replicate(replicate_method, local_id,
os.path.basename(broker.db_file))
return response and 200 <= response.status < 300
def _send_replicate_request(self, http, *repl_args):
with Timeout(self.node_timeout):
response = http.replicate(*repl_args)
if not response or not is_success(response.status):
if response:
self.logger.error('ERROR Bad response %s from %s',
response.status, http.host)
return False
return True
def _usync_db(self, point, broker, http, remote_id, local_id):
"""
Sync a db by sending all records since the last sync.
@ -326,26 +348,29 @@ class Replicator(Daemon):
"""
self.stats['diff'] += 1
self.logger.increment('diffs')
self.logger.debug('Syncing chunks with %s, starting at %s',
http.host, point)
self.logger.debug('%s usyncing chunks to %s, starting at row %s',
broker.db_file,
'%(ip)s:%(port)s/%(device)s' % http.node,
point)
start = time.time()
sync_table = broker.get_syncs()
objects = broker.get_items_since(point, self.per_diff)
diffs = 0
while len(objects) and diffs < self.max_diffs:
diffs += 1
with Timeout(self.node_timeout):
response = http.replicate('merge_items', objects, local_id)
if not response or response.status >= 300 or response.status < 200:
if response:
self.logger.error(_('ERROR Bad response %(status)s from '
'%(host)s'),
{'status': response.status,
'host': http.host})
if not self._send_replicate_request(
http, 'merge_items', objects, local_id):
return False
# replication relies on db order to send the next merge batch in
# order with no gaps
point = objects[-1]['ROWID']
objects = broker.get_items_since(point, self.per_diff)
self.logger.debug('%s usyncing chunks to %s, finished at row %s (%gs)',
broker.db_file,
'%(ip)s:%(port)s/%(device)s' % http.node,
point, time.time() - start)
if objects:
self.logger.debug(
'Synchronization for %s has fallen more than '
@ -397,9 +422,8 @@ class Replicator(Daemon):
:returns: ReplConnection object
"""
return ReplConnection(node, partition,
os.path.basename(db_file).split('.', 1)[0],
self.logger)
hsh, other, ext = parse_db_filename(db_file)
return ReplConnection(node, partition, hsh, self.logger)
def _gather_sync_args(self, info):
"""
@ -449,32 +473,79 @@ class Replicator(Daemon):
if rinfo.get('metadata', ''):
broker.update_metadata(json.loads(rinfo['metadata']))
if self._in_sync(rinfo, info, broker, local_sync):
self.logger.debug('%s in sync with %s, nothing to do',
broker.db_file,
'%(ip)s:%(port)s/%(device)s' % node)
return True
# if the difference in rowids between the two differs by
# more than 50% and the difference is greater than per_diff,
# rsync then do a remote merge.
# NOTE: difference > per_diff stops us from dropping to rsync
# on smaller containers, who have only a few rows to sync.
if rinfo['max_row'] / float(info['max_row']) < 0.5 and \
info['max_row'] - rinfo['max_row'] > self.per_diff:
self.stats['remote_merge'] += 1
self.logger.increment('remote_merges')
return self._rsync_db(broker, node, http, info['id'],
replicate_method='rsync_then_merge',
replicate_timeout=(info['count'] / 2000),
different_region=different_region)
# else send diffs over to the remote server
return self._usync_db(max(rinfo['point'], local_sync),
broker, http, rinfo['id'], info['id'])
return self._choose_replication_mode(
node, rinfo, info, local_sync, broker, http,
different_region)
return False
def _choose_replication_mode(self, node, rinfo, info, local_sync, broker,
http, different_region):
# if the difference in rowids between the two differs by
# more than 50% and the difference is greater than per_diff,
# rsync then do a remote merge.
# NOTE: difference > per_diff stops us from dropping to rsync
# on smaller containers, who have only a few rows to sync.
if (rinfo['max_row'] / float(info['max_row']) < 0.5 and
info['max_row'] - rinfo['max_row'] > self.per_diff):
self.stats['remote_merge'] += 1
self.logger.increment('remote_merges')
return self._rsync_db(broker, node, http, info['id'],
replicate_method='rsync_then_merge',
replicate_timeout=(info['count'] / 2000),
different_region=different_region)
# else send diffs over to the remote server
return self._usync_db(max(rinfo['point'], local_sync),
broker, http, rinfo['id'], info['id'])
def _post_replicate_hook(self, broker, info, responses):
"""
:param broker: the container that just replicated
:param broker: broker instance for the database that just replicated
:param info: pre-replication full info dict
:param responses: a list of bools indicating success from nodes
"""
pass
def cleanup_post_replicate(self, broker, orig_info, responses):
"""
Cleanup non primary database from disk if needed.
:param broker: the broker for the database we're replicating
:param orig_info: snapshot of the broker replication info dict taken
before replication
:param responses: a list of boolean success values for each replication
request to other nodes
:returns: False if deletion of the database was attempted but
unsuccessful, otherwise True.
"""
log_template = 'Not deleting db %s (%%s)' % broker.db_file
max_row_delta = broker.get_max_row() - orig_info['max_row']
if max_row_delta < 0:
reason = 'negative max_row_delta: %s' % max_row_delta
self.logger.error(log_template, reason)
return True
if max_row_delta:
reason = '%s new rows' % max_row_delta
self.logger.debug(log_template, reason)
return True
if not (responses and all(responses)):
reason = '%s/%s success' % (responses.count(True), len(responses))
self.logger.debug(log_template, reason)
return True
# If the db has been successfully synced to all of its peers, it can be
# removed. Callers should have already checked that the db is not on a
# primary node.
if not self.delete_db(broker):
self.logger.debug(
'Failed to delete db %s', broker.db_file)
return False
self.logger.debug('Successfully deleted db %s', broker.db_file)
return True
def _replicate_object(self, partition, object_file, node_id):
"""
Replicate the db, choosing method based on whether or not it
@ -483,12 +554,20 @@ class Replicator(Daemon):
:param partition: partition to be replicated to
:param object_file: DB file name to be replicated
:param node_id: node id of the node to be replicated to
:returns: a tuple (success, responses). ``success`` is a boolean that
is True if the method completed successfully, False otherwise.
``responses`` is a list of booleans each of which indicates the
success or not of replicating to a peer node if replication has
been attempted. ``success`` is False if any of ``responses`` is
False; when ``responses`` is empty, ``success`` may be either True
or False.
"""
start_time = now = time.time()
self.logger.debug('Replicating db %s', object_file)
self.stats['attempted'] += 1
self.logger.increment('attempts')
shouldbehere = True
responses = []
try:
broker = self.brokerclass(object_file, pending_timeout=30)
broker.reclaim(now - self.reclaim_age,
@ -518,18 +597,12 @@ class Replicator(Daemon):
failure_dev['device'])
for failure_dev in nodes])
self.logger.increment('failures')
return
# The db is considered deleted if the delete_timestamp value is greater
# than the put_timestamp, and there are no objects.
delete_timestamp = Timestamp(info.get('delete_timestamp') or 0)
put_timestamp = Timestamp(info.get('put_timestamp') or 0)
if (now - self.reclaim_age) > delete_timestamp > put_timestamp and \
info['count'] in (None, '', 0, '0'):
return False, responses
if broker.is_reclaimable(now, self.reclaim_age):
if self.report_up_to_date(info):
self.delete_db(broker)
self.logger.timing_since('timing', start_time)
return
responses = []
return True, responses
failure_devs_info = set()
nodes = self.ring.get_part_nodes(int(partition))
local_dev = None
@ -587,14 +660,11 @@ class Replicator(Daemon):
except (Exception, Timeout):
self.logger.exception('UNHANDLED EXCEPTION: in post replicate '
'hook for %s', broker.db_file)
if not shouldbehere and responses and all(responses):
# If the db shouldn't be on this node and has been successfully
# synced to all of its peers, it can be removed.
if not self.delete_db(broker):
if not shouldbehere:
if not self.cleanup_post_replicate(broker, info, responses):
failure_devs_info.update(
[(failure_dev['replication_ip'], failure_dev['device'])
for failure_dev in repl_nodes])
target_devs_info = set([(target_dev['replication_ip'],
target_dev['device'])
for target_dev in repl_nodes])
@ -602,6 +672,9 @@ class Replicator(Daemon):
self._add_failure_stats(failure_devs_info)
self.logger.timing_since('timing', start_time)
if shouldbehere:
responses.append(True)
return all(responses), responses
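A caller-side sketch of the new return contract (the replicator instance and
arguments are illustrative):

    success, responses = replicator._replicate_object(part, db_file, node_id)
    # success is False if any peer response failed; when no peer requests
    # were attempted, responses is empty and success may be True or False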
def delete_db(self, broker):
object_file = broker.db_file
@ -746,6 +819,9 @@ class ReplicatorRpc(object):
self.mount_check = mount_check
self.logger = logger or get_logger({}, log_route='replicator-rpc')
def _db_file_exists(self, db_path):
return os.path.exists(db_path)
def dispatch(self, replicate_args, args):
if not hasattr(args, 'pop'):
return HTTPBadRequest(body='Invalid object type')
@ -764,7 +840,7 @@ class ReplicatorRpc(object):
# someone might be about to rsync a db to us,
# make sure there's a tmp dir to receive it.
mkdirs(os.path.join(self.root, drive, 'tmp'))
if not os.path.exists(db_file):
if not self._db_file_exists(db_file):
return HTTPNotFound()
return getattr(self, op)(self.broker_class(db_file), args)
@ -863,6 +939,8 @@ class ReplicatorRpc(object):
def complete_rsync(self, drive, db_file, args):
old_filename = os.path.join(self.root, drive, 'tmp', args[0])
if args[1:]:
db_file = os.path.join(os.path.dirname(db_file), args[1])
if os.path.exists(db_file):
return HTTPNotFound()
if not os.path.exists(old_filename):
@ -872,12 +950,21 @@ class ReplicatorRpc(object):
renamer(old_filename, db_file)
return HTTPNoContent()
def _abort_rsync_then_merge(self, db_file, tmp_filename):
return not (self._db_file_exists(db_file) and
os.path.exists(tmp_filename))
def _post_rsync_then_merge_hook(self, existing_broker, new_broker):
# subclasses may override to make custom changes to the new broker
pass
def rsync_then_merge(self, drive, db_file, args):
old_filename = os.path.join(self.root, drive, 'tmp', args[0])
if not os.path.exists(db_file) or not os.path.exists(old_filename):
tmp_filename = os.path.join(self.root, drive, 'tmp', args[0])
if self._abort_rsync_then_merge(db_file, tmp_filename):
return HTTPNotFound()
new_broker = self.broker_class(old_filename)
new_broker = self.broker_class(tmp_filename)
existing_broker = self.broker_class(db_file)
db_file = existing_broker.db_file
point = -1
objects = existing_broker.get_items_since(point, 1000)
while len(objects):
@ -885,9 +972,13 @@ class ReplicatorRpc(object):
point = objects[-1]['ROWID']
objects = existing_broker.get_items_since(point, 1000)
sleep()
new_broker.merge_syncs(existing_broker.get_syncs())
self._post_rsync_then_merge_hook(existing_broker, new_broker)
new_broker.newid(args[0])
new_broker.update_metadata(existing_broker.metadata)
renamer(old_filename, db_file)
if self._abort_rsync_then_merge(db_file, tmp_filename):
return HTTPNotFound()
renamer(tmp_filename, db_file)
return HTTPNoContent()
# Footnote [1]:

swift/common/direct_client.py

@ -54,22 +54,72 @@ class DirectClientException(ClientException):
http_reason=resp.reason, http_headers=headers)
def _make_req(node, part, method, path, _headers, stype,
conn_timeout=5, response_timeout=15):
def _make_req(node, part, method, path, headers, stype,
conn_timeout=5, response_timeout=15, send_timeout=15,
contents=None, content_length=None, chunk_size=65535):
"""
Make a request to a backend storage node for the given service type
(i.e. 'Account', 'Container' or 'Object').
:param node: a node dict from a ring
:param part: an integer, the partion number
:param part: an integer, the partition number
:param method: a string, the HTTP method (e.g. 'PUT', 'DELETE', etc)
:param path: a string, the request path
:param headers: a dict, header name => value
:param stype: a string, describing the type of service
:param conn_timeout: timeout while waiting for connection; default is 5
seconds
:param response_timeout: timeout while waiting for response; default is 15
seconds
:param send_timeout: timeout for sending request body; default is 15
seconds
:param contents: an iterable or string to read object data from
:param content_length: value to send as content-length header
:param chunk_size: if defined, chunk size of data to send
:returns: an HTTPResponse object
:raises DirectClientException: if the response status is not 2xx
:raises eventlet.Timeout: if either conn_timeout or response_timeout is
exceeded
"""
if contents is not None:
if content_length is not None:
headers['Content-Length'] = str(content_length)
else:
for n, v in headers.items():
if n.lower() == 'content-length':
content_length = int(v)
if not contents:
headers['Content-Length'] = '0'
if isinstance(contents, six.string_types):
contents = [contents]
if content_length is None:
headers['Transfer-Encoding'] = 'chunked'
with Timeout(conn_timeout):
conn = http_connect(node['ip'], node['port'], node['device'], part,
method, path, headers=_headers)
method, path, headers=headers)
if contents is not None:
contents_f = FileLikeIter(contents)
with Timeout(send_timeout):
if content_length is None:
chunk = contents_f.read(chunk_size)
while chunk:
conn.send('%x\r\n%s\r\n' % (len(chunk), chunk))
chunk = contents_f.read(chunk_size)
conn.send('0\r\n\r\n')
else:
left = content_length
while left > 0:
size = chunk_size
if size > left:
size = left
chunk = contents_f.read(size)
if not chunk:
break
conn.send(chunk)
left -= len(chunk)
with Timeout(response_timeout):
resp = conn.getresponse()
resp.read()
@ -82,7 +132,7 @@ def _get_direct_account_container(path, stype, node, part,
marker=None, limit=None,
prefix=None, delimiter=None,
conn_timeout=5, response_timeout=15,
end_marker=None, reverse=None):
end_marker=None, reverse=None, headers=None):
"""Base class for get direct account and container.
Do not use directly use the get_direct_account or
@ -105,7 +155,7 @@ def _get_direct_account_container(path, stype, node, part,
with Timeout(conn_timeout):
conn = http_connect(node['ip'], node['port'], node['device'], part,
'GET', path, query_string=qs,
headers=gen_headers())
headers=gen_headers(hdrs_in=headers))
with Timeout(response_timeout):
resp = conn.getresponse()
if not is_success(resp.status):
@ -121,11 +171,12 @@ def _get_direct_account_container(path, stype, node, part,
return resp_headers, json.loads(resp.read())
def gen_headers(hdrs_in=None, add_ts=False):
def gen_headers(hdrs_in=None, add_ts=False, add_user_agent=True):
hdrs_out = HeaderKeyDict(hdrs_in) if hdrs_in else HeaderKeyDict()
if add_ts:
hdrs_out['X-Timestamp'] = Timestamp.now().internal
hdrs_out['User-Agent'] = 'direct-client %s' % os.getpid()
if add_user_agent:
hdrs_out['User-Agent'] = 'direct-client %s' % os.getpid()
return hdrs_out
@ -197,7 +248,7 @@ def direct_head_container(node, part, account, container, conn_timeout=5,
def direct_get_container(node, part, account, container, marker=None,
limit=None, prefix=None, delimiter=None,
conn_timeout=5, response_timeout=15, end_marker=None,
reverse=None):
reverse=None, headers=None):
"""
Get container listings directly from the container server.
@ -213,6 +264,7 @@ def direct_get_container(node, part, account, container, marker=None,
:param response_timeout: timeout in seconds for getting the response
:param end_marker: end_marker query
:param reverse: reverse the returned listing
:param headers: headers to be included in the request
:returns: a tuple of (response headers, a list of objects) The response
headers will be a HeaderKeyDict.
"""
@ -224,7 +276,8 @@ def direct_get_container(node, part, account, container, marker=None,
end_marker=end_marker,
reverse=reverse,
conn_timeout=conn_timeout,
response_timeout=response_timeout)
response_timeout=response_timeout,
headers=headers)
def direct_delete_container(node, part, account, container, conn_timeout=5,
@ -250,6 +303,37 @@ def direct_delete_container(node, part, account, container, conn_timeout=5,
'Container', conn_timeout, response_timeout)
def direct_put_container(node, part, account, container, conn_timeout=5,
response_timeout=15, headers=None, contents=None,
content_length=None, chunk_size=65535):
"""
Make a PUT request to a container server.
:param node: node dictionary from the ring
:param part: partition the container is on
:param account: account name
:param container: container name
:param conn_timeout: timeout in seconds for establishing the connection
:param response_timeout: timeout in seconds for getting the response
:param headers: additional headers to include in the request
:param contents: an iterable or string to send in request body (optional)
:param content_length: value to send as content-length header (optional)
:param chunk_size: chunk size of data to send (optional)
:raises ClientException: HTTP PUT request failed
"""
if headers is None:
headers = {}
lower_headers = set(k.lower() for k in headers)
headers_out = gen_headers(headers,
add_ts='x-timestamp' not in lower_headers,
add_user_agent='user-agent' not in lower_headers)
path = '/%s/%s' % (account, container)
_make_req(node, part, 'PUT', path, headers_out, 'Container', conn_timeout,
response_timeout, contents=contents,
content_length=content_length, chunk_size=chunk_size)
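As a usage sketch, pushing a record body to a container server in the style
the sharder uses might look like the following (the header name and body
shape are assumptions for illustration, not defined in this file):

    import json

    body = json.dumps([dict(sr) for sr in shard_ranges])
    direct_put_container(node, part, account, container,
                         headers={'X-Backend-Record-Type': 'shard_ranges'},
                         contents=body, content_length=len(body))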
def direct_put_container_object(node, part, account, container, obj,
conn_timeout=5, response_timeout=15,
headers=None):
@ -385,56 +469,18 @@ def direct_put_object(node, part, account, container, name, contents,
headers = {}
if etag:
headers['ETag'] = etag.strip('"')
if content_length is not None:
headers['Content-Length'] = str(content_length)
else:
for n, v in headers.items():
if n.lower() == 'content-length':
content_length = int(v)
if content_type is not None:
headers['Content-Type'] = content_type
else:
headers['Content-Type'] = 'application/octet-stream'
if not contents:
headers['Content-Length'] = '0'
if isinstance(contents, six.string_types):
contents = [contents]
# In case the caller wants to insert an object with a specific age
add_ts = 'X-Timestamp' not in headers
if content_length is None:
headers['Transfer-Encoding'] = 'chunked'
resp = _make_req(
node, part, 'PUT', path, gen_headers(headers, add_ts=add_ts),
'Object', conn_timeout, response_timeout, contents=contents,
content_length=content_length, chunk_size=chunk_size)
with Timeout(conn_timeout):
conn = http_connect(node['ip'], node['port'], node['device'], part,
'PUT', path, headers=gen_headers(headers, add_ts))
contents_f = FileLikeIter(contents)
if content_length is None:
chunk = contents_f.read(chunk_size)
while chunk:
conn.send('%x\r\n%s\r\n' % (len(chunk), chunk))
chunk = contents_f.read(chunk_size)
conn.send('0\r\n\r\n')
else:
left = content_length
while left > 0:
size = chunk_size
if size > left:
size = left
chunk = contents_f.read(size)
if not chunk:
break
conn.send(chunk)
left -= len(chunk)
with Timeout(response_timeout):
resp = conn.getresponse()
resp.read()
if not is_success(resp.status):
raise DirectClientException('Object', 'PUT',
node, part, path, resp)
return resp.getheader('etag').strip('"')

swift/common/manager.py

@ -34,7 +34,7 @@ PROC_DIR = '/proc'
ALL_SERVERS = ['account-auditor', 'account-server', 'container-auditor',
'container-replicator', 'container-reconciler',
'container-server', 'container-sync',
'container-server', 'container-sharder', 'container-sync',
'container-updater', 'object-auditor', 'object-server',
'object-expirer', 'object-replicator',
'object-reconstructor', 'object-updater',
@ -637,13 +637,16 @@ class Server(object):
{'server': self.server, 'pid': pid, 'conf': conf_file})
return 0
def spawn(self, conf_file, once=False, wait=True, daemon=True, **kwargs):
def spawn(self, conf_file, once=False, wait=True, daemon=True,
additional_args=None, **kwargs):
"""Launch a subprocess for this server.
:param conf_file: path to conf_file to use as first arg
:param once: boolean, add once argument to command
:param wait: boolean, if true capture stdout with a pipe
:param daemon: boolean, if false ask server to log to console
:param additional_args: list of additional arguments to pass
on the command line
:returns: the pid of the spawned process
"""
@ -653,6 +656,10 @@ class Server(object):
if not daemon:
# ask the server to log to console
args.append('verbose')
if additional_args:
if isinstance(additional_args, str):
additional_args = [additional_args]
args.extend(additional_args)
# figure out what we're going to do with stdio
if not daemon:

swift/common/utils.py

@ -19,10 +19,12 @@ from __future__ import print_function
import base64
import binascii
import bisect
import collections
import errno
import fcntl
import grp
import hashlib
import hmac
import json
import math
@ -76,6 +78,7 @@ from six.moves import range, http_client
from six.moves.urllib.parse import ParseResult
from six.moves.urllib.parse import quote as _quote
from six.moves.urllib.parse import urlparse as stdlib_urlparse
from six import string_types
from swift import gettext_ as _
import swift.common.exceptions
@ -409,6 +412,21 @@ def config_positive_int_value(value):
return result
def config_float_value(value, minimum=None, maximum=None):
try:
val = float(value)
if minimum is not None and val < minimum:
raise ValueError()
if maximum is not None and val > maximum:
raise ValueError()
return val
except (TypeError, ValueError):
min_ = ', greater than %s' % minimum if minimum is not None else ''
max_ = ', less than %s' % maximum if maximum is not None else ''
raise ValueError('Config option must be a number%s%s, not "%s".' %
(min_, max_, value))
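For example, a bounded ratio option can be validated in one call (the option
name is illustrative):

    # accept only a float in [0, 1]; anything else raises ValueError
    shrink_point = config_float_value(conf.get('shard_shrink_point', 0.25),
                                      minimum=0, maximum=1)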
def config_auto_int_value(value, default):
"""
Returns default if value is None or 'auto'.
@ -4370,6 +4388,553 @@ def get_md5_socket():
return md5_sockfd
class ShardRange(object):
"""
A ShardRange encapsulates sharding state related to a container including
lower and upper bounds that define the object namespace for which the
container is responsible.
Shard ranges may be persisted in a container database. Timestamps
associated with subsets of the shard range attributes are used to resolve
conflicts when a shard range needs to be merged with an existing shard
range record and the most recent version of an attribute should be
persisted.
:param name: the name of the shard range; this should take the form of a
path to a container i.e. <account_name>/<container_name>.
:param timestamp: a timestamp that represents the time at which the
shard range's ``lower``, ``upper`` or ``deleted`` attributes were
last modified.
:param lower: the lower bound of object names contained in the shard range;
the lower bound *is not* included in the shard range namespace.
:param upper: the upper bound of object names contained in the shard range;
the upper bound *is* included in the shard range namespace.
:param object_count: the number of objects in the shard range; defaults to
zero.
:param bytes_used: the number of bytes in the shard range; defaults to
zero.
:param meta_timestamp: a timestamp that represents the time at which the
shard range's ``object_count`` and ``bytes_used`` were last updated;
defaults to the value of ``timestamp``.
:param deleted: a boolean; if True the shard range is considered to be
deleted.
:param state: the state; must be one of ShardRange.STATES; defaults to
CREATED.
:param state_timestamp: a timestamp that represents the time at which
``state`` was forced to its current value; defaults to the value of
``timestamp``. This timestamp is typically not updated with every
change of ``state`` because in general conflicts in ``state``
attributes are resolved by choosing the larger ``state`` value.
However, when this rule does not apply, for example when changing state
from ``SHARDED`` to ``ACTIVE``, the ``state_timestamp`` may be advanced
so that the new ``state`` value is preferred over any older ``state``
value.
:param epoch: optional epoch timestamp which represents the time at which
sharding was enabled for a container.
"""
FOUND = 10
CREATED = 20
CLEAVED = 30
ACTIVE = 40
SHRINKING = 50
SHARDING = 60
SHARDED = 70
STATES = {FOUND: 'found',
CREATED: 'created',
CLEAVED: 'cleaved',
ACTIVE: 'active',
SHRINKING: 'shrinking',
SHARDING: 'sharding',
SHARDED: 'sharded'}
STATES_BY_NAME = dict((v, k) for k, v in STATES.items())
class OuterBound(object):
def __eq__(self, other):
return isinstance(other, type(self))
def __ne__(self, other):
return not self.__eq__(other)
def __str__(self):
return ''
def __repr__(self):
return type(self).__name__
def __bool__(self):
return False
__nonzero__ = __bool__
@functools.total_ordering
class MaxBound(OuterBound):
def __ge__(self, other):
return True
@functools.total_ordering
class MinBound(OuterBound):
def __le__(self, other):
return True
MIN = MinBound()
MAX = MaxBound()
def __init__(self, name, timestamp, lower=MIN, upper=MAX,
object_count=0, bytes_used=0, meta_timestamp=None,
deleted=False, state=None, state_timestamp=None, epoch=None):
self.account = self.container = self._timestamp = \
self._meta_timestamp = self._state_timestamp = self._epoch = None
self._lower = ShardRange.MIN
self._upper = ShardRange.MAX
self._deleted = False
self._state = None
self.name = name
self.timestamp = timestamp
self.lower = lower
self.upper = upper
self.deleted = deleted
self.object_count = object_count
self.bytes_used = bytes_used
self.meta_timestamp = meta_timestamp
self.state = self.FOUND if state is None else state
self.state_timestamp = state_timestamp
self.epoch = epoch
@classmethod
def _encode(cls, value):
if six.PY2 and isinstance(value, six.text_type):
return value.encode('utf-8')
return value
def _encode_bound(self, bound):
if isinstance(bound, ShardRange.OuterBound):
return bound
if not isinstance(bound, string_types):
raise TypeError('must be a string type')
return self._encode(bound)
@classmethod
def _make_container_name(cls, root_container, parent_container, timestamp,
index):
if not isinstance(parent_container, bytes):
parent_container = parent_container.encode('utf-8')
return "%s-%s-%s-%s" % (root_container,
hashlib.md5(parent_container).hexdigest(),
cls._to_timestamp(timestamp).internal,
index)
@classmethod
def make_path(cls, shards_account, root_container, parent_container,
timestamp, index):
"""
Returns a path for a shard container that is valid to use as a name
when constructing a :class:`~swift.common.utils.ShardRange`.
:param shards_account: the hidden internal account to which the shard
container belongs.
:param root_container: the name of the root container for the shard.
:param parent_container: the name of the parent container for the
shard; for initial first generation shards this should be the same
as ``root_container``; for shards of shards this should be the name
of the sharding shard container.
:param timestamp: an instance of :class:`~swift.common.utils.Timestamp`
:param index: a unique index that will distinguish the path from any
other path generated using the same combination of
``shards_account``, ``root_container``, ``parent_container`` and
``timestamp``.
:return: a string of the form <account_name>/<container_name>
"""
shard_container = cls._make_container_name(
root_container, parent_container, timestamp, index)
return '%s/%s' % (shards_account, shard_container)
@classmethod
def _to_timestamp(cls, timestamp):
if timestamp is None or isinstance(timestamp, Timestamp):
return timestamp
return Timestamp(timestamp)
@property
def name(self):
return '%s/%s' % (self.account, self.container)
@name.setter
def name(self, path):
path = self._encode(path)
if not path or len(path.split('/')) != 2 or not all(path.split('/')):
raise ValueError(
"Name must be of the form '<account>/<container>', got %r" %
path)
self.account, self.container = path.split('/')
@property
def timestamp(self):
return self._timestamp
@timestamp.setter
def timestamp(self, ts):
if ts is None:
raise TypeError('timestamp cannot be None')
self._timestamp = self._to_timestamp(ts)
@property
def meta_timestamp(self):
if self._meta_timestamp is None:
return self.timestamp
return self._meta_timestamp
@meta_timestamp.setter
def meta_timestamp(self, ts):
self._meta_timestamp = self._to_timestamp(ts)
@property
def lower(self):
return self._lower
@property
def lower_str(self):
return str(self.lower)
@lower.setter
def lower(self, value):
if value in (None, ''):
value = ShardRange.MIN
try:
value = self._encode_bound(value)
except TypeError as err:
raise TypeError('lower %s' % err)
if value > self._upper:
raise ValueError(
'lower (%r) must be less than or equal to upper (%r)' %
(value, self.upper))
self._lower = value
@property
def end_marker(self):
return self.upper_str + '\x00' if self.upper else ''
@property
def upper(self):
return self._upper
@property
def upper_str(self):
return str(self.upper)
@upper.setter
def upper(self, value):
if value in (None, ''):
value = ShardRange.MAX
try:
value = self._encode_bound(value)
except TypeError as err:
raise TypeError('upper %s' % err)
if value < self._lower:
raise ValueError(
'upper (%r) must be greater than or equal to lower (%r)' %
(value, self.lower))
self._upper = value
@property
def object_count(self):
return self._count
@object_count.setter
def object_count(self, count):
count = int(count)
if count < 0:
raise ValueError('object_count cannot be < 0')
self._count = count
@property
def bytes_used(self):
return self._bytes
@bytes_used.setter
def bytes_used(self, bytes_used):
bytes_used = int(bytes_used)
if bytes_used < 0:
raise ValueError('bytes_used cannot be < 0')
self._bytes = bytes_used
def update_meta(self, object_count, bytes_used, meta_timestamp=None):
"""
Set the object stats metadata to the given values and update the
meta_timestamp to the current time.
:param object_count: should be an integer
:param bytes_used: should be an integer
:param meta_timestamp: timestamp for metadata; if not given the
current time will be set.
:raises ValueError: if ``object_count`` or ``bytes_used`` cannot be
cast to an int, or if meta_timestamp is neither None nor can be
cast to a :class:`~swift.common.utils.Timestamp`.
"""
self.object_count = int(object_count)
self.bytes_used = int(bytes_used)
if meta_timestamp is None:
self.meta_timestamp = Timestamp.now()
else:
self.meta_timestamp = meta_timestamp
def increment_meta(self, object_count, bytes_used):
"""
Increment the object stats metadata by the given values and update the
meta_timestamp to the current time.
:param object_count: should be an integer
:param bytes_used: should be an integer
:raises ValueError: if ``object_count`` or ``bytes_used`` cannot be
cast to an int.
"""
self.update_meta(self.object_count + int(object_count),
self.bytes_used + int(bytes_used))
@classmethod
def resolve_state(cls, state):
"""
Given a value that may be either the name or the number of a state
return a tuple of (state number, state name).
:param state: Either a string state name or an integer state number.
:return: A tuple (state number, state name)
:raises ValueError: if ``state`` is neither a valid state name nor a
valid state number.
"""
try:
state = state.lower()
state_num = cls.STATES_BY_NAME[state]
except (KeyError, AttributeError):
try:
state_name = cls.STATES[state]
except KeyError:
raise ValueError('Invalid state %r' % state)
else:
state_num = state
else:
state_name = state
return state_num, state_name
@property
def state(self):
return self._state
@state.setter
def state(self, state):
try:
float_state = float(state)
int_state = int(float_state)
except (ValueError, TypeError):
raise ValueError('Invalid state %r' % state)
if int_state != float_state or int_state not in self.STATES:
raise ValueError('Invalid state %r' % state)
self._state = int_state
@property
def state_text(self):
return self.STATES[self.state]
@property
def state_timestamp(self):
if self._state_timestamp is None:
return self.timestamp
return self._state_timestamp
@state_timestamp.setter
def state_timestamp(self, ts):
self._state_timestamp = self._to_timestamp(ts)
@property
def epoch(self):
return self._epoch
@epoch.setter
def epoch(self, epoch):
self._epoch = self._to_timestamp(epoch)
def update_state(self, state, state_timestamp=None):
"""
Set state to the given value and optionally update the state_timestamp
to the given time.
:param state: new state, should be an integer
:param state_timestamp: timestamp for state; if not given the
state_timestamp will not be changed.
:return: True if the state or state_timestamp was changed, False
otherwise
"""
if state_timestamp is None and self.state == state:
return False
self.state = state
if state_timestamp is not None:
self.state_timestamp = state_timestamp
return True
@property
def deleted(self):
return self._deleted
@deleted.setter
def deleted(self, value):
self._deleted = bool(value)
def set_deleted(self, timestamp=None):
"""
Mark the shard range deleted and set timestamp to the current time.
:param timestamp: optional timestamp to set; if not given the
current time will be set.
:return: True if the deleted attribute or timestamp was changed, False
otherwise
"""
if timestamp is None and self.deleted:
return False
self.deleted = True
self.timestamp = timestamp or Timestamp.now()
return True
def __contains__(self, item):
# test if the given item is within the namespace
if item == '':
return False
item = self._encode_bound(item)
return self.lower < item <= self.upper
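The bounds are half-open: the lower bound is excluded and the upper bound is
included. A quick sketch (assuming ShardRange and Timestamp from this module):

    sr = ShardRange('a/c', Timestamp.now(), 'f', 'm')
    'f' in sr   # False: the lower bound is excluded
    'm' in sr   # True: the upper bound is included
    '' in sr    # False: the empty string is never contained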
def __lt__(self, other):
# a ShardRange is less than other if its entire namespace is less than
# other; if other is another ShardRange that implies that this
# ShardRange's upper must be less than or equal to the other
# ShardRange's lower
if self.upper == ShardRange.MAX:
return False
if isinstance(other, ShardRange):
return self.upper <= other.lower
elif other is None:
return True
else:
return self.upper < other
def __gt__(self, other):
# a ShardRange is greater than other if its entire namespace is greater
# than other; if other is another ShardRange that implies that this
# ShardRange's lower must be greater than or equal to the other
# ShardRange's upper
if self.lower == ShardRange.MIN:
return False
if isinstance(other, ShardRange):
return self.lower >= other.upper
elif other is None:
return False
else:
return self.lower >= other
def __eq__(self, other):
# test for equality of range bounds only
if not isinstance(other, ShardRange):
return False
return self.lower == other.lower and self.upper == other.upper
def __ne__(self, other):
return not (self == other)
def __repr__(self):
return '%s<%r to %r as of %s, (%d, %d) as of %s, %s as of %s>' % (
self.__class__.__name__, self.lower, self.upper,
self.timestamp.internal, self.object_count, self.bytes_used,
self.meta_timestamp.internal, self.state_text,
self.state_timestamp.internal)
def entire_namespace(self):
"""
Returns True if the ShardRange includes the entire namespace, False
otherwise.
"""
return (self.lower == ShardRange.MIN and
self.upper == ShardRange.MAX)
def overlaps(self, other):
"""
Returns True if the ShardRange namespace overlaps with the other
ShardRange's namespace.
:param other: an instance of :class:`~swift.common.utils.ShardRange`
"""
if not isinstance(other, ShardRange):
return False
return max(self.lower, other.lower) < min(self.upper, other.upper)
def includes(self, other):
"""
Returns True if this namespace includes the whole of the other
namespace, False otherwise.
:param other: an instance of :class:`~swift.common.utils.ShardRange`
"""
return (self.lower <= other.lower) and (other.upper <= self.upper)
def __iter__(self):
yield 'name', self.name
yield 'timestamp', self.timestamp.internal
yield 'lower', str(self.lower)
yield 'upper', str(self.upper)
yield 'object_count', self.object_count
yield 'bytes_used', self.bytes_used
yield 'meta_timestamp', self.meta_timestamp.internal
yield 'deleted', 1 if self.deleted else 0
yield 'state', self.state
yield 'state_timestamp', self.state_timestamp.internal
yield 'epoch', self.epoch.internal if self.epoch is not None else None
def copy(self, timestamp=None, **kwargs):
"""
Creates a copy of the ShardRange.
:param timestamp: (optional) If given, the returned ShardRange will
have all of its timestamps set to this value. Otherwise the
returned ShardRange will have the original timestamps.
:return: an instance of :class:`~swift.common.utils.ShardRange`
"""
new = ShardRange.from_dict(dict(self, **kwargs))
if timestamp:
new.timestamp = timestamp
new.meta_timestamp = new.state_timestamp = None
return new
@classmethod
def from_dict(cls, params):
"""
Return an instance constructed using the given dict of params. This
method is deliberately less flexible than the class `__init__()` method
and requires all of the `__init__()` args to be given in the dict of
params.
:param params: a dict of parameters
:return: an instance of this class
"""
return cls(
params['name'], params['timestamp'], params['lower'],
params['upper'], params['object_count'], params['bytes_used'],
params['meta_timestamp'], params['deleted'], params['state'],
params['state_timestamp'], params['epoch'])
def find_shard_range(item, ranges):
"""
Find a ShardRange in the given list of ``ranges`` whose namespace
contains ``item``.
:param item: The item for which a ShardRange is to be found.
:param ranges: a sorted list of ShardRanges.
:return: the ShardRange whose namespace contains ``item``, or None if
no suitable range is found.
"""
index = bisect.bisect_left(ranges, item)
if index != len(ranges) and item in ranges[index]:
return ranges[index]
return None
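# Illustrative, doctest-style sketch of the bisect lookup above (shard names
# hypothetical; two contiguous ranges covering the whole namespace):
#
#     >>> ranges = [ShardRange('.shards_a/c-0', Timestamp.now(), '', 'm'),
#     ...           ShardRange('.shards_a/c-1', Timestamp.now(), 'm', '')]
#     >>> find_shard_range('kiwi', ranges).name
#     '.shards_a/c-0'
#     >>> find_shard_range('', ranges) is None    # '' is never contained
#     True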
def modify_priority(conf, logger):
"""
Modify priority by nice and ionice.
@ -4750,3 +5315,110 @@ def distribute_evenly(items, num_buckets):
for index, item in enumerate(items):
out[index % num_buckets].append(item)
return out
def get_redirect_data(response):
"""
Extract a redirect location from a response's headers.
:param response: a response
:return: a tuple of (path, Timestamp) if a Location header is found,
otherwise None
:raises ValueError: if the Location header is found but a
X-Backend-Redirect-Timestamp is not found, or if there is a problem
with the format of either header
"""
headers = HeaderKeyDict(response.getheaders())
if 'Location' not in headers:
return None
location = urlparse(headers['Location']).path
account, container, _junk = split_path(location, 2, 3, True)
timestamp_val = headers.get('X-Backend-Redirect-Timestamp')
try:
timestamp = Timestamp(timestamp_val)
except (TypeError, ValueError):
raise ValueError('Invalid timestamp value: %s' % timestamp_val)
return '%s/%s' % (account, container), timestamp
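# Illustrative sketch (hypothetical values): a shard redirect response is
# expected to carry headers such as
#
#     Location: /.shards_a/c_shard/o
#     X-Backend-Redirect-Timestamp: 1525354321.00000
#
# for which get_redirect_data would return
# ('.shards_a/c_shard', Timestamp('1525354321.00000')).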
def parse_db_filename(filename):
"""
Splits a db filename into three parts: the hash, the epoch, and the
extension.
>>> parse_db_filename("ab2134.db")
('ab2134', None, '.db')
>>> parse_db_filename("ab2134_1234567890.12345.db")
('ab2134', '1234567890.12345', '.db')
:param filename: A db file basename or path to a db file.
:return: A tuple of (hash, epoch, extension). ``epoch`` may be None.
:raises ValueError: if ``filename`` is not a path to a file.
"""
filename = os.path.basename(filename)
if not filename:
raise ValueError('Path to a file required.')
name, ext = os.path.splitext(filename)
parts = name.split('_')
hash_ = parts.pop(0)
epoch = parts[0] if parts else None
return hash_, epoch, ext
def make_db_file_path(db_path, epoch):
"""
Given a path to a db file, return a modified path whose filename part has
the given epoch.
A db filename takes the form <hash>[_<epoch>].db; this method replaces the
<epoch> part of the given ``db_path`` with the given ``epoch`` value.
:param db_path: Path to a db file that does not necessarily exist.
:param epoch: A string that will be used as the epoch in the new path's
filename; the value will be normalized to the normal string
representation of a :class:`~swift.common.utils.Timestamp`.
:return: A modified path to a db file.
:raises ValueError: if the ``epoch`` is not valid for constructing a
:class:`~swift.common.utils.Timestamp`.
"""
if epoch is None:
raise ValueError('epoch must not be None')
epoch = Timestamp(epoch).normal
hash_, _, ext = parse_db_filename(db_path)
db_dir = os.path.dirname(db_path)
return os.path.join(db_dir, '%s_%s%s' % (hash_, epoch, ext))
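# Illustrative, doctest-style example (path and epoch hypothetical; the
# epoch is normalized to Timestamp.normal form):
#
#     >>> make_db_file_path('/srv/node/sda/containers/0/abc/ab2134/ab2134.db',
#     ...                   '1525354321.00000')
#     '/srv/node/sda/containers/0/abc/ab2134/ab2134_1525354321.00000.db'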
def get_db_files(db_path):
"""
Given the path to a db file, return a sorted list of all valid db files
that actually exist in that path's dir. A valid db filename has the form:
<hash>[_<epoch>].db
where <hash> matches the <hash> part of the given db_path as would be
parsed by :meth:`~swift.common.utils.parse_db_filename`.
:param db_path: Path to a db file that does not necessarily exist.
:return: List of valid db files that do exist in the dir of the
``db_path``. This list may be empty.
"""
db_dir, db_file = os.path.split(db_path)
try:
files = os.listdir(db_dir)
except OSError as err:
if err.errno == errno.ENOENT:
return []
raise
if not files:
return []
match_hash, epoch, ext = parse_db_filename(db_file)
results = []
for f in files:
hash_, epoch, ext = parse_db_filename(f)
if ext != '.db':
continue
if hash_ != match_hash:
continue
results.append(os.path.join(db_dir, f))
return sorted(results)
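# Illustrative example (paths hypothetical): while a container is sharding,
# its dir typically holds both the retiring db and a fresh epoch-stamped db,
# and both are returned in sorted order:
#
#     >>> get_db_files('/srv/node/sda/containers/0/abc/ab2134/ab2134.db')
#     ['/srv/node/sda/containers/0/abc/ab2134/ab2134.db',
#      '/srv/node/sda/containers/0/abc/ab2134/ab2134_1525354321.00000.db']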

View File

@ -45,6 +45,9 @@ from swift.common.utils import capture_stdio, disable_fallocate, \
validate_configuration, get_hub, config_auto_int_value, \
reiterate
SIGNUM_TO_NAME = {getattr(signal, n): n for n in dir(signal)
if n.startswith('SIG') and '_' not in n}
# Set maximum line size of message headers to be accepted.
wsgi.MAX_HEADER_LINE = constraints.MAX_HEADER_SIZE
@ -559,7 +562,8 @@ class WorkersStrategy(object):
:param int pid: The new worker process' PID
"""
self.logger.notice('Started child %s' % pid)
self.logger.notice('Started child %s from parent %s',
pid, os.getpid())
self.children.append(pid)
def register_worker_exit(self, pid):
@ -569,7 +573,8 @@ class WorkersStrategy(object):
:param int pid: The PID of the worker that exited.
"""
self.logger.error('Removing dead child %s' % pid)
self.logger.error('Removing dead child %s from parent %s',
pid, os.getpid())
self.children.remove(pid)
def shutdown_sockets(self):
@ -935,24 +940,17 @@ def run_wsgi(conf_path, app_section, *args, **kwargs):
run_server(conf, logger, no_fork_sock, global_conf=global_conf)
return 0
def kill_children(*args):
"""Kills the entire process group."""
logger.error('SIGTERM received')
signal.signal(signal.SIGTERM, signal.SIG_IGN)
running[0] = False
os.killpg(0, signal.SIGTERM)
def stop_with_signal(signum, *args):
"""Set running flag to False and capture the signum"""
running_context[0] = False
running_context[1] = signum
def hup(*args):
"""Shuts down the server, but allows running requests to complete"""
logger.error('SIGHUP received')
signal.signal(signal.SIGHUP, signal.SIG_IGN)
running[0] = False
# context to hold boolean running state and stop signum
running_context = [True, None]
signal.signal(signal.SIGTERM, stop_with_signal)
signal.signal(signal.SIGHUP, stop_with_signal)
running = [True]
signal.signal(signal.SIGTERM, kill_children)
signal.signal(signal.SIGHUP, hup)
while running[0]:
while running_context[0]:
for sock, sock_info in strategy.new_worker_socks():
pid = os.fork()
if pid == 0:
@ -992,11 +990,23 @@ def run_wsgi(conf_path, app_section, *args, **kwargs):
sleep(0.01)
except KeyboardInterrupt:
logger.notice('User quit')
running[0] = False
running_context[0] = False
break
if running_context[1] is not None:
try:
signame = SIGNUM_TO_NAME[running_context[1]]
except KeyError:
logger.error('Stopping with unexpected signal %r' %
running_context[1])
else:
logger.error('%s received', signame)
if running_context[1] == signal.SIGTERM:
os.killpg(0, signal.SIGTERM)
strategy.shutdown_sockets()
logger.notice('Exited')
signal.signal(signal.SIGTERM, signal.SIG_IGN)
logger.notice('Exited (%s)', os.getpid())
return 0

File diff suppressed because it is too large

View File

@ -26,11 +26,10 @@ from swift.container.reconciler import (
get_reconciler_container_name, get_row_to_q_entry_translator)
from swift.common import db_replicator
from swift.common.storage_policy import POLICIES
from swift.common.swob import HTTPOk, HTTPAccepted
from swift.common.exceptions import DeviceUnavailable
from swift.common.http import is_success
from swift.common.db import DatabaseAlreadyExists
from swift.common.utils import (Timestamp, hash_path,
storage_directory, majority_size)
from swift.common.utils import Timestamp, majority_size, get_db_files
class ContainerReplicator(db_replicator.Replicator):
@ -39,6 +38,10 @@ class ContainerReplicator(db_replicator.Replicator):
datadir = DATADIR
default_port = 6201
def __init__(self, conf, logger=None):
super(ContainerReplicator, self).__init__(conf, logger=logger)
self.reconciler_cleanups = self.sync_store = None
def report_up_to_date(self, full_info):
reported_key_map = {
'reported_put_timestamp': 'put_timestamp',
@ -61,8 +64,7 @@ class ContainerReplicator(db_replicator.Replicator):
return sync_args
def _handle_sync_response(self, node, response, info, broker, http,
different_region):
parent = super(ContainerReplicator, self)
different_region=False):
if is_success(response.status):
remote_info = json.loads(response.data)
if incorrect_policy_index(info, remote_info):
@ -75,9 +77,50 @@ class ContainerReplicator(db_replicator.Replicator):
if any(info[key] != remote_info[key] for key in sync_timestamps):
broker.merge_timestamps(*(remote_info[key] for key in
sync_timestamps))
rv = parent._handle_sync_response(
# Grab remote's shard ranges, too
self._fetch_and_merge_shard_ranges(http, broker)
return super(ContainerReplicator, self)._handle_sync_response(
node, response, info, broker, http, different_region)
return rv
def _sync_shard_ranges(self, broker, http, local_id):
# TODO: currently the number of shard ranges is expected to be _much_
# less than the number of normal objects, so all are sync'd on each cycle. However, in
# future there should be sync points maintained much like for object
# syncing so that only new shard range rows are sync'd.
shard_range_data = broker.get_all_shard_range_data()
if shard_range_data:
if not self._send_replicate_request(
http, 'merge_shard_ranges', shard_range_data, local_id):
return False
self.logger.debug('%s synced %s shard ranges to %s',
broker.db_file, len(shard_range_data),
'%(ip)s:%(port)s/%(device)s' % http.node)
return True
def _choose_replication_mode(self, node, rinfo, info, local_sync, broker,
http, different_region):
# Always replicate shard ranges
shard_range_success = self._sync_shard_ranges(broker, http, info['id'])
if broker.sharding_initiated():
self.logger.warning(
'%s is able to shard -- refusing to replicate objects to peer '
'%s; have shard ranges and will wait for cleaving',
broker.db_file,
'%(ip)s:%(port)s/%(device)s' % node)
self.stats['deferred'] += 1
return shard_range_success
success = super(ContainerReplicator, self)._choose_replication_mode(
node, rinfo, info, local_sync, broker, http,
different_region)
return shard_range_success and success
def _fetch_and_merge_shard_ranges(self, http, broker):
response = http.replicate('get_shard_ranges')
if is_success(response.status):
broker.merge_shard_ranges(json.loads(response.data))
def find_local_handoff_for_part(self, part):
"""
@ -114,15 +157,10 @@ class ContainerReplicator(db_replicator.Replicator):
raise DeviceUnavailable(
'No mounted devices found suitable to Handoff reconciler '
'container %s in partition %s' % (container, part))
hsh = hash_path(account, container)
db_dir = storage_directory(DATADIR, part, hsh)
db_path = os.path.join(self.root, node['device'], db_dir, hsh + '.db')
broker = ContainerBroker(db_path, account=account, container=container)
if not os.path.exists(broker.db_file):
try:
broker.initialize(timestamp, 0)
except DatabaseAlreadyExists:
pass
broker = ContainerBroker.create_broker(
os.path.join(self.root, node['device']), part, account, container,
logger=self.logger, put_timestamp=timestamp,
storage_policy_index=0)
if self.reconciler_containers is not None:
self.reconciler_containers[container] = part, broker, node['id']
return broker
@ -207,6 +245,18 @@ class ContainerReplicator(db_replicator.Replicator):
# replication
broker.update_reconciler_sync(max_sync)
def cleanup_post_replicate(self, broker, orig_info, responses):
debug_template = 'Not deleting db %s (%%s)' % broker.db_file
if broker.sharding_required():
# despite being a handoff, since we're sharding we're not going to
# do any cleanup so we can continue cleaving - this is still
# considered "success"
reason = 'requires sharding, state %s' % broker.get_db_state()
self.logger.debug(debug_template, reason)
return True
return super(ContainerReplicator, self).cleanup_post_replicate(
broker, orig_info, responses)
def delete_db(self, broker):
"""
Ensure that reconciler databases are only cleaned up at the end of the
@ -217,12 +267,13 @@ class ContainerReplicator(db_replicator.Replicator):
# this container shouldn't be here, make sure it's cleaned up
self.reconciler_cleanups[broker.container] = broker
return
try:
# DB is going to get deleted. Be preemptive about it
self.sync_store.remove_synced_container(broker)
except Exception:
self.logger.exception('Failed to remove sync_store entry %s' %
broker.db_file)
if self.sync_store:
try:
# DB is going to get deleted. Be preemptive about it
self.sync_store.remove_synced_container(broker)
except Exception:
self.logger.exception('Failed to remove sync_store entry %s' %
broker.db_file)
return super(ContainerReplicator, self).delete_db(broker)
@ -259,9 +310,20 @@ class ContainerReplicator(db_replicator.Replicator):
self.replicate_reconcilers()
return rv
def _in_sync(self, rinfo, info, broker, local_sync):
# TODO: don't always sync shard ranges!
if broker.get_shard_ranges(include_own=True, include_deleted=True):
return False
return super(ContainerReplicator, self)._in_sync(
rinfo, info, broker, local_sync)
class ContainerReplicatorRpc(db_replicator.ReplicatorRpc):
def _db_file_exists(self, db_path):
return bool(get_db_files(db_path))
def _parse_sync_args(self, args):
parent = super(ContainerReplicatorRpc, self)
remote_info = parent._parse_sync_args(args)
@ -289,3 +351,27 @@ class ContainerReplicatorRpc(db_replicator.ReplicatorRpc):
timestamp=status_changed_at)
info = broker.get_replication_info()
return info
def _abort_rsync_then_merge(self, db_file, old_filename):
if super(ContainerReplicatorRpc, self)._abort_rsync_then_merge(
db_file, old_filename):
return True
# if the local db has started sharding since the original 'sync'
# request then abort object replication now; instantiate a fresh broker
# each time this check is performed so as to get the latest state
broker = ContainerBroker(db_file)
return broker.sharding_initiated()
def _post_rsync_then_merge_hook(self, existing_broker, new_broker):
# Note: the following hook will need to change to use a pointer and
# limit in the future.
new_broker.merge_shard_ranges(
existing_broker.get_all_shard_range_data())
def merge_shard_ranges(self, broker, args):
broker.merge_shard_ranges(args[0])
return HTTPAccepted()
def get_shard_ranges(self, broker, args):
return HTTPOk(headers={'Content-Type': 'application/json'},
body=json.dumps(broker.get_all_shard_range_data()))

View File

@ -24,7 +24,8 @@ from eventlet import Timeout
import swift.common.db
from swift.container.sync_store import ContainerSyncStore
from swift.container.backend import ContainerBroker, DATADIR
from swift.container.backend import ContainerBroker, DATADIR, \
RECORD_TYPE_SHARD, UNSHARDED, SHARDING, SHARDED, SHARD_UPDATE_STATES
from swift.container.replicator import ContainerReplicatorRpc
from swift.common.db import DatabaseAlreadyExists
from swift.common.container_sync_realms import ContainerSyncRealms
@ -33,7 +34,8 @@ from swift.common.request_helpers import get_param, \
from swift.common.utils import get_logger, hash_path, public, \
Timestamp, storage_directory, validate_sync_to, \
config_true_value, timing_stats, replication, \
override_bytes_from_content_type, get_log_line
override_bytes_from_content_type, get_log_line, ShardRange, list_from_csv
from swift.common.constraints import valid_timestamp, check_utf8, check_drive
from swift.common import constraints
from swift.common.bufferedhttp import http_connect
@ -46,7 +48,7 @@ from swift.common.header_key_dict import HeaderKeyDict
from swift.common.swob import HTTPAccepted, HTTPBadRequest, HTTPConflict, \
HTTPCreated, HTTPInternalServerError, HTTPNoContent, HTTPNotFound, \
HTTPPreconditionFailed, HTTPMethodNotAllowed, Request, Response, \
HTTPInsufficientStorage, HTTPException
HTTPInsufficientStorage, HTTPException, HTTPMovedPermanently
def gen_resp_headers(info, is_deleted=False):
@ -72,6 +74,7 @@ def gen_resp_headers(info, is_deleted=False):
'X-Timestamp': Timestamp(info.get('created_at', 0)).normal,
'X-PUT-Timestamp': Timestamp(
info.get('put_timestamp', 0)).normal,
'X-Backend-Sharding-State': info.get('db_state', UNSHARDED),
})
return headers
@ -261,6 +264,40 @@ class ContainerController(BaseStorageServer):
self.logger.exception('Failed to update sync_store %s during %s' %
(broker.db_file, method))
def _redirect_to_shard(self, req, broker, obj_name):
"""
If the request indicates that it can accept a redirection, look for a
shard range that contains ``obj_name`` and if one exists return a
HTTPMovedPermanently response.
:param req: an instance of :class:`~swift.common.swob.Request`
:param broker: a container broker
:param obj_name: an object name
:return: an instance of :class:`~swift.common.swob.HTTPMovedPermanently`
if a shard range exists for the given ``obj_name``, otherwise None.
"""
if not config_true_value(
req.headers.get('x-backend-accept-redirect', False)):
return None
shard_ranges = broker.get_shard_ranges(
includes=obj_name, states=SHARD_UPDATE_STATES)
if not shard_ranges:
return None
# note: obj_name may be included in both a created sub-shard and its
# sharding parent. get_shard_ranges will return the created sub-shard
# in preference to the parent, which is the desired result.
containing_range = shard_ranges[0]
location = "/%s/%s" % (containing_range.name, obj_name)
headers = {'Location': location,
'X-Backend-Redirect-Timestamp':
containing_range.timestamp.internal}
# we do not want the host added to the location
req.environ['swift.leave_relative_location'] = True
return HTTPMovedPermanently(headers=headers, request=req)
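# Illustrative sketch (hypothetical names): for an object 'o' falling within
# an updating shard range named '.shards_a/c_shard', the response built
# above would carry
#
#     HTTP/1.1 301 Moved Permanently
#     Location: /.shards_a/c_shard/o
#     X-Backend-Redirect-Timestamp: <the shard range's timestamp.internal>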
@public
@timing_stats()
def DELETE(self, req):
@ -283,6 +320,11 @@ class ContainerController(BaseStorageServer):
if not os.path.exists(broker.db_file):
return HTTPNotFound()
if obj: # delete object
# redirect if a shard range exists for the object name
redirect = self._redirect_to_shard(req, broker, obj)
if redirect:
return redirect
broker.delete_object(obj, req.headers.get('x-timestamp'),
obj_policy_index)
return HTTPNoContent(request=req)
@ -343,6 +385,40 @@ class ContainerController(BaseStorageServer):
broker.update_status_changed_at(timestamp)
return recreated
def _maybe_autocreate(self, broker, req_timestamp, account,
policy_index):
created = False
if account.startswith(self.auto_create_account_prefix) and \
not os.path.exists(broker.db_file):
if policy_index is None:
raise HTTPBadRequest(
'X-Backend-Storage-Policy-Index header is required')
try:
broker.initialize(req_timestamp.internal, policy_index)
except DatabaseAlreadyExists:
pass
else:
created = True
if not os.path.exists(broker.db_file):
raise HTTPNotFound()
return created
def _update_metadata(self, req, broker, req_timestamp, method):
metadata = {}
metadata.update(
(key, (value, req_timestamp.internal))
for key, value in req.headers.items()
if key.lower() in self.save_headers or
is_sys_or_user_meta('container', key))
if metadata:
if 'X-Container-Sync-To' in metadata:
if 'X-Container-Sync-To' not in broker.metadata or \
metadata['X-Container-Sync-To'][0] != \
broker.metadata['X-Container-Sync-To'][0]:
broker.set_x_container_sync_points(-1, -1)
broker.update_metadata(metadata, validate_metadata=True)
self._update_sync_store(broker, method)
@public
@timing_stats()
def PUT(self, req):
@ -364,14 +440,13 @@ class ContainerController(BaseStorageServer):
# obj put expects the policy_index header, default is for
# legacy support during upgrade.
obj_policy_index = requested_policy_index or 0
if account.startswith(self.auto_create_account_prefix) and \
not os.path.exists(broker.db_file):
try:
broker.initialize(req_timestamp.internal, obj_policy_index)
except DatabaseAlreadyExists:
pass
if not os.path.exists(broker.db_file):
return HTTPNotFound()
self._maybe_autocreate(broker, req_timestamp, account,
obj_policy_index)
# redirect if a shard exists for this object name
response = self._redirect_to_shard(req, broker, obj)
if response:
return response
broker.put_object(obj, req_timestamp.internal,
int(req.headers['x-size']),
req.headers['x-content-type'],
@ -380,6 +455,22 @@ class ContainerController(BaseStorageServer):
req.headers.get('x-content-type-timestamp'),
req.headers.get('x-meta-timestamp'))
return HTTPCreated(request=req)
record_type = req.headers.get('x-backend-record-type', '').lower()
if record_type == RECORD_TYPE_SHARD:
try:
# validate incoming data...
shard_ranges = [ShardRange.from_dict(sr)
for sr in json.loads(req.body)]
except (ValueError, KeyError, TypeError) as err:
return HTTPBadRequest('Invalid body: %r' % err)
created = self._maybe_autocreate(broker, req_timestamp, account,
requested_policy_index)
self._update_metadata(req, broker, req_timestamp, 'PUT')
if shard_ranges:
# TODO: consider writing the shard ranges into the pending
# file, but if so ensure an all-or-none semantic for the write
broker.merge_shard_ranges(shard_ranges)
else: # put container
if requested_policy_index is None:
# use the default index sent by the proxy if available
@ -391,31 +482,18 @@ class ContainerController(BaseStorageServer):
req_timestamp.internal,
new_container_policy,
requested_policy_index)
metadata = {}
metadata.update(
(key, (value, req_timestamp.internal))
for key, value in req.headers.items()
if key.lower() in self.save_headers or
is_sys_or_user_meta('container', key))
if 'X-Container-Sync-To' in metadata:
if 'X-Container-Sync-To' not in broker.metadata or \
metadata['X-Container-Sync-To'][0] != \
broker.metadata['X-Container-Sync-To'][0]:
broker.set_x_container_sync_points(-1, -1)
broker.update_metadata(metadata, validate_metadata=True)
if metadata:
self._update_sync_store(broker, 'PUT')
self._update_metadata(req, broker, req_timestamp, 'PUT')
resp = self.account_update(req, account, container, broker)
if resp:
return resp
if created:
return HTTPCreated(request=req,
headers={'x-backend-storage-policy-index':
broker.storage_policy_index})
else:
return HTTPAccepted(request=req,
headers={'x-backend-storage-policy-index':
broker.storage_policy_index})
if created:
return HTTPCreated(request=req,
headers={'x-backend-storage-policy-index':
broker.storage_policy_index})
else:
return HTTPAccepted(request=req,
headers={'x-backend-storage-policy-index':
broker.storage_policy_index})
@public
@timing_stats(sample_rate=0.1)
@ -454,13 +532,18 @@ class ContainerController(BaseStorageServer):
:params record: object entry record
:returns: modified record
"""
(name, created, size, content_type, etag) = record[:5]
if content_type is None:
return {'subdir': name.decode('utf8')}
response = {'bytes': size, 'hash': etag, 'name': name.decode('utf8'),
'content_type': content_type}
if isinstance(record, ShardRange):
created = record.timestamp
response = dict(record)
else:
(name, created, size, content_type, etag) = record[:5]
if content_type is None:
return {'subdir': name.decode('utf8')}
response = {
'bytes': size, 'hash': etag, 'name': name.decode('utf8'),
'content_type': content_type}
override_bytes_from_content_type(response, logger=self.logger)
response['last_modified'] = Timestamp(created).isoformat
override_bytes_from_content_type(response, logger=self.logger)
return response
@public
@ -494,12 +577,45 @@ class ContainerController(BaseStorageServer):
pending_timeout=0.1,
stale_reads_ok=True)
info, is_deleted = broker.get_info_is_deleted()
resp_headers = gen_resp_headers(info, is_deleted=is_deleted)
if is_deleted:
return HTTPNotFound(request=req, headers=resp_headers)
container_list = broker.list_objects_iter(
limit, marker, end_marker, prefix, delimiter, path,
storage_policy_index=info['storage_policy_index'], reverse=reverse)
record_type = req.headers.get('x-backend-record-type', '').lower()
if record_type == 'auto' and info.get('db_state') in (SHARDING,
SHARDED):
record_type = 'shard'
if record_type == 'shard':
override_deleted = info and config_true_value(
req.headers.get('x-backend-override-deleted', False))
resp_headers = gen_resp_headers(
info, is_deleted=is_deleted and not override_deleted)
if is_deleted and not override_deleted:
return HTTPNotFound(request=req, headers=resp_headers)
resp_headers['X-Backend-Record-Type'] = 'shard'
includes = get_param(req, 'includes')
states = get_param(req, 'states')
fill_gaps = False
if states:
states = list_from_csv(states)
fill_gaps = any(('listing' in states, 'updating' in states))
try:
states = broker.resolve_shard_range_states(states)
except ValueError:
return HTTPBadRequest(request=req, body='Bad state')
include_deleted = config_true_value(
req.headers.get('x-backend-include-deleted', False))
container_list = broker.get_shard_ranges(
marker, end_marker, includes, reverse, states=states,
include_deleted=include_deleted, fill_gaps=fill_gaps)
else:
resp_headers = gen_resp_headers(info, is_deleted=is_deleted)
if is_deleted:
return HTTPNotFound(request=req, headers=resp_headers)
resp_headers['X-Backend-Record-Type'] = 'object'
# Use the retired db while container is in process of sharding,
# otherwise use current db
src_broker = broker.get_brokers()[0]
container_list = src_broker.list_objects_iter(
limit, marker, end_marker, prefix, delimiter, path,
storage_policy_index=info['storage_policy_index'],
reverse=reverse)
return self.create_listing(req, out_content_type, info, resp_headers,
broker.metadata, container_list, container)
@ -562,20 +678,7 @@ class ContainerController(BaseStorageServer):
if broker.is_deleted():
return HTTPNotFound(request=req)
broker.update_put_timestamp(req_timestamp.internal)
metadata = {}
metadata.update(
(key, (value, req_timestamp.internal))
for key, value in req.headers.items()
if key.lower() in self.save_headers or
is_sys_or_user_meta('container', key))
if metadata:
if 'X-Container-Sync-To' in metadata:
if 'X-Container-Sync-To' not in broker.metadata or \
metadata['X-Container-Sync-To'][0] != \
broker.metadata['X-Container-Sync-To'][0]:
broker.set_x_container_sync_points(-1, -1)
broker.update_metadata(metadata, validate_metadata=True)
self._update_sync_store(broker, 'POST')
self._update_metadata(req, broker, req_timestamp, 'POST')
return HTTPNoContent(request=req)
def __call__(self, env, start_response):

1568
swift/container/sharder.py Normal file

File diff suppressed because it is too large

View File

@ -35,7 +35,7 @@ from swift.common.utils import public, get_logger, \
normalize_delete_at_timestamp, get_log_line, Timestamp, \
get_expirer_container, parse_mime_headers, \
iter_multipart_mime_documents, extract_swift_bytes, safe_json_loads, \
config_auto_int_value
config_auto_int_value, split_path, get_redirect_data
from swift.common.bufferedhttp import http_connect
from swift.common.constraints import check_object_creation, \
valid_timestamp, check_utf8
@ -44,7 +44,7 @@ from swift.common.exceptions import ConnectionTimeout, DiskFileQuarantined, \
DiskFileDeviceUnavailable, DiskFileExpired, ChunkReadTimeout, \
ChunkReadError, DiskFileXattrNotSupported
from swift.obj import ssync_receiver
from swift.common.http import is_success
from swift.common.http import is_success, HTTP_MOVED_PERMANENTLY
from swift.common.base_storage_server import BaseStorageServer
from swift.common.header_key_dict import HeaderKeyDict
from swift.common.request_helpers import get_name_and_placement, \
@ -245,7 +245,7 @@ class ObjectController(BaseStorageServer):
def async_update(self, op, account, container, obj, host, partition,
contdevice, headers_out, objdevice, policy,
logger_thread_locals=None):
logger_thread_locals=None, container_path=None):
"""
Sends or saves an async update.
@ -263,11 +263,21 @@ class ObjectController(BaseStorageServer):
:param logger_thread_locals: The thread local values to be set on the
self.logger to retain transaction
logging information.
:param container_path: optional path in the form `<account/container>`
to which the update should be sent. If given this path will be used
instead of constructing a path from the ``account`` and
``container`` params.
"""
if logger_thread_locals:
self.logger.thread_locals = logger_thread_locals
headers_out['user-agent'] = 'object-server %s' % os.getpid()
full_path = '/%s/%s/%s' % (account, container, obj)
if container_path:
# use explicitly specified container path
full_path = '/%s/%s' % (container_path, obj)
else:
full_path = '/%s/%s/%s' % (account, container, obj)
redirect_data = None
if all([host, partition, contdevice]):
try:
with ConnectionTimeout(self.conn_timeout):
@ -277,15 +287,23 @@ class ObjectController(BaseStorageServer):
with Timeout(self.node_timeout):
response = conn.getresponse()
response.read()
if is_success(response.status):
return
else:
self.logger.error(_(
'ERROR Container update failed '
'(saving for async update later): %(status)d '
'response from %(ip)s:%(port)s/%(dev)s'),
{'status': response.status, 'ip': ip, 'port': port,
'dev': contdevice})
if is_success(response.status):
return
if response.status == HTTP_MOVED_PERMANENTLY:
try:
redirect_data = get_redirect_data(response)
except ValueError as err:
self.logger.error(
'Container update failed for %r; problem with '
'redirect location: %s' % (obj, err))
else:
self.logger.error(_(
'ERROR Container update failed '
'(saving for async update later): %(status)d '
'response from %(ip)s:%(port)s/%(dev)s'),
{'status': response.status, 'ip': ip, 'port': port,
'dev': contdevice})
except (Exception, Timeout):
self.logger.exception(_(
'ERROR container update failed with '
@ -293,6 +311,13 @@ class ObjectController(BaseStorageServer):
{'ip': ip, 'port': port, 'dev': contdevice})
data = {'op': op, 'account': account, 'container': container,
'obj': obj, 'headers': headers_out}
if redirect_data:
self.logger.debug(
'Update to %(path)s redirected to %(redirect)s',
{'path': full_path, 'redirect': redirect_data[0]})
container_path = redirect_data[0]
if container_path:
data['container_path'] = container_path
timestamp = headers_out.get('x-meta-timestamp',
headers_out.get('x-timestamp'))
self._diskfile_router[policy].pickle_async_update(
@ -319,6 +344,7 @@ class ObjectController(BaseStorageServer):
contdevices = [d.strip() for d in
headers_in.get('X-Container-Device', '').split(',')]
contpartition = headers_in.get('X-Container-Partition', '')
contpath = headers_in.get('X-Backend-Container-Path')
if len(conthosts) != len(contdevices):
# This shouldn't happen unless there's a bug in the proxy,
@ -331,6 +357,21 @@ class ObjectController(BaseStorageServer):
'devices': headers_in.get('X-Container-Device', '')})
return
if contpath:
try:
# TODO: this is very late in request handling to be validating
# a header - if we did *not* check and the header was bad
# presumably the update would fail and we would fall back to an
# async update to the root container, which might be the best
# course of action rather than aborting the update altogether?
split_path('/' + contpath, minsegs=2, maxsegs=2)
except ValueError:
self.logger.error(
"Invalid X-Backend-Container-Path, should be of the form "
"'account/container' but got %r." % contpath)
# fall back to updating root container
contpath = None
if contpartition:
updates = zip(conthosts, contdevices)
else:
@ -344,7 +385,8 @@ class ObjectController(BaseStorageServer):
gt = spawn(self.async_update, op, account, container, obj,
conthost, contpartition, contdevice, headers_out,
objdevice, policy,
logger_thread_locals=self.logger.thread_locals)
logger_thread_locals=self.logger.thread_locals,
container_path=contpath)
update_greenthreads.append(gt)
# Wait a little bit to see if the container updates are successful.
# If we immediately return after firing off the greenthread above, then

View File

@ -28,12 +28,14 @@ from swift.common.constraints import check_drive
from swift.common.exceptions import ConnectionTimeout
from swift.common.ring import Ring
from swift.common.utils import get_logger, renamer, write_pickle, \
dump_recon_cache, config_true_value, ratelimit_sleep, eventlet_monkey_patch
dump_recon_cache, config_true_value, ratelimit_sleep, split_path, \
eventlet_monkey_patch, get_redirect_data
from swift.common.daemon import Daemon
from swift.common.header_key_dict import HeaderKeyDict
from swift.common.storage_policy import split_policy_string, PolicyError
from swift.obj.diskfile import get_tmp_dir, ASYNCDIR_BASE
from swift.common.http import is_success, HTTP_INTERNAL_SERVER_ERROR
from swift.common.http import is_success, HTTP_INTERNAL_SERVER_ERROR, \
HTTP_MOVED_PERMANENTLY
class SweepStats(object):
@ -41,12 +43,13 @@ class SweepStats(object):
Stats bucket for an update sweep
"""
def __init__(self, errors=0, failures=0, quarantines=0, successes=0,
unlinks=0):
unlinks=0, redirects=0):
self.errors = errors
self.failures = failures
self.quarantines = quarantines
self.successes = successes
self.unlinks = unlinks
self.redirects = redirects
def copy(self):
return type(self)(self.errors, self.failures, self.quarantines,
@ -57,7 +60,8 @@ class SweepStats(object):
self.failures - other.failures,
self.quarantines - other.quarantines,
self.successes - other.successes,
self.unlinks - other.unlinks)
self.unlinks - other.unlinks,
self.redirects - other.redirects)
def reset(self):
self.errors = 0
@ -65,6 +69,7 @@ class SweepStats(object):
self.quarantines = 0
self.successes = 0
self.unlinks = 0
self.redirects = 0
def __str__(self):
keys = (
@ -73,6 +78,7 @@ class SweepStats(object):
(self.quarantines, 'quarantines'),
(self.unlinks, 'unlinks'),
(self.errors, 'errors'),
(self.redirects, 'redirects'),
)
return ', '.join('%d %s' % pair for pair in keys)
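# Illustrative example (counts hypothetical, and assuming the elided leading
# entries of ``keys`` are successes and failures):
#
#     >>> str(SweepStats(successes=2, unlinks=2, redirects=1))
#     '2 successes, 0 failures, 0 quarantines, 2 unlinks, 0 errors, 1 redirects'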
@ -279,7 +285,8 @@ class ObjectUpdater(Daemon):
'in %(elapsed).02fs seconds:, '
'%(successes)d successes, %(failures)d failures, '
'%(quarantines)d quarantines, '
'%(unlinks)d unlinks, %(errors)d errors '
'%(unlinks)d unlinks, %(errors)d errors, '
'%(redirects)d redirects '
'(pid: %(pid)d)'),
{'device': device,
'elapsed': time.time() - start_time,
@ -288,7 +295,8 @@ class ObjectUpdater(Daemon):
'failures': sweep_totals.failures,
'quarantines': sweep_totals.quarantines,
'unlinks': sweep_totals.unlinks,
'errors': sweep_totals.errors})
'errors': sweep_totals.errors,
'redirects': sweep_totals.redirects})
def process_object_update(self, update_path, device, policy):
"""
@ -309,44 +317,83 @@ class ObjectUpdater(Daemon):
os.path.basename(update_path))
renamer(update_path, target_path, fsync=False)
return
successes = update.get('successes', [])
part, nodes = self.get_container_ring().get_nodes(
update['account'], update['container'])
obj = '/%s/%s/%s' % \
(update['account'], update['container'], update['obj'])
headers_out = HeaderKeyDict(update['headers'])
headers_out['user-agent'] = 'object-updater %s' % os.getpid()
headers_out.setdefault('X-Backend-Storage-Policy-Index',
str(int(policy)))
events = [spawn(self.object_update,
node, part, update['op'], obj, headers_out)
for node in nodes if node['id'] not in successes]
success = True
new_successes = False
for event in events:
event_success, node_id = event.wait()
if event_success is True:
successes.append(node_id)
new_successes = True
def do_update():
successes = update.get('successes', [])
headers_out = HeaderKeyDict(update['headers'].copy())
headers_out['user-agent'] = 'object-updater %s' % os.getpid()
headers_out.setdefault('X-Backend-Storage-Policy-Index',
str(int(policy)))
headers_out.setdefault('X-Backend-Accept-Redirect', 'true')
container_path = update.get('container_path')
if container_path:
acct, cont = split_path('/' + container_path, minsegs=2)
else:
success = False
if success:
self.stats.successes += 1
self.logger.increment('successes')
self.logger.debug('Update sent for %(obj)s %(path)s',
{'obj': obj, 'path': update_path})
self.stats.unlinks += 1
self.logger.increment('unlinks')
os.unlink(update_path)
else:
self.stats.failures += 1
self.logger.increment('failures')
self.logger.debug('Update failed for %(obj)s %(path)s',
{'obj': obj, 'path': update_path})
if new_successes:
update['successes'] = successes
write_pickle(update, update_path, os.path.join(
device, get_tmp_dir(policy)))
acct, cont = update['account'], update['container']
part, nodes = self.get_container_ring().get_nodes(acct, cont)
obj = '/%s/%s/%s' % (acct, cont, update['obj'])
events = [spawn(self.object_update,
node, part, update['op'], obj, headers_out)
for node in nodes if node['id'] not in successes]
success = True
new_successes = rewrite_pickle = False
redirect = None
redirects = set()
for event in events:
event_success, node_id, redirect = event.wait()
if event_success is True:
successes.append(node_id)
new_successes = True
else:
success = False
if redirect:
redirects.add(redirect)
if success:
self.stats.successes += 1
self.logger.increment('successes')
self.logger.debug('Update sent for %(obj)s %(path)s',
{'obj': obj, 'path': update_path})
self.stats.unlinks += 1
self.logger.increment('unlinks')
os.unlink(update_path)
elif redirects:
# erase any previous successes
update.pop('successes', None)
redirect = max(redirects, key=lambda x: x[-1])[0]
redirect_history = update.setdefault('redirect_history', [])
if redirect in redirect_history:
# force next update to be sent to root, reset history
update['container_path'] = None
update['redirect_history'] = []
else:
update['container_path'] = redirect
redirect_history.append(redirect)
self.stats.redirects += 1
self.logger.increment("redirects")
self.logger.debug(
'Update redirected for %(obj)s %(path)s to %(shard)s',
{'obj': obj, 'path': update_path,
'shard': update['container_path']})
rewrite_pickle = True
else:
self.stats.failures += 1
self.logger.increment('failures')
self.logger.debug('Update failed for %(obj)s %(path)s',
{'obj': obj, 'path': update_path})
if new_successes:
update['successes'] = successes
rewrite_pickle = True
return rewrite_pickle, redirect
rewrite_pickle, redirect = do_update()
if redirect:
# make one immediate retry to the redirect location
rewrite_pickle, redirect = do_update()
if rewrite_pickle:
write_pickle(update, update_path, os.path.join(
device, get_tmp_dir(policy)))
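# Illustrative flow (shard names hypothetical): an update to a/c answered
# with 301 -> .shards_a/c-0 records container_path='.shards_a/c-0' and
# redirect_history=['.shards_a/c-0'], then makes one immediate retry to the
# shard. On success the async pending is unlinked; on failure the updated
# pickle may be rewritten for the next sweep. A redirect target that already
# appears in redirect_history is treated as a loop: container_path is reset
# so the next attempt goes back to the root container.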
def object_update(self, node, part, op, obj, headers_out):
"""
@ -357,7 +404,12 @@ class ObjectUpdater(Daemon):
:param op: operation performed (ex: 'PUT' or 'DELETE')
:param obj: object name being updated
:param headers_out: headers to send with the update
:return: a tuple of (``success``, ``node_id``, ``redirect``)
where ``success`` is True if the update succeeded, ``node_id`` is
the id of the node updated and ``redirect`` is either None or a
tuple of (a path, a timestamp string).
"""
redirect = None
try:
with ConnectionTimeout(self.conn_timeout):
conn = http_connect(node['ip'], node['port'], node['device'],
@ -365,15 +417,24 @@ class ObjectUpdater(Daemon):
with Timeout(self.node_timeout):
resp = conn.getresponse()
resp.read()
success = is_success(resp.status)
if not success:
self.logger.debug(
_('Error code %(status)d is returned from remote '
'server %(ip)s: %(port)s / %(device)s'),
{'status': resp.status, 'ip': node['ip'],
'port': node['port'], 'device': node['device']})
return (success, node['id'])
if resp.status == HTTP_MOVED_PERMANENTLY:
try:
redirect = get_redirect_data(resp)
except ValueError as err:
self.logger.error(
'Container update failed for %r; problem with '
'redirect location: %s' % (obj, err))
success = is_success(resp.status)
if not success:
self.logger.debug(
_('Error code %(status)d is returned from remote '
'server %(ip)s: %(port)s / %(device)s'),
{'status': resp.status, 'ip': node['ip'],
'port': node['port'], 'device': node['device']})
return success, node['id'], redirect
except (Exception, Timeout):
self.logger.exception(_('ERROR with remote server '
'%(ip)s:%(port)s/%(device)s'), node)
return HTTP_INTERNAL_SERVER_ERROR, node['id']
return HTTP_INTERNAL_SERVER_ERROR, node['id'], redirect

View File

@ -28,6 +28,7 @@ from six.moves.urllib.parse import quote
import os
import time
import json
import functools
import inspect
import itertools
@ -40,11 +41,11 @@ from eventlet import sleep
from eventlet.timeout import Timeout
import six
from swift.common.wsgi import make_pre_authed_env
from swift.common.wsgi import make_pre_authed_env, make_pre_authed_request
from swift.common.utils import Timestamp, config_true_value, \
public, split_path, list_from_csv, GreenthreadSafeIterator, \
GreenAsyncPile, quorum_size, parse_content_type, \
document_iters_to_http_response_body
document_iters_to_http_response_body, ShardRange
from swift.common.bufferedhttp import http_connect
from swift.common import constraints
from swift.common.exceptions import ChunkReadTimeout, ChunkWriteTimeout, \
@ -188,6 +189,7 @@ def headers_to_container_info(headers, status_int=HTTP_OK):
},
'meta': meta,
'sysmeta': sysmeta,
'sharding_state': headers.get('x-backend-sharding-state', 'unsharded'),
}
@ -375,6 +377,9 @@ def get_container_info(env, app, swift_source=None):
else:
info[field] = int(info[field])
if info.get('sharding_state') is None:
info['sharding_state'] = 'unsharded'
return info
@ -1994,3 +1999,91 @@ class Controller(object):
else:
raise ValueError(
"server_type can only be 'account' or 'container'")
def _get_container_listing(self, req, account, container, headers=None,
params=None):
"""
Fetch container listing from given `account/container`.
:param req: original Request instance.
:param account: account in which `container` is stored.
:param container: container from which the listing should be fetched.
:param headers: headers to be included with the request
:param params: query string parameters to be used.
:return: a tuple of (deserialized json data structure, swob Response)
"""
params = params or {}
version, _a, _c, _other = req.split_path(3, 4, True)
path = '/'.join(['', version, account, container])
subreq = make_pre_authed_request(
req.environ, method='GET', path=quote(path), headers=req.headers,
swift_source='SH')
if headers:
subreq.headers.update(headers)
subreq.params = params
self.app.logger.debug(
'Get listing from %s %s' % (subreq.path_qs, headers))
response = self.app.handle_request(subreq)
if not is_success(response.status_int):
self.app.logger.warning(
'Failed to get container listing from %s: %s',
subreq.path_qs, response.status_int)
return None, response
try:
data = json.loads(response.body)
if not isinstance(data, list):
raise ValueError('not a list')
return data, response
except ValueError as err:
self.app.logger.error(
'Problem with listing response from %s: %r',
subreq.path_qs, err)
return None, response
def _get_shard_ranges(self, req, account, container, includes=None,
states=None):
"""
Fetch shard ranges from given `account/container`. If `includes` is
given then the shard range for that object name is requested, otherwise
all shard ranges are requested.
:param req: original Request instance.
:param account: account from which shard ranges should be fetched.
:param container: container from which shard ranges should be fetched.
:param includes: (optional) restricts the list of fetched shard ranges
to those which include the given name.
:param states: (optional) the states of shard ranges to be fetched.
:return: a list of instances of :class:`swift.common.utils.ShardRange`,
or None if there was a problem fetching the shard ranges
"""
params = req.params.copy()
params.pop('limit', None)
params['format'] = 'json'
if includes:
params['includes'] = includes
if states:
params['states'] = states
headers = {'X-Backend-Record-Type': 'shard'}
listing, response = self._get_container_listing(
req, account, container, headers=headers, params=params)
if listing is None:
return None
record_type = response.headers.get('x-backend-record-type')
if record_type != 'shard':
err = 'unexpected record type %r' % record_type
self.app.logger.error("Failed to get shard ranges from %s: %s",
req.path_qs, err)
return None
try:
return [ShardRange.from_dict(shard_range)
for shard_range in listing]
except (ValueError, TypeError, KeyError) as err:
self.app.logger.error(
"Failed to get shard ranges from %s: invalid data: %r",
req.path_qs, err)
return None
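# Illustrative backend request (hypothetical names): to find the updating
# shard for object 'o' in a/c, the proxy would issue roughly
#
#     GET /v1/a/c?format=json&includes=o&states=updating
#     X-Backend-Record-Type: shard
#
# and deserialize the JSON listing into ShardRange instances.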

View File

@ -14,11 +14,14 @@
# limitations under the License.
from swift import gettext_ as _
import json
from six.moves.urllib.parse import unquote
from swift.common.utils import public, csv_append, Timestamp
from swift.common.constraints import check_metadata
from swift.common.utils import public, csv_append, Timestamp, \
config_true_value, ShardRange
from swift.common.constraints import check_metadata, CONTAINER_LISTING_LIMIT
from swift.common.http import HTTP_ACCEPTED, is_success
from swift.common.request_helpers import get_sys_meta_prefix
from swift.proxy.controllers.base import Controller, delay_denial, \
cors_validation, set_info_cache, clear_info_cache
from swift.common.storage_policy import POLICIES
@ -84,7 +87,9 @@ class ContainerController(Controller):
def GETorHEAD(self, req):
"""Handler for HTTP GET/HEAD requests."""
ai = self.account_info(self.account_name, req)
if not ai[1]:
auto_account = self.account_name.startswith(
self.app.auto_create_account_prefix)
if not (auto_account or ai[1]):
if 'swift.authorize' in req.environ:
aresp = req.environ['swift.authorize'](req)
if aresp:
@ -101,10 +106,20 @@ class ContainerController(Controller):
node_iter = self.app.iter_nodes(self.app.container_ring, part)
params = req.params
params['format'] = 'json'
record_type = req.headers.get('X-Backend-Record-Type', '').lower()
if not record_type:
record_type = 'auto'
req.headers['X-Backend-Record-Type'] = 'auto'
params['states'] = 'listing'
req.params = params
resp = self.GETorHEAD_base(
req, _('Container'), node_iter, part,
req.swift_entity_path, concurrency)
resp_record_type = resp.headers.get('X-Backend-Record-Type', '')
if all((req.method == "GET", record_type == 'auto',
resp_record_type.lower() == 'shard')):
resp = self._get_from_shards(req, resp)
# Cache this. We just made a request to a storage node and got
# up-to-date information for the container.
resp.headers['X-Backend-Recheck-Container-Existence'] = str(
@ -122,6 +137,104 @@ class ContainerController(Controller):
for key in self.app.swift_owner_headers:
if key in resp.headers:
del resp.headers[key]
# Expose sharding state in reseller requests
if req.environ.get('reseller_request', False):
resp.headers['X-Container-Sharding'] = config_true_value(
resp.headers.get(get_sys_meta_prefix('container') + 'Sharding',
'False'))
return resp
def _get_from_shards(self, req, resp):
# construct listing using shards described by the response body
shard_ranges = [ShardRange.from_dict(data)
for data in json.loads(resp.body)]
self.app.logger.debug('GET listing from %s shards for: %s',
len(shard_ranges), req.path_qs)
if not shard_ranges:
# can't find ranges or there was a problem getting the ranges. So
# return what we have.
return resp
objects = []
req_limit = int(req.params.get('limit', CONTAINER_LISTING_LIMIT))
params = req.params.copy()
params.pop('states', None)
req.headers.pop('X-Backend-Record-Type', None)
reverse = config_true_value(params.get('reverse'))
marker = params.get('marker')
end_marker = params.get('end_marker')
limit = req_limit
for shard_range in shard_ranges:
params['limit'] = limit
# Always set marker to ensure that object names less than or equal
# to those already in the listing are not fetched
if objects:
last_name = objects[-1].get('name',
objects[-1].get('subdir', u''))
params['marker'] = last_name.encode('utf-8')
elif reverse and marker and marker > shard_range.lower:
params['marker'] = marker
elif marker and marker <= shard_range.upper:
params['marker'] = marker
else:
params['marker'] = shard_range.upper_str if reverse \
else shard_range.lower_str
if params['marker'] and reverse:
params['marker'] += '\x00'
# Always set end_marker to ensure that misplaced objects beyond
# the expected shard range are not fetched
if end_marker and end_marker in shard_range:
params['end_marker'] = end_marker
else:
params['end_marker'] = shard_range.lower_str if reverse \
else shard_range.upper_str
if params['end_marker'] and not reverse:
params['end_marker'] += '\x00'
if (shard_range.account == self.account_name and
shard_range.container == self.container_name):
# directed back to same container - force GET of objects
headers = {'X-Backend-Record-Type': 'object'}
else:
headers = None
self.app.logger.debug('Getting from %s %s with %s',
shard_range, shard_range.name, headers)
objs, shard_resp = self._get_container_listing(
req, shard_range.account, shard_range.container,
headers=headers, params=params)
if not objs:
# tolerate errors or empty shard containers
continue
objects.extend(objs)
limit -= len(objs)
if limit <= 0:
break
elif (end_marker and reverse and
end_marker >= objects[-1]['name'].encode('utf-8')):
break
elif (end_marker and not reverse and
end_marker <= objects[-1]['name'].encode('utf-8')):
break
resp.body = json.dumps(objects)
constrained = any(req.params.get(constraint) for constraint in (
'marker', 'end_marker', 'path', 'prefix', 'delimiter'))
if not constrained and len(objects) < req_limit:
self.app.logger.debug('Setting object count to %s' % len(objects))
# prefer the actual listing stats over the potentially outdated
# root stats. This condition is only likely when a sharded
# container is shrinking or in tests; typically a sharded container
# will have more than CONTAINER_LISTING_LIMIT objects so any
# unconstrained listing will be capped by the limit and total
# object stats cannot therefore be inferred from the listing.
resp.headers['X-Container-Object-Count'] = len(objects)
resp.headers['X-Container-Bytes-Used'] = sum(
[o['bytes'] for o in objects])
return resp
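# Illustrative windowing (bounds hypothetical): a forward listing across
# shard ranges ('', 'm'] and ('m', MAX] with no client marker issues, in
# order, roughly
#
#     GET .shards_a/c-0: marker='', end_marker='m\x00'
#     GET .shards_a/c-1: marker=<last name already listed>, end_marker=''
#
# so names at or before those already listed are never fetched again, and
# misplaced objects beyond a shard's upper bound are excluded.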
@public
@ -150,6 +263,10 @@ class ContainerController(Controller):
if not req.environ.get('swift_owner'):
for key in self.app.swift_owner_headers:
req.headers.pop(key, None)
if req.environ.get('reseller_request', False) and \
'X-Container-Sharding' in req.headers:
req.headers[get_sys_meta_prefix('container') + 'Sharding'] = \
str(config_true_value(req.headers['X-Container-Sharding']))
length_limit = self.get_name_length_limit()
if len(self.container_name) > length_limit:
resp = HTTPBadRequest(request=req)
@ -198,6 +315,10 @@ class ContainerController(Controller):
if not req.environ.get('swift_owner'):
for key in self.app.swift_owner_headers:
req.headers.pop(key, None)
if req.environ.get('reseller_request', False) and \
'X-Container-Sharding' in req.headers:
req.headers[get_sys_meta_prefix('container') + 'Sharding'] = \
str(config_true_value(req.headers['X-Container-Sharding']))
account_partition, accounts, container_count = \
self.account_info(self.account_name, req)
if not accounts:

View File

@ -266,6 +266,20 @@ class BaseObjectController(Controller):
"""Handler for HTTP HEAD requests."""
return self.GETorHEAD(req)
def _get_update_target(self, req, container_info):
# find the sharded container to which we'll send the update
db_state = container_info.get('sharding_state', 'unsharded')
if db_state in ('sharded', 'sharding'):
shard_ranges = self._get_shard_ranges(
req, self.account_name, self.container_name,
includes=self.object_name, states='updating')
if shard_ranges:
partition, nodes = self.app.container_ring.get_nodes(
shard_ranges[0].account, shard_ranges[0].container)
return partition, nodes, shard_ranges[0].name
return container_info['partition'], container_info['nodes'], None
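# Illustrative outcome (hypothetical names): for a PUT to a/c/o while a/c is
# sharding and an 'updating' shard range '.shards_a/c-0' includes 'o', the
# update target becomes the container ring partition and nodes for
# .shards_a/c-0, and container_path '.shards_a/c-0' is later passed to
# object servers via X-Backend-Container-Path so their container updates go
# to the shard rather than the root.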
@public
@cors_validation
@delay_denial
@ -273,8 +287,8 @@ class BaseObjectController(Controller):
"""HTTP POST request handler."""
container_info = self.container_info(
self.account_name, self.container_name, req)
container_partition = container_info['partition']
container_nodes = container_info['nodes']
container_partition, container_nodes, container_path = \
self._get_update_target(req, container_info)
req.acl = container_info['write_acl']
if 'swift.authorize' in req.environ:
aresp = req.environ['swift.authorize'](req)
@ -304,13 +318,14 @@ class BaseObjectController(Controller):
headers = self._backend_requests(
req, len(nodes), container_partition, container_nodes,
delete_at_container, delete_at_part, delete_at_nodes)
delete_at_container, delete_at_part, delete_at_nodes,
container_path=container_path)
return self._post_object(req, obj_ring, partition, headers)
def _backend_requests(self, req, n_outgoing,
container_partition, containers,
delete_at_container=None, delete_at_partition=None,
delete_at_nodes=None):
delete_at_nodes=None, container_path=None):
policy_index = req.headers['X-Backend-Storage-Policy-Index']
policy = POLICIES.get_by_index(policy_index)
headers = [self.generate_request_headers(req, additional=req.headers)
@ -324,6 +339,8 @@ class BaseObjectController(Controller):
headers[index]['X-Container-Device'] = csv_append(
headers[index].get('X-Container-Device'),
container['device'])
if container_path:
headers[index]['X-Backend-Container-Path'] = container_path
def set_delete_at_headers(index, delete_at_node):
headers[index]['X-Delete-At-Container'] = delete_at_container
@ -752,8 +769,8 @@ class BaseObjectController(Controller):
policy_index = req.headers.get('X-Backend-Storage-Policy-Index',
container_info['storage_policy'])
obj_ring = self.app.get_object_ring(policy_index)
container_nodes = container_info['nodes']
container_partition = container_info['partition']
container_partition, container_nodes, container_path = \
self._get_update_target(req, container_info)
partition, nodes = obj_ring.get_nodes(
self.account_name, self.container_name, self.object_name)
@ -800,7 +817,8 @@ class BaseObjectController(Controller):
# add special headers to be handled by storage nodes
outgoing_headers = self._backend_requests(
req, len(nodes), container_partition, container_nodes,
delete_at_container, delete_at_part, delete_at_nodes)
delete_at_container, delete_at_part, delete_at_nodes,
container_path=container_path)
# send object to storage nodes
resp = self._store_object(
@ -823,8 +841,8 @@ class BaseObjectController(Controller):
next_part_power = getattr(obj_ring, 'next_part_power', None)
if next_part_power:
req.headers['X-Backend-Next-Part-Power'] = next_part_power
container_partition = container_info['partition']
container_nodes = container_info['nodes']
container_partition, container_nodes, container_path = \
self._get_update_target(req, container_info)
req.acl = container_info['write_acl']
req.environ['swift_sync_key'] = container_info['sync_key']
if 'swift.authorize' in req.environ:
@ -851,7 +869,8 @@ class BaseObjectController(Controller):
node_count += local_handoffs
headers = self._backend_requests(
req, node_count, container_partition, container_nodes)
req, node_count, container_partition, container_nodes,
container_path=container_path)
return self._delete_object(req, obj_ring, partition, headers)

View File

@ -17,7 +17,11 @@
# The code below enables nosetests to work with i18n _() blocks
from __future__ import print_function
import sys
from contextlib import contextmanager
import os
from six import reraise
try:
from unittest.util import safe_repr
except ImportError:
@ -86,3 +90,26 @@ def listen_zero():
sock.bind(("127.0.0.1", 0))
sock.listen(50)
return sock
@contextmanager
def annotate_failure(msg):
"""
Catch AssertionError and annotate it with a message. Useful when making
assertions in a loop where the message can indicate the loop index or
richer context about the failure.
:param msg: A message to be prefixed to the AssertionError message.
"""
try:
yield
except AssertionError as err:
err_typ, err_val, err_tb = sys.exc_info()
if err_val.args:
msg = '%s Failed with %s' % (msg, err_val.args[0])
err_val.args = (msg, ) + err_val.args[1:]
else:
# workaround for some IDE's raising custom AssertionErrors
err_val = '%s Failed with %s' % (msg, err)
err_typ = AssertionError
reraise(err_typ, err_val, err_tb)
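# Illustrative usage (not part of this change; names hypothetical):
#
#     for i, node in enumerate(nodes):
#         with annotate_failure('node index %s:' % i):
#             self.assertEqual(expected, get_state(node))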

View File

@ -99,9 +99,11 @@ class BrainSplitter(object):
raise ValueError('Unknown server_type: %r' % server_type)
self.server_type = server_type
part, nodes = self.ring.get_nodes(self.account, c, o)
self.part, self.nodes = self.ring.get_nodes(self.account, c, o)
node_ids = [n['id'] for n in self.nodes]
self.node_numbers = [n + 1 for n in node_ids]
node_ids = [n['id'] for n in nodes]
if all(n_id in node_ids for n_id in (0, 1)):
self.primary_numbers = (1, 2)
self.handoff_numbers = (3, 4)

View File

@ -14,6 +14,8 @@
# limitations under the License.
from __future__ import print_function
import errno
import os
from subprocess import Popen, PIPE
import sys
@ -125,13 +127,17 @@ def kill_server(ipport, ipport2server):
if err:
raise Exception('unable to kill %s' % (server if not number else
'%s%s' % (server, number)))
return wait_for_server_to_hangup(ipport)
def wait_for_server_to_hangup(ipport):
try_until = time() + 30
while True:
try:
conn = HTTPConnection(*ipport)
conn.request('GET', '/')
conn.getresponse()
except Exception as err:
except Exception:
break
if time() > try_until:
raise Exception(
@ -334,33 +340,35 @@ class ProbeTest(unittest.TestCase):
Don't instantiate this directly, use a child class instead.
"""
def _load_rings_and_configs(self):
self.ipport2server = {}
self.configs = defaultdict(dict)
self.account_ring = get_ring(
'account',
self.acct_cont_required_replicas,
self.acct_cont_required_devices,
ipport2server=self.ipport2server,
config_paths=self.configs)
self.container_ring = get_ring(
'container',
self.acct_cont_required_replicas,
self.acct_cont_required_devices,
ipport2server=self.ipport2server,
config_paths=self.configs)
self.policy = get_policy(**self.policy_requirements)
self.object_ring = get_ring(
self.policy.ring_name,
self.obj_required_replicas,
self.obj_required_devices,
server='object',
ipport2server=self.ipport2server,
config_paths=self.configs)
def setUp(self):
resetswift()
kill_orphans()
self._load_rings_and_configs()
try:
self.ipport2server = {}
self.configs = defaultdict(dict)
self.account_ring = get_ring(
'account',
self.acct_cont_required_replicas,
self.acct_cont_required_devices,
ipport2server=self.ipport2server,
config_paths=self.configs)
self.container_ring = get_ring(
'container',
self.acct_cont_required_replicas,
self.acct_cont_required_devices,
ipport2server=self.ipport2server,
config_paths=self.configs)
self.policy = get_policy(**self.policy_requirements)
self.object_ring = get_ring(
self.policy.ring_name,
self.obj_required_replicas,
self.obj_required_devices,
server='object',
ipport2server=self.ipport2server,
config_paths=self.configs)
self.servers_per_port = any(
int(readconf(c, section_name='object-replicator').get(
'servers_per_port', '0'))
@ -489,6 +497,49 @@ class ProbeTest(unittest.TestCase):
finally:
shutil.rmtree(tempdir)
def get_all_object_nodes(self):
"""
Returns a list of all nodes in all object storage policies.
:return: a list of node dicts.
"""
all_obj_nodes = {}
for policy in ENABLED_POLICIES:
for dev in policy.object_ring.devs:
all_obj_nodes[dev['device']] = dev
return all_obj_nodes.values()
def gather_async_pendings(self, onodes):
"""
Returns a list of paths to async pending files found on given nodes.
:param onodes: a list of nodes.
:return: a list of file paths.
"""
async_pendings = []
for onode in onodes:
device_dir = self.device_dir('', onode)
for ap_pol_dir in os.listdir(device_dir):
if not ap_pol_dir.startswith('async_pending'):
# skip 'objects', 'containers', etc.
continue
async_pending_dir = os.path.join(device_dir, ap_pol_dir)
try:
ap_dirs = os.listdir(async_pending_dir)
except OSError as err:
if err.errno == errno.ENOENT:
pass
else:
raise
else:
for ap_dir in ap_dirs:
ap_dir_fullpath = os.path.join(
async_pending_dir, ap_dir)
async_pendings.extend([
os.path.join(ap_dir_fullpath, ent)
for ent in os.listdir(ap_dir_fullpath)])
return async_pendings
class ReplProbeTest(ProbeTest):

View File

@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import errno
import os
import random
import time
import uuid
@ -143,31 +141,6 @@ class TestObjectExpirer(ReplProbeTest):
# that the object server does not write out any async pendings; this
# test asserts that this is the case.
def gather_async_pendings(onodes):
async_pendings = []
for onode in onodes:
device_dir = self.device_dir('', onode)
for ap_pol_dir in os.listdir(device_dir):
if not ap_pol_dir.startswith('async_pending'):
# skip 'objects', 'containers', etc.
continue
async_pending_dir = os.path.join(device_dir, ap_pol_dir)
try:
ap_dirs = os.listdir(async_pending_dir)
except OSError as err:
if err.errno == errno.ENOENT:
pass
else:
raise
else:
for ap_dir in ap_dirs:
ap_dir_fullpath = os.path.join(
async_pending_dir, ap_dir)
async_pendings.extend([
os.path.join(ap_dir_fullpath, ent)
for ent in os.listdir(ap_dir_fullpath)])
return async_pendings
# Make an expiring object in each policy
for policy in ENABLED_POLICIES:
container_name = "expirer-test-%d" % policy.idx
@ -191,15 +164,12 @@ class TestObjectExpirer(ReplProbeTest):
# Make sure there's no async_pendings anywhere. Probe tests only run
# on single-node installs anyway, so this set should be small enough
# that an exhaustive check doesn't take too long.
all_obj_nodes = {}
for policy in ENABLED_POLICIES:
for dev in policy.object_ring.devs:
all_obj_nodes[dev['device']] = dev
pendings_before = gather_async_pendings(all_obj_nodes.values())
all_obj_nodes = self.get_all_object_nodes()
pendings_before = self.gather_async_pendings(all_obj_nodes)
# expire the objects
Manager(['object-expirer']).once()
pendings_after = gather_async_pendings(all_obj_nodes.values())
pendings_after = self.gather_async_pendings(all_obj_nodes)
self.assertEqual(pendings_after, pendings_before)
def test_expirer_object_should_not_be_expired(self):

2025
test/probe/test_sharder.py Normal file

File diff suppressed because it is too large

View File

@ -751,6 +751,8 @@ class FakeStatus(object):
:param response_sleep: float, time to eventlet sleep during response
"""
# connect exception
if inspect.isclass(status) and issubclass(status, Exception):
raise status('FakeStatus Error')
if isinstance(status, (Exception, eventlet.Timeout)):
raise status
if isinstance(status, tuple):
@ -1063,6 +1065,15 @@ def make_timestamp_iter(offset=0):
for t in itertools.count(int(time.time()) + offset))
@contextmanager
def mock_timestamp_now(now=None):
if now is None:
now = Timestamp.now()
with mocklib.patch('swift.common.utils.Timestamp.now',
classmethod(lambda c: now)):
yield now
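A hedged usage sketch: freezing Timestamp.now() makes timestamp-dependent assertions deterministic, which the sharding CLI tests later in this diff rely on:

with mock_timestamp_now() as frozen:
    # Every Timestamp.now() call inside the block returns the same value.
    assert Timestamp.now() == frozen
    assert Timestamp.now() == frozen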
class Timeout(object):
def __init__(self, seconds):
self.seconds = seconds
@ -1323,3 +1334,55 @@ def skip_if_no_xattrs():
if not xattr_supported_check():
raise SkipTest('Large xattrs not supported in `%s`. Skipping test' %
gettempdir())
def unlink_files(paths):
for path in paths:
try:
os.unlink(path)
except OSError as err:
if err.errno != errno.ENOENT:
raise
class FakeHTTPResponse(object):
def __init__(self, resp):
self.resp = resp
@property
def status(self):
return self.resp.status_int
@property
def data(self):
return self.resp.body
def attach_fake_replication_rpc(rpc, replicate_hook=None, errors=None):
class FakeReplConnection(object):
def __init__(self, node, partition, hash_, logger):
self.logger = logger
self.node = node
self.partition = partition
self.path = '/%s/%s/%s' % (node['device'], partition, hash_)
self.host = node['replication_ip']
def replicate(self, op, *sync_args):
print('REPLICATE: %s, %s, %r' % (self.path, op, sync_args))
resp = None
if errors and op in errors and errors[op]:
resp = errors[op].pop(0)
if not resp:
replicate_args = self.path.lstrip('/').split('/')
args = [op] + copy.deepcopy(list(sync_args))
with mock_check_drive(isdir=not rpc.mount_check,
ismount=rpc.mount_check):
swob_response = rpc.dispatch(replicate_args, args)
resp = FakeHTTPResponse(swob_response)
if replicate_hook:
replicate_hook(op, *sync_args)
return resp
return FakeReplConnection
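A hedged usage sketch of the factory above; the node dict is a placeholder, HTTPServiceUnavailable comes from swift.common.swob, and the ReplicatorRpc construction mirrors other tests in this diff:

from swift.common.swob import HTTPServiceUnavailable

rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker, mount_check=False)
# Serve one canned 503 for the first 'sync' op; ops without a canned
# response fall through to rpc.dispatch() against the local broker.
canned = {'sync': [FakeHTTPResponse(HTTPServiceUnavailable())]}
FakeReplConnection = attach_fake_replication_rpc(rpc, errors=canned)
node = {'device': 'sda1', 'replication_ip': '127.0.0.1'}
conn = FakeReplConnection(node, '0', 'hash', logger=None)
resp = conn.replicate('sync')  # returns the canned 503; no HTTP is made
assert resp.status == 503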

View File

@ -404,7 +404,7 @@ class TestAccountController(unittest.TestCase):
elif state[0] == 'race':
# Save the original db_file attribute value
self._saved_db_file = self.db_file
self.db_file += '.doesnotexist'
self._db_file += '.doesnotexist'
def initialize(self, *args, **kwargs):
if state[0] == 'initial':
@ -413,7 +413,7 @@ class TestAccountController(unittest.TestCase):
elif state[0] == 'race':
# Restore the original db_file attribute to get the race
# behavior
self.db_file = self._saved_db_file
self._db_file = self._saved_db_file
return super(InterceptedAcBr, self).initialize(*args, **kwargs)
with mock.patch("swift.account.server.AccountBroker", InterceptedAcBr):

View File

@ -31,6 +31,7 @@ from swift.cli.info import (print_db_info_metadata, print_ring_locations,
parse_get_node_args)
from swift.account.server import AccountController
from swift.container.server import ContainerController
from swift.container.backend import UNSHARDED, SHARDED
from swift.obj.diskfile import write_metadata
@ -103,17 +104,18 @@ class TestCliInfo(TestCliInfoBase):
self.assertRaisesMessage(ValueError, 'Info is incomplete',
print_db_info_metadata, 'container', {}, {})
info = dict(
account='acct',
created_at=100.1,
put_timestamp=106.3,
delete_timestamp=107.9,
status_changed_at=108.3,
container_count='3',
object_count='20',
bytes_used='42')
info['hash'] = 'abaddeadbeefcafe'
info['id'] = 'abadf100d0ddba11'
info = {
'account': 'acct',
'created_at': 100.1,
'put_timestamp': 106.3,
'delete_timestamp': 107.9,
'status_changed_at': 108.3,
'container_count': '3',
'object_count': '20',
'bytes_used': '42',
'hash': 'abaddeadbeefcafe',
'id': 'abadf100d0ddba11',
}
md = {'x-account-meta-mydata': ('swift', '0000000000.00000'),
'x-other-something': ('boo', '0000000000.00000')}
out = StringIO()
@ -154,7 +156,9 @@ No system metadata found in db file
reported_object_count='20',
reported_bytes_used='42',
x_container_foo='bar',
x_container_bar='goo')
x_container_bar='goo',
db_state=UNSHARDED,
is_root=True)
info['hash'] = 'abaddeadbeefcafe'
info['id'] = 'abadf100d0ddba11'
md = {'x-container-sysmeta-mydata': ('swift', '0000000000.00000')}
@ -182,10 +186,88 @@ Metadata:
X-Container-Bar: goo
X-Container-Foo: bar
System Metadata: {'mydata': 'swift'}
No user metadata found in db file''' % POLICIES[0].name
No user metadata found in db file
Sharding Metadata:
Type: root
State: unsharded''' % POLICIES[0].name
self.assertEqual(sorted(out.getvalue().strip().split('\n')),
sorted(exp_out.split('\n')))
def test_print_db_info_metadata_with_shard_ranges(self):
shard_ranges = [utils.ShardRange(
name='.sharded_a/shard_range_%s' % i,
timestamp=utils.Timestamp(i), lower='%da' % i,
upper='%dz' % i, object_count=i, bytes_used=i,
meta_timestamp=utils.Timestamp(i)) for i in range(1, 4)]
shard_ranges[0].state = utils.ShardRange.CLEAVED
shard_ranges[1].state = utils.ShardRange.CREATED
info = dict(
account='acct',
container='cont',
storage_policy_index=0,
created_at='0000000100.10000',
put_timestamp='0000000106.30000',
delete_timestamp='0000000107.90000',
status_changed_at='0000000108.30000',
object_count='20',
bytes_used='42',
reported_put_timestamp='0000010106.30000',
reported_delete_timestamp='0000010107.90000',
reported_object_count='20',
reported_bytes_used='42',
db_state=SHARDED,
is_root=True,
shard_ranges=shard_ranges)
info['hash'] = 'abaddeadbeefcafe'
info['id'] = 'abadf100d0ddba11'
out = StringIO()
with mock.patch('sys.stdout', out):
print_db_info_metadata('container', info, {})
exp_out = '''Path: /acct/cont
Account: acct
Container: cont
Container Hash: d49d0ecbb53be1fcc49624f2f7c7ccae
Metadata:
Created at: 1970-01-01T00:01:40.100000 (0000000100.10000)
Put Timestamp: 1970-01-01T00:01:46.300000 (0000000106.30000)
Delete Timestamp: 1970-01-01T00:01:47.900000 (0000000107.90000)
Status Timestamp: 1970-01-01T00:01:48.300000 (0000000108.30000)
Object Count: 20
Bytes Used: 42
Storage Policy: %s (0)
Reported Put Timestamp: 1970-01-01T02:48:26.300000 (0000010106.30000)
Reported Delete Timestamp: 1970-01-01T02:48:27.900000 (0000010107.90000)
Reported Object Count: 20
Reported Bytes Used: 42
Chexor: abaddeadbeefcafe
UUID: abadf100d0ddba11
No system metadata found in db file
No user metadata found in db file
Sharding Metadata:
Type: root
State: sharded
Shard Ranges (3):
Name: .sharded_a/shard_range_1
lower: '1a', upper: '1z'
Object Count: 1, Bytes Used: 1, State: cleaved (30)
Created at: 1970-01-01T00:00:01.000000 (0000000001.00000)
Meta Timestamp: 1970-01-01T00:00:01.000000 (0000000001.00000)
Name: .sharded_a/shard_range_2
lower: '2a', upper: '2z'
Object Count: 2, Bytes Used: 2, State: created (20)
Created at: 1970-01-01T00:00:02.000000 (0000000002.00000)
Meta Timestamp: 1970-01-01T00:00:02.000000 (0000000002.00000)
Name: .sharded_a/shard_range_3
lower: '3a', upper: '3z'
Object Count: 3, Bytes Used: 3, State: found (10)
Created at: 1970-01-01T00:00:03.000000 (0000000003.00000)
Meta Timestamp: 1970-01-01T00:00:03.000000 (0000000003.00000)''' %\
POLICIES[0].name
self.assertEqual(sorted(out.getvalue().strip().split('\n')),
sorted(exp_out.strip().split('\n')))
def test_print_ring_locations_invalid_args(self):
self.assertRaises(ValueError, print_ring_locations,
None, 'dir', 'acct')
@ -423,14 +505,8 @@ No user metadata found in db file''' % POLICIES[0].name
'1', 'b47',
'dc5be2aa4347a22a0fee6bc7de505b47',
'dc5be2aa4347a22a0fee6bc7de505b47.db')
try:
print_info('account', db_file, swift_dir=self.testdir)
except Exception:
exp_raised = True
if exp_raised:
self.fail("Unexpected exception raised")
else:
self.assertGreater(len(out.getvalue().strip()), 800)
print_info('account', db_file, swift_dir=self.testdir)
self.assertGreater(len(out.getvalue().strip()), 800)
controller = ContainerController(
{'devices': self.testdir, 'mount_check': 'false'})

View File

@ -0,0 +1,362 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from __future__ import unicode_literals
import json
import os
import unittest
import mock
from shutil import rmtree
from tempfile import mkdtemp
from six.moves import cStringIO as StringIO
from swift.cli.manage_shard_ranges import main
from swift.common import utils
from swift.common.utils import Timestamp, ShardRange
from swift.container.backend import ContainerBroker
from test.unit import mock_timestamp_now
class TestManageShardRanges(unittest.TestCase):
def setUp(self):
self.testdir = os.path.join(mkdtemp(), 'tmp_test_cli_find_shards')
utils.mkdirs(self.testdir)
rmtree(self.testdir)
self.shard_data = [
{'index': 0, 'lower': '', 'upper': 'obj09', 'object_count': 10},
{'index': 1, 'lower': 'obj09', 'upper': 'obj19',
'object_count': 10},
{'index': 2, 'lower': 'obj19', 'upper': 'obj29',
'object_count': 10},
{'index': 3, 'lower': 'obj29', 'upper': 'obj39',
'object_count': 10},
{'index': 4, 'lower': 'obj39', 'upper': 'obj49',
'object_count': 10},
{'index': 5, 'lower': 'obj49', 'upper': 'obj59',
'object_count': 10},
{'index': 6, 'lower': 'obj59', 'upper': 'obj69',
'object_count': 10},
{'index': 7, 'lower': 'obj69', 'upper': 'obj79',
'object_count': 10},
{'index': 8, 'lower': 'obj79', 'upper': 'obj89',
'object_count': 10},
{'index': 9, 'lower': 'obj89', 'upper': '', 'object_count': 10},
]
def tearDown(self):
rmtree(os.path.dirname(self.testdir))
def assert_starts_with(self, value, prefix):
self.assertTrue(value.startswith(prefix),
"%r does not start with %r" % (value, prefix))
def assert_formatted_json(self, output, expected):
try:
loaded = json.loads(output)
except ValueError as err:
self.fail('Invalid JSON: %s\n%r' % (err, output))
# Check this one first, for a prettier diff
self.assertEqual(loaded, expected)
formatted = json.dumps(expected, sort_keys=True, indent=2) + '\n'
self.assertEqual(output, formatted)
def _make_broker(self, account='a', container='c',
device='sda', part=0):
datadir = os.path.join(
self.testdir, device, 'containers', str(part), 'ash', 'hash')
db_file = os.path.join(datadir, 'hash.db')
broker = ContainerBroker(
db_file, account=account, container=container)
broker.initialize()
return broker
def test_find_shard_ranges(self):
db_file = os.path.join(self.testdir, 'hash.db')
broker = ContainerBroker(db_file)
broker.account = 'a'
broker.container = 'c'
broker.initialize()
ts = utils.Timestamp.now()
broker.merge_items([
{'name': 'obj%02d' % i, 'created_at': ts.internal, 'size': 0,
'content_type': 'application/octet-stream', 'etag': 'not-really',
'deleted': 0, 'storage_policy_index': 0,
'ctype_timestamp': ts.internal, 'meta_timestamp': ts.internal}
for i in range(100)])
# Default uses a large enough value that sharding isn't required
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
main([db_file, 'find'])
self.assert_formatted_json(out.getvalue(), [])
err_lines = err.getvalue().split('\n')
self.assert_starts_with(err_lines[0], 'Loaded db broker for ')
self.assert_starts_with(err_lines[1], 'Found 0 ranges in ')
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
main([db_file, 'find', '100'])
self.assert_formatted_json(out.getvalue(), [])
err_lines = err.getvalue().split('\n')
self.assert_starts_with(err_lines[0], 'Loaded db broker for ')
self.assert_starts_with(err_lines[1], 'Found 0 ranges in ')
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
main([db_file, 'find', '99'])
self.assert_formatted_json(out.getvalue(), [
{'index': 0, 'lower': '', 'upper': 'obj98', 'object_count': 99},
{'index': 1, 'lower': 'obj98', 'upper': '', 'object_count': 1},
])
err_lines = err.getvalue().split('\n')
self.assert_starts_with(err_lines[0], 'Loaded db broker for ')
self.assert_starts_with(err_lines[1], 'Found 2 ranges in ')
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
main([db_file, 'find', '10'])
self.assert_formatted_json(out.getvalue(), [
{'index': 0, 'lower': '', 'upper': 'obj09', 'object_count': 10},
{'index': 1, 'lower': 'obj09', 'upper': 'obj19',
'object_count': 10},
{'index': 2, 'lower': 'obj19', 'upper': 'obj29',
'object_count': 10},
{'index': 3, 'lower': 'obj29', 'upper': 'obj39',
'object_count': 10},
{'index': 4, 'lower': 'obj39', 'upper': 'obj49',
'object_count': 10},
{'index': 5, 'lower': 'obj49', 'upper': 'obj59',
'object_count': 10},
{'index': 6, 'lower': 'obj59', 'upper': 'obj69',
'object_count': 10},
{'index': 7, 'lower': 'obj69', 'upper': 'obj79',
'object_count': 10},
{'index': 8, 'lower': 'obj79', 'upper': 'obj89',
'object_count': 10},
{'index': 9, 'lower': 'obj89', 'upper': '', 'object_count': 10},
])
err_lines = err.getvalue().split('\n')
self.assert_starts_with(err_lines[0], 'Loaded db broker for ')
self.assert_starts_with(err_lines[1], 'Found 10 ranges in ')
def test_info(self):
broker = self._make_broker()
broker.update_metadata({'X-Container-Sysmeta-Sharding':
(True, Timestamp.now().internal)})
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
main([broker.db_file, 'info'])
expected = ['Sharding enabled = True',
'Own shard range: None',
'db_state = unsharded',
'Metadata:',
' X-Container-Sysmeta-Sharding = True']
self.assertEqual(expected, out.getvalue().splitlines())
self.assertEqual(['Loaded db broker for a/c.'],
err.getvalue().splitlines())
retiring_db_id = broker.get_info()['id']
broker.merge_shard_ranges(ShardRange('.shards/cc', Timestamp.now()))
epoch = Timestamp.now()
with mock_timestamp_now(epoch) as now:
broker.enable_sharding(epoch)
self.assertTrue(broker.set_sharding_state())
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
with mock_timestamp_now(now):
main([broker.db_file, 'info'])
expected = ['Sharding enabled = True',
'Own shard range: {',
' "bytes_used": 0, ',
' "deleted": 0, ',
' "epoch": "%s", ' % epoch.internal,
' "lower": "", ',
' "meta_timestamp": "%s", ' % now.internal,
' "name": "a/c", ',
' "object_count": 0, ',
' "state": "sharding", ',
' "state_timestamp": "%s", ' % now.internal,
' "timestamp": "%s", ' % now.internal,
' "upper": ""',
'}',
'db_state = sharding',
'Retiring db id: %s' % retiring_db_id,
'Cleaving context: {',
' "cleave_to_row": null, ',
' "cleaving_done": false, ',
' "cursor": "", ',
' "last_cleave_to_row": null, ',
' "max_row": -1, ',
' "misplaced_done": false, ',
' "ranges_done": 0, ',
' "ranges_todo": 0, ',
' "ref": "%s"' % retiring_db_id,
'}',
'Metadata:',
' X-Container-Sysmeta-Sharding = True']
self.assertEqual(expected, out.getvalue().splitlines())
self.assertEqual(['Loaded db broker for a/c.'],
err.getvalue().splitlines())
self.assertTrue(broker.set_sharded_state())
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
with mock_timestamp_now(now):
main([broker.db_file, 'info'])
expected = ['Sharding enabled = True',
'Own shard range: {',
' "bytes_used": 0, ',
' "deleted": 0, ',
' "epoch": "%s", ' % epoch.internal,
' "lower": "", ',
' "meta_timestamp": "%s", ' % now.internal,
' "name": "a/c", ',
' "object_count": 0, ',
' "state": "sharding", ',
' "state_timestamp": "%s", ' % now.internal,
' "timestamp": "%s", ' % now.internal,
' "upper": ""',
'}',
'db_state = sharded',
'Metadata:',
' X-Container-Sysmeta-Sharding = True']
self.assertEqual(expected, out.getvalue().splitlines())
self.assertEqual(['Loaded db broker for a/c.'],
err.getvalue().splitlines())
def test_replace(self):
broker = self._make_broker()
broker.update_metadata({'X-Container-Sysmeta-Sharding':
(True, Timestamp.now().internal)})
input_file = os.path.join(self.testdir, 'shards')
with open(input_file, 'wb') as fd:
json.dump(self.shard_data, fd)
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
main([broker.db_file, 'replace', input_file])
expected = [
'No shard ranges found to delete.',
'Injected 10 shard ranges.',
'Run container-replicator to replicate them to other nodes.',
'Use the enable sub-command to enable sharding.']
self.assertEqual(expected, out.getvalue().splitlines())
self.assertEqual(['Loaded db broker for a/c.'],
err.getvalue().splitlines())
self.assertEqual(
[(data['lower'], data['upper']) for data in self.shard_data],
[(sr.lower_str, sr.upper_str) for sr in broker.get_shard_ranges()])
def _assert_enabled(self, broker, epoch):
own_sr = broker.get_own_shard_range()
self.assertEqual(ShardRange.SHARDING, own_sr.state)
self.assertEqual(epoch, own_sr.epoch)
self.assertEqual(ShardRange.MIN, own_sr.lower)
self.assertEqual(ShardRange.MAX, own_sr.upper)
self.assertEqual(
'True', broker.metadata['X-Container-Sysmeta-Sharding'][0])
def test_enable(self):
broker = self._make_broker()
broker.update_metadata({'X-Container-Sysmeta-Sharding':
(True, Timestamp.now().internal)})
# no shard ranges
out = StringIO()
err = StringIO()
with self.assertRaises(SystemExit):
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
main([broker.db_file, 'enable'])
expected = ["WARNING: invalid shard ranges: ['No shard ranges.'].",
'Aborting.']
self.assertEqual(expected, out.getvalue().splitlines())
self.assertEqual(['Loaded db broker for a/c.'],
err.getvalue().splitlines())
# success
shard_ranges = []
for data in self.shard_data:
path = ShardRange.make_path(
'.shards_a', 'c', 'c', Timestamp.now(), data['index'])
shard_ranges.append(
ShardRange(path, Timestamp.now(), data['lower'],
data['upper'], data['object_count']))
broker.merge_shard_ranges(shard_ranges)
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
with mock_timestamp_now() as now:
main([broker.db_file, 'enable'])
expected = [
"Container moved to state 'sharding' with epoch %s." %
now.internal,
'Run container-sharder on all nodes to shard the container.']
self.assertEqual(expected, out.getvalue().splitlines())
self.assertEqual(['Loaded db broker for a/c.'],
err.getvalue().splitlines())
self._assert_enabled(broker, now)
# already enabled
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
main([broker.db_file, 'enable'])
expected = [
"Container already in state 'sharding' with epoch %s." %
now.internal,
'No action required.',
'Run container-sharder on all nodes to shard the container.']
self.assertEqual(expected, out.getvalue().splitlines())
self.assertEqual(['Loaded db broker for a/c.'],
err.getvalue().splitlines())
self._assert_enabled(broker, now)
def test_find_replace_enable(self):
db_file = os.path.join(self.testdir, 'hash.db')
broker = ContainerBroker(db_file)
broker.account = 'a'
broker.container = 'c'
broker.initialize()
ts = utils.Timestamp.now()
broker.merge_items([
{'name': 'obj%02d' % i, 'created_at': ts.internal, 'size': 0,
'content_type': 'application/octet-stream', 'etag': 'not-really',
'deleted': 0, 'storage_policy_index': 0,
'ctype_timestamp': ts.internal, 'meta_timestamp': ts.internal}
for i in range(100)])
out = StringIO()
err = StringIO()
with mock.patch('sys.stdout', out), mock.patch('sys.stderr', err):
with mock_timestamp_now() as now:
main([broker.db_file, 'find_and_replace', '10', '--enable'])
expected = [
'No shard ranges found to delete.',
'Injected 10 shard ranges.',
'Run container-replicator to replicate them to other nodes.',
"Container moved to state 'sharding' with epoch %s." %
now.internal,
'Run container-sharder on all nodes to shard the container.']
self.assertEqual(expected, out.getvalue().splitlines())
self.assertEqual(['Loaded db broker for a/c.'],
err.getvalue().splitlines())
self._assert_enabled(broker, now)
self.assertEqual(
[(data['lower'], data['upper']) for data in self.shard_data],
[(sr.lower_str, sr.upper_str) for sr in broker.get_shard_ranges()])
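For reference, a hedged summary of the sub-command flow these tests exercise; db_file and input_file are placeholders for a container DB path and a JSON shard-range file:

main([db_file, 'find', '10'])                # print candidate shard ranges
main([db_file, 'replace', input_file])       # inject ranges from a file
main([db_file, 'enable'])                    # move container to 'sharding'
main([db_file, 'find_and_replace', '10', '--enable'])  # all of the above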

View File

@ -38,7 +38,7 @@ from swift.common.constraints import \
MAX_META_VALUE_LENGTH, MAX_META_COUNT, MAX_META_OVERALL_SIZE
from swift.common.db import chexor, dict_factory, get_db_connection, \
DatabaseBroker, DatabaseConnectionError, DatabaseAlreadyExists, \
GreenDBConnection, PICKLE_PROTOCOL
GreenDBConnection, PICKLE_PROTOCOL, zero_like
from swift.common.utils import normalize_timestamp, mkdirs, Timestamp
from swift.common.exceptions import LockTimeout
from swift.common.swob import HTTPException
@ -46,6 +46,30 @@ from swift.common.swob import HTTPException
from test.unit import with_tempdir
class TestHelperFunctions(unittest.TestCase):
def test_zero_like(self):
expectations = {
# value => expected
None: True,
True: False,
'': True,
'asdf': False,
0: True,
1: False,
'0': True,
'1': False,
}
errors = []
for value, expected in expectations.items():
rv = zero_like(value)
if rv != expected:
errors.append('zero_like(%r) => %r expected %r' % (
value, rv, expected))
if errors:
self.fail('Some unexpected return values:\n' + '\n'.join(errors))
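A minimal sketch consistent with the expectations above, assuming the set-membership approach; the real implementation lives in swift.common.db and may differ in detail:

ZERO_LIKE_VALUES = {None, '', 0, '0'}

def zero_like(count):
    # True compares equal to 1, not to 0, so it is correctly non-zero-like.
    return count in ZERO_LIKE_VALUES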
class TestDatabaseConnectionError(unittest.TestCase):
def test_str(self):
@ -989,6 +1013,19 @@ class TestDatabaseBroker(unittest.TestCase):
self.assertEqual(broker.get_sync(uuid3), 2)
broker.merge_syncs([{'sync_point': 5, 'remote_id': uuid2}])
self.assertEqual(broker.get_sync(uuid2), 5)
# max sync point sticks
broker.merge_syncs([{'sync_point': 5, 'remote_id': uuid2}])
self.assertEqual(broker.get_sync(uuid2), 5)
self.assertEqual(broker.get_sync(uuid3), 2)
broker.merge_syncs([{'sync_point': 4, 'remote_id': uuid2}])
self.assertEqual(broker.get_sync(uuid2), 5)
self.assertEqual(broker.get_sync(uuid3), 2)
broker.merge_syncs([{'sync_point': -1, 'remote_id': uuid2},
{'sync_point': 3, 'remote_id': uuid3}])
self.assertEqual(broker.get_sync(uuid2), 5)
self.assertEqual(broker.get_sync(uuid3), 3)
self.assertEqual(broker.get_sync(uuid2, incoming=False), 3)
self.assertEqual(broker.get_sync(uuid3, incoming=False), 4)
def test_get_replication_info(self):
self.get_replication_info_tester(metadata=False)
@ -1089,11 +1126,9 @@ class TestDatabaseBroker(unittest.TestCase):
'max_row': 1, 'id': broker_uuid, 'metadata': broker_metadata})
return broker
def test_metadata(self):
def reclaim(broker, timestamp):
with broker.get() as conn:
broker._reclaim(conn, timestamp)
conn.commit()
# only testing _reclaim_metadata here
@patch.object(DatabaseBroker, '_reclaim')
def test_metadata(self, mock_reclaim):
# Initializes a good broker for us
broker = self.get_replication_info_tester(metadata=True)
# Add our first item
@ -1134,7 +1169,7 @@ class TestDatabaseBroker(unittest.TestCase):
self.assertEqual(broker.metadata['Second'],
[second_value, second_timestamp])
# Reclaim at point before second item was deleted
reclaim(broker, normalize_timestamp(3))
broker.reclaim(normalize_timestamp(3), normalize_timestamp(3))
self.assertIn('First', broker.metadata)
self.assertEqual(broker.metadata['First'],
[first_value, first_timestamp])
@ -1142,7 +1177,7 @@ class TestDatabaseBroker(unittest.TestCase):
self.assertEqual(broker.metadata['Second'],
[second_value, second_timestamp])
# Reclaim at point second item was deleted
reclaim(broker, normalize_timestamp(4))
broker.reclaim(normalize_timestamp(4), normalize_timestamp(4))
self.assertIn('First', broker.metadata)
self.assertEqual(broker.metadata['First'],
[first_value, first_timestamp])
@ -1150,11 +1185,18 @@ class TestDatabaseBroker(unittest.TestCase):
self.assertEqual(broker.metadata['Second'],
[second_value, second_timestamp])
# Reclaim after point second item was deleted
reclaim(broker, normalize_timestamp(5))
broker.reclaim(normalize_timestamp(5), normalize_timestamp(5))
self.assertIn('First', broker.metadata)
self.assertEqual(broker.metadata['First'],
[first_value, first_timestamp])
self.assertNotIn('Second', broker.metadata)
# Delete first item (by setting to empty string)
first_timestamp = normalize_timestamp(6)
broker.update_metadata({'First': ['', first_timestamp]})
self.assertIn('First', broker.metadata)
# Check that sync_timestamp doesn't cause item to be reclaimed
broker.reclaim(normalize_timestamp(5), normalize_timestamp(99))
self.assertIn('First', broker.metadata)
def test_update_metadata_missing_container_info(self):
# Test missing container_info/container_stat row
@ -1197,7 +1239,7 @@ class TestDatabaseBroker(unittest.TestCase):
exc = None
try:
with broker.get() as conn:
broker._reclaim(conn, 0)
broker._reclaim_metadata(conn, 0)
except Exception as err:
exc = err
self.assertEqual(
@ -1333,5 +1375,141 @@ class TestDatabaseBroker(unittest.TestCase):
else:
self.fail('Expected an exception to be raised')
def test_skip_commits(self):
broker = DatabaseBroker(':memory:')
self.assertTrue(broker._skip_commit_puts())
broker._initialize = MagicMock()
broker.initialize(Timestamp.now())
self.assertTrue(broker._skip_commit_puts())
# not initialized
db_file = os.path.join(self.testdir, '1.db')
broker = DatabaseBroker(db_file)
self.assertFalse(os.path.exists(broker.db_file)) # sanity check
self.assertTrue(broker._skip_commit_puts())
# no pending file
broker._initialize = MagicMock()
broker.initialize(Timestamp.now())
self.assertTrue(os.path.exists(broker.db_file)) # sanity check
self.assertFalse(os.path.exists(broker.pending_file)) # sanity check
self.assertTrue(broker._skip_commit_puts())
# pending file exists
with open(broker.pending_file, 'wb'):
pass
self.assertTrue(os.path.exists(broker.pending_file)) # sanity check
self.assertFalse(broker._skip_commit_puts())
# skip_commits is True
broker.skip_commits = True
self.assertTrue(broker._skip_commit_puts())
# re-init
broker = DatabaseBroker(db_file)
self.assertFalse(broker._skip_commit_puts())
# constructor can override
broker = DatabaseBroker(db_file, skip_commits=True)
self.assertTrue(broker._skip_commit_puts())
def test_commit_puts(self):
db_file = os.path.join(self.testdir, '1.db')
broker = DatabaseBroker(db_file)
broker._initialize = MagicMock()
broker.initialize(Timestamp.now())
with open(broker.pending_file, 'wb'):
pass
# merge given list
with patch.object(broker, 'merge_items') as mock_merge_items:
broker._commit_puts(['test'])
mock_merge_items.assert_called_once_with(['test'])
# load file and merge
with open(broker.pending_file, 'wb') as fd:
fd.write(':1:2:99')
with patch.object(broker, 'merge_items') as mock_merge_items:
broker._commit_puts_load = lambda l, e: l.append(e)
broker._commit_puts()
mock_merge_items.assert_called_once_with(['1', '2', '99'])
self.assertEqual(0, os.path.getsize(broker.pending_file))
# load file and merge with given list
with open(broker.pending_file, 'wb') as fd:
fd.write(':bad')
with patch.object(broker, 'merge_items') as mock_merge_items:
broker._commit_puts_load = lambda l, e: l.append(e)
broker._commit_puts(['not'])
mock_merge_items.assert_called_once_with(['not', 'bad'])
self.assertEqual(0, os.path.getsize(broker.pending_file))
# skip_commits True - no merge
db_file = os.path.join(self.testdir, '2.db')
broker = DatabaseBroker(db_file, skip_commits=True)
broker._initialize = MagicMock()
broker.initialize(Timestamp.now())
with open(broker.pending_file, 'wb') as fd:
fd.write(':ignored')
with patch.object(broker, 'merge_items') as mock_merge_items:
with self.assertRaises(DatabaseConnectionError) as cm:
broker._commit_puts(['hmmm'])
mock_merge_items.assert_not_called()
self.assertIn('commits not accepted', str(cm.exception))
with open(broker.pending_file, 'rb') as fd:
self.assertEqual(':ignored', fd.read())
def test_put_record(self):
db_file = os.path.join(self.testdir, '1.db')
broker = DatabaseBroker(db_file)
broker._initialize = MagicMock()
broker.initialize(Timestamp.now())
# pending file created and record written
broker.make_tuple_for_pickle = lambda x: x.upper()
with patch.object(broker, '_commit_puts') as mock_commit_puts:
broker.put_record('pinky')
mock_commit_puts.assert_not_called()
with open(broker.pending_file, 'rb') as fd:
pending = fd.read()
items = pending.split(':')
self.assertEqual(['PINKY'],
[pickle.loads(i.decode('base64')) for i in items[1:]])
# record appended
with patch.object(broker, '_commit_puts') as mock_commit_puts:
broker.put_record('perky')
mock_commit_puts.assert_not_called()
with open(broker.pending_file, 'rb') as fd:
pending = fd.read()
items = pending.split(':')
self.assertEqual(['PINKY', 'PERKY'],
[pickle.loads(i.decode('base64')) for i in items[1:]])
# pending file above cap
cap = swift.common.db.PENDING_CAP
while os.path.getsize(broker.pending_file) < cap:
with open(broker.pending_file, 'ab') as fd:
fd.write('x' * 100000)
with patch.object(broker, '_commit_puts') as mock_commit_puts:
broker.put_record('direct')
mock_commit_puts.assert_called_once_with(['direct'])
# records shouldn't be put to brokers with skip_commits True because
# they cannot be accepted if the pending file is full
broker.skip_commits = True
with open(broker.pending_file, 'wb'):
# empty the pending file
pass
with patch.object(broker, '_commit_puts') as mock_commit_puts:
with self.assertRaises(DatabaseConnectionError) as cm:
broker.put_record('unwelcome')
self.assertIn('commits not accepted', str(cm.exception))
mock_commit_puts.assert_not_called()
with open(broker.pending_file, 'rb') as fd:
pending = fd.read()
self.assertFalse(pending)
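A hedged sketch of the pending-file record encoding implied by test_put_record above (Python 2 codecs, matching the test's decode step); the function name is illustrative:

import pickle

PICKLE_PROTOCOL = 2  # assumed to match swift.common.db's constant

def encode_pending_record(pickleable):
    # Records are ':'-separated; readers split on ':' and apply
    # pickle.loads(item.decode('base64')), mirroring the test above.
    return ':' + pickle.dumps(pickleable, PICKLE_PROTOCOL).encode('base64')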
if __name__ == '__main__':
unittest.main()

View File

@ -16,6 +16,8 @@
from __future__ import print_function
import unittest
from contextlib import contextmanager
import eventlet
import os
import logging
import errno
@ -37,6 +39,7 @@ from swift.common.exceptions import DriveNotMounted
from swift.common.swob import HTTPException
from test import unit
from test.unit import FakeLogger, attach_fake_replication_rpc
from test.unit.common.test_db import ExampleBroker
@ -160,6 +163,11 @@ class ReplHttp(object):
self.set_status = set_status
replicated = False
host = 'localhost'
node = {
'ip': '127.0.0.1',
'port': '6000',
'device': 'sdb',
}
def replicate(self, *args):
self.replicated = True
@ -230,11 +238,27 @@ class FakeBroker(object):
'put_timestamp': 1,
'created_at': 1,
'count': 0,
'max_row': 99,
'id': 'ID',
'metadata': {}
})
if self.stub_replication_info:
info.update(self.stub_replication_info)
return info
def get_max_row(self, table=None):
return self.get_replication_info()['max_row']
def is_reclaimable(self, now, reclaim_age):
info = self.get_replication_info()
return info['count'] == 0 and (
(now - reclaim_age) >
info['delete_timestamp'] >
info['put_timestamp'])
def get_other_replication_items(self):
return None
def reclaim(self, item_timestamp, sync_timestamp):
pass
@ -249,6 +273,9 @@ class FakeBroker(object):
self.put_timestamp = put_timestamp
self.delete_timestamp = delete_timestamp
def get_brokers(self):
return [self]
class FakeAccountBroker(FakeBroker):
db_type = 'account'
@ -273,6 +300,7 @@ class TestDBReplicator(unittest.TestCase):
self.recon_cache = mkdtemp()
rmtree(self.recon_cache, ignore_errors=1)
os.mkdir(self.recon_cache)
self.logger = unit.debug_logger('test-replicator')
def tearDown(self):
for patcher in self._patchers:
@ -287,6 +315,7 @@ class TestDBReplicator(unittest.TestCase):
def stub_delete_db(self, broker):
self.delete_db_calls.append('/path/to/file')
return True
def test_creation(self):
# later config should be extended to assert more config options
@ -647,11 +676,107 @@ class TestDBReplicator(unittest.TestCase):
})
def test_replicate_object(self):
# verify return values from replicate_object
db_replicator.ring = FakeRingWithNodes()
replicator = TestReplicator({})
replicator.delete_db = self.stub_delete_db
replicator._replicate_object('0', '/path/to/file', 'node_id')
self.assertEqual([], self.delete_db_calls)
db_path = '/path/to/file'
replicator = TestReplicator({}, logger=FakeLogger())
info = FakeBroker().get_replication_info()
# make remote appear to be in sync
rinfo = {'point': info['max_row'], 'id': 'remote_id'}
class FakeResponse(object):
def __init__(self, status, rinfo):
self._status = status
self.data = json.dumps(rinfo)
@property
def status(self):
if isinstance(self._status, (Exception, eventlet.Timeout)):
raise self._status
return self._status
# all requests fail
replicate = 'swift.common.db_replicator.ReplConnection.replicate'
with mock.patch(replicate) as fake_replicate:
fake_replicate.side_effect = [
FakeResponse(500, None),
FakeResponse(500, None),
FakeResponse(500, None)]
with mock.patch.object(replicator, 'delete_db') as mock_delete:
res = replicator._replicate_object('0', db_path, 'node_id')
self.assertRaises(StopIteration, next, fake_replicate.side_effect)
self.assertEqual((False, [False, False, False]), res)
self.assertEqual(0, mock_delete.call_count)
self.assertFalse(replicator.logger.get_lines_for_level('error'))
self.assertFalse(replicator.logger.get_lines_for_level('warning'))
replicator.logger.clear()
with mock.patch(replicate) as fake_replicate:
fake_replicate.side_effect = [
FakeResponse(Exception('ugh'), None),
FakeResponse(eventlet.Timeout(), None),
FakeResponse(200, rinfo)]
with mock.patch.object(replicator, 'delete_db') as mock_delete:
res = replicator._replicate_object('0', db_path, 'node_id')
self.assertRaises(StopIteration, next, fake_replicate.side_effect)
self.assertEqual((False, [False, False, True]), res)
self.assertEqual(0, mock_delete.call_count)
lines = replicator.logger.get_lines_for_level('error')
self.assertIn('ERROR syncing', lines[0])
self.assertIn('ERROR syncing', lines[1])
self.assertFalse(lines[2:])
self.assertFalse(replicator.logger.get_lines_for_level('warning'))
replicator.logger.clear()
# partial success
with mock.patch(replicate) as fake_replicate:
fake_replicate.side_effect = [
FakeResponse(200, rinfo),
FakeResponse(200, rinfo),
FakeResponse(500, None)]
with mock.patch.object(replicator, 'delete_db') as mock_delete:
res = replicator._replicate_object('0', db_path, 'node_id')
self.assertRaises(StopIteration, next, fake_replicate.side_effect)
self.assertEqual((False, [True, True, False]), res)
self.assertEqual(0, mock_delete.call_count)
self.assertFalse(replicator.logger.get_lines_for_level('error'))
self.assertFalse(replicator.logger.get_lines_for_level('warning'))
replicator.logger.clear()
# 507 triggers additional requests
with mock.patch(replicate) as fake_replicate:
fake_replicate.side_effect = [
FakeResponse(200, rinfo),
FakeResponse(200, rinfo),
FakeResponse(507, None),
FakeResponse(507, None),
FakeResponse(200, rinfo)]
with mock.patch.object(replicator, 'delete_db') as mock_delete:
res = replicator._replicate_object('0', db_path, 'node_id')
self.assertRaises(StopIteration, next, fake_replicate.side_effect)
self.assertEqual((False, [True, True, False, False, True]), res)
self.assertEqual(0, mock_delete.call_count)
lines = replicator.logger.get_lines_for_level('error')
self.assertIn('Remote drive not mounted', lines[0])
self.assertIn('Remote drive not mounted', lines[1])
self.assertFalse(lines[2:])
self.assertFalse(replicator.logger.get_lines_for_level('warning'))
replicator.logger.clear()
# all requests succeed; node id == 'node_id' causes node to be
# considered a handoff so expect the db to be deleted
with mock.patch(replicate) as fake_replicate:
fake_replicate.side_effect = [
FakeResponse(200, rinfo),
FakeResponse(200, rinfo),
FakeResponse(200, rinfo)]
with mock.patch.object(replicator, 'delete_db') as mock_delete:
res = replicator._replicate_object('0', db_path, 'node_id')
self.assertRaises(StopIteration, next, fake_replicate.side_effect)
self.assertEqual((True, [True, True, True]), res)
self.assertEqual(1, mock_delete.call_count)
self.assertFalse(replicator.logger.get_lines_for_level('error'))
self.assertFalse(replicator.logger.get_lines_for_level('warning'))
def test_replicate_object_quarantine(self):
replicator = TestReplicator({})
@ -695,8 +820,122 @@ class TestDBReplicator(unittest.TestCase):
replicator.brokerclass = FakeAccountBroker
replicator._repl_to_node = lambda *args: True
replicator.delete_db = self.stub_delete_db
replicator._replicate_object('0', '/path/to/file', 'node_id')
orig_cleanup = replicator.cleanup_post_replicate
with mock.patch.object(replicator, 'cleanup_post_replicate',
side_effect=orig_cleanup) as mock_cleanup:
replicator._replicate_object('0', '/path/to/file', 'node_id')
mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY, [True] * 3)
self.assertIsInstance(mock_cleanup.call_args[0][0],
replicator.brokerclass)
self.assertEqual(['/path/to/file'], self.delete_db_calls)
self.assertEqual(0, replicator.stats['failure'])
def test_replicate_object_delete_delegated_to_cleanup_post_replicate(self):
replicator = TestReplicator({})
replicator.ring = FakeRingWithNodes().Ring('path')
replicator.brokerclass = FakeAccountBroker
replicator._repl_to_node = lambda *args: True
replicator.delete_db = self.stub_delete_db
# cleanup succeeds
with mock.patch.object(replicator, 'cleanup_post_replicate',
return_value=True) as mock_cleanup:
replicator._replicate_object('0', '/path/to/file', 'node_id')
mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY, [True] * 3)
self.assertIsInstance(mock_cleanup.call_args[0][0],
replicator.brokerclass)
self.assertFalse(self.delete_db_calls)
self.assertEqual(0, replicator.stats['failure'])
self.assertEqual(3, replicator.stats['success'])
# cleanup fails
replicator._zero_stats()
with mock.patch.object(replicator, 'cleanup_post_replicate',
return_value=False) as mock_cleanup:
replicator._replicate_object('0', '/path/to/file', 'node_id')
mock_cleanup.assert_called_once_with(mock.ANY, mock.ANY, [True] * 3)
self.assertIsInstance(mock_cleanup.call_args[0][0],
replicator.brokerclass)
self.assertFalse(self.delete_db_calls)
self.assertEqual(3, replicator.stats['failure'])
self.assertEqual(0, replicator.stats['success'])
# shouldbehere True - cleanup not required
replicator._zero_stats()
primary_node_id = replicator.ring.get_part_nodes('0')[0]['id']
with mock.patch.object(replicator, 'cleanup_post_replicate',
return_value=True) as mock_cleanup:
replicator._replicate_object('0', '/path/to/file', primary_node_id)
mock_cleanup.assert_not_called()
self.assertFalse(self.delete_db_calls)
self.assertEqual(0, replicator.stats['failure'])
self.assertEqual(2, replicator.stats['success'])
def test_cleanup_post_replicate(self):
replicator = TestReplicator({}, logger=self.logger)
replicator.ring = FakeRingWithNodes().Ring('path')
broker = FakeBroker()
replicator._repl_to_node = lambda *args: True
info = broker.get_replication_info()
with mock.patch.object(replicator, 'delete_db') as mock_delete_db:
res = replicator.cleanup_post_replicate(
broker, info, [False] * 3)
mock_delete_db.assert_not_called()
self.assertTrue(res)
self.assertEqual(['Not deleting db %s (0/3 success)' % broker.db_file],
replicator.logger.get_lines_for_level('debug'))
replicator.logger.clear()
with mock.patch.object(replicator, 'delete_db') as mock_delete_db:
res = replicator.cleanup_post_replicate(
broker, info, [True, False, True])
mock_delete_db.assert_not_called()
self.assertTrue(res)
self.assertEqual(['Not deleting db %s (2/3 success)' % broker.db_file],
replicator.logger.get_lines_for_level('debug'))
replicator.logger.clear()
broker.stub_replication_info = {'max_row': 101}
with mock.patch.object(replicator, 'delete_db') as mock_delete_db:
res = replicator.cleanup_post_replicate(
broker, info, [True] * 3)
mock_delete_db.assert_not_called()
self.assertTrue(res)
self.assertEqual(['Not deleting db %s (2 new rows)' % broker.db_file],
replicator.logger.get_lines_for_level('debug'))
replicator.logger.clear()
broker.stub_replication_info = {'max_row': 98}
with mock.patch.object(replicator, 'delete_db') as mock_delete_db:
res = replicator.cleanup_post_replicate(
broker, info, [True] * 3)
mock_delete_db.assert_not_called()
self.assertTrue(res)
broker.stub_replication_info = None
self.assertEqual(['Not deleting db %s (negative max_row_delta: -1)' %
broker.db_file],
replicator.logger.get_lines_for_level('error'))
replicator.logger.clear()
with mock.patch.object(replicator, 'delete_db') as mock_delete_db:
res = replicator.cleanup_post_replicate(
broker, info, [True] * 3)
mock_delete_db.assert_called_once_with(broker)
self.assertTrue(res)
self.assertEqual(['Successfully deleted db %s' % broker.db_file],
replicator.logger.get_lines_for_level('debug'))
replicator.logger.clear()
with mock.patch.object(replicator, 'delete_db',
return_value=False) as mock_delete_db:
res = replicator.cleanup_post_replicate(
broker, info, [True] * 3)
mock_delete_db.assert_called_once_with(broker)
self.assertFalse(res)
self.assertEqual(['Failed to delete db %s' % broker.db_file],
replicator.logger.get_lines_for_level('debug'))
replicator.logger.clear()
def test_replicate_object_with_exception(self):
replicator = TestReplicator({})
@ -949,6 +1188,8 @@ class TestDBReplicator(unittest.TestCase):
response = rpc.dispatch(('drive', 'part', 'hash'),
['rsync_then_merge', 'arg1', 'arg2'])
expected_calls = [call('/part/ash/hash/hash.db'),
call('/drive/tmp/arg1'),
call(FakeBroker.db_file),
call('/drive/tmp/arg1')]
self.assertEqual(mock_os.path.exists.call_args_list,
expected_calls)
@ -966,7 +1207,7 @@ class TestDBReplicator(unittest.TestCase):
unit.mock_check_drive(isdir=True):
mock_os.path.exists.side_effect = [False, True]
response = rpc.dispatch(('drive', 'part', 'hash'),
['complete_rsync', 'arg1', 'arg2'])
['complete_rsync', 'arg1'])
expected_calls = [call('/part/ash/hash/hash.db'),
call('/drive/tmp/arg1')]
self.assertEqual(mock_os.path.exists.call_args_list,
@ -974,6 +1215,19 @@ class TestDBReplicator(unittest.TestCase):
self.assertEqual('204 No Content', response.status)
self.assertEqual(204, response.status_int)
with patch('swift.common.db_replicator.os',
new=mock.MagicMock(wraps=os)) as mock_os, \
unit.mock_check_drive(isdir=True):
mock_os.path.exists.side_effect = [False, True]
response = rpc.dispatch(('drive', 'part', 'hash'),
['complete_rsync', 'arg1', 'arg2'])
expected_calls = [call('/part/ash/hash/arg2'),
call('/drive/tmp/arg1')]
self.assertEqual(mock_os.path.exists.call_args_list,
expected_calls)
self.assertEqual('204 No Content', response.status)
self.assertEqual(204, response.status_int)
def test_rsync_then_merge_db_does_not_exist(self):
rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker,
mount_check=False)
@ -1010,7 +1264,8 @@ class TestDBReplicator(unittest.TestCase):
def mock_renamer(old, new):
self.assertEqual('/drive/tmp/arg1', old)
self.assertEqual('/data/db.db', new)
# FakeBroker uses module filename as db_file!
self.assertEqual(__file__, new)
self._patch(patch.object, db_replicator, 'renamer', mock_renamer)
@ -1023,17 +1278,26 @@ class TestDBReplicator(unittest.TestCase):
self.assertEqual('204 No Content', response.status)
self.assertEqual(204, response.status_int)
def test_complete_rsync_db_does_not_exist(self):
def test_complete_rsync_db_exists(self):
rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker,
mount_check=False)
with patch('swift.common.db_replicator.os',
new=mock.MagicMock(wraps=os)) as mock_os, \
unit.mock_check_drive(isdir=True):
mock_os.path.exists.return_value = True
response = rpc.complete_rsync('drive', '/data/db.db', ['arg1'])
mock_os.path.exists.assert_called_with('/data/db.db')
self.assertEqual('404 Not Found', response.status)
self.assertEqual(404, response.status_int)
with patch('swift.common.db_replicator.os',
new=mock.MagicMock(wraps=os)) as mock_os, \
unit.mock_check_drive(isdir=True):
mock_os.path.exists.return_value = True
response = rpc.complete_rsync('drive', '/data/db.db',
['arg1', 'arg2'])
mock_os.path.exists.assert_called_with('/data/db.db')
mock_os.path.exists.assert_called_with('/data/arg2')
self.assertEqual('404 Not Found', response.status)
self.assertEqual(404, response.status_int)
@ -1046,37 +1310,57 @@ class TestDBReplicator(unittest.TestCase):
unit.mock_check_drive(isdir=True):
mock_os.path.exists.return_value = False
response = rpc.complete_rsync('drive', '/data/db.db',
['arg1', 'arg2'])
['arg1'])
expected_calls = [call('/data/db.db'), call('/drive/tmp/arg1')]
self.assertEqual(expected_calls,
mock_os.path.exists.call_args_list)
self.assertEqual('404 Not Found', response.status)
self.assertEqual(404, response.status_int)
with patch('swift.common.db_replicator.os',
new=mock.MagicMock(wraps=os)) as mock_os, \
unit.mock_check_drive(isdir=True):
mock_os.path.exists.return_value = False
response = rpc.complete_rsync('drive', '/data/db.db',
['arg1', 'arg2'])
expected_calls = [call('/data/arg2'), call('/drive/tmp/arg1')]
self.assertEqual(expected_calls,
mock_os.path.exists.call_args_list)
self.assertEqual('404 Not Found', response.status)
self.assertEqual(404, response.status_int)
def test_complete_rsync_rename(self):
rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker,
mount_check=False)
def mock_exists(path):
if path == '/data/db.db':
return False
self.assertEqual('/drive/tmp/arg1', path)
return True
def mock_renamer(old, new):
self.assertEqual('/drive/tmp/arg1', old)
self.assertEqual('/data/db.db', new)
renamer_calls.append((old, new))
self._patch(patch.object, db_replicator, 'renamer', mock_renamer)
renamer_calls = []
with patch('swift.common.db_replicator.os',
new=mock.MagicMock(wraps=os)) as mock_os, \
unit.mock_check_drive(isdir=True):
mock_os.path.exists.side_effect = [False, True]
response = rpc.complete_rsync('drive', '/data/db.db',
['arg1'])
self.assertEqual('204 No Content', response.status)
self.assertEqual(204, response.status_int)
self.assertEqual(('/drive/tmp/arg1', '/data/db.db'), renamer_calls[0])
self.assertFalse(renamer_calls[1:])
renamer_calls = []
with patch('swift.common.db_replicator.os',
new=mock.MagicMock(wraps=os)) as mock_os, \
unit.mock_check_drive(isdir=True):
mock_os.path.exists.side_effect = [False, True]
response = rpc.complete_rsync('drive', '/data/db.db',
['arg1', 'arg2'])
self.assertEqual('204 No Content', response.status)
self.assertEqual(204, response.status_int)
self.assertEqual('204 No Content', response.status)
self.assertEqual(204, response.status_int)
self.assertEqual(('/drive/tmp/arg1', '/data/arg2'), renamer_calls[0])
self.assertFalse(renamer_calls[1:])
def test_replicator_sync_with_broker_replication_missing_table(self):
rpc = db_replicator.ReplicatorRpc('/', '/', FakeBroker,
@ -1435,10 +1719,10 @@ class TestDBReplicator(unittest.TestCase):
db_file = __file__
replicator = TestReplicator({})
replicator._http_connect(node, partition, db_file)
expected_hsh = os.path.basename(db_file).split('.', 1)[0]
expected_hsh = expected_hsh.split('_', 1)[0]
db_replicator.ReplConnection.assert_has_calls([
mock.call(node, partition,
os.path.basename(db_file).split('.', 1)[0],
replicator.logger)])
mock.call(node, partition, expected_hsh, replicator.logger)])
class TestHandoffsOnly(unittest.TestCase):
@ -1740,7 +2024,7 @@ class TestReplToNode(unittest.TestCase):
def test_repl_to_node_300_status(self):
self.http = ReplHttp('{"id": 3, "point": -1}', set_status=300)
self.assertIsNone(self.replicator._repl_to_node(
self.assertFalse(self.replicator._repl_to_node(
self.fake_node, FakeBroker(), '0', self.fake_info))
def test_repl_to_node_not_response(self):
@ -1769,45 +2053,6 @@ class TestReplToNode(unittest.TestCase):
])
class FakeHTTPResponse(object):
def __init__(self, resp):
self.resp = resp
@property
def status(self):
return self.resp.status_int
@property
def data(self):
return self.resp.body
def attach_fake_replication_rpc(rpc, replicate_hook=None):
class FakeReplConnection(object):
def __init__(self, node, partition, hash_, logger):
self.logger = logger
self.node = node
self.partition = partition
self.path = '/%s/%s/%s' % (node['device'], partition, hash_)
self.host = node['replication_ip']
def replicate(self, op, *sync_args):
print('REPLICATE: %s, %s, %r' % (self.path, op, sync_args))
replicate_args = self.path.lstrip('/').split('/')
args = [op] + list(sync_args)
with unit.mock_check_drive(isdir=not rpc.mount_check,
ismount=rpc.mount_check):
swob_response = rpc.dispatch(replicate_args, args)
resp = FakeHTTPResponse(swob_response)
if replicate_hook:
replicate_hook(op, *sync_args)
return resp
return FakeReplConnection
class ExampleReplicator(db_replicator.Replicator):
server_type = 'fake'
brokerclass = ExampleBroker
@ -1872,15 +2117,19 @@ class TestReplicatorSync(unittest.TestCase):
conf.update(conf_updates)
return self.replicator_daemon(conf, logger=self.logger)
def _run_once(self, node, conf_updates=None, daemon=None):
daemon = daemon or self._get_daemon(node, conf_updates)
def _install_fake_rsync_file(self, daemon, captured_calls=None):
def _rsync_file(db_file, remote_file, **kwargs):
if captured_calls is not None:
captured_calls.append((db_file, remote_file, kwargs))
remote_server, remote_path = remote_file.split('/', 1)
dest_path = os.path.join(self.root, remote_path)
copy(db_file, dest_path)
return True
daemon._rsync_file = _rsync_file
def _run_once(self, node, conf_updates=None, daemon=None):
daemon = daemon or self._get_daemon(node, conf_updates)
self._install_fake_rsync_file(daemon)
with mock.patch('swift.common.db_replicator.whataremyips',
new=lambda *a, **kw: [node['replication_ip']]), \
unit.mock_check_drive(isdir=not daemon.mount_check,

View File

@ -95,6 +95,11 @@ def mocked_http_conn(*args, **kwargs):
yield fake_conn
@contextmanager
def noop_timeout(duration):
yield
@patch_policies
class TestDirectClient(unittest.TestCase):
@ -117,6 +122,10 @@ class TestDirectClient(unittest.TestCase):
self.account, self.container, self.obj))
self.user_agent = 'direct-client %s' % os.getpid()
patcher = mock.patch.object(direct_client, 'Timeout', noop_timeout)
patcher.start()
self.addCleanup(patcher.stop)
def test_gen_headers(self):
stub_user_agent = 'direct-client %s' % os.getpid()
@ -450,6 +459,67 @@ class TestDirectClient(unittest.TestCase):
self.assertEqual(err.http_status, 500)
self.assertTrue('DELETE' in str(err))
def test_direct_put_container(self):
body = 'Let us begin with a quick introduction'
headers = {'x-foo': 'bar', 'Content-Length': str(len(body)),
'Content-Type': 'application/json',
'User-Agent': 'my UA'}
with mocked_http_conn(204) as conn:
rv = direct_client.direct_put_container(
self.node, self.part, self.account, self.container,
contents=body, headers=headers)
self.assertEqual(conn.host, self.node['ip'])
self.assertEqual(conn.port, self.node['port'])
self.assertEqual(conn.method, 'PUT')
self.assertEqual(conn.path, self.container_path)
self.assertEqual(conn.req_headers['Content-Length'],
str(len(body)))
self.assertEqual(conn.req_headers['Content-Type'],
'application/json')
self.assertEqual(conn.req_headers['User-Agent'], 'my UA')
self.assertTrue('x-timestamp' in conn.req_headers)
self.assertEqual('bar', conn.req_headers.get('x-foo'))
self.assertEqual(md5(body).hexdigest(), conn.etag.hexdigest())
self.assertIsNone(rv)
def test_direct_put_container_chunked(self):
body = 'Let us begin with a quick introduction'
headers = {'x-foo': 'bar', 'Content-Type': 'application/json'}
with mocked_http_conn(204) as conn:
rv = direct_client.direct_put_container(
self.node, self.part, self.account, self.container,
contents=body, headers=headers)
self.assertEqual(conn.host, self.node['ip'])
self.assertEqual(conn.port, self.node['port'])
self.assertEqual(conn.method, 'PUT')
self.assertEqual(conn.path, self.container_path)
self.assertEqual(conn.req_headers['Transfer-Encoding'], 'chunked')
self.assertEqual(conn.req_headers['Content-Type'],
'application/json')
self.assertTrue('x-timestamp' in conn.req_headers)
self.assertEqual('bar', conn.req_headers.get('x-foo'))
self.assertNotIn('Content-Length', conn.req_headers)
expected_sent = '%0x\r\n%s\r\n0\r\n\r\n' % (len(body), body)
self.assertEqual(md5(expected_sent).hexdigest(),
conn.etag.hexdigest())
self.assertIsNone(rv)
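# A minimal sketch of the chunked framing asserted above; the helper name
# is hypothetical, but the wire format matches the expected_sent string.
def frame_single_chunk(body):
    # "<hex length>\r\n<data>\r\n" followed by the "0\r\n\r\n" terminator
    return '%x\r\n%s\r\n0\r\n\r\n' % (len(body), body)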
def test_direct_put_container_fail(self):
with mock.patch('swift.common.bufferedhttp.http_connect_raw',
side_effect=Exception('conn failed')):
with self.assertRaises(Exception) as cm:
direct_client.direct_put_container(
self.node, self.part, self.account, self.container)
self.assertEqual('conn failed', str(cm.exception))
with mocked_http_conn(Exception('resp failed')):
with self.assertRaises(Exception) as cm:
direct_client.direct_put_container(
self.node, self.part, self.account, self.container)
self.assertEqual('resp failed', str(cm.exception))
def test_direct_put_container_object(self):
headers = {'x-foo': 'bar'}

File diff suppressed because it is too large


@ -1270,9 +1270,10 @@ class TestWorkersStrategy(unittest.TestCase):
pid += 1
sock_count += 1
mypid = os.getpid()
self.assertEqual([
'Started child %s' % 88,
'Started child %s' % 89,
'Started child %s from parent %s' % (88, mypid),
'Started child %s from parent %s' % (89, mypid),
], self.logger.get_lines_for_level('notice'))
self.assertEqual(2, sock_count)
@ -1282,7 +1283,7 @@ class TestWorkersStrategy(unittest.TestCase):
self.strategy.register_worker_exit(88)
self.assertEqual([
'Removing dead child %s' % 88,
'Removing dead child %s from parent %s' % (88, mypid)
], self.logger.get_lines_for_level('error'))
for s, i in self.strategy.new_worker_socks():
@ -1294,9 +1295,9 @@ class TestWorkersStrategy(unittest.TestCase):
self.assertEqual(1, sock_count)
self.assertEqual([
'Started child %s' % 88,
'Started child %s' % 89,
'Started child %s' % 90,
'Started child %s from parent %s' % (88, mypid),
'Started child %s from parent %s' % (89, mypid),
'Started child %s from parent %s' % (90, mypid),
], self.logger.get_lines_for_level('notice'))
def test_post_fork_hook(self):

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1053,7 +1053,7 @@ class TestObjectController(unittest.TestCase):
mock_ring = mock.MagicMock()
mock_ring.get_nodes.return_value = (99, [node])
object_updater.container_ring = mock_ring
mock_update.return_value = ((True, 1))
mock_update.return_value = ((True, 1, None))
object_updater.run_once()
self.assertEqual(1, mock_update.call_count)
self.assertEqual((node, 99, 'PUT', '/a/c/o'),
@ -1061,6 +1061,7 @@ class TestObjectController(unittest.TestCase):
actual_headers = mock_update.call_args_list[0][0][4]
# User-Agent is updated.
expected_post_headers['User-Agent'] = 'object-updater %s' % os.getpid()
expected_post_headers['X-Backend-Accept-Redirect'] = 'true'
self.assertDictEqual(expected_post_headers, actual_headers)
self.assertFalse(
os.listdir(os.path.join(
@ -1073,6 +1074,104 @@ class TestObjectController(unittest.TestCase):
self._test_PUT_then_POST_async_pendings(
POLICIES[1], update_etag='override_etag')
def _check_PUT_redirected_async_pending(self, container_path=None):
# When a container update is redirected, verify that the redirect
# location is persisted in the async pending file.
policy = POLICIES[0]
device_dir = os.path.join(self.testdir, 'sda1')
t_put = next(self.ts)
update_etag = '098f6bcd4621d373cade4e832627b4f6'
put_headers = {
'X-Trans-Id': 'put_trans_id',
'X-Timestamp': t_put.internal,
'Content-Type': 'application/octet-stream;swift_bytes=123456789',
'Content-Length': '4',
'X-Backend-Storage-Policy-Index': int(policy),
'X-Container-Host': 'chost:3200',
'X-Container-Partition': '99',
'X-Container-Device': 'cdevice'}
if container_path:
# the proxy may include this header
put_headers['X-Backend-Container-Path'] = container_path
expected_update_path = '/cdevice/99/%s/o' % container_path
else:
expected_update_path = '/cdevice/99/a/c/o'
if policy.policy_type == EC_POLICY:
put_headers.update({
'X-Object-Sysmeta-Ec-Frag-Index': '2',
'X-Backend-Container-Update-Override-Etag': update_etag,
'X-Object-Sysmeta-Ec-Etag': update_etag})
req = Request.blank('/sda1/p/a/c/o',
environ={'REQUEST_METHOD': 'PUT'},
headers=put_headers, body='test')
resp_headers = {'Location': '/.sharded_a/c_shard_1/o',
'X-Backend-Redirect-Timestamp': next(self.ts).internal}
with mocked_http_conn(301, headers=[resp_headers]) as conn, \
mock.patch('swift.common.utils.HASH_PATH_PREFIX', ''),\
fake_spawn():
resp = req.get_response(self.object_controller)
self.assertEqual(resp.status_int, 201)
self.assertEqual(1, len(conn.requests))
self.assertEqual(expected_update_path, conn.requests[0]['path'])
# whether or not an X-Backend-Container-Path was received from the
# proxy, the async pending file should now have the container_path
# equal to the Location header received in the update response.
async_pending_file_put = os.path.join(
device_dir, diskfile.get_async_dir(policy), 'a83',
'06fbf0b514e5199dfc4e00f42eb5ea83-%s' % t_put.internal)
self.assertTrue(os.path.isfile(async_pending_file_put),
'Expected %s to be a file but it is not.'
% async_pending_file_put)
expected_put_headers = {
'Referer': 'PUT http://localhost/sda1/p/a/c/o',
'X-Trans-Id': 'put_trans_id',
'X-Timestamp': t_put.internal,
'X-Content-Type': 'application/octet-stream;swift_bytes=123456789',
'X-Size': '4',
'X-Etag': '098f6bcd4621d373cade4e832627b4f6',
'User-Agent': 'object-server %s' % os.getpid(),
'X-Backend-Storage-Policy-Index': '%d' % int(policy)}
if policy.policy_type == EC_POLICY:
expected_put_headers['X-Etag'] = update_etag
self.assertEqual(
{'headers': expected_put_headers,
'account': 'a', 'container': 'c', 'obj': 'o', 'op': 'PUT',
'container_path': '.sharded_a/c_shard_1'},
pickle.load(open(async_pending_file_put)))
# when updater is run its first request will be to the redirect
# location that is persisted in the async pending file
with mocked_http_conn(201) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache',
lambda *args: None):
object_updater = updater.ObjectUpdater(
{'devices': self.testdir,
'mount_check': 'false'}, logger=debug_logger())
node = {'id': 1, 'ip': 'chost', 'port': 3200,
'device': 'cdevice'}
mock_ring = mock.MagicMock()
mock_ring.get_nodes.return_value = (99, [node])
object_updater.container_ring = mock_ring
object_updater.run_once()
self.assertEqual(1, len(conn.requests))
self.assertEqual('/cdevice/99/.sharded_a/c_shard_1/o',
conn.requests[0]['path'])
def test_PUT_redirected_async_pending(self):
self._check_PUT_redirected_async_pending()
def test_PUT_redirected_async_pending_with_container_path(self):
self._check_PUT_redirected_async_pending(container_path='.another/c')
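# A minimal sketch, assuming the async pending layout asserted above, of
# how a consumer could recover the update path; the helper is hypothetical.
import pickle

def read_update_path(async_pending_file):
    with open(async_pending_file) as fd:
        data = pickle.load(fd)
    # 'container_path' is only present once an update has been redirected;
    # otherwise the update goes to the root container.
    return data.get('container_path') or '%s/%s' % (data['account'],
                                                    data['container'])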
def test_POST_quarantine_zbyte(self):
timestamp = normalize_timestamp(time())
req = Request.blank('/sda1/p/a/c/o', environ={'REQUEST_METHOD': 'PUT'},
@ -5263,6 +5362,95 @@ class TestObjectController(unittest.TestCase):
'X-Backend-Container-Update-Override-Content-Type': 'ignored',
'X-Backend-Container-Update-Override-Foo': 'ignored'})
def test_PUT_container_update_to_shard(self):
# verify that an alternate container update path is respected when
# included in request headers
def do_test(container_path, expected_path, expected_container_path):
policy = random.choice(list(POLICIES))
container_updates = []
def capture_updates(
ip, port, method, path, headers, *args, **kwargs):
container_updates.append((ip, port, method, path, headers))
pickle_async_update_args = []
def fake_pickle_async_update(*args):
pickle_async_update_args.append(args)
diskfile_mgr = self.object_controller._diskfile_router[policy]
diskfile_mgr.pickle_async_update = fake_pickle_async_update
ts_put = next(self.ts)
headers = {
'X-Timestamp': ts_put.internal,
'X-Trans-Id': '123',
'X-Container-Host': 'chost:cport',
'X-Container-Partition': 'cpartition',
'X-Container-Device': 'cdevice',
'Content-Type': 'text/plain',
'X-Object-Sysmeta-Ec-Frag-Index': 0,
'X-Backend-Storage-Policy-Index': int(policy),
}
if container_path is not None:
headers['X-Backend-Container-Path'] = container_path
req = Request.blank('/sda1/0/a/c/o', method='PUT',
headers=headers, body='')
with mocked_http_conn(
500, give_connect=capture_updates) as fake_conn:
with fake_spawn():
resp = req.get_response(self.object_controller)
self.assertRaises(StopIteration, fake_conn.code_iter.next)
self.assertEqual(resp.status_int, 201)
self.assertEqual(len(container_updates), 1)
# verify expected path used in update request
ip, port, method, path, headers = container_updates[0]
self.assertEqual(ip, 'chost')
self.assertEqual(port, 'cport')
self.assertEqual(method, 'PUT')
self.assertEqual(path, '/cdevice/cpartition/%s/o' % expected_path)
# verify that the pickled update *always* has root container
self.assertEqual(1, len(pickle_async_update_args))
(objdevice, account, container, obj, data, timestamp,
policy) = pickle_async_update_args[0]
self.assertEqual(objdevice, 'sda1')
self.assertEqual(account, 'a') # NB user account
self.assertEqual(container, 'c') # NB root container
self.assertEqual(obj, 'o')
self.assertEqual(timestamp, ts_put.internal)
self.assertEqual(policy, policy)
expected_data = {
'headers': HeaderKeyDict({
'X-Size': '0',
'User-Agent': 'object-server %s' % os.getpid(),
'X-Content-Type': 'text/plain',
'X-Timestamp': ts_put.internal,
'X-Trans-Id': '123',
'Referer': 'PUT http://localhost/sda1/0/a/c/o',
'X-Backend-Storage-Policy-Index': int(policy),
'X-Etag': 'd41d8cd98f00b204e9800998ecf8427e'}),
'obj': 'o',
'account': 'a',
'container': 'c',
'op': 'PUT'}
if expected_container_path:
expected_data['container_path'] = expected_container_path
self.assertEqual(expected_data, data)
do_test('a_shard/c_shard', 'a_shard/c_shard', 'a_shard/c_shard')
do_test('', 'a/c', None)
do_test(None, 'a/c', None)
# TODO: should these cases trigger a 400 response rather than
# defaulting to root path?
do_test('garbage', 'a/c', None)
do_test('/', 'a/c', None)
do_test('/no-acct', 'a/c', None)
do_test('no-cont/', 'a/c', None)
do_test('too/many/parts', 'a/c', None)
do_test('/leading/slash', 'a/c', None)
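# A sketch of the validation the cases above imply (hypothetical helper):
# a supplied container path is honoured only when it splits into exactly
# two non-empty segments; anything else defaults to the root container.
def resolve_update_path(container_path, root_path):
    if container_path:
        parts = container_path.split('/')
        if len(parts) == 2 and all(parts):
            return container_path
    return root_path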
def test_container_update_async(self):
policy = random.choice(list(POLICIES))
req = Request.blank(
@ -5335,23 +5523,21 @@ class TestObjectController(unittest.TestCase):
'X-Container-Partition': '20',
'X-Container-Host': '1.2.3.4:5',
'X-Container-Device': 'sdb1'})
with mock.patch.object(object_server, 'spawn',
local_fake_spawn):
with mock.patch.object(self.object_controller,
'async_update',
local_fake_async_update):
resp = req.get_response(self.object_controller)
# check the response is completed and successful
self.assertEqual(resp.status_int, 201)
# check that async_update hasn't been called
self.assertFalse(len(called_async_update_args))
# now do the work in greenthreads
for func, a, kw in saved_spawn_calls:
gt = spawn(func, *a, **kw)
greenthreads.append(gt)
# wait for the greenthreads to finish
for gt in greenthreads:
gt.wait()
with mock.patch.object(object_server, 'spawn', local_fake_spawn), \
mock.patch.object(self.object_controller, 'async_update',
local_fake_async_update):
resp = req.get_response(self.object_controller)
# check the response is completed and successful
self.assertEqual(resp.status_int, 201)
# check that async_update hasn't been called
self.assertFalse(len(called_async_update_args))
# now do the work in greenthreads
for func, a, kw in saved_spawn_calls:
gt = spawn(func, *a, **kw)
greenthreads.append(gt)
# wait for the greenthreads to finish
for gt in greenthreads:
gt.wait()
# check that the calls to async_update have happened
headers_out = {'X-Size': '0',
'X-Content-Type': 'application/burrito',
@ -5362,7 +5548,8 @@ class TestObjectController(unittest.TestCase):
'X-Etag': 'd41d8cd98f00b204e9800998ecf8427e'}
expected = [('PUT', 'a', 'c', 'o', '1.2.3.4:5', '20', 'sdb1',
headers_out, 'sda1', POLICIES[0]),
{'logger_thread_locals': (None, None)}]
{'logger_thread_locals': (None, None),
'container_path': None}]
self.assertEqual(called_async_update_args, [expected])
def test_container_update_as_greenthread_with_timeout(self):


@ -65,7 +65,9 @@ class TestObjectUpdater(unittest.TestCase):
{'id': 1, 'ip': '127.0.0.1', 'port': 1,
'device': 'sda1', 'zone': 2},
{'id': 2, 'ip': '127.0.0.1', 'port': 1,
'device': 'sda1', 'zone': 4}], 30),
'device': 'sda1', 'zone': 4},
{'id': 3, 'ip': '127.0.0.1', 'port': 1,
'device': 'sda1', 'zone': 6}], 30),
f)
self.devices_dir = os.path.join(self.testdir, 'devices')
os.mkdir(self.devices_dir)
@ -74,6 +76,7 @@ class TestObjectUpdater(unittest.TestCase):
for policy in POLICIES:
os.mkdir(os.path.join(self.sda1, get_tmp_dir(policy)))
self.logger = debug_logger()
self.ts_iter = make_timestamp_iter()
def tearDown(self):
rmtree(self.testdir, ignore_errors=1)
@ -299,19 +302,22 @@ class TestObjectUpdater(unittest.TestCase):
self.assertIn("sweep progress", info_lines[1])
# the space ensures it's a positive number
self.assertIn(
"2 successes, 0 failures, 0 quarantines, 2 unlinks, 0 error",
"2 successes, 0 failures, 0 quarantines, 2 unlinks, 0 errors, "
"0 redirects",
info_lines[1])
self.assertIn(self.sda1, info_lines[1])
self.assertIn("sweep progress", info_lines[2])
self.assertIn(
"4 successes, 0 failures, 0 quarantines, 4 unlinks, 0 error",
"4 successes, 0 failures, 0 quarantines, 4 unlinks, 0 errors, "
"0 redirects",
info_lines[2])
self.assertIn(self.sda1, info_lines[2])
self.assertIn("sweep complete", info_lines[3])
self.assertIn(
"5 successes, 0 failures, 0 quarantines, 5 unlinks, 0 error",
"5 successes, 0 failures, 0 quarantines, 5 unlinks, 0 errors, "
"0 redirects",
info_lines[3])
self.assertIn(self.sda1, info_lines[3])
@ -547,6 +553,26 @@ class TestObjectUpdater(unittest.TestCase):
{'successes': 1, 'unlinks': 1,
'async_pendings': 1})
def _write_async_update(self, dfmanager, timestamp, policy,
headers=None, container_path=None):
# write an async
account, container, obj = 'a', 'c', 'o'
op = 'PUT'
headers_out = headers or {
'x-size': 0,
'x-content-type': 'text/plain',
'x-etag': 'd41d8cd98f00b204e9800998ecf8427e',
'x-timestamp': timestamp.internal,
'X-Backend-Storage-Policy-Index': int(policy),
'User-Agent': 'object-server %s' % os.getpid()
}
data = {'op': op, 'account': account, 'container': container,
'obj': obj, 'headers': headers_out}
if container_path:
data['container_path'] = container_path
dfmanager.pickle_async_update(self.sda1, account, container, obj,
data, timestamp, policy)
def test_obj_put_async_updates(self):
ts_iter = make_timestamp_iter()
policies = list(POLICIES)
@ -562,16 +588,12 @@ class TestObjectUpdater(unittest.TestCase):
async_dir = os.path.join(self.sda1, get_async_dir(policies[0]))
os.mkdir(async_dir)
def do_test(headers_out, expected):
def do_test(headers_out, expected, container_path=None):
# write an async
dfmanager = DiskFileManager(conf, daemon.logger)
account, container, obj = 'a', 'c', 'o'
op = 'PUT'
data = {'op': op, 'account': account, 'container': container,
'obj': obj, 'headers': headers_out}
dfmanager.pickle_async_update(self.sda1, account, container, obj,
data, next(ts_iter), policies[0])
self._write_async_update(dfmanager, next(ts_iter), policies[0],
headers=headers_out,
container_path=container_path)
request_log = []
def capture(*args, **kwargs):
@ -613,11 +635,21 @@ class TestObjectUpdater(unittest.TestCase):
'X-Etag': 'd41d8cd98f00b204e9800998ecf8427e',
'X-Timestamp': ts.normal,
'X-Backend-Storage-Policy-Index': str(int(policies[0])),
'User-Agent': 'object-updater %s' % os.getpid()
'User-Agent': 'object-updater %s' % os.getpid(),
'X-Backend-Accept-Redirect': 'true',
}
# always expect X-Backend-Accept-Redirect to be true
do_test(headers_out, expected, container_path='.shards_a/shard_c')
do_test(headers_out, expected)
# ...unless X-Backend-Accept-Redirect is already set
expected['X-Backend-Accept-Redirect'] = 'false'
headers_out_2 = dict(headers_out)
headers_out_2['X-Backend-Accept-Redirect'] = 'false'
do_test(headers_out_2, expected)
# updater should add policy header if missing
expected['X-Backend-Accept-Redirect'] = 'true'
headers_out['X-Backend-Storage-Policy-Index'] = None
do_test(headers_out, expected)
@ -632,6 +664,414 @@ class TestObjectUpdater(unittest.TestCase):
'X-Backend-Storage-Policy-Index')
do_test(headers_out, expected)
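# A sketch, under the assumptions the tests above encode, of how the
# updater's outgoing headers are derived (the function name is
# hypothetical):
def updater_headers(async_headers):
    out = dict(async_headers)
    # advertise redirect support unless the async update pinned a value
    out.setdefault('X-Backend-Accept-Redirect', 'true')
    return out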
def _check_update_requests(self, requests, timestamp, policy):
# do some sanity checks on update request
expected_headers = {
'X-Size': '0',
'X-Content-Type': 'text/plain',
'X-Etag': 'd41d8cd98f00b204e9800998ecf8427e',
'X-Timestamp': timestamp.internal,
'X-Backend-Storage-Policy-Index': str(int(policy)),
'User-Agent': 'object-updater %s' % os.getpid(),
'X-Backend-Accept-Redirect': 'true'}
for request in requests:
self.assertEqual('PUT', request['method'])
self.assertDictEqual(expected_headers, request['headers'])
def test_obj_put_async_root_update_redirected(self):
policies = list(POLICIES)
random.shuffle(policies)
# setup updater
conf = {
'devices': self.devices_dir,
'mount_check': 'false',
'swift_dir': self.testdir,
}
daemon = object_updater.ObjectUpdater(conf, logger=self.logger)
async_dir = os.path.join(self.sda1, get_async_dir(policies[0]))
os.mkdir(async_dir)
dfmanager = DiskFileManager(conf, daemon.logger)
ts_obj = next(self.ts_iter)
self._write_async_update(dfmanager, ts_obj, policies[0])
# run once
ts_redirect_1 = next(self.ts_iter)
ts_redirect_2 = next(self.ts_iter)
fake_responses = [
# first round of update attempts, newest redirect should be chosen
(200, {}),
(301, {'Location': '/.shards_a/c_shard_new/o',
'X-Backend-Redirect-Timestamp': ts_redirect_2.internal}),
(301, {'Location': '/.shards_a/c_shard_old/o',
'X-Backend-Redirect-Timestamp': ts_redirect_1.internal}),
# second round of update attempts
(200, {}),
(200, {}),
(200, {}),
]
fake_status_codes, fake_headers = zip(*fake_responses)
with mocked_http_conn(
*fake_status_codes, headers=fake_headers) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests[:3], ts_obj, policies[0])
self._check_update_requests(conn.requests[3:], ts_obj, policies[0])
self.assertEqual(['/sda1/0/a/c/o'] * 3 +
['/sda1/0/.shards_a/c_shard_new/o'] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'redirects': 1, 'successes': 1,
'unlinks': 1, 'async_pendings': 1},
daemon.logger.get_increment_counts())
self.assertFalse(os.listdir(async_dir)) # no async file
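# A minimal sketch of the redirect selection exercised above (hypothetical
# helper): among the 301 responses, the Location carrying the newest
# X-Backend-Redirect-Timestamp wins. Normalized internal timestamps are
# fixed width, so string comparison should order them chronologically.
def pick_redirect(responses):
    redirects = [(headers['X-Backend-Redirect-Timestamp'],
                  headers['Location'])
                 for status, headers in responses if status == 301]
    return max(redirects)[1] if redirects else None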
def test_obj_put_async_root_update_redirected_previous_success(self):
policies = list(POLICIES)
random.shuffle(policies)
# setup updater
conf = {
'devices': self.devices_dir,
'mount_check': 'false',
'swift_dir': self.testdir,
}
daemon = object_updater.ObjectUpdater(conf, logger=self.logger)
async_dir = os.path.join(self.sda1, get_async_dir(policies[0]))
os.mkdir(async_dir)
dfmanager = DiskFileManager(conf, daemon.logger)
ts_obj = next(self.ts_iter)
self._write_async_update(dfmanager, ts_obj, policies[0])
orig_async_path, orig_async_data = self._check_async_file(async_dir)
# run once
with mocked_http_conn(
507, 200, 507) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests, ts_obj, policies[0])
self.assertEqual(['/sda1/0/a/c/o'] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'failures': 1, 'async_pendings': 1},
daemon.logger.get_increment_counts())
async_path, async_data = self._check_async_file(async_dir)
self.assertEqual(dict(orig_async_data, successes=[1]), async_data)
# run again - expect 3 redirected updates despite previous success
ts_redirect = next(self.ts_iter)
resp_headers_1 = {'Location': '/.shards_a/c_shard_1/o',
'X-Backend-Redirect-Timestamp': ts_redirect.internal}
fake_responses = (
# 1st round of redirects, 2nd round of redirects
[(301, resp_headers_1)] * 2 + [(200, {})] * 3)
fake_status_codes, fake_headers = zip(*fake_responses)
with mocked_http_conn(
*fake_status_codes, headers=fake_headers) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests[:2], ts_obj, policies[0])
self._check_update_requests(conn.requests[2:], ts_obj, policies[0])
root_part = daemon.container_ring.get_part('a/c')
shard_1_part = daemon.container_ring.get_part('.shards_a/c_shard_1')
self.assertEqual(
['/sda1/%s/a/c/o' % root_part] * 2 +
['/sda1/%s/.shards_a/c_shard_1/o' % shard_1_part] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'redirects': 1, 'successes': 1, 'failures': 1, 'unlinks': 1,
'async_pendings': 1},
daemon.logger.get_increment_counts())
self.assertFalse(os.listdir(async_dir)) # no async file
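# A sketch of the per-node bookkeeping visible in the async data above
# (names are assumptions): replicas recorded under 'successes' are skipped
# on retry, but a redirect re-targets every replica at the new path.
def nodes_needing_update(nodes, async_data, redirected=False):
    if redirected:
        return list(nodes)  # new location: all replicas need the update
    # whether 'successes' holds node indexes or node ids is an assumption
    done = set(async_data.get('successes', []))
    return [n for i, n in enumerate(nodes) if i not in done]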
def _check_async_file(self, async_dir):
async_subdirs = os.listdir(async_dir)
self.assertEqual([mock.ANY], async_subdirs)
async_files = os.listdir(os.path.join(async_dir, async_subdirs[0]))
self.assertEqual([mock.ANY], async_files)
async_path = os.path.join(
async_dir, async_subdirs[0], async_files[0])
with open(async_path) as fd:
async_data = pickle.load(fd)
return async_path, async_data
def _check_obj_put_async_update_bad_redirect_headers(self, headers):
policies = list(POLICIES)
random.shuffle(policies)
# setup updater
conf = {
'devices': self.devices_dir,
'mount_check': 'false',
'swift_dir': self.testdir,
}
daemon = object_updater.ObjectUpdater(conf, logger=self.logger)
async_dir = os.path.join(self.sda1, get_async_dir(policies[0]))
os.mkdir(async_dir)
dfmanager = DiskFileManager(conf, daemon.logger)
ts_obj = next(self.ts_iter)
self._write_async_update(dfmanager, ts_obj, policies[0])
orig_async_path, orig_async_data = self._check_async_file(async_dir)
fake_responses = [
(301, headers),
(301, headers),
(301, headers),
]
fake_status_codes, fake_headers = zip(*fake_responses)
with mocked_http_conn(
*fake_status_codes, headers=fake_headers) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests, ts_obj, policies[0])
self.assertEqual(['/sda1/0/a/c/o'] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'failures': 1, 'async_pendings': 1},
daemon.logger.get_increment_counts())
# async file still intact
async_path, async_data = self._check_async_file(async_dir)
self.assertEqual(orig_async_path, async_path)
self.assertEqual(orig_async_data, async_data)
return daemon
def test_obj_put_async_root_update_missing_location_header(self):
headers = {
'X-Backend-Redirect-Timestamp': next(self.ts_iter).internal}
self._check_obj_put_async_update_bad_redirect_headers(headers)
def test_obj_put_async_root_update_bad_location_header(self):
headers = {
'Location': 'bad bad bad',
'X-Backend-Redirect-Timestamp': next(self.ts_iter).internal}
daemon = self._check_obj_put_async_update_bad_redirect_headers(headers)
error_lines = daemon.logger.get_lines_for_level('error')
self.assertIn('Container update failed', error_lines[0])
self.assertIn('Invalid path: bad%20bad%20bad', error_lines[0])
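# A sketch of the Location validation behind the error above (assumed
# logic): the redirect target must look like /account/container/obj.
from six.moves import urllib

def parse_redirect_location(location):
    parts = location.strip('/').split('/')
    if len(parts) != 3 or not all(parts):
        raise ValueError('Invalid path: %s' % urllib.parse.quote(location))
    account, container = parts[0], parts[1]
    return '%s/%s' % (account, container)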
def test_obj_put_async_shard_update_redirected_twice(self):
policies = list(POLICIES)
random.shuffle(policies)
# setup updater
conf = {
'devices': self.devices_dir,
'mount_check': 'false',
'swift_dir': self.testdir,
}
daemon = object_updater.ObjectUpdater(conf, logger=self.logger)
async_dir = os.path.join(self.sda1, get_async_dir(policies[0]))
os.mkdir(async_dir)
dfmanager = DiskFileManager(conf, daemon.logger)
ts_obj = next(self.ts_iter)
self._write_async_update(dfmanager, ts_obj, policies[0],
container_path='.shards_a/c_shard_older')
orig_async_path, orig_async_data = self._check_async_file(async_dir)
# run once
ts_redirect_1 = next(self.ts_iter)
ts_redirect_2 = next(self.ts_iter)
ts_redirect_3 = next(self.ts_iter)
fake_responses = [
# 1st round of redirects, newest redirect should be chosen
(301, {'Location': '/.shards_a/c_shard_old/o',
'X-Backend-Redirect-Timestamp': ts_redirect_1.internal}),
(301, {'Location': '/.shards_a/c_shard_new/o',
'X-Backend-Redirect-Timestamp': ts_redirect_2.internal}),
(301, {'Location': '/.shards_a/c_shard_old/o',
'X-Backend-Redirect-Timestamp': ts_redirect_1.internal}),
# 2nd round of redirects
(301, {'Location': '/.shards_a/c_shard_newer/o',
'X-Backend-Redirect-Timestamp': ts_redirect_3.internal}),
(301, {'Location': '/.shards_a/c_shard_newer/o',
'X-Backend-Redirect-Timestamp': ts_redirect_3.internal}),
(301, {'Location': '/.shards_a/c_shard_newer/o',
'X-Backend-Redirect-Timestamp': ts_redirect_3.internal}),
]
fake_status_codes, fake_headers = zip(*fake_responses)
with mocked_http_conn(
*fake_status_codes, headers=fake_headers) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests, ts_obj, policies[0])
# only *one* set of redirected requests is attempted per cycle
older_part = daemon.container_ring.get_part('.shards_a/c_shard_older')
new_part = daemon.container_ring.get_part('.shards_a/c_shard_new')
newer_part = daemon.container_ring.get_part('.shards_a/c_shard_newer')
self.assertEqual(
['/sda1/%s/.shards_a/c_shard_older/o' % older_part] * 3 +
['/sda1/%s/.shards_a/c_shard_new/o' % new_part] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'redirects': 2, 'async_pendings': 1},
daemon.logger.get_increment_counts())
# update failed, we still have pending file with most recent redirect
# response Location header value added to data
async_path, async_data = self._check_async_file(async_dir)
self.assertEqual(orig_async_path, async_path)
self.assertEqual(
dict(orig_async_data, container_path='.shards_a/c_shard_newer',
redirect_history=['.shards_a/c_shard_new',
'.shards_a/c_shard_newer']),
async_data)
# next cycle, should get latest redirect from pickled async update
fake_responses = [(200, {})] * 3
fake_status_codes, fake_headers = zip(*fake_responses)
with mocked_http_conn(
*fake_status_codes, headers=fake_headers) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests, ts_obj, policies[0])
self.assertEqual(
['/sda1/%s/.shards_a/c_shard_newer/o' % newer_part] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'redirects': 2, 'successes': 1, 'unlinks': 1,
'async_pendings': 1},
daemon.logger.get_increment_counts())
self.assertFalse(os.listdir(async_dir)) # no async file
def test_obj_put_async_update_redirection_loop(self):
policies = list(POLICIES)
random.shuffle(policies)
# setup updater
conf = {
'devices': self.devices_dir,
'mount_check': 'false',
'swift_dir': self.testdir,
}
daemon = object_updater.ObjectUpdater(conf, logger=self.logger)
async_dir = os.path.join(self.sda1, get_async_dir(policies[0]))
os.mkdir(async_dir)
dfmanager = DiskFileManager(conf, daemon.logger)
ts_obj = next(self.ts_iter)
self._write_async_update(dfmanager, ts_obj, policies[0])
orig_async_path, orig_async_data = self._check_async_file(async_dir)
# run once
ts_redirect = next(self.ts_iter)
resp_headers_1 = {'Location': '/.shards_a/c_shard_1/o',
'X-Backend-Redirect-Timestamp': ts_redirect.internal}
resp_headers_2 = {'Location': '/.shards_a/c_shard_2/o',
'X-Backend-Redirect-Timestamp': ts_redirect.internal}
fake_responses = (
# 1st round of redirects, 2nd round of redirects
[(301, resp_headers_1)] * 3 + [(301, resp_headers_2)] * 3)
fake_status_codes, fake_headers = zip(*fake_responses)
with mocked_http_conn(
*fake_status_codes, headers=fake_headers) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests[:3], ts_obj, policies[0])
self._check_update_requests(conn.requests[3:], ts_obj, policies[0])
# only *one* set of redirected requests is attempted per cycle
root_part = daemon.container_ring.get_part('a/c')
shard_1_part = daemon.container_ring.get_part('.shards_a/c_shard_1')
shard_2_part = daemon.container_ring.get_part('.shards_a/c_shard_2')
shard_3_part = daemon.container_ring.get_part('.shards_a/c_shard_3')
self.assertEqual(['/sda1/%s/a/c/o' % root_part] * 3 +
['/sda1/%s/.shards_a/c_shard_1/o' % shard_1_part] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'redirects': 2, 'async_pendings': 1},
daemon.logger.get_increment_counts())
# update failed, we still have pending file with most recent redirect
# response Location header value added to data
async_path, async_data = self._check_async_file(async_dir)
self.assertEqual(orig_async_path, async_path)
self.assertEqual(
dict(orig_async_data, container_path='.shards_a/c_shard_2',
redirect_history=['.shards_a/c_shard_1',
'.shards_a/c_shard_2']),
async_data)
# next cycle, more redirects! first is to previously visited location
resp_headers_3 = {'Location': '/.shards_a/c_shard_3/o',
'X-Backend-Redirect-Timestamp': ts_redirect.internal}
fake_responses = (
# 1st round of redirects, 2nd round of redirects
[(301, resp_headers_1)] * 3 + [(301, resp_headers_3)] * 3)
fake_status_codes, fake_headers = zip(*fake_responses)
with mocked_http_conn(
*fake_status_codes, headers=fake_headers) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests[:3], ts_obj, policies[0])
self._check_update_requests(conn.requests[3:], ts_obj, policies[0])
# first try the previously persisted container path, response to that
# creates a loop so ignore and send to root
self.assertEqual(
['/sda1/%s/.shards_a/c_shard_2/o' % shard_2_part] * 3 +
['/sda1/%s/a/c/o' % root_part] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'redirects': 4, 'async_pendings': 1},
daemon.logger.get_increment_counts())
# update failed, we still have pending file with most recent redirect
# response Location header value from root added to persisted data
async_path, async_data = self._check_async_file(async_dir)
self.assertEqual(orig_async_path, async_path)
# note: redirect_history was reset when falling back to root
self.assertEqual(
dict(orig_async_data, container_path='.shards_a/c_shard_3',
redirect_history=['.shards_a/c_shard_3']),
async_data)
# next cycle, more redirects! first is to a location visited previously
# but not since last fall back to root, so that location IS tried;
# second is to a location visited since last fall back to root so that
# location is NOT tried
fake_responses = (
# 1st round of redirects, 2nd round of redirects
[(301, resp_headers_1)] * 3 + [(301, resp_headers_3)] * 3)
fake_status_codes, fake_headers = zip(*fake_responses)
with mocked_http_conn(
*fake_status_codes, headers=fake_headers) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests, ts_obj, policies[0])
self.assertEqual(
['/sda1/%s/.shards_a/c_shard_3/o' % shard_3_part] * 3 +
['/sda1/%s/.shards_a/c_shard_1/o' % shard_1_part] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'redirects': 6, 'async_pendings': 1},
daemon.logger.get_increment_counts())
# update failed, we still have pending file, but container_path is None
# because most recent redirect location was a repeat
async_path, async_data = self._check_async_file(async_dir)
self.assertEqual(orig_async_path, async_path)
self.assertEqual(
dict(orig_async_data, container_path=None,
redirect_history=[]),
async_data)
# next cycle, persisted container path is None so update should go to
# root, this time it succeeds
fake_responses = [(200, {})] * 3
fake_status_codes, fake_headers = zip(*fake_responses)
with mocked_http_conn(
*fake_status_codes, headers=fake_headers) as conn:
with mock.patch('swift.obj.updater.dump_recon_cache'):
daemon.run_once()
self._check_update_requests(conn.requests, ts_obj, policies[0])
self.assertEqual(['/sda1/%s/a/c/o' % root_part] * 3,
[req['path'] for req in conn.requests])
self.assertEqual(
{'redirects': 6, 'successes': 1, 'unlinks': 1,
'async_pendings': 1},
daemon.logger.get_increment_counts())
self.assertFalse(os.listdir(async_dir)) # no async file
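# A minimal sketch of the loop handling these tests exercise (names are
# assumptions, not the real updater internals): a redirect is followed
# only if its target has not been seen since the last fall-back to root.
def record_redirect(async_data, container_path):
    history = async_data.setdefault('redirect_history', [])
    if container_path in history:
        # loop detected: forget the shard path and fall back to root
        async_data['container_path'] = None
        async_data['redirect_history'] = []
        return False
    history.append(container_path)
    async_data['container_path'] = container_path
    return True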
if __name__ == '__main__':
unittest.main()


@ -14,6 +14,7 @@
# limitations under the License.
import itertools
import json
from collections import defaultdict
import unittest
import mock
@ -23,11 +24,14 @@ from swift.proxy.controllers.base import headers_to_container_info, \
Controller, GetOrHeadHandler, bytes_to_skip
from swift.common.swob import Request, HTTPException, RESPONSE_REASONS
from swift.common import exceptions
from swift.common.utils import split_path
from swift.common.utils import split_path, ShardRange, Timestamp
from swift.common.header_key_dict import HeaderKeyDict
from swift.common.http import is_success
from swift.common.storage_policy import StoragePolicy, StoragePolicyCollection
from test.unit import fake_http_connect, FakeRing, FakeMemcache, PatchPolicies
from test.unit import (
fake_http_connect, FakeRing, FakeMemcache, PatchPolicies, FakeLogger,
make_timestamp_iter,
mocked_http_conn)
from swift.proxy import server as proxy_server
from swift.common.request_helpers import (
get_sys_meta_prefix, get_object_transient_sysmeta
@ -172,7 +176,8 @@ class TestFuncs(unittest.TestCase):
def setUp(self):
self.app = proxy_server.Application(None, FakeMemcache(),
account_ring=FakeRing(),
container_ring=FakeRing())
container_ring=FakeRing(),
logger=FakeLogger())
def test_get_info_zero_recheck(self):
mock_cache = mock.Mock()
@ -1030,3 +1035,146 @@ class TestFuncs(unittest.TestCase):
# prime numbers
self.assertEqual(bytes_to_skip(11, 7), 4)
self.assertEqual(bytes_to_skip(97, 7873823), 55)
def test_get_shard_ranges_for_container_get(self):
ts_iter = make_timestamp_iter()
shard_ranges = [dict(ShardRange(
'.sharded_a/sr%d' % i, next(ts_iter), '%d_lower' % i,
'%d_upper' % i, object_count=i, bytes_used=1024 * i,
meta_timestamp=next(ts_iter)))
for i in range(3)]
base = Controller(self.app)
req = Request.blank('/v1/a/c', method='GET')
resp_headers = {'X-Backend-Record-Type': 'shard'}
with mocked_http_conn(
200, 200, body_iter=iter(['', json.dumps(shard_ranges)]),
headers=resp_headers
) as fake_conn:
actual = base._get_shard_ranges(req, 'a', 'c')
# account info
captured = fake_conn.requests
self.assertEqual('HEAD', captured[0]['method'])
self.assertEqual('a', captured[0]['path'][7:])
# container GET
self.assertEqual('GET', captured[1]['method'])
self.assertEqual('a/c', captured[1]['path'][7:])
self.assertEqual('format=json', captured[1]['qs'])
self.assertEqual(
'shard', captured[1]['headers'].get('X-Backend-Record-Type'))
self.assertEqual(shard_ranges, [dict(pr) for pr in actual])
self.assertFalse(self.app.logger.get_lines_for_level('error'))
def test_get_shard_ranges_for_object_put(self):
ts_iter = make_timestamp_iter()
shard_ranges = [dict(ShardRange(
'.sharded_a/sr%d' % i, next(ts_iter), '%d_lower' % i,
'%d_upper' % i, object_count=i, bytes_used=1024 * i,
meta_timestamp=next(ts_iter)))
for i in range(3)]
base = Controller(self.app)
req = Request.blank('/v1/a/c/o', method='PUT')
resp_headers = {'X-Backend-Record-Type': 'shard'}
with mocked_http_conn(
200, 200, body_iter=iter(['', json.dumps(shard_ranges[1:2])]),
headers=resp_headers
) as fake_conn:
actual = base._get_shard_ranges(req, 'a', 'c', '1_test')
# account info
captured = fake_conn.requests
self.assertEqual('HEAD', captured[0]['method'])
self.assertEqual('a', captured[0]['path'][7:])
# container GET
self.assertEqual('GET', captured[1]['method'])
self.assertEqual('a/c', captured[1]['path'][7:])
params = sorted(captured[1]['qs'].split('&'))
self.assertEqual(
['format=json', 'includes=1_test'], params)
self.assertEqual(
'shard', captured[1]['headers'].get('X-Backend-Record-Type'))
self.assertEqual(shard_ranges[1:2], [dict(pr) for pr in actual])
self.assertFalse(self.app.logger.get_lines_for_level('error'))
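# A sketch of the backend request these assertions describe: shard ranges
# are fetched with a container GET marked X-Backend-Record-Type: shard,
# plus an 'includes' parameter to select the range for one object name
# (the helper itself is illustrative):
def shard_range_request(account, container, includes=None):
    params = {'format': 'json'}
    if includes is not None:
        params['includes'] = includes
    headers = {'X-Backend-Record-Type': 'shard'}
    return 'GET', '/%s/%s' % (account, container), params, headers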
def _check_get_shard_ranges_bad_data(self, body):
base = Controller(self.app)
req = Request.blank('/v1/a/c/o', method='PUT')
# empty response
headers = {'X-Backend-Record-Type': 'shard'}
with mocked_http_conn(200, 200, body_iter=iter(['', body]),
headers=headers):
actual = base._get_shard_ranges(req, 'a', 'c', '1_test')
self.assertIsNone(actual)
lines = self.app.logger.get_lines_for_level('error')
return lines
def test_get_shard_ranges_empty_body(self):
error_lines = self._check_get_shard_ranges_bad_data('')
self.assertIn('Problem with listing response', error_lines[0])
self.assertIn('No JSON', error_lines[0])
self.assertFalse(error_lines[1:])
def test_get_shard_ranges_not_a_list(self):
error_lines = self._check_get_shard_ranges_bad_data(json.dumps({}))
self.assertIn('Problem with listing response', error_lines[0])
self.assertIn('not a list', error_lines[0])
self.assertFalse(error_lines[1:])
def test_get_shard_ranges_key_missing(self):
error_lines = self._check_get_shard_ranges_bad_data(json.dumps([{}]))
self.assertIn('Failed to get shard ranges', error_lines[0])
self.assertIn('KeyError', error_lines[0])
self.assertFalse(error_lines[1:])
def test_get_shard_ranges_invalid_shard_range(self):
sr = ShardRange('a/c', Timestamp.now())
bad_sr_data = dict(sr, name='bad_name')
error_lines = self._check_get_shard_ranges_bad_data(
json.dumps([bad_sr_data]))
self.assertIn('Failed to get shard ranges', error_lines[0])
self.assertIn('ValueError', error_lines[0])
self.assertFalse(error_lines[1:])
def test_get_shard_ranges_missing_record_type(self):
base = Controller(self.app)
req = Request.blank('/v1/a/c/o', method='PUT')
sr = ShardRange('a/c', Timestamp.now())
body = json.dumps([dict(sr)])
with mocked_http_conn(
200, 200, body_iter=iter(['', body])):
actual = base._get_shard_ranges(req, 'a', 'c', '1_test')
self.assertIsNone(actual)
error_lines = self.app.logger.get_lines_for_level('error')
self.assertIn('Failed to get shard ranges', error_lines[0])
self.assertIn('unexpected record type', error_lines[0])
self.assertIn('/a/c', error_lines[0])
self.assertFalse(error_lines[1:])
def test_get_shard_ranges_wrong_record_type(self):
base = Controller(self.app)
req = Request.blank('/v1/a/c/o', method='PUT')
sr = ShardRange('a/c', Timestamp.now())
body = json.dumps([dict(sr)])
headers = {'X-Backend-Record-Type': 'object'}
with mocked_http_conn(
200, 200, body_iter=iter(['', body]),
headers=headers):
actual = base._get_shard_ranges(req, 'a', 'c', '1_test')
self.assertIsNone(actual)
error_lines = self.app.logger.get_lines_for_level('error')
self.assertIn('Failed to get shard ranges', error_lines[0])
self.assertIn('unexpected record type', error_lines[0])
self.assertIn('/a/c', error_lines[0])
self.assertFalse(error_lines[1:])
def test_get_shard_ranges_request_failed(self):
base = Controller(self.app)
req = Request.blank('/v1/a/c/o', method='PUT')
with mocked_http_conn(200, 404, 404, 404):
actual = base._get_shard_ranges(req, 'a', 'c', '1_test')
self.assertIsNone(actual)
self.assertFalse(self.app.logger.get_lines_for_level('error'))
warning_lines = self.app.logger.get_lines_for_level('warning')
self.assertIn('Failed to get container listing', warning_lines[0])
self.assertIn('/a/c', warning_lines[0])
self.assertFalse(warning_lines[1:])


@ -12,17 +12,24 @@
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import mock
import socket
import unittest
from eventlet import Timeout
from six.moves import urllib
from swift.common.constraints import CONTAINER_LISTING_LIMIT
from swift.common.swob import Request
from swift.common.utils import ShardRange, Timestamp
from swift.proxy import server as proxy_server
from swift.proxy.controllers.base import headers_to_container_info, Controller
from test.unit import fake_http_connect, FakeRing, FakeMemcache
from swift.proxy.controllers.base import headers_to_container_info, Controller, \
get_container_info
from test import annotate_failure
from test.unit import fake_http_connect, FakeRing, FakeMemcache, \
make_timestamp_iter
from swift.common.storage_policy import StoragePolicy
from swift.common.request_helpers import get_sys_meta_prefix
@ -72,6 +79,7 @@ class TestContainerController(TestRingBase):
new=FakeAccountInfoContainerController):
return _orig_get_controller(*args, **kwargs)
self.app.get_controller = wrapped_get_controller
self.ts_iter = make_timestamp_iter()
def _make_callback_func(self, context):
def callback(ipaddr, port, device, partition, method, path,
@ -151,6 +159,91 @@ class TestContainerController(TestRingBase):
for key in owner_headers:
self.assertIn(key, resp.headers)
def test_reseller_admin(self):
reseller_internal_headers = {
get_sys_meta_prefix('container') + 'sharding': 'True'}
reseller_external_headers = {'x-container-sharding': 'on'}
controller = proxy_server.ContainerController(self.app, 'a', 'c')
# Normal users, even swift owners, can't set it
req = Request.blank('/v1/a/c', method='PUT',
headers=reseller_external_headers,
environ={'swift_owner': True})
with mocked_http_conn(*[201] * self.CONTAINER_REPLICAS) as mock_conn:
resp = req.get_response(self.app)
self.assertEqual(2, resp.status_int // 100)
for key in reseller_internal_headers:
for captured in mock_conn.requests:
self.assertNotIn(key.title(), captured['headers'])
req = Request.blank('/v1/a/c', method='POST',
headers=reseller_external_headers,
environ={'swift_owner': True})
with mocked_http_conn(*[204] * self.CONTAINER_REPLICAS) as mock_conn:
resp = req.get_response(self.app)
self.assertEqual(2, resp.status_int // 100)
for key in reseller_internal_headers:
for captured in mock_conn.requests:
self.assertNotIn(key.title(), captured['headers'])
req = Request.blank('/v1/a/c', environ={'swift_owner': True})
# Heck, they don't even get to know
with mock.patch('swift.proxy.controllers.base.http_connect',
fake_http_connect(200, 200,
headers=reseller_internal_headers)):
resp = controller.HEAD(req)
self.assertEqual(2, resp.status_int // 100)
for key in reseller_external_headers:
self.assertNotIn(key, resp.headers)
with mock.patch('swift.proxy.controllers.base.http_connect',
fake_http_connect(200, 200,
headers=reseller_internal_headers)):
resp = controller.GET(req)
self.assertEqual(2, resp.status_int // 100)
for key in reseller_external_headers:
self.assertNotIn(key, resp.headers)
# But reseller admins can set it
req = Request.blank('/v1/a/c', method='PUT',
headers=reseller_external_headers,
environ={'reseller_request': True})
with mocked_http_conn(*[201] * self.CONTAINER_REPLICAS) as mock_conn:
resp = req.get_response(self.app)
self.assertEqual(2, resp.status_int // 100)
for key in reseller_internal_headers:
for captured in mock_conn.requests:
self.assertIn(key.title(), captured['headers'])
req = Request.blank('/v1/a/c', method='POST',
headers=reseller_external_headers,
environ={'reseller_request': True})
with mocked_http_conn(*[204] * self.CONTAINER_REPLICAS) as mock_conn:
resp = req.get_response(self.app)
self.assertEqual(2, resp.status_int // 100)
for key in reseller_internal_headers:
for captured in mock_conn.requests:
self.assertIn(key.title(), captured['headers'])
# And see that they have
req = Request.blank('/v1/a/c', environ={'reseller_request': True})
with mock.patch('swift.proxy.controllers.base.http_connect',
fake_http_connect(200, 200,
headers=reseller_internal_headers)):
resp = controller.HEAD(req)
self.assertEqual(2, resp.status_int // 100)
for key in reseller_external_headers:
self.assertIn(key, resp.headers)
self.assertEqual(resp.headers[key], 'True')
with mock.patch('swift.proxy.controllers.base.http_connect',
fake_http_connect(200, 200,
headers=reseller_internal_headers)):
resp = controller.GET(req)
self.assertEqual(2, resp.status_int // 100)
for key in reseller_external_headers:
self.assertEqual(resp.headers[key], 'True')
def test_sys_meta_headers_PUT(self):
# check that headers in sys meta namespace make it through
# the container controller
@ -329,6 +422,852 @@ class TestContainerController(TestRingBase):
]
self._assert_responses('POST', POST_TEST_CASES)
def _make_shard_objects(self, shard_range):
lower = ord(shard_range.lower[0]) if shard_range.lower else ord('@')
upper = ord(shard_range.upper[0]) if shard_range.upper else ord('z')
objects = [{'name': chr(i), 'bytes': i, 'hash': 'hash%s' % chr(i),
'content_type': 'text/plain', 'deleted': 0,
'last_modified': next(self.ts_iter).isoformat}
for i in range(lower + 1, upper + 1)]
return objects
def _check_GET_shard_listing(self, mock_responses, expected_objects,
expected_requests, query_string='',
reverse=False):
# mock_responses is a list of tuples (status, json body, headers)
# expected_objects is a list of dicts
# expected_requests is a list of tuples (path, hdrs dict, params dict)
# sanity check that expected_objects is name-ordered with no repeats
def name(obj):
return obj.get('name', obj.get('subdir'))
for (prev, next_) in zip(expected_objects, expected_objects[1:]):
if reverse:
self.assertGreater(name(prev), name(next_))
else:
self.assertLess(name(prev), name(next_))
container_path = '/v1/a/c' + query_string
codes = (resp[0] for resp in mock_responses)
bodies = iter([json.dumps(resp[1]) for resp in mock_responses])
exp_headers = [resp[2] for resp in mock_responses]
request = Request.blank(container_path)
with mocked_http_conn(
*codes, body_iter=bodies, headers=exp_headers) as fake_conn:
resp = request.get_response(self.app)
for backend_req in fake_conn.requests:
self.assertEqual(request.headers['X-Trans-Id'],
backend_req['headers']['X-Trans-Id'])
self.assertTrue(backend_req['headers']['User-Agent'].startswith(
'proxy-server'))
self.assertEqual(200, resp.status_int)
actual_objects = json.loads(resp.body)
self.assertEqual(len(expected_objects), len(actual_objects))
self.assertEqual(expected_objects, actual_objects)
self.assertEqual(len(expected_requests), len(fake_conn.requests))
for i, ((exp_path, exp_headers, exp_params), req) in enumerate(
zip(expected_requests, fake_conn.requests)):
with annotate_failure('Request check at index %d.' % i):
# strip off /sdx/0/ from path
self.assertEqual(exp_path, req['path'][7:])
self.assertEqual(
dict(exp_params, format='json'),
dict(urllib.parse.parse_qsl(req['qs'], True)))
for k, v in exp_headers.items():
self.assertIn(k, req['headers'])
self.assertEqual(v, req['headers'][k])
self.assertNotIn('X-Backend-Override-Delete', req['headers'])
return resp
def check_response(self, resp, root_resp_hdrs, expected_objects=None):
info_hdrs = dict(root_resp_hdrs)
if expected_objects is None:
# default is to expect whatever the root container sent
expected_obj_count = root_resp_hdrs['X-Container-Object-Count']
expected_bytes_used = root_resp_hdrs['X-Container-Bytes-Used']
else:
expected_bytes_used = sum([o['bytes'] for o in expected_objects])
expected_obj_count = len(expected_objects)
info_hdrs['X-Container-Bytes-Used'] = expected_bytes_used
info_hdrs['X-Container-Object-Count'] = expected_obj_count
self.assertEqual(expected_bytes_used,
int(resp.headers['X-Container-Bytes-Used']))
self.assertEqual(expected_obj_count,
int(resp.headers['X-Container-Object-Count']))
self.assertEqual('sharded', resp.headers['X-Backend-Sharding-State'])
for k, v in root_resp_hdrs.items():
if k.lower().startswith('x-container-meta'):
self.assertEqual(v, resp.headers[k])
# check that info cache is correct for root container
info = get_container_info(resp.request.environ, self.app)
self.assertEqual(headers_to_container_info(info_hdrs), info)
def test_GET_sharded_container(self):
shard_bounds = (('', 'ham'), ('ham', 'pie'), ('pie', ''))
shard_ranges = [
ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), lower, upper)
for lower, upper in shard_bounds]
sr_dicts = [dict(sr) for sr in shard_ranges]
sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges]
shard_resp_hdrs = [
{'X-Backend-Sharding-State': 'unsharded',
'X-Container-Object-Count': len(sr_objs[i]),
'X-Container-Bytes-Used':
sum([obj['bytes'] for obj in sr_objs[i]]),
'X-Container-Meta-Flavour': 'flavour%d' % i,
'X-Backend-Storage-Policy-Index': 0}
for i in range(3)]
all_objects = []
for objects in sr_objs:
all_objects.extend(objects)
size_all_objects = sum([obj['bytes'] for obj in all_objects])
num_all_objects = len(all_objects)
limit = CONTAINER_LISTING_LIMIT
expected_objects = all_objects
root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded',
# pretend root object stats are not yet updated
'X-Container-Object-Count': num_all_objects - 1,
'X-Container-Bytes-Used': size_all_objects - 1,
'X-Container-Meta-Flavour': 'peach',
'X-Backend-Storage-Policy-Index': 0}
root_shard_resp_hdrs = dict(root_resp_hdrs)
root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard'
# GET all objects
# include some failed responses
mock_responses = [
# status, body, headers
(404, '', {}),
(200, sr_dicts, root_shard_resp_hdrs),
(200, sr_objs[0], shard_resp_hdrs[0]),
(200, sr_objs[1], shard_resp_hdrs[1]),
(200, sr_objs[2], shard_resp_hdrs[2])
]
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing')), # 404
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing')), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='ham\x00', limit=str(limit),
states='listing')), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='pie\x00', states='listing',
limit=str(limit - len(sr_objs[0])))), # 200
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='p', end_marker='', states='listing',
limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200
]
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs,
expected_objects=expected_objects)
# GET all objects - sharding, final shard range points back to root
root_range = ShardRange('a/c', Timestamp.now(), 'pie', '')
mock_responses = [
# status, body, headers
(200, sr_dicts[:2] + [dict(root_range)], root_shard_resp_hdrs),
(200, sr_objs[0], shard_resp_hdrs[0]),
(200, sr_objs[1], shard_resp_hdrs[1]),
(200, sr_objs[2], root_resp_hdrs)
]
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing')), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='ham\x00', limit=str(limit),
states='listing')), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='pie\x00', states='listing',
limit=str(limit - len(sr_objs[0])))), # 200
(root_range.name, {'X-Backend-Record-Type': 'object'},
dict(marker='p', end_marker='',
limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200
]
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs,
expected_objects=expected_objects)
# GET all objects in reverse
mock_responses = [
# status, body, headers
(200, list(reversed(sr_dicts)), root_shard_resp_hdrs),
(200, list(reversed(sr_objs[2])), shard_resp_hdrs[2]),
(200, list(reversed(sr_objs[1])), shard_resp_hdrs[1]),
(200, list(reversed(sr_objs[0])), shard_resp_hdrs[0]),
]
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing', reverse='true')),
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='pie', reverse='true',
limit=str(limit), states='listing')), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='q', end_marker='ham', states='listing',
reverse='true', limit=str(limit - len(sr_objs[2])))), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='i', end_marker='', states='listing', reverse='true',
limit=str(limit - len(sr_objs[2] + sr_objs[1])))), # 200
]
resp = self._check_GET_shard_listing(
mock_responses, list(reversed(expected_objects)),
expected_requests, query_string='?reverse=true', reverse=True)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs,
expected_objects=expected_objects)
# GET with limit param
limit = len(sr_objs[0]) + len(sr_objs[1]) + 1
expected_objects = all_objects[:limit]
mock_responses = [
(404, '', {}),
(200, sr_dicts, root_shard_resp_hdrs),
(200, sr_objs[0], shard_resp_hdrs[0]),
(200, sr_objs[1], shard_resp_hdrs[1]),
(200, sr_objs[2][:1], shard_resp_hdrs[2])
]
expected_requests = [
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(limit=str(limit), states='listing')), # 404
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(limit=str(limit), states='listing')), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, # 200
dict(marker='', end_marker='ham\x00', states='listing',
limit=str(limit))),
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200
dict(marker='h', end_marker='pie\x00', states='listing',
limit=str(limit - len(sr_objs[0])))),
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, # 200
dict(marker='p', end_marker='', states='listing',
limit=str(limit - len(sr_objs[0] + sr_objs[1]))))
]
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests,
query_string='?limit=%s' % limit)
self.check_response(resp, root_resp_hdrs)
# GET with marker
marker = sr_objs[1][2]['name']
first_included = len(sr_objs[0]) + 2
limit = CONTAINER_LISTING_LIMIT
expected_objects = all_objects[first_included:]
mock_responses = [
(404, '', {}),
(200, sr_dicts[1:], root_shard_resp_hdrs),
(404, '', {}),
(200, sr_objs[1][2:], shard_resp_hdrs[1]),
(200, sr_objs[2], shard_resp_hdrs[2])
]
expected_requests = [
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(marker=marker, states='listing')), # 404
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(marker=marker, states='listing')), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 404
dict(marker=marker, end_marker='pie\x00', states='listing',
limit=str(limit))),
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200
dict(marker=marker, end_marker='pie\x00', states='listing',
limit=str(limit))),
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'}, # 200
dict(marker='p', end_marker='', states='listing',
limit=str(limit - len(sr_objs[1][2:])))),
]
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests,
query_string='?marker=%s' % marker)
self.check_response(resp, root_resp_hdrs)
# GET with end marker
end_marker = sr_objs[1][6]['name']
first_excluded = len(sr_objs[0]) + 6
expected_objects = all_objects[:first_excluded]
mock_responses = [
(404, '', {}),
(200, sr_dicts[:2], root_shard_resp_hdrs),
(200, sr_objs[0], shard_resp_hdrs[0]),
(404, '', {}),
(200, sr_objs[1][:6], shard_resp_hdrs[1])
]
expected_requests = [
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(end_marker=end_marker, states='listing')), # 404
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(end_marker=end_marker, states='listing')), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'}, # 200
dict(marker='', end_marker='ham\x00', states='listing',
limit=str(limit))),
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 404
dict(marker='h', end_marker=end_marker, states='listing',
limit=str(limit - len(sr_objs[0])))),
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200
dict(marker='h', end_marker=end_marker, states='listing',
limit=str(limit - len(sr_objs[0])))),
]
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests,
query_string='?end_marker=%s' % end_marker)
self.check_response(resp, root_resp_hdrs)
# marker and end_marker and limit
limit = 2
expected_objects = all_objects[first_included:first_excluded]
mock_responses = [
(200, sr_dicts[1:2], root_shard_resp_hdrs),
(200, sr_objs[1][2:6], shard_resp_hdrs[1])
]
expected_requests = [
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing', limit=str(limit),
marker=marker, end_marker=end_marker)), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200
dict(marker=marker, end_marker=end_marker, states='listing',
limit=str(limit))),
]
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests,
query_string='?marker=%s&end_marker=%s&limit=%s'
% (marker, end_marker, limit))
self.check_response(resp, root_resp_hdrs)
# reverse with marker, end_marker
expected_objects.reverse()
mock_responses = [
(200, sr_dicts[1:2], root_shard_resp_hdrs),
(200, list(reversed(sr_objs[1][2:6])), shard_resp_hdrs[1])
]
expected_requests = [
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(marker=end_marker, reverse='true', end_marker=marker,
limit=str(limit), states='listing',)), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'}, # 200
dict(marker=end_marker, end_marker=marker, states='listing',
limit=str(limit), reverse='true')),
]
self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests,
query_string='?marker=%s&end_marker=%s&limit=%s&reverse=true'
% (end_marker, marker, limit), reverse=True)
self.check_response(resp, root_resp_hdrs)
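# A compact sketch of the per-shard parameters the expected_requests above
# encode (hypothetical helper): each forward shard GET carries the last
# name listed so far as marker, the shard's upper bound (null-suffixed) as
# end_marker, and the limit reduced by the objects already listed.
def shard_listing_params(last_name, shard_upper, limit, listed_so_far):
    return dict(marker=last_name,
                end_marker=shard_upper + '\x00' if shard_upper else '',
                limit=str(limit - listed_so_far),
                states='listing')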
def test_GET_sharded_container_with_delimiter(self):
shard_bounds = (('', 'ham'), ('ham', 'pie'), ('pie', ''))
shard_ranges = [
ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), lower, upper)
for lower, upper in shard_bounds]
sr_dicts = [dict(sr) for sr in shard_ranges]
shard_resp_hdrs = {'X-Backend-Sharding-State': 'unsharded',
'X-Container-Object-Count': 2,
'X-Container-Bytes-Used': 4,
'X-Backend-Storage-Policy-Index': 0}
limit = CONTAINER_LISTING_LIMIT
root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded',
# pretend root object stats are not yet updated
'X-Container-Object-Count': 6,
'X-Container-Bytes-Used': 12,
'X-Backend-Storage-Policy-Index': 0}
root_shard_resp_hdrs = dict(root_resp_hdrs)
root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard'
sr_0_obj = {'name': 'apple',
'bytes': 1,
'hash': 'hash',
'content_type': 'text/plain',
'deleted': 0,
'last_modified': next(self.ts_iter).isoformat}
sr_2_obj = {'name': 'pumpkin',
'bytes': 1,
'hash': 'hash',
'content_type': 'text/plain',
'deleted': 0,
'last_modified': next(self.ts_iter).isoformat}
subdir = {'subdir': 'ha/'}
mock_responses = [
# status, body, headers
(200, sr_dicts, root_shard_resp_hdrs),
(200, [sr_0_obj, subdir], shard_resp_hdrs),
(200, [], shard_resp_hdrs),
(200, [sr_2_obj], shard_resp_hdrs)
]
expected_requests = [
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing', delimiter='/')), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='ham\x00', limit=str(limit),
states='listing', delimiter='/')), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='ha/', end_marker='pie\x00', states='listing',
limit=str(limit - 2), delimiter='/')), # 200
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='ha/', end_marker='', states='listing',
limit=str(limit - 2), delimiter='/')) # 200
]
expected_objects = [sr_0_obj, subdir, sr_2_obj]
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests,
query_string='?delimiter=/')
self.check_response(resp, root_resp_hdrs)
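Note how the rolled-up subdir entry drives the follow-up requests: it consumes one slot of the limit just as an object would, and the next shard is queried with marker='ha/' so nothing under the collapsed prefix is listed twice. Illustratively (a toy, not proxy code):

# the last entry in a delimiter listing may be a subdir placeholder;
# either way its value becomes the marker for the next shard request
listing = [{'name': 'apple'}, {'subdir': 'ha/'}]
last = listing[-1]
next_marker = last.get('name') or last.get('subdir')
assert next_marker == 'ha/'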
def test_GET_sharded_container_overlapping_shards(self):
# verify ordered listing even if unexpected overlapping shard ranges
shard_bounds = (('', 'ham', ShardRange.CLEAVED),
('', 'pie', ShardRange.ACTIVE),
('lemon', '', ShardRange.ACTIVE))
shard_ranges = [
ShardRange('.shards_a/c_' + upper, Timestamp.now(), lower, upper,
state=state)
for lower, upper, state in shard_bounds]
sr_dicts = [dict(sr) for sr in shard_ranges]
sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges]
shard_resp_hdrs = [
{'X-Backend-Sharding-State': 'unsharded',
'X-Container-Object-Count': len(sr_objs[i]),
'X-Container-Bytes-Used':
sum([obj['bytes'] for obj in sr_objs[i]]),
'X-Container-Meta-Flavour': 'flavour%d' % i,
'X-Backend-Storage-Policy-Index': 0}
for i in range(3)]
all_objects = []
for objects in sr_objs:
all_objects.extend(objects)
size_all_objects = sum([obj['bytes'] for obj in all_objects])
num_all_objects = len(all_objects)
limit = CONTAINER_LISTING_LIMIT
root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded',
# pretend root object stats are not yet updated
'X-Container-Object-Count': num_all_objects - 1,
'X-Container-Bytes-Used': size_all_objects - 1,
'X-Container-Meta-Flavour': 'peach',
'X-Backend-Storage-Policy-Index': 0}
root_shard_resp_hdrs = dict(root_resp_hdrs)
root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard'
# forwards listing
# expect subset of second shard range
objs_1 = [o for o in sr_objs[1] if o['name'] > sr_objs[0][-1]['name']]
# expect subset of third shard range
objs_2 = [o for o in sr_objs[2] if o['name'] > sr_objs[1][-1]['name']]
mock_responses = [
# status, body, headers
(200, sr_dicts, root_shard_resp_hdrs),
(200, sr_objs[0], shard_resp_hdrs[0]),
(200, objs_1, shard_resp_hdrs[1]),
(200, objs_2, shard_resp_hdrs[2])
]
# NB marker always advances to last object name
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing')), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='ham\x00', states='listing',
limit=str(limit))), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='pie\x00', states='listing',
limit=str(limit - len(sr_objs[0])))), # 200
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='p', end_marker='', states='listing',
limit=str(limit - len(sr_objs[0] + objs_1)))) # 200
]
expected_objects = sr_objs[0] + objs_1 + objs_2
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs,
expected_objects=expected_objects)
# reverse listing
# expect subset of third shard range
objs_0 = [o for o in sr_objs[0] if o['name'] < sr_objs[1][0]['name']]
# expect subset of second shard range
objs_1 = [o for o in sr_objs[1] if o['name'] < sr_objs[2][0]['name']]
mock_responses = [
# status, body, headers
(200, list(reversed(sr_dicts)), root_shard_resp_hdrs),
(200, list(reversed(sr_objs[2])), shard_resp_hdrs[2]),
(200, list(reversed(objs_1)), shard_resp_hdrs[1]),
(200, list(reversed(objs_0)), shard_resp_hdrs[0]),
]
# NB marker always advances to last object name
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing', reverse='true')), # 200
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='lemon', states='listing',
limit=str(limit),
reverse='true')), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='m', end_marker='', reverse='true', states='listing',
limit=str(limit - len(sr_objs[2])))), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='A', end_marker='', reverse='true', states='listing',
limit=str(limit - len(sr_objs[2] + objs_1)))) # 200
]
expected_objects = list(reversed(objs_0 + objs_1 + sr_objs[2]))
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests,
query_string='?reverse=true', reverse=True)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs,
expected_objects=expected_objects)
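The proxy never de-duplicates overlapping listings itself: advancing the marker past the last name already returned makes each shard exclude everything below that point. The objs_1 and objs_2 filters above are the client-side spelling of that exclusion; as a sketch:

def exclude_already_listed(shard_objs, last_listed_name):
    # the same cut that the ?marker= parameter achieves server-side
    return [o for o in shard_objs if o['name'] > last_listed_name]

assert exclude_already_listed([{'name': 'hazelnut'}, {'name': 'kiwi'}],
                              'kale') == [{'name': 'kiwi'}]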
def test_GET_sharded_container_gap_in_shards(self):
# verify ordered listing even if unexpected gap between shard ranges
shard_bounds = (('', 'ham'), ('onion', 'pie'), ('rhubarb', ''))
shard_ranges = [
ShardRange('.shards_a/c_' + upper, Timestamp.now(), lower, upper)
for lower, upper in shard_bounds]
sr_dicts = [dict(sr) for sr in shard_ranges]
sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges]
shard_resp_hdrs = [
{'X-Backend-Sharding-State': 'unsharded',
'X-Container-Object-Count': len(sr_objs[i]),
'X-Container-Bytes-Used':
sum([obj['bytes'] for obj in sr_objs[i]]),
'X-Container-Meta-Flavour': 'flavour%d' % i,
'X-Backend-Storage-Policy-Index': 0}
for i in range(3)]
all_objects = []
for objects in sr_objs:
all_objects.extend(objects)
size_all_objects = sum([obj['bytes'] for obj in all_objects])
num_all_objects = len(all_objects)
limit = CONTAINER_LISTING_LIMIT
root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded',
'X-Container-Object-Count': num_all_objects,
'X-Container-Bytes-Used': size_all_objects,
'X-Container-Meta-Flavour': 'peach',
'X-Backend-Storage-Policy-Index': 0}
root_shard_resp_hdrs = dict(root_resp_hdrs)
root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard'
mock_responses = [
# status, body, headers
(200, sr_dicts, root_shard_resp_hdrs),
(200, sr_objs[0], shard_resp_hdrs[0]),
(200, sr_objs[1], shard_resp_hdrs[1]),
(200, sr_objs[2], shard_resp_hdrs[2])
]
# NB marker always advances to last object name
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing')), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='ham\x00', states='listing',
limit=str(limit))), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='pie\x00', states='listing',
limit=str(limit - len(sr_objs[0])))), # 200
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='p', end_marker='', states='listing',
limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200
]
resp = self._check_GET_shard_listing(
mock_responses, all_objects, expected_requests)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs)

def test_GET_sharded_container_empty_shard(self):
# verify ordered listing when a shard is empty
shard_bounds = (('', 'ham'), ('ham', 'pie'), ('lemon', ''))
shard_ranges = [
ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), lower, upper)
for lower, upper in shard_bounds]
sr_dicts = [dict(sr) for sr in shard_ranges]
sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges]
# empty second shard range
sr_objs[1] = []
shard_resp_hdrs = [
{'X-Backend-Sharding-State': 'unsharded',
'X-Container-Object-Count': len(sr_objs[i]),
'X-Container-Bytes-Used':
sum([obj['bytes'] for obj in sr_objs[i]]),
'X-Container-Meta-Flavour': 'flavour%d' % i,
'X-Backend-Storage-Policy-Index': 0}
for i in range(3)]
all_objects = []
for objects in sr_objs:
all_objects.extend(objects)
size_all_objects = sum([obj['bytes'] for obj in all_objects])
num_all_objects = len(all_objects)
limit = CONTAINER_LISTING_LIMIT
root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded',
'X-Container-Object-Count': num_all_objects,
'X-Container-Bytes-Used': size_all_objects,
'X-Container-Meta-Flavour': 'peach',
'X-Backend-Storage-Policy-Index': 0}
root_shard_resp_hdrs = dict(root_resp_hdrs)
root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard'
mock_responses = [
# status, body, headers
(200, sr_dicts, root_shard_resp_hdrs),
(200, sr_objs[0], shard_resp_hdrs[0]),
(200, sr_objs[1], shard_resp_hdrs[1]),
(200, sr_objs[2], shard_resp_hdrs[2])
]
# NB marker always advances to last object name
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing')), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='ham\x00', states='listing',
limit=str(limit))), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='pie\x00', states='listing',
limit=str(limit - len(sr_objs[0])))), # 200
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='', states='listing',
limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200
]
resp = self._check_GET_shard_listing(
mock_responses, all_objects, expected_requests)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs)
# marker in empty second range
mock_responses = [
# status, body, headers
(200, sr_dicts[1:], root_shard_resp_hdrs),
(200, sr_objs[1], shard_resp_hdrs[1]),
(200, sr_objs[2], shard_resp_hdrs[2])
]
# NB marker unchanged when getting from third range
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing', marker='koolaid')), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='koolaid', end_marker='pie\x00', states='listing',
limit=str(limit))), # 200
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='koolaid', end_marker='', states='listing',
limit=str(limit))) # 200
]
resp = self._check_GET_shard_listing(
mock_responses, sr_objs[2], expected_requests,
query_string='?marker=koolaid')
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs)
# marker in empty second range, reverse
mock_responses = [
# status, body, headers
(200, list(reversed(sr_dicts[:2])), root_shard_resp_hdrs),
(200, list(reversed(sr_objs[1])), shard_resp_hdrs[1]),
(200, list(reversed(sr_objs[0])), shard_resp_hdrs[0])
]
# NB marker unchanged when getting from first range
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing', marker='koolaid', reverse='true')), # 200
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='koolaid', end_marker='ham', reverse='true',
states='listing', limit=str(limit))), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='koolaid', end_marker='', reverse='true',
states='listing', limit=str(limit))) # 200
]
resp = self._check_GET_shard_listing(
mock_responses, list(reversed(sr_objs[0])), expected_requests,
query_string='?marker=koolaid&reverse=true', reverse=True)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs)

def _check_GET_sharded_container_shard_error(self, error):
# verify ordered listing even when a shard returns an error
shard_bounds = (('', 'ham'), ('ham', 'pie'), ('lemon', ''))
shard_ranges = [
ShardRange('.shards_a/c_%s' % upper, Timestamp.now(), lower, upper)
for lower, upper in shard_bounds]
sr_dicts = [dict(sr) for sr in shard_ranges]
sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges]
# empty second shard range
sr_objs[1] = []
shard_resp_hdrs = [
{'X-Backend-Sharding-State': 'unsharded',
'X-Container-Object-Count': len(sr_objs[i]),
'X-Container-Bytes-Used':
sum([obj['bytes'] for obj in sr_objs[i]]),
'X-Container-Meta-Flavour': 'flavour%d' % i,
'X-Backend-Storage-Policy-Index': 0}
for i in range(3)]
all_objects = []
for objects in sr_objs:
all_objects.extend(objects)
size_all_objects = sum([obj['bytes'] for obj in all_objects])
num_all_objects = len(all_objects)
limit = CONTAINER_LISTING_LIMIT
root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded',
'X-Container-Object-Count': num_all_objects,
'X-Container-Bytes-Used': size_all_objects,
'X-Container-Meta-Flavour': 'peach',
'X-Backend-Storage-Policy-Index': 0}
root_shard_resp_hdrs = dict(root_resp_hdrs)
root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard'
mock_responses = [
# status, body, headers
(200, sr_dicts, root_shard_resp_hdrs),
(200, sr_objs[0], shard_resp_hdrs[0])] + \
[(error, [], {})] * 2 * self.CONTAINER_REPLICAS + \
[(200, sr_objs[2], shard_resp_hdrs[2])]
# NB marker always advances to last object name
expected_requests = [
# path, headers, params
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing')), # 200
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='ham\x00', states='listing',
limit=str(limit)))] \
+ [(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='pie\x00', states='listing',
limit=str(limit - len(sr_objs[0]))))
] * 2 * self.CONTAINER_REPLICAS \
+ [(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='', states='listing',
limit=str(limit - len(sr_objs[0] + sr_objs[1]))))]
resp = self._check_GET_shard_listing(
mock_responses, all_objects, expected_requests)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs)

def test_GET_sharded_container_shard_errors(self):
self._check_GET_sharded_container_shard_error(404)
self._check_GET_sharded_container_shard_error(500)
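The factor of 2 * self.CONTAINER_REPLICAS mirrors the proxy's default node-iteration budget: with request_node_count left at its default of 2 * replicas, a shard GET is tried on the primaries plus an equal number of handoffs before the proxy gives up on that range and moves on with its marker unchanged. Here the failing shard happens to be empty, so the final listing is still complete. The arithmetic assumed above, spelled out:

# assumption: request_node_count is left at Swift's default of 2 * replicas
replicas = 3
attempts_per_shard_range = 2 * replicas  # primaries + handoffs
assert attempts_per_shard_range == 6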
def test_GET_sharded_container_sharding_shard(self):
# one shard is in process of sharding
shard_bounds = (('', 'ham'), ('ham', 'pie'), ('pie', ''))
shard_ranges = [
ShardRange('.shards_a/c_' + upper, Timestamp.now(), lower, upper)
for lower, upper in shard_bounds]
sr_dicts = [dict(sr) for sr in shard_ranges]
sr_objs = [self._make_shard_objects(sr) for sr in shard_ranges]
shard_resp_hdrs = [
{'X-Backend-Sharding-State': 'unsharded',
'X-Container-Object-Count': len(sr_objs[i]),
'X-Container-Bytes-Used':
sum([obj['bytes'] for obj in sr_objs[i]]),
'X-Container-Meta-Flavour': 'flavour%d' % i,
'X-Backend-Storage-Policy-Index': 0}
for i in range(3)]
shard_1_shard_resp_hdrs = dict(shard_resp_hdrs[1])
shard_1_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard'
# second shard is sharding and has cleaved two out of three sub shards
shard_resp_hdrs[1]['X-Backend-Sharding-State'] = 'sharding'
sub_shard_bounds = (('ham', 'juice'), ('juice', 'lemon'))
sub_shard_ranges = [
ShardRange('a/c_sub_' + upper, Timestamp.now(), lower, upper)
for lower, upper in sub_shard_bounds]
sub_sr_dicts = [dict(sr) for sr in sub_shard_ranges]
sub_sr_objs = [self._make_shard_objects(sr) for sr in sub_shard_ranges]
sub_shard_resp_hdrs = [
{'X-Backend-Sharding-State': 'unsharded',
'X-Container-Object-Count': len(sub_sr_objs[i]),
'X-Container-Bytes-Used':
sum([obj['bytes'] for obj in sub_sr_objs[i]]),
'X-Container-Meta-Flavour': 'flavour%d' % i,
'X-Backend-Storage-Policy-Index': 0}
for i in range(2)]
all_objects = []
for objects in sr_objs:
all_objects.extend(objects)
size_all_objects = sum([obj['bytes'] for obj in all_objects])
num_all_objects = len(all_objects)
limit = CONTAINER_LISTING_LIMIT
root_resp_hdrs = {'X-Backend-Sharding-State': 'sharded',
'X-Container-Object-Count': num_all_objects,
'X-Container-Bytes-Used': size_all_objects,
'X-Container-Meta-Flavour': 'peach',
'X-Backend-Storage-Policy-Index': 0}
root_shard_resp_hdrs = dict(root_resp_hdrs)
root_shard_resp_hdrs['X-Backend-Record-Type'] = 'shard'
mock_responses = [
# status, body, headers
(200, sr_dicts, root_shard_resp_hdrs),
(200, sr_objs[0], shard_resp_hdrs[0]),
(200, sub_sr_dicts + [sr_dicts[1]], shard_1_shard_resp_hdrs),
(200, sub_sr_objs[0], sub_shard_resp_hdrs[0]),
(200, sub_sr_objs[1], sub_shard_resp_hdrs[1]),
(200, sr_objs[1][len(sub_sr_objs[0] + sub_sr_objs[1]):],
shard_resp_hdrs[1]),
(200, sr_objs[2], shard_resp_hdrs[2])
]
# NB marker always advances to last object name
expected_requests = [
# get root shard ranges
('a/c', {'X-Backend-Record-Type': 'auto'},
dict(states='listing')), # 200
# get first shard objects
(shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='', end_marker='ham\x00', states='listing',
limit=str(limit))), # 200
# get second shard sub-shard ranges
(shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='pie\x00', states='listing',
limit=str(limit - len(sr_objs[0])))),
# get first sub-shard objects
(sub_shard_ranges[0].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='h', end_marker='juice\x00', states='listing',
limit=str(limit - len(sr_objs[0])))),
# get second sub-shard objects
(sub_shard_ranges[1].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='j', end_marker='lemon\x00', states='listing',
limit=str(limit - len(sr_objs[0] + sub_sr_objs[0])))),
# get remainder of second shard objects
(shard_ranges[1].name, {'X-Backend-Record-Type': 'object'},
dict(marker='l', end_marker='pie\x00',
limit=str(limit - len(sr_objs[0] + sub_sr_objs[0] +
sub_sr_objs[1])))), # 200
# get third shard objects
(shard_ranges[2].name, {'X-Backend-Record-Type': 'auto'},
dict(marker='p', end_marker='', states='listing',
limit=str(limit - len(sr_objs[0] + sr_objs[1])))) # 200
]
expected_objects = (
sr_objs[0] + sub_sr_objs[0] + sub_sr_objs[1] +
sr_objs[1][len(sub_sr_objs[0] + sub_sr_objs[1]):] + sr_objs[2])
resp = self._check_GET_shard_listing(
mock_responses, expected_objects, expected_requests)
# root object count will be overridden by actual length of listing
self.check_response(resp, root_resp_hdrs)
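The request sequence above traces a recursive walk: a GET sent with X-Backend-Record-Type: auto that comes back flagged as record-type 'shard' delivers sub-shard ranges instead of objects, so the proxy descends into those, then asks the sharding shard for whatever it has not yet cleaved via an explicit record-type of 'object'. A self-contained toy of that shape (a stand-in, not Swift's implementation):

def list_container(store, name):
    rec_type, payload = store[name]  # what an 'auto' GET returns
    if rec_type == 'shard':
        # the shard is itself sharding: walk its sub-shard ranges first,
        # then fetch its uncleaved remainder (the record-type='object'
        # request in the expectations above)
        objs = []
        for sub_name in payload['sub_shards']:
            objs.extend(list_container(store, sub_name))
        objs.extend(payload['own_objects'])
        return objs
    return payload  # plain object listing

store = {
    '.shards_a/c_pie': ('shard', {'sub_shards': ['a/c_sub_juice',
                                                 'a/c_sub_lemon'],
                                  'own_objects': ['m', 'n']}),
    'a/c_sub_juice': ('object', ['i', 'j']),
    'a/c_sub_lemon': ('object', ['k', 'l']),
}
assert list_container(store, '.shards_a/c_pie') == ['i', 'j', 'k', 'l', 'm', 'n']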
@patch_policies(
[StoragePolicy(0, 'zero', True, object_ring=FakeRing(replicas=4))])
View File
@@ -47,7 +47,7 @@ from eventlet.green import httplib
from six import BytesIO
from six import StringIO
from six.moves import range
from six.moves.urllib.parse import quote
from six.moves.urllib.parse import quote, parse_qsl
from test import listen_zero
from test.unit import (
@@ -3222,95 +3222,197 @@ class TestReplicatedObjectController(
# reset the router post patch_policies
self.app.obj_controller_router = proxy_server.ObjectControllerRouter()
self.app.sort_nodes = lambda nodes, *args, **kwargs: nodes
backend_requests = []
def capture_requests(ip, port, method, path, headers, *args,
**kwargs):
backend_requests.append((method, path, headers))
def do_test(resp_headers):
self.app.memcache.store = {}
backend_requests = []
req = Request.blank('/v1/a/c/o', {}, method='POST',
headers={'X-Object-Meta-Color': 'Blue',
'Content-Type': 'text/plain'})
def capture_requests(ip, port, method, path, headers, *args,
**kwargs):
backend_requests.append((method, path, headers))
# we want the container_info response to say a policy index of 1
resp_headers = {'X-Backend-Storage-Policy-Index': 1}
with mocked_http_conn(
200, 200, 202, 202, 202,
headers=resp_headers, give_connect=capture_requests
) as fake_conn:
resp = req.get_response(self.app)
self.assertRaises(StopIteration, fake_conn.code_iter.next)
req = Request.blank('/v1/a/c/o', {}, method='POST',
headers={'X-Object-Meta-Color': 'Blue',
'Content-Type': 'text/plain'})
self.assertEqual(resp.status_int, 202)
self.assertEqual(len(backend_requests), 5)
# we want the container_info response to say a policy index of 1
with mocked_http_conn(
200, 200, 202, 202, 202,
headers=resp_headers, give_connect=capture_requests
) as fake_conn:
resp = req.get_response(self.app)
self.assertRaises(StopIteration, fake_conn.code_iter.next)
def check_request(req, method, path, headers=None):
req_method, req_path, req_headers = req
self.assertEqual(method, req_method)
# caller can ignore leading path parts
self.assertTrue(req_path.endswith(path),
'expected path to end with %s, it was %s' % (
path, req_path))
headers = headers or {}
# caller can ignore some headers
for k, v in headers.items():
self.assertEqual(req_headers[k], v)
account_request = backend_requests.pop(0)
check_request(account_request, method='HEAD', path='/sda/0/a')
container_request = backend_requests.pop(0)
check_request(container_request, method='HEAD', path='/sda/0/a/c')
# make sure backend requests included expected container headers
container_headers = {}
for request in backend_requests:
req_headers = request[2]
device = req_headers['x-container-device']
host = req_headers['x-container-host']
container_headers[device] = host
expectations = {
'method': 'POST',
'path': '/0/a/c/o',
'headers': {
'X-Container-Partition': '0',
'Connection': 'close',
'User-Agent': 'proxy-server %s' % os.getpid(),
'Host': 'localhost:80',
'Referer': 'POST http://localhost/v1/a/c/o',
'X-Object-Meta-Color': 'Blue',
'X-Backend-Storage-Policy-Index': '1'
},
}
check_request(request, **expectations)
self.assertEqual(resp.status_int, 202)
self.assertEqual(len(backend_requests), 5)
expected = {}
for i, device in enumerate(['sda', 'sdb', 'sdc']):
expected[device] = '10.0.0.%d:100%d' % (i, i)
self.assertEqual(container_headers, expected)
def check_request(req, method, path, headers=None):
req_method, req_path, req_headers = req
self.assertEqual(method, req_method)
# caller can ignore leading path parts
self.assertTrue(req_path.endswith(path),
'expected path to end with %s, it was %s' % (
path, req_path))
headers = headers or {}
# caller can ignore some headers
for k, v in headers.items():
self.assertEqual(req_headers[k], v)
self.assertNotIn('X-Backend-Container-Path', req_headers)
# and again with policy override
self.app.memcache.store = {}
backend_requests = []
req = Request.blank('/v1/a/c/o', {}, method='POST',
headers={'X-Object-Meta-Color': 'Blue',
'Content-Type': 'text/plain',
'X-Backend-Storage-Policy-Index': 0})
with mocked_http_conn(
200, 200, 202, 202, 202,
headers=resp_headers, give_connect=capture_requests
) as fake_conn:
resp = req.get_response(self.app)
self.assertRaises(StopIteration, fake_conn.code_iter.next)
self.assertEqual(resp.status_int, 202)
self.assertEqual(len(backend_requests), 5)
for request in backend_requests[2:]:
expectations = {
'method': 'POST',
'path': '/0/a/c/o', # ignore device bit
'headers': {
'X-Object-Meta-Color': 'Blue',
'X-Backend-Storage-Policy-Index': '0',
account_request = backend_requests.pop(0)
check_request(account_request, method='HEAD', path='/sda/0/a')
container_request = backend_requests.pop(0)
check_request(container_request, method='HEAD', path='/sda/0/a/c')
# make sure backend requests included expected container headers
container_headers = {}
for request in backend_requests:
req_headers = request[2]
device = req_headers['x-container-device']
host = req_headers['x-container-host']
container_headers[device] = host
expectations = {
'method': 'POST',
'path': '/0/a/c/o',
'headers': {
'X-Container-Partition': '0',
'Connection': 'close',
'User-Agent': 'proxy-server %s' % os.getpid(),
'Host': 'localhost:80',
'Referer': 'POST http://localhost/v1/a/c/o',
'X-Object-Meta-Color': 'Blue',
'X-Backend-Storage-Policy-Index': '1'
},
}
}
check_request(request, **expectations)
check_request(request, **expectations)
expected = {}
for i, device in enumerate(['sda', 'sdb', 'sdc']):
expected[device] = '10.0.0.%d:100%d' % (i, i)
self.assertEqual(container_headers, expected)
# and again with policy override
self.app.memcache.store = {}
backend_requests = []
req = Request.blank('/v1/a/c/o', {}, method='POST',
headers={'X-Object-Meta-Color': 'Blue',
'Content-Type': 'text/plain',
'X-Backend-Storage-Policy-Index': 0})
with mocked_http_conn(
200, 200, 202, 202, 202,
headers=resp_headers, give_connect=capture_requests
) as fake_conn:
resp = req.get_response(self.app)
self.assertRaises(StopIteration, fake_conn.code_iter.next)
self.assertEqual(resp.status_int, 202)
self.assertEqual(len(backend_requests), 5)
for request in backend_requests[2:]:
expectations = {
'method': 'POST',
'path': '/0/a/c/o', # ignore device bit
'headers': {
'X-Object-Meta-Color': 'Blue',
'X-Backend-Storage-Policy-Index': '0',
}
}
check_request(request, **expectations)
resp_headers = {'X-Backend-Storage-Policy-Index': 1}
do_test(resp_headers)
resp_headers['X-Backend-Sharding-State'] = 'unsharded'
do_test(resp_headers)

@patch_policies([
StoragePolicy(0, 'zero', is_default=True, object_ring=FakeRing()),
StoragePolicy(1, 'one', object_ring=FakeRing()),
])
def test_backend_headers_update_shard_container(self):
# verify that when a container is sharding or sharded the backend
# container update is directed to the shard container
# reset the router post patch_policies
self.app.obj_controller_router = proxy_server.ObjectControllerRouter()
self.app.sort_nodes = lambda nodes, *args, **kwargs: nodes

def do_test(method, sharding_state):
self.app.memcache.store = {}
req = Request.blank('/v1/a/c/o', {}, method=method, body='',
headers={'Content-Type': 'text/plain'})
# we want the container_info response to say a policy index of 1 and
# the given sharding state
# acc HEAD, cont HEAD, cont shard GET, obj POSTs
status_codes = (200, 200, 200, 202, 202, 202)
resp_headers = {'X-Backend-Storage-Policy-Index': 1,
'x-backend-sharding-state': sharding_state,
'X-Backend-Record-Type': 'shard'}
shard_range = utils.ShardRange(
'.shards_a/c_shard', utils.Timestamp.now(), 'l', 'u')
body = json.dumps([dict(shard_range)])
with mocked_http_conn(*status_codes, headers=resp_headers,
body=body) as fake_conn:
resp = req.get_response(self.app)
self.assertEqual(resp.status_int, 202)
backend_requests = fake_conn.requests

def check_request(req, method, path, headers=None, params=None):
self.assertEqual(method, req['method'])
# caller can ignore leading path parts
self.assertTrue(req['path'].endswith(path),
'expected path to end with %s, it was %s' % (
path, req['path']))
headers = headers or {}
# caller can ignore some headers
for k, v in headers.items():
self.assertEqual(req['headers'][k], v,
'Expected %s but got %s for key %s' %
(v, req['headers'][k], k))
params = params or {}
req_params = dict(parse_qsl(req['qs'])) if req['qs'] else {}
for k, v in params.items():
self.assertEqual(req_params[k], v,
'Expected %s but got %s for key %s' %
(v, req_params[k], k))
account_request = backend_requests[0]
check_request(account_request, method='HEAD', path='/sda/0/a')
container_request = backend_requests[1]
check_request(container_request, method='HEAD', path='/sda/0/a/c')
container_request_shard = backend_requests[2]
check_request(
container_request_shard, method='GET', path='/sda/0/a/c',
params={'includes': 'o'})
# make sure backend requests included expected container headers
container_headers = {}
for request in backend_requests[3:]:
req_headers = request['headers']
device = req_headers['x-container-device']
container_headers[device] = req_headers['x-container-host']
expectations = {
'method': method,
'path': '/0/a/c/o',
'headers': {
'X-Container-Partition': '0',
'Host': 'localhost:80',
'Referer': '%s http://localhost/v1/a/c/o' % method,
'X-Backend-Storage-Policy-Index': '1',
'X-Backend-Container-Path': shard_range.name
},
}
check_request(request, **expectations)
expected = {}
for i, device in enumerate(['sda', 'sdb', 'sdc']):
expected[device] = '10.0.0.%d:100%d' % (i, i)
self.assertEqual(container_headers, expected)
do_test('POST', 'sharding')
do_test('POST', 'sharded')
do_test('DELETE', 'sharding')
do_test('DELETE', 'sharded')
do_test('PUT', 'sharding')
do_test('PUT', 'sharded')
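What these assertions encode: for a sharding or sharded container, the proxy resolves the shard range whose interval covers the object name (the GET with includes=o above) and advertises it to the object servers via X-Backend-Container-Path, so their container updates land in the shard rather than the root. A minimal sketch, assuming ShardRange's usual semantics of an exclusive lower and inclusive upper bound, with plain dicts standing in for ShardRange:

def update_path_header(shard_ranges, obj_name):
    for sr in shard_ranges:
        lower, upper = sr['lower'], sr['upper']
        # '' means the range is unbounded on that side
        if (not lower or obj_name > lower) and (not upper or obj_name <= upper):
            return {'X-Backend-Container-Path': sr['name']}
    return {}  # no covering range: fall back to updating the root

assert update_path_header(
    [{'name': '.shards_a/c_shard', 'lower': 'l', 'upper': 'u'}],
    'o') == {'X-Backend-Container-Path': '.shards_a/c_shard'}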
def test_DELETE(self):
with save_globals():
@@ -8356,6 +8458,29 @@ class TestContainerController(unittest.TestCase):
self.assertEqual(res.content_length, 0)
self.assertNotIn('transfer-encoding', res.headers)

def test_GET_account_non_existent(self):
with save_globals():
set_http_connect(404, 404, 404)
controller = proxy_server.ContainerController(self.app, 'a', 'c')
req = Request.blank('/v1/a/c')
self.app.update_request(req)
res = controller.GET(req)
self.assertEqual(res.status_int, 404)
self.assertNotIn('container/a/c', res.environ['swift.infocache'])

def test_GET_auto_create_prefix_account_non_existent(self):
with save_globals():
set_http_connect(404, 404, 404, 204, 204, 204)
controller = proxy_server.ContainerController(self.app, '.a', 'c')
req = Request.blank('/v1/a/c')
self.app.update_request(req)
res = controller.GET(req)
self.assertEqual(res.status_int, 204)
ic = res.environ['swift.infocache']
self.assertEqual(ic['container/.a/c']['status'], 204)
self.assertEqual(res.content_length, 0)
self.assertNotIn('transfer-encoding', res.headers)

def test_GET_calls_authorize(self):
called = [False]