Erasure Code Documentation

This patch adds all the relevant EC documentation to the source tree.
Notable additions are:

- Updated SAIO documentation
- Updates to existing swift documentation; and
- Erasure Coding overview

Co-Authored-By: Alistair Coles <alistair.coles@hp.com>
Co-Authored-By: Thiago da Silva <thiago@redhat.com>
Co-Authored-By: John Dickinson <me@not.mn>
Co-Authored-By: Clay Gerrard <clay.gerrard@gmail.com>
Co-Authored-By: Tushar Gohad <tushar.gohad@intel.com>
Co-Authored-By: Samuel Merritt <sam@swiftstack.com>
Co-Authored-By: Christian Schwede <christian.schwede@enovance.com>
Co-Authored-By: Yuan Zhou <yuan.zhou@intel.com>
Change-Id: I0403016a4bb7dad9535891632753b0e5e9d402eb
Implements: blueprint swift-ec
Signed-off-by: Thiago da Silva <thiago@redhat.com>

This commit is contained in: parent 647b66a2ce, commit 8f5d4d2455

@@ -16,6 +16,16 @@ swift-ring-builder object-1.builder add r1z2-127.0.0.1:6020/sdb2 1
swift-ring-builder object-1.builder add r1z3-127.0.0.1:6030/sdb3 1
swift-ring-builder object-1.builder add r1z4-127.0.0.1:6040/sdb4 1
swift-ring-builder object-1.builder rebalance
swift-ring-builder object-2.builder create 10 6 1
swift-ring-builder object-2.builder add r1z1-127.0.0.1:6010/sdb1 1
swift-ring-builder object-2.builder add r1z1-127.0.0.1:6010/sdb5 1
swift-ring-builder object-2.builder add r1z2-127.0.0.1:6020/sdb2 1
swift-ring-builder object-2.builder add r1z2-127.0.0.1:6020/sdb6 1
swift-ring-builder object-2.builder add r1z3-127.0.0.1:6030/sdb3 1
swift-ring-builder object-2.builder add r1z3-127.0.0.1:6030/sdb7 1
swift-ring-builder object-2.builder add r1z4-127.0.0.1:6040/sdb4 1
swift-ring-builder object-2.builder add r1z4-127.0.0.1:6040/sdb8 1
swift-ring-builder object-2.builder rebalance
swift-ring-builder container.builder create 10 3 1
swift-ring-builder container.builder add r1z1-127.0.0.1:6011/sdb1 1
swift-ring-builder container.builder add r1z2-127.0.0.1:6021/sdb2 1

@@ -9,7 +9,10 @@ sudo mkfs.xfs -f ${SAIO_BLOCK_DEVICE:-/dev/sdb1}
sudo mount /mnt/sdb1
sudo mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4
sudo chown ${USER}:${USER} /mnt/sdb1/*
mkdir -p /srv/1/node/sdb1 /srv/2/node/sdb2 /srv/3/node/sdb3 /srv/4/node/sdb4
mkdir -p /srv/1/node/sdb1 /srv/1/node/sdb5 \
         /srv/2/node/sdb2 /srv/2/node/sdb6 \
         /srv/3/node/sdb3 /srv/3/node/sdb7 \
         /srv/4/node/sdb4 /srv/4/node/sdb8
sudo rm -f /var/log/debug /var/log/messages /var/log/rsyncd.log /var/log/syslog
find /var/cache/swift* -type f -name *.recon -exec rm -f {} \;
# On Fedora use "systemctl restart <service>"

@@ -22,6 +22,8 @@ use = egg:swift#recon
[object-replicator]
vm_test_mode = yes

[object-reconstructor]

[object-updater]

[object-auditor]

@@ -22,6 +22,8 @@ use = egg:swift#recon
[object-replicator]
vm_test_mode = yes

[object-reconstructor]

[object-updater]

[object-auditor]

@@ -22,6 +22,8 @@ use = egg:swift#recon
[object-replicator]
vm_test_mode = yes

[object-reconstructor]

[object-updater]

[object-auditor]

@@ -22,6 +22,8 @@ use = egg:swift#recon
[object-replicator]
vm_test_mode = yes

[object-reconstructor]

[object-updater]

[object-auditor]

@@ -5,7 +5,16 @@ swift_hash_path_suffix = changeme

[storage-policy:0]
name = gold
policy_type = replication
default = yes

[storage-policy:1]
name = silver
policy_type = replication

[storage-policy:2]
name = ec42
policy_type = erasure_coding
ec_type = jerasure_rs_vand
ec_num_data_fragments = 4
ec_num_parity_fragments = 2

@@ -104,5 +104,7 @@ Other
* `Swiftsync <https://github.com/stackforge/swiftsync>`_ - A massive syncer between two swift clusters.
* `Django Swiftbrowser <https://github.com/cschwede/django-swiftbrowser>`_ - Simple Django web app to access Openstack Swift.
* `Swift-account-stats <https://github.com/enovance/swift-account-stats>`_ - Swift-account-stats is a tool to report statistics on Swift usage at tenant and global levels.
* `PyECLib <https://bitbucket.org/kmgreen2/pyeclib>`_ - High Level Erasure Code library used by Swift
* `liberasurecode <http://www.bytebucket.org/tsg-/liberasurecode>`_ - Low Level Erasure Code library used by PyECLib
* `Swift Browser <https://github.com/zerovm/swift-browser>`_ - JavaScript interface for Swift
* `swift-ui <https://github.com/fanatic/swift-ui>`_ - OpenStack Swift web browser

@@ -87,8 +87,11 @@ another device when creating the VM, and follow these instructions:
sudo chown ${USER}:${USER} /mnt/sdb1/*
sudo mkdir /srv
for x in {1..4}; do sudo ln -s /mnt/sdb1/$x /srv/$x; done
sudo mkdir -p /srv/1/node/sdb1 /srv/2/node/sdb2 /srv/3/node/sdb3 \
              /srv/4/node/sdb4 /var/run/swift
sudo mkdir -p /srv/1/node/sdb1 /srv/1/node/sdb5 \
              /srv/2/node/sdb2 /srv/2/node/sdb6 \
              /srv/3/node/sdb3 /srv/3/node/sdb7 \
              /srv/4/node/sdb4 /srv/4/node/sdb8 \
              /var/run/swift
sudo chown -R ${USER}:${USER} /var/run/swift
# **Make sure to include the trailing slash after /srv/$x/**
for x in {1..4}; do sudo chown -R ${USER}:${USER} /srv/$x/; done

@@ -124,7 +127,11 @@ these instructions:
sudo mkdir /mnt/sdb1/1 /mnt/sdb1/2 /mnt/sdb1/3 /mnt/sdb1/4
sudo chown ${USER}:${USER} /mnt/sdb1/*
for x in {1..4}; do sudo ln -s /mnt/sdb1/$x /srv/$x; done
sudo mkdir -p /srv/1/node/sdb1 /srv/2/node/sdb2 /srv/3/node/sdb3 /srv/4/node/sdb4 /var/run/swift
sudo mkdir -p /srv/1/node/sdb1 /srv/1/node/sdb5 \
              /srv/2/node/sdb2 /srv/2/node/sdb6 \
              /srv/3/node/sdb3 /srv/3/node/sdb7 \
              /srv/4/node/sdb4 /srv/4/node/sdb8 \
              /var/run/swift
sudo chown -R ${USER}:${USER} /var/run/swift
# **Make sure to include the trailing slash after /srv/$x/**
for x in {1..4}; do sudo chown -R ${USER}:${USER} /srv/$x/; done

@@ -402,7 +409,7 @@ Setting up scripts for running Swift

#. Copy the SAIO scripts for resetting the environment::

      cd $HOME/swift/doc; cp -r saio/bin $HOME/bin; cd -
      cd $HOME/swift/doc; cp saio/bin/* $HOME/bin; cd -
      chmod +x $HOME/bin/*

#. Edit the ``$HOME/bin/resetswift`` script

@@ -455,30 +462,41 @@ Setting up scripts for running Swift

.. literalinclude:: /../saio/bin/remakerings

You can expect the output from this command to produce the following (note
that 2 object rings are created in order to test storage policies in the
SAIO environment however they map to the same nodes)::
You can expect the output from this command to produce the following. Note
that 3 object rings are created in order to test storage policies and EC in
the SAIO environment. The EC ring is the only one with all 8 devices.
There are also two replication rings, one for 3x replication and another
for 2x replication, but those rings only use 4 devices::

Device d0r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb1_"" with 1.0 weight got id 0
Device d1r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb2_"" with 1.0 weight got id 1
Device d2r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb3_"" with 1.0 weight got id 2
Device d3r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb4_"" with 1.0 weight got id 3
Reassigned 1024 (100.00%) partitions. Balance is now 0.00.
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00
Device d0r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb1_"" with 1.0 weight got id 0
Device d1r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb2_"" with 1.0 weight got id 1
Device d2r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb3_"" with 1.0 weight got id 2
Device d3r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb4_"" with 1.0 weight got id 3
Reassigned 1024 (100.00%) partitions. Balance is now 0.00.
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00
Device d0r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb1_"" with 1.0 weight got id 0
Device d1r1z1-127.0.0.1:6010R127.0.0.1:6010/sdb5_"" with 1.0 weight got id 1
Device d2r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb2_"" with 1.0 weight got id 2
Device d3r1z2-127.0.0.1:6020R127.0.0.1:6020/sdb6_"" with 1.0 weight got id 3
Device d4r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb3_"" with 1.0 weight got id 4
Device d5r1z3-127.0.0.1:6030R127.0.0.1:6030/sdb7_"" with 1.0 weight got id 5
Device d6r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb4_"" with 1.0 weight got id 6
Device d7r1z4-127.0.0.1:6040R127.0.0.1:6040/sdb8_"" with 1.0 weight got id 7
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00
Device d0r1z1-127.0.0.1:6011R127.0.0.1:6011/sdb1_"" with 1.0 weight got id 0
Device d1r1z2-127.0.0.1:6021R127.0.0.1:6021/sdb2_"" with 1.0 weight got id 1
Device d2r1z3-127.0.0.1:6031R127.0.0.1:6031/sdb3_"" with 1.0 weight got id 2
Device d3r1z4-127.0.0.1:6041R127.0.0.1:6041/sdb4_"" with 1.0 weight got id 3
Reassigned 1024 (100.00%) partitions. Balance is now 0.00.
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00
Device d0r1z1-127.0.0.1:6012R127.0.0.1:6012/sdb1_"" with 1.0 weight got id 0
Device d1r1z2-127.0.0.1:6022R127.0.0.1:6022/sdb2_"" with 1.0 weight got id 1
Device d2r1z3-127.0.0.1:6032R127.0.0.1:6032/sdb3_"" with 1.0 weight got id 2
Device d3r1z4-127.0.0.1:6042R127.0.0.1:6042/sdb4_"" with 1.0 weight got id 3
Reassigned 1024 (100.00%) partitions. Balance is now 0.00.
Reassigned 1024 (100.00%) partitions. Balance is now 0.00. Dispersion is now 0.00

#. Read more about Storage Policies and your SAIO :doc:`policies_saio`

BIN doc/source/images/ec_overview.png (new executable file, 145 KiB; binary file not shown)

@@ -56,6 +56,7 @@ Overview and Concepts
overview_expiring_objects
cors
crossdomain
overview_erasure_code
overview_backing_store
associated_projects

@@ -11,7 +11,10 @@ Proxy Server

The Proxy Server is responsible for tying together the rest of the Swift
architecture. For each request, it will look up the location of the account,
container, or object in the ring (see below) and route the request accordingly.
The public API is also exposed through the Proxy Server.
For Erasure Code type policies, the Proxy Server is also responsible for
encoding and decoding object data. See :doc:`overview_erasure_code` for
complete information on Erasure Code support. The public API is also exposed
through the Proxy Server.

A large number of failures are also handled in the Proxy Server. For
example, if a server is unavailable for an object PUT, it will ask the

@@ -87,7 +90,8 @@ implementing a particular differentiation.

For example, one might have the default policy with 3x replication, and create
a second policy which, when applied to new containers only uses 2x replication.
Another might add SSDs to a set of storage nodes and create a performance tier
storage policy for certain containers to have their objects stored there.
storage policy for certain containers to have their objects stored there. Yet
another might be the use of Erasure Coding to define a cold-storage tier.

This mapping is then exposed on a per-container basis, where each container
can be assigned a specific storage policy when it is created, which remains in

@@ -156,6 +160,15 @@ item (object, container, or account) is deleted, a tombstone is set as the

latest version of the item. The replicator will see the tombstone and ensure
that the item is removed from the entire system.

--------------
Reconstruction
--------------

The reconstructor is used by Erasure Code policies and is analogous to the
replicator for Replication type policies. See :doc:`overview_erasure_code`
for complete information on both Erasure Code support as well as the
reconstructor.

--------
Updaters
--------

672 doc/source/overview_erasure_code.rst (new executable file, 672 lines added)

@@ -0,0 +1,672 @@

====================
Erasure Code Support
====================


--------------------------
Beta: Not production ready
--------------------------

The erasure code support in Swift is considered "beta" at this point.
Most major functionality is included, but it has not been tested or validated
at large scale. This feature relies on ssync for durability. Deployers are
urged to do extensive testing and not deploy production data using an
erasure code storage policy.

If any bugs are found during testing, please report them to
https://bugs.launchpad.net/swift


-------------------------------
History and Theory of Operation
-------------------------------

There's a lot of good material out there on Erasure Code (EC) theory; this
short introduction is just meant to provide some basic context to help the
reader better understand the implementation in Swift.

Erasure Coding for storage applications grew out of Coding Theory as far back
as the 1960s with the Reed-Solomon codes. These codes have been used for years
in applications ranging from CDs to DVDs to general communications and, yes,
even in the space program starting with Voyager! The basic idea is that some
amount of data is broken up into smaller pieces called fragments and coded in
such a way that it can be transmitted with the ability to tolerate the loss of
some number of the coded fragments. That's where the word "erasure" comes in:
if you transmit 14 fragments and only 13 are received, then one of them is said
to be "erased". The word "erasure" provides an important distinction with EC;
it isn't about detecting errors, it's about dealing with failures. Another
important element of EC is that the number of erasures that can be tolerated
can be adjusted to meet the needs of the application.

At a high level, EC works by using a specific scheme to break up a single data
buffer into several smaller data buffers then, depending on the scheme,
performing some encoding operation on that data in order to generate additional
information. So you end up with more data than you started with, and that extra
data is often called "parity". Note that there are many, many different
encoding techniques that vary both in how they organize and manipulate the data
as well as by what means they use to calculate parity. For example, one scheme
might rely on `Galois Field Arithmetic <http://www.ssrc.ucsc.edu/Papers/plank-
fast13.pdf>`_ while others may work with only XOR. The number of variations and
details about their differences are well beyond the scope of this introduction,
but we will talk more about a few of them when we get into the implementation
of EC in Swift.

--------------------------------
Overview of EC Support in Swift
--------------------------------

First and foremost, from an application perspective EC support is totally
transparent. There is no EC-related external API; a container is simply created
using a Storage Policy defined to use EC, and then interaction with the cluster
is the same as with any other durability policy.
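
For instance, once the ``ec42`` policy from the SAIO example above exists and
its ring has been built, a client opts in simply by naming the policy when the
container is created. Below is a minimal sketch using the Python ``requests``
library; the ``X-Storage-Policy`` header is standard Swift API, while the
storage URL, token and container name are placeholders::

    import requests

    STORAGE_URL = 'http://127.0.0.1:8080/v1/AUTH_test'   # placeholder endpoint
    TOKEN = 'AUTH_tk_placeholder'                        # placeholder token

    # Create a container whose objects will be erasure coded per policy 'ec42'.
    resp = requests.put(STORAGE_URL + '/ec-container',
                        headers={'X-Auth-Token': TOKEN,
                                 'X-Storage-Policy': 'ec42'})
    resp.raise_for_status()

    # Uploads and downloads look exactly like they do for a replicated
    # container; the proxy handles all encode/decode work transparently.
    requests.put(STORAGE_URL + '/ec-container/hello.txt',
                 headers={'X-Auth-Token': TOKEN}, data=b'hello world')

Aside from the policy header on container creation, nothing about the client
workflow changes.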

EC is implemented in Swift as a Storage Policy; see :doc:`overview_policies`
for complete details on Storage Policies. Because support is implemented as a
Storage Policy, all of the storage devices associated with your cluster's EC
capability can be isolated. It is entirely possible to share devices between
storage policies, but for EC it may make more sense to not only use separate
devices but possibly even entire nodes dedicated to EC.

Which direction one chooses depends on why the EC policy is being deployed. If,
for example, there is a production replication policy in place already and the
goal is to add a cold storage tier such that the existing nodes performing
replication are impacted as little as possible, adding a new set of nodes
dedicated to EC might make the most sense, but it also incurs the most cost. On
the other hand, if EC is being added as a capability to provide additional
durability for a specific set of applications and the existing infrastructure
is well suited for EC (sufficient number of nodes and zones for the EC scheme
that is chosen), then leveraging the existing infrastructure such that the EC
ring shares nodes with the replication ring makes the most sense. These are
some of the main considerations:

* Layout of existing infrastructure.
* Cost of adding dedicated EC nodes (or just dedicated EC devices).
* Intended usage model(s).

The Swift code base does not include any of the algorithms necessary to perform
the actual encoding and decoding of data; that is left to external libraries.
The Storage Policies architecture is leveraged to enable EC on a per-container
basis -- the object rings are still used to determine the placement of EC data
fragments. Although there are several code paths that are unique to an
operation associated with an EC policy, Swift counts on an external Erasure
Code library to perform the low-level EC functions. The use of an external
library allows for maximum flexibility as there are a significant number of
options out there, each with its own pros and cons that can vary greatly from
one use case to another.

---------------------------------------
PyECLib: External Erasure Code Library
---------------------------------------

PyECLib is a Python Erasure Coding Library originally designed and written as
part of the effort to add EC support to the Swift project; however, it is an
independent project. The library provides a well-defined and simple Python
interface and internally implements a plug-in architecture allowing it to take
advantage of many well-known C libraries such as:

* Jerasure and GFComplete at http://jerasure.org.
* Intel(R) ISA-L at http://01.org/intel%C2%AE-storage-acceleration-library-open-source-version.
* Or write your own!

PyECLib uses a C based library called liberasurecode to implement the plug-in
infrastructure; liberasurecode is available at:

* liberasurecode: https://bitbucket.org/tsg-/liberasurecode

PyECLib itself therefore allows for not only choice but further extensibility
as well. PyECLib also comes with a handy utility to help determine the best
algorithm to use based on the equipment that will be used (processors and
server configurations may vary in performance per algorithm). More on this will
be covered in the configuration section. PyECLib is included as a Swift
requirement.

For complete details see `PyECLib <https://bitbucket.org/kmgreen2/pyeclib>`_
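
To get a feel for what Swift delegates to the library, here is a small,
self-contained sketch of PyECLib usage outside of Swift. It assumes PyECLib and
a back-end such as Jerasure are installed; consult the PyECLib documentation
for the authoritative interface, as this is only illustrative::

    from pyeclib.ec_iface import ECDriver

    # A 4+2 scheme, matching the 'ec42' SAIO example policy.
    driver = ECDriver(k=4, m=2, ec_type='jerasure_rs_vand')

    data = b'some object body' * 1024

    # encode() returns k+m opaque fragments (each carries a small PyECLib header).
    fragments = driver.encode(data)
    assert len(fragments) == 6

    # Any k of the fragments are sufficient to decode the original data,
    # i.e. up to m "erasures" can be tolerated.
    surviving = fragments[:2] + fragments[3:]   # pretend fragment 2 was lost
    assert driver.decode(surviving) == data

Swift performs this kind of encode step per segment on PUT and the decode step
on GET, as described in the following section.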

------------------------------
Storing and Retrieving Objects
------------------------------

We will discuss the details of how PUT and GET work in the "Under the Hood"
section later on. The key point here is that all of the erasure code work goes
on behind the scenes; this summary is a high level information overview only.

The PUT flow looks like this (a rough sketch of the proxy-side loop follows the
list):

#. The proxy server streams in an object and buffers up "a segment" of data
   (size is configurable).
#. The proxy server calls on PyECLib to encode the data into smaller fragments.
#. The proxy streams the encoded fragments out to the storage nodes based on
   ring locations.
#. Repeat until the client is done sending data.
#. The client is notified of completion when a quorum is met.
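
The following is a deliberately simplified, illustrative sketch of that
proxy-side loop; it is not Swift's actual implementation (the real logic lives
in the proxy's EC object controller), and the helper names used here are
invented for illustration only::

    from pyeclib.ec_iface import ECDriver

    def put_object_ec(read_chunk, node_connections, segment_size,
                      ec_ndata, ec_nparity, ec_type):
        """Buffer segments, encode each one, and stripe fragments to nodes.

        read_chunk()      -> returns the next chunk of the client upload (b'' at EOF)
        node_connections  -> one open connection per participating storage node
        """
        driver = ECDriver(k=ec_ndata, m=ec_nparity, ec_type=ec_type)
        buf = b''
        while True:
            chunk = read_chunk()
            if chunk:
                buf += chunk
            # Flush a full segment, or whatever is left once the client is done.
            while len(buf) >= segment_size or (not chunk and buf):
                segment, buf = buf[:segment_size], buf[segment_size:]
                fragments = driver.encode(segment)
                # Fragment i of every segment goes to the same node, so each
                # node accumulates one growing "fragment archive" per object.
                for conn, fragment in zip(node_connections, fragments):
                    conn.send(fragment)
            if not chunk:
                break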

The GET flow looks like this:

#. The proxy server makes simultaneous requests to participating nodes.
#. As soon as the proxy has the fragments it needs, it calls on PyECLib to
   decode the data.
#. The proxy streams the decoded data it has back to the client.
#. Repeat until the proxy is done sending data back to the client.

It may sound like, from this high level overview, that using EC is going to
cause an explosion in the number of actual files stored in each node's local
file system. Although it is true that more files will be stored (because an
object is broken into pieces), the implementation works to minimize this where
possible; more details are available in the Under the Hood section.

-------------
Handoff Nodes
-------------

In EC policies, similarly to replication, handoff nodes are a set of storage
nodes used to augment the list of primary nodes responsible for storing an
erasure coded object. These handoff nodes are used in the event that one or
more of the primaries are unavailable. Handoff nodes are still selected with an
attempt to achieve maximum separation of the data being placed.

--------------
Reconstruction
--------------

For an EC policy, reconstruction is analogous to the process of replication for
a replication type policy -- essentially "the reconstructor" replaces "the
replicator" for EC policy types. The basic framework of reconstruction is very
similar to that of replication with a few notable exceptions:

* Because EC does not actually replicate partitions, it needs to operate at a
  finer granularity than what is provided with rsync; therefore EC leverages
  much of ssync behind the scenes (you do not need to manually configure ssync).
* Once a pair of nodes has determined the need to replace a missing object
  fragment, instead of pushing over a copy like replication would do, the
  reconstructor has to read in enough surviving fragments from other nodes and
  perform a local reconstruction before it has the correct data to push to the
  other node.
* A reconstructor does not talk to all other reconstructors in the set of nodes
  responsible for an EC partition, as this would be far too chatty; instead each
  reconstructor is responsible for sync'ing with the partition's closest two
  neighbors (closest meaning left and right on the ring).

.. note::

   EC work (encode and decode) takes place both on the proxy nodes, for PUT/GET
   operations, as well as on the storage nodes for reconstruction. As with
   replication, reconstruction can be the result of rebalancing, bit-rot, drive
   failure or reverting data from a hand-off node back to its primary.

--------------------------
Performance Considerations
--------------------------

Efforts are underway to characterize performance of various Erasure Code
schemes. One of the main goals of the beta release is to perform this
characterization and encourage others to do so and provide meaningful feedback
to the development community. There are many factors that will affect
performance of EC, so it is vital that we have multiple characterization
activities happening.

In general, EC has different performance characteristics than replicated data.
EC requires substantially more CPU to read and write data, and is more suited
for larger objects that are not frequently accessed (e.g. backups).

----------------------------
Using an Erasure Code Policy
----------------------------

To use an EC policy, the administrator simply needs to define an EC policy in
`swift.conf` and create/configure the associated object ring. An example of how
an EC policy can be set up is shown below::

    [storage-policy:2]
    name = ec104
    policy_type = erasure_coding
    ec_type = jerasure_rs_vand
    ec_num_data_fragments = 10
    ec_num_parity_fragments = 4
    ec_object_segment_size = 1048576

Let's take a closer look at each configuration parameter:

* ``name``: This is a standard storage policy parameter.
  See :doc:`overview_policies` for details.
* ``policy_type``: Set this to ``erasure_coding`` to indicate that this is an EC
  policy.
* ``ec_type``: Set this value according to the available options in the selected
  PyECLib back-end. This specifies the EC scheme that is to be used. For
  example the option shown here selects Vandermonde Reed-Solomon encoding while
  an option of ``flat_xor_hd_3`` would select Flat-XOR based HD combination
  codes. See the `PyECLib <https://bitbucket.org/kmgreen2/pyeclib>`_ page for
  full details.
* ``ec_num_data_fragments``: The total number of fragments that will be
  comprised of data.
* ``ec_num_parity_fragments``: The total number of fragments that will be
  comprised of parity.
* ``ec_object_segment_size``: The amount of data that will be buffered up before
  feeding a segment into the encoder/decoder. The default value is 1048576.

When PyECLib encodes an object, it will break it into N fragments. However,
what is important during configuration is how many of those are data and how
many are parity. So in the example above, PyECLib will actually break an object
into 14 different fragments, 10 of which will be made up of actual object data
and 4 of which will be made of parity data (calculations depending on ec_type).

When deciding which devices to use in the EC policy's object ring, be sure to
carefully consider the performance impacts. Running some performance
benchmarking in a test environment for your configuration is highly recommended
before deployment. Once you have configured your EC policy in `swift.conf` and
created your object ring, your application is ready to start using EC simply by
creating a container with the specified policy name and interacting as usual.

.. note::

   It's important to note that once you have deployed a policy and have created
   objects with that policy, these configuration options cannot be changed. In
   case a change in the configuration is desired, you must create a new policy
   and migrate the data to a new container.

Migrating Between Policies
--------------------------

A common usage of EC is to migrate less commonly accessed data from a more
expensive but lower latency policy such as replication. When an application
determines that it wants to move data from a replication policy to an EC
policy, it simply needs to move the data from the replicated container to an EC
container that was created with the target durability policy.
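
What follows is a minimal, illustrative sketch of such a migration using
python-swiftclient; the auth endpoint, credentials, container and policy names
are placeholders, and a real application would add listing pagination, error
handling and (for large objects) streaming rather than whole-object reads::

    from swiftclient import client as swift_client

    conn = swift_client.Connection(authurl='http://127.0.0.1:8080/auth/v1.0',
                                   user='test:tester', key='testing')

    # Create the target container with the EC policy.
    conn.put_container('cold-archive', headers={'X-Storage-Policy': 'ec104'})

    # Copy every object from the replicated container into the EC container.
    _, objects = conn.get_container('hot-data')
    for obj in objects:
        headers, body = conn.get_object('hot-data', obj['name'])
        conn.put_object('cold-archive', obj['name'], contents=body,
                        content_type=headers.get('content-type'))
        # Optionally remove the replicated copy once the EC copy is in place.
        conn.delete_object('hot-data', obj['name'])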

Region Support
--------------

For at least the initial version of EC, it is not recommended that an EC scheme
span beyond a single region; neither performance nor functional validation has
been done in such a configuration.

--------------
Under the Hood
--------------

Now that we've explained a little about EC support in Swift and how to
configure/use it, let's explore how EC fits in at the nuts-n-bolts level.

Terminology
-----------

The term 'fragment' has been used already to describe the output of the EC
process (a series of fragments), however we need to define some other key terms
here before going any deeper. Without paying special attention to using the
correct terms consistently, it is very easy to get confused in a hurry! A small
worked example follows the list below.

* **chunk**: HTTP chunks received over wire (term not used to describe any EC
  specific operation).
* **segment**: Not to be confused with SLO/DLO use of the word, in EC we call a
  segment a series of consecutive HTTP chunks buffered up before performing an
  EC operation.
* **fragment**: Data and parity 'fragments' are generated when erasure coding
  transformation is applied to a segment.
* **EC archive**: A concatenation of EC fragments; to a storage node this looks
  like an object.
* **ec_ndata**: Number of EC data fragments.
* **ec_nparity**: Number of EC parity fragments.
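
To make these terms concrete, here is a rough back-of-the-envelope illustration
for the 4+2 example policy. The fragment-size arithmetic assumes a systematic
Reed-Solomon scheme and ignores the small per-fragment headers PyECLib adds, so
treat the numbers as approximate::

    ec_ndata, ec_nparity = 4, 2          # fragments per segment
    segment_size = 1048576               # 1 MiB buffered per EC operation
    object_size = 4 * 1048576            # a 4 MiB object PUT by a client

    num_segments = object_size // segment_size          # 4 segments
    frags_per_segment = ec_ndata + ec_nparity           # 6 fragments each
    approx_fragment_size = segment_size // ec_ndata     # ~256 KiB

    # Each participating node receives one fragment per segment; those
    # fragments are concatenated into a single on-disk "EC archive".
    approx_archive_size = num_segments * approx_fragment_size   # ~1 MiB
    print(num_segments, frags_per_segment, approx_archive_size)

So a single 4 MiB object ends up as roughly six 1 MiB EC archives, one per
storage node, rather than three full 4 MiB replicas.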

Middleware
----------

Middleware remains unchanged. For most middleware (e.g., SLO/DLO) the fact that
the proxy is fragmenting incoming objects is transparent. For list endpoints,
however, it is a bit different. A caller of list endpoints will get back the
locations of all of the fragments. The caller will be unable to re-assemble the
original object with this information; however, the node locations may still
prove to be useful information for some applications.

On Disk Storage
---------------

EC archives are stored on disk in their respective objects-N directory based on
their policy index. See :doc:`overview_policies` for details on per policy
directory information.

The actual names on disk of EC archives also have one additional piece of data
encoded in the filename: the fragment archive index.

Each storage policy now must include a transformation function that diskfile
will use to build the filename to store on disk. The functions are implemented
in the diskfile module as policy-specific subclasses of ``DiskFileManager``.

This is required for a few reasons. For one, it allows us to store fragment
archives of different indexes on the same storage node, which is not typical
but is possible in many circumstances. Without unique filenames for the
different EC archive files in a set, we would be at risk of overwriting one
archive of index n with another of index m in some scenarios.

The transformation function for the replication policy is simply a NOP. For
reconstruction, the index is appended to the filename just before the .data
extension. An example filename for a fragment archive storing the 5th fragment
would look like this::

    1418673556.92690#5.data

An additional file is also included for Erasure Code policies called the
``.durable`` file. Its meaning will be covered in detail later; however, its
on-disk format does not require the name transformation function that was just
covered. The .durable for the example above would simply look like this::

    1418673556.92690.durable

And it would be found alongside every fragment specific .data file following a
100% successful PUT operation.
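
As an illustration of the naming scheme only (this is not Swift's diskfile
code), the per-policy transformation described above amounts to something like
the following, where the replication policy leaves the name alone and the EC
policy embeds the fragment index::

    def replication_data_file_name(timestamp):
        # NOP transformation: e.g. '1418673556.92690.data'
        return '{0}.data'.format(timestamp)

    def ec_data_file_name(timestamp, frag_index):
        # Fragment index inserted before the extension:
        # e.g. '1418673556.92690#5.data' for the 5th fragment archive.
        return '{0}#{1}.data'.format(timestamp, frag_index)

    def ec_durable_file_name(timestamp):
        # The .durable marker carries no fragment index.
        return '{0}.durable'.format(timestamp)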

Proxy Server
------------

High Level
==========

The Proxy Server handles Erasure Coding in a different manner than replication;
therefore there are several code paths unique to EC policies, either through
subclassing or simple conditionals. Taking a closer look at the PUT and the GET
paths will help make this clearer. But first, a high level overview of how an
object flows through the system:

.. image:: images/ec_overview.png

Note how:

* Incoming objects are buffered into segments at the proxy.
* Segments are erasure coded into fragments at the proxy.
* The proxy stripes fragments across participating nodes such that the on-disk
  file that we call a fragment archive is appended with each new fragment.

This scheme makes it possible to minimize the number of on-disk files given our
segmenting and fragmenting.

Multi-Phase Conversation
========================

Multi-part MIME document support is used to allow the proxy to engage in a
handshake conversation with the storage node for processing PUT requests. This
is required for a few different reasons.

#. From the perspective of the storage node, a fragment archive is really just
   another object, so we need a mechanism to send down the original object etag
   after all fragment archives have landed.
#. Without introducing strong consistency semantics, the proxy needs a mechanism
   to know when a quorum of fragment archives have actually made it to disk
   before it can inform the client of a successful PUT.

MIME supports a conversation between the proxy and the storage nodes for every
PUT. This provides us with the ability to handle a PUT in one connection and
assure that we have the essence of a two-phase commit: the proxy communicates
back to the storage nodes once it has confirmation that all fragment archives
in the set have been committed. Note that we still require a quorum of the data
elements of the conversation to complete before signaling status to the client,
but we can relax that requirement for the commit phase such that only 2
confirmations to that phase of the conversation are required for success, as
the reconstructor will assure propagation of markers that indicate data
durability.

This provides the storage node with a cheap indicator of the last known durable
set of fragment archives for a given object on a successful durable PUT; this
is known as the ``.durable`` file. The presence of a ``.durable`` file means,
to the object server, `there is a set of ts.data files that are durable at
timestamp ts.` Note that the completion of the commit phase of the conversation
is also a signal for the object server to go ahead and immediately delete older
timestamp files for this object. This is critical as we do not want to delete
the older object until the storage node has confirmation from the proxy, via
the multi-phase conversation, that the other nodes have landed enough for a
quorum.

The basic flow looks like this:

* The Proxy Server erasure codes and streams the object fragments
  (ec_ndata + ec_nparity) to the storage nodes.
* The storage nodes store objects as EC archives and, upon finishing the object
  data/metadata write, send a 1st-phase response to the proxy.
* Upon a quorum of storage node responses, the proxy initiates the 2nd phase by
  sending commit confirmations to the object servers.
* Upon receipt of the commit message, object servers store a 0-byte data file as
  `<timestamp>.durable` indicating a successful PUT, and send a final response
  to the proxy server.
* The proxy waits for a minimum of two object servers to respond with a
  success (2xx) status before responding to the client with a successful
  status. In this particular case it was decided that two responses was the
  minimum number needed to know that the file would be propagated in case of
  failure from the others, and because a greater number would potentially mean
  more latency, which should be avoided if possible.

Here is a high level example of what the conversation looks like::

    proxy: PUT /p/a/c/o
           Transfer-Encoding: chunked
           Expect: 100-continue
           X-Backend-Obj-Multiphase-Commit: yes
    obj:   100 Continue
           X-Obj-Multiphase-Commit: yes
    proxy: --MIMEboundary
           X-Document: object body
           <obj_data>
           --MIMEboundary
           X-Document: object metadata
           Content-MD5: <footer_meta_cksum>
           <footer_meta>
           --MIMEboundary
    <object server writes data, metadata>
    obj:   100 Continue
    <quorum>
    proxy: X-Document: put commit
           commit_confirmation
           --MIMEboundary--
    <object server writes ts.durable state>
    obj:   20x
    <proxy waits to receive >=2 2xx responses>
    proxy: 2xx -> client

A few key points on the .durable file:

* The .durable file means "the matching .data file for this has sufficient
  fragment archives somewhere, committed, to reconstruct the object".
* The Proxy Server will never have knowledge, either on GET or HEAD, of the
  existence of a .data file on an object server if it does not have a matching
  .durable file.
* The object server will never return a .data that does not have a matching
  .durable.
* When a proxy does a GET, it will only receive fragment archives that have
  enough present somewhere to be reconstructed.

Partial PUT Failures
====================

A partial PUT failure has a few different modes. In one scenario the Proxy
Server is alive through the entire PUT conversation. This is a very
straightforward case. The client will receive a good response if and only if a
quorum of fragment archives were successfully landed on their storage nodes. In
this case the Reconstructor will discover the missing fragment archives,
perform a reconstruction and deliver fragment archives and their matching
.durable files to the nodes.

The more interesting case is what happens if the proxy dies in the middle of a
conversation. If it turns out that a quorum had been met and the commit phase
of the conversation finished, it's as simple as the previous case in that the
reconstructor will repair things. However, if the commit didn't get a chance to
happen, then some number of the storage nodes have .data files on them
(fragment archives) but none of them knows whether there are enough elsewhere
for the entire object to be reconstructed. In this case the client will not
have received a 2xx response so there is no issue there; however, it is left to
the storage nodes to clean up the stale fragment archives. Work is ongoing in
this area to enable the proxy to play a role in reviving these fragment
archives; however, for the current release, a proxy failure after the start of
a conversation but before the commit message will simply result in a PUT
failure.

GET
===

The GET for EC is different enough from replication that subclassing the
`BaseObjectController` to the `ECObjectController` enables an efficient way to
implement the high level steps described earlier:

#. The proxy server makes simultaneous requests to participating nodes.
#. As soon as the proxy has the fragments it needs, it calls on PyECLib to
   decode the data.
#. The proxy streams the decoded data it has back to the client.
#. Repeat until the proxy is done sending data back to the client.

The GET path will attempt to contact all nodes participating in the EC scheme;
if not enough primaries respond then handoffs will be contacted, just as with
replication. Etag and content length headers are updated for the client
response following reconstruction, as the individual fragment archive's
metadata is valid only for that fragment archive.

Object Server
-------------

The Object Server, like the Proxy Server, supports MIME conversations as
described in the proxy section earlier. This includes processing of the commit
message and decoding various sections of the MIME document to extract the
footer, which includes things like the entire object etag.

DiskFile
========

Erasure code uses the subclassed ``ECDiskFile``, ``ECDiskFileWriter`` and
``ECDiskFileManager`` to implement EC specific handling of on disk files. This
includes things like file name manipulation to include the fragment index in
the filename, determination of valid .data files based on .durable presence,
construction of an EC specific hashes.pkl file to include fragment index
information, etc.

Metadata
--------

There are a few different categories of metadata that are associated with EC:

System Metadata: EC has a set of object level system metadata that it
attaches to each of the EC archives. The metadata is for internal use only:

* ``X-Object-Sysmeta-EC-Etag``: The Etag of the original object.
* ``X-Object-Sysmeta-EC-Content-Length``: The content length of the original
  object.
* ``X-Object-Sysmeta-EC-Frag-Index``: The fragment index for the object.
* ``X-Object-Sysmeta-EC-Scheme``: Description of the EC policy used to encode
  the object.
* ``X-Object-Sysmeta-EC-Segment-Size``: The segment size used for the object.

User Metadata: User metadata is unaffected by EC; however, a full copy of the
user metadata is stored with every EC archive. This is required as the
reconstructor needs this information and each reconstructor only communicates
with its closest neighbors on the ring.

PyECLib Metadata: PyECLib stores a small amount of metadata on a per fragment
basis. This metadata is not documented here as it is opaque to Swift.

Database Updates
----------------

As account and container rings are not associated with a Storage Policy, there
is no change to how these database updates occur when using an EC policy.

The Reconstructor
-----------------

The Reconstructor performs analogous functions to the replicator:

#. Recovery from disk drive failure.
#. Moving data around because of a rebalance.
#. Reverting data back to a primary from a handoff.
#. Recovering fragment archives from bit rot discovered by the auditor.

However, under the hood it operates quite differently. The following are some
of the key elements in understanding how the reconstructor operates.

Unlike the replicator, the work that the reconstructor does is not always as
easy to break down into the 2 basic tasks of synchronize or revert (move data
from handoff back to primary) because of the fact that one storage node can
house fragment archives of various indexes and each index really "belongs" to
a different node. So, whereas when the replicator is reverting data from a
handoff it has just one node to send its data to, the reconstructor can have
several. Additionally, it's not always the case that the processing of a
particular suffix directory means one or the other for the entire directory (as
it does for replication). The scenarios that create these mixed situations can
be pretty complex, so we will just focus on what the reconstructor does here
and not a detailed explanation of why.

Job Construction and Processing
===============================

Because of the nature of the work it has to do as described above, the
reconstructor builds jobs for a single job processor. The job itself contains
all of the information needed for the processor to execute the job, which may
be a synchronization or a data reversion, and there may be a mix of jobs that
perform both of these operations on the same suffix directory.

Jobs are constructed on a per-partition and then per-fragment-index basis.
That is, there will be one job for every fragment index in a partition.
Performing this construction "up front" like this helps minimize the
interaction between nodes collecting hashes.pkl information.

Once a set of jobs for a partition has been constructed, those jobs are sent off
to threads for execution. The single job processor then performs the necessary
actions, working closely with ssync to carry out its instructions. For data
reversion, the actual objects themselves are cleaned up via the ssync module
and, once that partition's set of jobs is complete, the reconstructor will
attempt to remove the relevant directory structures.

The scenarios that job construction has to take into account include:

#. A partition directory with all fragment indexes matching the local node
   index. This is the case where everything is where it belongs and we just
   need to compare hashes and sync if needed; here we sync with our partners.
#. A partition directory with one local fragment index and a mix of others.
   Here we need to sync with our partners where fragment indexes match the
   local_id; all others are sync'd with their home nodes and then deleted.
#. A partition directory with no local fragment index and just one or more of
   others. Here we sync with just the home nodes for the fragment indexes that
   we have and then all the local archives are deleted. This is the basic
   handoff reversion case.

.. note::
   A "home node" is the node where the fragment index encoded in the
   fragment archive's filename matches the node index of a node in the primary
   partition list.

Node Communication
==================

The replicators talk to all nodes who have a copy of their object, typically
just 2 other nodes. For EC, having each reconstructor node talk to all nodes
would incur a large amount of overhead, as there will typically be a much
larger number of nodes participating in the EC scheme. Therefore, the
reconstructor is built to talk to its adjacent nodes on the ring only. These
nodes are typically referred to as partners.
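
To illustrate what "adjacent" means here, the following is a minimal sketch of
picking partners from a partition's primary node list; it is illustrative only
and not the reconstructor's actual code::

    def get_partners(frag_index, primary_nodes):
        """Return the two ring neighbors of the node holding `frag_index`.

        `primary_nodes` is the ordered list of primary nodes for a partition,
        e.g. as returned by the object ring for that partition.
        """
        n = len(primary_nodes)
        return [primary_nodes[(frag_index - 1) % n],
                primary_nodes[(frag_index + 1) % n]]

    # For a 4+2 policy there are 6 primaries per partition; the node holding
    # fragment index 0 syncs with the nodes holding indexes 5 and 1.
    primaries = ['node0', 'node1', 'node2', 'node3', 'node4', 'node5']
    print(get_partners(0, primaries))   # ['node5', 'node1']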

Reconstruction
==============

Reconstruction can be thought of as sort of like replication but with an extra
step in the middle. The reconstructor is hard-wired to use ssync to determine
what is missing and desired by the other side. However, before an object is
sent over the wire it needs to be reconstructed from the remaining fragments,
as the local fragment is just that - a different fragment index than what the
other end is asking for.

Thus, there are hooks in ssync for EC based policies. One case would be for
basic reconstruction which, at a high level, looks like this:

* Determine which nodes need to be contacted to collect other EC archives
  needed to perform reconstruction.
* Update the etag and fragment index metadata elements of the newly constructed
  fragment archive.
* Establish a connection to the target nodes and give ssync a DiskFileLike class
  that it can stream data from.

The reader in this class gathers fragments from the nodes and uses PyECLib to
reconstruct each segment before yielding data back to ssync. Essentially what
this means is that data is buffered, in memory, on a per segment basis at the
node performing reconstruction, and each segment is dynamically reconstructed
and delivered to `ssync_sender` where the `send_put()` method will ship them on
over. The sender is then responsible for deleting the objects as they are sent,
in the case of data reversion.

The Auditor
-----------

Because the auditor already operates on a per storage policy basis, there are no
specific auditor changes associated with EC. Each EC archive looks like, and is
treated like, a regular object from the perspective of the auditor. Therefore,
if the auditor finds bit-rot in an EC archive, it simply quarantines it, and the
reconstructor will take care of the rest just as the replicator does for
replication policies.
@ -8,22 +8,22 @@ feature is implemented throughout the entire code base so it is an important
|
||||
concept in understanding Swift architecture.
|
||||
|
||||
As described in :doc:`overview_ring`, Swift uses modified hashing rings to
|
||||
determine where data should reside in the cluster. There is a separate ring
|
||||
for account databases, container databases, and there is also one object
|
||||
ring per storage policy. Each object ring behaves exactly the same way
|
||||
and is maintained in the same manner, but with policies, different devices
|
||||
can belong to different rings with varying levels of replication. By supporting
|
||||
multiple object rings, Swift allows the application and/or deployer to
|
||||
essentially segregate the object storage within a single cluster. There are
|
||||
many reasons why this might be desirable:
|
||||
determine where data should reside in the cluster. There is a separate ring for
|
||||
account databases, container databases, and there is also one object ring per
|
||||
storage policy. Each object ring behaves exactly the same way and is maintained
|
||||
in the same manner, but with policies, different devices can belong to different
|
||||
rings. By supporting multiple object rings, Swift allows the application and/or
|
||||
deployer to essentially segregate the object storage within a single cluster.
|
||||
There are many reasons why this might be desirable:
|
||||
|
||||
* Different levels of replication: If a provider wants to offer, for example,
|
||||
2x replication and 3x replication but doesn't want to maintain 2 separate clusters,
|
||||
they would setup a 2x policy and a 3x policy and assign the nodes to their
|
||||
respective rings.
|
||||
* Different levels of durability: If a provider wants to offer, for example,
|
||||
2x replication and 3x replication but doesn't want to maintain 2 separate
|
||||
clusters, they would setup a 2x and a 3x replication policy and assign the
|
||||
nodes to their respective rings. Furthermore, if a provider wanted to offer a
|
||||
cold storage tier, they could create an erasure coded policy.
|
||||
|
||||
* Performance: Just as SSDs can be used as the exclusive members of an account or
|
||||
database ring, an SSD-only object ring can be created as well and used to
|
||||
* Performance: Just as SSDs can be used as the exclusive members of an account
|
||||
or database ring, an SSD-only object ring can be created as well and used to
|
||||
implement a low-latency/high performance policy.
|
||||
|
||||
* Collecting nodes into group: Different object rings may have different
|
||||
@ -36,10 +36,12 @@ many reasons why this might be desirable:
|
||||
|
||||
.. note::
|
||||
|
||||
Today, choosing a different storage policy allows the use of different
|
||||
object rings, but future policies (such as Erasure Coding) will also
|
||||
change some of the actual code paths when processing a request. Also note
|
||||
that Diskfile refers to backend object storage plug-in architecture.
|
||||
Today, Swift supports two different policy types: Replication and Erasure
|
||||
Code. Erasure Code policy is currently a beta release and should not be
|
||||
used in a Production cluster. See :doc:`overview_erasure_code` for details.
|
||||
|
||||
Also note that Diskfile refers to backend object storage plug-in
|
||||
architecture. See :doc:`development_ondisk_backends` for details.
|
||||
|
||||
-----------------------
|
||||
Containers and Policies
|
||||
@ -61,31 +63,33 @@ Policy-0 is considered the default). We will be covering the difference
|
||||
between default and Policy-0 in the next section.
|
||||
|
||||
Policies are assigned when a container is created. Once a container has been
|
||||
assigned a policy, it cannot be changed (unless it is deleted/recreated). The implications
|
||||
on data placement/movement for large datasets would make this a task best left for
|
||||
applications to perform. Therefore, if a container has an existing policy of,
|
||||
for example 3x replication, and one wanted to migrate that data to a policy that specifies
|
||||
a different replication level, the application would create another container
|
||||
specifying the other policy name and then simply move the data from one container
|
||||
to the other. Policies apply on a per container basis allowing for minimal application
|
||||
awareness; once a container has been created with a specific policy, all objects stored
|
||||
in it will be done so in accordance with that policy. If a container with a
|
||||
specific name is deleted (requires the container be empty) a new container may
|
||||
be created with the same name without any restriction on storage policy
|
||||
enforced by the deleted container which previously shared the same name.
|
||||
assigned a policy, it cannot be changed (unless it is deleted/recreated). The
|
||||
implications on data placement/movement for large datasets would make this a
|
||||
task best left for applications to perform. Therefore, if a container has an
|
||||
existing policy of, for example 3x replication, and one wanted to migrate that
|
||||
data to an Erasure Code policy, the application would create another container
|
||||
specifying the other policy parameters and then simply move the data from one
|
||||
container to the other. Policies apply on a per container basis allowing for
|
||||
minimal application awareness; once a container has been created with a specific
|
||||
policy, all objects stored in it will be done so in accordance with that policy.
|
||||
If a container with a specific name is deleted (requires the container be empty)
|
||||
a new container may be created with the same name without any restriction on
|
||||
storage policy enforced by the deleted container which previously shared the
|
||||
same name.
|
||||
|
||||
Containers have a many-to-one relationship with policies meaning that any number
|
||||
of containers can share one policy. There is no limit to how many containers can use
|
||||
a specific policy.
|
||||
of containers can share one policy. There is no limit to how many containers
|
||||
can use a specific policy.
|
||||
|
||||
The notion of associating a ring with a container introduces an interesting
scenario: What would happen if 2 containers of the same name were created with
different Storage Policies on either side of a network outage at the same time?
Furthermore, what would happen if objects were placed in those containers, a
whole bunch of them, and then later the network outage was restored? Well,
without special care it would be a big problem as an application could end up
using the wrong ring to try and find an object. Luckily there is a solution for
this problem: a daemon known as the Container Reconciler works tirelessly to
identify and rectify this potential scenario.

--------------------
Container Reconciler
@ -184,9 +188,9 @@ this case we would not use the default as it might not have the same
policy as legacy containers. When no other policies are defined, Swift
will always choose ``Policy-0`` as the default.

In other words, default means "create using this policy if nothing else is
specified" and ``Policy-0`` means "use the legacy policy if a container doesn't
have one" which really means use ``object.ring.gz`` for lookups.

.. note::

@ -244,17 +248,19 @@ not mark the policy as deprecated to all nodes.
Configuring Policies
--------------------

Policies are configured in ``swift.conf`` and it is important that the deployer
have a solid understanding of the semantics for configuring policies. Recall
that a policy must have a corresponding ring file, so configuring a policy is a
two-step process. First, edit your ``/etc/swift/swift.conf`` file to add your
new policy and, second, create the corresponding policy object ring file.

See :doc:`policies_saio` for a step by step guide on adding a policy to the SAIO
setup.

Note that each policy has a section starting with ``[storage-policy:N]`` where N
is the policy index. There's no reason other than readability that these be
sequential but there are a number of rules enforced by Swift when parsing this
file:

* If a policy with index 0 is not declared and no other policies defined,
  Swift will create one
@ -269,9 +275,11 @@ are a number of rules enforced by Swift when parsing this file:
* The policy name 'Policy-0' can only be used for the policy with index 0
* If any policies are defined, exactly one policy must be declared default
* Deprecated policies cannot be declared the default
* If no ``policy_type`` is provided, ``replication`` is the default value.

The following is an example of a properly configured ``swift.conf`` file. See
:doc:`policies_saio` for full instructions on setting up an all-in-one with this
example configuration.::

    [swift-hash]
    # random unique strings that can never change (DO NOT LOSE)
@ -280,10 +288,12 @@ for full instructions on setting up an all-in-one with this example configuratio

    [storage-policy:0]
    name = gold
    policy_type = replication
    default = yes

    [storage-policy:1]
    name = silver
    policy_type = replication
    deprecated = yes

Review :ref:`default-policy` and :ref:`deprecate-policy` for more
@ -300,11 +310,14 @@ There are some other considerations when managing policies:
  the desired policy section, but a deprecated policy may not also
  be declared the default, and you must specify a default - so you
  must have a policy which is not deprecated at all times.
* The option ``policy_type`` is used to distinguish between different
  policy types. The default value is ``replication``. When defining an EC
  policy use the value ``erasure_coding``.
* The EC policy has additional required parameters. See
  :doc:`overview_erasure_code` for details and the illustrative sketch below.

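For illustration only, a hypothetical Erasure Code policy section might look
something like the following; the option names are the EC-specific parameters
referred to above, but the values shown here (and the policy name) are made up
for this sketch, so consult :doc:`overview_erasure_code` for the authoritative
list and meaning of each option::

    [storage-policy:2]
    name = deepfreeze10-4
    policy_type = erasure_coding
    ec_type = jerasure_rs_vand
    ec_num_data_fragments = 10
    ec_num_parity_fragments = 4
    ec_object_segment_size = 1048576
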
Once ``swift.conf`` is configured for a new policy, a new ring must be created.
The ring tools are not policy name aware so it's critical that the
correct policy index be used when creating the new policy's ring file.
Additional object rings are created in the same manner as the legacy ring
except that '-N' is appended after the word ``object`` where N matches the
@ -404,43 +417,47 @@ Middleware
----------

Middleware can take advantage of policies through the :data:`.POLICIES` global
and by importing :func:`.get_container_info` to gain access to the policy index
associated with the container in question. From the index it can then use the
:data:`.POLICIES` singleton to grab the right ring. For example,
:ref:`list_endpoints` is policy aware using the means just described. Another
example is :ref:`recon` which will report the md5 sums for all of the rings.
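
As a rough sketch only (not a complete middleware, and with all error handling
omitted), the lookup pattern described above might look like this; the
middleware class name and ``swift_source`` value are hypothetical::

    from swift.common.storage_policy import POLICIES
    from swift.proxy.controllers.base import get_container_info

    class PolicyAwareMiddleware(object):
        def __init__(self, app, conf):
            self.app = app
            self.swift_dir = conf.get('swift_dir', '/etc/swift')

        def __call__(self, env, start_response):
            # The cached container info carries the storage policy index
            # for the container addressed by this request.
            container_info = get_container_info(env, self.app,
                                                swift_source='PAM')
            policy_index = container_info['storage_policy']

            # The POLICIES singleton maps that index to the right object ring.
            object_ring = POLICIES.get_object_ring(policy_index, self.swift_dir)

            # ... use object_ring.get_nodes(account, container, obj) here ...
            return self.app(env, start_response)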

Proxy Server
------------

The :ref:`proxy-server` module's role in Storage Policies is essentially to make
sure the correct ring is used as its member element. Before policies, the one
object ring would be instantiated when the :class:`.Application` class was
instantiated and could be overridden by test code via init parameter. With
policies, however, there is no init parameter and the :class:`.Application`
class instead depends on the :data:`.POLICIES` global singleton to retrieve the
ring which is instantiated the first time it's needed. So, instead of an object
ring member of the :class:`.Application` class, there is an accessor function,
:meth:`~.Application.get_object_ring`, that gets the ring from
:data:`.POLICIES`.

In general, when any module running on the proxy requires an object ring, it
does so via first getting the policy index from the cached container info. The
exception is during container creation where it uses the policy name from the
request header to look up policy index from the :data:`.POLICIES` global. Once
the proxy has determined the policy index, it can use the
:meth:`~.Application.get_object_ring` method described earlier to gain access to
the correct ring. It then has the responsibility of passing the index
information, not the policy name, on to the back-end servers via the header
``X-Backend-Storage-Policy-Index``. Going the other way, the proxy also strips
the index out of headers that go back to clients, and makes sure they only see
the friendly policy names.
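
A condensed sketch of that translation between friendly names and backend
indexes is shown below; ``resolve_policy_index`` is a hypothetical helper, not
the actual proxy code, but the headers it reads and writes are the ones
described above::

    from swift.common.storage_policy import POLICIES

    def resolve_policy_index(client_headers, container_info):
        """Pick the policy index the back-end servers should be told about."""
        policy_name = client_headers.get('X-Storage-Policy')
        if policy_name:
            # Container creation: the client named a policy, so look it up.
            policy = POLICIES.get_by_name(policy_name)
        else:
            # Otherwise reuse the index already recorded for the container.
            policy = POLICIES.get_by_index(int(container_info['storage_policy']))
        return policy.idx

    # The proxy then sends 'X-Backend-Storage-Policy-Index: <index>' to the
    # object servers and never exposes that header back to clients.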

On Disk Storage
---------------

Policies each have their own directories on the back-end servers and are
identified by their storage policy indexes. Organizing the back-end directory
structures by policy index helps keep track of things and also allows for
sharing of disks between policies which may or may not make sense depending on
the needs of the provider. More on this later, but for now be aware of the
following directory naming convention:

* ``/objects`` maps to objects associated with Policy-0
* ``/objects-N`` maps to storage policy index #N
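
For example, on a device shared by the legacy policy and a policy with index 1,
the resulting tree might look something like the following (paths and partition
numbers are illustrative only)::

    /srv/node/sdb1/objects/1024/...          # Policy-0 data (legacy ring)
    /srv/node/sdb1/objects-1/1024/...        # storage policy index 1
    /srv/node/sdb1/async_pending/            # Policy-0 async pendings
    /srv/node/sdb1/async_pending-1/          # policy index 1 async pendings
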
@ -466,19 +483,19 @@ policy index and leaves the actual directory naming/structure mechanisms to
:class:`.Diskfile` being used will assure that data is properly located in the
tree based on its policy.

For the same reason, the :ref:`object-updater` also is policy aware. As
previously described, different policies use different async pending directories
so the updater needs to know how to scan them appropriately.

The :ref:`object-replicator` is policy aware in that, depending on the policy,
it may have to do drastically different things, or maybe not. For example, the
difference in handling a replication job for 2x versus 3x is trivial; however,
the difference in handling replication between 3x and erasure code is most
definitely not. In fact, the term 'replication' really isn't appropriate for
some policies like erasure code; however, the majority of the framework for
collecting and processing jobs is common. Thus, those functions in the
replicator are leveraged for all policies and then there is policy specific code
required for each policy, added when the policy is defined if needed.

The ssync functionality is policy aware for the same reason. Some of the
other modules may not obviously be affected, but the back-end directory
@ -487,25 +504,26 @@ parameter. Therefore ssync being policy aware really means passing the
policy index along. See :class:`~swift.obj.ssync_sender` and
:class:`~swift.obj.ssync_receiver` for more information on ssync.

For :class:`.Diskfile` itself, being policy aware is all about managing the
back-end structure using the provided policy index. In other words, callers who
get a :class:`.Diskfile` instance provide a policy index and
:class:`.Diskfile`'s job is to keep data separated via this index (however it
chooses) such that policies can share the same media/nodes if desired. The
included implementation of :class:`.Diskfile` lays out the directory structure
described earlier but that's owned within :class:`.Diskfile`; external modules
have no visibility into that detail. A common function is provided to map
various directory names and/or strings based on their policy index. For example
:class:`.Diskfile` defines :func:`.get_data_dir` which builds off of a generic
:func:`.get_policy_string` to consistently build policy aware strings for
various usage.
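
A minimal sketch of the naming behaviour that :func:`.get_policy_string`
provides is shown below; the real helper is stricter (for example it validates
the index against the configured policies), so treat this purely as an
illustration::

    def get_policy_string(base, policy_index):
        """Map a base directory name to its policy-specific variant."""
        # Index 0 (the legacy policy) keeps the bare name, e.g. 'objects';
        # any other index gets a '-N' suffix, e.g. 'objects-2'.
        if not policy_index:
            return base
        return '%s-%d' % (base, int(policy_index))

    assert get_policy_string('objects', 0) == 'objects'
    assert get_policy_string('objects', 2) == 'objects-2'
    assert get_policy_string('async_pending', 1) == 'async_pending-1'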

Container Server
----------------

The :ref:`container-server` plays a very important role in Storage Policies: it
is responsible for handling the assignment of a policy to a container and the
prevention of bad things like changing policies or picking the wrong policy to
use when nothing is specified (recall earlier discussion on Policy-0 versus
default).

The :ref:`container-updater` is policy aware, however its job is very simple, to
@ -538,19 +556,19 @@ migrated to be fully compatible with the post-storage-policy queries without
having to fall back and retry queries with the legacy schema to service
container read requests.

The :ref:`container-sync-daemon` functionality only needs to be policy aware in
that it accesses the object rings. Therefore, it needs to pull the policy index
out of the container information and use it to select the appropriate object
ring from the :data:`.POLICIES` global.

Account Server
--------------

The :ref:`account-server`'s role in Storage Policies is really limited to
reporting. When a HEAD request is made on an account (see example provided
earlier), the account server is provided with the storage policy index and
builds the ``object_count`` and ``byte_count`` information for the client on a
per policy basis.

The account servers are able to report per-storage-policy object and byte
counts because of some policy specific DB schema changes. A policy specific
@ -564,23 +582,23 @@ pre-storage-policy accounts by altering the DB schema and populating the
point in time.

The per-storage-policy object and byte counts are not updated with each object
PUT and DELETE request; instead, container updates to the account server are
performed asynchronously by the ``swift-container-updater``.

.. _upgrade-policy:

Upgrading and Confirming Functionality
--------------------------------------

Upgrading to a version of Swift that has Storage Policy support is not
difficult; in fact, the cluster administrator isn't required to make any special
configuration changes to get going. Swift will automatically begin using the
existing object ring as both the default ring and the Policy-0 ring. Adding the
declaration of policy 0 is totally optional and in its absence, the name given
to the implicit policy 0 will be 'Policy-0'. Let's say for testing purposes
that you wanted to take an existing cluster that already has lots of data on it
and upgrade to Swift with Storage Policies. From there you want to go ahead and
create a policy and test a few things out. All you need to do is:

#. Upgrade all of your Swift nodes to a policy-aware version of Swift
#. Define your policies in ``/etc/swift/swift.conf``

@ -111,11 +111,53 @@ Another improvement planned all along the way is separating the local disk
structure from the protocol path structure. This separation will allow ring
resizing at some point, or at least ring-doubling.

FOR NOW, IT IS NOT RECOMMENDED TO USE SSYNC ON PRODUCTION CLUSTERS. Some of us
will be using it in a limited fashion to look for any subtle issues, tuning,
etc., but generally ssync is an experimental feature. In its current
implementation it is probably going to be a bit slower than RSync, but if all
goes according to plan it will end up much faster.

Note that for objects being stored with an Erasure Code policy, the replicator
daemon is not involved. Instead, the reconstructor is used by Erasure Code
policies and is analogous to the replicator for Replication type policies.
See :doc:`overview_erasure_code` for complete information on both Erasure Code
support as well as the reconstructor.

----------
Hashes.pkl
----------

The hashes.pkl file is a key element for both replication and reconstruction
(for Erasure Coding). Both daemons use this file to determine if any kind of
action is required between nodes that are participating in the durability
scheme. The file itself is a pickled dictionary with slightly different
formats depending on whether the policy is Replication or Erasure Code. In
either case, however, the same basic information is provided between the
nodes. The dictionary is keyed on the suffix directory name, and the value is
the MD5 hash of the directory listing for that suffix. In this manner, the
daemon can quickly identify differences between local and remote suffix
directories on a per partition basis as the scope of any one hashes.pkl file
is a partition directory.

For Erasure Code policies, there is a little more information required. An
object's hash directory may contain multiple fragments of a single object in
the event that the node is acting as a handoff or perhaps if a rebalance is
underway. Each fragment of an object is stored with a fragment index, so
the hashes.pkl for an Erasure Code partition will still be a dictionary
keyed on the suffix directory name; however, the value is another dictionary
keyed on the fragment index with subsequent MD5 hashes for each one as
values. Some files within an object hash directory don't require a fragment
index so None is used to represent those. Below are examples of what these
dictionaries might look like.

Replication hashes.pkl::

    {'a43': '72018c5fbfae934e1f56069ad4425627',
     'b23': '12348c5fbfae934e1f56069ad4421234'}

Erasure Code hashes.pkl::

    {'a43': {None: '72018c5fbfae934e1f56069ad4425627',
             2: 'b6dd6db937cb8748f50a5b6e4bc3b808'},
     'b23': {None: '12348c5fbfae934e1f56069ad4421234',
             1: '45676db937cb8748f50a5b6e4bc34567'}}

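As a toy illustration of how these dictionaries get used (this is not the
actual replicator or reconstructor logic), deciding which suffix directories
need further attention boils down to comparing the local and remote values::

    import pickle

    def load_hashes(path):
        # hashes.pkl lives in the partition directory
        with open(path, 'rb') as f:
            return pickle.load(f)

    def suffixes_to_sync(local_hashes, remote_hashes):
        """Return the suffix directories whose contents appear to differ."""
        out_of_sync = []
        for suffix, local_value in local_hashes.items():
            # For Replication the value is an MD5 string; for Erasure Code it
            # is a dict keyed on fragment index.  Either way, any mismatch
            # flags the suffix for a closer look by the daemon.
            if remote_hashes.get(suffix) != local_value:
                out_of_sync.append(suffix)
        return out_of_sync
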
-----------------------------