# Copyright (c) 2010-2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from swift.common import ring
from swift.common.ring.utils import (tiers_for_dev, build_tier_tree,
validate_and_normalize_ip,
validate_and_normalize_address,
is_valid_hostname,
is_local_device, parse_search_value,
parse_search_values_from_opts,
parse_change_values_from_opts,
validate_args, parse_args,
parse_builder_ring_filename_args,
build_dev_from_opts, dispersion_report,
parse_address)
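# Tests for the helpers in swift.common.ring.utils: tier expansion,
# address and hostname validation, swift-ring-builder argument parsing,
# and dispersion reporting.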
class TestUtils(unittest.TestCase):
def setUp(self):
self.test_dev = {'region': 1, 'zone': 1, 'ip': '192.168.1.1',
'port': '6200', 'id': 0}
def get_test_devs():
dev0 = {'region': 1, 'zone': 1, 'ip': '192.168.1.1',
'port': '6200', 'id': 0}
dev1 = {'region': 1, 'zone': 1, 'ip': '192.168.1.1',
'port': '6200', 'id': 1}
dev2 = {'region': 1, 'zone': 1, 'ip': '192.168.1.1',
'port': '6200', 'id': 2}
dev3 = {'region': 1, 'zone': 1, 'ip': '192.168.1.2',
'port': '6200', 'id': 3}
dev4 = {'region': 1, 'zone': 1, 'ip': '192.168.1.2',
'port': '6200', 'id': 4}
dev5 = {'region': 1, 'zone': 1, 'ip': '192.168.1.2',
'port': '6200', 'id': 5}
dev6 = {'region': 1, 'zone': 2, 'ip': '192.168.2.1',
'port': '6200', 'id': 6}
dev7 = {'region': 1, 'zone': 2, 'ip': '192.168.2.1',
'port': '6200', 'id': 7}
dev8 = {'region': 1, 'zone': 2, 'ip': '192.168.2.1',
'port': '6200', 'id': 8}
dev9 = {'region': 1, 'zone': 2, 'ip': '192.168.2.2',
'port': '6200', 'id': 9}
dev10 = {'region': 1, 'zone': 2, 'ip': '192.168.2.2',
'port': '6200', 'id': 10}
dev11 = {'region': 1, 'zone': 2, 'ip': '192.168.2.2',
'port': '6200', 'id': 11}
return [dev0, dev1, dev2, dev3, dev4, dev5,
dev6, dev7, dev8, dev9, dev10, dev11]
self.test_devs = get_test_devs()
def test_tiers_for_dev(self):
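        # tiers_for_dev() expands a single device dict into its nested
        # tiers: (region,), (region, zone), (region, zone, ip) and
        # (region, zone, ip, device id)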
self.assertEqual(
tiers_for_dev(self.test_dev),
((1,),
(1, 1),
(1, 1, '192.168.1.1'),
(1, 1, '192.168.1.1', 0)))
def test_build_tier_tree(self):
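        # build_tier_tree() maps each tier to the set of its immediate
        # sub-tiers; 12 devices in 1 region, 2 zones and 4 IPs yield
        # 8 parent tiers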
ret = build_tier_tree(self.test_devs)
self.assertEqual(len(ret), 8)
self.assertEqual(ret[()], set([(1,)]))
self.assertEqual(ret[(1,)], set([(1, 1), (1, 2)]))
self.assertEqual(ret[(1, 1)],
set([(1, 1, '192.168.1.2'),
(1, 1, '192.168.1.1')]))
self.assertEqual(ret[(1, 2)],
set([(1, 2, '192.168.2.2'),
(1, 2, '192.168.2.1')]))
self.assertEqual(ret[(1, 1, '192.168.1.1')],
set([(1, 1, '192.168.1.1', 0),
(1, 1, '192.168.1.1', 1),
(1, 1, '192.168.1.1', 2)]))
self.assertEqual(ret[(1, 1, '192.168.1.2')],
set([(1, 1, '192.168.1.2', 3),
(1, 1, '192.168.1.2', 4),
(1, 1, '192.168.1.2', 5)]))
self.assertEqual(ret[(1, 2, '192.168.2.1')],
set([(1, 2, '192.168.2.1', 6),
(1, 2, '192.168.2.1', 7),
(1, 2, '192.168.2.1', 8)]))
self.assertEqual(ret[(1, 2, '192.168.2.2')],
set([(1, 2, '192.168.2.2', 9),
(1, 2, '192.168.2.2', 10),
(1, 2, '192.168.2.2', 11)]))
def test_is_valid_hostname(self):
self.assertTrue(is_valid_hostname("local"))
self.assertTrue(is_valid_hostname("test.test.com"))
hostname = "test." * 51
self.assertTrue(is_valid_hostname(hostname))
hostname = hostname.rstrip('.')
self.assertTrue(is_valid_hostname(hostname))
hostname = hostname + "00"
self.assertFalse(is_valid_hostname(hostname))
self.assertFalse(is_valid_hostname("$blah#"))
def test_is_local_device(self):
# localhost shows up in whataremyips() output as "::1" for IPv6
my_ips = ["127.0.0.1", "::1"]
my_port = 6200
self.assertTrue(is_local_device(my_ips, my_port,
"127.0.0.1", my_port))
self.assertTrue(is_local_device(my_ips, my_port,
"::1", my_port))
self.assertTrue(is_local_device(
my_ips, my_port,
"0000:0000:0000:0000:0000:0000:0000:0001", my_port))
self.assertTrue(is_local_device(my_ips, my_port,
"localhost", my_port))
self.assertFalse(is_local_device(my_ips, my_port,
"localhost", my_port + 1))
self.assertFalse(is_local_device(my_ips, my_port,
"127.0.0.2", my_port))
# for those that don't have a local port
self.assertTrue(is_local_device(my_ips, None,
my_ips[0], None))
# When servers_per_port is active, the "my_port" passed in is None
# which means "don't include port in the determination of locality
# because it's not reliable in this deployment scenario"
self.assertTrue(is_local_device(my_ips, None,
"127.0.0.1", 6666))
self.assertTrue(is_local_device(my_ips, None,
"::1", 6666))
self.assertTrue(is_local_device(
my_ips, None,
"0000:0000:0000:0000:0000:0000:0000:0001", 6666))
self.assertTrue(is_local_device(my_ips, None,
"localhost", 6666))
self.assertFalse(is_local_device(my_ips, None,
"127.0.0.2", my_port))
def test_validate_and_normalize_ip(self):
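        # valid IPv4/IPv6 literals come back normalized (IPv6 lower-cased);
        # hostnames and garbage raise ValueError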
ipv4 = "10.0.0.1"
self.assertEqual(ipv4, validate_and_normalize_ip(ipv4))
ipv6 = "fe80::204:61ff:fe9d:f156"
self.assertEqual(ipv6, validate_and_normalize_ip(ipv6.upper()))
hostname = "test.test.com"
self.assertRaises(ValueError,
validate_and_normalize_ip, hostname)
hostname = "$blah#"
self.assertRaises(ValueError,
validate_and_normalize_ip, hostname)
def test_validate_and_normalize_address(self):
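        # unlike validate_and_normalize_ip(), hostnames are accepted here
        # and normalized to lower case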
ipv4 = "10.0.0.1"
self.assertEqual(ipv4, validate_and_normalize_address(ipv4))
ipv6 = "fe80::204:61ff:fe9d:f156"
self.assertEqual(ipv6, validate_and_normalize_address(ipv6.upper()))
hostname = "test.test.com"
self.assertEqual(hostname,
validate_and_normalize_address(hostname.upper()))
hostname = "$blah#"
self.assertRaises(ValueError,
validate_and_normalize_address, hostname)
def test_parse_search_value(self):
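        # an old-style search value packs d<id>, r<region>, z<zone>,
        # -<ip>:<port>, R<replication ip>:<port>, /<device> and _<meta>
        # into a single token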
res = parse_search_value('r0')
self.assertEqual(res, {'region': 0})
res = parse_search_value('r1')
self.assertEqual(res, {'region': 1})
res = parse_search_value('r1z2')
self.assertEqual(res, {'region': 1, 'zone': 2})
res = parse_search_value('d1')
self.assertEqual(res, {'id': 1})
res = parse_search_value('z1')
self.assertEqual(res, {'zone': 1})
res = parse_search_value('-127.0.0.1')
self.assertEqual(res, {'ip': '127.0.0.1'})
res = parse_search_value('127.0.0.1')
self.assertEqual(res, {'ip': '127.0.0.1'})
res = parse_search_value('-[127.0.0.1]:10001')
self.assertEqual(res, {'ip': '127.0.0.1', 'port': 10001})
res = parse_search_value(':10001')
self.assertEqual(res, {'port': 10001})
res = parse_search_value('R127.0.0.10')
self.assertEqual(res, {'replication_ip': '127.0.0.10'})
res = parse_search_value('R[127.0.0.10]:20000')
self.assertEqual(res, {'replication_ip': '127.0.0.10',
'replication_port': 20000})
res = parse_search_value('R:20000')
self.assertEqual(res, {'replication_port': 20000})
res = parse_search_value('/sdb1')
self.assertEqual(res, {'device': 'sdb1'})
res = parse_search_value('_meta1')
self.assertEqual(res, {'meta': 'meta1'})
self.assertRaises(ValueError, parse_search_value, 'OMGPONIES')
def test_parse_search_values_from_opts(self):
argv = \
["--id", "1", "--region", "2", "--zone", "3",
"--ip", "test.test.com",
"--port", "6200",
"--replication-ip", "r.test.com",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359",
"--change-ip", "change.test.test.com",
"--change-port", "6201",
"--change-replication-ip", "change.r.test.com",
"--change-replication-port", "7001",
"--change-device", "sdb3",
"--change-meta", "some meta data for change"]
expected = {
'id': 1,
'region': 2,
'zone': 3,
'ip': "test.test.com",
'port': 6200,
'replication_ip': "r.test.com",
'replication_port': 7000,
'device': "sda3",
'meta': "some meta data",
'weight': 3.14159265359,
}
new_cmd_format, opts, args = validate_args(argv)
search_values = parse_search_values_from_opts(opts)
self.assertEqual(search_values, expected)
argv = \
["--id", "1", "--region", "2", "--zone", "3",
"--ip", "127.0.0.1",
"--port", "6200",
"--replication-ip", "127.0.0.10",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359",
"--change-ip", "127.0.0.2",
"--change-port", "6201",
"--change-replication-ip", "127.0.0.20",
"--change-replication-port", "7001",
"--change-device", "sdb3",
"--change-meta", "some meta data for change"]
expected = {
'id': 1,
'region': 2,
'zone': 3,
'ip': "127.0.0.1",
'port': 6200,
'replication_ip': "127.0.0.10",
'replication_port': 7000,
'device': "sda3",
'meta': "some meta data",
'weight': 3.14159265359,
}
new_cmd_format, opts, args = validate_args(argv)
search_values = parse_search_values_from_opts(opts)
self.assertEqual(search_values, expected)
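        # bracketed IP literals should parse to the same (unbracketed)
        # search values as above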
argv = \
["--id", "1", "--region", "2", "--zone", "3",
"--ip", "[127.0.0.1]",
"--port", "6200",
"--replication-ip", "[127.0.0.10]",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359",
"--change-ip", "[127.0.0.2]",
"--change-port", "6201",
"--change-replication-ip", "[127.0.0.20]",
"--change-replication-port", "7001",
"--change-device", "sdb3",
"--change-meta", "some meta data for change"]
new_cmd_format, opts, args = validate_args(argv)
search_values = parse_search_values_from_opts(opts)
self.assertEqual(search_values, expected)
def test_parse_change_values_from_opts(self):
argv = \
["--id", "1", "--region", "2", "--zone", "3",
"--ip", "test.test.com",
"--port", "6200",
"--replication-ip", "r.test.com",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359",
"--change-ip", "change.test.test.com",
"--change-port", "6201",
"--change-replication-ip", "change.r.test.com",
"--change-replication-port", "7001",
"--change-device", "sdb3",
"--change-meta", "some meta data for change"]
expected = {
'ip': "change.test.test.com",
'port': 6201,
'replication_ip': "change.r.test.com",
'replication_port': 7001,
'device': "sdb3",
'meta': "some meta data for change",
}
new_cmd_format, opts, args = validate_args(argv)
search_values = parse_change_values_from_opts(opts)
self.assertEqual(search_values, expected)
argv = \
["--id", "1", "--region", "2", "--zone", "3",
"--ip", "127.0.0.1",
"--port", "6200",
"--replication-ip", "127.0.0.10",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359",
"--change-ip", "127.0.0.2",
"--change-port", "6201",
"--change-replication-ip", "127.0.0.20",
"--change-replication-port", "7001",
"--change-device", "sdb3",
"--change-meta", "some meta data for change"]
expected = {
'ip': "127.0.0.2",
'port': 6201,
'replication_ip': "127.0.0.20",
'replication_port': 7001,
'device': "sdb3",
'meta': "some meta data for change",
}
new_cmd_format, opts, args = validate_args(argv)
search_values = parse_change_values_from_opts(opts)
self.assertEqual(search_values, expected)
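        # brackets around the change addresses are stripped as well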
argv = \
["--id", "1", "--region", "2", "--zone", "3",
"--ip", "[127.0.0.1]",
"--port", "6200",
"--replication-ip", "[127.0.0.10]",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359",
"--change-ip", "[127.0.0.2]",
"--change-port", "6201",
"--change-replication-ip", "[127.0.0.20]",
"--change-replication-port", "7001",
"--change-device", "sdb3",
"--change-meta", "some meta data for change"]
new_cmd_format, opts, args = validate_args(argv)
search_values = parse_change_values_from_opts(opts)
self.assertEqual(search_values, expected)
def test_validate_args(self):
argv = \
["--id", "1", "--region", "2", "--zone", "3",
"--ip", "test.test.com",
"--port", "6200",
"--replication-ip", "r.test.com",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359",
"--change-ip", "change.test.test.com",
"--change-port", "6201",
"--change-replication-ip", "change.r.test.com",
"--change-replication-port", "7001",
"--change-device", "sdb3",
"--change-meta", "some meta data for change"]
new_cmd_format, opts, args = validate_args(argv)
self.assertTrue(new_cmd_format)
self.assertEqual(opts.id, 1)
self.assertEqual(opts.region, 2)
self.assertEqual(opts.zone, 3)
self.assertEqual(opts.ip, "test.test.com")
self.assertEqual(opts.port, 6200)
self.assertEqual(opts.replication_ip, "r.test.com")
self.assertEqual(opts.replication_port, 7000)
self.assertEqual(opts.device, "sda3")
self.assertEqual(opts.meta, "some meta data")
self.assertEqual(opts.weight, 3.14159265359)
self.assertEqual(opts.change_ip, "change.test.test.com")
self.assertEqual(opts.change_port, 6201)
self.assertEqual(opts.change_replication_ip, "change.r.test.com")
self.assertEqual(opts.change_replication_port, 7001)
self.assertEqual(opts.change_device, "sdb3")
self.assertEqual(opts.change_meta, "some meta data for change")
def test_validate_args_new_cmd_format(self):
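        # zero/empty option values still count as the new command format;
        # unset (None) search options below do not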
argv = \
["--id", "0", "--region", "0", "--zone", "0",
"--ip", "",
"--port", "0",
"--replication-ip", "",
"--replication-port", "0",
"--device", "",
"--meta", "",
"--weight", "0",
"--change-ip", "",
"--change-port", "0",
"--change-replication-ip", "",
"--change-replication-port", "0",
"--change-device", "",
"--change-meta", ""]
new_cmd_format, opts, args = validate_args(argv)
self.assertTrue(new_cmd_format)
argv = \
["--id", None, "--region", None, "--zone", None,
"--ip", "",
"--port", "0",
"--replication-ip", "",
"--replication-port", "0",
"--device", "",
"--meta", "",
"--weight", None,
"--change-ip", "change.test.test.com",
"--change-port", "6201",
"--change-replication-ip", "change.r.test.com",
"--change-replication-port", "7001",
"--change-device", "sdb3",
"--change-meta", "some meta data for change"]
new_cmd_format, opts, args = validate_args(argv)
self.assertFalse(new_cmd_format)
argv = \
["--id", "0"]
new_cmd_format, opts, args = validate_args(argv)
self.assertTrue(new_cmd_format)
argv = \
["--region", "0"]
new_cmd_format, opts, args = validate_args(argv)
self.assertTrue(new_cmd_format)
argv = \
["--zone", "0"]
new_cmd_format, opts, args = validate_args(argv)
self.assertTrue(new_cmd_format)
argv = \
["--weight", "0"]
new_cmd_format, opts, args = validate_args(argv)
self.assertTrue(new_cmd_format)
def test_parse_args(self):
argv = \
["--id", "1", "--region", "2", "--zone", "3",
"--ip", "test.test.com",
"--port", "6200",
"--replication-ip", "r.test.com",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359",
"--change-ip", "change.test.test.com",
"--change-port", "6201",
"--change-replication-ip", "change.r.test.com",
"--change-replication-port", "7001",
"--change-device", "sdb3",
"--change-meta", "some meta data for change"]
opts, args = parse_args(argv)
self.assertEqual(opts.id, 1)
self.assertEqual(opts.region, 2)
self.assertEqual(opts.zone, 3)
self.assertEqual(opts.ip, "test.test.com")
self.assertEqual(opts.port, 6200)
self.assertEqual(opts.replication_ip, "r.test.com")
self.assertEqual(opts.replication_port, 7000)
self.assertEqual(opts.device, "sda3")
self.assertEqual(opts.meta, "some meta data")
self.assertEqual(opts.weight, 3.14159265359)
self.assertEqual(opts.change_ip, "change.test.test.com")
self.assertEqual(opts.change_port, 6201)
self.assertEqual(opts.change_replication_ip, "change.r.test.com")
self.assertEqual(opts.change_replication_port, 7001)
self.assertEqual(opts.change_device, "sdb3")
self.assertEqual(opts.change_meta, "some meta data for change")
self.assertEqual(len(args), 0)
def test_parse_builder_ring_filename_args(self):
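        # a builder filename implies the ring filename, and vice versa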
args = 'swift-ring-builder object.builder write_ring'
self.assertEqual((
'object.builder', 'object.ring.gz'
), parse_builder_ring_filename_args(args.split()))
args = 'swift-ring-builder container.ring.gz write_builder'
self.assertEqual((
'container.builder', 'container.ring.gz'
), parse_builder_ring_filename_args(args.split()))
# builder name arg should always fall through
args = 'swift-ring-builder test create'
self.assertEqual((
'test', 'test.ring.gz'
), parse_builder_ring_filename_args(args.split()))
args = 'swift-ring-builder my.file.name create'
self.assertEqual((
'my.file.name', 'my.file.name.ring.gz'
), parse_builder_ring_filename_args(args.split()))
def test_build_dev_from_opts(self):
argv = \
["--region", "0", "--zone", "3",
"--ip", "test.test.com",
"--port", "6200",
"--replication-ip", "r.test.com",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359"]
expected = {
'region': 0,
'zone': 3,
'ip': "test.test.com",
'port': 6200,
'replication_ip': "r.test.com",
'replication_port': 7000,
'device': "sda3",
'meta': "some meta data",
'weight': 3.14159265359,
}
opts, args = parse_args(argv)
device = build_dev_from_opts(opts)
self.assertEqual(device, expected)
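        # invalid option sets (a bracketed hostname, or one missing
        # --device) must raise ValueError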
argv = \
["--region", "2", "--zone", "3",
"--ip", "[test.test.com]",
"--port", "6200",
"--replication-ip", "[r.test.com]",
"--replication-port", "7000",
"--device", "sda3",
"--meta", "some meta data",
"--weight", "3.14159265359"]
opts, args = parse_args(argv)
self.assertRaises(ValueError, build_dev_from_opts, opts)
argv = \
["--region", "2", "--zone", "3",
"--ip", "[test.test.com]",
"--port", "6200",
"--replication-ip", "[r.test.com]",
"--replication-port", "7000",
"--meta", "some meta data",
"--weight", "3.14159265359"]
opts, args = parse_args(argv)
self.assertRaises(ValueError, build_dev_from_opts, opts)
def test_replication_defaults(self):
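        # replication ip/port default to the device's ip/port when not
        # supplied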
args = '-r 1 -z 1 -i 127.0.0.1 -p 6010 -d d1 -w 100'.split()
opts, _ = parse_args(args)
device = build_dev_from_opts(opts)
expected = {
'device': 'd1',
'ip': '127.0.0.1',
'meta': '',
'port': 6010,
'region': 1,
'replication_ip': '127.0.0.1',
'replication_port': 6010,
'weight': 100.0,
'zone': 1,
}
self.assertEqual(device, expected)
args = '-r 1 -z 1 -i test.com -p 6010 -d d1 -w 100'.split()
opts, _ = parse_args(args)
device = build_dev_from_opts(opts)
expected = {
'device': 'd1',
'ip': 'test.com',
'meta': '',
'port': 6010,
'region': 1,
'replication_ip': 'test.com',
'replication_port': 6010,
'weight': 100.0,
'zone': 1,
}
self.assertEqual(device, expected)
def test_dispersion_report(self):
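        # a 2**8 partition, 3 replica builder with min_part_hours=0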
rb = ring.RingBuilder(8, 3, 0)
rb.add_dev({'id': 0, 'region': 1, 'zone': 0, 'weight': 100,
'ip': '127.0.0.0', 'port': 10000, 'device': 'sda1'})
rb.add_dev({'id': 3, 'region': 1, 'zone': 0, 'weight': 100,
'ip': '127.0.0.0', 'port': 10000, 'device': 'sdb1'})
rb.add_dev({'id': 4, 'region': 1, 'zone': 0, 'weight': 100,
'ip': '127.0.0.0', 'port': 10000, 'device': 'sdc1'})
rb.add_dev({'id': 5, 'region': 1, 'zone': 0, 'weight': 100,
'ip': '127.0.0.0', 'port': 10000, 'device': 'sdd1'})
rb.add_dev({'id': 1, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.1', 'port': 10001, 'device': 'sda1'})
rb.add_dev({'id': 6, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.1', 'port': 10001, 'device': 'sdb1'})
rb.add_dev({'id': 7, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.1', 'port': 10001, 'device': 'sdc1'})
rb.add_dev({'id': 8, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.1', 'port': 10001, 'device': 'sdd1'})
rb.add_dev({'id': 2, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.2', 'port': 10002, 'device': 'sda1'})
rb.add_dev({'id': 9, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.2', 'port': 10002, 'device': 'sdb1'})
rb.add_dev({'id': 10, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.2', 'port': 10002, 'device': 'sdc1'})
rb.add_dev({'id': 11, 'region': 1, 'zone': 1, 'weight': 200,
'ip': '127.0.0.2', 'port': 10002, 'device': 'sdd1'})
# this ring is pretty volatile and the assertions are pretty brittle
# so we use a specific seed
rb.rebalance(seed=100)
rb.validate()
self.assertEqual(rb.dispersion, 39.84375)
report = dispersion_report(rb)
self.assertEqual(report['worst_tier'], 'r1z1')
self.assertEqual(report['max_dispersion'], 39.84375)
def build_tier_report(max_replicas, placed_parts, dispersion,
replicas):
return {
'max_replicas': max_replicas,
'placed_parts': placed_parts,
'dispersion': dispersion,
'replicas': replicas,
}
# Each node should store 256 partitions to avoid multiple replicas
# 2/5 of total weight * 768 ~= 307 -> 51 partitions on each node in
# zone 1 are stored at least twice on the nodes
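        # (rough derivation of the asserted numbers, assuming the builder
        # above uses 256 partitions and 3 replicas: each zone-1 node wants
        # ~307 part-replicas by weight but can hold at most 256 distinct
        # parts, leaving ~51 parts stored twice on a node -> 51 / 256 =
        # 19.921875% node dispersion and 102 / 256 = 39.84375% for the
        # zone, matching the report values asserted below)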
expected = [
['r1z1', build_tier_report(
2, 256, 39.84375, [0, 0, 154, 102])],
['r1z1-127.0.0.1', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
['r1z1-127.0.0.2', build_tier_report(
1, 256, 19.921875, [0, 205, 51, 0])],
]
report = dispersion_report(rb, 'r1z1[^/]*$', verbose=True)
graph = report['graph']
for i, (expected_key, expected_report) in enumerate(expected):
key, report = graph[i]
self.assertEqual(
(key, report),
(expected_key, expected_report)
)
# overcompensate in r1z0
rb.add_dev({'id': 12, 'region': 1, 'zone': 0, 'weight': 500,
'ip': '127.0.0.3', 'port': 10003, 'device': 'sda1'})
rb.add_dev({'id': 13, 'region': 1, 'zone': 0, 'weight': 500,
'ip': '127.0.0.3', 'port': 10003, 'device': 'sdb1'})
rb.add_dev({'id': 14, 'region': 1, 'zone': 0, 'weight': 500,
'ip': '127.0.0.3', 'port': 10003, 'device': 'sdc1'})
rb.add_dev({'id': 15, 'region': 1, 'zone': 0, 'weight': 500,
'ip': '127.0.0.3', 'port': 10003, 'device': 'sdd1'})
# when the biggest tier has the smallest devices things get ugly
# can't move all the part-replicas in one rebalance
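        # (presumably because a rebalance limits how much it will move in a
        # single pass, the placement is still in flux across the passes
        # below - note the differing dispersion values asserted after each
        # rebalance)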
rb.rebalance(seed=100)
report = dispersion_report(rb, verbose=True)
self.assertEqual(rb.dispersion, 9.375)
self.assertEqual(report['worst_tier'], 'r1z1-127.0.0.1')
self.assertEqual(report['max_dispersion'], 7.18562874251497)
        # do a second rebalance
rb.rebalance(seed=100)
report = dispersion_report(rb, verbose=True)
self.assertEqual(rb.dispersion, 50.0)
self.assertEqual(report['worst_tier'], 'r1z0-127.0.0.3')
self.assertEqual(report['max_dispersion'], 50.0)
# ... but overload can square it
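        # (get_required_overload() is understood to report the overload
        # needed for smaller failure domains to take more than their
        # weight-proportional share; applying it and rebalancing should
        # drive dispersion back to zero, as asserted below)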
rb.set_overload(rb.get_required_overload())
rb.rebalance()
self.assertEqual(rb.dispersion, 0.0)
def test_parse_address_old_format(self):
# Test old format
argv = "127.0.0.1:6200R127.0.0.1:6200/sda1_some meta data"
ip, port, rest = parse_address(argv)
self.assertEqual(ip, '127.0.0.1')
self.assertEqual(port, 6200)
self.assertEqual(rest, 'R127.0.0.1:6200/sda1_some meta data')
if __name__ == '__main__':
unittest.main()