From 6ae46796a61fc97467450b5bdd51dc6a0c86f9f4 Mon Sep 17 00:00:00 2001 From: Dmitry Mescheryakov <dmescheryakov@mirantis.com> Date: Mon, 23 Nov 2015 17:27:24 +0300 Subject: [PATCH] Use round robin failover strategy for Kombu driver Shuffle strategy we use right now leads to increased reconnection time and provides no benefit. Sometimes it might lead to RPC operations timeout because the strategy provides no guarantee on how long the reconnection process will take. See the referenced bug for details. On the other side, round-robin strategy provides least achievable reconnection time. It also provides guarantee that if K of N RabbitMQ hosts are alive, it will take at most N - K + 1 attempts to successfully reconnect to RabbitMQ cluster. With shuffle strategy during failover clients connect to random hosts and so the load is distributed evenly between alive RabbitMQs. But since we shuffle list of hosts before providing it to Kombu, load will be distributed evenly with round-robin strategy as well. DocImpact A new configuration option kombu_failover_strategy for Kombu driver is added. It determines how the next RabbitMQ node is chosen in case the one we are currently connected to becomes unavailable. It takes effect only if more than one RabbitMQ node is provided in config. Available options are: * round-robin: each RabbitMQ host in the list is tried in cycle until oslo.messaging successfully connects. Since oslo.messaging shuffles list of RabbitMQ hosts, the order of hosts in the cycle will be random and will not depend on order provided in config. * shuffle: oslo.messaging selects a random host from the list and tries to connect to it. If connection fails, oslo.messaging repeats attempt to connect to another random host. Oslo.messaging stops once it successfully connects to a host. Note that in each iteration a host to connect is selected independently of previous iterations, i.e. it might happen that oslo.messaging will try to connect to the same host several times in a row. The option's default value is round-robin. Before the option was introduced, the default strategy was shuffle. For the reasoning, see the main body of the commit message and the referenced bug. Closes-Bug: #1519851 Change-Id: I9a510c86bd5a6ce8b707734385af1a83de82804e --- oslo_messaging/_drivers/impl_rabbit.py | 10 +++++++++- oslo_messaging/tests/drivers/test_impl_rabbit.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/oslo_messaging/_drivers/impl_rabbit.py b/oslo_messaging/_drivers/impl_rabbit.py index be41cc0f5..892961c42 100644 --- a/oslo_messaging/_drivers/impl_rabbit.py +++ b/oslo_messaging/_drivers/impl_rabbit.py @@ -84,6 +84,13 @@ rabbit_opts = [ help='How long to wait before considering a reconnect ' 'attempt to have failed. This value should not be ' 'longer than rpc_response_timeout.'), + cfg.StrOpt('kombu_failover_strategy', + choices=('round-robin', 'shuffle'), + default='round-robin', + help='Determines how the next RabbitMQ node is chosen in case ' + 'the one we are currently connected to becomes ' + 'unavailable. Takes effect only if more than one ' + 'RabbitMQ node is provided in config.'), cfg.StrOpt('rabbit_host', default='localhost', deprecated_group='DEFAULT', @@ -377,6 +384,7 @@ class Connection(object): self.amqp_auto_delete = driver_conf.amqp_auto_delete self.rabbit_use_ssl = driver_conf.rabbit_use_ssl self.kombu_reconnect_timeout = driver_conf.kombu_reconnect_timeout + self.kombu_failover_strategy = driver_conf.kombu_failover_strategy if self.rabbit_use_ssl: self.kombu_ssl_version = driver_conf.kombu_ssl_version @@ -456,8 +464,8 @@ class Connection(object): self.connection = kombu.connection.Connection( self._url, ssl=self._fetch_ssl_params(), login_method=self.login_method, - failover_strategy="shuffle", heartbeat=self.heartbeat_timeout_threshold, + failover_strategy=self.kombu_failover_strategy, transport_options={ 'confirm_publish': True, 'on_blocked': self._on_connection_blocked, diff --git a/oslo_messaging/tests/drivers/test_impl_rabbit.py b/oslo_messaging/tests/drivers/test_impl_rabbit.py index 06c78982a..db1a8c3df 100644 --- a/oslo_messaging/tests/drivers/test_impl_rabbit.py +++ b/oslo_messaging/tests/drivers/test_impl_rabbit.py @@ -169,7 +169,7 @@ class TestRabbitDriverLoadSSL(test_utils.BaseTestCase): 'on_blocked': mock.ANY, 'on_unblocked': mock.ANY}, ssl=self.expected, login_method='AMQPLAIN', - heartbeat=60, failover_strategy="shuffle") + heartbeat=60, failover_strategy='round-robin') class TestRabbitPublisher(test_utils.BaseTestCase):