From 6ae46796a61fc97467450b5bdd51dc6a0c86f9f4 Mon Sep 17 00:00:00 2001
From: Dmitry Mescheryakov <dmescheryakov@mirantis.com>
Date: Mon, 23 Nov 2015 17:27:24 +0300
Subject: [PATCH] Use round robin failover strategy for Kombu driver

Shuffle strategy we use right now leads to increased reconnection time
and provides no benefit. Sometimes it might lead to RPC operations
timeout because the strategy provides no guarantee on how long the
reconnection process will take. See the referenced bug for details.

On the other side, round-robin strategy provides least achievable
reconnection time. It also provides guarantee that if K of N RabbitMQ
hosts are alive, it will take at most N - K + 1 attempts to
successfully reconnect to RabbitMQ cluster.

With shuffle strategy during failover clients connect to random hosts
and so the load is distributed evenly between alive RabbitMQs.
But since we shuffle list of hosts before providing it to Kombu, load
will be distributed evenly with round-robin strategy as well.

DocImpact
A new configuration option kombu_failover_strategy for Kombu driver is
added. It determines how the next RabbitMQ node is chosen in case the
one we are currently connected to becomes unavailable. It takes effect
only if more than one RabbitMQ node is provided in config. Available
options are:

 * round-robin: each RabbitMQ host in the list is tried in cycle until
   oslo.messaging successfully connects. Since oslo.messaging
   shuffles list of RabbitMQ hosts, the order of hosts in the cycle
   will be random and will not depend on order provided in config.

 * shuffle: oslo.messaging selects a random host from the list and
   tries to connect to it. If connection fails, oslo.messaging repeats
   attempt to connect to another random host. Oslo.messaging stops
   once it successfully connects to a host. Note that in each
   iteration a host to connect is selected independently of previous
   iterations, i.e. it might happen that oslo.messaging will try to
   connect to the same host several times in a row.

The option's default value is round-robin. Before the option was
introduced, the default strategy was shuffle. For the reasoning,
see the main body of the commit message and the referenced bug.

Closes-Bug: #1519851
Change-Id: I9a510c86bd5a6ce8b707734385af1a83de82804e
---
 oslo_messaging/_drivers/impl_rabbit.py           | 10 +++++++++-
 oslo_messaging/tests/drivers/test_impl_rabbit.py |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/oslo_messaging/_drivers/impl_rabbit.py b/oslo_messaging/_drivers/impl_rabbit.py
index be41cc0f5..892961c42 100644
--- a/oslo_messaging/_drivers/impl_rabbit.py
+++ b/oslo_messaging/_drivers/impl_rabbit.py
@@ -84,6 +84,13 @@ rabbit_opts = [
                help='How long to wait before considering a reconnect '
                     'attempt to have failed. This value should not be '
                     'longer than rpc_response_timeout.'),
+    cfg.StrOpt('kombu_failover_strategy',
+               choices=('round-robin', 'shuffle'),
+               default='round-robin',
+               help='Determines how the next RabbitMQ node is chosen in case '
+                    'the one we are currently connected to becomes '
+                    'unavailable. Takes effect only if more than one '
+                    'RabbitMQ node is provided in config.'),
     cfg.StrOpt('rabbit_host',
                default='localhost',
                deprecated_group='DEFAULT',
@@ -377,6 +384,7 @@ class Connection(object):
         self.amqp_auto_delete = driver_conf.amqp_auto_delete
         self.rabbit_use_ssl = driver_conf.rabbit_use_ssl
         self.kombu_reconnect_timeout = driver_conf.kombu_reconnect_timeout
+        self.kombu_failover_strategy = driver_conf.kombu_failover_strategy
 
         if self.rabbit_use_ssl:
             self.kombu_ssl_version = driver_conf.kombu_ssl_version
@@ -456,8 +464,8 @@ class Connection(object):
         self.connection = kombu.connection.Connection(
             self._url, ssl=self._fetch_ssl_params(),
             login_method=self.login_method,
-            failover_strategy="shuffle",
             heartbeat=self.heartbeat_timeout_threshold,
+            failover_strategy=self.kombu_failover_strategy,
             transport_options={
                 'confirm_publish': True,
                 'on_blocked': self._on_connection_blocked,
diff --git a/oslo_messaging/tests/drivers/test_impl_rabbit.py b/oslo_messaging/tests/drivers/test_impl_rabbit.py
index 06c78982a..db1a8c3df 100644
--- a/oslo_messaging/tests/drivers/test_impl_rabbit.py
+++ b/oslo_messaging/tests/drivers/test_impl_rabbit.py
@@ -169,7 +169,7 @@ class TestRabbitDriverLoadSSL(test_utils.BaseTestCase):
                                              'on_blocked': mock.ANY,
                                              'on_unblocked': mock.ANY},
             ssl=self.expected, login_method='AMQPLAIN',
-            heartbeat=60, failover_strategy="shuffle")
+            heartbeat=60, failover_strategy='round-robin')
 
 
 class TestRabbitPublisher(test_utils.BaseTestCase):