From 94f3ce0c78998e29fcc034a9b0844f9d6d602807 Mon Sep 17 00:00:00 2001
From: John Garbutt <john.garbutt@stackhpc.com>
Date: Fri, 17 Dec 2021 16:20:32 +0000
Subject: [PATCH] RabbitMQ: Support setting ha-promote-on-shutdown

By default ha-promote-on-shutdown=when-synced. However, we are seeing
RabbitMQ fail to recover automatically when nodes are restarted.
https://www.rabbitmq.com/ha.html#cluster-shutdown

Rather than waiting for operator intervention, it is better to allow
recovery to happen, even if that means we may lose some messages.
A few failed and timed-out operations are better than a totally broken
cloud. This is achieved using ha-promote-on-shutdown=always.

Note, when a node failure is detected, this is already the default
behaviour from 3.7.5 onwards:
https://www.rabbitmq.com/ha.html#promoting-unsynchronised-mirrors

This patch adds the option to change the ha-promote-on-shutdown
definition, using the flag `rabbitmq_ha_promote_on_shutdown`. This
value is unset by default to avoid any unexpected changes to the
RabbitMQ definitions.json file, as that would trigger an unexpected
restart of RabbitMQ during the next deploy.

Related-Bug: #1954925

Change-Id: I2146bda2c72ddac2c9923c6941b0596395fd9ab5
---
 ansible/roles/rabbitmq/defaults/main.yml            |  5 +++++
 .../roles/rabbitmq/templates/definitions.json.j2    |  4 ++--
 ...tmq-ha-promote-on-shutdown-9099c6643f2d0cce.yaml | 13 +++++++++++++
 3 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 releasenotes/notes/rabbitmq-ha-promote-on-shutdown-9099c6643f2d0cce.yaml

diff --git a/ansible/roles/rabbitmq/defaults/main.yml b/ansible/roles/rabbitmq/defaults/main.yml
index 388369f58b..06aaf8a9c4 100644
--- a/ansible/roles/rabbitmq/defaults/main.yml
+++ b/ansible/roles/rabbitmq/defaults/main.yml
@@ -84,6 +84,11 @@ rabbitmq_server_additional_erl_args: "+S 2:2 +sbwt none +sbwtdcpu none +sbwtdio
 rabbitmq_tls_options: {}
 # To avoid split-brain
 rabbitmq_cluster_partition_handling: "pause_minority"
+# For consistency use "when-synced", for availability use "always"
+# The rabbitmq default for ha queues is "when-synced"
+# For more details see:
+# https://www.rabbitmq.com/ha.html#promoting-unsynchronised-mirrors
+rabbitmq_ha_promote_on_shutdown:
 rabbitmq_extra_config: {}
 
 ####################
diff --git a/ansible/roles/rabbitmq/templates/definitions.json.j2 b/ansible/roles/rabbitmq/templates/definitions.json.j2
index 450a04df49..d04a0deabd 100644
--- a/ansible/roles/rabbitmq/templates/definitions.json.j2
+++ b/ansible/roles/rabbitmq/templates/definitions.json.j2
@@ -18,8 +18,8 @@
   ],
 {% if om_enable_rabbitmq_high_availability | bool %}
   "policies":[
-    {"vhost": "/", "name": "ha-all", "pattern": "^(?!(amq\\.)|(.*_fanout_)|(reply_)).*", "apply-to": "all", "definition": {"ha-mode":"all"}, "priority":0}{% if project_name == 'outward_rabbitmq' %},
-    {"vhost": "{{ murano_agent_rabbitmq_vhost }}", "name": "ha-all", "pattern": "^(?!(amq\\.)|(.*_fanout_)|(reply_)).*", "apply-to": "all", "definition": {"ha-mode":"all"}, "priority":0}
+    {"vhost": "/", "name": "ha-all", "pattern": "^(?!(amq\\.)|(.*_fanout_)|(reply_)).*", "apply-to": "all", "definition": {"ha-mode":"all"{% if rabbitmq_ha_promote_on_shutdown is not none %},"ha-promote-on-shutdown":"{{ rabbitmq_ha_promote_on_shutdown }}"{% endif %}}, "priority":0}{% if project_name == 'outward_rabbitmq' %},
+    {"vhost": "{{ murano_agent_rabbitmq_vhost }}", "name": "ha-all", "pattern": "^(?!(amq\\.)|(.*_fanout_)|(reply_)).*", "apply-to": "all", "definition": {"ha-mode":"all"{% if rabbitmq_ha_promote_on_shutdown is not none %},"ha-promote-on-shutdown":"{{ rabbitmq_ha_promote_on_shutdown }}"{% endif %}}, "priority":0}
     {% endif %}
   ]
 {% else %}
diff --git a/releasenotes/notes/rabbitmq-ha-promote-on-shutdown-9099c6643f2d0cce.yaml b/releasenotes/notes/rabbitmq-ha-promote-on-shutdown-9099c6643f2d0cce.yaml
new file mode 100644
index 0000000000..c97d3b68a7
--- /dev/null
+++ b/releasenotes/notes/rabbitmq-ha-promote-on-shutdown-9099c6643f2d0cce.yaml
@@ -0,0 +1,13 @@
+---
+features:
+  - |
+    The config option `rabbitmq_ha_promote_on_shutdown` has been added, which
+    allows changing the RabbitMQ definition `ha-promote-on-shutdown`. By
+    default `ha-promote-on-shutdown` is "when-synced". We recommend changing
+    this to be "always". This means we accept losing some messages in
+    exchange for giving priority to RabbitMQ availability. This is most
+    relevant when restarting RabbitMQ, such as when upgrading. Note that
+    setting the value of this flag, even to the default value of "when-synced",
+    will cause RabbitMQ to be restarted on the next deploy.
+    For more details please see:
+    https://www.rabbitmq.com/ha.html#cluster-shutdown