Rabbit: Ensure node has joined cluster on initial startup

This PS extends the rabbit startup logic to ensure nodes have
actually joined the cluster on startup.

Change-Id: Ib876d9abd89209d0a7972983bdf4daacf5f8f582
Signed-off-by: Pete Birley <pete@port.direct>
This commit is contained in:
Pete Birley 2019-05-16 16:10:32 -05:00 committed by Pete Birley
parent 819cf51083
commit 9b5b901104
4 changed files with 75 additions and 33 deletions

View File

@ -18,4 +18,8 @@ limitations under the License.
set -e
exec rabbitmqctl status
# Readiness check. While the marker file /run/rabbit-disable-readiness exists
# the probe fails unconditionally; otherwise the probe's result is whatever
# `rabbitmqctl status` returns (exec replaces this shell with that command).
if [ ! -f /run/rabbit-disable-readiness ]; then
  exec rabbitmqctl status
fi
exit 1

View File

@ -29,10 +29,15 @@ function check_rabbit_node_health () {
rabbitmq-diagnostics node_health_check -n "${CLUSTER_SEED_NAME}" -t 10 &>/dev/null
}
function check_rabbit_node_ready () {
# Print the RabbitMQ node name of a sibling pod.
# Globals:   MY_POD_NAME, RABBITMQ_NODENAME (read);
#            TARGET_POD, POD_NAME_PREFIX, CLUSTER_SEED_NAME (written)
# Arguments: $1 - suffix that replaces the last dash-separated field of
#                 MY_POD_NAME (e.g. "0")
# Outputs:   RABBITMQ_NODENAME with this pod's name swapped for the target
#            pod's name, on stdout
get_node_name () {
  TARGET_POD=$1
  # Drop the last dash-separated field of this pod's name to get the prefix
  # shared with the target pod.
  POD_NAME_PREFIX="$(echo "${MY_POD_NAME}" | awk 'BEGIN{FS=OFS="-"}{NF--; print}')"
  # Split our own node name around "@<my pod name>." and stitch the target
  # pod's name in between the two halves.
  CLUSTER_SEED_NAME="$(
    echo "${RABBITMQ_NODENAME}" \
      | awk -F "@${MY_POD_NAME}." "{ print \$1 \"@${POD_NAME_PREFIX}-${TARGET_POD}.\" \$2 }"
  )"
  printf '%s\n' "${CLUSTER_SEED_NAME}"
}
function check_rabbit_node_ready () {
TARGET_POD=$1
CLUSTER_SEED_NAME="$(get_node_name ${TARGET_POD})"
CLUSTER_SEED_HOST="$(echo "${CLUSTER_SEED_NAME}" | awk -F '@' '{ print $NF }')"
check_rabbit_node_health "${CLUSTER_SEED_NAME}" && \
check_if_open "${CLUSTER_SEED_HOST}" "${PORT_HTTP}" && \
@ -56,7 +61,39 @@ if ! [ "${POD_INCREMENT}" -eq "0" ] && ! [ -d "/var/lib/rabbitmq/mnesia" ] ; the
fi
done
done
rm -fv /run/rabbit-disable-liveness-probe
# Give up on the current start-up attempt: stop the local server, delete
# everything matched by /var/lib/rabbitmq/*, and terminate the script with
# status 1. Never returns to the caller.
reset_rabbit () {
  # Best effort: a failed shutdown must not prevent the wipe below.
  rabbitmqctl shutdown || :
  rm -rf /var/lib/rabbitmq/*
  exit 1
}
# Start RabbitMQ, but disable readiness from being reported so the pod is not
# marked as up prematurely.
touch /run/rabbit-disable-readiness
# Run the server in the background so this script can keep polling it.
rabbitmq-server &
# Wait for server to start, and reset if it does not
# END is a deadline 180 seconds from now, expressed in epoch seconds.
END=$(($(date +%s) + 180))
while ! rabbitmqctl -q cluster_status; do
sleep 5
NOW=$(date +%s)
# Past the deadline: reset_rabbit wipes local state and exits the script.
[ $NOW -gt $END ] && reset_rabbit
done
# Wait for server to join cluster, reset if it does not
# POD_INCREMENT is the last dash-separated field of this pod's name.
POD_INCREMENT=$(echo "${MY_POD_NAME}" | awk -F '-' '{print $NF}')
# Fresh 180-second deadline for the join phase.
END=$(($(date +%s) + 180))
# Query the node returned by get_node_name 0 for its cluster status and look
# for this pod's own node name in the output.
while ! rabbitmqctl -l --node $(get_node_name 0) -q cluster_status | grep -q "$(get_node_name ${POD_INCREMENT})"; do
sleep 5
NOW=$(date +%s)
[ $NOW -gt $END ] && reset_rabbit
done
# Shutdown the initial server
rabbitmqctl shutdown
# Remove the readiness marker created above together with the liveness marker
# (NOTE(review): the liveness marker is created outside this view - confirm).
rm -fv /run/rabbit-disable-readiness /run/rabbit-disable-liveness-probe
fi
exec rabbitmq-server

View File

@ -16,7 +16,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/}}
set -e
set -ex
# Extract connection details
RABBIT_HOSTNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \
@ -24,22 +24,30 @@ RABBIT_HOSTNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \
RABBIT_PORT=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $2}' \
| awk -F'[:/]' '{print $2}'`
set +x
# Extract Admin User credentials
RABBITMQ_ADMIN_USERNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \
| awk -F'[//:]' '{print $4}'`
RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \
| awk -F'[//:]' '{print $5}'`
set -x
function rabbit_check_node_count () {
echo "Checking node count "
NODES_IN_CLUSTER=$(rabbitmqadmin \
#######################################
# Run rabbitmqadmin with the admin connection details filled in.
# Command tracing is switched off around the call so the password given on
# the command line is not echoed by `set -x`; tracing is switched back on
# afterwards.
# Globals:   RABBIT_HOSTNAME, RABBIT_PORT, RABBITMQ_ADMIN_USERNAME,
#            RABBITMQ_ADMIN_PASSWORD (read)
# Arguments: the rabbitmqadmin sub-command and its options, passed through
#            verbatim (each argument stays a single word)
# Outputs:   whatever rabbitmqadmin writes to stdout/stderr
# Returns:   the exit status of rabbitmqadmin
#######################################
function rabbitmqadmin_authed () {
  local rc=0
  set +x
  # Capture the status here: the trailing `set -x` would otherwise become the
  # function's return value and hide a failed call.
  rabbitmqadmin \
    --host="${RABBIT_HOSTNAME}" \
    --port="${RABBIT_PORT}" \
    --username="${RABBITMQ_ADMIN_USERNAME}" \
    --password="${RABBITMQ_ADMIN_PASSWORD}" \
    "$@" || rc=$?
  set -x
  return "${rc}"
}
function rabbit_check_node_count () {
echo "Checking node count "
NODES_IN_CLUSTER=$(rabbitmqadmin_authed list nodes -f bash | wc -w)
if [ "$NODES_IN_CLUSTER" -eq "$RABBIT_REPLICA_COUNT" ]; then
echo "Number of nodes in cluster match number of desired pods ($NODES_IN_CLUSTER)"
echo "Number of nodes in cluster ($NODES_IN_CLUSTER) match number of desired pods ($NODES_IN_CLUSTER)"
else
echo "Number of nodes in cluster ($NODES_IN_CLUSTER) does not match number of desired pods ($RABBIT_REPLICA_COUNT)"
exit 1
@ -49,13 +57,9 @@ function rabbit_check_node_count () {
rabbit_check_node_count
function rabbit_find_partitions () {
rabbitmqadmin \
--host="${RABBIT_HOSTNAME}" \
--port="${RABBIT_PORT}" \
--username="${RABBITMQ_ADMIN_USERNAME}" \
--password="${RABBITMQ_ADMIN_PASSWORD}" \
list nodes -f raw_json | \
python -c "
NODE_INFO=$(mktemp)
rabbitmqadmin_authed list nodes -f pretty_json | tee "${NODE_INFO}"
cat "${NODE_INFO}" | python -c "
import json, sys, traceback
print('Checking cluster partitions')
obj=json.load(sys.stdin)
@ -66,31 +70,20 @@ for num, node in enumerate(obj):
raise Exception('cluster partition found: %s' % partition)
except KeyError:
print('Error: partition key not found for node %s' % node)
sys.exit(1)
print('No cluster partitions found')
"
rm -vf "${NODE_INFO}"
}
rabbit_find_partitions
function rabbit_check_users_match () {
echo "Checking users match on all nodes"
NODES=$(rabbitmqadmin \
--host="${RABBIT_HOSTNAME}" \
--port="${RABBIT_PORT}" \
--username="${RABBITMQ_ADMIN_USERNAME}" \
--password="${RABBITMQ_ADMIN_PASSWORD}" \
list nodes -f bash)
NODES=$(rabbitmqadmin_authed list nodes -f bash)
USER_LIST=$(mktemp --directory)
echo "Found the following nodes: ${NODES}"
for NODE in ${NODES}; do
echo "Checking Node: ${NODE#*@}"
rabbitmqadmin \
--host=${NODE#*@} \
--port="${RABBIT_PORT}" \
--username="${RABBITMQ_ADMIN_USERNAME}" \
--password="${RABBITMQ_ADMIN_PASSWORD}" \
list users -f bash > ${USER_LIST}/${NODE#*@}
rabbitmqadmin_authed list users -f bash > ${USER_LIST}/${NODE#*@}
done
cd ${USER_LIST}; diff -q --from-file $(ls ${USER_LIST})
echo "User lists match for all nodes"

View File

@ -30,13 +30,21 @@ RABBITMQ_ADMIN_USERNAME=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $
RABBITMQ_ADMIN_PASSWORD=`echo $RABBITMQ_ADMIN_CONNECTION | awk -F'[@]' '{print $1}' \
| awk -F'[//:]' '{print $5}'`
function active_rabbit_nodes () {
set -ex
#######################################
# Run rabbitmqadmin with the admin connection details filled in.
# Command tracing is switched off around the call so the password given on
# the command line is not echoed by `set -x`; tracing is switched back on
# afterwards.
# Globals:   RABBIT_HOSTNAME, RABBIT_PORT, RABBITMQ_ADMIN_USERNAME,
#            RABBITMQ_ADMIN_PASSWORD (read)
# Arguments: the rabbitmqadmin sub-command and its options, passed through
#            verbatim (each argument stays a single word)
# Outputs:   whatever rabbitmqadmin writes to stdout/stderr
# Returns:   the exit status of rabbitmqadmin
#######################################
function rabbitmqadmin_authed () {
  local rc=0
  set +x
  # Capture the status here: the trailing `set -x` would otherwise become the
  # function's return value and hide a failed call.
  rabbitmqadmin \
    --host="${RABBIT_HOSTNAME}" \
    --port="${RABBIT_PORT}" \
    --username="${RABBITMQ_ADMIN_USERNAME}" \
    --password="${RABBITMQ_ADMIN_PASSWORD}" \
    "$@" || rc=$?
  set -x
  return "${rc}"
}
# Print how many nodes the cluster currently lists: the word count of the
# output of `rabbitmqadmin_authed list nodes -f bash`.
active_rabbit_nodes () {
  rabbitmqadmin_authed list nodes -f bash \
    | wc -w
}
until test "$(active_rabbit_nodes)" -ge "$RABBIT_REPLICA_COUNT"; do