From 4d629d3db60056655afa6f9549c687654eecb66d Mon Sep 17 00:00:00 2001
From: Stephen Taylor
Date: Tue, 12 Oct 2021 14:36:43 -0600
Subject: [PATCH] [ceph-mon] Prevent mon-check from removing mons when down temporarily

A race condition exists that can cause the mon-check pod to delete
mons from the monmap that are only down temporarily. This sometimes
causes issues with the monmap when those mons come back up. This
change adds a check to see if the list of mons in the monmap is
larger than expected before removing anything. If not, the monmap
is left alone.

Change-Id: I43b186bf80741fc178c6806d24c179417d7f2406
---
Review note: the _start.sh.tpl hunk was edited by hand after export, so
the blob ids on its index line no longer describe the resulting file.

 ceph-mon/Chart.yaml                           |  2 +-
 ceph-mon/templates/bin/moncheck/_start.sh.tpl | 49 +++++++++++++++++--
 releasenotes/notes/ceph-mon.yaml              |  1 +
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/ceph-mon/Chart.yaml b/ceph-mon/Chart.yaml
index daeea5cbf..4cbb703cd 100644
--- a/ceph-mon/Chart.yaml
+++ b/ceph-mon/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph Mon
 name: ceph-mon
-version: 0.1.14
+version: 0.1.15
 home: https://github.com/ceph/ceph
 ...
diff --git a/ceph-mon/templates/bin/moncheck/_start.sh.tpl b/ceph-mon/templates/bin/moncheck/_start.sh.tpl
index 65141d640..9091826c1 100644
--- a/ceph-mon/templates/bin/moncheck/_start.sh.tpl
+++ b/ceph-mon/templates/bin/moncheck/_start.sh.tpl
@@ -24,13 +24,58 @@ function check_mon_msgr2 {
  fi
 }
 
+# Print the length of the JSON document returned by
+# `ceph mon count-metadata hostname`.
+function get_mon_count {
+  ceph mon count-metadata hostname | jq '. | length'
+}
+
+# Compare each mon's address in `ceph mon dump` with the IP that the
+# ceph-mon-discovery endpoints object lists for the node of the same name,
+# and call `ceph mon set-addrs` when they differ. Mons without usable
+# endpoint data are skipped instead of being given a malformed address.
+function check_mon_addrs {
+  local mon_dump=$(ceph mon dump)
+  local mon_hostnames=$(echo "${mon_dump}" | awk '/mon\./{print $3}' | sed 's/^mon\.//')
+  local mon_endpoints=$(kubectl get endpoints ceph-mon-discovery -n ceph -o json)
+  local v1_port=$(jq '.subsets[0].ports[] | select(.name == "mon") | .port' <<< "${mon_endpoints}")
+  local v2_port=$(jq '.subsets[0].ports[] | select(.name == "mon-msgr2") | .port' <<< "${mon_endpoints}")
+
+  for mon in ${mon_hostnames}; do
+    # Match the mon name field exactly; a regex on the bare hostname would
+    # also select other mons whose names merely contain it.
+    local mon_endpoint=$(echo "${mon_dump}" | awk -v name="mon.${mon}" '$3 == name {print $2}')
+    local mon_ip=$(jq -r --arg node "${mon}" '.subsets[0].addresses[] | select(.nodeName == $node) | .ip' <<< "${mon_endpoints}")
+
+    # An empty IP or port means the endpoints object has no data for this
+    # mon right now; an address built from it would be malformed.
+    if [[ -z "${mon_ip}" || -z "${v1_port}" || -z "${v2_port}" ]]; then
+      echo "no endpoint data for ${mon}, leaving its monmap address alone"
+      continue
+    fi
+
+    local desired_endpoint=$(printf '[v1:%s:%s/0,v2:%s:%s/0]' "${mon_ip}" "${v1_port}" "${mon_ip}" "${v2_port}")
+
+    if [[ "${mon_endpoint}" != "${desired_endpoint}" ]]; then
+      echo "endpoint for ${mon} is ${mon_endpoint}, setting it to ${desired_endpoint}"
+      ceph mon set-addrs "${mon}" "${desired_endpoint}"
+    fi
+  done
+}
 
 function watch_mon_health {
+  previous_mon_count=$(get_mon_count)
   while [ true ]; do
-    echo "checking for zombie mons"
-    python3 /tmp/moncheck-reap-zombies.py || true
+    mon_count=$(get_mon_count)
+    if [[ ${mon_count} -ne ${previous_mon_count} ]]; then
+      echo "checking for zombie mons"
+      python3 /tmp/moncheck-reap-zombies.py || true
+    fi
+    previous_mon_count=${mon_count}
     echo "checking for ceph-mon msgr v2"
     check_mon_msgr2
+    echo "checking mon endpoints in monmap"
+    check_mon_addrs
     echo "sleep 30 sec"
     sleep 30
   done
diff --git a/releasenotes/notes/ceph-mon.yaml b/releasenotes/notes/ceph-mon.yaml
index e071dc960..7d4a74388 100644
--- a/releasenotes/notes/ceph-mon.yaml
+++ b/releasenotes/notes/ceph-mon.yaml
@@ -15,4 +15,5 @@ ceph-mon:
   - 0.1.12 Fix Ceph checkDNS script
   - 0.1.13 Helm 3 - Fix Job labels
   - 0.1.14 Update htk requirements
+  - 0.1.15 Prevent mon-check from removing mons when down temporarily
 ...