[ceph-mon] Prevent mon-check from removing mons when down temporarily
A race condition exists that can cause the mon-check pod to delete mons from the monmap that are only down temporarily. This sometimes causes issues with the monmap when those mons come back up. This change adds a check to see if the list of mons in the monmap is larger than expected before removing anything. If not, the monmap is left alone. Change-Id: I43b186bf80741fc178c6806d24c179417d7f2406
This commit is contained in:
parent
f4a74884e5
commit
4d629d3db6
@ -15,6 +15,6 @@ apiVersion: v1
|
|||||||
appVersion: v1.0.0
|
appVersion: v1.0.0
|
||||||
description: OpenStack-Helm Ceph Mon
|
description: OpenStack-Helm Ceph Mon
|
||||||
name: ceph-mon
|
name: ceph-mon
|
||||||
version: 0.1.14
|
version: 0.1.15
|
||||||
home: https://github.com/ceph/ceph
|
home: https://github.com/ceph/ceph
|
||||||
...
|
...
|
||||||
|
@ -24,13 +24,42 @@ function check_mon_msgr2 {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Print the number of entries in the JSON document returned by
# `ceph mon count-metadata hostname`.
# Arguments: none
# Outputs:   the entry count on stdout (whatever `jq '. | length'` prints)
# Returns:   exit status of the pipeline (jq's, unless pipefail is set)
#######################################
function get_mon_count {
  ceph mon count-metadata hostname \
    | jq '. | length'
}
|
||||||
|
|
||||||
|
#######################################
# Compare the address recorded in the monmap for each mon with the address
# built from the ceph-mon-discovery endpoints object (the IP of the endpoint
# whose nodeName equals the mon name, plus the "mon" and "mon-msgr2" ports),
# and call `ceph mon set-addrs` for every mon whose address differs.
#
# Mons that have no matching endpoint address are left untouched, so a mon
# that is absent from the endpoints list never gets a malformed address.
#
# Arguments: none
# Outputs:   one line on stdout per mon that is rewritten or skipped;
#            diagnostics on stderr when a lookup fails
# Returns:   0 when a lookup fails (best effort, so a caller running under
#            `set -e` is not terminated); otherwise the status of the last
#            command executed
#######################################
function check_mon_addrs {
  local mon_dump mon_endpoints v1_port v2_port
  local mon mon_endpoint mon_ip desired_endpoint
  local -a mon_hostnames=()

  # Declaration and assignment are separate so a failing command is noticed
  # instead of being masked by the exit status of `local`.
  if ! mon_dump=$(ceph mon dump); then
    echo "unable to read the monmap, skipping mon endpoint check" >&2
    return 0
  fi
  if ! mon_endpoints=$(kubectl get endpoints ceph-mon-discovery -n ceph -o json); then
    echo "unable to read ceph-mon-discovery endpoints, skipping mon endpoint check" >&2
    return 0
  fi

  v1_port=$(jq '.subsets[0].ports[] | select(.name == "mon") | .port' <<< "${mon_endpoints}") || v1_port=""
  v2_port=$(jq '.subsets[0].ports[] | select(.name == "mon-msgr2") | .port' <<< "${mon_endpoints}") || v2_port=""
  if [[ -z "${v1_port}" || -z "${v2_port}" ]]; then
    echo "mon ports not found in ceph-mon-discovery endpoints, skipping mon endpoint check" >&2
    return 0
  fi

  # The third field of a monmap entry is "mon.<name>"; strip the prefix.
  mapfile -t mon_hostnames < <(awk '$3 ~ /^mon\./ {print substr($3, 5)}' <<< "${mon_dump}")

  for mon in "${mon_hostnames[@]}"; do
    [[ -n "${mon}" ]] || continue

    # Exact match on the name field: a regex on the whole line would let
    # "host1" also select the entry for "host10".
    mon_endpoint=$(awk -v name="mon.${mon}" '$3 == name {print $2}' <<< "${mon_dump}")
    mon_ip=$(jq -r --arg node "${mon}" \
      '.subsets[0].addresses[] | select(.nodeName == $node) | .ip' <<< "${mon_endpoints}") || mon_ip=""

    # Skip this mon if it does not appear in the list of kubernetes endpoints.
    if [[ -z "${mon_ip}" ]]; then
      echo "no endpoint address found for ${mon}, leaving its monmap entry alone"
      continue
    fi

    printf -v desired_endpoint '[v1:%s:%s/0,v2:%s:%s/0]' \
      "${mon_ip}" "${v1_port}" "${mon_ip}" "${v2_port}"

    if [[ "${mon_endpoint}" != "${desired_endpoint}" ]]; then
      echo "endpoint for ${mon} is ${mon_endpoint}, setting it to ${desired_endpoint}"
      ceph mon set-addrs "${mon}" "${desired_endpoint}"
    fi
  done
}
|
||||||
|
|
||||||
function watch_mon_health {
|
function watch_mon_health {
|
||||||
|
previous_mon_count=$(get_mon_count)
|
||||||
while [ true ]; do
|
while [ true ]; do
|
||||||
|
mon_count=$(get_mon_count)
|
||||||
|
if [[ ${mon_count} -ne ${previous_mon_count} ]]; then
|
||||||
echo "checking for zombie mons"
|
echo "checking for zombie mons"
|
||||||
python3 /tmp/moncheck-reap-zombies.py || true
|
python3 /tmp/moncheck-reap-zombies.py || true
|
||||||
|
fi
|
||||||
|
previous_mon_count=${mon_count}
|
||||||
echo "checking for ceph-mon msgr v2"
|
echo "checking for ceph-mon msgr v2"
|
||||||
check_mon_msgr2
|
check_mon_msgr2
|
||||||
|
echo "checking mon endpoints in monmap"
|
||||||
|
check_mon_addrs
|
||||||
echo "sleep 30 sec"
|
echo "sleep 30 sec"
|
||||||
sleep 30
|
sleep 30
|
||||||
done
|
done
|
||||||
|
@ -15,4 +15,5 @@ ceph-mon:
|
|||||||
- 0.1.12 Fix Ceph checkDNS script
|
- 0.1.12 Fix Ceph checkDNS script
|
||||||
- 0.1.13 Helm 3 - Fix Job labels
|
- 0.1.13 Helm 3 - Fix Job labels
|
||||||
- 0.1.14 Update htk requirements
|
- 0.1.14 Update htk requirements
|
||||||
|
- 0.1.15 Prevent mon-check from removing mons when down temporarily
|
||||||
...
|
...
|
||||||
|
Loading…
x
Reference in New Issue
Block a user