
The rook-mon-exit service was causing uncontrolled swacts in some scenarios, such as: BnR, upgrade/downgrade of the rook-ceph app, and an apply action with a swact during the action. To fix it, the rook-mon-exit script was improved to guarantee the pod check will occur only when the rook-ceph-mon-float deployment exists and its replica count is 1, avoiding an uncontrolled swact when the floating monitor was turned off for some action. Additionally, a new check was implemented to ensure rook-ceph is running, verifying that the rook-ceph and rook-ceph-cluster Helm releases are ready. Test Plan: - PASS: Apply the rook-ceph App - PASS: Swact among controllers and check if the floating monitor will be scheduled correctly on the active controller - PASS: Reboot the active controller and check if the floating monitor will be scheduled correctly on the new active controller - PASS: Reboot all controllers and check if the floating monitor will be scheduled correctly on the active controller - PASS: Turn off all controllers, turn on the controllers in reverse order, and check if the floating monitor will be scheduled correctly on the active controller - PASS: Perform Backup and Restore, and check that uncontrolled swacts were not occurring - PASS: Upgrade the app from an old app version to a new app version - PASS: Downgrade the app from a new app version to an old app version - PASS: Apply the app and attempt to provoke an uncontrolled swact during the apply app action Closes-Bug: 2115438 Change-Id: I9cc2e5d55f389b053a6a29fdbe3ce2337d8871da Signed-off-by: Gustavo Ornaghi Antunes <gustavo.ornaghiantunes@windriver.com>
303 lines
11 KiB
Bash
303 lines
11 KiB
Bash
#!/bin/bash
|
|
#
|
|
# Copyright (c) 2020 Intel Corporation, Inc.
|
|
# Copyright (c) 2024-2025 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
|
|
# Exit status reported back to SM by the action handlers below.
# Mutable on purpose: every action (start/stop/status) overwrites it.
RETVAL=0
# DRBD-backed mount that only the active controller holds; the status
# action uses it to decide whether this host is active or standby.
DRBD_MOUNT="/var/lib/ceph/mon-float"
# Major device number of DRBD block devices; used to verify the mount
# above really is DRBD-backed.
DRBD_MAJ_DEV_NUM="147"
# Timeout applied to every kubectl call so SM actions never hang when
# the k8s API server is unavailable.
REQUEST_TIMEOUT='5s'
# NOTE(review): LOG_FILE is not referenced anywhere in this script;
# presumably it is consumed by the logging helpers sourced from
# /usr/lib/ceph/ceph_common.sh -- confirm.
LOG_FILE=/var/log/ceph/floating-mon-rook-service.log
|
################################################################################
# Log Utilities
################################################################################
source /usr/lib/ceph/ceph_common.sh

log () {
    # Write a log line through wlog (provided by ceph_common.sh).
    #
    # Usage:
    #   log <LEVEL> <message...>          e.g. log INFO "Starting"
    #   log <name> <LEVEL> <message...>   e.g. log start INFO "Starting"
    #
    # LEVEL is one of INFO, DEBUG, WARN or ERROR. When the first argument
    # is not a level it is treated as an optional caller name that is
    # included in the log prefix.
    local name=""
    local log_level="$1"
    # Exact word match against the known levels. The previous
    # 'grep -q -v "${log_level}" <<< "INFO DEBUG WARN ERROR"' check
    # matched substrings and interpreted the argument as a regex, so a
    # caller name such as "FO" (substring of INFO) or one containing
    # regex metacharacters was misclassified as a log level.
    if [[ " INFO DEBUG WARN ERROR " != *" ${log_level} "* ]]; then
        name=" ($1)"
        log_level="$2"
        shift
    fi

    shift

    local message="$@"
    # prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional>
    local prefix="${BASHPID} $(cat /proc/${PPID}/comm)[${PPID}]${name}"
    # yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/rook-mon-exit <prefix> <log_level>: <message>
    wlog "${prefix}" "${log_level}" "${message}"
    return 0
}
|
################################################################################
# Start Action
################################################################################
function start {
    log INFO "Start: Starting mon-float"

    # Label this node so the floating monitor pod may be scheduled here.
    # NOTE: SM and k8s restart independently, so the k8s API may be
    # unavailable when this action runs. Do not fail on that; the status
    # action re-verifies that the label was applied.
    kubectl --kubeconfig=/etc/kubernetes/admin.conf \
        --request-timeout "${REQUEST_TIMEOUT}" \
        label node "$(hostname)" \
        ceph-mon-float-placement=enabled

    log INFO "Start: Started mon-float"
    RETVAL=0
}
|
|
################################################################################
# Stop Action
################################################################################
function stop {
    local pod

    log INFO "Stop: Stopping mon-float"

    # Drop the placement label so no new floating monitor pod can be
    # scheduled on this node.
    # NOTE: SM and k8s restart independently, so the k8s API may be
    # unavailable when this action runs. Do not fail on that; the status
    # action re-verifies the label state.
    kubectl --kubeconfig=/etc/kubernetes/admin.conf \
        --request-timeout "${REQUEST_TIMEOUT}" \
        label node "$(hostname)" \
        ceph-mon-float-placement-

    # Find a floating monitor pod currently scheduled on this node, if any.
    pod=$(
        kubectl --kubeconfig=/etc/kubernetes/admin.conf \
            --request-timeout "${REQUEST_TIMEOUT}" \
            get pod -n rook-ceph \
            -l "app=rook-ceph-mon,mon=float" --no-headers=true \
            --field-selector=spec.nodeName="$(hostname)" \
            -o jsonpath='{.items[0].metadata.name}' 2>/dev/null
    )

    if [ -n "${pod}" ]; then
        log INFO "Stop: Deleting floating monitor pod"
        # Delete the pod so the scheduler places it elsewhere (this node
        # is no longer labeled for it).
        kubectl --kubeconfig=/etc/kubernetes/admin.conf \
            --request-timeout "${REQUEST_TIMEOUT}" \
            delete pod -n rook-ceph "${pod}"
        log INFO "Stop: Deleted floating monitor pod"
    fi

    log INFO "Stop: Stopped floating monitor pod"
    RETVAL=0
}
|
|
################################################################################
# Status Action
################################################################################
function status {
    # Report the floating monitor service state for this host to SM.
    #
    # Status is based on the following expected scenario:
    # - active controller:  enabled-active enabled-active
    # - standby controller: enabled-standby disabled
    #
    # SM status summary:
    # - RETVAL 0: The service is running
    # - RETVAL 1: The service is not running
    #
    # Status returns following this logic:
    # - Floating is not running: (SM will keep as is)
    #   - active controller: RETVAL 0
    #   - standby controller: RETVAL 1
    # - Floating is installing: (Unknown state)
    #   - active controller: RETVAL 1 (SM will start the service)
    #   - standby controller: RETVAL 1
    # - Floating is installed:
    #   - active controller:
    #     - has label and pod: RETVAL 0
    #     - does not have label or pod: RETVAL 1 (SM will start the service)
    #   - standby controller:
    #     - has label or pod: RETVAL 0 (SM will stop the service)
    #     - does not have label and pod: RETVAL 1
    #

    # Map of Helm release name -> Ready condition status ("True"/"False")
    # for every HelmRelease in the rook-ceph namespace.
    declare -A HR_STATUS
    JSONPATH='{range .items[*]}'
    JSONPATH+='{.metadata.name}{" "}'
    JSONPATH+='{.status.conditions[?(@.type=="Ready")].status}{"\n"}'
    JSONPATH+='{end}'
    # NOTE(review): the loop variable 'nome' ("name") is kept as-is to
    # avoid a code change in this documentation pass.
    while read -r nome status; do
        HR_STATUS["$nome"]="$status"
    done < <(
        kubectl --kubeconfig=/etc/kubernetes/admin.conf \
            --request-timeout "${REQUEST_TIMEOUT}" \
            get hr -n rook-ceph \
            -o jsonpath="${JSONPATH}"
    )

    # rook-ceph is considered installed only when BOTH core Helm releases
    # report Ready=True.
    IS_ROOK_INSTALLED=$(
        [[ "${HR_STATUS['rook-ceph-cluster']}" == "True" &&
            "${HR_STATUS['rook-ceph']}" == "True" ]] \
            && echo 1 || echo 0
    )
    IS_FLOAT_INSTALLED=$(
        [[ "${HR_STATUS['rook-ceph-floating-monitor']}" == "True" ]] \
            && echo 1 || echo 0
    )
    # Deeper label/pod checks only make sense when rook-ceph and the
    # floating monitor release are both fully deployed.
    IS_READY_TO_CHECK=$(
        [[ "$IS_ROOK_INSTALLED" == "1" && "$IS_FLOAT_INSTALLED" == "1" ]] \
            && echo 1 || echo 0
    )

    # If the floating monitor is installed, gather info for the next checks.
    if (( IS_READY_TO_CHECK )); then
        # Is this host labeled for the floating monitor?
        IS_NODE_LABELED=$(
            kubectl --kubeconfig=/etc/kubernetes/admin.conf \
                --request-timeout ${REQUEST_TIMEOUT} \
                get nodes \
                -l ceph-mon-float-placement --no-headers=true \
                --field-selector=metadata.name="$(hostname)" \
                -o jsonpath='{.items[0].metadata.name}' 2>/dev/null
        )
        # Normalize to 1/0.
        IS_NODE_LABELED=$(
            [[ -n "${IS_NODE_LABELED}" ]] \
                && echo 1 || echo 0
        )

        # Replica count of the floating monitor deployment. '<none>'
        # (deployment scaled down) is rewritten to 0 and a missing
        # deployment also normalizes to 0, so the pod check below is
        # skipped while the monitor is intentionally off -- this is what
        # prevents the uncontrolled swact.
        FLOATING_MON_REPLICAS=$(
            kubectl --kubeconfig=/etc/kubernetes/admin.conf \
                --request-timeout ${REQUEST_TIMEOUT} \
                get deployment -n rook-ceph \
                rook-ceph-mon-float --no-headers=true \
                -o=custom-columns=STATUS:.status.replicas \
                2>/dev/null | sed 's/<none>/0/'
        )
        FLOATING_MON_REPLICAS=${FLOATING_MON_REPLICAS:-0}

        if (( FLOATING_MON_REPLICAS == 1 )); then
            # Get floating monitor pod running on this node
            HAS_FLOATING_POD=$(
                kubectl --kubeconfig=/etc/kubernetes/admin.conf \
                    --request-timeout ${REQUEST_TIMEOUT} \
                    get pod -n rook-ceph \
                    -l app="rook-ceph-mon,mon=float" \
                    --no-headers=true \
                    --field-selector=spec.nodeName="$(hostname)" \
                    -o name
            )
        fi
        # Normalize to 1/0 (unset when replicas != 1, which becomes 0).
        HAS_FLOATING_POD=$(
            [[ -n "${HAS_FLOATING_POD}" ]] \
                && echo 1 || echo 0
        )
    else
        # Floating monitor not fully installed: fetch the pod phase to
        # detect an in-progress installation (Pending) below.
        # NOTE(review): if more than one pod matched, this would be
        # multi-line and never equal "Pending" -- presumably at most one
        # floating monitor pod exists; confirm.
        FLOAT_POD_STATUS=$(
            kubectl --kubeconfig=/etc/kubernetes/admin.conf \
                --request-timeout ${REQUEST_TIMEOUT} \
                get pod -n rook-ceph \
                -l app="rook-ceph-mon,mon=float" \
                --no-headers=true \
                -o=custom-columns=STATUS:.status.phase
        )
    fi

    IS_EXPECTED_STATE=0
    # The active controller is the host where DRBD_MOUNT is mounted on a
    # DRBD device (major number DRBD_MAJ_DEV_NUM).
    mountpoint -d ${DRBD_MOUNT} | grep -q ^${DRBD_MAJ_DEV_NUM}
    # It is the active controller
    if [ $? -eq 0 ]; then
        # rook-ceph and the floating monitor release are installed.
        if (( IS_READY_TO_CHECK )); then
            if (( IS_NODE_LABELED && \
                ( HAS_FLOATING_POD || FLOATING_MON_REPLICAS == 0 ) )); then
                # The active host has the label and either has the pod or
                # the deployment is intentionally scaled to 0: expected
                # state, return [0] to keep as is.
                RETVAL=0
            else
                # If floating monitor is installed but active host lacks
                # label or pod, return [1] so SM triggers the start action.
                LOG_MESSAGE="Floating monitor is missing on this host."
                RETVAL=1
            fi
        elif [[ "${FLOAT_POD_STATUS}" == "Pending" ]]; then
            # If floating monitor is installing, return [1] so SM triggers
            # the start action.
            LOG_MESSAGE="Floating monitor is pending on this host."
            RETVAL=1
        else
            # If floating monitor isn't running on the active controller,
            # return [0] to prevent service failure and uncontrolled swact.
            RETVAL=0
        fi
        [ "${RETVAL}" -eq 0 ] && IS_EXPECTED_STATE=1
    # It is the standby controller
    else
        # rook-ceph and the floating monitor release are installed.
        if (( IS_READY_TO_CHECK )); then
            if (( ! IS_NODE_LABELED && \
                ( ! HAS_FLOATING_POD || FLOATING_MON_REPLICAS == 0 ) )); then
                # If floating monitor is installed and service on the
                # standby host is in expected state, return [1] to keep
                # as is.
                RETVAL=1
            else
                # If floating monitor is installed but the standby host
                # has the label or pod, return [0] so SM triggers the
                # stop action.
                LOG_MESSAGE="Floating monitor is present but not "
                LOG_MESSAGE+="allowed on this host."
                RETVAL=0
            fi
        else
            # If floating monitor isn't running on the standby controller,
            # return [1] to prevent service failure and uncontrolled swact.
            RETVAL=1
        fi
        [ "${RETVAL}" -eq 1 ] && IS_EXPECTED_STATE=1
    fi

    if [ -n "${LOG_MESSAGE}" ]; then
        log INFO "Status: ${LOG_MESSAGE}"
    fi

    # Show the floating monitor deployment log only if rook-ceph and the
    # floating monitor are installed and this service is in its expected
    # state, but the deployment replica count is not the expected value [1].
    if (( IS_READY_TO_CHECK && \
        IS_EXPECTED_STATE && \
        FLOATING_MON_REPLICAS == 0 )); then
        log INFO \
            "Status: Floating monitor deployment has 0 replicas" \
            "or does not exist."
    fi
}
|
|
|
################################################################################
# Main Entry
################################################################################

# Dispatch the SM-requested action given as the first argument.
case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        # Restart is simply a stop followed by a start.
        stop
        start
        ;;
    status)
        status
        ;;
    *)
        echo "usage: $0 { start | stop | status | restart }"
        exit 1
        ;;
esac

# Propagate the result of the action handler back to SM.
exit ${RETVAL}
|