Improve rook-mon-exit service

The rook-mon-exit service was causing uncontrolled swacts in some
scenarios, such as Backup and Restore (BnR), upgrade/downgrade of the
rook-ceph app, and an apply action with a swact during the action.

To fix this, the rook-mon-exit script was improved to guarantee that
the pod check only runs when the rook-ceph-mon-float deployment exists
and its replica count is 1, avoiding an uncontrolled swact when the
floating monitor was intentionally taken down for some action.
Additionally, a new check was added to ensure rook-ceph is running, by
verifying that the rook-ceph and rook-ceph-cluster helm releases are
Ready.

Test Plan:
- PASS: Apply the rook-ceph app
- PASS: Swact between controllers and check that the floating monitor
  is scheduled correctly on the active controller
- PASS: Reboot the active controller and check that the floating
  monitor is scheduled correctly on the new active controller
- PASS: Reboot all controllers and check that the floating monitor is
  scheduled correctly on the active controller
- PASS: Turn off all controllers, turn them on in reverse order, and
  check that the floating monitor is scheduled correctly on the active
  controller
- PASS: Perform Backup and Restore and check that no uncontrolled
  swacts occur
- PASS: Upgrade the app from an old app version to a new app version
- PASS: Downgrade the app from a new app version to an old app version
- PASS: Apply the app and try to provoke an uncontrolled swact during
  the apply action

Closes-Bug: 2115438
Change-Id: I9cc2e5d55f389b053a6a29fdbe3ce2337d8871da
Signed-off-by: Gustavo Ornaghi Antunes <gustavo.ornaghiantunes@windriver.com>
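For reference, the gating described above can be sketched roughly as follows. This is a minimal illustration only: the namespace, kubeconfig path, helm release names (rook-ceph, rook-ceph-cluster), the rook-ceph-mon-float deployment, and the `hr` short name are taken from the change itself, while the shell variable names here are illustrative; the actual implementation is the status() logic in the diff below.

    # Sketch: only run the floating-monitor pod check when both helm
    # releases report Ready=True and the rook-ceph-mon-float deployment
    # has exactly one replica.
    KUBECONFIG_ARG="--kubeconfig=/etc/kubernetes/admin.conf"

    rook_ready=$(kubectl ${KUBECONFIG_ARG} get hr rook-ceph -n rook-ceph \
        -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null)
    cluster_ready=$(kubectl ${KUBECONFIG_ARG} get hr rook-ceph-cluster -n rook-ceph \
        -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null)
    replicas=$(kubectl ${KUBECONFIG_ARG} get deployment rook-ceph-mon-float \
        -n rook-ceph -o jsonpath='{.status.replicas}' 2>/dev/null)

    if [[ "${rook_ready}" == "True" && "${cluster_ready}" == "True" && \
          "${replicas:-0}" -eq 1 ]]; then
        # Safe to check whether the floating monitor pod is running on this
        # host; otherwise skip the check so SM does not trigger a swact.
        kubectl ${KUBECONFIG_ARG} get pod -n rook-ceph \
            -l "app=rook-ceph-mon,mon=float" \
            --field-selector=spec.nodeName="$(hostname)" -o name
    fi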
@@ -21,7 +21,7 @@ log () {
     local name=""
     local log_level="$1"
     # Checking if the first parameter is not a log level
-    if grep -q -v ${log_level} <<< "INFO DEBUG WARN ERROR"; then
+    if grep -q -v "${log_level}" <<< "INFO DEBUG WARN ERROR"; then
         name=" ($1)";
         log_level="$2"
         shift
@@ -41,7 +41,7 @@ log () {
 # Start Action
 ################################################################################
 function start {
-    log INFO "Starting mon-float"
+    log INFO "Start: Starting mon-float"
 
     # Add label for pod scheduling
     # NOTE: Because SM and k8s can be restarted independently the k8s API may not
@@ -49,10 +49,10 @@ function start {
     # applied in the status check
     kubectl --kubeconfig=/etc/kubernetes/admin.conf \
         --request-timeout ${REQUEST_TIMEOUT} \
-        label node $(hostname) \
+        label node "$(hostname)" \
         ceph-mon-float-placement=enabled
 
-    log INFO "Started mon-float"
+    log INFO "Start: Started mon-float"
     RETVAL=0
 }
 
@@ -60,35 +60,36 @@ function start {
 # Stop Action
 ################################################################################
 function stop {
-    log INFO "Stopping mon-float"
+    log INFO "Stop: Stopping mon-float"
     # Add remove label to prevent pod scheduling
     # NOTE: Because SM and k8s can be restarted independently the k8s API may not
     # be available at the time of the start action. Don't fail. Confirm label is
     # applied in the status check
     kubectl --kubeconfig=/etc/kubernetes/admin.conf \
         --request-timeout ${REQUEST_TIMEOUT} \
-        label node $(hostname) \
+        label node "$(hostname)" \
         ceph-mon-float-placement-
 
     # Get floating monitor pod running on this node
-    POD=$(kubectl --kubeconfig=/etc/kubernetes/admin.conf \
-        --request-timeout ${REQUEST_TIMEOUT} \
-        get pod -n rook-ceph \
-        -l app="rook-ceph-mon,mon=float" --no-headers=true \
-        --field-selector=spec.nodeName=$(hostname) \
-        -o=custom-columns=NAME:.metadata.name)
+    POD=$(
+        kubectl --kubeconfig=/etc/kubernetes/admin.conf \
+            --request-timeout ${REQUEST_TIMEOUT} \
+            get pod -n rook-ceph \
+            -l "app=rook-ceph-mon,mon=float" --no-headers=true \
+            --field-selector=spec.nodeName="$(hostname)" \
+            -o jsonpath='{.items[0].metadata.name}' 2>/dev/null
+    )
 
     # Is there a floating monitor here?
-    if [ ! -z "${POD}" ]; then
-        log INFO "Deleting floating monitor pod"
+    if [ -n "${POD}" ]; then
+        log INFO "Stop: Deleting floating monitor pod"
         # delete detected pod to force a reschedule
         kubectl --kubeconfig=/etc/kubernetes/admin.conf \
             --request-timeout ${REQUEST_TIMEOUT} \
-            delete pod -n rook-ceph \
-                ${POD}
-        log INFO "Deleted floating monitor pod"
+            delete pod -n rook-ceph "${POD}"
+        log INFO "Stop: Deleted floating monitor pod"
     fi
-    log INFO "Stopped floating monitor pod"
+    log INFO "Stop: Stopped floating monitor pod"
     RETVAL=0
 }
 
@@ -120,89 +121,130 @@ function status {
# - do not have label and pod: RETVAL 1
#

FLOAT_IS_INSTALLED=$(
declare -A HR_STATUS
JSONPATH='{range .items[*]}'
JSONPATH+='{.metadata.name}{" "}'
JSONPATH+='{.status.conditions[?(@.type=="Ready")].status}{"\n"}'
JSONPATH+='{end}'
while read -r nome status; do
HR_STATUS["$nome"]="$status"
done < <(
kubectl --kubeconfig=/etc/kubernetes/admin.conf \
--request-timeout ${REQUEST_TIMEOUT} \
get hr rook-ceph-floating-monitor -n rook-ceph \
-o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
--request-timeout "${REQUEST_TIMEOUT}" \
get hr -n rook-ceph \
-o jsonpath="${JSONPATH}"
)

# If the floating is not installed, skip other checks.
if [[ "${FLOAT_IS_INSTALLED}" == "True" ]]; then
IS_ROOK_INSTALLED=$(
[[ "${HR_STATUS['rook-ceph-cluster']}" == "True" &&
"${HR_STATUS['rook-ceph']}" == "True" ]] \
&& echo 1 || echo 0
)
IS_FLOAT_INSTALLED=$(
[[ "${HR_STATUS['rook-ceph-floating-monitor']}" == "True" ]] \
&& echo 1 || echo 0
)
IS_READY_TO_CHECK=$(
[[ "$IS_ROOK_INSTALLED" == "1" && "$IS_FLOAT_INSTALLED" == "1" ]] \
&& echo 1 || echo 0
)

# If the floating is installed, get infos for next checks.
if (( IS_READY_TO_CHECK )); then
# Is this host labeled for the floating monitor
NODE_LABELED=$(
IS_NODE_LABELED=$(
kubectl --kubeconfig=/etc/kubernetes/admin.conf \
--request-timeout ${REQUEST_TIMEOUT} \
get nodes \
-l ceph-mon-float-placement --no-headers=true \
--field-selector=metadata.name=$(hostname) \
-o=custom-columns=NAME:.metadata.name
--field-selector=metadata.name="$(hostname)" \
-o jsonpath='{.items[0].metadata.name}' 2>/dev/null
)
IS_NODE_LABELED=$(
[[ -n "${IS_NODE_LABELED}" ]] \
&& echo 1 || echo 0
)

# Get floating monitor pod running on this node
NODE_WITH_FLOAT=$(
FLOATING_MON_REPLICAS=$(
kubectl --kubeconfig=/etc/kubernetes/admin.conf \
--request-timeout ${REQUEST_TIMEOUT} \
get pod -n rook-ceph \
-l app="rook-ceph-mon,mon=float" \
--no-headers=true \
--field-selector=spec.nodeName=$(hostname) \
-o=custom-columns=NAME:.metadata.name
get deployment -n rook-ceph \
rook-ceph-mon-float --no-headers=true \
-o=custom-columns=STATUS:.status.replicas \
2>/dev/null | sed 's/<none>/0/'
)
else
# Check floating pod status only when helm release is Unknown -
# during first app apply. Later, it returns True or False.
if [[ "${FLOAT_IS_INSTALLED}" == "Unknown" ]]; then
# Get floating monitor pod
FLOAT_POD_STATUS=$(
FLOATING_MON_REPLICAS=${FLOATING_MON_REPLICAS:-0}

if (( FLOATING_MON_REPLICAS == 1 )); then
# Get floating monitor pod running on this node
HAS_FLOATING_POD=$(
kubectl --kubeconfig=/etc/kubernetes/admin.conf \
--request-timeout ${REQUEST_TIMEOUT} \
get pod -n rook-ceph \
-l app="rook-ceph-mon,mon=float" \
--no-headers=true \
-o=custom-columns=STATUS:.status.phase
--field-selector=spec.nodeName="$(hostname)" \
-o name
)
fi
HAS_FLOATING_POD=$(
[[ -n "${HAS_FLOATING_POD}" ]] \
&& echo 1 || echo 0
)
else
# Get floating monitor pod if the floating is not installed
FLOAT_POD_STATUS=$(
kubectl --kubeconfig=/etc/kubernetes/admin.conf \
--request-timeout ${REQUEST_TIMEOUT} \
get pod -n rook-ceph \
-l app="rook-ceph-mon,mon=float" \
--no-headers=true \
-o=custom-columns=STATUS:.status.phase
)
fi

IS_EXPECTED_STATE=0
mountpoint -d ${DRBD_MOUNT} | grep -q ^${DRBD_MAJ_DEV_NUM}
# It is the active controller
if [ $? -eq 0 ]; then
# There is not mon-float.
if [[ "${FLOAT_IS_INSTALLED}" == "True" ]]; then
if [[ -n "${NODE_LABELED}" && -n "${NODE_WITH_FLOAT}" ]]; then
# Rook-ceph is installed and there is not mon-float.
if (( IS_READY_TO_CHECK )); then
if (( IS_NODE_LABELED && \
( HAS_FLOATING_POD || FLOATING_MON_REPLICAS == 0 ) )); then
# If floating monitor is installed and service in the active
# host is in expected state, return [0] to keep as is.
RETVAL=0
else
# If floating monitor is installed but active host lacks
# label or pod, return [1] to SM trigger the start action.
log INFO "Status: Floating monitor is missing on this host."
LOG_MESSAGE="Floating monitor is missing on this host."
RETVAL=1
fi
elif [[ "${FLOAT_POD_STATUS}" == "Pending" ]]; then
# If floating monitor is installing, return [1] to SM trigger
# the start action.
log INFO "Status: Floating monitor is pending on this host."
LOG_MESSAGE="Floating monitor is pending on this host."
RETVAL=1
else
# If floating monitor isn't running on active controller,
# return [0] to prevent service failure and uncontrolled swact.
RETVAL=0
fi
[ "${RETVAL}" -eq 0 ] && IS_EXPECTED_STATE=1
# It is the standby controller
else
# There is mon-float.
if [[ "${FLOAT_IS_INSTALLED}" == "True" ]]; then
if [[ -z "${NODE_LABELED}" && -z "${NODE_WITH_FLOAT}" ]]; then
# Rook-ceph is installed and there is mon-float.
if (( IS_READY_TO_CHECK )); then
if (( ! IS_NODE_LABELED && \
( ! HAS_FLOATING_POD || FLOATING_MON_REPLICAS == 0 ) )); then
# If floating monitor is installed and service in the standby
# host is in expected state, return [1] to keep as is.
RETVAL=1
else
# If floating monitor is installed but standby host has
# label or pod, return [1] to SM trigger the start action.
log INFO "Status: Floating monitor is present but not" \
"allowed on this host."
LOG_MESSAGE="Floating monitor is present but not "
LOG_MESSAGE+="allowed on this host."
RETVAL=0
fi
else
@@ -210,6 +252,22 @@ function status {
# return [1] to prevent service failure and uncontrolled swact.
RETVAL=1
fi
[ "${RETVAL}" -eq 1 ] && IS_EXPECTED_STATE=1
fi

if [ -n "${LOG_MESSAGE}" ]; then
log INFO "Status: ${LOG_MESSAGE}"
fi

# Show floating monitor deployment log only if the rook-ceph is installed,
# floating monitor is installed, the expected state from this service is right
# but the floating monitor deployment replica is not the expected value [1]
if (( IS_READY_TO_CHECK && \
IS_EXPECTED_STATE && \
FLOATING_MON_REPLICAS == 0 )); then
log INFO \
"Status: Floating monitor deployment has 0 replicas" \
"or does not exist."
fi
}