#!/bin/bash
#
# Copyright (c) 2020 Intel Corporation, Inc.
# Copyright (c) 2024-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Service script for the Rook-Ceph floating monitor.
#
# Usage: <script> { start | stop | status | restart }
#
# The exit code of the script is the value left in RETVAL by the action.

RETVAL=0

DRBD_MOUNT="/var/lib/ceph/mon-float"
DRBD_MAJ_DEV_NUM="147"
REQUEST_TIMEOUT='5s'

LOG_FILE=/var/log/ceph/floating-mon-rook-service.log

################################################################################
# Log Utilities
################################################################################
source /usr/lib/ceph/ceph_common.sh

# Write a log entry through wlog (provided by ceph_common.sh).
#
# Usage: log [NAME] LEVEL MESSAGE...
#   NAME   - optional tag; present when the first argument is not one of
#            INFO, DEBUG, WARN or ERROR. It is appended to the prefix as
#            " (NAME)".
#   LEVEL  - log level passed through to wlog.
#   MESSAGE- remaining arguments, joined with spaces.
# Always returns 0.
log () {
    local name=""
    local log_level="$1"

    # The first parameter is a NAME unless it is exactly a known log level.
    case "${log_level}" in
        INFO|DEBUG|WARN|ERROR)
            ;;
        *)
            name=" ($1)"
            log_level="$2"
            shift
            ;;
    esac
    shift

    local message="$*"

    # prefix = <pid> <parent_comm>[<ppid>] (<name>)
    local prefix
    prefix="${BASHPID} $(cat "/proc/${PPID}/comm")[${PPID}]${name}"

    # The final line layout (timestamp etc.) is decided by wlog in
    # ceph_common.sh; this function only supplies prefix, level and message.
    wlog "${prefix}" "${log_level}" "${message}"
    return 0
}

################################################################################
# Kubernetes helper
################################################################################

# Run kubectl with the admin kubeconfig and the script-wide request timeout.
# Arguments: the kubectl sub-command and its options.
# Returns:   the kubectl exit status; stdout/stderr are passed through.
function kubectl_admin {
    kubectl --kubeconfig=/etc/kubernetes/admin.conf \
        --request-timeout "${REQUEST_TIMEOUT}" \
        "$@"
}

################################################################################
# Start Action
################################################################################

# Label this node so the floating monitor pod can be scheduled here.
# Always sets RETVAL=0.
function start {
    log INFO "Start: Starting mon-float"

    # Add label for pod scheduling
    # NOTE: Because SM and k8s can be restarted independently the k8s API may
    #       not be available at the time of the start action. Don't fail.
    #       The status action confirms that the label is applied.
    kubectl_admin label node "$(hostname)" \
        ceph-mon-float-placement=enabled

    log INFO "Start: Started mon-float"
    RETVAL=0
}

################################################################################
# Stop Action
################################################################################

# Remove the placement label from this node and delete the floating monitor
# pod running here (if any) so that it gets rescheduled.
# Always sets RETVAL=0.
function stop {
    local node_name
    local pod

    log INFO "Stop: Stopping mon-float"

    node_name=$(hostname)

    # Remove label to prevent pod scheduling
    # NOTE: Because SM and k8s can be restarted independently the k8s API may
    #       not be available at the time of the stop action. Don't fail.
    #       The status action confirms that the label is removed.
    kubectl_admin label node "${node_name}" \
        ceph-mon-float-placement-

    # Get floating monitor pod running on this node. The jsonpath lookup of
    # items[0] fails when there is no such pod; that error is intentionally
    # discarded and leaves ${pod} empty.
    pod=$(
        kubectl_admin get pod -n rook-ceph \
            -l "app=rook-ceph-mon,mon=float" --no-headers=true \
            --field-selector=spec.nodeName="${node_name}" \
            -o jsonpath='{.items[0].metadata.name}' 2>/dev/null
    )

    # Is there a floating monitor here?
    if [[ -n "${pod}" ]]; then
        log INFO "Stop: Deleting floating monitor pod"
        # delete detected pod to force a reschedule
        kubectl_admin delete pod -n rook-ceph "${pod}"
        log INFO "Stop: Deleted floating monitor pod"
    fi

    log INFO "Stop: Stopped floating monitor pod"
    RETVAL=0
}

################################################################################
# Status Action
################################################################################

# Report whether the floating monitor service is "running" on this host by
# setting RETVAL (0: running, 1: not running).
function status {
    # Status is based on the following expected scenario:
    # - active controller: enabled-active enabled-active
    # - standby controller: enabled-standby disabled
    #
    # SM status summary:
    # - RETVAL 0: The service is running
    # - RETVAL 1: The service is not running
    #
    # Status returns following this logic:
    # - Floating is not running: (SM will keep as is)
    #   - active controller: RETVAL 0
    #   - standby controller: RETVAL 1
    # - Floating is installing: (Unknown state)
    #   - active controller: RETVAL 1 (SM will start the service)
    #   - standby controller: RETVAL 1
    # - Floating is installed:
    #   - active controller:
    #     - has label and pod: RETVAL 0
    #     - do not have label or pod: RETVAL 1 (SM will start the service)
    #   - standby controller:
    #     - has label or pod: RETVAL 0 (SM will stop the service)
    #     - do not have label and pod: RETVAL 1
    #

    local -A hr_status=()
    local hr_name hr_ready
    local jsonpath
    local node_name
    local labeled_node floating_pod
    local is_rook_installed=0
    local is_float_installed=0
    local is_ready_to_check=0
    local is_node_labeled=0
    local has_floating_pod=0
    local floating_mon_replicas=0
    local float_pod_status=""
    local is_expected_state=0
    local log_message=""

    # One "<name> <Ready condition status>" line per "hr" resource.
    jsonpath='{range .items[*]}'
    jsonpath+='{.metadata.name}{" "}'
    jsonpath+='{.status.conditions[?(@.type=="Ready")].status}{"\n"}'
    jsonpath+='{end}'

    while read -r hr_name hr_ready; do
        # An empty key is not a valid associative array subscript.
        [[ -n "${hr_name}" ]] || continue
        hr_status["${hr_name}"]="${hr_ready}"
    done < <(
        kubectl_admin get hr -n rook-ceph \
            -o jsonpath="${jsonpath}"
    )

    if [[ "${hr_status[rook-ceph-cluster]:-}" == "True" \
        && "${hr_status[rook-ceph]:-}" == "True" ]]; then
        is_rook_installed=1
    fi

    if [[ "${hr_status[rook-ceph-floating-monitor]:-}" == "True" ]]; then
        is_float_installed=1
    fi

    if (( is_rook_installed && is_float_installed )); then
        is_ready_to_check=1
    fi

    node_name=$(hostname)

    # If the floating is installed, get infos for next checks.
    if (( is_ready_to_check )); then
        # Is this host labeled for the floating monitor? The jsonpath lookup
        # of items[0] fails when the node is not labeled; that error is
        # intentionally discarded and leaves the result empty.
        labeled_node=$(
            kubectl_admin get nodes \
                -l ceph-mon-float-placement --no-headers=true \
                --field-selector=metadata.name="${node_name}" \
                -o jsonpath='{.items[0].metadata.name}' 2>/dev/null
        )
        if [[ -n "${labeled_node}" ]]; then
            is_node_labeled=1
        fi

        # Replica count of the floating monitor deployment. kubectl prints
        # "<none>" for a missing custom column; treat that as 0. Errors (for
        # example the deployment does not exist) are intentionally discarded.
        floating_mon_replicas=$(
            kubectl_admin get deployment -n rook-ceph \
                rook-ceph-mon-float --no-headers=true \
                -o=custom-columns=STATUS:.status.replicas \
                2>/dev/null | sed 's/<none>/0/'
        )
        # Anything that is not a plain non-negative integer counts as 0 so
        # the arithmetic tests below never see an invalid operand.
        floating_mon_replicas=${floating_mon_replicas//[[:space:]]/}
        if [[ ! "${floating_mon_replicas}" =~ ^[0-9]+$ ]]; then
            floating_mon_replicas=0
        fi

        if (( floating_mon_replicas == 1 )); then
            # Get floating monitor pod running on this node
            floating_pod=$(
                kubectl_admin get pod -n rook-ceph \
                    -l "app=rook-ceph-mon,mon=float" \
                    --no-headers=true \
                    --field-selector=spec.nodeName="${node_name}" \
                    -o name
            )
            if [[ -n "${floating_pod}" ]]; then
                has_floating_pod=1
            fi
        fi
    else
        # Get floating monitor pod phase if the floating is not installed
        float_pod_status=$(
            kubectl_admin get pod -n rook-ceph \
                -l "app=rook-ceph-mon,mon=float" \
                --no-headers=true \
                -o=custom-columns=STATUS:.status.phase
        )
    fi

    # The active controller is the one where DRBD_MOUNT is a mountpoint on a
    # device whose major number is DRBD_MAJ_DEV_NUM ("mountpoint -d" prints
    # "<major>:<minor>"; the ':' keeps e.g. 1470 from matching 147).
    if mountpoint -d "${DRBD_MOUNT}" | grep -q "^${DRBD_MAJ_DEV_NUM}:"; then
        # It is the active controller
        if (( is_ready_to_check )); then
            if (( is_node_labeled && \
                ( has_floating_pod || floating_mon_replicas == 0 ) )); then
                # If floating monitor is installed and service in the active
                # host is in expected state, return [0] to keep as is.
                RETVAL=0
            else
                # If floating monitor is installed but active host lacks
                # label or pod, return [1] so SM triggers the start action.
                log_message="Floating monitor is missing on this host."
                RETVAL=1
            fi
        elif [[ "${float_pod_status}" == "Pending" ]]; then
            # If floating monitor is installing, return [1] so SM triggers
            # the start action.
            log_message="Floating monitor is pending on this host."
            RETVAL=1
        else
            # If floating monitor isn't running on active controller,
            # return [0] to prevent service failure and uncontrolled swact.
            RETVAL=0
        fi

        if (( RETVAL == 0 )); then
            is_expected_state=1
        fi
    else
        # It is the standby controller
        if (( is_ready_to_check )); then
            if (( ! is_node_labeled && \
                ( ! has_floating_pod || floating_mon_replicas == 0 ) )); then
                # If floating monitor is installed and service in the standby
                # host is in expected state, return [1] to keep as is.
                RETVAL=1
            else
                # If floating monitor is installed but standby host has
                # label or pod, return [0] so SM triggers the stop action.
                log_message="Floating monitor is present but not "
                log_message+="allowed on this host."
                RETVAL=0
            fi
        else
            # If floating monitor isn't running on standby controller,
            # return [1] to prevent service failure and uncontrolled swact.
            RETVAL=1
        fi

        if (( RETVAL == 1 )); then
            is_expected_state=1
        fi
    fi

    if [[ -n "${log_message}" ]]; then
        log INFO "Status: ${log_message}"
    fi

    # Show floating monitor deployment log only if the rook-ceph is installed,
    # floating monitor is installed, the expected state from this service is
    # right but the floating monitor deployment replica is not the expected
    # value [1]
    if (( is_ready_to_check && \
        is_expected_state && \
        floating_mon_replicas == 0 )); then
        log INFO \
            "Status: Floating monitor deployment has 0 replicas" \
            "or does not exist."
    fi
}

################################################################################
# Main Entry
################################################################################

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        stop
        start
        ;;
    status)
        status
        ;;
    *)
        echo "usage: $0 { start | stop | status | restart }"
        exit 1
        ;;
esac

exit "${RETVAL}"