diff --git a/meta-stx-virt/recipes-extended/ceph/ceph_13.2.2.bb b/meta-stx-virt/recipes-extended/ceph/ceph_13.2.2.bb index b908707..b5c9287 100644 --- a/meta-stx-virt/recipes-extended/ceph/ceph_13.2.2.bb +++ b/meta-stx-virt/recipes-extended/ceph/ceph_13.2.2.bb @@ -113,21 +113,13 @@ SRC_URI = "\ file://0002-zstd-fix-error-for-cross-compile.patch \ file://0003-ceph-add-pybind-support-in-OE.patch \ file://0004-ceph-detect-init-correct-the-installation-for-OE.patch \ - \ - file://ceph-init-wrapper.sh \ - file://ceph-manage-journal.py \ - file://ceph-preshutdown.sh \ - file://ceph-radosgw.service \ - file://ceph.conf \ - file://ceph.conf.pmon \ - file://ceph.service \ - file://ceph.sh \ - file://mgr-restful-plugin.py \ - file://mgr-restful-plugin.service \ - file://starlingx-docker-override.conf \ " inherit cmake pythonnative python-dir systemd +inherit stx-metadata + +STX_REPO = "integ" +STX_SUBPATH = "ceph/ceph/files" DISTRO_FEATURES_BACKFILL_CONSIDERED_remove = "sysvinit" @@ -196,31 +188,31 @@ do_install_append () { install -m 0755 ${D}${libexecdir}/ceph/ceph_common.sh ${D}${libdir}/ceph install -d ${D}${sysconfdir}/ceph - install -m 0644 ${WORKDIR}/ceph.conf ${D}${sysconfdir}/ceph/ - install -m 0644 ${WORKDIR}/ceph-radosgw.service ${D}${systemd_system_unitdir}/ceph-radosgw@.service - install -m 0644 ${WORKDIR}/ceph.service ${D}${systemd_system_unitdir} - install -m 0644 ${WORKDIR}/mgr-restful-plugin.service ${D}${systemd_system_unitdir} + install -m 0644 ${STX_METADATA_PATH}/ceph.conf ${D}${sysconfdir}/ceph/ + install -m 0644 ${STX_METADATA_PATH}/ceph-radosgw.service ${D}${systemd_system_unitdir}/ceph-radosgw@.service + install -m 0644 ${STX_METADATA_PATH}/ceph.service ${D}${systemd_system_unitdir} + install -m 0644 ${STX_METADATA_PATH}/mgr-restful-plugin.service ${D}${systemd_system_unitdir} - install -m 0700 ${WORKDIR}/ceph-manage-journal.py ${D}${sbindir}/ceph-manage-journal - install -Dm 0750 ${WORKDIR}/mgr-restful-plugin.py ${D}${sysconfdir}/rc.d/init.d/mgr-restful-plugin - install -Dm 0750 ${WORKDIR}/mgr-restful-plugin.py ${D}${sysconfdir}/init.d/mgr-restful-plugin - install -m 0750 ${WORKDIR}/ceph.conf.pmon ${D}${sysconfdir}/ceph/ + install -m 0700 ${STX_METADATA_PATH}/ceph-manage-journal.py ${D}${sbindir}/ceph-manage-journal + install -Dm 0750 ${STX_METADATA_PATH}/mgr-restful-plugin.py ${D}${sysconfdir}/rc.d/init.d/mgr-restful-plugin + install -Dm 0750 ${STX_METADATA_PATH}/mgr-restful-plugin.py ${D}${sysconfdir}/init.d/mgr-restful-plugin + install -m 0750 ${STX_METADATA_PATH}/ceph.conf.pmon ${D}${sysconfdir}/ceph/ install -d -m 0750 ${D}${sysconfdir}/services.d/controller install -d -m 0750 ${D}${sysconfdir}/services.d/storage install -d -m 0750 ${D}${sysconfdir}/services.d/worker - install -m 0750 ${WORKDIR}/ceph.sh ${D}${sysconfdir}/services.d/controller - install -m 0750 ${WORKDIR}/ceph.sh ${D}${sysconfdir}/services.d/storage - install -m 0750 ${WORKDIR}/ceph.sh ${D}${sysconfdir}/services.d/worker + install -m 0750 ${STX_METADATA_PATH}/ceph.sh ${D}${sysconfdir}/services.d/controller + install -m 0750 ${STX_METADATA_PATH}/ceph.sh ${D}${sysconfdir}/services.d/storage + install -m 0750 ${STX_METADATA_PATH}/ceph.sh ${D}${sysconfdir}/services.d/worker - install -Dm 0750 ${WORKDIR}/ceph-init-wrapper.sh ${D}${sysconfdir}/rc.d/init.d/ceph-init-wrapper - install -Dm 0750 ${WORKDIR}/ceph-init-wrapper.sh ${D}${sysconfdir}/init.d/ceph-init-wrapper + install -Dm 0750 ${STX_METADATA_PATH}/ceph-init-wrapper.sh ${D}${sysconfdir}/rc.d/init.d/ceph-init-wrapper + install -Dm 0750 ${STX_METADATA_PATH}/ceph-init-wrapper.sh ${D}${sysconfdir}/init.d/ceph-init-wrapper sed -i -e 's|/usr/lib64|${libdir}|' ${D}${sysconfdir}/rc.d/init.d/ceph-init-wrapper ${D}${sysconfdir}/init.d/ceph-init-wrapper - install -m 0700 ${WORKDIR}/ceph-preshutdown.sh ${D}${sbindir}/ceph-preshutdown.sh + install -m 0700 ${STX_METADATA_PATH}/ceph-preshutdown.sh ${D}${sbindir}/ceph-preshutdown.sh - install -Dm 0644 ${WORKDIR}/starlingx-docker-override.conf ${D}${systemd_system_unitdir}/docker.service.d/starlingx-docker-override.conf + install -Dm 0644 ${STX_METADATA_PATH}/starlingx-docker-override.conf ${D}${systemd_system_unitdir}/docker.service.d/starlingx-docker-override.conf install -m 0644 -D ${S}/src/etc-rbdmap ${D}${sysconfdir}/ceph/rbdmap install -m 0644 -D ${S}/etc/sysconfig/ceph ${D}${sysconfdir}/sysconfig/ceph diff --git a/meta-stx-virt/recipes-extended/ceph/ceph_14.1.0.bbappend b/meta-stx-virt/recipes-extended/ceph/ceph_14.1.0.bbappend index a89528f..09d0cf9 100644 --- a/meta-stx-virt/recipes-extended/ceph/ceph_14.1.0.bbappend +++ b/meta-stx-virt/recipes-extended/ceph/ceph_14.1.0.bbappend @@ -1,22 +1,15 @@ -FILESEXTRAPATHS_prepend := "${THISDIR}/${BP}:${THISDIR}/files:" +FILESEXTRAPATHS_prepend := "${THISDIR}/${BP}:" inherit python3native python3-dir +inherit stx-metadata + +STX_REPO = "integ" +STX_SUBPATH = "ceph/ceph/files" DISTRO_FEATURES_BACKFILL_CONSIDERED_remove = "sysvinit" SRC_URI += "\ file://0001-ceph-rebase-on-stx.3.0-and-warrior.patch \ - file://ceph.conf \ - file://ceph-init-wrapper.sh \ - file://ceph-preshutdown.sh \ - file://ceph.service \ - file://mgr-restful-plugin.py \ - file://starlingx-docker-override.conf \ - file://ceph.conf.pmon \ - file://ceph-manage-journal.py \ - file://ceph-radosgw.service \ - file://ceph.sh \ - file://mgr-restful-plugin.service \ file://rados.runtime.decode.error.patch \ " DEPENDS = "boost rdma-core bzip2 curl expat gperf-native \ @@ -55,31 +48,31 @@ EXTRA_OECMAKE = "-DWITH_MANPAGE=OFF \ do_install_append () { install -d ${D}${sysconfdir}/ceph - install -m 0644 ${WORKDIR}/ceph.conf ${D}${sysconfdir}/ceph/ - install -m 0644 ${WORKDIR}/ceph-radosgw.service ${D}${systemd_system_unitdir}/ceph-radosgw@.service - install -m 0644 ${WORKDIR}/ceph.service ${D}${systemd_system_unitdir} - install -m 0644 ${WORKDIR}/mgr-restful-plugin.service ${D}${systemd_system_unitdir} + install -m 0644 ${STX_METADATA_PATH}/ceph.conf ${D}${sysconfdir}/ceph/ + install -m 0644 ${STX_METADATA_PATH}/ceph-radosgw.service ${D}${systemd_system_unitdir}/ceph-radosgw@.service + install -m 0644 ${STX_METADATA_PATH}/ceph.service ${D}${systemd_system_unitdir} + install -m 0644 ${STX_METADATA_PATH}/mgr-restful-plugin.service ${D}${systemd_system_unitdir} - install -m 0700 ${WORKDIR}/ceph-manage-journal.py ${D}${sbindir}/ceph-manage-journal - install -Dm 0750 ${WORKDIR}/mgr-restful-plugin.py ${D}${sysconfdir}/rc.d/init.d/mgr-restful-plugin - install -Dm 0750 ${WORKDIR}/mgr-restful-plugin.py ${D}${sysconfdir}/init.d/mgr-restful-plugin - install -m 0750 ${WORKDIR}/ceph.conf.pmon ${D}${sysconfdir}/ceph/ + install -m 0700 ${STX_METADATA_PATH}/ceph-manage-journal.py ${D}${sbindir}/ceph-manage-journal + install -Dm 0750 ${STX_METADATA_PATH}/mgr-restful-plugin.py ${D}${sysconfdir}/rc.d/init.d/mgr-restful-plugin + install -Dm 0750 ${STX_METADATA_PATH}/mgr-restful-plugin.py ${D}${sysconfdir}/init.d/mgr-restful-plugin + install -m 0750 ${STX_METADATA_PATH}/ceph.conf.pmon ${D}${sysconfdir}/ceph/ install -d -m 0750 ${D}${sysconfdir}/services.d/controller install -d -m 0750 ${D}${sysconfdir}/services.d/storage install -d -m 0750 ${D}${sysconfdir}/services.d/worker - install -m 0750 ${WORKDIR}/ceph.sh ${D}${sysconfdir}/services.d/controller - install -m 0750 ${WORKDIR}/ceph.sh ${D}${sysconfdir}/services.d/storage - install -m 0750 ${WORKDIR}/ceph.sh ${D}${sysconfdir}/services.d/worker + install -m 0750 ${STX_METADATA_PATH}/ceph.sh ${D}${sysconfdir}/services.d/controller + install -m 0750 ${STX_METADATA_PATH}/ceph.sh ${D}${sysconfdir}/services.d/storage + install -m 0750 ${STX_METADATA_PATH}/ceph.sh ${D}${sysconfdir}/services.d/worker - install -Dm 0750 ${WORKDIR}/ceph-init-wrapper.sh ${D}${sysconfdir}/rc.d/init.d/ceph-init-wrapper - install -Dm 0750 ${WORKDIR}/ceph-init-wrapper.sh ${D}${sysconfdir}/init.d/ceph-init-wrapper + install -Dm 0750 ${STX_METADATA_PATH}/ceph-init-wrapper.sh ${D}${sysconfdir}/rc.d/init.d/ceph-init-wrapper + install -Dm 0750 ${STX_METADATA_PATH}/ceph-init-wrapper.sh ${D}${sysconfdir}/init.d/ceph-init-wrapper sed -i -e 's|/usr/lib64|${libdir}|' ${D}${sysconfdir}/rc.d/init.d/ceph-init-wrapper ${D}${sysconfdir}/init.d/ceph-init-wrapper - install -m 0700 ${WORKDIR}/ceph-preshutdown.sh ${D}${sbindir}/ceph-preshutdown.sh + install -m 0700 ${STX_METADATA_PATH}/ceph-preshutdown.sh ${D}${sbindir}/ceph-preshutdown.sh - install -Dm 0644 ${WORKDIR}/starlingx-docker-override.conf ${D}${systemd_system_unitdir}/docker.service.d/starlingx-docker-override.conf + install -Dm 0644 ${STX_METADATA_PATH}/starlingx-docker-override.conf ${D}${systemd_system_unitdir}/docker.service.d/starlingx-docker-override.conf install -m 0644 -D ${S}/src/etc-rbdmap ${D}${sysconfdir}/ceph/rbdmap install -m 0644 -D ${S}/etc/sysconfig/ceph ${D}${sysconfdir}/sysconfig/ceph diff --git a/meta-stx-virt/recipes-extended/ceph/files/ceph-init-wrapper.sh b/meta-stx-virt/recipes-extended/ceph/files/ceph-init-wrapper.sh deleted file mode 100755 index ddbbc84..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/ceph-init-wrapper.sh +++ /dev/null @@ -1,331 +0,0 @@ -#!/bin/bash -# -# Copyright (c) 2019 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -# This script is a helper wrapper for pmon monitoring of ceph -# processes. The "/etc/init.d/ceph" script does not know if ceph is -# running on the node. For example when the node is locked, ceph -# processes are not running. In that case we do not want pmond to -# monitor these processes. -# -# The script "/etc/services.d//ceph.sh" will create the file -# "/var/run/.ceph_started" when ceph is running and remove it when -# is not. -# -# The script also extracts one or more ceph process names that are -# reported as 'not running' or 'dead' or 'failed' by '/etc/intit.d/ceph status' -# and writes the names to a text file: /tmp/ceph_status_failure.txt for -# pmond to access. The pmond adds the text to logs and alarms. Example of text -# samples written to file by this script are: -# 'osd.1' -# 'osd.1, osd.2' -# 'mon.storage-0' -# 'mon.storage-0, osd.2' -# -# Moreover, for processes that are reported as 'hung' by '/etc/intit.d/ceph status' -# the script will try increase their logging to 'debug' for a configurable interval. -# With logging increased it will outputs a few stack traces then, at the end of this -# interval, it dumps its stack core and kills it. -# -# Return values; -# zero - /etc/init.d/ceph returned success or ceph is not running on the node -# non-zero /etc/init.d/ceph returned a failure or invalid syntax -# - -source /usr/bin/tsconfig -source /etc/platform/platform.conf - -CEPH_SCRIPT="/etc/init.d/ceph" -CEPH_FILE="$VOLATILE_PATH/.ceph_started" -CEPH_GET_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_mon_status" -CEPH_GET_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_osd_status" -CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt" - -BINDIR=/usr/bin -SBINDIR=/usr/sbin -LIBDIR=/usr/lib64/ceph -ETCDIR=/etc/ceph -source $LIBDIR/ceph_common.sh - -LOG_PATH=/var/log/ceph -LOG_FILE=$LOG_PATH/ceph-process-states.log -LOG_LEVEL=NORMAL # DEBUG -verbose=0 - -DATA_PATH=$VOLATILE_PATH/ceph_hang # folder where we keep state information -mkdir -p $DATA_PATH # make sure folder exists - -MONITORING_INTERVAL=15 -TRACE_LOOP_INTERVAL=5 -CEPH_STATUS_TIMEOUT=20 - -LOCK_CEPH_MON_SERVICE_FILE="$VOLATILE_PATH/.ceph_mon_status" -LOCK_CEPH_OSD_SERVICE_FILE="$VOLATILE_PATH/.ceph_osd_status" -LOCK_CEPH_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_mon_service" -LOCK_CEPH_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_osd_service" - -# Seconds to wait for ceph status to finish before -# continuing to execute a service action -MONITOR_STATUS_TIMEOUT=30 -MAX_STATUS_TIMEOUT=120 - -RC=0 - -# SM can only pass arguments through environment variable -# when ARGS is not empty use it to extend command line arguments -args=("$@") -if [ ! -z $ARGS ]; then - IFS=";" read -r -a new_args <<< "$ARGS" - args+=("${new_args[@]}") -fi - -with_service_lock () -{ - local target="$1"; shift - [ -z "${target}" ] && target="mon osd" - - # Run in sub-shell so we don't leak file descriptors - # used for locking service actions - ( - # Grab service locks - wlog "-" INFO "Grab service locks" - [[ "${target}" == *"mon"* ]] && flock ${LOCK_CEPH_MON_SERVICE_FD} - [[ "${target}" == *"osd"* ]] && flock ${LOCK_CEPH_OSD_SERVICE_FD} - - # Try to lock status with a timeout in case status is stuck - wlog "-" INFO "Lock service status" - deadline=$((SECONDS + MAX_STATUS_TIMEOUT + 1)) - if [[ "${target}" == *"mon"* ]]; then - flock --exclusive --timeout ${MONITOR_STATUS_TIMEOUT} ${LOCK_CEPH_MON_STATUS_FD} - fi - if [[ "${target}" == *"osd"* ]]; then - timeout=$((deadline - SECONDS)) - if [[ $timeout -gt 0 ]]; then - flock --exclusive --timeout ${timeout} ${LOCK_CEPH_OSD_STATUS_FD} - fi - fi - - # Close lock file descriptors so they are - # not inherited by the spawned process then - # run service action - wlog "-" INFO "Run service action: $@" - "$@" {LOCK_CEPH_MON_SERVICE_FD}>&- \ - {LOCK_CEPH_MON_STATUS_FD}>&- \ - {LOCK_CEPH_OSD_SERVICE_FD}>&- \ - {LOCK_CEPH_OSD_STATUS_FD}>&- - - ) {LOCK_CEPH_MON_SERVICE_FD}>${LOCK_CEPH_MON_SERVICE_FILE} \ - {LOCK_CEPH_MON_STATUS_FD}>${LOCK_CEPH_MON_STATUS_FILE} \ - {LOCK_CEPH_OSD_SERVICE_FD}>${LOCK_CEPH_OSD_SERVICE_FILE} \ - {LOCK_CEPH_OSD_STATUS_FD}>${LOCK_CEPH_OSD_STATUS_FILE} - RC=$? -} - -start () -{ - if [ ! -f ${CEPH_FILE} ]; then - # Ceph is not running on this node, return success - exit 0 - fi - wlog "-" INFO "Ceph START $1 command received" - with_service_lock "$1" ${CEPH_SCRIPT} start $1 - wlog "-" INFO "Ceph START $1 command finished." -} - -stop () -{ - wlog "-" INFO "Ceph STOP $1 command received." - with_service_lock "$1" ${CEPH_SCRIPT} stop $1 - wlog "-" INFO "Ceph STOP $1 command finished." -} - -restart () -{ - if [ ! -f ${CEPH_FILE} ]; then - # Ceph is not running on this node, return success - exit 0 - fi - wlog "-" INFO "Ceph RESTART $1 command received." - with_service_lock "$1" ${CEPH_SCRIPT} restart $1 - wlog "-" INFO "Ceph RESTART $1 command finished." -} - -log_and_restart_blocked_osds () -{ - # Log info about the blocked osd daemons and then restart it - local names=$1 - local message=$2 - for name in $names; do - wlog $name "INFO" "$message" - ${CEPH_SCRIPT} restart $name - done -} - -log_and_kill_hung_procs () -{ - # Log info about the hung processes and then kill them; later on pmon will restart them - local names=$1 - for name in $names; do - type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1' - id=`echo $name | cut -c 4- | sed 's/^\\.//'` - get_conf run_dir "/var/run/ceph" "run dir" - get_conf pid_file "$run_dir/$type.$id.pid" "pid file" - pid=$(cat $pid_file) - wlog $name "INFO" "Dealing with hung process (pid:$pid)" - - # monitoring interval - wlog $name "INFO" "Increasing log level" - execute_ceph_cmd ret $name "ceph daemon $name config set debug_$type 20/20" - monitoring=$MONITORING_INTERVAL - while [ $monitoring -gt 0 ]; do - if [ $(($monitoring % $TRACE_LOOP_INTERVAL)) -eq 0 ]; then - date=$(date "+%Y-%m-%d_%H-%M-%S") - log_file="$LOG_PATH/hang_trace_${name}_${pid}_${date}.log" - wlog $name "INFO" "Dumping stack trace to: $log_file" - $(pstack $pid >$log_file) & - fi - let monitoring-=1 - sleep 1 - done - wlog $name "INFO" "Trigger core dump" - kill -ABRT $pid &>/dev/null - rm -f $pid_file # process is dead, core dump is archiving, preparing for restart - # Wait for pending systemd core dumps - sleep 2 # hope systemd_coredump has started meanwhile - deadline=$(( $(date '+%s') + 300 )) - while [[ $(date '+%s') -lt "${deadline}" ]]; do - systemd_coredump_pid=$(pgrep -f "systemd-coredump.*${pid}.*ceph-${type}") - [[ -z "${systemd_coredump_pid}" ]] && break - wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}" - sleep 2 - done - kill -KILL $pid &>/dev/null - done -} - -status () -{ - local target="$1" # no shift here - [ -z "${target}" ] && target="mon osd" - - if [ ! -f ${CEPH_FILE} ]; then - # Ceph is not running on this node, return success - exit 0 - fi - - if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then - timeout $CEPH_STATUS_TIMEOUT ceph -s - if [ "$?" -ne 0 ]; then - # Ceph cluster is not accessible. Don't panic, controller swact - # may be in progress. - wlog "-" INFO "Ceph is down, ignoring OSD status." - exit 0 - fi - fi - - # Report success while ceph mon is running a service action - # otherwise mark ceph mon status is in progress - exec {LOCK_CEPH_MON_STATUS_FD}>${LOCK_CEPH_MON_STATUS_FILE} - if [[ "${target}" == *"mon"* ]]; then - flock --shared --nonblock ${LOCK_CEPH_MON_SERVICE_FILE} true - if [[ $? -ne 0 ]]; then - exit 0 - fi - # Lock will be released when script exits - flock --shared ${LOCK_CEPH_MON_STATUS_FD} - fi - # Report success while ceph mon is running a service action - # otherwise mark ceph osd status is in progress - exec {LOCK_CEPH_OSD_STATUS_FD}>${LOCK_CEPH_OSD_STATUS_FILE} - if [[ "${target}" == *"osd"* ]]; then - flock --shared --nonblock ${LOCK_CEPH_OSD_SERVICE_FILE} true - if [[ $? -ne 0 ]]; then - exit 0 - fi - # Lock will be released when script exits - flock --shared ${LOCK_CEPH_OSD_STATUS_FD} - fi - - result=`${CEPH_SCRIPT} status $1 {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-` - RC=$? - if [ "$RC" -ne 0 ]; then - erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` - hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` - blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` - stuck_peering_procs=`echo "$result" | sort | uniq | awk ' /stuck peering/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` - invalid=0 - host=`hostname` - if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then - # On 2 node configuration we have a floating monitor - host="controller" - fi - for i in $(echo $erred_procs $hung_procs); do - if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host ]]; then - continue - else - invalid=1 - fi - done - - log_and_restart_blocked_osds "$blocked_ops_procs"\ - "Restarting OSD with blocked operations" - log_and_restart_blocked_osds "$stuck_peering_procs"\ - "Restarting OSD stuck peering" - log_and_kill_hung_procs $hung_procs - - rm -f $CEPH_STATUS_FAILURE_TEXT_FILE - if [ $invalid -eq 0 ]; then - text="" - for i in $erred_procs; do - text+="$i, " - done - for i in $hung_procs; do - text+="$i (process hang), " - done - echo "$text" | tr -d '\n' > $CEPH_STATUS_FAILURE_TEXT_FILE - else - echo "$host: '${CEPH_SCRIPT} status $1' result contains invalid process names: $erred_procs" - echo "Undetermined osd or monitor id" > $CEPH_STATUS_FAILURE_TEXT_FILE - fi - fi - - if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then - # SM needs exit code != 0 from 'status mon' argument of the init script on - # standby controller otherwise it thinks that the monitor is running and - # tries to stop it. - # '/etc/init.d/ceph status mon' checks the status of monitors configured in - # /etc/ceph/ceph.conf and if it should be running on current host. - # If it should not be running it just exits with code 0. This is what - # happens on the standby controller. - # When floating monitor is running on active controller /var/lib/ceph/mon of - # standby is not mounted (Ceph monitor partition is DRBD synced). - test -e "/var/lib/ceph/mon/ceph-controller" - if [ "$?" -ne 0 ]; then - exit 3 - fi - fi -} - - -case "${args[0]}" in - start) - start ${args[1]} - ;; - stop) - stop ${args[1]} - ;; - restart) - restart ${args[1]} - ;; - status) - status ${args[1]} - ;; - *) - echo "Usage: $0 {start|stop|restart|status} [{mon|osd|osd.|mon.}]" - exit 1 - ;; -esac - -exit $RC diff --git a/meta-stx-virt/recipes-extended/ceph/files/ceph-manage-journal.py b/meta-stx-virt/recipes-extended/ceph/files/ceph-manage-journal.py deleted file mode 100644 index f91cbc1..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/ceph-manage-journal.py +++ /dev/null @@ -1,334 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2019 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# - -import ast -import os -import os.path -import re -import subprocess -import sys - -DEVICE_NAME_NVME = "nvme" - -######### -# Utils # -######### - - -def command(arguments, **kwargs): - """Execute e command and capture stdout, stderr & return code""" - process = subprocess.Popen( - arguments, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - **kwargs) - out, err = process.communicate() - return out, err, process.returncode - - -def get_input(arg, valid_keys): - """Convert the input to a dict and perform basic validation""" - json_string = arg.replace("\\n", "\n") - try: - input_dict = ast.literal_eval(json_string) - if not all(k in input_dict for k in valid_keys): - return None - except Exception: - return None - - return input_dict - - -def get_partition_uuid(dev): - output, _, _ = command(['blkid', dev]) - try: - return re.search('PARTUUID=\"(.+?)\"', output).group(1) - except AttributeError: - return None - - -def device_path_to_device_node(device_path): - try: - output, _, _ = command(["udevadm", "settle", "-E", device_path]) - out, err, retcode = command(["readlink", "-f", device_path]) - out = out.rstrip() - except Exception as e: - return None - - return out - - -########################################### -# Manage Journal Disk Partitioning Scheme # -########################################### - -DISK_BY_PARTUUID = "/dev/disk/by-partuuid/" -JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' # Type of a journal partition - - -def is_partitioning_correct(disk_path, partition_sizes): - """Validate the existence and size of journal partitions""" - - # Obtain the device node from the device path. - disk_node = device_path_to_device_node(disk_path) - - # Check that partition table format is GPT - output, _, _ = command(["udevadm", "settle", "-E", disk_node]) - output, _, _ = command(["parted", "-s", disk_node, "print"]) - if not re.search('Partition Table: gpt', output): - print("Format of disk node %s is not GPT, zapping disk" % disk_node) - return False - - # Check each partition size - partition_index = 1 - for size in partition_sizes: - # Check that each partition size matches the one in input - if DEVICE_NAME_NVME in disk_node: - partition_node = '{}p{}'.format(disk_node, str(partition_index)) - else: - partition_node = '{}{}'.format(disk_node, str(partition_index)) - - output, _, _ = command(["udevadm", "settle", "-E", partition_node]) - cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"] - output, _, _ = command(cmd) - - regex = ("^Disk " + str(partition_node) + ":\\s*" + - str(size) + "[\\.0]*MiB") - if not re.search(regex, output, re.MULTILINE): - print("Journal partition %(node)s size is not %(size)s, " - "zapping disk" % {"node": partition_node, "size": size}) - return False - - partition_index += 1 - - output, _, _ = command(["udevadm", "settle", "-t", "10"]) - return True - - -def create_partitions(disk_path, partition_sizes): - """Recreate partitions""" - - # Obtain the device node from the device path. - disk_node = device_path_to_device_node(disk_path) - - # Issue: After creating a new partition table on a device, Udev does not - # always remove old symlinks (i.e. to previous partitions on that device). - # Also, even if links are erased before zapping the disk, some of them will - # be recreated even though there is no partition to back them! - # Therefore, we have to remove the links AFTER we erase the partition table - # Issue: DISK_BY_PARTUUID directory is not present at all if there are no - # GPT partitions on the storage node so nothing to remove in this case - links = [] - if os.path.isdir(DISK_BY_PARTUUID): - links = [os.path.join(DISK_BY_PARTUUID, l) for l in os.listdir(DISK_BY_PARTUUID) - if os.path.islink(os.path.join(DISK_BY_PARTUUID, l))] - - # Erase all partitions on current node by creating a new GPT table - _, err, ret = command(["parted", "-s", disk_node, "mktable", "gpt"]) - if ret: - print("Error erasing partition table of %(node)s\n" - "Return code: %(ret)s reason: %(reason)s" % - {"node": disk_node, "ret": ret, "reason": err}) - exit(1) - - # Erase old symlinks - for l in links: - if disk_node in os.path.realpath(l): - os.remove(l) - - # Create partitions in order - used_space_mib = 1 # leave 1 MB at the beginning of the disk - num = 1 - for size in partition_sizes: - cmd = ['parted', '-s', disk_node, 'unit', 'mib', - 'mkpart', 'primary', - str(used_space_mib), str(used_space_mib + size)] - _, err, ret = command(cmd) - parms = {"disk_node": disk_node, - "start": used_space_mib, - "end": used_space_mib + size, - "reason": err} - print("Created partition from start=%(start)s MiB to end=%(end)s MiB" - " on %(disk_node)s" % parms) - if ret: - print("Failed to create partition with " - "start=%(start)s, end=%(end)s " - "on %(disk_node)s reason: %(reason)s" % parms) - exit(1) - # Set partition type to ceph journal - # noncritical operation, it makes 'ceph-disk list' output correct info - cmd = ['sgdisk', - '--change-name={num}:ceph journal'.format(num=num), - '--typecode={num}:{uuid}'.format( - num=num, - uuid=JOURNAL_UUID, - ), - disk_node] - _, err, ret = command(cmd) - if ret: - print("WARNINIG: Failed to set partition name and typecode") - used_space_mib += size - num += 1 - - -########################### -# Manage Journal Location # -########################### - -OSD_PATH = "/var/lib/ceph/osd/" - - -def mount_data_partition(data_path, osdid): - """Mount an OSD data partition and return the mounted path""" - - # Obtain the device node from the device path. - data_node = device_path_to_device_node(data_path) - - mount_path = OSD_PATH + "ceph-" + str(osdid) - output, _, _ = command(['mount']) - regex = "^" + data_node + ".*" + mount_path - if not re.search(regex, output, re.MULTILINE): - cmd = ['mount', '-t', 'xfs', data_node, mount_path] - _, _, ret = command(cmd) - params = {"node": data_node, "path": mount_path} - if ret: - print("Failed to mount %(node)s to %(path), aborting" % params) - exit(1) - else: - print("Mounted %(node)s to %(path)s" % params) - return mount_path - - -def is_location_correct(path, journal_path, osdid): - """Check if location points to the correct device""" - - # Obtain the device node from the device path. - journal_node = device_path_to_device_node(journal_path) - - cur_node = os.path.realpath(path + "/journal") - if cur_node == journal_node: - return True - else: - return False - - -def fix_location(mount_point, journal_path, osdid): - """Move the journal to the new partition""" - - # Obtain the device node from the device path. - journal_node = device_path_to_device_node(journal_path) - - # Fix symlink - path = mount_point + "/journal" # 'journal' symlink path used by ceph-osd - journal_uuid = get_partition_uuid(journal_node) - new_target = DISK_BY_PARTUUID + journal_uuid - params = {"path": path, "target": new_target} - try: - if os.path.lexists(path): - os.unlink(path) # delete the old symlink - os.symlink(new_target, path) - print("Symlink created: %(path)s -> %(target)s" % params) - except: - print("Failed to create symlink: %(path)s -> %(target)s" % params) - exit(1) - # Fix journal_uuid - path = mount_point + "/journal_uuid" - try: - with open(path, 'w') as f: - f.write(journal_uuid) - except Exception as ex: - # The operation is noncritical, it only makes 'ceph-disk list' - # display complete output. We log and continue. - params = {"path": path, "uuid": journal_uuid} - print("WARNING: Failed to set uuid of %(path)s to %(uuid)s" % params) - - # Clean the journal partition - # even if erasing the partition table, if another journal was present here - # it's going to be reused. Journals are always bigger than 100MB. - command(['dd', 'if=/dev/zero', 'of=%s' % journal_node, - 'bs=1M', 'count=100']) - - # Format the journal - cmd = ['/usr/bin/ceph-osd', '-i', str(osdid), - '--pid-file', '/var/run/ceph/osd.%s.pid' % osdid, - '-c', '/etc/ceph/ceph.conf', - '--cluster', 'ceph', - '--mkjournal'] - out, err, ret = command(cmd) - params = {"journal_node": journal_node, - "osdid": osdid, - "ret": ret, - "reason": err} - if not ret: - print("Prepared new journal partition: %(journal_node)s " - "for osd id: %(osdid)s" % params) - else: - print("Error initializing journal node: " - "%(journal_node)s for osd id: %(osdid)s " - "ceph-osd return code: %(ret)s reason: %(reason)s" % params) - - -######## -# Main # -######## - -def main(argv): - # parse and validate arguments - err = False - partitions = None - location = None - if len(argv) != 2: - err = True - elif argv[0] == "partitions": - valid_keys = ['disk_path', 'journals'] - partitions = get_input(argv[1], valid_keys) - if not partitions: - err = True - elif not isinstance(partitions['journals'], list): - err = True - elif argv[0] == "location": - valid_keys = ['data_path', 'journal_path', 'osdid'] - location = get_input(argv[1], valid_keys) - if not location: - err = True - elif not isinstance(location['osdid'], int): - err = True - else: - err = True - if err: - print("Command intended for internal use only") - exit(-1) - - if partitions: - # Recreate partitions only if the existing ones don't match input - if not is_partitioning_correct(partitions['disk_path'], - partitions['journals']): - create_partitions(partitions['disk_path'], partitions['journals']) - else: - print("Partition table for %s is correct, " - "no need to repartition" % - device_path_to_device_node(partitions['disk_path'])) - elif location: - # we need to have the data partition mounted & we can let it mounted - mount_point = mount_data_partition(location['data_path'], - location['osdid']) - # Update journal location only if link point to another partition - if not is_location_correct(mount_point, - location['journal_path'], - location['osdid']): - print("Fixing journal location for " - "OSD id: %(id)s" % {"node": location['data_path'], - "id": location['osdid']}) - fix_location(mount_point, - location['journal_path'], - location['osdid']) - else: - print("Journal location for %s is correct," - "no need to change it" % location['data_path']) - - -main(sys.argv[1:]) diff --git a/meta-stx-virt/recipes-extended/ceph/files/ceph-preshutdown.sh b/meta-stx-virt/recipes-extended/ceph/files/ceph-preshutdown.sh deleted file mode 100644 index 5f59bd1..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/ceph-preshutdown.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# -# Copyright (c) 2019 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# - -script=$(basename $0) - -# Set nullglob so wildcards will return empty string if no match -shopt -s nullglob - -for dev in /dev/rbd[0-9]*; do - for mnt in $(mount | awk -v dev=$dev '($1 == dev) {print $3}'); do - logger -t ${script} "Unmounting $mnt" - /usr/bin/umount $mnt - done - logger -t ${script} "Unmounted $dev" -done - -for dev in /dev/rbd[0-9]*; do - /usr/bin/rbd unmap -o force $dev - logger -t ${script} "Unmapped $dev" -done - -lsmod | grep -q '^rbd\>' && /usr/sbin/modprobe -r rbd -lsmod | grep -q '^libceph\>' && /usr/sbin/modprobe -r libceph - -exit 0 - diff --git a/meta-stx-virt/recipes-extended/ceph/files/ceph-radosgw.service b/meta-stx-virt/recipes-extended/ceph/files/ceph-radosgw.service deleted file mode 100644 index 391ecf6..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/ceph-radosgw.service +++ /dev/null @@ -1,18 +0,0 @@ -[Unit] -Description=radosgw RESTful rados gateway -After=network.target -#After=remote-fs.target nss-lookup.target network-online.target time-sync.target -#Wants=network-online.target - -[Service] -Type=forking -Restart=no -KillMode=process -RemainAfterExit=yes -ExecStart=/etc/rc.d/init.d/ceph-radosgw start -ExecStop=/etc/rc.d/init.d/ceph-radosgw stop -ExecReload=/etc/rc.d/init.d/ceph-radosgw reload - -[Install] -WantedBy=multi-user.target - diff --git a/meta-stx-virt/recipes-extended/ceph/files/ceph.conf b/meta-stx-virt/recipes-extended/ceph/files/ceph.conf deleted file mode 100644 index 29e0711..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/ceph.conf +++ /dev/null @@ -1,58 +0,0 @@ -[global] - # Unique ID for the cluster. - fsid = %CLUSTER_UUID% - # Public network where the monitor is connected to, i.e, 128.224.0.0/16 - #public network = 127.0.0.1/24 - # For version 0.55 and beyond, you must explicitly enable - # or disable authentication with "auth" entries in [global]. - auth_cluster_required = cephx - auth_service_required = cephx - auth_client_required = cephx - osd_journal_size = 1024 - - # Uncomment the following line if you are mounting with ext4 - # filestore xattr use omap = true - - # Number of replicas of objects. Write an object 2 times. - # Cluster cannot reach an active + clean state until there's enough OSDs - # to handle the number of copies of an object. In this case, it requires - # at least 2 OSDs - osd_pool_default_size = 2 - - # Allow writing one copy in a degraded state. - osd_pool_default_min_size = 1 - - # Ensure you have a realistic number of placement groups. We recommend - # approximately 100 per OSD. E.g., total number of OSDs multiplied by 100 - # divided by the number of replicas (i.e., osd pool default size). So for - # 2 OSDs and osd pool default size = 2, we'd recommend approximately - # (100 * 2) / 2 = 100. - osd_pool_default_pg_num = 64 - osd_pool_default_pgp_num = 64 - osd_crush_chooseleaf_type = 1 - setuser match path = /var/lib/ceph/$type/$cluster-$id - - # Override Jewel default of 2 reporters. StarlingX has replication factor 2 - mon_osd_min_down_reporters = 1 - - # Use Hammer's report interval default value - osd_mon_report_interval_max = 120 - - # Configure max PGs per OSD to cover worst-case scenario of all possible - # StarlingX deployments i.e. AIO-SX with one OSD. Otherwise using - # the default value provided by Ceph Mimic leads to "too many PGs per OSD" - # health warning as the pools needed by stx-openstack are being created. - mon_max_pg_per_osd = 2048 - osd_max_pg_per_osd_hard_ratio = 1.2 - -[osd] - osd_mkfs_type = xfs - osd_mkfs_options_xfs = "-f" - osd_mount_options_xfs = "rw,noatime,inode64,logbufs=8,logbsize=256k" - -[mon] - mon warn on legacy crush tunables = false - # Quiet new warnings on move to Hammer - mon pg warn max per osd = 2048 - mon pg warn max object skew = 0 - mgr initial modules = restful diff --git a/meta-stx-virt/recipes-extended/ceph/files/ceph.conf.pmon b/meta-stx-virt/recipes-extended/ceph/files/ceph.conf.pmon deleted file mode 100644 index 00418b2..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/ceph.conf.pmon +++ /dev/null @@ -1,26 +0,0 @@ -[process] -process = ceph -script = /etc/init.d/ceph-init-wrapper - -style = lsb -severity = major ; minor, major, critical -restarts = 3 ; restart retries before error assertion -interval = 30 ; number of seconds to wait between restarts - -mode = status ; Monitoring mode: passive (default) or active - ; passive: process death monitoring (default: always) - ; active : heartbeat monitoring, i.e. request / response messaging - ; status : determine process health with executing "status" command - ; "start" is used to start the process(es) again - ; ignore : do not monitor or stop monitoring - -; Status and Active Monitoring Options - -period = 30 ; monitor period in seconds -timeout = 120 ; for active mode, messaging timeout period in seconds, must be shorter than period - ; for status mode, max amount of time for a command to execute - -; Status Monitoring Options -start_arg = start ; start argument for the script -status_arg = status ; status argument for the script -status_failure_text = /tmp/ceph_status_failure.txt ; text to be added to alarms or logs, this is optional diff --git a/meta-stx-virt/recipes-extended/ceph/files/ceph.service b/meta-stx-virt/recipes-extended/ceph/files/ceph.service deleted file mode 100644 index d3c2acc..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/ceph.service +++ /dev/null @@ -1,16 +0,0 @@ -[Unit] -Description=StarlingX Ceph Startup -After=network.target - -[Service] -Type=forking -Restart=no -KillMode=process -RemainAfterExit=yes -ExecStart=/etc/rc.d/init.d/ceph start -ExecStop=/etc/rc.d/init.d/ceph stop -PIDFile=/var/run/ceph/ceph.pid - -[Install] -WantedBy=multi-user.target - diff --git a/meta-stx-virt/recipes-extended/ceph/files/ceph.sh b/meta-stx-virt/recipes-extended/ceph/files/ceph.sh deleted file mode 100644 index e7e6ecd..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/ceph.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash - -INITDIR=/etc/init.d -LOGFILE=/var/log/ceph/ceph-init.log -CEPH_FILE=/var/run/.ceph_started - -# Get our nodetype -. /etc/platform/platform.conf - -# Exit immediately if ceph not configured (i.e. no mon in the config file) -if ! grep -q "mon\." /etc/ceph/ceph.conf -then - exit 0 -fi - -logecho () -{ - echo $1 - date >> ${LOGFILE} - echo $1 >> ${LOGFILE} -} - -start () -{ - logecho "Starting ceph services..." - ${INITDIR}/ceph start >> ${LOGFILE} 2>&1 - RC=$? - - if [ ! -f ${CEPH_FILE} ]; then - touch ${CEPH_FILE} - fi -} - -stop () -{ - if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" == "simplex" ]]; then - logecho "Ceph services will continue to run on node" - exit 0 - fi - - logecho "Stopping ceph services..." - - if [ -f ${CEPH_FILE} ]; then - rm -f ${CEPH_FILE} - fi - - ${INITDIR}/ceph stop >> ${LOGFILE} 2>&1 - RC=$? -} - -RC=0 - -case "$1" in - start) - start - ;; - stop) - stop - ;; - *) - echo "Usage: $0 {start|stop}" - exit 1 - ;; -esac - -logecho "RC was: $RC" -exit $RC diff --git a/meta-stx-virt/recipes-extended/ceph/files/mgr-restful-plugin.py b/meta-stx-virt/recipes-extended/ceph/files/mgr-restful-plugin.py deleted file mode 100644 index d1f14b8..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/mgr-restful-plugin.py +++ /dev/null @@ -1,1121 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2019 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# - - -### BEGIN INIT INFO -# Provides: ceph/mgr RESTful API plugin -# Required-Start: $ceph -# Required-Stop: $ceph -# Default-Start: 2 3 4 5 -# Default-Stop: 0 1 6 -# Short-Description: Ceph MGR RESTful API plugin -# Description: Ceph MGR RESTful API plugin -### END INIT INFO - -import argparse -import contextlib -import errno -import fcntl -import inspect -import json -import logging -import multiprocessing -import os -import shutil -import signal -import socket -import subprocess -import sys -import tempfile -import time - -import daemon -import psutil -import requests - -# 'timeout' command returns exit status 124 -# if command times out (see man page) -GNU_TIMEOUT_EXPIRED_RETCODE = 124 - - -def psutil_terminate_kill(target, timeout): - - """Extend psutil functionality to stop a process. - - SIGINT is sent to each target then after a grace period SIGKILL - is sent to the ones that are still running. - """ - - if not isinstance(target, list): - target = [target] - _, target = psutil.wait_procs(target, timeout=0) - for action in [lambda p: p.terminate(), lambda p: p.kill()]: - for proc in target: - action(proc) - _, target = psutil.wait_procs( - target, timeout=timeout) - - -class Config(object): - - """ceph-mgr service wrapper configuration options. - - In the future we may want to load them from a configuration file - (for example /etc/ceph/mgr-restful-plugin.conf ) - """ - - def __init__(self): - self.log_level = logging.INFO - self.log_dir = '/var/log' - - self.ceph_mgr_service = '/usr/bin/ceph-mgr' - self.ceph_mgr_config = '/etc/ceph/ceph.conf' - self.ceph_mgr_cluster = 'ceph' - self.ceph_mgr_rundir = '/var/run/ceph/mgr' - self.ceph_mgr_confdir = '/var/lib/ceph/mgr' - self.ceph_mgr_identity = socket.gethostname() - - self.service_name = 'mgr-restful-plugin' - self.service_socket = os.path.join( - self.ceph_mgr_rundir, '{}.socket'.format(self.service_name)) - self.service_lock = os.path.join( - self.ceph_mgr_rundir, '{}.lock'.format(self.service_name)) - self.service_pid_file = os.path.join( - '/var/run/ceph', '{}.pid'.format(self.service_name)) - - self.restful_plugin_port = 5001 - - # maximum size of a message received/sent via - # service monitor control socket - self.service_socket_bufsize = 1024 - - # maximum time to wait for ceph cli to exit - self.ceph_cli_timeout_sec = 30 - - # how much time to wait after ceph cli commands fail with timeout - # before running any other commands - self.cluster_grace_period_sec = 30 - - # after ceph-mgr is started it goes through an internal initialization - # phase before; how much time to wait before querying ceph-mgr - self.ceph_mgr_grace_period_sec = 15 - - # after sending SIGTERM to ceph-mgr how much time to wait before - # sending SIGKILL (maximum time allowed for ceph-mgr cleanup) - self.ceph_mgr_kill_delay_sec = 5 - - # if service monitor is running a recovery procedure it reports - # status OK even if ceph-mgr is currently down. This sets the - # maximum number of consecutive ceph-mgr failures before reporting - # status error - self.ceph_mgr_fail_count_report_error = 3 - - # maximum number of consecutive ceph-mgr failures before - # stopping mgr-restful-plugin service - self.ceph_mgr_fail_count_exit = 5 - - # maximum time allowed for ceph-mgr to respond to a REST API request - self.rest_api_timeout_sec = 15 - - # interval between consecutive REST API requests (ping's). A smaller - # value here triggers more requests to ceph-mgr restful plugin. A - # higher value makes recovery slower when services become unavailable - self.restful_plugin_ping_delay_sec = 3 - - # where to save the self-signed certificate generated by ceph-mgr - self.restful_plugin_cert_path = os.path.join( - self.ceph_mgr_rundir, 'restful.crt') - - # time to wait after enabling restful plugin - self.restful_plugin_grace_period_sec = 3 - - # after how many REST API ping failures to restart ceph-mgr - self.ping_fail_count_restart_mgr = 3 - - # after how many REST API ping failures to report status error. - # Until then service monitor reports status OK just in case - # restful plugin recovers - self.ping_fail_count_report_error = 5 - - @staticmethod - def load(): - return Config() - - -def setup_logging(name=None, cleanup_handlers=False): - if not name: - name = CONFIG.service_name - log = logging.getLogger(name) - log.setLevel(CONFIG.log_level) - if cleanup_handlers: - try: - for handler in log.handlers: - if isinstance(handler, logging.StreamHandler): - handler.flush() - if isinstance(handler, logging.FileHandler): - handler.close() - log.handlers = [] - except Exception: - pass - elif log.handlers: - return log - handler = logging.FileHandler( - os.path.join(CONFIG.log_dir, - '{}.log'.format(CONFIG.service_name))) - handler.setFormatter( - logging.Formatter('%(asctime)s %(process)s %(levelname)s %(name)s %(message)s')) - log.addHandler(handler) - return log - - -CONFIG = Config.load() -LOG = setup_logging(name='init-wrapper') - - -class ServiceException(Exception): - - """Generic mgr-restful-plugin service exception. - - Build exception string based on static (per exception class) - string plus args, keyword args passed to exception constructor. - """ - - message = "" - - def __init__(self, *args, **kwargs): - if "message" not in kwargs: - try: - message = self.message.format(*args, **kwargs) - except Exception: # noqa - message = '{}, args:{}, kwargs: {}'.format( - self.message, args, kwargs) - else: - message = kwargs["message"] - super(ServiceException, self).__init__(message) - - -class ServiceAlreadyStarted(ServiceException): - message = ('Service monitor already started') - - -class ServiceLockFailed(ServiceException): - message = ('Unable to lock service monitor: ' - 'reason={reason}') - - -class ServiceNoSocket(ServiceException): - message = ('Unable to create service monitor socket: ' - 'reason={reason}') - - -class ServiceSocketBindFailed(ServiceException): - message = ('Failed to bind service monitor socket: ' - 'path={path}, reason={reason}') - - -class ServiceNoPidFile(ServiceException): - message = ('Failed to update pid file: ' - 'path={path}, reason={reason}') - - -class CommandFailed(ServiceException): - message = ('Command failed: command={command}, ' - 'reason={reason}, out={out}') - - -class CommandTimeout(ServiceException): - message = ('Command timeout: command={command}, ' - 'timeout={timeout}') - - -class CephMgrStartFailed(ServiceException): - message = ('Failed to start ceph_mgr: ' - 'reason={reason}') - - -class CephRestfulPluginFailed(ServiceException): - message = ('Failed to start restful plugin: ' - 'reason={reason}') - - -class RestApiPingFailed(ServiceException): - message = ('REST API ping failed: ' - 'reason={reason}') - - -class ServiceMonitor(object): - - """Configure and monitor ceph-mgr and restful plugin (Ceph REST API) - - 1. process init script service requests: status, stop. Requests are - received via a control socket. Stop has priority over whatever - the monitor is doing currently. Any ceph command that may be running - is terminated/killed. Note that while ceph-mgr and restful plugin - configuration is in progress ServiceMonitor reports status OK to - avoid being restarted by SM. - - 2. configure ceph-mgr and mgr restful plugin: authentication, REST API - service port, self signed certificate. This runs as a separate - process so it can be stopped when init script requests it. - - 3. periodically check (ping) REST API responds to HTTPS requests. - Recovery actions are taken if REST API fails to respond: restart - ceph-mgr, wait for cluster to become available again. - """ - - def __init__(self): - # process running configuration & REST API ping loop - self.monitor = None - - # command socket used by init script - self.command = None - - # ceph-mgr process - self.ceph_mgr = None - - # consecutive ceph-mgr/restful-plugin start failures. Service monitor - # reports failure after CONFIG.ceph_mgr_max_failure_count - self.ceph_mgr_failure_count = 0 - - # consecutive REST API ping failures. ceph-mgr service is restarted - # after CONFIG.ping_fail_count_restart_mgr threshold is exceeded - self.ping_failure_count = 0 - - # REST API url reported by ceph-mgr after enabling restful plugin - self.restful_plugin_url = '' - - # REST API self signed certificate generated by restful plugin - self.certificate = '' - - def run(self): - self.disable_certificate_check() - with self.service_lock(), self.service_socket(), \ - self.service_pid_file(): - self.start_monitor() - self.server_loop() - - def disable_certificate_check(self): - # ceph-mgr restful plugin is configured with a self-signed - # certificate. Certificate host is hard-coded to "ceph-restful" - # which causes HTTPS requests to fail because they don't - # match current host name ("controller-..."). Disable HTTPS - # certificates check in urllib3 - LOG.warning('Disable urllib3 certifcates check') - requests.packages.urllib3.disable_warnings() - - def server_loop(self): - self.command.listen(2) - while True: - try: - client, _ = self.command.accept() - request = client.recv(CONFIG.service_socket_bufsize) - LOG.debug('Monitor command socket: request=%s', str(request)) - cmd = request.split(' ') - cmd, args = cmd[0], cmd[1:] - if cmd == 'status': - self.send_response(client, request, self.status()) - elif cmd == 'stop': - self.stop() - self.send_response(client, request, 'OK') - break - elif cmd == 'restful-url': - try: - self.restful_plugin_url = args[0] - self.send_response(client, request, 'OK') - except IndexError: - LOG.warning('Failed to update restful plugin url: ' - 'args=%s', str(args)) - self.send_response(client, request, 'ERR') - elif cmd == 'certificate': - try: - self.certificate = args[0] if args else '' - self.send_response(client, request, 'OK') - except IndexError: - LOG.warning('Failed to update certificate path: ' - 'args=%s', str(args)) - self.send_response(client, request, 'ERR') - elif cmd == 'ceph-mgr-failures': - try: - self.ceph_mgr_failure_count = int(args[0]) - self.send_response(client, request, 'OK') - if self.ceph_mgr_failure_count >= CONFIG.ceph_mgr_fail_count_exit: - self.stop() - break - except (IndexError, ValueError): - LOG.warning('Failed to update ceph-mgr failures: ' - 'args=%s', str(args)) - self.send_response(client, request, 'ERR') - elif cmd == 'ping-failures': - try: - self.ping_failure_count = int(args[0]) - self.send_response(client, request, 'OK') - except (IndexError, ValueError): - LOG.warning('Failed to update ping failures: ' - 'args=%s', str(args)) - self.send_response(client, request, 'ERR') - except Exception as err: - LOG.exception(err) - - @staticmethod - def send_response(client, request, response): - try: - client.send(response) - except socket.error as err: - LOG.warning('Failed to send response back. ' - 'request=%s, response=%s, reason=%s', - request, response, err) - - def status(self): - if not self.restful_plugin_url: - if self.ceph_mgr_failure_count < CONFIG.ceph_mgr_fail_count_report_error \ - and self.ping_failure_count < CONFIG.ping_fail_count_report_error: - LOG.debug('Monitor is starting services. Report status OK') - return 'OK' - LOG.debug('Too many failures: ' - 'ceph_mgr=%d < %d, ping=%d < %d. ' - 'Report status ERR', - self.ceph_mgr_failure_count, - CONFIG.ceph_mgr_fail_count_report_error, - self.ping_failure_count, - CONFIG.ping_fail_count_report_error) - return 'ERR.down' - try: - self.restful_plugin_ping() - LOG.debug('Restful plugin ping successful. Report status OK') - return 'OK' - except (CommandFailed, RestApiPingFailed): - if self.ceph_mgr_failure_count < CONFIG.ceph_mgr_fail_count_report_error \ - and self.ping_failure_count < CONFIG.ping_fail_count_report_error: - LOG.info('Restful plugin does not respond but failure ' - 'count is within acceptable limits: ' - ' ceph_mgr=%d < %d, ping=%d < %d. ' - 'Report status OK', - self.ceph_mgr_failure_count, - CONFIG.ceph_mgr_fail_count_report_error, - self.ping_failure_count, - CONFIG.ping_fail_count_report_error) - return 'OK' - LOG.debug('Restful does not respond (ping failure count %d). ' - 'Report status ERR', self.ping_failure_count) - return 'ERR.ping_failed' - - def stop(self): - if not self.monitor: - return - LOG.info('Stop monitor with SIGTERM to process group %d', - self.monitor.pid) - try: - os.killpg(self.monitor.pid, signal.SIGTERM) - except OSError as err: - LOG.info('Stop monitor failed: reason=%s', str(err)) - return - time.sleep(CONFIG.ceph_mgr_kill_delay_sec) - LOG.info('Stop monitor with SIGKILL to process group %d', - self.monitor.pid) - try: - os.killpg(self.monitor.pid, signal.SIGKILL) - os.waitpid(self.monitor.pid, 0) - except OSError as err: - LOG.info('Stop monitor failed: reason=%s', str(err)) - return - LOG.info('Monitor stopped: pid=%d', self.monitor.pid) - - @contextlib.contextmanager - def service_lock(self): - LOG.info('Take service lock: path=%s', CONFIG.service_lock) - try: - os.makedirs(os.path.dirname(CONFIG.service_lock)) - except OSError: - pass - lock_file = open(CONFIG.service_lock, 'w') - try: - fcntl.flock(lock_file.fileno(), - fcntl.LOCK_EX | fcntl.LOCK_NB) - except (IOError, OSError) as err: - if err.errno == errno.EAGAIN: - raise ServiceAlreadyStarted() - else: - raise ServiceLockFailed(reason=str(err)) - # even if we have the lock here there might be another service manager - # running whose CONFIG.ceph_mgr_rundir was removed before starting - # this instance. Make sure there is only one service manager running - self.stop_other_service_managers() - try: - yield - finally: - os.unlink(CONFIG.service_lock) - lock_file.close() - LOG.info('Release service lock: path=%s', CONFIG.service_lock) - - def stop_other_service_managers(self): - service = os.path.join('/etc/init.d', CONFIG.service_name) - for p in psutil.process_iter(): - if p.cmdline()[:2] not in [[service], ['/usr/bin/python', service]]: - continue - if p.pid == os.getpid(): - continue - p.kill() - - @contextlib.contextmanager - def service_socket(self): - LOG.info('Create service socket') - try: - self.command = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) - except socket.error as err: - raise ServiceNoSocket(reason=str(err)) - LOG.info('Remove existing socket files') - try: - os.unlink(CONFIG.service_socket) - except OSError: - pass - LOG.info('Bind service socket: path=%s', CONFIG.service_socket) - try: - self.command.bind(CONFIG.service_socket) - except socket.error as err: - raise ServiceSocketBindFailed( - path=CONFIG.service_socket, reason=str(err)) - try: - yield - finally: - LOG.info('Close service socket and remove file: path=%s', - CONFIG.service_socket) - self.command.close() - os.unlink(CONFIG.service_socket) - - @contextlib.contextmanager - def service_pid_file(self): - LOG.info('Update service pid file: path=%s', CONFIG.service_pid_file) - try: - pid_file = open(CONFIG.service_pid_file, 'w') - pid_file.write(str(os.getpid())) - pid_file.flush() - except OSError as err: - raise ServiceNoPidFile( - path=CONFIG.service_pid_file, reason=str(err)) - try: - yield - finally: - LOG.info('Remove service pid file: path=%s', - CONFIG.service_pid_file) - try: - os.unlink(CONFIG.service_pid_file) - except OSError: - pass - - def start_monitor(self): - LOG.info('Start monitor loop') - self.monitor = multiprocessing.Process(target=self.monitor_loop) - self.monitor.start() - - def stop_unmanaged_ceph_mgr(self): - LOG.info('Stop unmanaged running ceph-mgr processes') - service_name = os.path.basename(CONFIG.ceph_mgr_service) - if self.ceph_mgr: - psutil_terminate_kill( - [proc for proc in psutil.process_iter() - if (proc.name() == service_name - and proc.pid != self.ceph_mgr.pid)], - CONFIG.ceph_mgr_kill_delay_sec) - else: - psutil_terminate_kill( - [proc for proc in psutil.process_iter() - if proc.name() == service_name], - CONFIG.ceph_mgr_kill_delay_sec) - - def monitor_loop(self): - - """Bring up and monitor ceph-mgr restful plugin. - - Steps: - - wait for Ceph cluster to become available - - configure and start ceph-mgr - - configure and enable restful plugin - - send periodic requests to REST API - - recover from failures - - Note: because this runs as a separate process it - must send status updates to service monitor - via control socket for: ping_failure_count, - restful_plugin_url and certificate. - """ - - # Promote to process group leader so parent (service monitor) - # can kill the monitor plus processes spawned by it. Otherwise - # children of monitor_loop() will keep running in background and - # will be reaped by init when they finish but by then they might - # interfere with any new service instance. - os.setpgrp() - - # Ignoring SIGTERM here ensures process group is not reused by - # the time parent (service monitor) issues the final SIGKILL. - signal.signal(signal.SIGTERM, signal.SIG_IGN) - - while True: - try: - # steps to configure/start ceph-mgr and restful plugin - self.ceph_fsid_get() - self.ceph_mgr_auth_create() - self.restful_plugin_set_server_port() - self.restful_plugin_create_certificate() - self.ceph_mgr_start() - self.restful_plugin_enable() - self.restful_plugin_create_admin_key() - self.restful_plugin_get_url() - self.restful_plugin_get_certificate() - - # REST API should be available now - # start making periodic requests (ping) - while True: - try: - self.restful_plugin_ping() - self.ping_failure_count = 0 - self.request_update_ping_failures( - self.ping_failure_count) - self.ceph_mgr_failure_count = 0 - self.request_update_ceph_mgr_failures( - self.ceph_mgr_failure_count) - time.sleep(CONFIG.restful_plugin_ping_delay_sec) - continue - except RestApiPingFailed as err: - LOG.warning(str(err)) - - LOG.info('REST API ping failure count=%d', - self.ping_failure_count) - self.ping_failure_count += 1 - self.request_update_ping_failures( - self.ping_failure_count) - - # maybe request failed because ceph-mgr is not running - if not self.ceph_mgr_is_running(): - self.ceph_mgr_failure_count += 1 - self.request_update_ceph_mgr_failures( - self.ceph_mgr_failure_count) - self.ceph_mgr_start() - time.sleep(CONFIG.ceph_mgr_grace_period_sec) - continue - - # maybe request failed because cluster health is not ok - if not self.ceph_fsid_get(): - LOG.info('Unable to get cluster fsid. ' - 'Sleep for a while') - time.sleep(CONFIG.cluster_grace_period_sec) - break - - # too many failures? Restart ceph-mgr and go again - # through configuration steps - if (self.ping_failure_count - % CONFIG.ping_fail_count_restart_mgr == 0): - LOG.info('Too many consecutive REST API failures. ' - 'Restart ceph-mgr. Update service ' - 'url and certificate') - self.ceph_mgr_stop() - self.restful_plugin_url = '' - self.request_update_plugin_url(self.restful_plugin_url) - self.certificate = '' - self.request_update_certificate(self.certificate) - break - - time.sleep(CONFIG.restful_plugin_ping_delay_sec) - - except CommandFailed as err: - LOG.warning(str(err)) - time.sleep(CONFIG.cluster_grace_period_sec) - except CommandTimeout as err: - LOG.warning(str(err)) - except (CephMgrStartFailed, CephRestfulPluginFailed) as err: - LOG.warning(str(err)) - self.ceph_mgr_failure_count += 1 - self.request_update_ceph_mgr_failures( - self.ceph_mgr_failure_count) - time.sleep(CONFIG.ceph_mgr_grace_period_sec) - except Exception as err: - LOG.exception(err) - time.sleep(CONFIG.cluster_grace_period_sec) - - @staticmethod - def run_with_timeout(command, timeout, stderr=subprocess.STDOUT): - try: - LOG.info('Run command: %s', ' '.join(command)) - return subprocess.check_output( - ['/usr/bin/timeout', str(timeout)] + command, - stderr=stderr, shell=False).strip() - except subprocess.CalledProcessError as err: - if err.returncode == GNU_TIMEOUT_EXPIRED_RETCODE: - raise CommandTimeout(command=err.cmd, timeout=timeout) - raise CommandFailed(command=err.cmd, reason=str(err), - out=err.output) - - def ceph_fsid_get(self): - return self.run_with_timeout(['/usr/bin/ceph', 'fsid'], - CONFIG.ceph_cli_timeout_sec) - - def ceph_mgr_has_auth(self): - path = '{}/ceph-{}'.format( - CONFIG.ceph_mgr_confdir, CONFIG.ceph_mgr_identity) - try: - os.makedirs(path) - except OSError as err: - pass - try: - self.run_with_timeout( - ['/usr/bin/ceph', 'auth', 'get', - 'mgr.{}'.format(CONFIG.ceph_mgr_identity), - '-o', '{}/keyring'.format(path)], - CONFIG.ceph_cli_timeout_sec) - return True - except CommandFailed as err: - if 'ENOENT' in str(err): - return False - raise - - def ceph_mgr_auth_create(self): - if self.ceph_mgr_has_auth(): - return - LOG.info('Create ceph-mgr authentication') - self.run_with_timeout( - ['/usr/bin/ceph', 'auth', 'get-or-create', - 'mgr.{}'.format(CONFIG.ceph_mgr_identity), - 'mon', 'allow *', 'osd', 'allow *'], - CONFIG.ceph_cli_timeout_sec) - - def ceph_mgr_is_running(self): - if not self.ceph_mgr: - return None - try: - self.ceph_mgr.wait(timeout=0) - except psutil.TimeoutExpired: - return True - return False - - def ceph_mgr_start(self): - if self.ceph_mgr_is_running(): - return - self.stop_unmanaged_ceph_mgr() - LOG.info('Start ceph-mgr daemon') - try: - with open(os.devnull, 'wb') as null: - self.ceph_mgr = psutil.Popen( - [CONFIG.ceph_mgr_service, - '--cluster', CONFIG.ceph_mgr_cluster, - '--conf', CONFIG.ceph_mgr_config, - '--id', CONFIG.ceph_mgr_identity, - '-f'], - close_fds=True, - stdout=null, - stderr=null, - shell=False) - except (OSError, ValueError) as err: - raise CephMgrStartFailed(reason=str(err)) - time.sleep(CONFIG.ceph_mgr_grace_period_sec) - - def ceph_mgr_stop(self): - if not self.ceph_mgr: - return - LOG.info('Stop ceph-mgr') - psutil_terminate_kill(self.ceph_mgr, CONFIG.ceph_mgr_kill_delay_sec) - - def restful_plugin_has_server_port(self): - try: - with open(os.devnull, 'wb') as null: - out = self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'get', - 'mgr/restful/server_port'], - CONFIG.ceph_cli_timeout_sec, stderr=null) - if out == str(CONFIG.restful_plugin_port): - return True - LOG.warning('Restful plugin port mismatch: ' - 'current=%d, expected=%d', out, - CONFIG.restful_plugin_port) - except CommandFailed as err: - LOG.warning('Failed to get restful plugin port: ' - 'reason=%s', str(err)) - return False - - def restful_plugin_set_server_port(self): - if self.restful_plugin_has_server_port(): - return - LOG.info('Set restful plugin port=%d', CONFIG.restful_plugin_port) - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'set', - 'mgr/restful/server_port', str(CONFIG.restful_plugin_port)], - CONFIG.ceph_cli_timeout_sec) - - def restful_plugin_has_admin_key(self): - try: - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'get', - 'mgr/restful/keys/admin'], - CONFIG.ceph_cli_timeout_sec) - return True - except CommandFailed: - pass - return False - - def restful_plugin_create_admin_key(self): - if self.restful_plugin_has_admin_key(): - return - LOG.info('Create restful plugin admin key') - self.run_with_timeout( - ['/usr/bin/ceph', 'restful', - 'create-key', 'admin'], - CONFIG.ceph_cli_timeout_sec) - - def restful_plugin_has_certificate(self): - try: - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'get', - 'config/mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity)], - CONFIG.ceph_cli_timeout_sec) - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'get', - 'mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity)], - CONFIG.ceph_cli_timeout_sec) - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'get', - 'config/mgr/restful/{}/key'.format(CONFIG.ceph_mgr_identity)], - CONFIG.ceph_cli_timeout_sec) - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'get', - '/mgr/restful/{}/key'.format(CONFIG.ceph_mgr_identity)], - CONFIG.ceph_cli_timeout_sec) - return True - except CommandFailed: - pass - return False - - def restful_plugin_create_certificate(self): - if self.restful_plugin_has_certificate(): - return - LOG.info('Create restful plugin self signed certificate') - path = tempfile.mkdtemp() - try: - try: - with tempfile.NamedTemporaryFile() as restful_cnf: - restful_cnf.write(( - '[req]\n' - 'req_extensions = v3_ca\n' - 'distinguished_name = req_distinguished_name\n' - '[v3_ca]\n' - 'subjectAltName=DNS:{}\n' - 'basicConstraints = CA:true\n' - '[ req_distinguished_name ]\n' - '0.organizationName = IT\n' - 'commonName = ceph-restful\n').format( - CONFIG.ceph_mgr_identity)) - restful_cnf.flush() - subprocess.check_call([ - '/usr/bin/openssl', 'req', '-new', '-nodes', '-x509', - '-subj', '/O=IT/CN=' + CONFIG.ceph_mgr_identity, - '-days', '3650', - '-config', restful_cnf.name, - '-out', os.path.join(path, 'crt'), - '-keyout', os.path.join(path, 'key'), - '-extensions', 'v3_ca']) - except subprocess.CalledProcessError as err: - raise CommandFailed( - command=' '.join(err.cmd), - reason='failed to generate self-signed certificate: {}'.format(str(err)), - out=err.output) - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'set', - 'config/mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity), - '-i', os.path.join(path, 'crt')], - CONFIG.ceph_cli_timeout_sec) - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'set', - 'mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity), - '-i', os.path.join(path, 'crt')], - CONFIG.ceph_cli_timeout_sec) - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'set', - 'config/mgr/restful/{}/key'.format(CONFIG.ceph_mgr_identity), - '-i', os.path.join(path, 'key')], - CONFIG.ceph_cli_timeout_sec) - self.run_with_timeout( - ['/usr/bin/ceph', 'config-key', 'set', - 'mgr/restful/{}/key'.format(CONFIG.ceph_mgr_identity), - '-i', os.path.join(path, 'key')], - CONFIG.ceph_cli_timeout_sec) - finally: - shutil.rmtree(path) - - def restful_plugin_is_enabled(self): - command = ['/usr/bin/ceph', 'mgr', 'module', 'ls', - '--format', 'json'] - with open(os.devnull, 'wb') as null: - out = self.run_with_timeout( - command, CONFIG.ceph_cli_timeout_sec, stderr=null) - try: - if 'restful' in json.loads(out)['enabled_modules']: - return True - except ValueError as err: - raise CommandFailed( - command=' '.join(command), - reason='unable to decode json: {}'.format(err), out=out) - except KeyError as err: - raise CommandFailed( - command=' '.join(command), - reason='missing expected key: {}'.format(err), out=out) - return False - - def restful_plugin_enable(self): - if not self.restful_plugin_is_enabled(): - LOG.info('Enable restful plugin') - self.run_with_timeout( - ['/usr/bin/ceph', 'mgr', - 'module', 'enable', 'restful'], - CONFIG.ceph_cli_timeout_sec) - time.sleep(CONFIG.restful_plugin_grace_period_sec) - - def restful_plugin_get_url(self): - command = ['/usr/bin/ceph', 'mgr', 'services', - '--format', 'json'] - with open(os.devnull, 'wb') as null: - out = self.run_with_timeout( - command, CONFIG.ceph_cli_timeout_sec, stderr=null) - try: - self.restful_plugin_url = json.loads(out)['restful'] - except ValueError as err: - raise CephRestfulPluginFailed( - reason='unable to decode json: {} output={}'.format(err, out)) - except KeyError as err: - raise CephRestfulPluginFailed( - reason='missing expected key: {} in ouput={}'.format(err, out)) - self.request_update_plugin_url(self.restful_plugin_url) - - def restful_plugin_get_certificate(self): - command = ['/usr/bin/ceph', 'config-key', 'get', - 'config/mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity)] - with open(os.devnull, 'wb') as null: - certificate = self.run_with_timeout( - command, CONFIG.ceph_cli_timeout_sec, stderr=null) - with open(CONFIG.restful_plugin_cert_path, 'wb') as cert_file: - cert_file.write(certificate) - self.certificate = CONFIG.restful_plugin_cert_path - self.request_update_certificate( - self.certificate) - - def restful_plugin_ping(self): - if not self.restful_plugin_url: - raise RestApiPingFailed(reason='missing service url') - if not self.certificate: - raise RestApiPingFailed(reason='missing certificate') - LOG.debug('Ping restful plugin: url=%d', self.restful_plugin_url) - try: - response = requests.request( - 'GET', self.restful_plugin_url, verify=False, - timeout=CONFIG.rest_api_timeout_sec) - if not response.ok: - raise RestApiPingFailed( - reason='response not ok ({})'.format(response)) - LOG.debug('Ping restful plugin OK') - except (requests.ConnectionError, - requests.Timeout, - requests.HTTPError) as err: - raise RestApiPingFailed(reason=str(err)) - - @staticmethod - def _make_client_socket(): - sock = socket.socket( - socket.AF_UNIX, socket.SOCK_SEQPACKET) - sock.settimeout(2 * CONFIG.rest_api_timeout_sec) - sock.connect(CONFIG.service_socket) - return sock - - @staticmethod - def request_status(): - try: - with contextlib.closing( - ServiceMonitor._make_client_socket()) as sock: - sock.send('status') - status = sock.recv(CONFIG.service_socket_bufsize) - LOG.debug('Status %s', status) - return status.startswith('OK') - except socket.error as err: - LOG.error('Status error: reason=%s', err) - return False - - @staticmethod - def request_stop(): - try: - with contextlib.closing( - ServiceMonitor._make_client_socket()) as sock: - sock.send('stop') - response = sock.recv(CONFIG.service_socket_bufsize) - LOG.debug('Stop response: %s', response) - return True - except socket.error as err: - LOG.error('Stop error: reason=%s', err) - return False - - @staticmethod - def request_update_ceph_mgr_failures(count): - try: - with contextlib.closing( - ServiceMonitor._make_client_socket()) as sock: - sock.send('ceph-mgr-failures {}'.format(count)) - sock.recv(CONFIG.service_socket_bufsize) - return True - except socket.error as err: - LOG.error('Stop error: reason=%s', err) - return False - - @staticmethod - def request_update_ping_failures(count): - try: - with contextlib.closing( - ServiceMonitor._make_client_socket()) as sock: - sock.send('ping-failures {}'.format(count)) - sock.recv(CONFIG.service_socket_bufsize) - return True - except socket.error as err: - LOG.error('Stop error: reason=%s', err) - return False - - @staticmethod - def request_update_plugin_url(url): - try: - with contextlib.closing( - ServiceMonitor._make_client_socket()) as sock: - sock.send('restful-url {}'.format(url)) - sock.recv(CONFIG.service_socket_bufsize) - return True - except socket.error as err: - LOG.error('Stop error: reason=%s', err) - return False - - @staticmethod - def request_update_certificate(path): - try: - with contextlib.closing( - ServiceMonitor._make_client_socket()) as sock: - sock.send('certificate {}'.format(path)) - sock.recv(CONFIG.service_socket_bufsize) - return True - except socket.error as err: - LOG.error('Stop error: reason=%s', err) - return False - - -class InitWrapper(object): - - """Handle System V init script actions: start, stop, restart, etc. """ - - def __init__(self): - - """Dispatch command line action to the corresponding function. - - Candidate action functions are all class methods except ones - that start with an underscore. - """ - - parser = argparse.ArgumentParser() - actions = [m[0] - for m in inspect.getmembers(self) - if (inspect.ismethod(m[1]) - and not m[0].startswith('_'))] - parser.add_argument( - 'action', - choices=actions) - self.args = parser.parse_args() - getattr(self, self.args.action)() - - def start(self): - - """Start ServiceMonitor as a daemon unless one is already running. - - Use a pipe to report monitor status back to this process. - """ - - pipe = os.pipe() - child = os.fork() - if child == 0: - os.close(pipe[0]) - with daemon.DaemonContext(files_preserve=[pipe[1]]): - # prevent duplication of messages in log - global LOG - LOG = setup_logging(cleanup_handlers=True) - try: - monitor = ServiceMonitor() - status = 'OK' - except ServiceAlreadyStarted: - os.write(pipe[1], 'OK') - os.close(pipe[1]) - return - except Exception as err: - status = str(err) - os.write(pipe[1], status) - os.close(pipe[1]) - if status == 'OK': - try: - monitor.run() - except ServiceException as err: - LOG.warning(str(err)) - except Exception as err: - LOG.exception('Service monitor error: reason=%s', err) - else: - os.close(pipe[1]) - try: - status = os.read(pipe[0], CONFIG.service_socket_bufsize) - if status == 'OK': - sys.exit(0) - else: - LOG.warning('Service monitor failed to start: ' - 'status=%s', status) - except IOError as err: - LOG.warning('Failed to read monitor status: reason=%s', err) - os.close(pipe[0]) - os.waitpid(child, 0) - sys.exit(1) - - def stop(self): - - """Tell ServiceMonitor daemon to stop running. - - In case request fails stop ServiceMonitor and ceph_mgr proecsses - using SIGTERM followed by SIGKILL. - """ - - result = ServiceMonitor.request_stop() - if not result: - ceph_mgr = os.path.basename(CONFIG.ceph_mgr_service) - procs = [] - for proc in psutil.process_iter(): - name = proc.name() - if name == CONFIG.service_name: - procs.append(proc) - if name == ceph_mgr: - procs.append(proc) - psutil_terminate_kill(procs, CONFIG.ceph_mgr_kill_delay_sec) - - def restart(self): - self.stop() - self.start() - - def force_reload(self): - self.stop() - self.start() - - def reload(self): - self.stop() - self.start() - - def status(self): - - """Report status from ServiceMonitor. - - We don't just try to access REST API here because ServiceMonitor may - be in the process of starting/configuring ceph-mgr and restful - plugin in which case we report OK to avoid being restarted by SM. - """ - - status = ServiceMonitor.request_status() - sys.exit(0 if status is True else 1) - - -if __name__ == '__main__': - InitWrapper() diff --git a/meta-stx-virt/recipes-extended/ceph/files/mgr-restful-plugin.service b/meta-stx-virt/recipes-extended/ceph/files/mgr-restful-plugin.service deleted file mode 100644 index b3e61f0..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/mgr-restful-plugin.service +++ /dev/null @@ -1,15 +0,0 @@ -[Unit] -Description=Ceph MGR RESTful API Plugin -After=network-online.target sw-patch.service - -[Service] -Type=forking -Restart=no -KillMode=process -RemainAfterExit=yes -ExecStart=/etc/rc.d/init.d/mgr-restful-plugin start -ExecStop=/etc/rc.d/init.d/mgr-restul-plugin stop -ExecReload=/etc/rc.d/init.d/mgr-restful-plugin reload - -[Install] -WantedBy=multi-user.target diff --git a/meta-stx-virt/recipes-extended/ceph/files/starlingx-docker-override.conf b/meta-stx-virt/recipes-extended/ceph/files/starlingx-docker-override.conf deleted file mode 100644 index 5ffd859..0000000 --- a/meta-stx-virt/recipes-extended/ceph/files/starlingx-docker-override.conf +++ /dev/null @@ -1,3 +0,0 @@ -[Service] -ExecStopPost=/usr/sbin/ceph-preshutdown.sh -