From 6a96c4fe72c3e51dea601d5d8a762efb34c659dd Mon Sep 17 00:00:00 2001 From: Litao Gao <litao.gao@windriver.com> Date: Tue, 30 Jul 2019 09:25:42 -0400 Subject: [PATCH] ceph: add bbappend to port StarlingX ceph packages ceph recipe is in meta-virtualization, however, StarlingX has made many modifications, so needs extra effort to port these using bbappend. Signed-off-by: Litao Gao <litao.gao@windriver.com> --- .../stx-integ-ceph/ceph_13.2.2.bbappend | 85 ++ .../stx-integ-ceph/files/ceph-init-wrapper.sh | 282 +++++ .../files/ceph-manage-journal.py | 334 ++++++ .../stx-integ-ceph/files/ceph-preshutdown.sh | 30 + .../stx-integ-ceph/files/ceph-radosgw.service | 18 + recipes-core/stx-integ-ceph/files/ceph.conf | 58 + .../stx-integ-ceph/files/ceph.conf.pmon | 26 + .../stx-integ-ceph/files/ceph.service | 16 + recipes-core/stx-integ-ceph/files/ceph.sh | 67 ++ .../files/mgr-restful-plugin.py | 1056 +++++++++++++++++ .../files/mgr-restful-plugin.service | 15 + .../files/starlingx-docker-override.conf | 3 + ...s-for-orderly-shutdown-on-controller.patch | 59 + 13 files changed, 2049 insertions(+) create mode 100644 recipes-core/stx-integ-ceph/ceph_13.2.2.bbappend create mode 100755 recipes-core/stx-integ-ceph/files/ceph-init-wrapper.sh create mode 100644 recipes-core/stx-integ-ceph/files/ceph-manage-journal.py create mode 100644 recipes-core/stx-integ-ceph/files/ceph-preshutdown.sh create mode 100644 recipes-core/stx-integ-ceph/files/ceph-radosgw.service create mode 100644 recipes-core/stx-integ-ceph/files/ceph.conf create mode 100644 recipes-core/stx-integ-ceph/files/ceph.conf.pmon create mode 100644 recipes-core/stx-integ-ceph/files/ceph.service create mode 100644 recipes-core/stx-integ-ceph/files/ceph.sh create mode 100644 recipes-core/stx-integ-ceph/files/mgr-restful-plugin.py create mode 100644 recipes-core/stx-integ-ceph/files/mgr-restful-plugin.service create mode 100644 recipes-core/stx-integ-ceph/files/starlingx-docker-override.conf create mode 100644 recipes-core/stx-integ-ceph/patches/0001-Add-hooks-for-orderly-shutdown-on-controller.patch diff --git a/recipes-core/stx-integ-ceph/ceph_13.2.2.bbappend b/recipes-core/stx-integ-ceph/ceph_13.2.2.bbappend new file mode 100644 index 0000000..251342f --- /dev/null +++ b/recipes-core/stx-integ-ceph/ceph_13.2.2.bbappend @@ -0,0 +1,85 @@ +FILESEXTRAPATHS_prepend := "${THISDIR}/patches:${THISDIR}/files:" +SRC_URI_append = "file://0001-Add-hooks-for-orderly-shutdown-on-controller.patch \ + file://ceph.conf \ + file://ceph-init-wrapper.sh \ + file://ceph-preshutdown.sh \ + file://ceph.service \ + file://mgr-restful-plugin.py \ + file://starlingx-docker-override.conf \ + file://ceph.conf.pmon \ + file://ceph-manage-journal.py \ + file://ceph-radosgw.service \ + file://ceph.sh \ + file://mgr-restful-plugin.service \ +" + + +do_install_append () { + install -d ${D}${sysconfdir}/ceph + install -m 0644 ${WORKDIR}/ceph.conf ${D}${sysconfdir}/ceph/ + install -m 0644 ${WORKDIR}/ceph-radosgw.service ${D}${systemd_system_unitdir}/ceph-radosgw@.service + install -m 0644 ${WORKDIR}/ceph.service ${D}${systemd_system_unitdir} + install -m 0644 ${WORKDIR}/mgr-restful-plugin.service ${D}${systemd_system_unitdir} + + install -m 0700 ${WORKDIR}/ceph-manage-journal.py ${D}${sbindir}/ceph-manage-journal + install -Dm 0750 ${WORKDIR}/mgr-restful-plugin.py ${D}${sysconfdir}/rc.d/init.d/mgr-restful-plugin + install -m 0750 ${WORKDIR}/ceph.conf.pmon ${D}${sysconfdir}/ceph/ + + install -d -m 0750 ${D}${sysconfdir}/services.d/controller + install -d -m 0750 
${D}${sysconfdir}/services.d/storage + install -d -m 0750 ${D}${sysconfdir}/services.d/worker + + install -m 0750 ${WORKDIR}/ceph.sh ${D}${sysconfdir}/services.d/controller + install -m 0750 ${WORKDIR}/ceph.sh ${D}${sysconfdir}/services.d/storage + install -m 0750 ${WORKDIR}/ceph.sh ${D}${sysconfdir}/services.d/worker + + install -Dm 0750 ${WORKDIR}/ceph-init-wrapper.sh ${D}${sysconfdir}/rc.d/init.d/ceph-init-wrapper + install -m 0700 ${WORKDIR}/ceph-preshutdown.sh ${D}${sbindir}/ceph-preshutdown.sh + + install -Dm 0644 ${WORKDIR}/starlingx-docker-override.conf ${D}${systemd_system_unitdir}/docker.service.d/starlingx-docker-override.conf + + install -m 0644 -D ${S}/src/etc-rbdmap ${D}${sysconfdir}/ceph/rbdmap + install -m 0644 -D ${S}/etc/sysconfig/ceph ${D}${sysconfdir}/sysconfig/ceph + install -m 0644 -D ${S}/src/logrotate.conf ${D}${sysconfdir}/logrotate.d/ceph + + install -m 0644 -D ${S}/COPYING ${D}${docdir}/ceph/COPYING + install -m 0644 -D ${S}/etc/sysctl/90-ceph-osd.conf ${D}${libdir}/sysctl.d/90-ceph-osd.conf + install -m 0644 -D ${S}/udev/50-rbd.rules ${D}${libdir}/udev/rules.d/50-rbd.rules + install -m 0644 -D ${S}/udev/60-ceph-by-parttypeuuid.rules ${D}${libdir}/udev/rules.d/60-ceph-by-parttypeuuid.rules + + mkdir -p ${D}${localstatedir}/ceph + mkdir -p ${D}${localstatedir}/run/ceph + mkdir -p ${D}${localstatedir}/log/ceph + mkdir -p ${D}${localstatedir}/lib/ceph/tmp + mkdir -p ${D}${localstatedir}/lib/ceph/mon + mkdir -p ${D}${localstatedir}/lib/ceph/osd + mkdir -p ${D}${localstatedir}/lib/ceph/mds + mkdir -p ${D}${localstatedir}/lib/ceph/mgr + mkdir -p ${D}${localstatedir}/lib/ceph/radosgw + mkdir -p ${D}${localstatedir}/lib/ceph/bootstrap-osd + mkdir -p ${D}${localstatedir}/lib/ceph/bootstrap-mds + mkdir -p ${D}${localstatedir}/lib/ceph/bootstrap-rgw + mkdir -p ${D}${localstatedir}/lib/ceph/bootstrap-mgr + mkdir -p ${D}${localstatedir}/lib/ceph/bootstrap-rbd + + install -m 0750 -D ${S}/src/init-radosgw ${D}${sysconfdir}/rc.d/init.d/ceph-radosgw + sed -i '/### END INIT INFO/a SYSTEMCTL_SKIP_REDIRECT=1' ${D}${sysconfdir}/rc.d/init.d/ceph-radosgw + install -m 0750 -D ${S}/src/init-rbdmap ${D}${sysconfdir}/rc.d/init.d/rbdmap + install -m 0750 -D ${B}/bin/init-ceph ${D}${sysconfdir}/rc.d/init.d/ceph + install -m 0750 -D ${B}/bin/init-ceph ${D}${sysconfdir}/init.d/ceph + install -d -m 0750 ${D}${localstatedir}/log/radosgw +} + +TARGET_CC_ARCH += "${LDFLAGS}" +RDEPENDS_${PN} += "\ + bash \ +" + +FILES_${PN} += "\ + ${localstatedir} ${libdir} ${docdir} \ + ${systemd_system_unitdir}/mgr-restful-plugin.service \ + ${systemd_system_unitdir}/ceph-radosgw@.service \ + ${systemd_system_unitdir}/ceph.service \ + ${systemd_system_unitdir}/docker.service.d \ + /run \ +" diff --git a/recipes-core/stx-integ-ceph/files/ceph-init-wrapper.sh b/recipes-core/stx-integ-ceph/files/ceph-init-wrapper.sh new file mode 100755 index 0000000..0a5cd53 --- /dev/null +++ b/recipes-core/stx-integ-ceph/files/ceph-init-wrapper.sh @@ -0,0 +1,282 @@ +#!/bin/bash +# +# Copyright (c) 2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +# This script is a helper wrapper for pmon monitoring of ceph +# processes. The "/etc/init.d/ceph" script does not know if ceph is +# running on the node. For example when the node is locked, ceph +# processes are not running. In that case we do not want pmond to +# monitor these processes. +# +# The script "/etc/services.d/<node>/ceph.sh" will create the file +# "/var/run/.ceph_started" when ceph is running and remove it when +# is not. 
+#
+# The script also extracts one or more ceph process names that are
+# reported as 'not running' or 'dead' or 'failed' by '/etc/init.d/ceph status'
+# and writes the names to a text file: /tmp/ceph_status_failure.txt for
+# pmond to access. pmond adds the text to its logs and alarms. Examples of
+# text written to the file by this script are:
+# 'osd.1'
+# 'osd.1, osd.2'
+# 'mon.storage-0'
+# 'mon.storage-0, osd.2'
+#
+# Moreover, for processes that are reported as 'hung' by '/etc/init.d/ceph status'
+# the script will try to increase their logging to 'debug' for a configurable
+# interval. With logging increased it will output a few stack traces; at the end
+# of this interval it triggers a core dump of the process and kills it.
+#
+# Return values:
+# zero     - /etc/init.d/ceph returned success or ceph is not running on the node
+# non-zero - /etc/init.d/ceph returned a failure or invalid syntax
+#
+
+source /usr/bin/tsconfig
+source /etc/platform/platform.conf
+
+CEPH_SCRIPT="/etc/init.d/ceph"
+CEPH_FILE="$VOLATILE_PATH/.ceph_started"
+CEPH_RESTARTING_FILE="$VOLATILE_PATH/.ceph_restarting"
+CEPH_GET_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_status"
+CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt"
+
+BINDIR=/usr/bin
+SBINDIR=/usr/sbin
+LIBDIR=/usr/lib64/ceph
+ETCDIR=/etc/ceph
+source $LIBDIR/ceph_common.sh
+
+LOG_PATH=/var/log/ceph
+LOG_FILE=$LOG_PATH/ceph-process-states.log
+LOG_LEVEL=NORMAL # DEBUG
+verbose=0
+
+DATA_PATH=$VOLATILE_PATH/ceph_hang # folder where we keep state information
+mkdir -p $DATA_PATH # make sure folder exists
+
+MONITORING_INTERVAL=15
+TRACE_LOOP_INTERVAL=5
+GET_STATUS_TIMEOUT=120
+CEPH_STATUS_TIMEOUT=20
+
+WAIT_FOR_CMD=1
+
+RC=0
+
+args=("$@")
+
+if [ ! -z $ARGS ]; then
+    IFS=";" read -r -a new_args <<< "$ARGS"
+    args+=("${new_args[@]}")
+fi
+
+wait_for_status ()
+{
+    timeout=$GET_STATUS_TIMEOUT # wait for status no more than $timeout seconds
+    while [ -f ${CEPH_GET_STATUS_FILE} ] && [ $timeout -gt 0 ]; do
+        sleep 1
+        let timeout-=1
+    done
+    if [ $timeout -eq 0 ]; then
+        wlog "-" "WARN" "Getting status takes more than ${GET_STATUS_TIMEOUT}s, continuing"
+        rm -f $CEPH_GET_STATUS_FILE
+    fi
+}
+
+start ()
+{
+    if [ -f ${CEPH_FILE} ]; then
+        wait_for_status
+        ${CEPH_SCRIPT} start $1
+        RC=$?
+    else
+        # Ceph is not running on this node, return success
+        exit 0
+    fi
+}
+
+stop ()
+{
+    wait_for_status
+    ${CEPH_SCRIPT} stop $1
+}
+
+restart ()
+{
+    if [ -f ${CEPH_FILE} ]; then
+        wait_for_status
+        touch $CEPH_RESTARTING_FILE
+        ${CEPH_SCRIPT} restart $1
+        rm -f $CEPH_RESTARTING_FILE
+    else
+        # Ceph is not running on this node, return success
+        exit 0
+    fi
+
+}
+
+log_and_restart_blocked_osds ()
+{
+    # Log info about the blocked osd daemons and then restart them
+    local names=$1
+    for name in $names; do
+        wlog $name "INFO" "Restarting OSD with blocked operations"
+        ${CEPH_SCRIPT} restart $name
+    done
+}
+
+log_and_kill_hung_procs ()
+{
+    # Log info about the hung processes and then kill them; later on pmon will restart them
+    local names=$1
+    for name in $names; do
+        type=`echo $name | cut -c 1-3` # e.g.
'mon', if $item is 'mon1' + id=`echo $name | cut -c 4- | sed 's/^\\.//'` + get_conf run_dir "/var/run/ceph" "run dir" + get_conf pid_file "$run_dir/$type.$id.pid" "pid file" + pid=$(cat $pid_file) + wlog $name "INFO" "Dealing with hung process (pid:$pid)" + + # monitoring interval + wlog $name "INFO" "Increasing log level" + execute_ceph_cmd ret $name "ceph daemon $name config set debug_$type 20/20" + monitoring=$MONITORING_INTERVAL + while [ $monitoring -gt 0 ]; do + if [ $(($monitoring % $TRACE_LOOP_INTERVAL)) -eq 0 ]; then + date=$(date "+%Y-%m-%d_%H-%M-%S") + log_file="$LOG_PATH/hang_trace_${name}_${pid}_${date}.log" + wlog $name "INFO" "Dumping stack trace to: $log_file" + $(pstack $pid >$log_file) & + fi + let monitoring-=1 + sleep 1 + done + wlog $name "INFO" "Trigger core dump" + kill -ABRT $pid &>/dev/null + rm -f $pid_file # process is dead, core dump is archiving, preparing for restart + # Wait for pending systemd core dumps + sleep 2 # hope systemd_coredump has started meanwhile + deadline=$(( $(date '+%s') + 300 )) + while [[ $(date '+%s') -lt "${deadline}" ]]; do + systemd_coredump_pid=$(pgrep -f "systemd-coredump.*${pid}.*ceph-${type}") + [[ -z "${systemd_coredump_pid}" ]] && break + wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}" + sleep 2 + done + kill -KILL $pid &>/dev/null + done +} + + +status () +{ + if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then + timeout $CEPH_STATUS_TIMEOUT ceph -s + if [ "$?" -ne 0 ]; then + # Ceph cluster is not accessible. Don't panic, controller swact + # may be in progress. + wlog "-" INFO "Ceph is down, ignoring OSD status." + exit 0 + fi + fi + + if [ -f ${CEPH_RESTARTING_FILE} ]; then + # Ceph is restarting, we don't report state changes on the first pass + rm -f ${CEPH_RESTARTING_FILE} + exit 0 + fi + if [ -f ${CEPH_FILE} ]; then + # Make sure the script does not 'exit' between here and the 'rm -f' below + # or the checkpoint file will be left behind + touch -f ${CEPH_GET_STATUS_FILE} + result=`${CEPH_SCRIPT} status $1` + RC=$? 
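+    # Classify daemons from the status output: failed/dead, hung, and blocked-ops daemons are each handled separately below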
+ if [ "$RC" -ne 0 ]; then + erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` + hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` + blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'` + invalid=0 + host=`hostname` + if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then + # On 2 node configuration we have a floating monitor + host="controller" + fi + for i in $(echo $erred_procs $hung_procs); do + if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host ]]; then + continue + else + invalid=1 + fi + done + + log_and_restart_blocked_osds $blocked_ops_procs + log_and_kill_hung_procs $hung_procs + + hung_procs_text="" + for i in $(echo $hung_procs); do + hung_procs_text+="$i(process hung) " + done + + rm -f $CEPH_STATUS_FAILURE_TEXT_FILE + if [ $invalid -eq 0 ]; then + text="" + for i in $erred_procs; do + text+="$i, " + done + for i in $hung_procs; do + text+="$i (process hang), " + done + echo "$text" | tr -d '\n' > $CEPH_STATUS_FAILURE_TEXT_FILE + else + echo "$host: '${CEPH_SCRIPT} status $1' result contains invalid process names: $erred_procs" + echo "Undetermined osd or monitor id" > $CEPH_STATUS_FAILURE_TEXT_FILE + fi + fi + + rm -f ${CEPH_GET_STATUS_FILE} + + if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then + # SM needs exit code != 0 from 'status mon' argument of the init script on + # standby controller otherwise it thinks that the monitor is running and + # tries to stop it. + # '/etc/init.d/ceph status mon' checks the status of monitors configured in + # /etc/ceph/ceph.conf and if it should be running on current host. + # If it should not be running it just exits with code 0. This is what + # happens on the standby controller. + # When floating monitor is running on active controller /var/lib/ceph/mon of + # standby is not mounted (Ceph monitor partition is DRBD synced). + test -e "/var/lib/ceph/mon/ceph-controller" + if [ "$?" -ne 0 ]; then + exit 3 + fi + fi + else + # Ceph is not running on this node, return success + exit 0 + fi +} + + +case "${args[0]}" in + start) + start ${args[1]} + ;; + stop) + stop ${args[1]} + ;; + restart) + restart ${args[1]} + ;; + status) + status ${args[1]} + ;; + *) + echo "Usage: $0 {start|stop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]" + exit 1 + ;; +esac + +exit $RC diff --git a/recipes-core/stx-integ-ceph/files/ceph-manage-journal.py b/recipes-core/stx-integ-ceph/files/ceph-manage-journal.py new file mode 100644 index 0000000..f91cbc1 --- /dev/null +++ b/recipes-core/stx-integ-ceph/files/ceph-manage-journal.py @@ -0,0 +1,334 @@ +#!/usr/bin/python +# +# Copyright (c) 2019 Wind River Systems, Inc. 
+# +# SPDX-License-Identifier: Apache-2.0 +# + +import ast +import os +import os.path +import re +import subprocess +import sys + +DEVICE_NAME_NVME = "nvme" + +######### +# Utils # +######### + + +def command(arguments, **kwargs): + """Execute e command and capture stdout, stderr & return code""" + process = subprocess.Popen( + arguments, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + **kwargs) + out, err = process.communicate() + return out, err, process.returncode + + +def get_input(arg, valid_keys): + """Convert the input to a dict and perform basic validation""" + json_string = arg.replace("\\n", "\n") + try: + input_dict = ast.literal_eval(json_string) + if not all(k in input_dict for k in valid_keys): + return None + except Exception: + return None + + return input_dict + + +def get_partition_uuid(dev): + output, _, _ = command(['blkid', dev]) + try: + return re.search('PARTUUID=\"(.+?)\"', output).group(1) + except AttributeError: + return None + + +def device_path_to_device_node(device_path): + try: + output, _, _ = command(["udevadm", "settle", "-E", device_path]) + out, err, retcode = command(["readlink", "-f", device_path]) + out = out.rstrip() + except Exception as e: + return None + + return out + + +########################################### +# Manage Journal Disk Partitioning Scheme # +########################################### + +DISK_BY_PARTUUID = "/dev/disk/by-partuuid/" +JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106' # Type of a journal partition + + +def is_partitioning_correct(disk_path, partition_sizes): + """Validate the existence and size of journal partitions""" + + # Obtain the device node from the device path. + disk_node = device_path_to_device_node(disk_path) + + # Check that partition table format is GPT + output, _, _ = command(["udevadm", "settle", "-E", disk_node]) + output, _, _ = command(["parted", "-s", disk_node, "print"]) + if not re.search('Partition Table: gpt', output): + print("Format of disk node %s is not GPT, zapping disk" % disk_node) + return False + + # Check each partition size + partition_index = 1 + for size in partition_sizes: + # Check that each partition size matches the one in input + if DEVICE_NAME_NVME in disk_node: + partition_node = '{}p{}'.format(disk_node, str(partition_index)) + else: + partition_node = '{}{}'.format(disk_node, str(partition_index)) + + output, _, _ = command(["udevadm", "settle", "-E", partition_node]) + cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"] + output, _, _ = command(cmd) + + regex = ("^Disk " + str(partition_node) + ":\\s*" + + str(size) + "[\\.0]*MiB") + if not re.search(regex, output, re.MULTILINE): + print("Journal partition %(node)s size is not %(size)s, " + "zapping disk" % {"node": partition_node, "size": size}) + return False + + partition_index += 1 + + output, _, _ = command(["udevadm", "settle", "-t", "10"]) + return True + + +def create_partitions(disk_path, partition_sizes): + """Recreate partitions""" + + # Obtain the device node from the device path. + disk_node = device_path_to_device_node(disk_path) + + # Issue: After creating a new partition table on a device, Udev does not + # always remove old symlinks (i.e. to previous partitions on that device). + # Also, even if links are erased before zapping the disk, some of them will + # be recreated even though there is no partition to back them! 
+ # Therefore, we have to remove the links AFTER we erase the partition table + # Issue: DISK_BY_PARTUUID directory is not present at all if there are no + # GPT partitions on the storage node so nothing to remove in this case + links = [] + if os.path.isdir(DISK_BY_PARTUUID): + links = [os.path.join(DISK_BY_PARTUUID, l) for l in os.listdir(DISK_BY_PARTUUID) + if os.path.islink(os.path.join(DISK_BY_PARTUUID, l))] + + # Erase all partitions on current node by creating a new GPT table + _, err, ret = command(["parted", "-s", disk_node, "mktable", "gpt"]) + if ret: + print("Error erasing partition table of %(node)s\n" + "Return code: %(ret)s reason: %(reason)s" % + {"node": disk_node, "ret": ret, "reason": err}) + exit(1) + + # Erase old symlinks + for l in links: + if disk_node in os.path.realpath(l): + os.remove(l) + + # Create partitions in order + used_space_mib = 1 # leave 1 MB at the beginning of the disk + num = 1 + for size in partition_sizes: + cmd = ['parted', '-s', disk_node, 'unit', 'mib', + 'mkpart', 'primary', + str(used_space_mib), str(used_space_mib + size)] + _, err, ret = command(cmd) + parms = {"disk_node": disk_node, + "start": used_space_mib, + "end": used_space_mib + size, + "reason": err} + print("Created partition from start=%(start)s MiB to end=%(end)s MiB" + " on %(disk_node)s" % parms) + if ret: + print("Failed to create partition with " + "start=%(start)s, end=%(end)s " + "on %(disk_node)s reason: %(reason)s" % parms) + exit(1) + # Set partition type to ceph journal + # noncritical operation, it makes 'ceph-disk list' output correct info + cmd = ['sgdisk', + '--change-name={num}:ceph journal'.format(num=num), + '--typecode={num}:{uuid}'.format( + num=num, + uuid=JOURNAL_UUID, + ), + disk_node] + _, err, ret = command(cmd) + if ret: + print("WARNINIG: Failed to set partition name and typecode") + used_space_mib += size + num += 1 + + +########################### +# Manage Journal Location # +########################### + +OSD_PATH = "/var/lib/ceph/osd/" + + +def mount_data_partition(data_path, osdid): + """Mount an OSD data partition and return the mounted path""" + + # Obtain the device node from the device path. + data_node = device_path_to_device_node(data_path) + + mount_path = OSD_PATH + "ceph-" + str(osdid) + output, _, _ = command(['mount']) + regex = "^" + data_node + ".*" + mount_path + if not re.search(regex, output, re.MULTILINE): + cmd = ['mount', '-t', 'xfs', data_node, mount_path] + _, _, ret = command(cmd) + params = {"node": data_node, "path": mount_path} + if ret: + print("Failed to mount %(node)s to %(path), aborting" % params) + exit(1) + else: + print("Mounted %(node)s to %(path)s" % params) + return mount_path + + +def is_location_correct(path, journal_path, osdid): + """Check if location points to the correct device""" + + # Obtain the device node from the device path. + journal_node = device_path_to_device_node(journal_path) + + cur_node = os.path.realpath(path + "/journal") + if cur_node == journal_node: + return True + else: + return False + + +def fix_location(mount_point, journal_path, osdid): + """Move the journal to the new partition""" + + # Obtain the device node from the device path. 
+ journal_node = device_path_to_device_node(journal_path) + + # Fix symlink + path = mount_point + "/journal" # 'journal' symlink path used by ceph-osd + journal_uuid = get_partition_uuid(journal_node) + new_target = DISK_BY_PARTUUID + journal_uuid + params = {"path": path, "target": new_target} + try: + if os.path.lexists(path): + os.unlink(path) # delete the old symlink + os.symlink(new_target, path) + print("Symlink created: %(path)s -> %(target)s" % params) + except: + print("Failed to create symlink: %(path)s -> %(target)s" % params) + exit(1) + # Fix journal_uuid + path = mount_point + "/journal_uuid" + try: + with open(path, 'w') as f: + f.write(journal_uuid) + except Exception as ex: + # The operation is noncritical, it only makes 'ceph-disk list' + # display complete output. We log and continue. + params = {"path": path, "uuid": journal_uuid} + print("WARNING: Failed to set uuid of %(path)s to %(uuid)s" % params) + + # Clean the journal partition + # even if erasing the partition table, if another journal was present here + # it's going to be reused. Journals are always bigger than 100MB. + command(['dd', 'if=/dev/zero', 'of=%s' % journal_node, + 'bs=1M', 'count=100']) + + # Format the journal + cmd = ['/usr/bin/ceph-osd', '-i', str(osdid), + '--pid-file', '/var/run/ceph/osd.%s.pid' % osdid, + '-c', '/etc/ceph/ceph.conf', + '--cluster', 'ceph', + '--mkjournal'] + out, err, ret = command(cmd) + params = {"journal_node": journal_node, + "osdid": osdid, + "ret": ret, + "reason": err} + if not ret: + print("Prepared new journal partition: %(journal_node)s " + "for osd id: %(osdid)s" % params) + else: + print("Error initializing journal node: " + "%(journal_node)s for osd id: %(osdid)s " + "ceph-osd return code: %(ret)s reason: %(reason)s" % params) + + +######## +# Main # +######## + +def main(argv): + # parse and validate arguments + err = False + partitions = None + location = None + if len(argv) != 2: + err = True + elif argv[0] == "partitions": + valid_keys = ['disk_path', 'journals'] + partitions = get_input(argv[1], valid_keys) + if not partitions: + err = True + elif not isinstance(partitions['journals'], list): + err = True + elif argv[0] == "location": + valid_keys = ['data_path', 'journal_path', 'osdid'] + location = get_input(argv[1], valid_keys) + if not location: + err = True + elif not isinstance(location['osdid'], int): + err = True + else: + err = True + if err: + print("Command intended for internal use only") + exit(-1) + + if partitions: + # Recreate partitions only if the existing ones don't match input + if not is_partitioning_correct(partitions['disk_path'], + partitions['journals']): + create_partitions(partitions['disk_path'], partitions['journals']) + else: + print("Partition table for %s is correct, " + "no need to repartition" % + device_path_to_device_node(partitions['disk_path'])) + elif location: + # we need to have the data partition mounted & we can let it mounted + mount_point = mount_data_partition(location['data_path'], + location['osdid']) + # Update journal location only if link point to another partition + if not is_location_correct(mount_point, + location['journal_path'], + location['osdid']): + print("Fixing journal location for " + "OSD id: %(id)s" % {"node": location['data_path'], + "id": location['osdid']}) + fix_location(mount_point, + location['journal_path'], + location['osdid']) + else: + print("Journal location for %s is correct," + "no need to change it" % location['data_path']) + + +main(sys.argv[1:]) diff --git 
a/recipes-core/stx-integ-ceph/files/ceph-preshutdown.sh b/recipes-core/stx-integ-ceph/files/ceph-preshutdown.sh new file mode 100644 index 0000000..5f59bd1 --- /dev/null +++ b/recipes-core/stx-integ-ceph/files/ceph-preshutdown.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# Copyright (c) 2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + +script=$(basename $0) + +# Set nullglob so wildcards will return empty string if no match +shopt -s nullglob + +for dev in /dev/rbd[0-9]*; do + for mnt in $(mount | awk -v dev=$dev '($1 == dev) {print $3}'); do + logger -t ${script} "Unmounting $mnt" + /usr/bin/umount $mnt + done + logger -t ${script} "Unmounted $dev" +done + +for dev in /dev/rbd[0-9]*; do + /usr/bin/rbd unmap -o force $dev + logger -t ${script} "Unmapped $dev" +done + +lsmod | grep -q '^rbd\>' && /usr/sbin/modprobe -r rbd +lsmod | grep -q '^libceph\>' && /usr/sbin/modprobe -r libceph + +exit 0 + diff --git a/recipes-core/stx-integ-ceph/files/ceph-radosgw.service b/recipes-core/stx-integ-ceph/files/ceph-radosgw.service new file mode 100644 index 0000000..391ecf6 --- /dev/null +++ b/recipes-core/stx-integ-ceph/files/ceph-radosgw.service @@ -0,0 +1,18 @@ +[Unit] +Description=radosgw RESTful rados gateway +After=network.target +#After=remote-fs.target nss-lookup.target network-online.target time-sync.target +#Wants=network-online.target + +[Service] +Type=forking +Restart=no +KillMode=process +RemainAfterExit=yes +ExecStart=/etc/rc.d/init.d/ceph-radosgw start +ExecStop=/etc/rc.d/init.d/ceph-radosgw stop +ExecReload=/etc/rc.d/init.d/ceph-radosgw reload + +[Install] +WantedBy=multi-user.target + diff --git a/recipes-core/stx-integ-ceph/files/ceph.conf b/recipes-core/stx-integ-ceph/files/ceph.conf new file mode 100644 index 0000000..29e0711 --- /dev/null +++ b/recipes-core/stx-integ-ceph/files/ceph.conf @@ -0,0 +1,58 @@ +[global] + # Unique ID for the cluster. + fsid = %CLUSTER_UUID% + # Public network where the monitor is connected to, i.e, 128.224.0.0/16 + #public network = 127.0.0.1/24 + # For version 0.55 and beyond, you must explicitly enable + # or disable authentication with "auth" entries in [global]. + auth_cluster_required = cephx + auth_service_required = cephx + auth_client_required = cephx + osd_journal_size = 1024 + + # Uncomment the following line if you are mounting with ext4 + # filestore xattr use omap = true + + # Number of replicas of objects. Write an object 2 times. + # Cluster cannot reach an active + clean state until there's enough OSDs + # to handle the number of copies of an object. In this case, it requires + # at least 2 OSDs + osd_pool_default_size = 2 + + # Allow writing one copy in a degraded state. + osd_pool_default_min_size = 1 + + # Ensure you have a realistic number of placement groups. We recommend + # approximately 100 per OSD. E.g., total number of OSDs multiplied by 100 + # divided by the number of replicas (i.e., osd pool default size). So for + # 2 OSDs and osd pool default size = 2, we'd recommend approximately + # (100 * 2) / 2 = 100. + osd_pool_default_pg_num = 64 + osd_pool_default_pgp_num = 64 + osd_crush_chooseleaf_type = 1 + setuser match path = /var/lib/ceph/$type/$cluster-$id + + # Override Jewel default of 2 reporters. StarlingX has replication factor 2 + mon_osd_min_down_reporters = 1 + + # Use Hammer's report interval default value + osd_mon_report_interval_max = 120 + + # Configure max PGs per OSD to cover worst-case scenario of all possible + # StarlingX deployments i.e. AIO-SX with one OSD. 
Otherwise using + # the default value provided by Ceph Mimic leads to "too many PGs per OSD" + # health warning as the pools needed by stx-openstack are being created. + mon_max_pg_per_osd = 2048 + osd_max_pg_per_osd_hard_ratio = 1.2 + +[osd] + osd_mkfs_type = xfs + osd_mkfs_options_xfs = "-f" + osd_mount_options_xfs = "rw,noatime,inode64,logbufs=8,logbsize=256k" + +[mon] + mon warn on legacy crush tunables = false + # Quiet new warnings on move to Hammer + mon pg warn max per osd = 2048 + mon pg warn max object skew = 0 + mgr initial modules = restful diff --git a/recipes-core/stx-integ-ceph/files/ceph.conf.pmon b/recipes-core/stx-integ-ceph/files/ceph.conf.pmon new file mode 100644 index 0000000..00418b2 --- /dev/null +++ b/recipes-core/stx-integ-ceph/files/ceph.conf.pmon @@ -0,0 +1,26 @@ +[process] +process = ceph +script = /etc/init.d/ceph-init-wrapper + +style = lsb +severity = major ; minor, major, critical +restarts = 3 ; restart retries before error assertion +interval = 30 ; number of seconds to wait between restarts + +mode = status ; Monitoring mode: passive (default) or active + ; passive: process death monitoring (default: always) + ; active : heartbeat monitoring, i.e. request / response messaging + ; status : determine process health with executing "status" command + ; "start" is used to start the process(es) again + ; ignore : do not monitor or stop monitoring + +; Status and Active Monitoring Options + +period = 30 ; monitor period in seconds +timeout = 120 ; for active mode, messaging timeout period in seconds, must be shorter than period + ; for status mode, max amount of time for a command to execute + +; Status Monitoring Options +start_arg = start ; start argument for the script +status_arg = status ; status argument for the script +status_failure_text = /tmp/ceph_status_failure.txt ; text to be added to alarms or logs, this is optional diff --git a/recipes-core/stx-integ-ceph/files/ceph.service b/recipes-core/stx-integ-ceph/files/ceph.service new file mode 100644 index 0000000..d3c2acc --- /dev/null +++ b/recipes-core/stx-integ-ceph/files/ceph.service @@ -0,0 +1,16 @@ +[Unit] +Description=StarlingX Ceph Startup +After=network.target + +[Service] +Type=forking +Restart=no +KillMode=process +RemainAfterExit=yes +ExecStart=/etc/rc.d/init.d/ceph start +ExecStop=/etc/rc.d/init.d/ceph stop +PIDFile=/var/run/ceph/ceph.pid + +[Install] +WantedBy=multi-user.target + diff --git a/recipes-core/stx-integ-ceph/files/ceph.sh b/recipes-core/stx-integ-ceph/files/ceph.sh new file mode 100644 index 0000000..e7e6ecd --- /dev/null +++ b/recipes-core/stx-integ-ceph/files/ceph.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +INITDIR=/etc/init.d +LOGFILE=/var/log/ceph/ceph-init.log +CEPH_FILE=/var/run/.ceph_started + +# Get our nodetype +. /etc/platform/platform.conf + +# Exit immediately if ceph not configured (i.e. no mon in the config file) +if ! grep -q "mon\." /etc/ceph/ceph.conf +then + exit 0 +fi + +logecho () +{ + echo $1 + date >> ${LOGFILE} + echo $1 >> ${LOGFILE} +} + +start () +{ + logecho "Starting ceph services..." + ${INITDIR}/ceph start >> ${LOGFILE} 2>&1 + RC=$? + + if [ ! -f ${CEPH_FILE} ]; then + touch ${CEPH_FILE} + fi +} + +stop () +{ + if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" == "simplex" ]]; then + logecho "Ceph services will continue to run on node" + exit 0 + fi + + logecho "Stopping ceph services..." + + if [ -f ${CEPH_FILE} ]; then + rm -f ${CEPH_FILE} + fi + + ${INITDIR}/ceph stop >> ${LOGFILE} 2>&1 + RC=$? 
+} + +RC=0 + +case "$1" in + start) + start + ;; + stop) + stop + ;; + *) + echo "Usage: $0 {start|stop}" + exit 1 + ;; +esac + +logecho "RC was: $RC" +exit $RC diff --git a/recipes-core/stx-integ-ceph/files/mgr-restful-plugin.py b/recipes-core/stx-integ-ceph/files/mgr-restful-plugin.py new file mode 100644 index 0000000..c1cae60 --- /dev/null +++ b/recipes-core/stx-integ-ceph/files/mgr-restful-plugin.py @@ -0,0 +1,1056 @@ +#!/usr/bin/python +# +# Copyright (c) 2019 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# + + +### BEGIN INIT INFO +# Provides: ceph/mgr RESTful API plugin +# Required-Start: $ceph +# Required-Stop: $ceph +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Ceph MGR RESTful API plugin +# Description: Ceph MGR RESTful API plugin +### END INIT INFO + +import argparse +import contextlib +import errno +import fcntl +import inspect +import json +import logging +import multiprocessing +import os +import signal +import socket +import subprocess +import sys +import time + +import daemon +import psutil +import requests + +# 'timeout' command returns exit status 124 +# if command times out (see man page) +GNU_TIMEOUT_EXPIRED_RETCODE = 124 + + +def psutil_terminate_kill(target, timeout): + + """Extend psutil functionality to stop a process. + + SIGINT is sent to each target then after a grace period SIGKILL + is sent to the ones that are still running. + """ + + if not isinstance(target, list): + target = [target] + _, target = psutil.wait_procs(target, timeout=0) + for action in [lambda p: p.terminate(), lambda p: p.kill()]: + for proc in target: + action(proc) + _, target = psutil.wait_procs( + target, timeout=timeout) + + +class Config(object): + + """ceph-mgr service wrapper configuration options. + + In the future we may want to load them from a configuration file + (for example /etc/ceph/mgr-restful-plugin.conf ) + """ + + def __init__(self): + self.log_level = logging.INFO + self.log_dir = '/var/log' + + self.ceph_mgr_service = '/usr/bin/ceph-mgr' + self.ceph_mgr_cluster = 'ceph' + self.ceph_mgr_rundir = '/var/run/ceph/mgr' + self.ceph_mgr_identity = socket.gethostname() + + self.service_name = 'mgr-restful-plugin' + self.service_socket = os.path.join( + self.ceph_mgr_rundir, '{}.socket'.format(self.service_name)) + self.service_lock = os.path.join( + self.ceph_mgr_rundir, '{}.lock'.format(self.service_name)) + self.service_pid_file = os.path.join( + '/var/run/ceph', '{}.pid'.format(self.service_name)) + + self.restful_plugin_port = 5001 + + # maximum size of a message received/sent via + # service monitor control socket + self.service_socket_bufsize = 1024 + + # maximum time to wait for ceph cli to exit + self.ceph_cli_timeout_sec = 30 + + # how much time to wait after ceph cli commands fail with timeout + # before running any other commands + self.cluster_grace_period_sec = 30 + + # after ceph-mgr is started it goes through an internal initialization + # phase before; how much time to wait before querying ceph-mgr + self.ceph_mgr_grace_period_sec = 15 + + # after sending SIGTERM to ceph-mgr how much time to wait before + # sending SIGKILL (maximum time allowed for ceph-mgr cleanup) + self.ceph_mgr_kill_delay_sec = 5 + + # if service monitor is running a recovery procedure it reports + # status OK even if ceph-mgr is currently down. 
This sets the + # maximum number of consecutive ceph-mgr failures before reporting + # status error + self.ceph_mgr_fail_count_report_error = 3 + + # maximum number of consecutive ceph-mgr failures before + # stopping mgr-restful-plugin service + self.ceph_mgr_fail_count_exit = 5 + + # maximum time allowed for ceph-mgr to respond to a REST API request + self.rest_api_timeout_sec = 15 + + # interval between consecutive REST API requests (ping's). A smaller + # value here triggers more requests to ceph-mgr restful plugin. A + # higher value makes recovery slower when services become unavailable + self.restful_plugin_ping_delay_sec = 3 + + # where to save the self-signed certificate generated by ceph-mgr + self.restful_plugin_cert_path = os.path.join( + self.ceph_mgr_rundir, 'restful.crt') + + # time to wait after enabling restful plugin + self.restful_plugin_grace_period_sec = 3 + + # after how many REST API ping failures to restart ceph-mgr + self.ping_fail_count_restart_mgr = 3 + + # after how many REST API ping failures to report status error. + # Until then service monitor reports status OK just in case + # restful plugin recovers + self.ping_fail_count_report_error = 5 + + @staticmethod + def load(): + return Config() + + +def setup_logging(name=None, cleanup_handlers=False): + if not name: + name = CONFIG.service_name + log = logging.getLogger(name) + log.setLevel(CONFIG.log_level) + if cleanup_handlers: + try: + for handler in log.handlers: + if isinstance(handler, logging.StreamHandler): + handler.flush() + if isinstance(handler, logging.FileHandler): + handler.close() + log.handlers = [] + except Exception: + pass + elif log.handlers: + return log + handler = logging.FileHandler( + os.path.join(CONFIG.log_dir, + '{}.log'.format(CONFIG.service_name))) + handler.setFormatter( + logging.Formatter('%(asctime)s %(process)s %(levelname)s %(name)s %(message)s')) + log.addHandler(handler) + return log + + +CONFIG = Config.load() +LOG = setup_logging(name='init-wrapper') + + +class ServiceException(Exception): + + """Generic mgr-restful-plugin service exception. + + Build exception string based on static (per exception class) + string plus args, keyword args passed to exception constructor. 
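+    Subclasses only need to set the class-level 'message' format string.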
+ """ + + message = "" + + def __init__(self, *args, **kwargs): + if "message" not in kwargs: + try: + message = self.message.format(*args, **kwargs) + except Exception: # noqa + message = '{}, args:{}, kwargs: {}'.format( + self.message, args, kwargs) + else: + message = kwargs["message"] + super(ServiceException, self).__init__(message) + + +class ServiceAlreadyStarted(ServiceException): + message = ('Service monitor already started') + + +class ServiceLockFailed(ServiceException): + message = ('Unable to lock service monitor: ' + 'reason={reason}') + + +class ServiceNoSocket(ServiceException): + message = ('Unable to create service monitor socket: ' + 'reason={reason}') + + +class ServiceSocketBindFailed(ServiceException): + message = ('Failed to bind service monitor socket: ' + 'path={path}, reason={reason}') + + +class ServiceNoPidFile(ServiceException): + message = ('Failed to update pid file: ' + 'path={path}, reason={reason}') + + +class CommandFailed(ServiceException): + message = ('Command failed: command={command}, ' + 'reason={reason}, out={out}') + + +class CommandTimeout(ServiceException): + message = ('Command timeout: command={command}, ' + 'timeout={timeout}') + + +class CephMgrStartFailed(ServiceException): + message = ('Failed to start ceph_mgr: ' + 'reason={reason}') + + +class CephRestfulPluginFailed(ServiceException): + message = ('Failed to start restful plugin: ' + 'reason={reason}') + + +class RestApiPingFailed(ServiceException): + message = ('REST API ping failed: ' + 'reason={reason}') + + +class ServiceMonitor(object): + + """Configure and monitor ceph-mgr and restful plugin (Ceph REST API) + + 1. process init script service requests: status, stop. Requests are + received via a control socket. Stop has priority over whatever + the monitor is doing currently. Any ceph command that may be running + is terminated/killed. Note that while ceph-mgr and restful plugin + configuration is in progress ServiceMonitor reports status OK to + avoid being restarted by SM. + + 2. configure ceph-mgr and mgr restful plugin: authentication, REST API + service port, self signed certificate. This runs as a separate + process so it can be stopped when init script requests it. + + 3. periodically check (ping) REST API responds to HTTPS requests. + Recovery actions are taken if REST API fails to respond: restart + ceph-mgr, wait for cluster to become available again. + """ + + def __init__(self): + # process running configuration & REST API ping loop + self.monitor = None + + # command socket used by init script + self.command = None + + # ceph-mgr process + self.ceph_mgr = None + + # consecutive ceph-mgr/restful-plugin start failures. Service monitor + # reports failure after CONFIG.ceph_mgr_max_failure_count + self.ceph_mgr_failure_count = 0 + + # consecutive REST API ping failures. ceph-mgr service is restarted + # after CONFIG.ping_fail_count_restart_mgr threshold is exceeded + self.ping_failure_count = 0 + + # REST API url reported by ceph-mgr after enabling restful plugin + self.restful_plugin_url = '' + + # REST API self signed certificate generated by restful plugin + self.certificate = '' + + def run(self): + self.disable_certificate_check() + with self.service_lock(), self.service_socket(), \ + self.service_pid_file(): + self.start_monitor() + self.server_loop() + + def disable_certificate_check(self): + # ceph-mgr restful plugin is configured with a self-signed + # certificate. 
Certificate host is hard-coded to "ceph-restful" + # which causes HTTPS requests to fail because they don't + # match current host name ("controller-..."). Disable HTTPS + # certificates check in urllib3 + LOG.warning('Disable urllib3 certifcates check') + requests.packages.urllib3.disable_warnings() + + def server_loop(self): + self.command.listen(2) + while True: + try: + client, _ = self.command.accept() + request = client.recv(CONFIG.service_socket_bufsize) + LOG.debug('Monitor command socket: request=%s', str(request)) + cmd = request.split(' ') + cmd, args = cmd[0], cmd[1:] + if cmd == 'status': + self.send_response(client, request, self.status()) + elif cmd == 'stop': + self.stop() + self.send_response(client, request, 'OK') + break + elif cmd == 'restful-url': + try: + self.restful_plugin_url = args[0] + self.send_response(client, request, 'OK') + except IndexError: + LOG.warning('Failed to update restful plugin url: ' + 'args=%s', str(args)) + self.send_response(client, request, 'ERR') + elif cmd == 'certificate': + try: + self.certificate = args[0] if args else '' + self.send_response(client, request, 'OK') + except IndexError: + LOG.warning('Failed to update certificate path: ' + 'args=%s', str(args)) + self.send_response(client, request, 'ERR') + elif cmd == 'ceph-mgr-failures': + try: + self.ceph_mgr_failure_count = int(args[0]) + self.send_response(client, request, 'OK') + if self.ceph_mgr_failure_count >= CONFIG.ceph_mgr_fail_count_exit: + self.stop() + break + except (IndexError, ValueError): + LOG.warning('Failed to update ceph-mgr failures: ' + 'args=%s', str(args)) + self.send_response(client, request, 'ERR') + elif cmd == 'ping-failures': + try: + self.ping_failure_count = int(args[0]) + self.send_response(client, request, 'OK') + except (IndexError, ValueError): + LOG.warning('Failed to update ping failures: ' + 'args=%s', str(args)) + self.send_response(client, request, 'ERR') + except Exception as err: + LOG.exception(err) + + @staticmethod + def send_response(client, request, response): + try: + client.send(response) + except socket.error as err: + LOG.warning('Failed to send response back. ' + 'request=%s, response=%s, reason=%s', + request, response, err) + + def status(self): + if not self.restful_plugin_url: + if self.ceph_mgr_failure_count < CONFIG.ceph_mgr_fail_count_report_error \ + and self.ping_failure_count < CONFIG.ping_fail_count_report_error: + LOG.debug('Monitor is starting services. Report status OK') + return 'OK' + LOG.debug('Too many failures: ' + 'ceph_mgr=%d < %d, ping=%d < %d. ' + 'Report status ERR', + self.ceph_mgr_failure_count, + CONFIG.ceph_mgr_fail_count_report_error, + self.ping_failure_count, + CONFIG.ping_fail_count_report_error) + return 'ERR.down' + try: + self.restful_plugin_ping() + LOG.debug('Restful plugin ping successful. Report status OK') + return 'OK' + except (CommandFailed, RestApiPingFailed): + if self.ceph_mgr_failure_count < CONFIG.ceph_mgr_fail_count_report_error \ + and self.ping_failure_count < CONFIG.ping_fail_count_report_error: + LOG.info('Restful plugin does not respond but failure ' + 'count is within acceptable limits: ' + ' ceph_mgr=%d < %d, ping=%d < %d. ' + 'Report status OK', + self.ceph_mgr_failure_count, + CONFIG.ceph_mgr_fail_count_report_error, + self.ping_failure_count, + CONFIG.ping_fail_count_report_error) + return 'OK' + LOG.debug('Restful does not respond (ping failure count %d). 
' + 'Report status ERR', self.ping_failure_count) + return 'ERR.ping_failed' + + def stop(self): + if not self.monitor: + return + LOG.info('Stop monitor with SIGTERM to process group %d', + self.monitor.pid) + try: + os.killpg(self.monitor.pid, signal.SIGTERM) + except OSError as err: + LOG.info('Stop monitor failed: reason=%s', str(err)) + return + time.sleep(CONFIG.ceph_mgr_kill_delay_sec) + LOG.info('Stop monitor with SIGKILL to process group %d', + self.monitor.pid) + try: + os.killpg(self.monitor.pid, signal.SIGKILL) + os.waitpid(self.monitor.pid, 0) + except OSError as err: + LOG.info('Stop monitor failed: reason=%s', str(err)) + return + LOG.info('Monitor stopped: pid=%d', self.monitor.pid) + + @contextlib.contextmanager + def service_lock(self): + LOG.info('Take service lock: path=%s', CONFIG.service_lock) + try: + os.makedirs(os.path.dirname(CONFIG.service_lock)) + except OSError: + pass + lock_file = open(CONFIG.service_lock, 'w') + try: + fcntl.flock(lock_file.fileno(), + fcntl.LOCK_EX | fcntl.LOCK_NB) + except (IOError, OSError) as err: + if err.errno == errno.EAGAIN: + raise ServiceAlreadyStarted() + else: + raise ServiceLockFailed(reason=str(err)) + # even if we have the lock here there might be another service manager + # running whose CONFIG.ceph_mgr_rundir was removed before starting + # this instance. Make sure there is only one service manager running + self.stop_other_service_managers() + try: + yield + finally: + os.unlink(CONFIG.service_lock) + lock_file.close() + LOG.info('Release service lock: path=%s', CONFIG.service_lock) + + def stop_other_service_managers(self): + service = os.path.join('/etc/init.d', CONFIG.service_name) + for p in psutil.process_iter(): + if p.cmdline()[:2] not in [[service], ['/usr/bin/python', service]]: + continue + if p.pid == os.getpid(): + continue + p.kill() + + @contextlib.contextmanager + def service_socket(self): + LOG.info('Create service socket') + try: + self.command = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) + except socket.error as err: + raise ServiceNoSocket(reason=str(err)) + LOG.info('Remove existing socket files') + try: + os.unlink(CONFIG.service_socket) + except OSError: + pass + LOG.info('Bind service socket: path=%s', CONFIG.service_socket) + try: + self.command.bind(CONFIG.service_socket) + except socket.error as err: + raise ServiceSocketBindFailed( + path=CONFIG.service_socket, reason=str(err)) + try: + yield + finally: + LOG.info('Close service socket and remove file: path=%s', + CONFIG.service_socket) + self.command.close() + os.unlink(CONFIG.service_socket) + + @contextlib.contextmanager + def service_pid_file(self): + LOG.info('Update service pid file: path=%s', CONFIG.service_pid_file) + try: + pid_file = open(CONFIG.service_pid_file, 'w') + pid_file.write(str(os.getpid())) + except OSError as err: + raise ServiceNoPidFile( + path=CONFIG.service_pid_file, reason=str(err)) + try: + yield + finally: + LOG.info('Remove service pid file: path=%s', + CONFIG.service_pid_file) + try: + os.unlink(CONFIG.service_pid_file) + except OSError: + pass + + def start_monitor(self): + LOG.info('Start monitor loop') + self.monitor = multiprocessing.Process(target=self.monitor_loop) + self.monitor.start() + + def stop_unmanaged_ceph_mgr(self): + LOG.info('Stop unmanaged running ceph-mgr processes') + service_name = os.path.basename(CONFIG.ceph_mgr_service) + if self.ceph_mgr: + psutil_terminate_kill( + [proc for proc in psutil.process_iter() + if (proc.name() == service_name + and proc.pid != self.ceph_mgr.pid)], + 
CONFIG.ceph_mgr_kill_delay_sec) + else: + psutil_terminate_kill( + [proc for proc in psutil.process_iter() + if proc.name() == service_name], + CONFIG.ceph_mgr_kill_delay_sec) + + def monitor_loop(self): + + """Bring up and monitor ceph-mgr restful plugin. + + Steps: + - wait for Ceph cluster to become available + - configure and start ceph-mgr + - configure and enable restful plugin + - send periodic requests to REST API + - recover from failures + + Note: because this runs as a separate process it + must send status updates to service monitor + via control socket for: ping_failure_count, + restful_plugin_url and certificate. + """ + + # Promote to process group leader so parent (service monitor) + # can kill the monitor plus processes spawned by it. Otherwise + # children of monitor_loop() will keep running in background and + # will be reaped by init when they finish but by then they might + # interfere with any new service instance. + os.setpgrp() + + # Ignoring SIGTERM here ensures process group is not reused by + # the time parent (service monitor) issues the final SIGKILL. + signal.signal(signal.SIGTERM, signal.SIG_IGN) + + while True: + try: + # steps to configure/start ceph-mgr and restful plugin + self.ceph_fsid_get() + self.ceph_mgr_auth_create() + self.ceph_mgr_start() + self.restful_plugin_set_server_port() + self.restful_plugin_enable() + self.restful_plugin_create_certificate() + self.restful_plugin_create_admin_key() + self.restful_plugin_get_url() + self.restful_plugin_get_certificate() + + # REST API should be available now + # start making periodic requests (ping) + while True: + try: + self.restful_plugin_ping() + self.ping_failure_count = 0 + self.request_update_ping_failures( + self.ping_failure_count) + self.ceph_mgr_failure_count = 0 + self.request_update_ceph_mgr_failures( + self.ceph_mgr_failure_count) + time.sleep(CONFIG.restful_plugin_ping_delay_sec) + continue + except RestApiPingFailed as err: + LOG.warning(str(err)) + + LOG.info('REST API ping failure count=%d', + self.ping_failure_count) + self.ping_failure_count += 1 + self.request_update_ping_failures( + self.ping_failure_count) + + # maybe request failed because ceph-mgr is not running + if not self.ceph_mgr_is_running(): + self.ceph_mgr_failure_count += 1 + self.request_update_ceph_mgr_failures( + self.ceph_mgr_failure_count) + self.ceph_mgr_start() + time.sleep(CONFIG.ceph_mgr_grace_period_sec) + continue + + # maybe request failed because cluster health is not ok + if not self.ceph_fsid_get(): + LOG.info('Unable to get cluster fsid. ' + 'Sleep for a while') + time.sleep(CONFIG.cluster_grace_period_sec) + break + + # too many failures? Restart ceph-mgr and go again + # through configuration steps + if (self.ping_failure_count + % CONFIG.ping_fail_count_restart_mgr == 0): + LOG.info('Too many consecutive REST API failures. ' + 'Restart ceph-mgr. 
Update service '
+                                 'url and certificate')
+                        self.ceph_mgr_stop()
+                        self.restful_plugin_url = ''
+                        self.request_update_plugin_url(self.restful_plugin_url)
+                        self.certificate = ''
+                        self.request_update_certificate(self.certificate)
+                        break
+
+                    time.sleep(CONFIG.restful_plugin_ping_delay_sec)
+
+            except CommandFailed as err:
+                LOG.warning(str(err))
+                time.sleep(CONFIG.cluster_grace_period_sec)
+            except CommandTimeout as err:
+                LOG.warning(str(err))
+            except (CephMgrStartFailed, CephRestfulPluginFailed) as err:
+                LOG.warning(str(err))
+                self.ceph_mgr_failure_count += 1
+                self.request_update_ceph_mgr_failures(
+                    self.ceph_mgr_failure_count)
+                time.sleep(CONFIG.ceph_mgr_grace_period_sec)
+            except Exception as err:
+                LOG.exception(err)
+                time.sleep(CONFIG.cluster_grace_period_sec)
+
+    @staticmethod
+    def run_with_timeout(command, timeout, stderr=subprocess.STDOUT):
+        try:
+            LOG.info('Run command: %s', ' '.join(command))
+            return subprocess.check_output(
+                ['/usr/bin/timeout', str(timeout)] + command,
+                stderr=stderr, shell=False).strip()
+        except subprocess.CalledProcessError as err:
+            if err.returncode == GNU_TIMEOUT_EXPIRED_RETCODE:
+                raise CommandTimeout(command=err.cmd, timeout=timeout)
+            raise CommandFailed(command=err.cmd, reason=str(err),
+                                out=err.output)
+
+    def ceph_fsid_get(self):
+        return self.run_with_timeout(['/usr/bin/ceph', 'fsid'],
+                                     CONFIG.ceph_cli_timeout_sec)
+
+    def ceph_mgr_has_auth(self):
+        path = '{}/ceph-{}'.format(
+            CONFIG.ceph_mgr_rundir, CONFIG.ceph_mgr_identity)
+        try:
+            os.makedirs(path)
+        except OSError as err:
+            pass
+        try:
+            self.run_with_timeout(
+                ['/usr/bin/ceph', 'auth', 'get',
+                 'mgr.{}'.format(CONFIG.ceph_mgr_identity),
+                 '-o', '{}/keyring'.format(path)],
+                CONFIG.ceph_cli_timeout_sec)
+            return True
+        except CommandFailed as err:
+            if 'ENOENT' in str(err):
+                return False
+            raise
+
+    def ceph_mgr_auth_create(self):
+        if self.ceph_mgr_has_auth():
+            return
+        LOG.info('Create ceph-mgr authentication')
+        self.run_with_timeout(
+            ['/usr/bin/ceph', 'auth', 'get-or-create',
+             'mgr.{}'.format(CONFIG.ceph_mgr_identity),
+             'mon', 'allow *', 'osd', 'allow *'],
+            CONFIG.ceph_cli_timeout_sec)
+
+    def ceph_mgr_is_running(self):
+        if not self.ceph_mgr:
+            return None
+        try:
+            self.ceph_mgr.wait(timeout=0)
+        except psutil.TimeoutExpired:
+            return True
+        return False
+
+    def ceph_mgr_start(self):
+        if self.ceph_mgr_is_running():
+            return
+        self.stop_unmanaged_ceph_mgr()
+        LOG.info('Start ceph-mgr daemon')
+        try:
+            with open(os.devnull, 'wb') as null:
+                self.ceph_mgr = psutil.Popen(
+                    [CONFIG.ceph_mgr_service,
+                     '--cluster', CONFIG.ceph_mgr_cluster,
+                     '--id', CONFIG.ceph_mgr_identity,
+                     '-f'],
+                    close_fds=True,
+                    stdout=null,
+                    stderr=null,
+                    shell=False)
+        except (OSError, ValueError) as err:
+            raise CephMgrStartFailed(reason=str(err))
+        time.sleep(CONFIG.ceph_mgr_grace_period_sec)
+
+    def ceph_mgr_stop(self):
+        if not self.ceph_mgr:
+            return
+        LOG.info('Stop ceph-mgr')
+        psutil_terminate_kill(self.ceph_mgr, CONFIG.ceph_mgr_kill_delay_sec)
+
+    def restful_plugin_has_server_port(self):
+        try:
+            with open(os.devnull, 'wb') as null:
+                out = self.run_with_timeout(
+                    ['/usr/bin/ceph', 'config-key', 'get',
+                     'config/mgr/mgr/restful/server_port'],
+                    CONFIG.ceph_cli_timeout_sec, stderr=null)
+            if out == str(CONFIG.restful_plugin_port):
+                return True
+            LOG.warning('Restful plugin port mismatch: '
+                        'current=%s, expected=%d', out,
+                        CONFIG.restful_plugin_port)
+        except CommandFailed as err:
+            LOG.warning('Failed to get restful plugin port: '
+                        'reason=%s', str(err))
+        return False
+
+    def restful_plugin_set_server_port(self):
+        if self.restful_plugin_has_server_port():
+            return
+        LOG.info('Set restful plugin port=%d', CONFIG.restful_plugin_port)
+        self.run_with_timeout(
+            ['/usr/bin/ceph', 'config', 'set', 'mgr',
+             'mgr/restful/server_port', str(CONFIG.restful_plugin_port)],
+            CONFIG.ceph_cli_timeout_sec)
+
+    def restful_plugin_has_admin_key(self):
+        try:
+            self.run_with_timeout(
+                ['/usr/bin/ceph', 'config-key', 'get',
+                 'mgr/restful/keys/admin'],
+                CONFIG.ceph_cli_timeout_sec)
+            return True
+        except CommandFailed:
+            pass
+        return False
+
+    def restful_plugin_create_admin_key(self):
+        if self.restful_plugin_has_admin_key():
+            return
+        LOG.info('Create restful plugin admin key')
+        self.run_with_timeout(
+            ['/usr/bin/ceph', 'restful',
+             'create-key', 'admin'],
+            CONFIG.ceph_cli_timeout_sec)
+
+    def restful_plugin_has_certificate(self):
+        try:
+            self.run_with_timeout(
+                ['/usr/bin/ceph', 'config-key', 'get',
+                 'mgr/restful/{}/crt'.format(CONFIG.ceph_mgr_identity)],
+                CONFIG.ceph_cli_timeout_sec)
+            return True
+        except CommandFailed:
+            pass
+        return False
+
+    def restful_plugin_create_certificate(self):
+        if self.restful_plugin_has_certificate():
+            return
+        LOG.info('Create restful plugin self signed certificate')
+        self.run_with_timeout(
+            ['/usr/bin/ceph', 'restful',
+             'create-self-signed-cert'],
+            CONFIG.ceph_cli_timeout_sec)
+
+    def restful_plugin_is_enabled(self):
+        command = ['/usr/bin/ceph', 'mgr', 'module', 'ls',
+                   '--format', 'json']
+        with open(os.devnull, 'wb') as null:
+            out = self.run_with_timeout(
+                command, CONFIG.ceph_cli_timeout_sec, stderr=null)
+        try:
+            if 'restful' in json.loads(out)['enabled_modules']:
+                return True
+        except ValueError as err:
+            raise CommandFailed(
+                command=' '.join(command),
+                reason='unable to decode json: {}'.format(err), out=out)
+        except KeyError as err:
+            raise CommandFailed(
+                command=' '.join(command),
+                reason='missing expected key: {}'.format(err), out=out)
+        return False
+
+    def restful_plugin_enable(self):
+        if not self.restful_plugin_is_enabled():
+            LOG.info('Enable restful plugin')
+            self.run_with_timeout(
+                ['/usr/bin/ceph', 'mgr',
+                 'module', 'enable', 'restful'],
+                CONFIG.ceph_cli_timeout_sec)
+        time.sleep(CONFIG.restful_plugin_grace_period_sec)
+
+    def restful_plugin_get_url(self):
+        command = ['/usr/bin/ceph', 'mgr', 'services',
+                   '--format', 'json']
+        with open(os.devnull, 'wb') as null:
+            out = self.run_with_timeout(
+                command, CONFIG.ceph_cli_timeout_sec, stderr=null)
+        try:
+            self.restful_plugin_url = json.loads(out)['restful']
+        except ValueError as err:
+            raise CephRestfulPluginFailed(
+                reason='unable to decode json: {} output={}'.format(err, out))
+        except KeyError as err:
+            raise CephRestfulPluginFailed(
+                reason='missing expected key: {} in output={}'.format(err, out))
+        self.request_update_plugin_url(self.restful_plugin_url)
+
+    def restful_plugin_get_certificate(self):
+        command = ['/usr/bin/ceph', 'config-key', 'get',
+                   'mgr/restful/controller-0/crt']
+        with open(os.devnull, 'wb') as null:
+            certificate = self.run_with_timeout(
+                command, CONFIG.ceph_cli_timeout_sec, stderr=null)
+        with open(CONFIG.restful_plugin_cert_path, 'wb') as cert_file:
+            cert_file.write(certificate)
+        self.certificate = CONFIG.restful_plugin_cert_path
+        self.request_update_certificate(
+            self.certificate)
+
+    def restful_plugin_ping(self):
+        if not self.restful_plugin_url:
+            raise RestApiPingFailed(reason='missing service url')
+        if not self.certificate:
+            raise RestApiPingFailed(reason='missing certificate')
+        LOG.debug('Ping restful plugin: url=%s', self.restful_plugin_url)
+        try:
+            response = requests.request(
+                'GET', self.restful_plugin_url, verify=False,
+                timeout=CONFIG.rest_api_timeout_sec)
+            if not response.ok:
+                raise RestApiPingFailed(
+                    reason='response not ok ({})'.format(response))
+            LOG.debug('Ping restful plugin OK')
+        except (requests.ConnectionError,
+                requests.Timeout,
+                requests.HTTPError) as err:
+            raise RestApiPingFailed(reason=str(err))
+
+    @staticmethod
+    def _make_client_socket():
+        sock = socket.socket(
+            socket.AF_UNIX, socket.SOCK_SEQPACKET)
+        sock.settimeout(2 * CONFIG.rest_api_timeout_sec)
+        sock.connect(CONFIG.service_socket)
+        return sock
+
+    @staticmethod
+    def request_status():
+        try:
+            with contextlib.closing(
+                    ServiceMonitor._make_client_socket()) as sock:
+                sock.send('status')
+                status = sock.recv(CONFIG.service_socket_bufsize)
+                LOG.debug('Status %s', status)
+                return status.startswith('OK')
+        except socket.error as err:
+            LOG.error('Status error: reason=%s', err)
+            return False
+
+    @staticmethod
+    def request_stop():
+        try:
+            with contextlib.closing(
+                    ServiceMonitor._make_client_socket()) as sock:
+                sock.send('stop')
+                response = sock.recv(CONFIG.service_socket_bufsize)
+                LOG.debug('Stop response: %s', response)
+                return True
+        except socket.error as err:
+            LOG.error('Stop error: reason=%s', err)
+            return False
+
+    @staticmethod
+    def request_update_ceph_mgr_failures(count):
+        try:
+            with contextlib.closing(
+                    ServiceMonitor._make_client_socket()) as sock:
+                sock.send('ceph-mgr-failures {}'.format(count))
+                sock.recv(CONFIG.service_socket_bufsize)
+                return True
+        except socket.error as err:
+            LOG.error('Update ceph-mgr failures error: reason=%s', err)
+            return False
+
+    @staticmethod
+    def request_update_ping_failures(count):
+        try:
+            with contextlib.closing(
+                    ServiceMonitor._make_client_socket()) as sock:
+                sock.send('ping-failures {}'.format(count))
+                sock.recv(CONFIG.service_socket_bufsize)
+                return True
+        except socket.error as err:
+            LOG.error('Update ping failures error: reason=%s', err)
+            return False
+
+    @staticmethod
+    def request_update_plugin_url(url):
+        try:
+            with contextlib.closing(
+                    ServiceMonitor._make_client_socket()) as sock:
+                sock.send('restful-url {}'.format(url))
+                sock.recv(CONFIG.service_socket_bufsize)
+                return True
+        except socket.error as err:
+            LOG.error('Update restful plugin url error: reason=%s', err)
+            return False
+
+    @staticmethod
+    def request_update_certificate(path):
+        try:
+            with contextlib.closing(
+                    ServiceMonitor._make_client_socket()) as sock:
+                sock.send('certificate {}'.format(path))
+                sock.recv(CONFIG.service_socket_bufsize)
+                return True
+        except socket.error as err:
+            LOG.error('Update certificate error: reason=%s', err)
+            return False
+
+
+class InitWrapper(object):
+
+    """Handle System V init script actions: start, stop, restart, etc. """
+
+    def __init__(self):
+
+        """Dispatch command line action to the corresponding function.
+
+        Candidate action functions are all class methods except ones
+        that start with an underscore.
+        """
+
+        parser = argparse.ArgumentParser()
+        actions = [m[0]
+                   for m in inspect.getmembers(self)
+                   if (inspect.ismethod(m[1])
+                       and not m[0].startswith('_'))]
+        parser.add_argument(
+            'action',
+            choices=actions)
+        self.args = parser.parse_args()
+        getattr(self, self.args.action)()
+
+    def start(self):
+
+        """Start ServiceMonitor as a daemon unless one is already running.
+
+        Use a pipe to report monitor status back to this process.
+        """
+
+        pipe = os.pipe()
+        child = os.fork()
+        if child == 0:
+            os.close(pipe[0])
+            with daemon.DaemonContext(files_preserve=[pipe[1]]):
+                # prevent duplication of messages in log
+                global LOG
+                LOG = setup_logging(cleanup_handlers=True)
+                try:
+                    monitor = ServiceMonitor()
+                    status = 'OK'
+                except ServiceAlreadyStarted:
+                    os.write(pipe[1], 'OK')
+                    os.close(pipe[1])
+                    return
+                except Exception as err:
+                    status = str(err)
+                os.write(pipe[1], status)
+                os.close(pipe[1])
+                if status == 'OK':
+                    try:
+                        monitor.run()
+                    except ServiceException as err:
+                        LOG.warning(str(err))
+                    except Exception as err:
+                        LOG.exception('Service monitor error: reason=%s', err)
+        else:
+            os.close(pipe[1])
+            try:
+                status = os.read(pipe[0], CONFIG.service_socket_bufsize)
+                if status == 'OK':
+                    sys.exit(0)
+                else:
+                    LOG.warning('Service monitor failed to start: '
+                                'status=%s', status)
+            except IOError as err:
+                LOG.warning('Failed to read monitor status: reason=%s', err)
+            os.close(pipe[0])
+            os.waitpid(child, 0)
+            sys.exit(1)
+
+    def stop(self):
+
+        """Tell ServiceMonitor daemon to stop running.
+
+        If the request fails, stop the ServiceMonitor and ceph-mgr processes
+        using SIGTERM followed by SIGKILL.
+        """
+
+        result = ServiceMonitor.request_stop()
+        if not result:
+            ceph_mgr = os.path.basename(CONFIG.ceph_mgr_service)
+            procs = []
+            for proc in psutil.process_iter():
+                name = proc.name()
+                if name == CONFIG.service_name:
+                    procs.append(proc)
+                if name == ceph_mgr:
+                    procs.append(proc)
+            psutil_terminate_kill(procs, CONFIG.ceph_mgr_kill_delay_sec)
+
+    def restart(self):
+        self.stop()
+        self.start()
+
+    def force_reload(self):
+        self.stop()
+        self.start()
+
+    def reload(self):
+        self.stop()
+        self.start()
+
+    def status(self):
+
+        """Report status from ServiceMonitor.
+
+        We don't just try to access REST API here because ServiceMonitor may
+        be in the process of starting/configuring ceph-mgr and restful
+        plugin in which case we report OK to avoid being restarted by SM.
+        """
+
+        status = ServiceMonitor.request_status()
+        sys.exit(0 if status is True else 1)
+
+
+if __name__ == '__main__':
+    InitWrapper()
diff --git a/recipes-core/stx-integ-ceph/files/mgr-restful-plugin.service b/recipes-core/stx-integ-ceph/files/mgr-restful-plugin.service
new file mode 100644
index 0000000..b3e61f0
--- /dev/null
+++ b/recipes-core/stx-integ-ceph/files/mgr-restful-plugin.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=Ceph MGR RESTful API Plugin
+After=network-online.target sw-patch.service
+
+[Service]
+Type=forking
+Restart=no
+KillMode=process
+RemainAfterExit=yes
+ExecStart=/etc/rc.d/init.d/mgr-restful-plugin start
+ExecStop=/etc/rc.d/init.d/mgr-restful-plugin stop
+ExecReload=/etc/rc.d/init.d/mgr-restful-plugin reload
+
+[Install]
+WantedBy=multi-user.target
diff --git a/recipes-core/stx-integ-ceph/files/starlingx-docker-override.conf b/recipes-core/stx-integ-ceph/files/starlingx-docker-override.conf
new file mode 100644
index 0000000..5ffd859
--- /dev/null
+++ b/recipes-core/stx-integ-ceph/files/starlingx-docker-override.conf
@@ -0,0 +1,3 @@
+[Service]
+ExecStopPost=/usr/sbin/ceph-preshutdown.sh
+
diff --git a/recipes-core/stx-integ-ceph/patches/0001-Add-hooks-for-orderly-shutdown-on-controller.patch b/recipes-core/stx-integ-ceph/patches/0001-Add-hooks-for-orderly-shutdown-on-controller.patch
new file mode 100644
index 0000000..15bb7c3
--- /dev/null
+++ b/recipes-core/stx-integ-ceph/patches/0001-Add-hooks-for-orderly-shutdown-on-controller.patch
@@ -0,0 +1,59 @@
+From 03340eaf0004e3cc8e3f8991ea96a46757d92830 Mon Sep 17 00:00:00 2001
+From: Don Penney <don.penney@windriver.com>
+Date: Sat, 26 Jan 2019 13:34:55 -0500
+Subject: [PATCH] Add hooks for orderly shutdown on controller
+
+Hook the ceph init script to add systemd overrides to define
+an orderly shutdown for StarlingX controllers.
+
+Signed-off-by: Don Penney <don.penney@windriver.com>
+---
+ src/init-ceph.in | 32 ++++++++++++++++++++++++++++++++
+ 1 file changed, 32 insertions(+)
+
+diff --git a/src/init-ceph.in b/src/init-ceph.in
+index 1fdb4b3..515d818 100644
+--- a/src/init-ceph.in
++++ b/src/init-ceph.in
+@@ -861,6 +861,38 @@ for name in $what; do
+             fi
+         fi
+ 
++        . /etc/platform/platform.conf
++        if [ "${nodetype}" = "controller" ]; then
++            # StarlingX: Hook the transient services launched by systemd-run
++            # to allow for proper cleanup and orderly shutdown
++
++            # Set nullglob so wildcards will return empty string if no match
++            shopt -s nullglob
++
++            OSD_SERVICES=$(for svc in /run/systemd/system/ceph-osd*.service; do basename $svc; done | xargs echo)
++            for d in /run/systemd/system/ceph-osd*.d; do
++                cat <<EOF > $d/starlingx-overrides.conf
++[Unit]
++Before=docker.service
++After=sm-shutdown.service
++
++EOF
++            done
++
++            for d in /run/systemd/system/ceph-mon*.d; do
++                cat <<EOF > $d/starlingx-overrides.conf
++[Unit]
++Before=docker.service
++After=sm-shutdown.service ${OSD_SERVICES}
++
++EOF
++            done
++
++            shopt -u nullglob
++
++            systemctl daemon-reload
++        fi
++
+         [ -n "$post_start" ] && do_cmd "$post_start"
+         [ -n "$lockfile" ] && [ "$?" -eq 0 ] && touch $lockfile
+         ;;
+-- 
+1.8.3.1
+