From 61b93c6b461afa6b4ac7222d388ff8f00bae3f48 Mon Sep 17 00:00:00 2001 From: Matthew Heler Date: Tue, 8 Jan 2019 20:11:43 -0600 Subject: [PATCH] [CEPH] Journal automation and disk cleanup updates Refactor the OSD Block initialization code that performs cleanups to use all the commands that ceph-disk zap uses. Extend the functionality when an OSD initializes to create journal partitions automatically. For example, if /dev/sdc3 is defined as a journal disk, the chart will automatically create that partition. The size of the journal partition is determined by the osd_journal_size that is defined in ceph.conf. Change the OSD_FORCE_ZAP option to OSD_FORCE_REPAIR to automatically recreate/self-heal Filestore OSDs. This option will now call a function to repair a journal disk, and recreate partitions. One caveat is that the journal must be defined as a device partition (ex. /dev/sdc1) for this repair to work; if the whole disk (ex. /dev/sdc) is defined as the journal disk, the OSD is zapped and re-created instead.
Change-Id: Ied131b51605595dce65eb29c0b64cb6af979066e --- ceph-osd/templates/bin/osd/_block.sh.tpl | 4 + ceph-osd/templates/bin/osd/_common.sh.tpl | 15 ++- ceph-osd/templates/bin/osd/_init.sh.tpl | 145 ++++++++++++++-------- ceph-osd/values.yaml | 9 +- 4 files changed, 116 insertions(+), 57 deletions(-) diff --git a/ceph-osd/templates/bin/osd/_block.sh.tpl b/ceph-osd/templates/bin/osd/_block.sh.tpl index a69f2ee81..ac0378407 100644 --- a/ceph-osd/templates/bin/osd/_block.sh.tpl +++ b/ceph-osd/templates/bin/osd/_block.sh.tpl @@ -78,6 +78,10 @@ if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then exit 1 else OSD_JOURNAL="${OSD_JOURNAL_DISK}" + if [ -e "${OSD_PATH}/run_mkjournal" ]; then + ceph-osd -i ${OSD_ID} --mkjournal + rm -rf ${OSD_PATH}/run_mkjournal + fi fi fi if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then diff --git a/ceph-osd/templates/bin/osd/_common.sh.tpl b/ceph-osd/templates/bin/osd/_common.sh.tpl index f88243e9e..3a2168ba1 100644 --- a/ceph-osd/templates/bin/osd/_common.sh.tpl +++ b/ceph-osd/templates/bin/osd/_common.sh.tpl @@ -23,6 +23,7 @@ set -ex : "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}" : "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}" : "${OSD_JOURNAL_UUID:=$(uuidgen)}" +: "${OSD_JOURNAL_SIZE:=$(awk '/^osd_journal_size/{print $3}' ${CEPH_CONF}.template)}" eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))') eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))') @@ -142,6 +143,16 @@ function dev_part { fi } +function disk_zap { + # Run all the commands that ceph-disk zap uses to clear a disk + local device=${1} + wipefs --all ${device} + # Wipe the first 200MB boundary, as Bluestore redeployments will not work otherwise + dd if=/dev/zero of=${device} bs=1M count=200 + sgdisk --zap-all -- ${device} + sgdisk --clear 
--mbrtogpt -- ${device} +} + function osd_pg_interval_fix { # NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then @@ -154,7 +165,9 @@ function osd_pg_interval_fix { function udev_settle { partprobe "${OSD_DEVICE}" if [ "x$JOURNAL_TYPE" == "xblock-logical" ]; then - partprobe "${OSD_JOURNAL}" + OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL}) + local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g') + partprobe "${JDEV}" fi # watch the udev event queue, and exit if all current events are handled udevadm settle --timeout=600 diff --git a/ceph-osd/templates/bin/osd/_init.sh.tpl b/ceph-osd/templates/bin/osd/_init.sh.tpl index 311db4ab0..44504a12f 100644 --- a/ceph-osd/templates/bin/osd/_init.sh.tpl +++ b/ceph-osd/templates/bin/osd/_init.sh.tpl @@ -20,7 +20,7 @@ set -ex source /tmp/osd-common.sh -: "${OSD_FORCE_ZAP:=1}" +: "${OSD_FORCE_REPAIR:=1}" # We do not want to zap journal disk. Tracking this option seperatly. : "${JOURNAL_FORCE_ZAP:=0}" @@ -55,62 +55,91 @@ function osd_disk_prepare { # check device status first if ! parted --script ${OSD_DEVICE} print > /dev/null 2>&1; then - if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then - echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_ZAP is enabled so we are zapping the device anyway" - sgdisk -Z ${OSD_DEVICE} + if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then + echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway" + disk_zap ${OSD_DEVICE} else echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird." echo "It would be too dangerous to destroy it without any notification." - echo "Please set OSD_FORCE_ZAP to '1' if you really want to zap this disk." + echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk." 
exit 1 fi fi - udev_settle - # then search for some ceph metadata on the disk if [[ "$(parted --script ${OSD_DEVICE} print | egrep '^ 1.*ceph data')" ]]; then - if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then + if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then if [ -b "${OSD_DEVICE}1" ]; then local cephFSID=$(ceph-conf --lookup fsid) if [ ! -z "${cephFSID}" ]; then local tmpmnt=$(mktemp -d) mount ${OSD_DEVICE}1 ${tmpmnt} + if [ "${OSD_BLUESTORE:-0}" -ne 1 ] && [ "x$JOURNAL_TYPE" != "xdirectory" ]; then + # we only care about journals for filestore. + if [ -f "${tmpmnt}/whoami" ]; then + OSD_JOURNAL_DISK=$(readlink -f "${tmpmnt}/journal") + local osd_id=$(cat "${tmpmnt}/whoami") + if [ ! -b "${OSD_JOURNAL_DISK}" ]; then + OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL}) + local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g') + if [ ${jdev} == ${OSD_JOURNAL} ]; then + echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL}." + echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it." + rm -rf ${tmpmnt}/ceph_fsid + else + echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL_DISK}." + echo "Because OSD_FORCE_REPAIR is set and paritions are manually defined, we will" + echo "attempt to recreate the missing journal device partitions." + osd_journal_create ${OSD_JOURNAL} + ln -sf /dev/disk/by-partuuid/${OSD_JOURNAL_UUID} ${tmpmnt}/journal + echo ${OSD_JOURNAL_UUID} | tee ${tmpmnt}/journal_uuid + chown ceph. ${OSD_JOURNAL} + # During OSD start we will format the journal and set the fsid + touch ${tmpmnt}/run_mkjournal + fi + fi + else + echo "It looks like ${OSD_DEVICE} has a ceph data partition but is missing it's metadata." + echo "The device may contain inconsistent metadata or be corrupted." + echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it." 
+ rm -rf ${tmpmnt}/ceph_fsid + fi + fi if [ -f "${tmpmnt}/ceph_fsid" ]; then osdFSID=$(cat "${tmpmnt}/ceph_fsid") umount ${tmpmnt} if [ ${osdFSID} != ${cephFSID} ]; then echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster." echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}" - echo "Because OSD_FORCE_ZAP was set, we will zap this device." - sgdisk -Z ${OSD_DEVICE} + echo "Because OSD_FORCE_REPAIR was set, we will zap this device." + disk_zap ${OSD_DEVICE} else echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster." - echo "OSD_FORCE_ZAP is set, but will be ignored and the device will not be zapped." + echo "OSD_FORCE_REPAIR is set, but will be ignored and the device will not be zapped." echo "Moving on, trying to activate the OSD now." return fi else umount ${tmpmnt} echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID." - echo "Because OSD_FORCE_ZAP was set, we will zap this device." - sgdisk -Z ${OSD_DEVICE} + echo "Because OSD_FORCE_REPAIR was set, we will zap this device." + disk_zap ${OSD_DEVICE} fi else echo "Unable to determine the FSID of the current cluster." - echo "OSD_FORCE_ZAP is set, but this OSD will not be zapped." + echo "OSD_FORCE_REPAIR is set, but this OSD will not be zapped." echo "Moving on, trying to activate the OSD now." return fi else echo "parted says ${OSD_DEVICE}1 should exist, but we do not see it." - echo "We will ignore OSD_FORCE_ZAP and try to use the device as-is" + echo "We will ignore OSD_FORCE_REPAIR and try to use the device as-is" echo "Moving on, trying to activate the OSD now." 
return fi else - echo "INFO- It looks like ${OSD_DEVICE} is an OSD, set OSD_FORCE_ZAP=1 to use this device anyway and zap its content" - echo "You can also use the zap_device scenario on the appropriate device to zap it" + echo "INFO- It looks like ${OSD_DEVICE} is an OSD, set OSD_FORCE_REPAIR=1 to use this device anyway and zap its content" + echo "You can also use the disk_zap scenario on the appropriate device to zap it" echo "Moving on, trying to activate the OSD now." return fi @@ -118,54 +147,60 @@ function osd_disk_prepare { if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then # we only care about journals for filestore. - if [ -n "${OSD_JOURNAL}" ]; then - if [ -b $OSD_JOURNAL ]; then - OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL}) - OSD_JOURNAL_PARTITION=$(echo $OSD_JOURNAL_PARTITION | sed 's/[^0-9]//g') - if [ -z "${OSD_JOURNAL_PARTITION}" ]; then - # maybe they specified the journal as a /dev path like '/dev/sdc12': - local JDEV=$(echo ${OSD_JOURNAL} | sed 's/\(.*[^0-9]\)[0-9]*$/\1/') - if [ -d /sys/block/$(basename ${JDEV})/$(basename ${OSD_JOURNAL}) ]; then - OSD_JOURNAL=$(dev_part ${JDEV} `echo ${OSD_JOURNAL} |\ - sed 's/.*[^0-9]\([0-9]*\)$/\1/'`) - OSD_JOURNAL_PARTITION=${JDEV} - fi - else - OSD_JOURNAL=$(dev_part ${OSD_JOURNAL} ${OSD_JOURNAL_PARTITION}) - fi - fi - chown ceph. ${OSD_JOURNAL} - else - echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}" - echo "For better performance on HDD, consider moving your journal to a separate device" - fi - CLI_OPTS="${CLI_OPTS} --filestore" + osd_journal_prepare else OSD_JOURNAL='' CLI_OPTS="${CLI_OPTS} --bluestore" fi - if [ -b "${OSD_JOURNAL}" -a "${JOURNAL_FORCE_ZAP:-0}" -eq 1 ]; then - # if we got here and zap is set, it's ok to wipe the journal. - echo "OSD_FORCE_ZAP is set, so we will erase the journal device ${OSD_JOURNAL}" - if [ -z "${OSD_JOURNAL_PARTITION}" ]; then - # it's a raw block device. nuke any existing partition table. 
- sgdisk -Z ${OSD_JOURNAL} - else - # we are likely working on a partition. Just make a filesystem on - # the device, as other partitions may be in use so nuking the whole - # disk isn't safe. - wipefs ${OSD_JOURNAL} - fi - fi + udev_settle if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then - export OSD_JOURNAL="--journal-file" + ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} --journal-file + else + ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL} fi - ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL} +} - udev_settle +function osd_journal_create { + local osd_journal=${1} + local osd_journal_partition=$(echo ${osd_journal} | sed 's/[^0-9]//g') + local jdev=$(echo ${osd_journal} | sed 's/[0-9]//g') + if [ -b "${jdev}" ]; then + sgdisk --new=${osd_journal_partition}:0:+${OSD_JOURNAL_SIZE}M \ + --change-name='${osd_journal_partition}:ceph journal' \ + --partition-guid=${osd_journal_partition}:${OSD_JOURNAL_UUID} \ + --typecode=${osd_journal_partition}:45b0969e-9b03-4f30-b4c6-b4b80ceff106 --mbrtogpt -- ${jdev} + OSD_JOURNAL=$(dev_part ${jdev} ${osd_journal_partition}) + else + echo "The backing device ${jdev} for ${OSD_JOURNAL} does not exist on this system." + exit 1 + fi +} + +function osd_journal_prepare { + if [ -n "${OSD_JOURNAL}" ]; then + if [ -b ${OSD_JOURNAL} ]; then + OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL}) + OSD_JOURNAL_PARTITION=$(echo ${OSD_JOURNAL} | sed 's/[^0-9]//g') + local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g') + if [ -z "${OSD_JOURNAL_PARTITION}" ]; then + OSD_JOURNAL=$(dev_part ${jdev} ${OSD_JOURNAL_PARTITION}) + else + OSD_JOURNAL=${OSD_JOURNAL} + fi + elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then + # The block device exists but doesn't appear to be paritioned, we will proceed with parititioning the device. + OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL}) + osd_journal_create ${OSD_JOURNAL} + fi + chown ceph. 
${OSD_JOURNAL} + elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then + echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}" + echo "For better performance on HDD, consider moving your journal to a separate device" + fi + CLI_OPTS="${CLI_OPTS} --filestore" } if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then diff --git a/ceph-osd/values.yaml b/ceph-osd/values.yaml index 6ae9ebc64..c07e035dc 100644 --- a/ceph-osd/values.yaml +++ b/ceph-osd/values.yaml @@ -150,10 +150,17 @@ conf: location: /var/lib/openstack-helm/ceph/osd/journal-one # - data: # type: block-logical + # location: /dev/sdd + # journal: + # type: block-logical + # location: /dev/sdf1 + # - data: + # type: block-logical # location: /dev/sde # journal: # type: block-logical - # location: /dev/sdf + # location: /dev/sdf2 + # - data: # type: block-logical # location: /dev/sdg