Merge "[CEPH] Journal automation and disk cleanup updates"

This commit is contained in:
Zuul 2019-01-28 06:05:45 +00:00 committed by Gerrit Code Review
commit f0f1b57b3c
4 changed files with 116 additions and 57 deletions

View File

@ -78,6 +78,10 @@ if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
exit 1 exit 1
else else
OSD_JOURNAL="${OSD_JOURNAL_DISK}" OSD_JOURNAL="${OSD_JOURNAL_DISK}"
if [ -e "${OSD_PATH}/run_mkjournal" ]; then
ceph-osd -i ${OSD_ID} --mkjournal
rm -rf ${OSD_PATH}/run_mkjournal
fi
fi fi
fi fi
if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then

View File

@ -23,6 +23,7 @@ set -ex
: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}" : "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}"
: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}" : "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}"
: "${OSD_JOURNAL_UUID:=$(uuidgen)}" : "${OSD_JOURNAL_UUID:=$(uuidgen)}"
: "${OSD_JOURNAL_SIZE:=$(awk '/^osd_journal_size/{print $3}' ${CEPH_CONF}.template)}"
eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))') eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))')
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))') eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
@ -142,6 +143,16 @@ function dev_part {
fi fi
} }
function disk_zap {
  # Run all the commands that ceph-disk zap uses to clear a disk.
  #
  # Arguments: $1 - path of the device to wipe (e.g. /dev/sdb)
  # Returns:   non-zero without touching anything when no device is given;
  #            otherwise the status of the last wipe command (callers run
  #            under `set -e`, so any failing step aborts the script).
  local device="${1:-}"
  if [ -z "${device}" ]; then
    # Refuse to invoke destructive tools with an empty target.
    echo "disk_zap: no device specified, refusing to zap" >&2
    return 1
  fi
  # Remove every filesystem/RAID/partition-table signature wipefs knows about.
  wipefs --all "${device}"
  # Wipe the first 200MB boundary, as Bluestore redeployments will not work otherwise
  dd if=/dev/zero of="${device}" bs=1M count=200
  # Destroy the GPT and MBR data structures, then write a fresh empty GPT.
  sgdisk --zap-all -- "${device}"
  sgdisk --clear --mbrtogpt -- "${device}"
}
function osd_pg_interval_fix { function osd_pg_interval_fix {
# NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running # NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running
if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then
@ -154,7 +165,9 @@ function osd_pg_interval_fix {
function udev_settle { function udev_settle {
partprobe "${OSD_DEVICE}" partprobe "${OSD_DEVICE}"
if [ "x$JOURNAL_TYPE" == "xblock-logical" ]; then if [ "x$JOURNAL_TYPE" == "xblock-logical" ]; then
partprobe "${OSD_JOURNAL}" OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
partprobe "${JDEV}"
fi fi
# watch the udev event queue, and exit if all current events are handled # watch the udev event queue, and exit if all current events are handled
udevadm settle --timeout=600 udevadm settle --timeout=600

View File

@ -20,7 +20,7 @@ set -ex
source /tmp/osd-common.sh source /tmp/osd-common.sh
: "${OSD_FORCE_ZAP:=1}" : "${OSD_FORCE_REPAIR:=1}"
# We do not want to zap the journal disk. Tracking this option separately. # We do not want to zap the journal disk. Tracking this option separately.
: "${JOURNAL_FORCE_ZAP:=0}" : "${JOURNAL_FORCE_ZAP:=0}"
@ -55,62 +55,91 @@ function osd_disk_prepare {
# check device status first # check device status first
if ! parted --script ${OSD_DEVICE} print > /dev/null 2>&1; then if ! parted --script ${OSD_DEVICE} print > /dev/null 2>&1; then
if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_ZAP is enabled so we are zapping the device anyway" echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
sgdisk -Z ${OSD_DEVICE} disk_zap ${OSD_DEVICE}
else else
echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird." echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
echo "It would be too dangerous to destroy it without any notification." echo "It would be too dangerous to destroy it without any notification."
echo "Please set OSD_FORCE_ZAP to '1' if you really want to zap this disk." echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk."
exit 1 exit 1
fi fi
fi fi
udev_settle
# then search for some ceph metadata on the disk # then search for some ceph metadata on the disk
if [[ "$(parted --script ${OSD_DEVICE} print | egrep '^ 1.*ceph data')" ]]; then if [[ "$(parted --script ${OSD_DEVICE} print | egrep '^ 1.*ceph data')" ]]; then
if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
if [ -b "${OSD_DEVICE}1" ]; then if [ -b "${OSD_DEVICE}1" ]; then
local cephFSID=$(ceph-conf --lookup fsid) local cephFSID=$(ceph-conf --lookup fsid)
if [ ! -z "${cephFSID}" ]; then if [ ! -z "${cephFSID}" ]; then
local tmpmnt=$(mktemp -d) local tmpmnt=$(mktemp -d)
mount ${OSD_DEVICE}1 ${tmpmnt} mount ${OSD_DEVICE}1 ${tmpmnt}
if [ "${OSD_BLUESTORE:-0}" -ne 1 ] && [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
# we only care about journals for filestore.
if [ -f "${tmpmnt}/whoami" ]; then
OSD_JOURNAL_DISK=$(readlink -f "${tmpmnt}/journal")
local osd_id=$(cat "${tmpmnt}/whoami")
if [ ! -b "${OSD_JOURNAL_DISK}" ]; then
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
if [ ${jdev} == ${OSD_JOURNAL} ]; then
echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL}."
echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
rm -rf ${tmpmnt}/ceph_fsid
else
echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL_DISK}."
echo "Because OSD_FORCE_REPAIR is set and paritions are manually defined, we will"
echo "attempt to recreate the missing journal device partitions."
osd_journal_create ${OSD_JOURNAL}
ln -sf /dev/disk/by-partuuid/${OSD_JOURNAL_UUID} ${tmpmnt}/journal
echo ${OSD_JOURNAL_UUID} | tee ${tmpmnt}/journal_uuid
chown ceph. ${OSD_JOURNAL}
# During OSD start we will format the journal and set the fsid
touch ${tmpmnt}/run_mkjournal
fi
fi
else
echo "It looks like ${OSD_DEVICE} has a ceph data partition but is missing it's metadata."
echo "The device may contain inconsistent metadata or be corrupted."
echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
rm -rf ${tmpmnt}/ceph_fsid
fi
fi
if [ -f "${tmpmnt}/ceph_fsid" ]; then if [ -f "${tmpmnt}/ceph_fsid" ]; then
osdFSID=$(cat "${tmpmnt}/ceph_fsid") osdFSID=$(cat "${tmpmnt}/ceph_fsid")
umount ${tmpmnt} umount ${tmpmnt}
if [ ${osdFSID} != ${cephFSID} ]; then if [ ${osdFSID} != ${cephFSID} ]; then
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster." echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}" echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
echo "Because OSD_FORCE_ZAP was set, we will zap this device." echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
sgdisk -Z ${OSD_DEVICE} disk_zap ${OSD_DEVICE}
else else
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster." echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
echo "OSD_FORCE_ZAP is set, but will be ignored and the device will not be zapped." echo "OSD_FORCE_REPAIR is set, but will be ignored and the device will not be zapped."
echo "Moving on, trying to activate the OSD now." echo "Moving on, trying to activate the OSD now."
return return
fi fi
else else
umount ${tmpmnt} umount ${tmpmnt}
echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID." echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
echo "Because OSD_FORCE_ZAP was set, we will zap this device." echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
sgdisk -Z ${OSD_DEVICE} disk_zap ${OSD_DEVICE}
fi fi
else else
echo "Unable to determine the FSID of the current cluster." echo "Unable to determine the FSID of the current cluster."
echo "OSD_FORCE_ZAP is set, but this OSD will not be zapped." echo "OSD_FORCE_REPAIR is set, but this OSD will not be zapped."
echo "Moving on, trying to activate the OSD now." echo "Moving on, trying to activate the OSD now."
return return
fi fi
else else
echo "parted says ${OSD_DEVICE}1 should exist, but we do not see it." echo "parted says ${OSD_DEVICE}1 should exist, but we do not see it."
echo "We will ignore OSD_FORCE_ZAP and try to use the device as-is" echo "We will ignore OSD_FORCE_REPAIR and try to use the device as-is"
echo "Moving on, trying to activate the OSD now." echo "Moving on, trying to activate the OSD now."
return return
fi fi
else else
echo "INFO- It looks like ${OSD_DEVICE} is an OSD, set OSD_FORCE_ZAP=1 to use this device anyway and zap its content" echo "INFO- It looks like ${OSD_DEVICE} is an OSD, set OSD_FORCE_REPAIR=1 to use this device anyway and zap its content"
echo "You can also use the zap_device scenario on the appropriate device to zap it" echo "You can also use the disk_zap scenario on the appropriate device to zap it"
echo "Moving on, trying to activate the OSD now." echo "Moving on, trying to activate the OSD now."
return return
fi fi
@ -118,54 +147,60 @@ function osd_disk_prepare {
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
# we only care about journals for filestore. # we only care about journals for filestore.
if [ -n "${OSD_JOURNAL}" ]; then osd_journal_prepare
if [ -b $OSD_JOURNAL ]; then
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
OSD_JOURNAL_PARTITION=$(echo $OSD_JOURNAL_PARTITION | sed 's/[^0-9]//g')
if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
# maybe they specified the journal as a /dev path like '/dev/sdc12':
local JDEV=$(echo ${OSD_JOURNAL} | sed 's/\(.*[^0-9]\)[0-9]*$/\1/')
if [ -d /sys/block/$(basename ${JDEV})/$(basename ${OSD_JOURNAL}) ]; then
OSD_JOURNAL=$(dev_part ${JDEV} `echo ${OSD_JOURNAL} |\
sed 's/.*[^0-9]\([0-9]*\)$/\1/'`)
OSD_JOURNAL_PARTITION=${JDEV}
fi
else
OSD_JOURNAL=$(dev_part ${OSD_JOURNAL} ${OSD_JOURNAL_PARTITION})
fi
fi
chown ceph. ${OSD_JOURNAL}
else
echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}"
echo "For better performance on HDD, consider moving your journal to a separate device"
fi
CLI_OPTS="${CLI_OPTS} --filestore"
else else
OSD_JOURNAL='' OSD_JOURNAL=''
CLI_OPTS="${CLI_OPTS} --bluestore" CLI_OPTS="${CLI_OPTS} --bluestore"
fi fi
if [ -b "${OSD_JOURNAL}" -a "${JOURNAL_FORCE_ZAP:-0}" -eq 1 ]; then udev_settle
# if we got here and zap is set, it's ok to wipe the journal.
echo "OSD_FORCE_ZAP is set, so we will erase the journal device ${OSD_JOURNAL}"
if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
# it's a raw block device. nuke any existing partition table.
sgdisk -Z ${OSD_JOURNAL}
else
# we are likely working on a partition. Just make a filesystem on
# the device, as other partitions may be in use so nuking the whole
# disk isn't safe.
wipefs ${OSD_JOURNAL}
fi
fi
if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then
export OSD_JOURNAL="--journal-file" ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} --journal-file
else
ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL}
fi fi
ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL} }
function osd_journal_create {
  # Create a GPT "ceph journal" partition for a filestore OSD.
  #
  # Arguments: $1 - path of the journal partition to create (e.g. /dev/sdf1)
  # Globals:   OSD_JOURNAL_SIZE (read) - partition size in MB
  #            OSD_JOURNAL_UUID (read) - GUID assigned to the new partition
  #            OSD_JOURNAL (written)   - set to the partition path from dev_part
  # Exits:     with status 1 when the backing device is not a block device.
  local osd_journal="${1}"
  # The partition number is every digit in the path and the backing device is
  # the path with every digit removed.
  # NOTE: this only holds for sdX-style names; paths with digits in the middle
  # (e.g. nvme0n1p1) are not handled by this derivation.
  local osd_journal_partition="${osd_journal//[^0-9]/}"
  local jdev="${osd_journal//[0-9]/}"
  if [ -b "${jdev}" ]; then
    # --change-name must be double-quoted so the partition number is expanded;
    # 45b0969e-... is the type GUID used to tag the partition as a ceph journal.
    sgdisk --new="${osd_journal_partition}:0:+${OSD_JOURNAL_SIZE}M" \
      --change-name="${osd_journal_partition}:ceph journal" \
      --partition-guid="${osd_journal_partition}:${OSD_JOURNAL_UUID}" \
      --typecode="${osd_journal_partition}:45b0969e-9b03-4f30-b4c6-b4b80ceff106" --mbrtogpt -- "${jdev}"
    OSD_JOURNAL=$(dev_part "${jdev}" "${osd_journal_partition}")
  else
    echo "The backing device ${jdev} for ${osd_journal} does not exist on this system."
    exit 1
  fi
}
# Resolve the filestore journal target and add --filestore to CLI_OPTS.
# Globals read:    OSD_JOURNAL, JOURNAL_TYPE, OSD_DEVICE, CLI_OPTS
# Globals written: OSD_JOURNAL, OSD_JOURNAL_PARTITION, CLI_OPTS
# Calls dev_part and osd_journal_create.
function osd_journal_prepare {
if [ -n "${OSD_JOURNAL}" ]; then
if [ -b ${OSD_JOURNAL} ]; then
# The journal path is an existing block device: canonicalise it, then split
# it into "all digits in the path" (partition number) and "path with all
# digits removed" (backing device).
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
OSD_JOURNAL_PARTITION=$(echo ${OSD_JOURNAL} | sed 's/[^0-9]//g')
local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
# NOTE(review): dev_part is only called when NO partition number was found,
# and the else branch is a self-assignment. This test looks inverted
# (-z vs -n) -- confirm against dev_part before changing it.
if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
OSD_JOURNAL=$(dev_part ${jdev} ${OSD_JOURNAL_PARTITION})
else
OSD_JOURNAL=${OSD_JOURNAL}
fi
elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
# OSD_JOURNAL is set but is not a block device and the journal is not a directory: resolve the path and hand it to osd_journal_create.
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
osd_journal_create ${OSD_JOURNAL}
fi
# Runs for every non-empty OSD_JOURNAL, including directory-type journals.
chown ceph. ${OSD_JOURNAL}
elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}"
echo "For better performance on HDD, consider moving your journal to a separate device"
fi
# Appended unconditionally, whichever branch ran above.
CLI_OPTS="${CLI_OPTS} --filestore"
} }
if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then

View File

@ -165,10 +165,17 @@ conf:
location: /var/lib/openstack-helm/ceph/osd/journal-one location: /var/lib/openstack-helm/ceph/osd/journal-one
# - data: # - data:
# type: block-logical # type: block-logical
# location: /dev/sdd
# journal:
# type: block-logical
# location: /dev/sdf1
# - data:
# type: block-logical
# location: /dev/sde # location: /dev/sde
# journal: # journal:
# type: block-logical # type: block-logical
# location: /dev/sdf # location: /dev/sdf2
# - data: # - data:
# type: block-logical # type: block-logical
# location: /dev/sdg # location: /dev/sdg