Merge "[CEPH] Journal automation and disk cleanup updates"
This commit is contained in:
commit
f0f1b57b3c
@ -78,6 +78,10 @@ if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
|
|||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
OSD_JOURNAL="${OSD_JOURNAL_DISK}"
|
OSD_JOURNAL="${OSD_JOURNAL_DISK}"
|
||||||
|
if [ -e "${OSD_PATH}/run_mkjournal" ]; then
|
||||||
|
ceph-osd -i ${OSD_ID} --mkjournal
|
||||||
|
rm -rf ${OSD_PATH}/run_mkjournal
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then
|
if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then
|
||||||
|
@ -23,6 +23,7 @@ set -ex
|
|||||||
: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}"
|
: "${CEPH_CONF:="/etc/ceph/${CLUSTER}.conf"}"
|
||||||
: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}"
|
: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}"
|
||||||
: "${OSD_JOURNAL_UUID:=$(uuidgen)}"
|
: "${OSD_JOURNAL_UUID:=$(uuidgen)}"
|
||||||
|
: "${OSD_JOURNAL_SIZE:=$(awk '/^osd_journal_size/{print $3}' ${CEPH_CONF}.template)}"
|
||||||
|
|
||||||
eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))')
|
eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))')
|
||||||
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
|
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
|
||||||
@ -142,6 +143,16 @@ function dev_part {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function disk_zap {
  # Run all the commands that ceph-disk zap uses to clear a disk.
  #
  # Arguments: $1 - path of the device to wipe. DESTRUCTIVE: signatures,
  #                 the first 200MB and the partition tables are erased.
  # Returns:   non-zero (without touching anything) when no device is given.
  local device="${1}"

  # Refuse to run the wipe commands without a target operand.
  if [ -z "${device}" ]; then
    echo "disk_zap: no device specified" >&2
    return 1
  fi

  wipefs --all "${device}"
  # Wipe the first 200MB boundary, as Bluestore redeployments will not work otherwise
  dd if=/dev/zero of="${device}" bs=1M count=200
  sgdisk --zap-all -- "${device}"
  sgdisk --clear --mbrtogpt -- "${device}"
}
|
||||||
|
|
||||||
function osd_pg_interval_fix {
|
function osd_pg_interval_fix {
|
||||||
# NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running
|
# NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running
|
||||||
if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then
|
if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then
|
||||||
@ -154,7 +165,9 @@ function osd_pg_interval_fix {
|
|||||||
function udev_settle {
|
function udev_settle {
|
||||||
partprobe "${OSD_DEVICE}"
|
partprobe "${OSD_DEVICE}"
|
||||||
if [ "x$JOURNAL_TYPE" == "xblock-logical" ]; then
|
if [ "x$JOURNAL_TYPE" == "xblock-logical" ]; then
|
||||||
partprobe "${OSD_JOURNAL}"
|
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
|
||||||
|
local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
|
||||||
|
partprobe "${JDEV}"
|
||||||
fi
|
fi
|
||||||
# watch the udev event queue, and exit if all current events are handled
|
# watch the udev event queue, and exit if all current events are handled
|
||||||
udevadm settle --timeout=600
|
udevadm settle --timeout=600
|
||||||
|
@ -20,7 +20,7 @@ set -ex
|
|||||||
|
|
||||||
source /tmp/osd-common.sh
|
source /tmp/osd-common.sh
|
||||||
|
|
||||||
: "${OSD_FORCE_ZAP:=1}"
|
: "${OSD_FORCE_REPAIR:=1}"
|
||||||
# We do not want to zap journal disk. Tracking this option separately.
|
# We do not want to zap journal disk. Tracking this option separately.
|
||||||
: "${JOURNAL_FORCE_ZAP:=0}"
|
: "${JOURNAL_FORCE_ZAP:=0}"
|
||||||
|
|
||||||
@ -55,62 +55,91 @@ function osd_disk_prepare {
|
|||||||
|
|
||||||
# check device status first
|
# check device status first
|
||||||
if ! parted --script ${OSD_DEVICE} print > /dev/null 2>&1; then
|
if ! parted --script ${OSD_DEVICE} print > /dev/null 2>&1; then
|
||||||
if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then
|
if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
|
||||||
echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_ZAP is enabled so we are zapping the device anyway"
|
echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
|
||||||
sgdisk -Z ${OSD_DEVICE}
|
disk_zap ${OSD_DEVICE}
|
||||||
else
|
else
|
||||||
echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
|
echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
|
||||||
echo "It would be too dangerous to destroy it without any notification."
|
echo "It would be too dangerous to destroy it without any notification."
|
||||||
echo "Please set OSD_FORCE_ZAP to '1' if you really want to zap this disk."
|
echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
udev_settle
|
|
||||||
|
|
||||||
# then search for some ceph metadata on the disk
|
# then search for some ceph metadata on the disk
|
||||||
if [[ "$(parted --script ${OSD_DEVICE} print | egrep '^ 1.*ceph data')" ]]; then
|
if [[ "$(parted --script ${OSD_DEVICE} print | egrep '^ 1.*ceph data')" ]]; then
|
||||||
if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then
|
if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
|
||||||
if [ -b "${OSD_DEVICE}1" ]; then
|
if [ -b "${OSD_DEVICE}1" ]; then
|
||||||
local cephFSID=$(ceph-conf --lookup fsid)
|
local cephFSID=$(ceph-conf --lookup fsid)
|
||||||
if [ ! -z "${cephFSID}" ]; then
|
if [ ! -z "${cephFSID}" ]; then
|
||||||
local tmpmnt=$(mktemp -d)
|
local tmpmnt=$(mktemp -d)
|
||||||
mount ${OSD_DEVICE}1 ${tmpmnt}
|
mount ${OSD_DEVICE}1 ${tmpmnt}
|
||||||
|
if [ "${OSD_BLUESTORE:-0}" -ne 1 ] && [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
|
||||||
|
# we only care about journals for filestore.
|
||||||
|
if [ -f "${tmpmnt}/whoami" ]; then
|
||||||
|
OSD_JOURNAL_DISK=$(readlink -f "${tmpmnt}/journal")
|
||||||
|
local osd_id=$(cat "${tmpmnt}/whoami")
|
||||||
|
if [ ! -b "${OSD_JOURNAL_DISK}" ]; then
|
||||||
|
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
|
||||||
|
local jdev=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
|
||||||
|
if [ ${jdev} == ${OSD_JOURNAL} ]; then
|
||||||
|
echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL}."
|
||||||
|
echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
|
||||||
|
rm -rf ${tmpmnt}/ceph_fsid
|
||||||
|
else
|
||||||
|
echo "It appears that ${OSD_DEVICE} is missing the journal at ${OSD_JOURNAL_DISK}."
|
||||||
|
echo "Because OSD_FORCE_REPAIR is set and paritions are manually defined, we will"
|
||||||
|
echo "attempt to recreate the missing journal device partitions."
|
||||||
|
osd_journal_create ${OSD_JOURNAL}
|
||||||
|
ln -sf /dev/disk/by-partuuid/${OSD_JOURNAL_UUID} ${tmpmnt}/journal
|
||||||
|
echo ${OSD_JOURNAL_UUID} | tee ${tmpmnt}/journal_uuid
|
||||||
|
chown ceph. ${OSD_JOURNAL}
|
||||||
|
# During OSD start we will format the journal and set the fsid
|
||||||
|
touch ${tmpmnt}/run_mkjournal
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "It looks like ${OSD_DEVICE} has a ceph data partition but is missing it's metadata."
|
||||||
|
echo "The device may contain inconsistent metadata or be corrupted."
|
||||||
|
echo "Because OSD_FORCE_REPAIR is set, we will wipe the metadata of the OSD and zap it."
|
||||||
|
rm -rf ${tmpmnt}/ceph_fsid
|
||||||
|
fi
|
||||||
|
fi
|
||||||
if [ -f "${tmpmnt}/ceph_fsid" ]; then
|
if [ -f "${tmpmnt}/ceph_fsid" ]; then
|
||||||
osdFSID=$(cat "${tmpmnt}/ceph_fsid")
|
osdFSID=$(cat "${tmpmnt}/ceph_fsid")
|
||||||
umount ${tmpmnt}
|
umount ${tmpmnt}
|
||||||
if [ ${osdFSID} != ${cephFSID} ]; then
|
if [ ${osdFSID} != ${cephFSID} ]; then
|
||||||
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
|
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
|
||||||
echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
|
echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
|
||||||
echo "Because OSD_FORCE_ZAP was set, we will zap this device."
|
echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
|
||||||
sgdisk -Z ${OSD_DEVICE}
|
disk_zap ${OSD_DEVICE}
|
||||||
else
|
else
|
||||||
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
|
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
|
||||||
echo "OSD_FORCE_ZAP is set, but will be ignored and the device will not be zapped."
|
echo "OSD_FORCE_REPAIR is set, but will be ignored and the device will not be zapped."
|
||||||
echo "Moving on, trying to activate the OSD now."
|
echo "Moving on, trying to activate the OSD now."
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
umount ${tmpmnt}
|
umount ${tmpmnt}
|
||||||
echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
|
echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
|
||||||
echo "Because OSD_FORCE_ZAP was set, we will zap this device."
|
echo "Because OSD_FORCE_REPAIR was set, we will zap this device."
|
||||||
sgdisk -Z ${OSD_DEVICE}
|
disk_zap ${OSD_DEVICE}
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "Unable to determine the FSID of the current cluster."
|
echo "Unable to determine the FSID of the current cluster."
|
||||||
echo "OSD_FORCE_ZAP is set, but this OSD will not be zapped."
|
echo "OSD_FORCE_REPAIR is set, but this OSD will not be zapped."
|
||||||
echo "Moving on, trying to activate the OSD now."
|
echo "Moving on, trying to activate the OSD now."
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "parted says ${OSD_DEVICE}1 should exist, but we do not see it."
|
echo "parted says ${OSD_DEVICE}1 should exist, but we do not see it."
|
||||||
echo "We will ignore OSD_FORCE_ZAP and try to use the device as-is"
|
echo "We will ignore OSD_FORCE_REPAIR and try to use the device as-is"
|
||||||
echo "Moving on, trying to activate the OSD now."
|
echo "Moving on, trying to activate the OSD now."
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "INFO- It looks like ${OSD_DEVICE} is an OSD, set OSD_FORCE_ZAP=1 to use this device anyway and zap its content"
|
echo "INFO- It looks like ${OSD_DEVICE} is an OSD, set OSD_FORCE_REPAIR=1 to use this device anyway and zap its content"
|
||||||
echo "You can also use the zap_device scenario on the appropriate device to zap it"
|
echo "You can also use the disk_zap scenario on the appropriate device to zap it"
|
||||||
echo "Moving on, trying to activate the OSD now."
|
echo "Moving on, trying to activate the OSD now."
|
||||||
return
|
return
|
||||||
fi
|
fi
|
||||||
@ -118,54 +147,60 @@ function osd_disk_prepare {
|
|||||||
|
|
||||||
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
|
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
|
||||||
# we only care about journals for filestore.
|
# we only care about journals for filestore.
|
||||||
if [ -n "${OSD_JOURNAL}" ]; then
|
osd_journal_prepare
|
||||||
if [ -b $OSD_JOURNAL ]; then
|
|
||||||
OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL})
|
|
||||||
OSD_JOURNAL_PARTITION=$(echo $OSD_JOURNAL_PARTITION | sed 's/[^0-9]//g')
|
|
||||||
if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
|
|
||||||
# maybe they specified the journal as a /dev path like '/dev/sdc12':
|
|
||||||
local JDEV=$(echo ${OSD_JOURNAL} | sed 's/\(.*[^0-9]\)[0-9]*$/\1/')
|
|
||||||
if [ -d /sys/block/$(basename ${JDEV})/$(basename ${OSD_JOURNAL}) ]; then
|
|
||||||
OSD_JOURNAL=$(dev_part ${JDEV} `echo ${OSD_JOURNAL} |\
|
|
||||||
sed 's/.*[^0-9]\([0-9]*\)$/\1/'`)
|
|
||||||
OSD_JOURNAL_PARTITION=${JDEV}
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
OSD_JOURNAL=$(dev_part ${OSD_JOURNAL} ${OSD_JOURNAL_PARTITION})
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
chown ceph. ${OSD_JOURNAL}
|
|
||||||
else
|
|
||||||
echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}"
|
|
||||||
echo "For better performance on HDD, consider moving your journal to a separate device"
|
|
||||||
fi
|
|
||||||
CLI_OPTS="${CLI_OPTS} --filestore"
|
|
||||||
else
|
else
|
||||||
OSD_JOURNAL=''
|
OSD_JOURNAL=''
|
||||||
CLI_OPTS="${CLI_OPTS} --bluestore"
|
CLI_OPTS="${CLI_OPTS} --bluestore"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -b "${OSD_JOURNAL}" -a "${JOURNAL_FORCE_ZAP:-0}" -eq 1 ]; then
|
udev_settle
|
||||||
# if we got here and zap is set, it's ok to wipe the journal.
|
|
||||||
echo "OSD_FORCE_ZAP is set, so we will erase the journal device ${OSD_JOURNAL}"
|
|
||||||
if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
|
|
||||||
# it's a raw block device. nuke any existing partition table.
|
|
||||||
sgdisk -Z ${OSD_JOURNAL}
|
|
||||||
else
|
|
||||||
# we are likely working on a partition. Just make a filesystem on
|
|
||||||
# the device, as other partitions may be in use so nuking the whole
|
|
||||||
# disk isn't safe.
|
|
||||||
wipefs ${OSD_JOURNAL}
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then
|
if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then
|
||||||
export OSD_JOURNAL="--journal-file"
|
ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} --journal-file
|
||||||
|
else
|
||||||
|
ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL}
|
}
|
||||||
|
|
||||||
udev_settle
|
function osd_journal_create {
  # Create a journal partition on the backing device of the given path and
  # point OSD_JOURNAL at the result.
  #
  # Arguments: $1 - journal partition path, e.g. /dev/sdf1
  # Globals:   OSD_JOURNAL_SIZE (read)  - partition size in MB
  #            OSD_JOURNAL_UUID (read)  - partition GUID to assign
  #            OSD_JOURNAL      (read for the error message, written on success)
  # Exits 1 when the backing device is not a block device.
  local osd_journal="${1}"
  # Partition number = every digit in the path; backing device = the path
  # with every digit removed (e.g. /dev/sdf1 -> "1" and /dev/sdf).
  # NOTE(review): device names that embed digits (e.g. nvme0n1p1) will not
  # split correctly with this rule - confirm whether that matters here.
  local osd_journal_partition="${osd_journal//[^0-9]/}"
  local jdev="${osd_journal//[0-9]/}"

  if [ -b "${jdev}" ]; then
    # --change-name must be double-quoted so the partition number is
    # actually expanded; single quotes would hand sgdisk the literal text
    # '${osd_journal_partition}:ceph journal'.
    sgdisk --new="${osd_journal_partition}:0:+${OSD_JOURNAL_SIZE}M" \
      --change-name="${osd_journal_partition}:ceph journal" \
      --partition-guid="${osd_journal_partition}:${OSD_JOURNAL_UUID}" \
      --typecode="${osd_journal_partition}:45b0969e-9b03-4f30-b4c6-b4b80ceff106" --mbrtogpt -- "${jdev}"
    OSD_JOURNAL=$(dev_part "${jdev}" "${osd_journal_partition}")
  else
    echo "The backing device ${jdev} for ${OSD_JOURNAL} does not exist on this system."
    exit 1
  fi
}
|
||||||
|
|
||||||
|
function osd_journal_prepare {
  # Resolve the filestore journal named by OSD_JOURNAL, creating the journal
  # partition when the path is not a block device yet, and append
  # --filestore to CLI_OPTS.
  #
  # Globals: OSD_JOURNAL           (read/written)
  #          OSD_JOURNAL_PARTITION (written when OSD_JOURNAL is a block device)
  #          JOURNAL_TYPE, OSD_DEVICE (read)
  #          CLI_OPTS              (appended to)
  if [ -n "${OSD_JOURNAL}" ]; then
    if [ -b "${OSD_JOURNAL}" ]; then
      OSD_JOURNAL=$(readlink -f "${OSD_JOURNAL}")
      # Partition number = every digit in the resolved path.
      OSD_JOURNAL_PARTITION="${OSD_JOURNAL//[^0-9]/}"
      local jdev="${OSD_JOURNAL//[0-9]/}"
      if [ -z "${OSD_JOURNAL_PARTITION}" ]; then
        # The path carries no partition number, so dev_part is called with
        # the bare device only.
        # NOTE(review): confirm dev_part's behavior without a partition
        # argument; its definition is not visible here.
        OSD_JOURNAL=$(dev_part "${jdev}")
      fi
    elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
      # The block device exists but doesn't appear to be partitioned, we will proceed with partitioning the device.
      OSD_JOURNAL=$(readlink -f "${OSD_JOURNAL}")
      osd_journal_create "${OSD_JOURNAL}"
    fi
    chown ceph. "${OSD_JOURNAL}"
  elif [ "x$JOURNAL_TYPE" != "xdirectory" ]; then
    echo "No journal device specified. OSD and journal will share ${OSD_DEVICE}"
    echo "For better performance on HDD, consider moving your journal to a separate device"
  fi
  CLI_OPTS="${CLI_OPTS} --filestore"
}
|
||||||
|
|
||||||
if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then
|
if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then
|
||||||
|
@ -165,10 +165,17 @@ conf:
|
|||||||
location: /var/lib/openstack-helm/ceph/osd/journal-one
|
location: /var/lib/openstack-helm/ceph/osd/journal-one
|
||||||
# - data:
|
# - data:
|
||||||
# type: block-logical
|
# type: block-logical
|
||||||
|
# location: /dev/sdd
|
||||||
|
# journal:
|
||||||
|
# type: block-logical
|
||||||
|
# location: /dev/sdf1
|
||||||
|
# - data:
|
||||||
|
# type: block-logical
|
||||||
# location: /dev/sde
|
# location: /dev/sde
|
||||||
# journal:
|
# journal:
|
||||||
# type: block-logical
|
# type: block-logical
|
||||||
# location: /dev/sdf
|
# location: /dev/sdf2
|
||||||
|
|
||||||
# - data:
|
# - data:
|
||||||
# type: block-logical
|
# type: block-logical
|
||||||
# location: /dev/sdg
|
# location: /dev/sdg
|
||||||
|
Loading…
x
Reference in New Issue
Block a user