[ceph-osd] Allow ceph-volume to deploy OSDs on dirty disks

Currently there are conditions that can prevent Bluestore OSDs
from deploying correctly if the disk used was previously deployed
as an OSD in another Ceph cluster. This change fixes the
ceph-volume OSD init script so it can handle these situations
correctly if OSD_FORCE_REPAIR is set.

Additionally, there is a race condition that may occur which
causes logical volumes to not get tagged with all of the
necessary metadata for OSDs to function. This change fixes
that issue as well.

Change-Id: I869ba97d2224081c99ed1728b1aaa1b893d47c87
This commit is contained in:
Brian Wickersham 2020-03-09 15:53:05 -06:00 committed by chinasubbareddy mallavarapu
parent 0a35fd827e
commit 567a7c6c1e
3 changed files with 99 additions and 59 deletions

View File

@ -73,6 +73,13 @@ function ceph_cmd_retry() {
done
}
#######################################
# Run a command while holding an exclusive flock(1) lock on the OSD init
# lock file, so callers that open the same file run one at a time.
# Globals:
#   OSD_INIT_LOCK_FILE (read, optional) - lock file path; defaults to
#     /var/lib/ceph/tmp/init-osd.lock
# Arguments:
#   "$@" - the command to run and its arguments
# Outputs:
#   Only the wrapped command's stdout; flock diagnostics go to stderr.
# Returns:
#   The wrapped command's exit status. Exits the shell with 1 if the lock
#   file cannot be opened.
#######################################
function locked() {
  local lock_file="${OSD_INIT_LOCK_FILE:-/var/lib/ceph/tmp/init-osd.lock}"
  local lock_fd
  local rc=0

  exec {lock_fd}>"${lock_file}" || exit 1
  # flock --verbose prints its timing report on stdout. Callers capture or
  # pipe the wrapped command's stdout, so send that report to stderr.
  # As before, a lock timeout does not stop the command from running; flock
  # reports the reason on stderr.
  flock -w 600 --verbose "${lock_fd}" >&2
  # Remember the command's status so it is not replaced by the unlock status.
  "$@" || rc=$?
  flock -u "${lock_fd}" >&2
  # Close the descriptor; otherwise every call leaves one more open.
  exec {lock_fd}>&-
  return "${rc}"
}
function crush_create_or_move {
local crush_location=${1}
ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
@ -206,23 +213,29 @@ function zap_extra_partitions {
# Clear a block device of LVM state, signatures and partition tables.
# Arguments: $1 - path of the device to clear (kept in the local `device`).
# NOTE(review): this span is taken from a diff whose +/- markers were stripped,
# so lines from two revisions of the function appear interleaved: the `if`
# statements that test pvdisplay/lvdisplay output below have no matching `fi`,
# and some statements use ${OSD_DEVICE} while others use ${device}. Read the
# comments per statement; the span is not runnable as shown.
function disk_zap {
# Run all the commands that ceph-disk zap uses to clear a disk
local device=${1}
# First column of the lsblk rows for ${device} that contain both "lvm" and "ceph".
local osd_device_lvm=$(lsblk ${device} -o name,type -l | grep "lvm" | grep "ceph"| awk '{print $1}')
if [[ ! -z ${osd_device_lvm} ]]; then
dmsetup remove ${osd_device_lvm}
fi
# True when pvdisplay reports a "VG Name" containing "ceph" for ${OSD_DEVICE}.
if [[ $(pvdisplay ${OSD_DEVICE} | grep "VG Name" | awk '{print $3}' | grep "ceph") ]]; then
local LOCAL_VG=$(pvdisplay ${OSD_DEVICE} | grep "VG Name" | awk '{print $3}' | grep "ceph")
# Remove the "LV Path" entries that lvdisplay prints on lines matching ${LOCAL_VG},
# answering lvremove's prompt with "y".
if [[ $(lvdisplay | grep ${LOCAL_VG} | grep "LV Path" | awk '{print $3}') ]]; then
echo "y" | lvremove $(lvdisplay | grep ${LOCAL_VG} | grep "LV Path" | awk '{print $3}')
# Third '/'-separated field of the device path (e.g. "sdb" for /dev/sdb).
local device_filter=$(echo $device | cut -d'/' -f3)
# Space-separated names from lsblk rows that contain "lvm", ${device_filter} and "ceph".
local dm_devices=$(lsblk -o name,type -l | grep "lvm" | grep "$device_filter" | awk '/ceph/{print $1}' | tr '\n' ' ')
for dm_device in ${dm_devices}; do
if [[ ! -z ${dm_device} ]]; then
dmsetup remove ${dm_device}
fi
vgremove ${LOCAL_VG}
pvremove ${OSD_DEVICE}
done
# Space-separated "LV Path" values from lvdisplay lines that contain
# ${device_filter} and "ceph"; lvdisplay and lvremove are run through `locked`.
local logical_volumes=$(locked lvdisplay | grep "LV Path" | grep "$device_filter" | awk '/ceph/{print $3}' | tr '\n' ' ')
for logical_volume in ${logical_volumes}; do
if [[ ! -z ${logical_volume} ]]; then
locked lvremove -y ${logical_volume}
fi
done
# "VG Name" that pvdisplay reports for ${device}, kept only if it contains "ceph".
local volume_group=$(pvdisplay ${device} | grep "VG Name" | awk '/ceph/{print $3}' | grep "ceph")
if [[ ${volume_group} ]]; then
# Remove the volume group and physical volume, then run ceph-volume's own zap.
vgremove ${volume_group}
pvremove ${device}
ceph-volume lvm zap ${device} --destroy
fi
# Erase filesystem/RAID signatures, then the partition tables.
wipefs --all ${device}
sgdisk --zap-all -- ${device}
# Wipe the first 200MB boundary, as Bluestore redeployments will not work otherwise
dd if=/dev/zero of=${device} bs=1M count=200
sgdisk --zap-all -- ${device}
}
function udev_settle {
@ -231,11 +244,23 @@ function udev_settle {
if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then
if [ ! -z "$BLOCK_DB" ]; then
osd_devices="${osd_devices}\|${BLOCK_DB}"
partprobe "${BLOCK_DB}"
# BLOCK_DB could be a physical or logical device here
local block_db="$BLOCK_DB"
local db_vg="$(echo $block_db | cut -d'/' -f1)"
if [ ! -z "$db_vg" ]; then
block_db=$(locked pvdisplay | grep -B1 "$db_vg" | awk '/PV Name/{print $3}')
fi
locked partprobe "${block_db}"
fi
if [ ! -z "$BLOCK_WAL" ] && [ "$BLOCK_WAL" != "$BLOCK_DB" ]; then
osd_devices="${osd_devices}\|${BLOCK_WAL}"
partprobe "${BLOCK_WAL}"
# BLOCK_WAL could be a physical or logical device here
local block_wal="$BLOCK_WAL"
local wal_vg="$(echo $block_wal | cut -d'/' -f1)"
if [ ! -z "$wal_vg" ]; then
block_wal=$(locked pvdisplay | grep -B1 "$wal_vg" | awk '/PV Name/{print $3}')
fi
locked partprobe "${block_wal}"
fi
else
if [ "x$JOURNAL_TYPE" == "xblock-logical" ] && [ ! -z "$OSD_JOURNAL" ]; then
@ -243,7 +268,7 @@ function udev_settle {
if [ ! -z "$OSD_JOURNAL" ]; then
local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
osd_devices="${osd_devices}\|${JDEV}"
partprobe "${JDEV}"
locked partprobe "${JDEV}"
fi
fi
fi
@ -275,7 +300,7 @@ function get_lvm_tag_from_volume {
echo
else
# Get and return the specified tag from the logical volume
lvs -o lv_tags ${logical_volume} | tr ',' '\n' | grep ${tag} | cut -d'=' -f2
locked lvs -o lv_tags ${logical_volume} | tr ',' '\n' | grep ${tag} | cut -d'=' -f2
fi
}
@ -284,7 +309,7 @@ function get_lvm_tag_from_device {
device="$1"
tag="$2"
# Attempt to get a logical volume for the physical device
logical_volume="$(pvdisplay -m ${device} | awk '/Logical volume/{print $3}')"
logical_volume="$(locked pvdisplay -m ${device} | awk '/Logical volume/{print $3}')"
# Use get_lvm_tag_from_volume to get the specified tag from the logical volume
get_lvm_tag_from_volume ${logical_volume} ${tag}

View File

@ -61,6 +61,10 @@ function osd_disk_prepare {
osd_dev_string=$(echo ${OSD_DEVICE} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-')
udev_settle
OSD_ID=$(get_osd_id_from_device ${OSD_DEVICE})
OSD_FSID=$(get_cluster_fsid_from_device ${OSD_DEVICE})
CLUSTER_FSID=$(ceph-conf --lookup fsid)
DISK_ZAPPED=0
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
if [[ ! -z ${OSD_ID} ]]; then
DM_NUM=$(dmsetup ls | grep $(lsblk -J ${OSD_DEVICE} | jq -r '.blockdevices[].children[].name') | awk '{print $2}' | cut -d':' -f2 | cut -d')' -f1)
@ -72,6 +76,7 @@ function osd_disk_prepare {
if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
disk_zap ${OSD_DEVICE}
DISK_ZAPPED=1
else
echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
echo "It would be too dangerous to destroy it without any notification."
@ -80,12 +85,21 @@ function osd_disk_prepare {
fi
fi
else
if [[ ! -z ${OSD_ID} ]]; then
if ceph --name client.bootstrap-osd --keyring $OSD_BOOTSTRAP_KEYRING osd ls |grep -w ${OSD_ID}; then
echo "Running bluestore mode and ${OSD_DEVICE} already bootstrapped"
if [[ ! -z "${OSD_FSID}" ]]; then
if [[ "${OSD_FSID}" == "${CLUSTER_FSID}" ]]; then
if [[ ! -z "${OSD_ID}" ]]; then
if ceph --name client.bootstrap-osd --keyring $OSD_BOOTSTRAP_KEYRING osd ls |grep -w ${OSD_ID}; then
echo "Running bluestore mode and ${OSD_DEVICE} already bootstrapped"
elif [[ $OSD_FORCE_REPAIR -eq 1 ]]; then
echo "OSD initialized for this cluster, but OSD ID not found in the cluster, reinitializing"
else
echo "OSD initialized for this cluster, but OSD ID not found in the cluster"
fi
fi
else
echo "found the wrong osd id which does not belong to current ceph cluster"
exit 1
echo "OSD initialized for a different cluster, zapping it"
disk_zap ${OSD_DEVICE}
udev_settle
fi
elif [[ $(sgdisk --print ${OSD_DEVICE} | grep "F800") ]]; then
DM_DEV=${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}')
@ -96,12 +110,11 @@ function osd_disk_prepare {
CEPH_DISK_USED=1
fi
if [[ ${OSD_FORCE_REPAIR} -eq 1 ]] && [ ${CEPH_DISK_USED} -ne 1 ]; then
echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
echo "${OSD_DEVICE} isn't clean, zapping it because OSD_FORCE_REPAIR is enabled"
disk_zap ${OSD_DEVICE}
else
echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
echo "It would be too dangerous to destroy it without any notification."
echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk."
echo "${OSD_DEVICE} isn't clean, but OSD_FORCE_REPAIR isn't enabled."
echo "Please set OSD_FORCE_REPAIR to '1' if you want to zap this disk."
exit 1
fi
fi
@ -189,12 +202,10 @@ function osd_disk_prepare {
if [[ ${BLOCK_WAL} ]]; then
block_wal_string=$(echo ${BLOCK_WAL} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-')
fi
exec {lock_fd}>/var/lib/ceph/tmp/init-osd.lock || exit 1
flock -w 600 --verbose "${lock_fd}"
if [[ ${BLOCK_DB} && ${BLOCK_WAL} ]]; then
if [[ ${block_db_string} == ${block_wal_string} ]]; then
if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
WAL_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-wal-${block_wal_string}/ceph-wal-${osd_dev_string})
DB_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-wal-${block_db_string}/ceph-db-${osd_dev_string})
if [ ! -z ${OSD_ID} ] && ([ ${WAL_OSD_ID} != ${OSD_ID} ] || [ ${DB_OSD_ID} != ${OSD_ID} ]); then
@ -220,22 +231,22 @@ function osd_disk_prepare {
disk_zap ${OSD_DEVICE}
CEPH_LVM_PREPARE=1
fi
vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB}
locked vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB}
VG=ceph-db-wal-${block_db_string}
fi
if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then
lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG}
if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then
locked lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG}
fi
BLOCK_DB=${VG}/ceph-db-${osd_dev_string}
if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then
lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG}
if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then
locked lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG}
fi
BLOCK_WAL=${VG}/ceph-wal-${osd_dev_string}
else
if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
DB_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-wal-${block_db_string}/ceph-db-${block_db_string})
if [ ! -z ${OSD_ID} ] && [ ${DB_OSD_ID} != ${OSD_ID} ]; then
if [ ! -z ${OSD_ID} ] && [ ! -z ${DB_OSD_ID} ] && [ ${DB_OSD_ID} != ${OSD_ID} ]; then
echo "Found VG, but corresponding DB is not, zapping the ${OSD_DEVICE}"
disk_zap ${OSD_DEVICE}
CEPH_LVM_PREPARE=1
@ -255,11 +266,11 @@ function osd_disk_prepare {
disk_zap ${OSD_DEVICE}
CEPH_LVM_PREPARE=1
fi
vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB}
locked vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB}
VG=ceph-db-wal-${block_db_string}
fi
if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then
VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}")
if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then
VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}")
WAL_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-wal-${block_wal_string}/ceph-wal-${block_wal_string})
if [ ! -z ${OSD_ID} ] && [ ${WAL_OSD_ID} != ${OSD_ID} ]; then
echo "Found VG, but corresponding WAL is not, zapping the ${OSD_DEVICE}"
@ -281,21 +292,21 @@ function osd_disk_prepare {
disk_zap ${OSD_DEVICE}
CEPH_LVM_PREPARE=1
fi
vgcreate ceph-db-wal-${block_wal_string} ${BLOCK_WAL}
locked vgcreate ceph-db-wal-${block_wal_string} ${BLOCK_WAL}
VG=ceph-db-wal-${block_wal_string}
fi
if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${block_db_string}") != "ceph-db-${block_db_string}" ]]; then
lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${block_db_string} ${VG}
if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${block_db_string}") != "ceph-db-${block_db_string}" ]]; then
locked lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${block_db_string} ${VG}
fi
BLOCK_DB=${VG}/ceph-db-${block_db_string}
if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${block_wal_string}") != "ceph-db-${block_wal_string}" ]]; then
lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${block_wal_string} ${VG}
if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${block_wal_string}") != "ceph-db-${block_wal_string}" ]]; then
locked lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${block_wal_string} ${VG}
fi
BLOCK_WAL=${VG}/ceph-wal-${block_wal_string}
fi
elif [[ -z ${BLOCK_DB} && ${BLOCK_WAL} ]]; then
if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then
VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}")
if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then
VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}")
WAL_OSD_ID=$(get_osd_id_from_volume /dev/ceph-wal-${block_wal_string}/ceph-wal-${osd_dev_string})
if [ ! -z ${OSD_ID} ] && [ ${WAL_OSD_ID} != ${OSD_ID} ]; then
echo "Found VG, but corresponding WAL is not, zapping the ${OSD_DEVICE}"
@ -317,16 +328,16 @@ function osd_disk_prepare {
disk_zap ${OSD_DEVICE}
CEPH_LVM_PREPARE=1
fi
vgcreate ceph-wal-${block_wal_string} ${BLOCK_WAL}
locked vgcreate ceph-wal-${block_wal_string} ${BLOCK_WAL}
VG=ceph-wal-${block_wal_string}
fi
if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then
lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG}
if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then
locked lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG}
fi
BLOCK_WAL=${VG}/ceph-wal-${osd_dev_string}
elif [[ ${BLOCK_DB} && -z ${BLOCK_WAL} ]]; then
if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
DB_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-${block_db_string}/ceph-db-${osd_dev_string})
if [ ! -z ${OSD_ID} ] && [ ${DB_OSD_ID} != ${OSD_ID} ]; then
echo "Found VG, but corresponding DB is not, zapping the ${OSD_DEVICE}"
@ -348,15 +359,14 @@ function osd_disk_prepare {
disk_zap ${OSD_DEVICE}
CEPH_LVM_PREPARE=1
fi
vgcreate ceph-db-${block_db_string} ${BLOCK_DB}
locked vgcreate ceph-db-${block_db_string} ${BLOCK_DB}
VG=ceph-db-${block_db_string}
fi
if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then
lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG}
if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then
locked lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG}
fi
BLOCK_DB=${VG}/ceph-db-${osd_dev_string}
fi
flock -u "${lock_fd}"
if [ -z ${BLOCK_DB} ] && [ -z ${BLOCK_WAL} ]; then
if pvdisplay ${OSD_DEVICE} | grep "VG Name" | awk '{print $3}' | grep "ceph"; then
CEPH_LVM_PREPARE=0
@ -392,19 +402,21 @@ function osd_disk_prepare {
if [[ ${CEPH_DISK_USED} -eq 1 ]]; then
CLI_OPTS="${CLI_OPTS} --data ${OSD_DEVICE}"
ceph-volume simple scan --force ${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}')
elif [[ ${CEPH_LVM_PREPARE} == 1 ]]; then
elif [[ ${CEPH_LVM_PREPARE} -eq 1 ]] || [[ ${DISK_ZAPPED} -eq 1 ]]; then
udev_settle
if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "ceph-vg-${osd_dev_string}") ]]; then
OSD_VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "ceph-vg-${osd_dev_string}")
else
vgcreate ceph-vg-${osd_dev_string} ${OSD_DEVICE}
OSD_VG=ceph-vg-${osd_dev_string}
fi
if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-lv-${osd_dev_string}") != "ceph-lv-${osd_dev_string}" ]]; then
if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-lv-${osd_dev_string}") != "ceph-lv-${osd_dev_string}" ]]; then
lvcreate --yes -l 100%FREE -n ceph-lv-${osd_dev_string} ${OSD_VG}
fi
OSD_LV=${OSD_VG}/ceph-lv-${osd_dev_string}
CLI_OPTS="${CLI_OPTS} --data ${OSD_LV}"
ceph-volume lvm -v prepare ${CLI_OPTS}
locked ceph-volume lvm -v prepare ${CLI_OPTS}
udev_settle
fi
}

View File

@ -383,6 +383,9 @@ spec:
- name: pod-var-lib-ceph
mountPath: /var/lib/ceph
readOnly: false
- name: pod-var-lib-ceph-tmp
mountPath: /var/lib/ceph/tmp
readOnly: false
- name: run-lvm
mountPath: /run/lvm
readOnly: false