From 567a7c6c1ea9630f83a859ab880f54ce39aff498 Mon Sep 17 00:00:00 2001
From: Brian Wickersham
Date: Mon, 9 Mar 2020 15:53:05 -0600
Subject: [PATCH] [ceph-osd] Allow ceph-volume to deploy OSDs on dirty disks

Currently there are conditions that can prevent Bluestore OSDs from
deploying correctly if the disk used was previously deployed as an OSD
in another Ceph cluster. This change fixes the ceph-volume OSD init
script so that it handles these situations correctly when
OSD_FORCE_REPAIR is set.

Additionally, there is a race condition that can leave logical volumes
missing some of the metadata tags OSDs need in order to function. This
change fixes that issue as well by serializing the relevant LVM and
disk commands under a file lock.

Change-Id: I869ba97d2224081c99ed1728b1aaa1b893d47c87
---
 .../bin/osd/ceph-volume/_common.sh.tpl        | 57 ++++++++---
 .../ceph-volume/_init-with-ceph-volume.sh.tpl | 98 +++++++++++--------
 ceph-osd/templates/daemonset-osd.yaml         |  3 +
 3 files changed, 99 insertions(+), 59 deletions(-)

diff --git a/ceph-osd/templates/bin/osd/ceph-volume/_common.sh.tpl b/ceph-osd/templates/bin/osd/ceph-volume/_common.sh.tpl
index 628e92b82..9ab63df5e 100644
--- a/ceph-osd/templates/bin/osd/ceph-volume/_common.sh.tpl
+++ b/ceph-osd/templates/bin/osd/ceph-volume/_common.sh.tpl
@@ -73,6 +73,13 @@ function ceph_cmd_retry() {
   done
 }
 
+function locked() {
+  exec {lock_fd}>/var/lib/ceph/tmp/init-osd.lock || exit 1
+  flock -w 600 --verbose "${lock_fd}"
+  "$@"
+  flock -u "${lock_fd}"
+}
+
 function crush_create_or_move {
   local crush_location=${1}
   ceph_cmd_retry --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
@@ -206,23 +213,29 @@ function zap_extra_partitions {
 function disk_zap {
   # Run all the commands that ceph-disk zap uses to clear a disk
   local device=${1}
-  local osd_device_lvm=$(lsblk ${device} -o name,type -l | grep "lvm" | grep "ceph"| awk '{print $1}')
-  if [[ ! -z ${osd_device_lvm} ]]; then
-    dmsetup remove ${osd_device_lvm}
-  fi
-  if [[ $(pvdisplay ${OSD_DEVICE} | grep "VG Name" | awk '{print $3}' | grep "ceph") ]]; then
-    local LOCAL_VG=$(pvdisplay ${OSD_DEVICE} | grep "VG Name" | awk '{print $3}' | grep "ceph")
-    if [[ $(lvdisplay | grep ${LOCAL_VG} | grep "LV Path" | awk '{print $3}') ]]; then
-      echo "y" | lvremove $(lvdisplay | grep ${LOCAL_VG} | grep "LV Path" | awk '{print $3}')
+  local device_filter=$(echo $device | cut -d'/' -f3)
+  local dm_devices=$(lsblk -o name,type -l | grep "lvm" | grep "$device_filter" | awk '/ceph/{print $1}' | tr '\n' ' ')
+  for dm_device in ${dm_devices}; do
+    if [[ ! -z ${dm_device} ]]; then
+      dmsetup remove ${dm_device}
     fi
-    vgremove ${LOCAL_VG}
-    pvremove ${OSD_DEVICE}
+  done
+  local logical_volumes=$(locked lvdisplay | grep "LV Path" | grep "$device_filter" | awk '/ceph/{print $3}' | tr '\n' ' ')
+  for logical_volume in ${logical_volumes}; do
+    if [[ ! -z ${logical_volume} ]]; then
+      locked lvremove -y ${logical_volume}
+    fi
+  done
+  local volume_group=$(pvdisplay ${device} | grep "VG Name" | awk '/ceph/{print $3}' | grep "ceph")
+  if [[ ${volume_group} ]]; then
+    vgremove ${volume_group}
+    pvremove ${device}
     ceph-volume lvm zap ${device} --destroy
   fi
   wipefs --all ${device}
+  sgdisk --zap-all -- ${device}
   # Wipe the first 200MB boundary, as Bluestore redeployments will not work otherwise
   dd if=/dev/zero of=${device} bs=1M count=200
-  sgdisk --zap-all -- ${device}
 }
 
 function udev_settle {
@@ -231,11 +244,23 @@ function udev_settle {
   if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then
     if [ ! -z "$BLOCK_DB" ]; then
       osd_devices="${osd_devices}\|${BLOCK_DB}"
-      partprobe "${BLOCK_DB}"
+      # BLOCK_DB could be a physical or logical device here
+      local block_db="$BLOCK_DB"
+      local db_vg="$(echo $block_db | cut -d'/' -f1)"
+      if [ ! -z "$db_vg" ]; then
+        block_db=$(locked pvdisplay | grep -B1 "$db_vg" | awk '/PV Name/{print $3}')
+      fi
+      locked partprobe "${block_db}"
     fi
     if [ ! -z "$BLOCK_WAL" ] && [ "$BLOCK_WAL" != "$BLOCK_DB" ]; then
       osd_devices="${osd_devices}\|${BLOCK_WAL}"
-      partprobe "${BLOCK_WAL}"
+      # BLOCK_WAL could be a physical or logical device here
+      local block_wal="$BLOCK_WAL"
+      local wal_vg="$(echo $block_wal | cut -d'/' -f1)"
+      if [ ! -z "$wal_vg" ]; then
+        block_wal=$(locked pvdisplay | grep -B1 "$wal_vg" | awk '/PV Name/{print $3}')
+      fi
+      locked partprobe "${block_wal}"
     fi
   else
     if [ "x$JOURNAL_TYPE" == "xblock-logical" ] && [ ! -z "$OSD_JOURNAL" ]; then
@@ -243,7 +268,7 @@ function udev_settle {
       if [ ! -z "$OSD_JOURNAL" ]; then
         local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g')
         osd_devices="${osd_devices}\|${JDEV}"
-        partprobe "${JDEV}"
+        locked partprobe "${JDEV}"
       fi
     fi
   fi
@@ -275,7 +300,7 @@ function get_lvm_tag_from_volume {
     echo
   else
     # Get and return the specified tag from the logical volume
-    lvs -o lv_tags ${logical_volume} | tr ',' '\n' | grep ${tag} | cut -d'=' -f2
+    locked lvs -o lv_tags ${logical_volume} | tr ',' '\n' | grep ${tag} | cut -d'=' -f2
   fi
 }
 
@@ -284,7 +309,7 @@ function get_lvm_tag_from_device {
   device="$1"
   tag="$2"
   # Attempt to get a logical volume for the physical device
-  logical_volume="$(pvdisplay -m ${device} | awk '/Logical volume/{print $3}')"
+  logical_volume="$(locked pvdisplay -m ${device} | awk '/Logical volume/{print $3}')"
 
   # Use get_lvm_tag_from_volume to get the specified tag from the logical volume
   get_lvm_tag_from_volume ${logical_volume} ${tag}
diff --git a/ceph-osd/templates/bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl b/ceph-osd/templates/bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl
index 19a8912ea..bb009c881 100644
--- a/ceph-osd/templates/bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl
+++ b/ceph-osd/templates/bin/osd/ceph-volume/_init-with-ceph-volume.sh.tpl
@@ -61,6 +61,10 @@ function osd_disk_prepare {
   osd_dev_string=$(echo ${OSD_DEVICE} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-')
   udev_settle
   OSD_ID=$(get_osd_id_from_device ${OSD_DEVICE})
+  OSD_FSID=$(get_cluster_fsid_from_device ${OSD_DEVICE})
+  CLUSTER_FSID=$(ceph-conf --lookup fsid)
+  DISK_ZAPPED=0
+
   if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
     if [[ ! -z ${OSD_ID} ]]; then
       DM_NUM=$(dmsetup ls | grep $(lsblk -J ${OSD_DEVICE} | jq -r '.blockdevices[].children[].name') | awk '{print $2}' | cut -d':' -f2 | cut -d')' -f1)
@@ -72,6 +76,7 @@ function osd_disk_prepare {
       if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then
         echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway"
         disk_zap ${OSD_DEVICE}
+        DISK_ZAPPED=1
       else
         echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird."
         echo "It would be too dangerous to destroy it without any notification."
@@ -80,12 +85,21 @@ function osd_disk_prepare {
       fi
     fi
   else
-    if [[ ! -z ${OSD_ID} ]]; then
-      if ceph --name client.bootstrap-osd --keyring $OSD_BOOTSTRAP_KEYRING osd ls |grep -w ${OSD_ID}; then
-        echo "Running bluestore mode and ${OSD_DEVICE} already bootstrapped"
+    if [[ ! -z "${OSD_FSID}" ]]; then
+      if [[ "${OSD_FSID}" == "${CLUSTER_FSID}" ]]; then
+        if [[ ! -z "${OSD_ID}" ]]; then
-z "${OSD_ID}" ]]; then + if ceph --name client.bootstrap-osd --keyring $OSD_BOOTSTRAP_KEYRING osd ls |grep -w ${OSD_ID}; then + echo "Running bluestore mode and ${OSD_DEVICE} already bootstrapped" + elif [[ $OSD_FORCE_REPAIR -eq 1 ]]; then + echo "OSD initialized for this cluster, but OSD ID not found in the cluster, reinitializing" + else + echo "OSD initialized for this cluster, but OSD ID not found in the cluster" + fi + fi else - echo "found the wrong osd id which does not belong to current ceph cluster" - exit 1 + echo "OSD initialized for a different cluster, zapping it" + disk_zap ${OSD_DEVICE} + udev_settle fi elif [[ $(sgdisk --print ${OSD_DEVICE} | grep "F800") ]]; then DM_DEV=${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}') @@ -96,12 +110,11 @@ function osd_disk_prepare { CEPH_DISK_USED=1 fi if [[ ${OSD_FORCE_REPAIR} -eq 1 ]] && [ ${CEPH_DISK_USED} -ne 1 ]; then - echo "It looks like ${OSD_DEVICE} isn't consistent, however OSD_FORCE_REPAIR is enabled so we are zapping the device anyway" + echo "${OSD_DEVICE} isn't clean, zapping it because OSD_FORCE_REPAIR is enabled" disk_zap ${OSD_DEVICE} else - echo "Regarding parted, device ${OSD_DEVICE} is inconsistent/broken/weird." - echo "It would be too dangerous to destroy it without any notification." - echo "Please set OSD_FORCE_REPAIR to '1' if you really want to zap this disk." + echo "${OSD_DEVICE} isn't clean, but OSD_FORCE_REPAIR isn't enabled." + echo "Please set OSD_FORCE_REPAIR to '1' if you want to zap this disk." exit 1 fi fi @@ -189,12 +202,10 @@ function osd_disk_prepare { if [[ ${BLOCK_WAL} ]]; then block_wal_string=$(echo ${BLOCK_WAL} | awk -F "/" '{print $2}{print $3}' | paste -s -d'-') fi - exec {lock_fd}>/var/lib/ceph/tmp/init-osd.lock || exit 1 - flock -w 600 --verbose "${lock_fd}" if [[ ${BLOCK_DB} && ${BLOCK_WAL} ]]; then if [[ ${block_db_string} == ${block_wal_string} ]]; then - if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then - VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") + if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then + VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") WAL_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-wal-${block_wal_string}/ceph-wal-${osd_dev_string}) DB_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-wal-${block_db_string}/ceph-db-${osd_dev_string}) if [ ! 
-z ${OSD_ID} ] && ([ ${WAL_OSD_ID} != ${OSD_ID} ] || [ ${DB_OSD_ID} != ${OSD_ID} ]); then @@ -220,22 +231,22 @@ function osd_disk_prepare { disk_zap ${OSD_DEVICE} CEPH_LVM_PREPARE=1 fi - vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB} + locked vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB} VG=ceph-db-wal-${block_db_string} fi - if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then - lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG} + if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then + locked lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG} fi BLOCK_DB=${VG}/ceph-db-${osd_dev_string} - if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then - lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG} + if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then + locked lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG} fi BLOCK_WAL=${VG}/ceph-wal-${osd_dev_string} else - if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then - VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") + if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then + VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") DB_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-wal-${block_db_string}/ceph-db-${block_db_string}) - if [ ! -z ${OSD_ID} ] && [ ${DB_OSD_ID} != ${OSD_ID} ]; then + if [ ! -z ${OSD_ID} ] && [ ! -z ${DB_OSD_ID} ] && [ ${DB_OSD_ID} != ${OSD_ID} ]; then echo "Found VG, but corresponding DB is not, zapping the ${OSD_DEVICE}" disk_zap ${OSD_DEVICE} CEPH_LVM_PREPARE=1 @@ -255,11 +266,11 @@ function osd_disk_prepare { disk_zap ${OSD_DEVICE} CEPH_LVM_PREPARE=1 fi - vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB} + locked vgcreate ceph-db-wal-${block_db_string} ${BLOCK_DB} VG=ceph-db-wal-${block_db_string} fi - if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then - VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") + if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then + VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") WAL_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-wal-${block_wal_string}/ceph-wal-${block_wal_string}) if [ ! 
           echo "Found VG, but corresponding WAL is not, zapping the ${OSD_DEVICE}"
@@ -281,21 +292,21 @@ function osd_disk_prepare {
           disk_zap ${OSD_DEVICE}
           CEPH_LVM_PREPARE=1
         fi
-        vgcreate ceph-db-wal-${block_wal_string} ${BLOCK_WAL}
+        locked vgcreate ceph-db-wal-${block_wal_string} ${BLOCK_WAL}
         VG=ceph-db-wal-${block_wal_string}
       fi
-      if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${block_db_string}") != "ceph-db-${block_db_string}" ]]; then
-        lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${block_db_string} ${VG}
+      if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${block_db_string}") != "ceph-db-${block_db_string}" ]]; then
+        locked lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${block_db_string} ${VG}
       fi
       BLOCK_DB=${VG}/ceph-db-${block_db_string}
-      if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${block_wal_string}") != "ceph-db-${block_wal_string}" ]]; then
-        lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${block_wal_string} ${VG}
+      if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${block_wal_string}") != "ceph-db-${block_wal_string}" ]]; then
+        locked lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${block_wal_string} ${VG}
       fi
       BLOCK_WAL=${VG}/ceph-wal-${block_wal_string}
     fi
   elif [[ -z ${BLOCK_DB} && ${BLOCK_WAL} ]]; then
-    if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then
-      VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}")
+    if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}") ]]; then
+      VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_wal_string}")
       WAL_OSD_ID=$(get_osd_id_from_volume /dev/ceph-wal-${block_wal_string}/ceph-wal-${osd_dev_string})
       if [ ! -z ${OSD_ID} ] && [ ${WAL_OSD_ID} != ${OSD_ID} ]; then
         echo "Found VG, but corresponding WAL is not, zapping the ${OSD_DEVICE}"
@@ -317,16 +328,16 @@ function osd_disk_prepare {
         disk_zap ${OSD_DEVICE}
         CEPH_LVM_PREPARE=1
       fi
-      vgcreate ceph-wal-${block_wal_string} ${BLOCK_WAL}
+      locked vgcreate ceph-wal-${block_wal_string} ${BLOCK_WAL}
       VG=ceph-wal-${block_wal_string}
     fi
-    if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then
-      lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG}
+    if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-wal-${osd_dev_string}") != "ceph-wal-${osd_dev_string}" ]]; then
+      locked lvcreate -L ${BLOCK_WAL_SIZE} -n ceph-wal-${osd_dev_string} ${VG}
    fi
     BLOCK_WAL=${VG}/ceph-wal-${osd_dev_string}
   elif [[ ${BLOCK_DB} && -z ${BLOCK_WAL} ]]; then
-    if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
-      VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
+    if [[ $(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}") ]]; then
+      VG=$(locked vgdisplay | grep "VG Name" | awk '{print $3}' | grep "${block_db_string}")
       DB_OSD_ID=$(get_osd_id_from_volume /dev/ceph-db-${block_db_string}/ceph-db-${osd_dev_string})
       if [ ! -z ${OSD_ID} ] && [ ${DB_OSD_ID} != ${OSD_ID} ]; then
         echo "Found VG, but corresponding DB is not, zapping the ${OSD_DEVICE}"
@@ -348,15 +359,14 @@ function osd_disk_prepare {
         disk_zap ${OSD_DEVICE}
         CEPH_LVM_PREPARE=1
       fi
-      vgcreate ceph-db-${block_db_string} ${BLOCK_DB}
+      locked vgcreate ceph-db-${block_db_string} ${BLOCK_DB}
       VG=ceph-db-${block_db_string}
     fi
-    if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then
-      lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG}
+    if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-db-${osd_dev_string}") != "ceph-db-${osd_dev_string}" ]]; then
+      locked lvcreate -L ${BLOCK_DB_SIZE} -n ceph-db-${osd_dev_string} ${VG}
     fi
     BLOCK_DB=${VG}/ceph-db-${osd_dev_string}
   fi
-  flock -u "${lock_fd}"
   if [ -z ${BLOCK_DB} ] && [ -z ${BLOCK_WAL} ]; then
     if pvdisplay ${OSD_DEVICE} | grep "VG Name" | awk '{print $3}' | grep "ceph"; then
       CEPH_LVM_PREPARE=0
@@ -392,19 +402,21 @@ function osd_disk_prepare {
   if [[ ${CEPH_DISK_USED} -eq 1 ]]; then
     CLI_OPTS="${CLI_OPTS} --data ${OSD_DEVICE}"
     ceph-volume simple scan --force ${OSD_DEVICE}$(sgdisk --print ${OSD_DEVICE} | grep "F800" | awk '{print $1}')
-  elif [[ ${CEPH_LVM_PREPARE} == 1 ]]; then
+  elif [[ ${CEPH_LVM_PREPARE} -eq 1 ]] || [[ ${DISK_ZAPPED} -eq 1 ]]; then
+    udev_settle
     if [[ $(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "ceph-vg-${osd_dev_string}") ]]; then
       OSD_VG=$(vgdisplay | grep "VG Name" | awk '{print $3}' | grep "ceph-vg-${osd_dev_string}")
     else
       vgcreate ceph-vg-${osd_dev_string} ${OSD_DEVICE}
       OSD_VG=ceph-vg-${osd_dev_string}
     fi
-    if [[ $(lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-lv-${osd_dev_string}") != "ceph-lv-${osd_dev_string}" ]]; then
+    if [[ $(locked lvdisplay | grep "LV Name" | awk '{print $3}' | grep "ceph-lv-${osd_dev_string}") != "ceph-lv-${osd_dev_string}" ]]; then
       lvcreate --yes -l 100%FREE -n ceph-lv-${osd_dev_string} ${OSD_VG}
     fi
     OSD_LV=${OSD_VG}/ceph-lv-${osd_dev_string}
     CLI_OPTS="${CLI_OPTS} --data ${OSD_LV}"
-    ceph-volume lvm -v prepare ${CLI_OPTS}
+    locked ceph-volume lvm -v prepare ${CLI_OPTS}
+    udev_settle
  fi
 }
 
diff --git a/ceph-osd/templates/daemonset-osd.yaml b/ceph-osd/templates/daemonset-osd.yaml
index d46b29d91..970275088 100644
--- a/ceph-osd/templates/daemonset-osd.yaml
+++ b/ceph-osd/templates/daemonset-osd.yaml
@@ -383,6 +383,9 @@ spec:
             - name: pod-var-lib-ceph
               mountPath: /var/lib/ceph
               readOnly: false
+            - name: pod-var-lib-ceph-tmp
+              mountPath: /var/lib/ceph/tmp
+              readOnly: false
             - name: run-lvm
               mountPath: /run/lvm
               readOnly: false
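-- 
For reviewers: a minimal, self-contained sketch (illustration only, not
part of the patch) of the serialization pattern the new locked() helper
uses. The race described in the commit message is fixed by having every
OSD init container on a node contend on the same lock file, which the new
/var/lib/ceph/tmp hostPath mount makes possible. It assumes bash with
util-linux flock(1) available; the volume group and LV names in the
example invocation are hypothetical.

  #!/bin/bash
  # All init containers on the host open the same lock file, so the
  # wrapped commands never interleave across pods.
  lock_file="/var/lib/ceph/tmp/init-osd.lock"

  locked() {
    # Open an auto-assigned fd on the lock file, block (up to 600s) for
    # an exclusive lock, run the wrapped command, then release the lock.
    exec {lock_fd}>"${lock_file}" || exit 1
    flock -w 600 "${lock_fd}"
    "$@"
    flock -u "${lock_fd}"
  }

  # Example: concurrent pods now run lvcreate one at a time, so an LV is
  # never being created or tagged by two prepare scripts at once.
  locked lvcreate -L 1G -n ceph-db-example ceph-vg-example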