[CEPH] Fix race conditions with OSD POD initialization

Under POD restart conditions there is a race condition with lsblk
causing the helm chart to zap a fully working OSD disk. We refactor
the code to remove this requirement.

Additonally the new automatic journal partitioning code has a race
condition in which the same journal partition could be picked twice
for OSDs on the same node. To resolve this we share a common tmp
directory from the node to all of the OSD pods on that node.

Change-Id: I807074c4c5e54b953b5c0efa4c169763c5629062
This commit is contained in:
Matthew Heler 2018-12-20 14:54:47 -06:00
parent 4233c25308
commit 30b57ba671
2 changed files with 43 additions and 32 deletions
ceph-osd/templates

@ -50,6 +50,13 @@ else
export OSD_JOURNAL=$(readlink -f ${JOURNAL_LOCATION})
fi
function udev_settle {
partprobe "${OSD_DEVICE}"
# watch the udev event queue, and exit if all current events are handled
udevadm settle --timeout=600
}
# Calculate proper device names, given a device and partition number
function dev_part {
local OSD_DEVICE=${1}
@ -121,46 +128,41 @@ function osd_disk_prepare {
fi
fi
udev_settle
# then search for some ceph metadata on the disk
if [[ "$(parted --script ${OSD_DEVICE} print | egrep '^ 1.*ceph data')" ]]; then
if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then
if [ -b "${OSD_DEVICE}1" ]; then
local fs=`lsblk -fn ${OSD_DEVICE}1`
if [ ! -z "${fs}" ]; then
local cephFSID=`ceph-conf --lookup fsid`
if [ ! -z "${cephFSID}" ]; then
local tmpmnt=`mktemp -d`
mount ${OSD_DEVICE}1 ${tmpmnt}
if [ -f "${tmpmnt}/ceph_fsid" ]; then
osdFSID=`cat "${tmpmnt}/ceph_fsid"`
umount ${tmpmnt}
if [ ${osdFSID} != ${cephFSID} ]; then
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
echo "Because OSD_FORCE_ZAP was set, we will zap this device."
ceph-disk -v zap ${OSD_DEVICE}
else
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
echo "OSD_FORCE_ZAP is set, but will be ignored and the device will not be zapped."
echo "Moving on, trying to activate the OSD now."
return
fi
else
umount ${tmpmnt}
echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
local cephFSID=`ceph-conf --lookup fsid`
if [ ! -z "${cephFSID}" ]; then
local tmpmnt=`mktemp -d`
mount ${OSD_DEVICE}1 ${tmpmnt}
if [ -f "${tmpmnt}/ceph_fsid" ]; then
osdFSID=`cat "${tmpmnt}/ceph_fsid"`
umount ${tmpmnt}
if [ ${osdFSID} != ${cephFSID} ]; then
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
echo "Because OSD_FORCE_ZAP was set, we will zap this device."
ceph-disk -v zap ${OSD_DEVICE}
else
echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
echo "OSD_FORCE_ZAP is set, but will be ignored and the device will not be zapped."
echo "Moving on, trying to activate the OSD now."
return
fi
else
echo "Unable to determine the FSID of the current cluster."
echo "OSD_FORCE_ZAP is set, but this OSD will not be zapped."
echo "Moving on, trying to activate the OSD now."
return
umount ${tmpmnt}
echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
echo "Because OSD_FORCE_ZAP was set, we will zap this device."
ceph-disk -v zap ${OSD_DEVICE}
fi
else
echo "It looks like ${OSD_DEVICE} has a ceph data partition but no filesystem."
echo "Because OSD_FORCE_ZAP was set, we will zap this device."
ceph-disk -v zap ${OSD_DEVICE}
echo "Unable to determine the FSID of the current cluster."
echo "OSD_FORCE_ZAP is set, but this OSD will not be zapped."
echo "Moving on, trying to activate the OSD now."
return
fi
else
echo "parted says ${OSD_DEVICE}1 should exist, but we do not see it."
@ -225,8 +227,7 @@ function osd_disk_prepare {
ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL}
# watch the udev event queue, and exit if all current events are handled
udevadm settle --timeout=600
udev_settle
}
if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then

@ -79,6 +79,9 @@ spec:
- name: pod-var-lib-ceph
mountPath: /var/lib/ceph
readOnly: false
- name: pod-var-lib-ceph-tmp
mountPath: /var/lib/ceph/tmp
readOnly: false
- name: pod-run
mountPath: /run
readOnly: false
@ -160,6 +163,9 @@ spec:
- name: pod-var-lib-ceph
mountPath: /var/lib/ceph
readOnly: false
- name: pod-var-lib-ceph-tmp
mountPath: /var/lib/ceph/tmp
ReadOnly: false
- name: pod-run
mountPath: /run
readOnly: false
@ -288,6 +294,10 @@ spec:
path: /run/lvm
- name: pod-var-lib-ceph
emptyDir: {}
- name: pod-var-lib-ceph-tmp
hostPath:
path: /var/lib/openstack-helm/ceph/var-tmp
type: DirectoryOrCreate
- name: pod-var-log
hostPath:
path: {{ print "/var/log/ceph/" $envAll.Release.Name }}