From 3c55e7773b10a1c9804f2e59e8dba61c3028cd46 Mon Sep 17 00:00:00 2001 From: Taylor Stephen Date: Wed, 17 Jul 2019 16:09:34 -0600 Subject: [PATCH] [ceph-osd] BlueStore support for ceph-osd This adds BlueStore support for the ceph-osd chart so that OSDs may be deployed using BlueStore with optional --block.db and --block.wal parameters. Co-Authored-By: Chinasubbareddy Mallavarapu Change-Id: Ifbae8331b595c15c168ccd6e93b00ff054a607bc --- ceph-osd/templates/bin/osd/_bluestore.sh.tpl | 74 +++++++++++++++++++ ceph-osd/templates/bin/osd/_common.sh.tpl | 56 +++++++++++++- ceph-osd/templates/bin/osd/_init.sh.tpl | 46 ++++++++---- ceph-osd/templates/configmap-bin.yaml | 2 + ceph-osd/templates/daemonset-osd.yaml | 4 + .../utils/_osd_daemonset_overrides.tpl | 48 +++++++++++- ceph-osd/values.yaml | 9 +++ 7 files changed, 218 insertions(+), 21 deletions(-) create mode 100644 ceph-osd/templates/bin/osd/_bluestore.sh.tpl diff --git a/ceph-osd/templates/bin/osd/_bluestore.sh.tpl b/ceph-osd/templates/bin/osd/_bluestore.sh.tpl new file mode 100644 index 000000000..69280c8f4 --- /dev/null +++ b/ceph-osd/templates/bin/osd/_bluestore.sh.tpl @@ -0,0 +1,74 @@ +#!/bin/bash + +{{/* +Copyright 2017 The Openstack-Helm Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/}} + +source /tmp/osd-common.sh + +set -ex + +: "${OSD_SOFT_FORCE_ZAP:=1}" + +export OSD_DEVICE=$(readlink -f ${STORAGE_LOCATION}) + +if [[ -z "${OSD_DEVICE}" ]];then + echo "ERROR- You must provide a device to build your OSD ie: /dev/sdb" + exit 1 +fi + +if [[ ! -b "${OSD_DEVICE}" ]]; then + echo "ERROR- The device pointed by OSD_DEVICE ${OSD_DEVICE} doesn't exist !" + exit 1 +fi + +CEPH_DISK_OPTIONS="" +CEPH_OSD_OPTIONS="" +DATA_UUID=$(blkid -o value -s PARTUUID ${OSD_DEVICE}*1) + +udev_settle + +DATA_PART=$(dev_part ${OSD_DEVICE} 1) +MOUNTED_PART=${DATA_PART} + +ceph-disk -v \ + --setuser ceph \ + --setgroup disk \ + activate ${CEPH_DISK_OPTIONS} \ + --no-start-daemon ${DATA_PART} + +OSD_ID=$(grep "${MOUNTED_PART}" /proc/mounts | awk '{print $2}' | grep -oh '[0-9]*') + +OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}" +OSD_KEYRING="${OSD_PATH}/keyring" +# NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing +OSD_WEIGHT=0 +# NOTE(supamatt): add or move the OSD's CRUSH location +crush_location + + +# NOTE(supamatt): Just in case permissions do not align up, we recursively set them correctly. +if [ $(stat -c%U ${OSD_PATH}) != ceph ]; then + chown -R ceph. ${OSD_PATH}; +fi + +exec /usr/bin/ceph-osd \ + --cluster ${CLUSTER} \ + ${CEPH_OSD_OPTIONS} \ + -f \ + -i ${OSD_ID} \ + --setuser ceph \ + --setgroup disk & echo $! > /run/ceph-osd.pid +wait diff --git a/ceph-osd/templates/bin/osd/_common.sh.tpl b/ceph-osd/templates/bin/osd/_common.sh.tpl index db0eb2936..308edeed9 100644 --- a/ceph-osd/templates/bin/osd/_common.sh.tpl +++ b/ceph-osd/templates/bin/osd/_common.sh.tpl @@ -142,6 +142,43 @@ function dev_part { fi } +function zap_extra_partitions { + # Examine temp mount and delete any block.db and block.wal partitions + mountpoint=${1} + journal_disk="" + journal_part="" + block_db_disk="" + block_db_part="" + block_wal_disk="" + block_wal_part="" + + # Discover journal, block.db, and block.wal partitions first before deleting anything + # If the partitions are on the same disk, deleting one can affect discovery of the other(s) + if [ -L "${mountpoint}/journal" ]; then + journal_disk=$(readlink -m ${mountpoint}/journal | sed 's/[0-9]*//g') + journal_part=$(readlink -m ${mountpoint}/journal | sed 's/[^0-9]*//g') + fi + if [ -L "${mountpoint}/block.db" ]; then + block_db_disk=$(readlink -m ${mountpoint}/block.db | sed 's/[0-9]*//g') + block_db_part=$(readlink -m ${mountpoint}/block.db | sed 's/[^0-9]*//g') + fi + if [ -L "${mountpoint}/block.wal" ]; then + block_wal_disk=$(readlink -m ${mountpoint}/block.wal | sed 's/[0-9]*//g') + block_wal_part=$(readlink -m ${mountpoint}/block.wal | sed 's/[^0-9]*//g') + fi + + # Delete any discovered journal, block.db, and block.wal partitions + if [ ! -z "${journal_disk}" ]; then + sgdisk -d ${journal_part} ${journal_disk} + fi + if [ ! -z "${block_db_disk}" ]; then + sgdisk -d ${block_db_part} ${block_db_disk} + fi + if [ ! -z "${block_wal_disk}" ]; then + sgdisk -d ${block_wal_part} ${block_wal_disk} + fi +} + function disk_zap { # Run all the commands that ceph-disk zap uses to clear a disk local device=${1} @@ -154,10 +191,21 @@ function disk_zap { function udev_settle { partprobe "${OSD_DEVICE}" - if [ "x$JOURNAL_TYPE" == "xblock-logical" ]; then - OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL}) - local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g') - partprobe "${JDEV}" + if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then + if [ ! -z "$BLOCK_DB" ]; then + partprobe "${BLOCK_DB}" + fi + if [ ! -z "$BLOCK_WAL" ] && [ "$BLOCK_WAL" != "$BLOCK_DB" ]; then + partprobe "${BLOCK_WAL}" + fi + else + if [ "x$JOURNAL_TYPE" == "xblock-logical" ] && [ ! -z "$OSD_JOURNAL" ]; then + OSD_JOURNAL=$(readlink -f ${OSD_JOURNAL}) + if [ ! -z "$OSD_JOURNAL" ]; then + local JDEV=$(echo ${OSD_JOURNAL} | sed 's/[0-9]//g') + partprobe "${JDEV}" + fi + fi fi # watch the udev event queue, and exit if all current events are handled udevadm settle --timeout=600 diff --git a/ceph-osd/templates/bin/osd/_init.sh.tpl b/ceph-osd/templates/bin/osd/_init.sh.tpl index b009a30a2..e0f549092 100644 --- a/ceph-osd/templates/bin/osd/_init.sh.tpl +++ b/ceph-osd/templates/bin/osd/_init.sh.tpl @@ -24,6 +24,10 @@ source /tmp/osd-common.sh # We do not want to zap journal disk. Tracking this option seperatly. : "${JOURNAL_FORCE_ZAP:=0}" +if [ "x${STORAGE_TYPE%-*}" == "xbluestore" ]; then + export OSD_BLUESTORE=1 +fi + if [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then export OSD_DEVICE="/var/lib/ceph/osd" else @@ -71,7 +75,7 @@ function osd_disk_prepare { if [[ ${OSD_FORCE_REPAIR} -eq 1 ]]; then if [ -b "${OSD_DEVICE}1" ]; then local cephFSID=$(ceph-conf --lookup fsid) - if [ ! -z "${cephFSID}" ]; then + if [ ! -z "${cephFSID}" ]; then local tmpmnt=$(mktemp -d) mount ${OSD_DEVICE}1 ${tmpmnt} if [ "${OSD_BLUESTORE:-0}" -ne 1 ] && [ "x$JOURNAL_TYPE" != "xdirectory" ]; then @@ -107,22 +111,25 @@ function osd_disk_prepare { fi if [ -f "${tmpmnt}/ceph_fsid" ]; then osdFSID=$(cat "${tmpmnt}/ceph_fsid") - umount ${tmpmnt} if [ ${osdFSID} != ${cephFSID} ]; then echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster." echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}" echo "Because OSD_FORCE_REPAIR was set, we will zap this device." + zap_extra_partitions ${tmpmnt} + umount ${tmpmnt} disk_zap ${OSD_DEVICE} else + umount ${tmpmnt} echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster." echo "OSD_FORCE_REPAIR is set, but will be ignored and the device will not be zapped." echo "Moving on, trying to activate the OSD now." return fi else - umount ${tmpmnt} echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID." echo "Because OSD_FORCE_REPAIR was set, we will zap this device." + zap_extra_partitions ${tmpmnt} + umount ${tmpmnt} disk_zap ${OSD_DEVICE} fi else @@ -145,22 +152,33 @@ function osd_disk_prepare { fi fi - if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then + if [ "${OSD_BLUESTORE:-0}" -eq 1 ]; then + CLI_OPTS="${CLI_OPTS} --bluestore" + + if [ ! -z "$BLOCK_DB" ]; then + CLI_OPTS="${CLI_OPTS} --block.db ${BLOCK_DB}" + fi + + if [ ! -z "$BLOCK_WAL" ]; then + CLI_OPTS="${CLI_OPTS} --block.wal ${BLOCK_WAL}" + fi + + CLI_OPTS="${CLI_OPTS} ${OSD_DEVICE}" + else # we only care about journals for filestore. osd_journal_prepare - else - OSD_JOURNAL='' - CLI_OPTS="${CLI_OPTS} --bluestore" + + CLI_OPTS="${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE}" + + if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then + CLI_OPTS="${CLI_OPTS} --journal-file" + else + CLI_OPTS="${CLI_OPTS} ${OSD_JOURNAL}" + fi fi udev_settle - - if [ "x$JOURNAL_TYPE" == "xdirectory" ]; then - ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} --journal-file - else - ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL} - fi - + ceph-disk -v prepare ${CLI_OPTS} } function osd_journal_create { diff --git a/ceph-osd/templates/configmap-bin.yaml b/ceph-osd/templates/configmap-bin.yaml index 9f537b578..61fb26e28 100644 --- a/ceph-osd/templates/configmap-bin.yaml +++ b/ceph-osd/templates/configmap-bin.yaml @@ -36,6 +36,8 @@ data: {{ tuple "bin/osd/_directory.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} osd-block.sh: | {{ tuple "bin/osd/_block.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} + osd-bluestore.sh: | +{{ tuple "bin/osd/_bluestore.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} osd-init.sh: | {{ tuple "bin/osd/_init.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} osd-check.sh: | diff --git a/ceph-osd/templates/daemonset-osd.yaml b/ceph-osd/templates/daemonset-osd.yaml index 46489c0a0..1b33b431c 100644 --- a/ceph-osd/templates/daemonset-osd.yaml +++ b/ceph-osd/templates/daemonset-osd.yaml @@ -280,6 +280,10 @@ spec: mountPath: /tmp/osd-block.sh subPath: osd-block.sh readOnly: true + - name: ceph-osd-bin + mountPath: /tmp/osd-bluestore.sh + subPath: osd-bluestore.sh + readOnly: true - name: ceph-osd-bin mountPath: /tmp/osd-check.sh subPath: osd-check.sh diff --git a/ceph-osd/templates/utils/_osd_daemonset_overrides.tpl b/ceph-osd/templates/utils/_osd_daemonset_overrides.tpl index eae93b72a..5a5e5aeee 100644 --- a/ceph-osd/templates/utils/_osd_daemonset_overrides.tpl +++ b/ceph-osd/templates/utils/_osd_daemonset_overrides.tpl @@ -303,6 +303,7 @@ limitations under the License. {{- $_ := set $context.Values "__tmpPodVols" $newPodDataVols }} {{ end }} + {{- if ne $v.data.type "bluestore" }} {{ if eq $v.journal.type "directory" }} {{ $journalDirVolume := dict "hostPath" (dict "path" $v.journal.location) "name" "journal" }} {{ $newPodDataVols := append $context.Values.__tmpPodVols $journalDirVolume }} @@ -312,6 +313,11 @@ limitations under the License. {{ $newPodDataVols := append $context.Values.__tmpPodVols $dataDirVolume }} {{- $_ := set $context.Values "__tmpPodVols" $newPodDataVols }} {{ end }} + {{ else }} + {{ $dataDirVolume := dict "emptyDir" dict "name" "journal" }} + {{ $newPodDataVols := append $context.Values.__tmpPodVols $dataDirVolume }} + {{- $_ := set $context.Values "__tmpPodVols" $newPodDataVols }} + {{- end }} {{- if not $context.Values.__tmpYAML.spec }}{{- $_ := set $context.Values.__tmpYAML "spec" dict }}{{- end }} {{- if not $context.Values.__tmpYAML.spec.template }}{{- $_ := set $context.Values.__tmpYAML.spec "template" dict }}{{- end }} @@ -330,9 +336,27 @@ limitations under the License. {{- if empty $context.Values._tmpYAMLcontainer.env }} {{- $_ := set $context.Values._tmpYAMLcontainer "env" ( list ) }} {{- end }} + {{- $tmpcontainerEnv := omit $context.Values._tmpYAMLcontainer "env" }} + {{- if eq $v.data.type "bluestore" }} + {{- if and $v.block_db $v.block_wal}} + {{ $containerEnv := prepend (prepend (prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db)) (dict "name" "BLOCK_WAL" "value" $v.block_wal) }} + {{- $_ := set $tmpcontainerEnv "env" $containerEnv }} + {{- else if $v.block_db }} + {{ $containerEnv := prepend (prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db) }} + {{- $_ := set $tmpcontainerEnv "env" $containerEnv }} + {{- else if $v.block_wal }} + {{ $containerEnv := prepend (prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_WAL" "value" $v.block_wal) }} + {{- $_ := set $tmpcontainerEnv "env" $containerEnv }} + {{ else }} + {{ $containerEnv := prepend (prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location) }} + {{- $_ := set $tmpcontainerEnv "env" $containerEnv }} + {{- end }} + {{ else }} {{ $containerEnv := prepend (prepend (prepend ( prepend (index $context.Values._tmpYAMLcontainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "JOURNAL_TYPE" "value" $v.journal.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "JOURNAL_LOCATION" "value" $v.journal.location) }} + {{- $_ := set $tmpcontainerEnv "env" $containerEnv }} + {{- end }} {{- $localInitContainerEnv := omit $context.Values._tmpYAMLcontainer "env" }} - {{- $_ := set $localInitContainerEnv "env" $containerEnv }} + {{- $_ := set $localInitContainerEnv "env" $tmpcontainerEnv.env }} {{ $containerList := append $context.Values.__tmpYAMLcontainers $localInitContainerEnv }} {{ $_ := set $context.Values "__tmpYAMLcontainers" $containerList }} {{ end }} @@ -341,9 +365,27 @@ limitations under the License. {{- $_ := set $context.Values "__tmpYAMLinitContainers" list }} {{- range $podContainer := $context.Values.__daemonset_yaml.spec.template.spec.initContainers }} {{- $_ := set $context.Values "_tmpYAMLinitContainer" $podContainer }} - {{ $initContainerEnv := prepend (prepend (prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "JOURNAL_TYPE" "value" $v.journal.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "JOURNAL_LOCATION" "value" $v.journal.location) }} + {{- $tmpinitcontainerEnv := omit $context.Values._tmpYAMLinitContainer "env" }} + {{- if eq $v.data.type "bluestore" }} + {{- if and $v.block_db $v.block_wal}} + {{ $initcontainerEnv := prepend (prepend (prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db)) (dict "name" "BLOCK_WAL" "value" $v.block_wal) }} + {{- $_ := set $tmpinitcontainerEnv "env" $initcontainerEnv }} + {{- else if $v.block_db }} + {{ $initcontainerEnv := prepend (prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_DB" "value" $v.block_db) }} + {{- $_ := set $tmpinitcontainerEnv "env" $initcontainerEnv }} + {{- else if $v.block_wal }} + {{ $initcontainerEnv := prepend (prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "BLOCK_WAL" "value" $v.block_wal) }} + {{- $_ := set $tmpinitcontainerEnv "env" $initcontainerEnv }} + {{ else }} + {{ $initcontainerEnv := prepend (prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location) }} + {{- $_ := set $tmpinitcontainerEnv "env" $initcontainerEnv }} + {{- end }} + {{ else }} + {{ $initcontainerEnv := prepend (prepend (prepend ( prepend (index $context.Values._tmpYAMLinitContainer "env") (dict "name" "STORAGE_TYPE" "value" $v.data.type)) (dict "name" "JOURNAL_TYPE" "value" $v.journal.type)) (dict "name" "STORAGE_LOCATION" "value" $v.data.location)) (dict "name" "JOURNAL_LOCATION" "value" $v.journal.location) }} + {{- $_ := set $tmpinitcontainerEnv "env" $initcontainerEnv }} + {{- end }} {{- $localInitContainerEnv := omit $context.Values._tmpYAMLinitContainer "env" }} - {{- $_ := set $localInitContainerEnv "env" $initContainerEnv }} + {{- $_ := set $localInitContainerEnv "env" $tmpinitcontainerEnv.env }} {{ $initContainerList := append $context.Values.__tmpYAMLinitContainers $localInitContainerEnv }} {{ $_ := set $context.Values "__tmpYAMLinitContainers" $initContainerList }} {{ end }} diff --git a/ceph-osd/values.yaml b/ceph-osd/values.yaml index ead77824b..222aee69e 100644 --- a/ceph-osd/values.yaml +++ b/ceph-osd/values.yaml @@ -204,6 +204,15 @@ conf: journal: type: directory location: /var/lib/openstack-helm/ceph/osd/journal-one + + # - data: + # type: bluestore + # location: /dev/sdb + # Separate block devices may be used for block.db and/or block.wal + # Without these values they will be co-located on the data volume + # block_db: /dev/sdc + # block_wal: /dev/sdc + # - data: # type: block-logical # location: /dev/sdd