From 5ce9f2eb3b37fcc480c185867dfc09551a944ce1 Mon Sep 17 00:00:00 2001
From: Matthew Heler
Date: Fri, 16 Nov 2018 12:20:52 -0600
Subject: [PATCH] Enable Ceph charts to be rack aware for CRUSH

Add support for a rack-level CRUSH map. Rack-level CRUSH support is
enabled by using the "rack_replicated_rule" crush rule.

Change-Id: I4df224f2821872faa2eddec2120832e9a22f4a7c
---
 ceph-client/templates/bin/pool/_init.sh.tpl |  4 +++
 ceph-client/values.yaml                     |  8 ++++--
 ceph-osd/templates/bin/osd/_block.sh.tpl    | 27 +++++++++++++++------
 ceph-osd/templates/daemonset-osd.yaml       |  4 +++
 ceph-osd/values.yaml                        | 12 +++++++++
 5 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/ceph-client/templates/bin/pool/_init.sh.tpl b/ceph-client/templates/bin/pool/_init.sh.tpl
index 4b853760b..98d9f9fcb 100644
--- a/ceph-client/templates/bin/pool/_init.sh.tpl
+++ b/ceph-client/templates/bin/pool/_init.sh.tpl
@@ -37,6 +37,10 @@ if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^same_host$"; then
   ceph --cluster "${CLUSTER}" osd crush rule create-simple same_host default osd
 fi
 
+if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^rack_replicated_rule$"; then
+  ceph --cluster "${CLUSTER}" osd crush rule create-simple rack_replicated_rule default rack
+fi
+
 function reweight_osds () {
   for OSD_ID in $(ceph --cluster "${CLUSTER}" osd df | awk '$3 == "0" {print $1}'); do
     OSD_WEIGHT=$(ceph --cluster "${CLUSTER}" osd df --format json-pretty| grep -A7 "\bosd.${OSD_ID}\b" | awk '/"kb"/{ gsub(",",""); d= $2/1073741824 ; r = sprintf("%.2f", d); print r }');
diff --git a/ceph-client/values.yaml b/ceph-client/values.yaml
index c350e305d..3cb2c976f 100644
--- a/ceph-client/values.yaml
+++ b/ceph-client/values.yaml
@@ -128,9 +128,13 @@ conf:
       pg_per_osd: 100
       protected: true
     default:
-      #NOTE(portdirect): this should be 'same_host' for a single node
-      # cluster to be in a healthy state
+      # NOTE(supamatt): Accepted values are:
+      #  same_host for a single-node cluster
+      #  replicated_rule for a multi-node cluster
+      #  rack_replicated_rule for a multi-node cluster spanning multiple (>=3) racks
+      #  The Ceph cluster must be in a healthy state.
       crush_rule: replicated_rule
+
     #NOTE(portdirect): this section describes the pools that will be managed by
     # the ceph pool management job, as it tunes the pgs and crush rule, based on
     # the above.
diff --git a/ceph-osd/templates/bin/osd/_block.sh.tpl b/ceph-osd/templates/bin/osd/_block.sh.tpl
index 749e51724..a86b8059d 100644
--- a/ceph-osd/templates/bin/osd/_block.sh.tpl
+++ b/ceph-osd/templates/bin/osd/_block.sh.tpl
@@ -126,13 +126,26 @@ OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}"
 OSD_KEYRING="${OSD_PATH}/keyring"
 # NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
 OSD_WEIGHT=0
-ceph \
-  --cluster "${CLUSTER}" \
-  --name="osd.${OSD_ID}" \
-  --keyring="${OSD_KEYRING}" \
-  osd \
-  crush \
-  create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION}
+if [ "x${CRUSH_RULE}" == "xrack_replicated_rule" ]; then
+  RACK_LOCATION="rack_$(echo "${HOSTNAME}" | cut -c "${RACK_REGEX}")"
+  CRUSH_LOCATION="root=default rack=${RACK_LOCATION} host=${HOSTNAME}"
+  ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
+  RACK_LOCATION_CHECK=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | awk -F'"' '/rack/{print $4}')
+  if [ "x${RACK_LOCATION_CHECK}" != "x${RACK_LOCATION}" ]; then
+    # NOTE(supamatt): Manually move the buckets left over from previously configured
+    # CRUSH layouts, as create-or-move may not move them appropriately.
+    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush add-bucket ${RACK_LOCATION} rack || true
+    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush move ${RACK_LOCATION} root=default || true
+    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush move ${HOSTNAME} rack=${RACK_LOCATION} || true
+  fi
+else
+  ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
+fi
 
 if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
   if [ -n "${OSD_JOURNAL}" ]; then
diff --git a/ceph-osd/templates/daemonset-osd.yaml b/ceph-osd/templates/daemonset-osd.yaml
index 2d30b4e17..7c26825d6 100644
--- a/ceph-osd/templates/daemonset-osd.yaml
+++ b/ceph-osd/templates/daemonset-osd.yaml
@@ -179,6 +179,10 @@ spec:
               value: "ceph"
             - name: CEPH_GET_ADMIN_KEY
               value: "1"
+            - name: CRUSH_RULE
+              value: {{ .Values.conf.pool.default.crush_rule }}
+            - name: RACK_REGEX
+              value: {{ .Values.conf.pool.default.rack_regex }}
           command:
             - /tmp/osd-start.sh
          lifecycle:
diff --git a/ceph-osd/values.yaml b/ceph-osd/values.yaml
index 683df653e..7e6b12262 100644
--- a/ceph-osd/values.yaml
+++ b/ceph-osd/values.yaml
@@ -107,6 +107,18 @@ conf:
     osd_mount_options_xfs: "rw,noatime,largeio,inode64,swalloc,logbufs=8,logbsize=256k,allocsize=4M"
     osd_journal_size: 10240
 
+  pool:
+    default:
+      # NOTE(supamatt): Accepted values are:
+      #  same_host for a single-node cluster
+      #  replicated_rule for a multi-node cluster
+      #  rack_replicated_rule for a multi-node cluster spanning multiple (>=3) racks
+      #  The Ceph cluster must be in a healthy state.
+      crush_rule: replicated_rule
+      # NOTE(supamatt): By default, the first 8 characters of the hostname are
+      # used to define the rack bucket names for CRUSH.
+      rack_regex: "1-8"
+
 storage:
   # NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
   # define OSD pods that will be deployed across the cluster.
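
A minimal deploy-time sketch of how the new values could be overridden,
assuming the release names, chart paths, and "ceph" namespace shown here
(none of which are defined by this patch); only the conf.pool.default.*
keys come from the charts themselves:

  # Hypothetical overrides enabling the rack-aware rule in both charts.
  helm upgrade --install ceph-osd ./ceph-osd \
    --namespace=ceph \
    --set conf.pool.default.crush_rule=rack_replicated_rule \
    --set conf.pool.default.rack_regex="1-8"
  helm upgrade --install ceph-client ./ceph-client \
    --namespace=ceph \
    --set conf.pool.default.crush_rule=rack_replicated_rule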
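
For reference, a sketch of how _block.sh.tpl derives the rack bucket name:
RACK_REGEX is a character range handed to "cut -c", so with the default
"1-8" the first eight characters of the OSD's hostname become the rack
bucket. The hostname below is an example only:

  HOSTNAME=ceph-osd-node-07                 # illustrative hostname
  RACK_REGEX="1-8"
  RACK_LOCATION="rack_$(echo "${HOSTNAME}" | cut -c "${RACK_REGEX}")"
  echo "${RACK_LOCATION}"                   # prints: rack_ceph-osd

  # Once OSDs have registered, the resulting hierarchy and rules can be checked with:
  ceph --cluster ceph osd crush tree
  ceph --cluster ceph osd crush rule ls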
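
The rack_replicated_rule itself is created by the pool init job exactly as
in _init.sh.tpl; for completeness, a sketch of pointing an existing pool at
it by hand (the pool name "rbd" is an example, not something this patch
creates or manages):

  # Create the rule if it does not exist yet, then assign it to a pool.
  ceph --cluster ceph osd crush rule create-simple rack_replicated_rule default rack
  ceph --cluster ceph osd pool set rbd crush_rule rack_replicated_rule
  ceph --cluster ceph osd pool get rbd crush_rule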