Enable Ceph charts to be rack aware for CRUSH
Add support for a rack-level CRUSH map. Rack-level CRUSH support is enabled by setting the "rack_replicated_rule" CRUSH rule.

Change-Id: I4df224f2821872faa2eddec2120832e9a22f4a7c
parent 5d356f9265
commit 5ce9f2eb3b
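A minimal sketch of how a deployer might enable the new rule once this change lands. The values paths (conf.pool.default.crush_rule, conf.pool.default.rack_regex) are taken from the diff below; the release name, chart path, and namespace are placeholders, not part of this commit:

    # Hypothetical Helm override: select the rack-aware CRUSH rule and keep the
    # default of using the first 8 characters of the hostname as the rack name.
    helm upgrade --install ceph-osd ./ceph-osd \
      --namespace ceph \
      --set conf.pool.default.crush_rule=rack_replicated_rule \
      --set conf.pool.default.rack_regex="1-8"

The same override would need to be applied to any other chart that consumes these values (for example the chart carrying the pool management job below).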
@@ -37,6 +37,10 @@ if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^same_host$"; then
   ceph --cluster "${CLUSTER}" osd crush rule create-simple same_host default osd
 fi
 
+if ! ceph --cluster "${CLUSTER}" osd crush rule ls | grep -q "^rack_replicated_rule$"; then
+  ceph --cluster "${CLUSTER}" osd crush rule create-simple rack_replicated_rule default rack
+fi
+
 function reweight_osds () {
   for OSD_ID in $(ceph --cluster "${CLUSTER}" osd df | awk '$3 == "0" {print $1}'); do
     OSD_WEIGHT=$(ceph --cluster "${CLUSTER}" osd df --format json-pretty| grep -A7 "\bosd.${OSD_ID}\b" | awk '/"kb"/{ gsub(",",""); d= $2/1073741824 ; r = sprintf("%.2f", d); print r }');
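The init script above only creates the rule; whether it exists and what it selects can be checked with the same CLI calls the script uses, assuming the cluster name "ceph" used throughout these scripts:

    # Confirm the rack-aware rule was created.
    ceph --cluster ceph osd crush rule ls | grep -q "^rack_replicated_rule$" && echo present
    # Dump the rule; it should choose leaves beneath buckets of type "rack".
    ceph --cluster ceph osd crush rule dump rack_replicated_rule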
@@ -128,9 +128,13 @@ conf:
       pg_per_osd: 100
       protected: true
     default:
-      #NOTE(portdirect): this should be 'same_host' for a single node
-      # cluster to be in a healthy state
+      # NOTE(supamatt): Accepted values are:
+      #  same_host for a single node
+      #  replicated_rule for a multi node
+      #  rack_replicated_rule for a multi node in multiple (>=3) racks
+      # Ceph cluster must be in a healthy state.
       crush_rule: replicated_rule
   #NOTE(portdirect): this section describes the pools that will be managed by
   # the ceph pool management job, as it tunes the pgs and crush rule, based on
   # the above.
@@ -126,13 +126,26 @@ OSD_PATH="${OSD_PATH_BASE}-${OSD_ID}"
 OSD_KEYRING="${OSD_PATH}/keyring"
 # NOTE(supamatt): set the initial crush weight of the OSD to 0 to prevent automatic rebalancing
 OSD_WEIGHT=0
-ceph \
-  --cluster "${CLUSTER}" \
-  --name="osd.${OSD_ID}" \
-  --keyring="${OSD_KEYRING}" \
-  osd \
-  crush \
-  create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION}
+if [ "x${CRUSH_RULE}" == "xrack_replicated_rule" ]; then
+  RACK_LOCATION=$(echo rack_$(echo ${HOSTNAME} | cut -c ${RACK_REGEX}))
+  CRUSH_LOCATION=$(echo "root=default rack=${RACK_LOCATION} host=${HOSTNAME}")
+  ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
+  RACK_LOCATION_CHECK=$(ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" osd find ${OSD_ID} | awk -F'"' '/rack/{print $4}')
+  if [ "x${RACK_LOCATION_CHECK}" != "x${RACK_LOCATION}" ]; then
+    # NOTE(supamatt): Manually move the buckets for previously configured CRUSH configurations
+    # as create-or-move may not appropriately move them.
+    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush add-bucket ${RACK_LOCATION} rack || true
+    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush move ${RACK_LOCATION} root=default || true
+    ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+      osd crush move ${HOSTNAME} rack=${RACK_LOCATION} || true
+  fi
+else
+  ceph --cluster "${CLUSTER}" --name="osd.${OSD_ID}" --keyring="${OSD_KEYRING}" \
+    osd crush create-or-move -- "${OSD_ID}" "${OSD_WEIGHT}" ${CRUSH_LOCATION} || true
+fi
 
 if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
   if [ -n "${OSD_JOURNAL}" ]; then
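To make the rack-name derivation in the branch above concrete, here is what it evaluates to for a made-up hostname, with RACK_REGEX at the chart default of "1-8" (see the values addition further down):

    # Worked example of the derivation above; the hostname is hypothetical.
    HOSTNAME=rack01-node03.example.com
    RACK_REGEX="1-8"
    RACK_LOCATION="rack_$(echo ${HOSTNAME} | cut -c ${RACK_REGEX})"
    CRUSH_LOCATION="root=default rack=${RACK_LOCATION} host=${HOSTNAME}"
    echo "${CRUSH_LOCATION}"
    # prints: root=default rack=rack_rack01-n host=rack01-node03.example.com

So every host whose name shares the same first eight characters lands in the same rack bucket, which is why the regex range should be chosen to match the site's hostname convention.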
@@ -179,6 +179,10 @@ spec:
               value: "ceph"
             - name: CEPH_GET_ADMIN_KEY
               value: "1"
+            - name: CRUSH_RULE
+              value: {{ .Values.conf.pool.default.crush_rule }}
+            - name: RACK_REGEX
+              value: {{ .Values.conf.pool.default.rack_regex }}
           command:
             - /tmp/osd-start.sh
           lifecycle:
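One way to sanity-check that the rendered values actually reach the OSD containers; the namespace and pod name are placeholders, and this assumes the image ships a standard env binary:

    # Hypothetical check from the deployment host; substitute a real OSD pod name.
    kubectl -n ceph exec ceph-osd-example-pod -- env | grep -E '^(CRUSH_RULE|RACK_REGEX)='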
@@ -107,6 +107,18 @@ conf:
     osd_mount_options_xfs: "rw,noatime,largeio,inode64,swalloc,logbufs=8,logbsize=256k,allocsize=4M"
     osd_journal_size: 10240
 
+  pool:
+    default:
+      # NOTE(supamatt): Accepted values are:
+      #  same_host for a single node
+      #  replicated_rule for a multi node
+      #  rack_replicated_rule for a multi node in multiple (>=3) racks
+      # Ceph cluster must be in a healthy state.
+      crush_rule: replicated_rule
+      # NOTE(supamatt): By default use the first 8 characters of the hostname to
+      # define the rack type bucket names for CRUSH.
+      rack_regex: "1-8"
+
   storage:
     # NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
     # define OSD pods that will be deployed across the cluster.
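After the OSDs restart with rack_replicated_rule selected, the resulting hierarchy can be inspected from any client node; the root/rack/host levels mirror the CRUSH_LOCATION built in the start script (cluster name as used throughout these scripts):

    # Show the CRUSH hierarchy; each host should sit under a rack_* bucket
    # beneath root "default" once the rack-aware rule is in use.
    ceph --cluster ceph osd tree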