[CEPH] Support a troubleshooting option to reset PG metadata
Ceph upstream bug https://tracker.ceph.com/issues/21142 is impacting the availability of our sites in the pipeline. Add an option to reset the past interval metadata on an OSD's PGs to work around this issue if it occurs. Change-Id: I1fe0bee6ce8aa402c241f1ad457bbf532945a530
This commit is contained in:
parent
4233c25308
commit
e1a3819a0d
@ -25,6 +25,7 @@ set -ex
|
|||||||
: "${OSD_SOFT_FORCE_ZAP:=1}"
|
: "${OSD_SOFT_FORCE_ZAP:=1}"
|
||||||
: "${OSD_JOURNAL_PARTITION:=}"
|
: "${OSD_JOURNAL_PARTITION:=}"
|
||||||
|
|
||||||
|
eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))')
|
||||||
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
|
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
|
||||||
eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))')
|
eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))')
|
||||||
eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))')
|
eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))')
|
||||||
@ -202,6 +203,15 @@ if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ "${OSD_BLUESTORE:-0}" -ne 1 ]; then
|
||||||
|
# NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running
|
||||||
|
if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then
|
||||||
|
for PG in $(ls ${OSD_PATH}/current | awk -F'_' '/head/{print $1}'); do
|
||||||
|
ceph-objectstore-tool --data-path ${OSD_PATH} --op rm-past-intervals --pgid ${PG};
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then
|
if [ "x${JOURNAL_TYPE}" == "xdirectory" ]; then
|
||||||
touch ${OSD_JOURNAL}
|
touch ${OSD_JOURNAL}
|
||||||
chown -R ceph. /var/lib/ceph/journal
|
chown -R ceph. /var/lib/ceph/journal
|
||||||
|
@ -7,6 +7,7 @@ export LC_ALL=C
|
|||||||
: "${JOURNAL_DIR:=/var/lib/ceph/journal}"
|
: "${JOURNAL_DIR:=/var/lib/ceph/journal}"
|
||||||
: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}"
|
: "${OSD_BOOTSTRAP_KEYRING:=/var/lib/ceph/bootstrap-osd/${CLUSTER}.keyring}"
|
||||||
|
|
||||||
|
eval OSD_PG_INTERVAL_FIX=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["osd_pg_interval_fix"]))')
|
||||||
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
|
eval CRUSH_FAILURE_DOMAIN_TYPE=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain"]))')
|
||||||
eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))')
|
eval CRUSH_FAILURE_DOMAIN_NAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_name"]))')
|
||||||
eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))')
|
eval CRUSH_FAILURE_DOMAIN_BY_HOSTNAME=$(cat /etc/ceph/storage.json | python -c 'import sys, json; data = json.load(sys.stdin); print(json.dumps(data["failure_domain_by_hostname"]))')
|
||||||
@ -117,6 +118,13 @@ fi
|
|||||||
mkdir -p /etc/forego/"${CLUSTER}"
|
mkdir -p /etc/forego/"${CLUSTER}"
|
||||||
echo "" > /etc/forego/"${CLUSTER}"/Procfile
|
echo "" > /etc/forego/"${CLUSTER}"/Procfile
|
||||||
|
|
||||||
|
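# NOTE(supamatt): https://tracker.ceph.com/issues/21142 is impacting us due to the older Ceph version 12.2.3 that we are running
# NOTE(review): in this hunk OSD_PATH is only seen being assigned inside the per-OSD loop further down - confirm it already holds a valid OSD data path at this point.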
|
||||||
|
if [ "x${OSD_PG_INTERVAL_FIX}" == "xtrue" ]; then
|
||||||
|
for PG in $(ls ${OSD_PATH}/current | awk -F'_' '/head/{print $1}'); do
|
||||||
|
ceph-objectstore-tool --data-path ${OSD_PATH} --op rm-past-intervals --pgid ${PG};
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
for OSD_ID in $(ls /var/lib/ceph/osd | sed 's/.*-//'); do
|
for OSD_ID in $(ls /var/lib/ceph/osd | sed 's/.*-//'); do
|
||||||
OSD_PATH="$OSD_PATH_BASE-$OSD_ID/"
|
OSD_PATH="$OSD_PATH_BASE-$OSD_ID/"
|
||||||
OSD_KEYRING="${OSD_PATH%/}/keyring"
|
OSD_KEYRING="${OSD_PATH%/}/keyring"
|
||||||
|
@ -132,6 +132,10 @@ conf:
|
|||||||
failure_domain_by_hostname: "false"
|
failure_domain_by_hostname: "false"
|
||||||
failure_domain_name: "false"
|
failure_domain_name: "false"
|
||||||
|
|
||||||
|
# NOTE(supamatt): Add a configurable option to reset the past interval time of a PG.
|
||||||
|
# This works around an open bug in Ceph Luminous releases: https://tracker.ceph.com/issues/21142
|
||||||
|
osd_pg_interval_fix: "false"
|
||||||
|
|
||||||
# NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
|
# NOTE(portdirect): for homogeneous clusters the `osd` key can be used to
|
||||||
# define OSD pods that will be deployed across the cluster.
|
# define OSD pods that will be deployed across the cluster.
|
||||||
# when specifying whole disk (/dev/sdf) for journals, ceph-osd chart will create
|
# when specifying whole disk (/dev/sdf) for journals, ceph-osd chart will create
|
||||||
@ -164,6 +168,7 @@ conf:
|
|||||||
# - name: host1.fqdn
|
# - name: host1.fqdn
|
||||||
# conf:
|
# conf:
|
||||||
# storage:
|
# storage:
|
||||||
|
# osd_pg_interval_fix: "true"
|
||||||
# failure_domain_name: "rack1"
|
# failure_domain_name: "rack1"
|
||||||
# osd:
|
# osd:
|
||||||
# - data:
|
# - data:
|
||||||
|
Loading…
Reference in New Issue
Block a user