ceph-init-wrapper: Detect stuck peering OSDs and restart them
OSDs might become stuck peering. Detect this state and recover from it by restarting the affected OSDs. Closes-bug: 1851287 Change-Id: I2ef1a0e93d38c3d041ee0c5c1e66a4ac42785a68 Signed-off-by: Dan Voiculeasa <dan.voiculeasa@windriver.com>
This commit is contained in:
parent
dcacc409f4
commit
11fd5d9cd4
@ -156,8 +156,9 @@ log_and_restart_blocked_osds ()
|
|||||||
{
|
{
|
||||||
# Log info about the blocked osd daemons and then restart it
|
# Log info about the blocked osd daemons and then restart it
|
||||||
local names=$1
|
local names=$1
|
||||||
|
local message=$2
|
||||||
for name in $names; do
|
for name in $names; do
|
||||||
wlog $name "INFO" "Restarting OSD with blocked operations"
|
wlog $name "INFO" "$message"
|
||||||
${CEPH_SCRIPT} restart $name
|
${CEPH_SCRIPT} restart $name
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
@ -253,6 +254,7 @@ status ()
|
|||||||
erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
|
erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
|
||||||
hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
|
hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
|
||||||
blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
|
blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
|
||||||
|
stuck_peering_procs=`echo "$result" | sort | uniq | awk ' /stuck peering/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
|
||||||
invalid=0
|
invalid=0
|
||||||
host=`hostname`
|
host=`hostname`
|
||||||
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
|
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
|
||||||
@ -267,14 +269,12 @@ status ()
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
log_and_restart_blocked_osds $blocked_ops_procs
|
log_and_restart_blocked_osds "$blocked_ops_procs"\
|
||||||
|
"Restarting OSD with blocked operations"
|
||||||
|
log_and_restart_blocked_osds "$stuck_peering_procs"\
|
||||||
|
"Restarting OSD stuck peering"
|
||||||
log_and_kill_hung_procs $hung_procs
|
log_and_kill_hung_procs $hung_procs
|
||||||
|
|
||||||
hung_procs_text=""
|
|
||||||
for i in $(echo $hung_procs); do
|
|
||||||
hung_procs_text+="$i(process hung) "
|
|
||||||
done
|
|
||||||
|
|
||||||
rm -f $CEPH_STATUS_FAILURE_TEXT_FILE
|
rm -f $CEPH_STATUS_FAILURE_TEXT_FILE
|
||||||
if [ $invalid -eq 0 ]; then
|
if [ $invalid -eq 0 ]; then
|
||||||
text=""
|
text=""
|
||||||
|
Loading…
Reference in New Issue
Block a user