Change noout to be a CRITICAL alert instead of WARNING.

When the noout flag is set in a Ceph cluster, the Nagios check
currently marks this as a warning (like Ceph itself). However,
setting it to CRITICAL will raise visibility, and indicate to the
operator that this should be a temporary state.

Closes-Bug: 1926551
Change-Id: I9831cfea3f63e82fbc8bfebc938a9795b69111c7
This commit is contained in:
Garrett Thompson
2021-09-07 14:28:30 -06:00
parent 82743ab7e5
commit 375a1d0056
4 changed files with 336 additions and 0 deletions

View File

@@ -200,6 +200,10 @@ def check_ceph_status(args):
if args.raise_nodeepscrub:
if re.match("nodeep-scrub flag", status):
status_critical = True
# Check if noout is set
if re.match("noout flag", status):
status_critical = True
status_msg.append("noout flag is set")
if overall_status == 'HEALTH_CRITICAL' or \
overall_status == 'HEALTH_ERR':
# HEALTH_ERR, report critical

206
unit_tests/ceph_noout.json Normal file
View File

@@ -0,0 +1,206 @@
{
"health": {
"health": {
"health_services": [
{
"mons": [
{
"name": "juju-c62a41-21-lxd-0",
"kb_total": 334602320,
"kb_used": 2127960,
"kb_avail": 315454468,
"avail_percent": 94,
"last_updated": "2018-11-08 09:47:09.932189",
"store_stats": {
"bytes_total": 34880542,
"bytes_sst": 0,
"bytes_log": 1647123,
"bytes_misc": 33233419,
"last_updated": "0.000000"
},
"health": "HEALTH_WARN"
},
{
"name": "juju-c62a41-24-lxd-0",
"kb_total": 334602320,
"kb_used": 2128116,
"kb_avail": 315454312,
"avail_percent": 94,
"last_updated": "2018-11-08 09:47:16.418007",
"store_stats": {
"bytes_total": 36811676,
"bytes_sst": 0,
"bytes_log": 3574345,
"bytes_misc": 33237331,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
},
{
"name": "juju-c62a41-25-lxd-0",
"kb_total": 334602320,
"kb_used": 2128860,
"kb_avail": 315453568,
"avail_percent": 94,
"last_updated": "2018-11-08 09:47:21.198816",
"store_stats": {
"bytes_total": 37388424,
"bytes_sst": 0,
"bytes_log": 4151569,
"bytes_misc": 33236855,
"last_updated": "0.000000"
},
"health": "HEALTH_OK"
}
]
}
]
},
"timechecks": {
"epoch": 14,
"round": 4480,
"round_status": "finished",
"mons": [
{
"name": "juju-c62a41-21-lxd-0",
"skew": 0.000000,
"latency": 0.000000,
"health": "HEALTH_OK"
},
{
"name": "juju-c62a41-24-lxd-0",
"skew": 0.000282,
"latency": 0.000989,
"health": "HEALTH_OK"
},
{
"name": "juju-c62a41-25-lxd-0",
"skew": -0.001223,
"latency": 0.000776,
"health": "HEALTH_OK"
}
]
},
"summary": [
{
"severity": "HEALTH_WARN",
"summary": "noout flag(s) set"
},
{
"severity": "HEALTH_WARN",
"summary": "19 pgs backfill_wait"
},
{
"severity": "HEALTH_WARN",
"summary": "4 pgs backfilling"
},
{
"severity": "HEALTH_WARN",
"summary": "1 pgs peering"
},
{
"severity": "HEALTH_WARN",
"summary": "24 pgs stuck unclean"
},
{
"severity": "HEALTH_WARN",
"summary": "recovery 17386\/112794 objects misplaced (15.414%)"
},
{
"severity": "HEALTH_WARN",
"summary": "pool pool1 has many more objects per pg than average (too few pgs?)"
},
{
"severity": "HEALTH_WARN",
"summary": "nodeep-scrub flag(s) set"
}
],
"overall_status": "HEALTH_WARN",
"detail": []
},
"fsid": "66af7af5-2f60-4e0e-94dc-49f49bd37284",
"election_epoch": 14,
"quorum": [
0,
1,
2
],
"quorum_names": [
"juju-c62a41-21-lxd-0",
"juju-c62a41-24-lxd-0",
"juju-c62a41-25-lxd-0"
],
"monmap": {
"epoch": 2,
"fsid": "66af7af5-2f60-4e0e-94dc-49f49bd37284",
"modified": "2018-10-31 15:37:56.902830",
"created": "2018-10-31 15:37:40.288870",
"mons": [
{
"rank": 0,
"name": "juju-c62a41-21-lxd-0",
"addr": "100.84.195.4:6789\/0"
},
{
"rank": 1,
"name": "juju-c62a41-24-lxd-0",
"addr": "100.84.196.4:6789\/0"
},
{
"rank": 2,
"name": "juju-c62a41-25-lxd-0",
"addr": "100.84.196.5:6789\/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 316,
"num_osds": 48,
"num_up_osds": 48,
"num_in_osds": 48,
"full": false,
"nearfull": false,
"num_remapped_pgs": 22
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 3448
},
{
"state_name": "active+remapped+wait_backfill",
"count": 19
},
{
"state_name": "active+remapped+backfilling",
"count": 4
},
{
"state_name": "peering",
"count": 1
}
],
"version": 141480,
"num_pgs": 3472,
"data_bytes": 157009583781,
"bytes_used": 487185850368,
"bytes_avail": 75282911256576,
"bytes_total": 75770097106944,
"misplaced_objects": 17386,
"misplaced_total": 112794,
"misplaced_ratio": 0.154139,
"recovering_objects_per_sec": 436,
"recovering_bytes_per_sec": 1832614589,
"recovering_keys_per_sec": 0,
"num_objects_recovered": 446,
"num_bytes_recovered": 1870659584,
"num_keys_recovered": 0
},
"fsmap": {
"epoch": 1,
"by_rank": []
}
}

View File

@@ -0,0 +1,102 @@
{
"fsid": "b03a2900-e297-11e8-a7db-00163ed10659",
"health": {
"checks": {
"OSDMAP_FLAGS": {
"severity": "HEALTH_WARN",
"summary": {
"message": "noout flag(s) set"
}
}
},
"status": "HEALTH_WARN"
},
"election_epoch": 5,
"quorum": [
0
],
"quorum_names": [
"juju-460e0f-11"
],
"monmap": {
"epoch": 1,
"fsid": "b03a2900-e297-11e8-a7db-00163ed10659",
"modified": "2018-11-07 14:17:12.324408",
"created": "2018-11-07 14:17:12.324408",
"features": {
"persistent": [
"kraken",
"luminous"
],
"optional": []
},
"mons": [
{
"rank": 0,
"name": "juju-460e0f-11",
"addr": "192.168.100.81:6789/0",
"public_addr": "192.168.100.81:6789/0"
}
]
},
"osdmap": {
"osdmap": {
"epoch": 518,
"num_osds": 9,
"num_up_osds": 9,
"num_in_osds": 9,
"full": false,
"nearfull": false,
"num_remapped_pgs": 0
}
},
"pgmap": {
"pgs_by_state": [
{
"state_name": "active+clean",
"count": 128
}
],
"num_pgs": 128,
"num_pools": 1,
"num_objects": 14896,
"data_bytes": 62440603919,
"bytes_used": 14225776640,
"bytes_avail": 9450938368,
"bytes_total": 23676715008
},
"fsmap": {
"epoch": 1,
"by_rank": []
},
"mgrmap": {
"epoch": 5,
"active_gid": 14097,
"active_name": "juju-460e0f-11",
"active_addr": "192.168.100.81:6800/204",
"available": true,
"standbys": [],
"modules": [
"balancer",
"restful",
"status"
],
"available_modules": [
"balancer",
"dashboard",
"influx",
"localpool",
"prometheus",
"restful",
"selftest",
"status",
"zabbix"
],
"services": {}
},
"servicemap": {
"epoch": 1,
"modified": "0.000000",
"services": {}
}
}

View File

@@ -120,6 +120,17 @@ class NagiosTestCase(unittest.TestCase):
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, pre-luminous, noout
@patch('check_ceph_status.get_ceph_version')
def test_health_crit_noout(self, mock_ceph_version, mock_subprocess):
    # Report a 10.2.9 (pre-luminous) version and replay the captured status
    # document whose health summary contains "noout flag(s) set".
    mock_ceph_version.return_value = [10, 2, 9]
    with open('unit_tests/ceph_noout.json') as fixture:
        status_text = fixture.read()
    mock_subprocess.return_value = status_text.encode('UTF-8')
    args = check_ceph_status.parse_args("")
    # The check is expected to raise CriticalError for this input.
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(args)
# All OK, luminous
@patch('check_ceph_status.get_ceph_version')
def test_health_ok_luminous(self, mock_ceph_version, mock_subprocess):
@@ -209,6 +220,19 @@ class NagiosTestCase(unittest.TestCase):
self.assertRaises(check_ceph_status.CriticalError,
lambda: check_ceph_status.check_ceph_status(args))
# Error, luminous, noout
@patch('check_ceph_status.get_ceph_version')
def test_health_crit_noout_luminous(self,
                                    mock_ceph_version,
                                    mock_subprocess):
    # Report a 12.2.0 (luminous) version and replay the luminous-format
    # status fixture for the noout case.
    mock_ceph_version.return_value = [12, 2, 0]
    with open('unit_tests/ceph_noout_luminous.json') as fixture:
        status_text = fixture.read()
    mock_subprocess.return_value = status_text.encode('UTF-8')
    args = check_ceph_status.parse_args("")
    # The check is expected to raise CriticalError for this input.
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(args)
# Additional Ok, luminous, deepscrub
@patch('check_ceph_status.get_ceph_version')
def test_additional_ok_deepscrub_luminous(self,