Fix spurious nagios alerts for radosgw service.
Currently, when the charm tears down the default radosgw daemon in order to make way for per host daemons, it does not remove the nrpe check for the daemon. This PR fixes the issue. It also closes a gap where alerts for the per host daemons are not setup until a hook that happens to call update_nrpe_checks as a side-effect is run. Change-Id: I7621b9671b010a77bb3e94bdd1e80f45274c73e5 Closes-Bug: #1825843
This commit is contained in:
parent
dce33bfb6e
commit
ff90c0f058
@ -255,6 +255,12 @@ def mon_relation(rid=None, unit=None):
|
||||
if systemd_based_radosgw():
|
||||
service_stop('radosgw')
|
||||
service('disable', 'radosgw')
|
||||
# Update the nrpe config. If we wait for the below
|
||||
# to be called elsewhere, there exists a period
|
||||
# where nagios will report the radosgw service as
|
||||
# down, and also not be monitoring the per
|
||||
# host services.
|
||||
update_nrpe_config(checks_to_remove=['radosgw'])
|
||||
|
||||
service('enable', service_name())
|
||||
# NOTE(jamespage):
|
||||
@ -362,13 +368,28 @@ def ha_relation_changed():
|
||||
|
||||
@hooks.hook('nrpe-external-master-relation-joined',
|
||||
'nrpe-external-master-relation-changed')
|
||||
def update_nrpe_config():
|
||||
def update_nrpe_config(checks_to_remove=None):
|
||||
"""
|
||||
Update the checks for the nagios plugin.
|
||||
|
||||
:param checks_to_remove: list of short names of nrpe checks to
|
||||
remove. For example, pass ['radosgw'] to remove the check for
|
||||
the default systemd radosgw service, to make way for per host
|
||||
services.
|
||||
:type checks_to_remove: list
|
||||
|
||||
"""
|
||||
# python-dbus is used by check_upstart_job
|
||||
apt_install('python-dbus')
|
||||
hostname = nrpe.get_nagios_hostname()
|
||||
current_unit = nrpe.get_nagios_unit_name()
|
||||
nrpe_setup = nrpe.NRPE(hostname=hostname)
|
||||
nrpe.copy_nrpe_checks()
|
||||
if checks_to_remove is not None:
|
||||
log("Removing the following nrpe checks: {}".format(checks_to_remove),
|
||||
level=DEBUG)
|
||||
for svc in checks_to_remove:
|
||||
nrpe_setup.remove_check(shortname=svc)
|
||||
nrpe.add_init_service_checks(nrpe_setup, services(), current_unit)
|
||||
nrpe.add_haproxy_checks(nrpe_setup, current_unit)
|
||||
nrpe_setup.write()
|
||||
|
@ -671,3 +671,34 @@ class SlaveMultisiteTests(CephRadosMultisiteTests):
|
||||
self.is_leader.return_value = False
|
||||
ceph_hooks.slave_relation_changed('slave:1', 'rgw/0')
|
||||
self.relation_get.assert_not_called()
|
||||
|
||||
@patch.object(ceph_hooks, 'apt_install')
|
||||
@patch.object(ceph_hooks, 'services')
|
||||
@patch.object(ceph_hooks, 'nrpe')
|
||||
def test_update_nrpe_config(self, nrpe, services, apt_install):
|
||||
# Setup Mocks
|
||||
nrpe.get_nagios_hostname.return_value = 'foo'
|
||||
nrpe.get_nagios_unit_name.return_value = 'bar'
|
||||
nrpe_setup = MagicMock()
|
||||
nrpe.NRPE.return_value = nrpe_setup
|
||||
services.return_value = ['baz', 'qux']
|
||||
|
||||
# Call the routine
|
||||
ceph_hooks.update_nrpe_config()
|
||||
|
||||
# Verify calls
|
||||
apt_install.assert_called()
|
||||
nrpe.get_nagios_hostname.assert_called()
|
||||
nrpe.get_nagios_unit_name.assert_called()
|
||||
nrpe.copy_nrpe_checks.assert_called()
|
||||
nrpe.remove_check.assert_not_called()
|
||||
nrpe.add_init_service_checks.assert_called_with(nrpe_setup,
|
||||
['baz', 'qux'], 'bar')
|
||||
nrpe.add_haproxy_checks.assert_called_with(nrpe_setup, 'bar')
|
||||
nrpe_setup.write.assert_called()
|
||||
|
||||
# Verify that remove_check is called appropriately if we pass
|
||||
# checks_to_remove
|
||||
ceph_hooks.update_nrpe_config(checks_to_remove=['quux', 'quuux'])
|
||||
nrpe_setup.remove_check.assert_has_calls([call(shortname='quux'),
|
||||
call(shortname='quuux')])
|
||||
|
Loading…
Reference in New Issue
Block a user