From ff90c0f058073292739bab33309d700d9bcfc4c6 Mon Sep 17 00:00:00 2001 From: Pete Vander Giessen Date: Wed, 24 Apr 2019 09:39:26 -0400 Subject: [PATCH] Fix spurious nagios alerts for radosgw service. Currently, when the charm tears down the default radosgw daemon in order to make way for per host daemons, it does not remove the nrpe check for the daemon. This PR fixes the issue. It also closes a gap where alerts for the per host daemons are not setup until a hook that happens to call update_nrpe_checks as a side-effect is run. Change-Id: I7621b9671b010a77bb3e94bdd1e80f45274c73e5 Closes-Bug: #1825843 --- hooks/hooks.py | 23 ++++++++++++++++++++++- unit_tests/test_hooks.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/hooks/hooks.py b/hooks/hooks.py index 5cc7b9f3..7db0e240 100755 --- a/hooks/hooks.py +++ b/hooks/hooks.py @@ -255,6 +255,12 @@ def mon_relation(rid=None, unit=None): if systemd_based_radosgw(): service_stop('radosgw') service('disable', 'radosgw') + # Update the nrpe config. If we wait for the below + # to be called elsewhere, there exists a period + # where nagios will report the radosgw service as + # down, and also not be monitoring the per + # host services. + update_nrpe_config(checks_to_remove=['radosgw']) service('enable', service_name()) # NOTE(jamespage): @@ -362,13 +368,28 @@ def ha_relation_changed(): @hooks.hook('nrpe-external-master-relation-joined', 'nrpe-external-master-relation-changed') -def update_nrpe_config(): +def update_nrpe_config(checks_to_remove=None): + """ + Update the checks for the nagios plugin. + + :param checks_to_remove: list of short names of nrpe checks to + remove. For example, pass ['radosgw'] to remove the check for + the default systemd radosgw service, to make way for per host + services. + :type checks_to_remove: list + + """ # python-dbus is used by check_upstart_job apt_install('python-dbus') hostname = nrpe.get_nagios_hostname() current_unit = nrpe.get_nagios_unit_name() nrpe_setup = nrpe.NRPE(hostname=hostname) nrpe.copy_nrpe_checks() + if checks_to_remove is not None: + log("Removing the following nrpe checks: {}".format(checks_to_remove), + level=DEBUG) + for svc in checks_to_remove: + nrpe_setup.remove_check(shortname=svc) nrpe.add_init_service_checks(nrpe_setup, services(), current_unit) nrpe.add_haproxy_checks(nrpe_setup, current_unit) nrpe_setup.write() diff --git a/unit_tests/test_hooks.py b/unit_tests/test_hooks.py index 3051572b..cc046b83 100644 --- a/unit_tests/test_hooks.py +++ b/unit_tests/test_hooks.py @@ -671,3 +671,34 @@ class SlaveMultisiteTests(CephRadosMultisiteTests): self.is_leader.return_value = False ceph_hooks.slave_relation_changed('slave:1', 'rgw/0') self.relation_get.assert_not_called() + + @patch.object(ceph_hooks, 'apt_install') + @patch.object(ceph_hooks, 'services') + @patch.object(ceph_hooks, 'nrpe') + def test_update_nrpe_config(self, nrpe, services, apt_install): + # Setup Mocks + nrpe.get_nagios_hostname.return_value = 'foo' + nrpe.get_nagios_unit_name.return_value = 'bar' + nrpe_setup = MagicMock() + nrpe.NRPE.return_value = nrpe_setup + services.return_value = ['baz', 'qux'] + + # Call the routine + ceph_hooks.update_nrpe_config() + + # Verify calls + apt_install.assert_called() + nrpe.get_nagios_hostname.assert_called() + nrpe.get_nagios_unit_name.assert_called() + nrpe.copy_nrpe_checks.assert_called() + nrpe.remove_check.assert_not_called() + nrpe.add_init_service_checks.assert_called_with(nrpe_setup, + ['baz', 'qux'], 'bar') + nrpe.add_haproxy_checks.assert_called_with(nrpe_setup, 'bar') + nrpe_setup.write.assert_called() + + # Verify that remove_check is called appropriately if we pass + # checks_to_remove + ceph_hooks.update_nrpe_config(checks_to_remove=['quux', 'quuux']) + nrpe_setup.remove_check.assert_has_calls([call(shortname='quux'), + call(shortname='quuux')])