class CephReportError(Exception):
    """Raised when the daemon versions report cannot be built."""


def _get_daemon_version(daemon_type, daemon_id):
    """Return the 'version' field of `ceph tell <type>.<id> version`.

    :param daemon_type: daemon kind as used on the ceph CLI ('osd', 'mon')
    :param daemon_id: daemon identifier as listed by `ceph node ls`
    :returns: value of the 'version' key in the command's JSON output
    :raises: CephReportError if the command fails or its output cannot
             be decoded/parsed
    """
    daemon = "{}.{}".format(daemon_type, daemon_id)
    try:
        output = check_output(
            ['ceph', 'tell', daemon, 'version']).decode('UTF-8')
        return json.loads(output)['version']
    except (CalledProcessError, ValueError, KeyError) as e:
        # ValueError covers both JSON and unicode decoding errors.
        # The action is flagged as failed for every daemon type, so an
        # OSD failure is reported the same way as a MON failure.
        action_fail(str(e))
        raise CephReportError(
            "Getting {} version fail".format(daemon)) from e


def get_versions_report():
    """
    Return a mapping of hosts and their related ceph daemon versions.

    The hosts and daemons are read from `ceph node ls`; each OSD and MON
    daemon is then queried with `ceph tell <daemon> version`. OSD hosts
    are added to the report before MON hosts, and a host running both
    kinds of daemon gets a single entry listing all of its versions.

    :returns: JSON string (indent=4) mapping each host name to the list
              of version strings of its OSD and MON daemons
    :raises: CephReportError if a ceph command fails or returns output
             that cannot be parsed. action_fail() is called with the
             underlying error before the exception is raised.
    """
    try:
        output = check_output(['ceph', 'node', 'ls']).decode('UTF-8')
        nodes_list = json.loads(output)
    except (CalledProcessError, ValueError) as e:
        action_fail(str(e))
        raise CephReportError("Getting nodes list fail") from e

    report = dict()
    # Keep the osd-then-mon order: it determines the key order of the
    # JSON document returned to the operator.
    for daemon_type in ('osd', 'mon'):
        # A daemon type absent from the listing contributes no hosts.
        for host, daemons in nodes_list.get(daemon_type, {}).items():
            versions = report.setdefault(host, [])
            for daemon_id in daemons:
                versions.append(_get_daemon_version(daemon_type, daemon_id))

    return json.dumps(report, indent=4)
def get_daemons_versions():
    """
    Uses CLI to get the versions of the daemons connected to the cluster

    Runs `ceph versions` and parses the keys of the 'overall' field of
    its JSON output. Each key is a version banner such as
    'ceph version 16.2.7 (<sha1>) pacific (stable)'; only the dotted
    numeric part is kept, so a suffixed version like '17.0.0-123-gabc'
    is reported as (17, 0, 0).

    :returns: set of tuples of integers, one tuple per distinct version
              encountered in the cluster
    :raises: UnknownError if the command fails or its output cannot be
             parsed
    """
    # Imported locally so this function does not depend on the module's
    # import block.
    import re

    try:
        raw_versions = subprocess.check_output(
            ['ceph', 'versions']).decode('UTF-8')
    except subprocess.CalledProcessError as e:
        raise UnknownError(
            "UNKNOWN: could not determine OSDs versions, error: {}".format(e))

    try:
        overall = json.loads(raw_versions)['overall']
    except (ValueError, KeyError, TypeError) as e:
        # Report unusable output as UNKNOWN rather than as a traceback.
        raise UnknownError(
            "UNKNOWN: could not parse ceph versions output, "
            "error: {!r}".format(e))

    daemons_versions = set()
    for banner in overall:
        match = re.match(r'ceph version (\d+(?:\.\d+)*)', banner)
        if match is None:
            raise UnknownError(
                "UNKNOWN: unrecognised ceph version string: "
                "{}".format(banner))
        daemons_versions.add(
            tuple(int(part) for part in match.group(1).split('.')))
    return daemons_versions
status_critical = False + # if it is just --check_daemons_versions_consistency, + # deal with it and ignore overall health + if args.check_daemons_versions_consistency: + daemons_versions = get_daemons_versions() + # we check that the osds have same versions + num_of_versions = len(daemons_versions) + if num_of_versions == 1: + message_ok = "OK: All versions alligned" + return message_ok + else: + # version diverged + # we check if major release are the same + # by parsing version number in the daemon_version set + # and keeping major version number or coverting the minor + # version number if major version is 0 + num_of_releases = set(map(lambda x: x[0], daemons_versions)) + if len(num_of_releases) == 1: + msg = 'WARNING: Components minor versions diverged.' + 'Run get-versions-report to know more' + raise WarnError(msg) + else: + # Releases diverged + major, _minor, _patch = get_ceph_version() + release_versions_diff = list(map(lambda x: major - x, + num_of_releases)) + if max(release_versions_diff) >= 2: + msg = "CRITICAL: A component is " \ + "{} version behind osd leader" \ + ". Run get-versions-report to know more".format( + max(release_versions_diff)) + raise CriticalError(msg) + if min(release_versions_diff) <= -1: + msg = "CRITICAL: A component is " \ + "{} version ahead osd leader" \ + ". Run get-versions-report to know more".format( + abs(min(release_versions_diff))) + raise CriticalError(msg) + if max(release_versions_diff) == 1: + msg = "WARNING: A component is " \ + "{} version behind osd leader" \ + ". 
Run get-versions-report to know more".format( + max(release_versions_diff)) + raise WarnError(msg) + if args.status_file: check_file_freshness(args.status_file) with open(args.status_file) as f: @@ -287,6 +357,11 @@ def parse_args(args): dest='check_num_osds', default=False, action='store_true', help="Check whether all OSDs are up and in") + parser.add_argument('--check_daemons_versions_consistency', + dest='check_daemons_versions_consistency', + default=False, + action='store_true', + help="Check all OSDs versions") return parser.parse_args(args) diff --git a/hooks/ceph_hooks.py b/hooks/ceph_hooks.py index 826c5d4b..a6cd1e32 100755 --- a/hooks/ceph_hooks.py +++ b/hooks/ceph_hooks.py @@ -1211,6 +1211,14 @@ def update_nrpe_config(): description='Check whether all OSDs are up and in', check_cmd=check_cmd ) + if is_leader(): + check_cmd = 'check_ceph_status.py -f {}' \ + ' --check_daemons_versions'.format(STATUS_FILE) + nrpe_setup.add_check( + shortname='ceph_daemons_versions', + description='Check wheter all ceph daemons versions are alligned', + check_cmd=check_cmd + ) nrpe_setup.write() diff --git a/unit_tests/ceph_ls_node.json b/unit_tests/ceph_ls_node.json new file mode 100644 index 00000000..556cf2e3 --- /dev/null +++ b/unit_tests/ceph_ls_node.json @@ -0,0 +1,35 @@ +{ + "mon": { + "juju-c8b0a2-3-lxd-0": [ + "juju-c8b0a2-3-lxd-0" + ], + "juju-c8b0a2-4-lxd-0": [ + "juju-c8b0a2-4-lxd-0" + ], + "juju-c8b0a2-5-lxd-0": [ + "juju-c8b0a2-5-lxd-0" + ] + }, + "osd": { + "aware-bee": [ + 1 + ], + "grand-ape": [ + 0 + ], + "lucky-muskox": [ + 2 + ] + }, + "mgr": { + "juju-c8b0a2-3-lxd-0": [ + "juju-c8b0a2-3-lxd-0" + ], + "juju-c8b0a2-4-lxd-0": [ + "juju-c8b0a2-4-lxd-0" + ], + "juju-c8b0a2-5-lxd-0": [ + "juju-c8b0a2-5-lxd-0" + ] + } +} diff --git a/unit_tests/ceph_versions_alligned.json b/unit_tests/ceph_versions_alligned.json new file mode 100644 index 00000000..3acae499 --- /dev/null +++ b/unit_tests/ceph_versions_alligned.json @@ -0,0 +1,15 @@ +{ + "mon": { + "ceph 
version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3 + }, + "mgr": { + "ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3 + }, + "osd": { + "ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 2 + }, + "mds": {}, + "overall": { + "ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 8 + } +} diff --git a/unit_tests/ceph_versions_diverged.json b/unit_tests/ceph_versions_diverged.json new file mode 100644 index 00000000..4dd5c5af --- /dev/null +++ b/unit_tests/ceph_versions_diverged.json @@ -0,0 +1,19 @@ +{ + "mon": { + "ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 1, + "ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 2 + }, + "mgr": { + "ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 3 + }, + "osd": { + "ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3, + "ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 2 + }, + "mds": {}, + "overall": { + "ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 4, + "ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3, + "ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 4 + } +} diff --git a/unit_tests/test_actions_mon.py b/unit_tests/test_actions_mon.py index edbb4561..ff54db0f 100644 --- a/unit_tests/test_actions_mon.py +++ b/unit_tests/test_actions_mon.py @@ -13,6 +13,7 @@ import json import sys import unittest.mock as mock +from subprocess import CalledProcessError from test_utils import CharmTestCase @@ -53,6 +54,45 @@ class OpsTestCase(CharmTestCase): cmd = ['ceph', 'health'] self.check_output.assert_called_once_with(cmd) + def test_get_version_report_ok(self): + def _call_rslt(): + with open('unit_tests/ceph_ls_node.json') as f: + tree = f.read() 
+ yield tree.encode('UTF-8') + while True: + yield ('{' + ' "version": "16.2.7",' + ' "release": "pacific",' + ' "release_type": "stable"' + '}').encode('UTF-8') + self.check_output.side_effect = _call_rslt() + result = actions.get_versions_report() + self.assertEqual('{\n' + ' "aware-bee": [\n' + ' "16.2.7"\n' + ' ],\n' + ' "grand-ape": [\n' + ' "16.2.7"\n' + ' ],\n' + ' "lucky-muskox": [\n' + ' "16.2.7"\n' + ' ],\n' + ' "juju-c8b0a2-3-lxd-0": [\n' + ' "16.2.7"\n' + ' ],\n' + ' "juju-c8b0a2-4-lxd-0": [\n' + ' "16.2.7"\n' + ' ],\n' + ' "juju-c8b0a2-5-lxd-0": [\n' + ' "16.2.7"\n' + ' ]\n' + '}', result) + + def test_get_version_report_fail(self): + self.check_output.side_effect = CalledProcessError(1, 'ceph node ls') + self.assertRaises(actions.CephReportError, + lambda: actions.get_versions_report()) + @mock.patch('socket.gethostname') def test_get_quorum_status(self, mock_hostname): mock_hostname.return_value = 'mockhost' diff --git a/unit_tests/test_check_ceph_status.py b/unit_tests/test_check_ceph_status.py index 5342ce55..e6984884 100644 --- a/unit_tests/test_check_ceph_status.py +++ b/unit_tests/test_check_ceph_status.py @@ -17,6 +17,7 @@ import os import sys from unittest.mock import patch +from subprocess import CalledProcessError # import the module we want to test os.sys.path.insert(1, os.path.join(sys.path[0], 'files/nagios')) @@ -25,6 +26,90 @@ import check_ceph_status @patch('subprocess.check_output') class NagiosTestCase(unittest.TestCase): + def test_get_daemons_versions_alligned(self, mock_subprocess): + with open('unit_tests/ceph_versions_alligned.json', 'rb') as f: + mock_subprocess.return_value = f.read() + osds_versions = check_ceph_status.get_daemons_versions() + self.assertEqual(osds_versions, set([(16, 2, 7)])) + + def test_get_daemons_versions_diverged(self, mock_subprocess): + with open('unit_tests/ceph_versions_diverged.json', 'rb') as f: + mock_subprocess.return_value = f.read() + osds_versions = check_ceph_status.get_daemons_versions() + 
self.assertEqual(osds_versions, set([(16, 2, 7), (17, 2, 0), + (15, 2, 16)])) + + def test_get_daemons_versions_exeption(self, mock_subprocess): + mock_subprocess.side_effect = CalledProcessError(1, 'ceph versions') + self.assertRaises(check_ceph_status.UnknownError, + lambda: check_ceph_status.get_daemons_versions()) + + # Version Alligned + @patch('check_ceph_status.get_daemons_versions') + def test_versions_alligned(self, mock_daemons_versions, mock_subprocess): + mock_subprocess.return_value = 'ceph version 16.2.7 ' \ + '(dd0603118f56ab514f133c8d2e3adfc983942503)'.encode('UTF-8') + mock_daemons_versions.return_value = set([(16, 2, 7)]) + args = check_ceph_status.parse_args([ + '--check_daemons_versions_consistency']) + check_output = check_ceph_status.check_ceph_status(args) + self.assertRegex(check_output, r"^OK: All versions alligned$") + + # Minor version diverged + @patch('check_ceph_status.get_daemons_versions') + def test_min_versions_diverged(self, mock_daemons_versions, + mock_subprocess): + mock_subprocess.return_value = 'ceph version 16.2.7 ' \ + '(dd0603118f56ab514f133c8d2e3adfc983942503)'.encode('UTF-8') + mock_daemons_versions.return_value = set([(16, 2, 7), (16, 1, 7)]) + args = check_ceph_status.parse_args([ + '--check_daemons_versions_consistency']) + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Major version ahead + @patch('check_ceph_status.get_daemons_versions') + def test_one_version_ahead(self, mock_daemons_versions, mock_subprocess): + mock_subprocess.return_value = 'ceph version 16.2.7 ' \ + '(dd0603118f56ab514f133c8d2e3adfc983942503)'.encode('UTF-8') + mock_daemons_versions.return_value = set([(16, 2, 7), (17, 2, 0)]) + args = check_ceph_status.parse_args([ + '--check_daemons_versions_consistency']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Two major version ahead + 
@patch('check_ceph_status.get_daemons_versions') + def test_two_version_ahead(self, mock_daemons_versions, mock_subprocess): + mock_subprocess.return_value = 'ceph version 15.2.16 ' \ + '(d46a73d6d0a67a79558054a3a5a72cb561724974)'.encode('UTF-8') + mock_daemons_versions.return_value = set([(15, 2, 16), (17, 2, 0)]) + args = check_ceph_status.parse_args([ + '--check_daemons_versions_consistency']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Major version behind + @patch('check_ceph_status.get_daemons_versions') + def test_version_behind(self, mock_daemons_versions, mock_subprocess): + mock_subprocess.return_value = 'ceph version 16.2.7 ' \ + '(dd0603118f56ab514f133c8d2e3adfc983942503)'.encode('UTF-8') + mock_daemons_versions.return_value = set([(15, 2, 16), (16, 2, 7)]) + args = check_ceph_status.parse_args([ + '--check_daemons_versions_consistency']) + self.assertRaises(check_ceph_status.WarnError, + lambda: check_ceph_status.check_ceph_status(args)) + + # Two major version behind + @patch('check_ceph_status.get_daemons_versions') + def test_two_version_behind(self, mock_daemons_versions, mock_subprocess): + mock_subprocess.return_value = 'ceph version 17.2.0 ' \ + '(43e2e60a7559d3f46c9d53f1ca875fd499a1e35e)'.encode('UTF-8') + mock_daemons_versions.return_value = set([(15, 2, 16), (17, 2, 0)]) + args = check_ceph_status.parse_args([ + '--check_daemons_versions_consistency']) + self.assertRaises(check_ceph_status.CriticalError, + lambda: check_ceph_status.check_ceph_status(args)) def test_get_ceph_version(self, mock_subprocess): mock_subprocess.return_value = 'ceph version 10.2.9 ' \