From 00fea975e2b9c8f225645fbc255a6e5af478e496 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Tue, 10 Oct 2023 14:29:06 +0300 Subject: [PATCH] Handle deprecated "cpu_util" metric The "cpu_util" metric was deprecated a few years ago. We'll obtain the same result by converting the cumulative cpu time to a percentage, leveraging the rate of change aggregation. Change-Id: I18fe0de6f74c785e674faceea0c48f44055818fe --- doc/source/configuration/configuring.rst | 2 +- .../contributor/plugin/strategy-plugin.rst | 2 +- .../strategies/basic-server-consolidation.rst | 3 +- .../strategies/vm_workload_consolidation.rst | 3 +- .../strategies/workload-stabilization.rst | 17 +++++----- doc/source/strategies/workload_balance.rst | 24 +++++++------- .../decision_engine/datasources/gnocchi.py | 32 ++++++++++++++++++- .../strategy/strategies/workload_balance.py | 2 +- .../datasources/test_gnocchi_helper.py | 20 ++++++++++-- 9 files changed, 74 insertions(+), 31 deletions(-) diff --git a/doc/source/configuration/configuring.rst b/doc/source/configuration/configuring.rst index 16cdcde97..9292fb398 100644 --- a/doc/source/configuration/configuring.rst +++ b/doc/source/configuration/configuring.rst @@ -372,7 +372,7 @@ You can configure and install Ceilometer by following the documentation below : #. https://docs.openstack.org/ceilometer/latest The built-in strategy 'basic_consolidation' provided by watcher requires -"**compute.node.cpu.percent**" and "**cpu_util**" measurements to be collected +"**compute.node.cpu.percent**" and "**cpu**" measurements to be collected by Ceilometer. The measurements available depend on the hypervisors that OpenStack manages on the specific implementation. 
diff --git a/doc/source/contributor/plugin/strategy-plugin.rst b/doc/source/contributor/plugin/strategy-plugin.rst index 8d3942b40..72a22f7a6 100644 --- a/doc/source/contributor/plugin/strategy-plugin.rst +++ b/doc/source/contributor/plugin/strategy-plugin.rst @@ -300,6 +300,6 @@ Using that you can now query the values for that specific metric: .. code-block:: py avg_meter = self.datasource_backend.statistic_aggregation( - instance.uuid, 'cpu_util', self.periods['instance'], + instance.uuid, 'instance_cpu_usage', self.periods['instance'], self.granularity, aggregation=self.aggregation_method['instance']) diff --git a/doc/source/strategies/basic-server-consolidation.rst b/doc/source/strategies/basic-server-consolidation.rst index 5ea87e48f..1192ff325 100644 --- a/doc/source/strategies/basic-server-consolidation.rst +++ b/doc/source/strategies/basic-server-consolidation.rst @@ -26,8 +26,7 @@ metric service name plugins comment ``compute_monitors`` option to ``cpu.virt_driver`` in the nova.conf. -``cpu_util`` ceilometer_ none cpu_util has been removed - since Stein. +``cpu`` ceilometer_ none ============================ ============ ======= =========================== .. _ceilometer: https://docs.openstack.org/ceilometer/latest/admin/telemetry-measurements.html#openstack-compute diff --git a/doc/source/strategies/vm_workload_consolidation.rst b/doc/source/strategies/vm_workload_consolidation.rst index 7f2bb9a2b..41bab95de 100644 --- a/doc/source/strategies/vm_workload_consolidation.rst +++ b/doc/source/strategies/vm_workload_consolidation.rst @@ -22,8 +22,7 @@ The *vm_workload_consolidation* strategy requires the following metrics: ============================ ============ ======= ========================= metric service name plugins comment ============================ ============ ======= ========================= -``cpu_util`` ceilometer_ none cpu_util has been removed - since Stein. 
+``cpu`` ceilometer_ none ``memory.resident`` ceilometer_ none ``memory`` ceilometer_ none ``disk.root.size`` ceilometer_ none diff --git a/doc/source/strategies/workload-stabilization.rst b/doc/source/strategies/workload-stabilization.rst index 9134d08ea..c2c341b67 100644 --- a/doc/source/strategies/workload-stabilization.rst +++ b/doc/source/strategies/workload-stabilization.rst @@ -27,9 +27,8 @@ metric service name plugins comment to ``cpu.virt_driver`` in the nova.conf. ``hardware.memory.used`` ceilometer_ SNMP_ -``cpu_util`` ceilometer_ none cpu_util has been removed - since Stein. -``memory.resident`` ceilometer_ none +``cpu`` ceilometer_ none +``instance_ram_usage`` ceilometer_ none ============================ ============ ======= ============================= .. _ceilometer: https://docs.openstack.org/ceilometer/latest/admin/telemetry-measurements.html#openstack-compute @@ -107,10 +106,10 @@ parameter type default Value description period of all received ones. ==================== ====== ===================== ============================= -.. |metrics| replace:: ["cpu_util", "memory.resident"] -.. |thresholds| replace:: {"cpu_util": 0.2, "memory.resident": 0.2} -.. |weights| replace:: {"cpu_util_weight": 1.0, "memory.resident_weight": 1.0} -.. |instance_metrics| replace:: {"cpu_util": "compute.node.cpu.percent", "memory.resident": "hardware.memory.used"} +.. |metrics| replace:: ["instance_cpu_usage", "instance_ram_usage"] +.. |thresholds| replace:: {"instance_cpu_usage": 0.2, "instance_ram_usage": 0.2} +.. |weights| replace:: {"instance_cpu_usage_weight": 1.0, "instance_ram_usage_weight": 1.0} +.. |instance_metrics| replace:: {"instance_cpu_usage": "compute.node.cpu.percent", "instance_ram_usage": "hardware.memory.used"} .. |periods| replace:: {"instance": 720, "node": 600} Efficacy Indicator @@ -136,8 +135,8 @@ How to use it ? 
at1 workload_balancing --strategy workload_stabilization $ openstack optimize audit create -a at1 \ - -p thresholds='{"memory.resident": 0.05}' \ - -p metrics='["memory.resident"]' + -p thresholds='{"instance_ram_usage": 0.05}' \ + -p metrics='["instance_ram_usage"]' External Links -------------- diff --git a/doc/source/strategies/workload_balance.rst b/doc/source/strategies/workload_balance.rst index fac3c82ef..8e7e3498b 100644 --- a/doc/source/strategies/workload_balance.rst +++ b/doc/source/strategies/workload_balance.rst @@ -24,8 +24,7 @@ The *workload_balance* strategy requires the following metrics: ======================= ============ ======= ========================= metric service name plugins comment ======================= ============ ======= ========================= -``cpu_util`` ceilometer_ none cpu_util has been removed - since Stein. +``cpu`` ceilometer_ none ``memory.resident`` ceilometer_ none ======================= ============ ======= ========================= @@ -65,15 +64,16 @@ Configuration Strategy parameters are: -============== ====== ============= ==================================== -parameter type default Value description -============== ====== ============= ==================================== -``metrics`` String 'cpu_util' Workload balance base on cpu or ram - utilization. choice: ['cpu_util', - 'memory.resident'] -``threshold`` Number 25.0 Workload threshold for migration -``period`` Number 300 Aggregate time period of ceilometer -============== ====== ============= ==================================== +============== ====== ==================== ==================================== +parameter type default Value description +============== ====== ==================== ==================================== +``metrics`` String 'instance_cpu_usage' Workload balance based on cpu or ram + utilization. 
Choices: + ['instance_cpu_usage', + 'instance_ram_usage'] +``threshold`` Number 25.0 Workload threshold for migration +``period`` Number 300 Aggregate time period of ceilometer +============== ====== ==================== ==================================== Efficacy Indicator ------------------ @@ -95,7 +95,7 @@ How to use it ? at1 workload_balancing --strategy workload_balance $ openstack optimize audit create -a at1 -p threshold=26.0 \ - -p period=310 -p metrics=cpu_util + -p period=310 -p metrics=instance_cpu_usage External Links -------------- diff --git a/watcher/decision_engine/datasources/gnocchi.py b/watcher/decision_engine/datasources/gnocchi.py index 6a52845ca..e08fc152e 100644 --- a/watcher/decision_engine/datasources/gnocchi.py +++ b/watcher/decision_engine/datasources/gnocchi.py @@ -38,7 +38,7 @@ class GnocchiHelper(base.DataSourceBase): host_inlet_temp='hardware.ipmi.node.temperature', host_airflow='hardware.ipmi.node.airflow', host_power='hardware.ipmi.node.power', - instance_cpu_usage='cpu_util', + instance_cpu_usage='cpu', instance_ram_usage='memory.resident', instance_ram_allocated='memory', instance_l3_cache_usage='cpu_l3_cache', @@ -93,6 +93,25 @@ class GnocchiHelper(base.DataSourceBase): resource_id = resources[0]['id'] + if meter_name == "instance_cpu_usage": + if resource_type != "instance": + LOG.warning("Unsupported resource type for metric " + "'instance_cpu_usage': ", resource_type) + return + + # The "cpu_util" gauge (percentage) metric has been removed. + # We're going to obtain the same result by using the rate of change + # aggregate operation. + if aggregate not in ("mean", "rate:mean"): + LOG.warning("Unsupported aggregate for instance_cpu_usage " + "metric: %s. " + "Supported aggregates: mean, rate:mean ", + aggregate) + return + + # TODO(lpetrut): consider supporting other aggregates. 
+ aggregate = "rate:mean" + raw_kwargs = dict( metric=meter, start=start_time, @@ -117,6 +136,17 @@ class GnocchiHelper(base.DataSourceBase): # Airflow from hardware.ipmi.node.airflow is reported as # 1/10 th of actual CFM return_value *= 10 + if meter_name == "instance_cpu_usage": + # "rate:mean" can return negative values for migrated vms. + return_value = max(0, return_value) + + # We're converting the cumulative cpu time (ns) to cpu usage + # percentage. + vcpus = resource.vcpus + if not vcpus: + LOG.warning("instance vcpu count not set, assuming 1") + vcpus = 1 + return_value *= 100 / (granularity * 10e+8) / vcpus return return_value diff --git a/watcher/decision_engine/strategy/strategies/workload_balance.py b/watcher/decision_engine/strategy/strategies/workload_balance.py index d20b10c6a..2bc51f3f5 100644 --- a/watcher/decision_engine/strategy/strategies/workload_balance.py +++ b/watcher/decision_engine/strategy/strategies/workload_balance.py @@ -295,7 +295,7 @@ class WorkloadBalance(base.WorkloadStabilizationBaseStrategy): self.threshold) return self.solution - # choose the server with largest cpu_util + # choose the server with largest cpu usage source_nodes = sorted(source_nodes, reverse=True, key=lambda x: (x[self._meter])) diff --git a/watcher/tests/decision_engine/datasources/test_gnocchi_helper.py b/watcher/tests/decision_engine/datasources/test_gnocchi_helper.py index bca1ca49c..04364e74c 100644 --- a/watcher/tests/decision_engine/datasources/test_gnocchi_helper.py +++ b/watcher/tests/decision_engine/datasources/test_gnocchi_helper.py @@ -40,17 +40,25 @@ class TestGnocchiHelper(base.BaseTestCase): self.addCleanup(stat_agg_patcher.stop) def test_gnocchi_statistic_aggregation(self, mock_gnocchi): + vcpus = 2 + mock_instance = mock.Mock( + id='16a86790-327a-45f9-bc82-45839f062fdc', + vcpus=vcpus) + gnocchi = mock.MagicMock() + # cpu time rate of change (ns) + mock_rate_measure = 360 * 10e+8 * vcpus * 5.5 / 100 expected_result = 5.5 - expected_measures = 
[["2017-02-02T09:00:00.000000", 360, 5.5]] + expected_measures = [ + ["2017-02-02T09:00:00.000000", 360, mock_rate_measure]] gnocchi.metric.get_measures.return_value = expected_measures mock_gnocchi.return_value = gnocchi helper = gnocchi_helper.GnocchiHelper() result = helper.statistic_aggregation( - resource=mock.Mock(id='16a86790-327a-45f9-bc82-45839f062fdc'), + resource=mock_instance, resource_type='instance', meter_name='instance_cpu_usage', period=300, @@ -59,6 +67,14 @@ class TestGnocchiHelper(base.BaseTestCase): ) self.assertEqual(expected_result, result) + gnocchi.metric.get_measures.assert_called_once_with( + metric="cpu", + start=mock.ANY, + stop=mock.ANY, + resource_id=mock_instance.uuid, + granularity=360, + aggregation="rate:mean") + def test_gnocchi_statistic_series(self, mock_gnocchi): gnocchi = mock.MagicMock() expected_result = {