diff --git a/api-ref/source/parameters.yaml b/api-ref/source/parameters.yaml
index c1cf611fa7e9..283742ef57d8 100644
--- a/api-ref/source/parameters.yaml
+++ b/api-ref/source/parameters.yaml
@@ -3902,8 +3902,11 @@ hypervisor_type_body:
   type: string
 hypervisor_uptime:
   description: |
-    The total uptime of the hypervisor and information about average load. Only
-    reported for active hosts where the virt driver supports this feature.
+    The response format of this API depends on the virt driver in use on a
+    given host. The libvirt driver returns the output of the `uptime` command
+    directly, while the z/VM driver returns the `IPL` time. All other drivers
+    always return `null`. Note that this value is cached and updated
+    periodically.
   in: body
   required: true
   type: string
diff --git a/nova/api/openstack/compute/hypervisors.py b/nova/api/openstack/compute/hypervisors.py
index 40ad32deabc3..1e6d6bbed8df 100644
--- a/nova/api/openstack/compute/hypervisors.py
+++ b/nova/api/openstack/compute/hypervisors.py
@@ -96,18 +96,24 @@ class HypervisorsController(wsgi.Controller):
         # The 2.88 microversion also *added* the 'uptime' field to the response
         if detail and api_version_request.is_supported(req, '2.88'):
-            try:
-                hyp_dict['uptime'] = self.host_api.get_host_uptime(
-                    req.environ['nova.context'], hypervisor.host)
-            except (
-                NotImplementedError,
-                exception.ComputeServiceUnavailable,
-                exception.HostMappingNotFound,
-                exception.HostNotFound,
-            ):
-                # Not all virt drivers support this, and it's not generally
-                # possible to get uptime for a down host
-                hyp_dict['uptime'] = None
+            uptime = None
+            if "stats" in hypervisor and "uptime" in hypervisor.stats:
+                uptime = hypervisor.stats.get("uptime")
+            else:
+                try:
+                    uptime = self.host_api.get_host_uptime(
+                        req.environ['nova.context'], hypervisor.host)
+                except (
+                    NotImplementedError,  # only raised in tests
+                    exception.ComputeServiceUnavailable,
+                    exception.HostMappingNotFound,
+                    exception.HostNotFound,
+                ):
+                    # Only the libvirt and z/VM drivers support this, and
+                    # it's not generally possible to get uptime for a down host
+                    pass
+
+            hyp_dict['uptime'] = uptime
 
         if servers:
             hyp_dict['servers'] = [
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 1e96035cd2f7..f4fcf4da180f 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1173,7 +1173,8 @@ class ResourceTracker(object):
                   "used_disk=%(used_disk)sGB "
                   "total_vcpus=%(total_vcpus)s "
                   "used_vcpus=%(used_vcpus)s "
-                  "pci_stats=%(pci_stats)s",
+                  "pci_stats=%(pci_stats)s "
+                  "stats=%(stats)s",
                   {'node': nodename,
                    'phys_ram': cn.memory_mb,
                    'used_ram': cn.memory_mb_used,
@@ -1181,7 +1182,9 @@ class ResourceTracker(object):
                    'used_disk': cn.local_gb_used,
                    'total_vcpus': tcpu,
                    'used_vcpus': ucpu,
-                   'pci_stats': pci_stats})
+                   'pci_stats': pci_stats,
+                   'stats': cn.stats or {}
+                   })
 
     def _resource_change(self, compute_node):
         """Check to see if any resources have changed."""
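For context on the api-ref change above, a sketch of the per-driver values
the v2.88 `uptime` field can carry. The z/VM string mirrors the fixture used
in the zvm driver test later in this patch; the libvirt string is an
illustrative uptime(1) output, and the constant names are invented for the
example:

    # Illustrative values only; real strings vary per host.
    LIBVIRT_UPTIME = ("08:32:11 up 93 days, 18:25, 12 users, "
                      "load average: 0.20, 0.12, 0.14")  # raw uptime(1) output
    ZVM_UPTIME = "IPL at 11/14/17 10:47:44 EST"          # z/VM IPL timestamp
    OTHER_DRIVERS_UPTIME = None                          # rendered as null in JSON
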
diff --git a/nova/compute/stats.py b/nova/compute/stats.py
index e9180ec6d6d8..7dab9cc8f875 100644
--- a/nova/compute/stats.py
+++ b/nova/compute/stats.py
@@ -37,6 +37,12 @@ class Stats(dict):
         if stats is None:
             return
         if isinstance(stats, dict):
+            # Use None as a sentinel to tell the API that the driver
+            # does not support uptime. setdefault() updates the dict
+            # if and only if 'uptime' is not already set, then returns
+            # the value; since we don't need the value here, we simply
+            # discard the result.
+            stats.setdefault('uptime', None)
             self.update(stats)
             return
         raise ValueError(_('Unexpected type adding stats'))
diff --git a/nova/tests/unit/api/openstack/compute/test_hypervisors.py b/nova/tests/unit/api/openstack/compute/test_hypervisors.py
index 6f6f96b39d71..e8a9609c0c3b 100644
--- a/nova/tests/unit/api/openstack/compute/test_hypervisors.py
+++ b/nova/tests/unit/api/openstack/compute/test_hypervisors.py
@@ -47,7 +47,7 @@ TEST_HYPERS = [
         vcpus_used=2,
         memory_mb_used=5 * 1024,
         local_gb_used=125,
-        hypervisor_type="xen",
+        hypervisor_type="qemu",
         hypervisor_version=3,
         hypervisor_hostname="hyper1",
         free_ram_mb=5 * 1024,
@@ -67,7 +67,7 @@ TEST_HYPERS = [
         vcpus_used=2,
         memory_mb_used=5 * 1024,
         local_gb_used=125,
-        hypervisor_type="xen",
+        hypervisor_type="qemu",
         hypervisor_version=3,
         hypervisor_hostname="hyper2",
         free_ram_mb=5 * 1024,
@@ -76,7 +76,8 @@ TEST_HYPERS = [
         running_vms=2,
         cpu_info=CPU_INFO,
         disk_available_least=100,
-        host_ip=netaddr.IPAddress('2.2.2.2'))]
+        host_ip=netaddr.IPAddress('2.2.2.2'),
+        stats={'uptime': 'fake uptime'})]
 
 
 TEST_SERVICES = [
@@ -203,6 +204,11 @@ class HypervisorsTestV21(test.NoDBTestCase):
         del DETAIL_HYPERS_DICTS[1]['host']
         del DETAIL_HYPERS_DICTS[0]['uuid']
         del DETAIL_HYPERS_DICTS[1]['uuid']
+        # Remove stats since it's not exposed in the API response, but
+        # preserve uptime for the v2.88+ tests, which expect it
+        for hyper_dict in DETAIL_HYPERS_DICTS:
+            if 'stats' in hyper_dict:
+                del hyper_dict['stats']
         DETAIL_HYPERS_DICTS[0].update({'state': 'up',
                                        'status': 'enabled',
                                        'service': dict(id=1, host='compute1',
@@ -850,7 +856,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
             'free_ram_mb': 5120,
             'host_ip': netaddr.IPAddress('2.2.2.2'),
             'hypervisor_hostname': 'hyper2',
-            'hypervisor_type': 'xen',
+            'hypervisor_type': 'qemu',
             'hypervisor_version': 3,
             'id': 2,
             'local_gb': 250,
@@ -904,7 +910,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
             'free_ram_mb': 5120,
             'host_ip': netaddr.IPAddress('2.2.2.2'),
             'hypervisor_hostname': 'hyper2',
-            'hypervisor_type': 'xen',
+            'hypervisor_type': 'qemu',
             'hypervisor_version': 3,
             'id': 2,
             'local_gb': 250,
@@ -951,7 +957,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
             'free_ram_mb': 5120,
             'host_ip': netaddr.IPAddress('2.2.2.2'),
             'hypervisor_hostname': 'hyper2',
-            'hypervisor_type': 'xen',
+            'hypervisor_type': 'qemu',
             'hypervisor_version': 3,
             'id': 2,
             'local_gb': 250,
@@ -1448,6 +1454,21 @@ class HypervisorsTestV288(HypervisorsTestV275):
         # cpu_info is no longer included in the response, so skip this test
         pass
 
+    def test_show_with_uptime_provided_by_compute_node(self):
+        req = self._get_request(use_admin_context=True)
+        result = self.controller.show(req, self.TEST_HYPERS_OBJ[1].uuid)
+        expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[1])
+        self.assertEqual({'hypervisor': expected_dict}, result)
+        self.controller.host_api.get_host_uptime.assert_not_called()
+
+    def test_detail_list_uptime(self):
+        _ = self._test_servers_with_no_servers(self.controller.detail)
+        # compute2 is simulated as upgraded to store the uptime in its
+        # stats, so we expect a single RPC call to fetch the result, and
+        # only for compute1
+        self.controller.host_api.get_host_uptime.assert_called_with(
+            mock.ANY, "compute1")
+
     def test_uptime(self):
         req = self._get_request(True)
         self.assertRaises(
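The interplay between the setdefault() sentinel above and the
`"uptime" in hypervisor.stats` check in the controller is subtle, so here is
a minimal runnable sketch using a plain dict in place of
nova.compute.stats.Stats (the dict contents are invented for illustration):

    # Upgraded compute whose driver reports no uptime: the sentinel keeps
    # the key present with value None, so the API skips the RPC fallback.
    upgraded = {'failed_builds': 0}
    upgraded.setdefault('uptime', None)
    assert 'uptime' in upgraded and upgraded['uptime'] is None

    # Upgraded libvirt/z/VM compute: setdefault() never clobbers a real value.
    reported = {'uptime': 'IPL at 11/14/17 10:47:44 EST'}
    reported.setdefault('uptime', None)
    assert reported['uptime'] is not None

    # Pre-upgrade compute: no 'uptime' key at all, so the controller falls
    # back to the per-host get_host_uptime() RPC call.
    legacy = {'failed_builds': 0}
    assert 'uptime' not in legacy
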
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index ec5fe68b6450..9110d34b88ad 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -22996,18 +22996,18 @@ class HostStateTestCase(test.NoDBTestCase):
         drvr = HostStateTestCase.FakeConnection()
 
-        stats = drvr.get_available_resource("compute1")
-        self.assertEqual(stats["vcpus"], 1)
-        self.assertEqual(stats["memory_mb"], 497)
-        self.assertEqual(stats["local_gb"], 100)
-        self.assertEqual(stats["vcpus_used"], 0)
-        self.assertEqual(stats["memory_mb_used"], 88)
-        self.assertEqual(stats["local_gb_used"], 20)
-        self.assertEqual(stats["hypervisor_type"], 'QEMU')
-        self.assertEqual(stats["hypervisor_version"],
+        res = drvr.get_available_resource("compute1")
+        self.assertEqual(res["vcpus"], 1)
+        self.assertEqual(res["memory_mb"], 497)
+        self.assertEqual(res["local_gb"], 100)
+        self.assertEqual(res["vcpus_used"], 0)
+        self.assertEqual(res["memory_mb_used"], 88)
+        self.assertEqual(res["local_gb_used"], 20)
+        self.assertEqual(res["hypervisor_type"], 'QEMU')
+        self.assertEqual(res["hypervisor_version"],
                          fakelibvirt.FAKE_QEMU_VERSION)
-        self.assertEqual(stats["hypervisor_hostname"], 'compute1')
-        cpu_info = jsonutils.loads(stats["cpu_info"])
+        self.assertEqual(res["hypervisor_hostname"], 'compute1')
+        cpu_info = jsonutils.loads(res["cpu_info"])
         self.assertEqual(cpu_info,
             {"vendor": "Intel",
              "model": "pentium",
              "arch": fields.Architecture.I686,
@@ -23017,12 +23017,13 @@ class HostStateTestCase(test.NoDBTestCase):
             "topology": {"cores": "1", "threads": "1", "sockets": "1"},
             "maxphysaddr": {"mode": "emulate", "bits": "42"}
             })
-        self.assertEqual(stats["disk_available_least"], 80)
-        self.assertEqual(jsonutils.loads(stats["pci_passthrough_devices"]),
+        self.assertEqual(res["disk_available_least"], 80)
+        self.assertEqual(jsonutils.loads(res["pci_passthrough_devices"]),
                          HostStateTestCase.pci_devices)
         self.assertEqual(objects.NUMATopology.obj_from_db_obj(
-            stats['numa_topology']),
+            res['numa_topology']),
             HostStateTestCase.numa_topology)
+        self.assertEqual(res['stats']['uptime'], drvr.get_host_uptime())
 
 
 class TestUpdateProviderTree(test.NoDBTestCase):
diff --git a/nova/tests/unit/virt/zvm/test_driver.py b/nova/tests/unit/virt/zvm/test_driver.py
index a5a129331d93..66088e455ab3 100644
--- a/nova/tests/unit/virt/zvm/test_driver.py
+++ b/nova/tests/unit/virt/zvm/test_driver.py
@@ -128,8 +128,11 @@ class TestZVMDriver(test.NoDBTestCase):
         self.assertRaises(exception.ZVMDriverException, zvmdriver.ZVMDriver,
                           'virtapi')
 
+    @mock.patch(
+        'nova.virt.zvm.driver.ZVMDriver.get_host_uptime',
+        return_value='IPL at 11/14/17 10:47:44 EST')
     @mock.patch('nova.virt.zvm.utils.ConnectorClient.call')
-    def test_get_available_resource_err_case(self, call):
+    def test_get_available_resource_err_case(self, call, uptime_mock):
         res = {'overallRC': 1, 'errmsg': 'err', 'rc': 0, 'rs': 0}
         call.side_effect = exception.ZVMConnectorError(results=res)
         results = self._driver.get_available_resource()
@@ -138,6 +141,8 @@ class TestZVMDriver(test.NoDBTestCase):
         self.assertEqual(0, results['disk_available_least'])
         self.assertEqual(0, results['hypervisor_version'])
         self.assertEqual('TESTHOST', results['hypervisor_hostname'])
+        self.assertEqual(uptime_mock.return_value, results['stats']['uptime'])
+        uptime_mock.assert_called_once()
 
     def test_driver_template_validation(self):
         self.flags(instance_name_template='abc%6d')
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index ca9c3168d771..8159afc49d62 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -10376,6 +10376,7 @@ class LibvirtDriver(driver.ComputeDriver):
         else:
             data['numa_topology'] = None
 
+        data['stats'] = {'uptime': self.get_host_uptime()}
         return data
 
     def check_instance_shared_storage_local(self, context, instance):
diff --git a/nova/virt/zvm/driver.py b/nova/virt/zvm/driver.py
index 4803c18ef84e..ada358026826 100644
--- a/nova/virt/zvm/driver.py
+++ b/nova/virt/zvm/driver.py
@@ -132,6 +132,7 @@ class ZVMDriver(driver.ComputeDriver):
                                     obj_fields.HVType.ZVM,
                                     obj_fields.VMMode.HVM)],
             'numa_topology': None,
+            'stats': {'uptime': self.get_host_uptime()},
         }
 
         LOG.debug("Getting available resource for %(host)s:%(nodename)s",
diff --git a/releasenotes/notes/bug-2122036-hypervisor-uptime-performance-optimization-6f3a2c8e5d9b1a4e.yaml b/releasenotes/notes/bug-2122036-hypervisor-uptime-performance-optimization-6f3a2c8e5d9b1a4e.yaml
new file mode 100644
index 000000000000..7d4fcfe5b57e
--- /dev/null
+++ b/releasenotes/notes/bug-2122036-hypervisor-uptime-performance-optimization-6f3a2c8e5d9b1a4e.yaml
@@ -0,0 +1,23 @@
+---
+fixes:
+  - |
+    Fixed a performance issue with the ``/os-hypervisors/detail`` API endpoint
+    when using microversion 2.88 or higher. The API made sequential RPC calls
+    to each compute node to gather uptime information, causing significant
+    delays in environments with many compute nodes (LP#2122036).
+
+    The fix optimizes uptime retrieval by:
+
+    * Adding uptime information to the periodic resource updates sent by
+      nova-compute to the database, eliminating the need for synchronous RPC
+      calls during API requests
+    * Only attempting RPC-based uptime retrieval for hypervisor types that
+      actually support it (libvirt and z/VM), avoiding unnecessary calls to
+      drivers that would always raise ``NotImplementedError``
+    * Preferring cached uptime data from the database over RPC calls when
+      available; the cache refreshes at the cadence set by
+      ``[DEFAULT]update_resources_interval``, the same interval at which the
+      other hypervisor stats are updated
+
+    This change significantly reduces response times for the hypervisor detail
+    API in large deployments while maintaining backward compatibility.
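
To exercise the cached field end to end, one can request hypervisor details
at microversion 2.88; below is a hedged sketch using the requests library
(the endpoint URL and token are placeholders, not values from this change):

    import requests

    NOVA = "https://nova.example.com/v2.1"  # placeholder endpoint
    resp = requests.get(
        NOVA + "/os-hypervisors/detail",
        headers={
            "X-Auth-Token": "<keystone token>",  # placeholder
            # opt in to the microversion that exposes 'uptime'
            "X-OpenStack-Nova-API-Version": "2.88",
        },
    )
    for hyp in resp.json()["hypervisors"]:
        # a cached string for libvirt/z/VM hosts, None (null) otherwise
        print(hyp["hypervisor_hostname"], hyp["uptime"])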