From 5e0c102830a18850e35f746160867613e96d1dbc Mon Sep 17 00:00:00 2001 From: Sean Mooney Date: Wed, 31 May 2023 13:23:32 +0100 Subject: [PATCH] Send ovn heartbeat more often. This change modifies the metadata agent heartbeat to use a random offset with a max delay of 10 seconds. The original reason for the current logic was to mitigate https://bugs.launchpad.net/neutron/+bug/1991817 so the logic to spread the heartbeats is maintained but we now set an upper bound on the delay. Closes-Bug: #2020215 Change-Id: I4d382793255520b9c44ca2aaacebcbda9a432dde --- neutron/agent/ovn/metadata/agent.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/neutron/agent/ovn/metadata/agent.py b/neutron/agent/ovn/metadata/agent.py index 0cd9ecb8681..e3b209d0e7e 100644 --- a/neutron/agent/ovn/metadata/agent.py +++ b/neutron/agent/ovn/metadata/agent.py @@ -234,14 +234,21 @@ class SbGlobalUpdateEvent(row_event.RowEvent): ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY: str(row.nb_cfg)})).execute() + delay = 0 if self.first_run: - interval = 0 self.first_run = False else: - interval = randint(0, cfg.CONF.agent_down_time // 2) + # We occasionally see port binding failed errors due to + # the ml2 driver refusing to bind the port to a dead agent. + # if all agents heartbeat at the same time, they will all + # cause a load spike on the server. To mitigate that we + # need to spread out the load by introducing a random delay. + # clamp the max delay between 3 and 10 seconds. + max_delay = max(min(cfg.CONF.agent_down_time // 3, 10), 3) + delay = randint(0, max_delay) - LOG.debug("Delaying updating chassis table for %s seconds", interval) - timer = threading.Timer(interval, _update_chassis, [self, row]) + LOG.debug("Delaying updating chassis table for %s seconds", delay) + timer = threading.Timer(delay, _update_chassis, [self, row]) timer.start()