From 8a55f091925fd5e6742fb92783c524450843f5a0 Mon Sep 17 00:00:00 2001
From: zhouhenglc <zhouhenglc@inspur.com>
Date: Thu, 20 Jan 2022 15:09:32 +0800
Subject: [PATCH] [ovn]Refusing to bind port to dead agent

Closes-bug: #1958501

Change-Id: Ia84410675d28002afc74368349c9b54f048f4f4d
---
 .../ml2/drivers/ovn/agent/neutron_agent.py    |   9 ++
 .../drivers/ovn/mech_driver/mech_driver.py    |  37 +++---
 .../ml2/drivers/ovn/mech_driver/ovsdb/api.py  |  13 ---
 .../ovn/mech_driver/ovsdb/impl_idl_ovn.py     |  11 --
 .../ovn/mech_driver/ovsdb/test_impl_idl.py    |   7 --
 neutron/tests/unit/fake_resources.py          |   5 +-
 .../ovn/mech_driver/test_mech_driver.py       | 106 +++++++++++-------
 ...fuse-bind-dead-agent-2310f9f64c2a99de.yaml |   4 +
 8 files changed, 98 insertions(+), 94 deletions(-)
 create mode 100644 releasenotes/notes/ovn-refuse-bind-dead-agent-2310f9f64c2a99de.yaml

diff --git a/neutron/plugins/ml2/drivers/ovn/agent/neutron_agent.py b/neutron/plugins/ml2/drivers/ovn/agent/neutron_agent.py
index 432e00420c5..6e3b6bfc07f 100644
--- a/neutron/plugins/ml2/drivers/ovn/agent/neutron_agent.py
+++ b/neutron/plugins/ml2/drivers/ovn/agent/neutron_agent.py
@@ -254,3 +254,12 @@ class AgentCache:
                      for cls in NeutronAgent.types.values()}
         # Return the cached agents of agent_ids whose keys are in the cache
         return (self.agents[id_] for id_ in agent_ids & self.agents.keys())
+
+    def get_agents(self, filters=None):
+        filters = filters or {}
+        agent_list = []
+        for agent in self.agents.values():
+            agent_dict = agent.as_dict()
+            if all(agent_dict[k] in v for k, v in filters.items()):
+                agent_list.append(agent)
+        return agent_list
diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py
index 54c5261881e..2db013a9736 100644
--- a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py
+++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py
@@ -959,8 +959,7 @@ class OVNMechanismDriver(api.MechanismDriver):
 
         # OVN chassis information is needed to ensure a valid port bind.
         # Collect port binding data and refuse binding if the OVN chassis
-        # cannot be found.
-        chassis_physnets = []
+        # cannot be found or is dead.
         try:
             # The PortContext host property contains special handling that
             # we need to take into account, thus passing both the port Dict
@@ -969,14 +968,6 @@ class OVNMechanismDriver(api.MechanismDriver):
             bind_host = self._ovn_client.determine_bind_host(
                 port,
                 port_context=context)
-            datapath_type, iface_types, chassis_physnets = (
-                self.sb_ovn.get_chassis_data_for_ml2_bind_port(bind_host))
-            iface_types = iface_types.split(',') if iface_types else []
-        except RuntimeError:
-            LOG.debug('Refusing to bind port %(port_id)s due to '
-                      'no OVN chassis for host: %(host)s',
-                      {'port_id': port['id'], 'host': bind_host})
-            return
         except n_exc.InvalidInput as e:
             # The port binding profile is validated both on port creation and
             # update.  The new rules apply to a VNIC type previously not
@@ -985,7 +976,23 @@ class OVNMechanismDriver(api.MechanismDriver):
             LOG.error('Validation of binding profile unexpectedly failed '
                       'while attempting to bind port %s', port['id'])
             raise e
-
+        agents = n_agent.AgentCache().get_agents({'host': bind_host})
+        if not agents:
+            LOG.warning('Refusing to bind port %(port_id)s due to '
+                        'no OVN chassis for host: %(host)s',
+                      {'port_id': port['id'], 'host': bind_host})
+            return
+        agent = agents[0]
+        if not agent.alive:
+            LOG.warning("Refusing to bind port %(pid)s to dead agent:  "
+                        "%(agent)s", {'pid': context.current['id'],
+                                      'agent': agent})
+            return
+        chassis = agent.chassis
+        datapath_type = chassis.external_ids.get('datapath-type', '')
+        iface_types = chassis.external_ids.get('iface-types', '')
+        iface_types = iface_types.split(',') if iface_types else []
+        chassis_physnets = self.sb_ovn._get_chassis_physnets(chassis)
         for segment_to_bind in context.segments_to_bind:
             network_type = segment_to_bind['network_type']
             segmentation_id = segment_to_bind['segmentation_id']
@@ -1296,12 +1303,8 @@ class OVNMechanismDriver(api.MechanismDriver):
 def get_agents(self, context, filters=None, fields=None, _driver=None):
     _driver.ping_all_chassis()
     filters = filters or {}
-    agent_list = []
-    for agent in n_agent.AgentCache():
-        agent_dict = agent.as_dict()
-        if all(agent_dict[k] in v for k, v in filters.items()):
-            agent_list.append(agent_dict)
-    return agent_list
+    agent_list = n_agent.AgentCache().get_agents(filters)
+    return [agent.as_dict() for agent in agent_list]
 
 
 def get_agent(self, context, id, fields=None, _driver=None):
diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/api.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/api.py
index a77d5baf411..c172b9e4d22 100644
--- a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/api.py
+++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/api.py
@@ -658,16 +658,3 @@ class SbAPI(api.API, metaclass=abc.ABCMeta):
         :param chassis_type:    The type of chassis
         :type chassis_type:     string
         """
-
-    @abc.abstractmethod
-    def get_chassis_data_for_ml2_bind_port(self, hostname):
-        """Return chassis data for ML2 port binding.
-
-        @param hostname:       The hostname of the chassis
-        @type hostname:        string
-        :returns:              Tuple containing the chassis datapath type,
-                               iface types and physical networks for the
-                               OVN bridge mappings.
-        :raises:               RuntimeError exception if an OVN chassis
-                               does not exist.
-        """
diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/impl_idl_ovn.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/impl_idl_ovn.py
index 835635c7647..c24aa2acef6 100644
--- a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/impl_idl_ovn.py
+++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/impl_idl_ovn.py
@@ -895,17 +895,6 @@ class OvsdbSbOvnIdl(sb_impl_idl.OvnSbApiIdlImpl, Backend):
                      card_serial_number)
         raise RuntimeError(msg)
 
-    def get_chassis_data_for_ml2_bind_port(self, hostname):
-        try:
-            cmd = self.db_find_rows('Chassis', ('hostname', '=', hostname))
-            chassis = next(c for c in cmd.execute(check_error=True))
-        except StopIteration:
-            msg = _('Chassis with hostname %s does not exist') % hostname
-            raise RuntimeError(msg)
-        return (chassis.external_ids.get('datapath-type', ''),
-                chassis.external_ids.get('iface-types', ''),
-                self._get_chassis_physnets(chassis))
-
     def get_metadata_port_network(self, network):
         # TODO(twilson) This function should really just take a Row/RowView
         try:
diff --git a/neutron/tests/functional/plugins/ml2/drivers/ovn/mech_driver/ovsdb/test_impl_idl.py b/neutron/tests/functional/plugins/ml2/drivers/ovn/mech_driver/ovsdb/test_impl_idl.py
index 17b5fef250a..cc5718713b5 100644
--- a/neutron/tests/functional/plugins/ml2/drivers/ovn/mech_driver/ovsdb/test_impl_idl.py
+++ b/neutron/tests/functional/plugins/ml2/drivers/ovn/mech_driver/ovsdb/test_impl_idl.py
@@ -83,13 +83,6 @@ class TestSbApi(BaseOvnIdlTest):
         our_chassis = {c['name'] for c in self.data['chassis']}
         self.assertLessEqual(our_chassis, chassis_list)
 
-    def test_get_chassis_data_for_ml2_bind_port(self):
-        host = self.data['chassis'][0]['hostname']
-        dp, iface, phys = self.api.get_chassis_data_for_ml2_bind_port(host)
-        self.assertEqual('', dp)
-        self.assertEqual('', iface)
-        self.assertCountEqual(phys, ['private', 'public'])
-
     def test_chassis_exists(self):
         self.assertTrue(self.api.chassis_exists(
             self.data['chassis'][0]['hostname']))
diff --git a/neutron/tests/unit/fake_resources.py b/neutron/tests/unit/fake_resources.py
index c46b30a32b7..0b379d2014a 100644
--- a/neutron/tests/unit/fake_resources.py
+++ b/neutron/tests/unit/fake_resources.py
@@ -175,9 +175,8 @@ class FakeOvsdbSbOvnIdl(object):
         self.get_chassis_and_azs = mock.Mock()
         self.get_chassis_and_azs.return_value = {}
         self.get_all_chassis = mock.Mock()
-        self.get_chassis_data_for_ml2_bind_port = mock.Mock()
-        self.get_chassis_data_for_ml2_bind_port.return_value = \
-            ('fake', '', ['fake-physnet'])
+        self._get_chassis_physnets = mock.Mock()
+        self._get_chassis_physnets.return_value = ['fake-physnet']
         self.get_chassis_and_physnets = mock.Mock()
         self.get_gateway_chassis_from_cms_options = mock.Mock()
         self.is_col_present = mock.Mock()
diff --git a/neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/test_mech_driver.py b/neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/test_mech_driver.py
index 112d5b65057..45235f74f71 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/test_mech_driver.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/test_mech_driver.py
@@ -83,6 +83,52 @@ class MechDriverSetupBase:
         self.mech_driver.sb_ovn = fakes.FakeOvsdbSbOvnIdl()
         self.mech_driver._post_fork_event.set()
         self.mech_driver._ovn_client._qos_driver = mock.Mock()
+        neutron_agent.AgentCache(self.mech_driver)
+        # Because AgentCache is a singleton and we get a new mech_driver each
+        # setUp(), override the AgentCache driver.
+        neutron_agent.AgentCache().driver = self.mech_driver
+        agent1 = self._add_agent('agent1')
+        neutron_agent.AgentCache().get_agents = mock.Mock()
+        neutron_agent.AgentCache().get_agents.return_value = [agent1]
+
+    def _add_chassis(self, nb_cfg, name=None):
+        chassis_private = mock.Mock()
+        chassis_private.nb_cfg = nb_cfg
+        chassis_private.uuid = uuid.uuid4()
+        chassis_private.name = name if name else str(uuid.uuid4())
+        return chassis_private
+
+    def _add_chassis_agent(self, nb_cfg, agent_type, chassis_private=None,
+                           updated_at=None):
+        chassis_private = chassis_private or self._add_chassis(nb_cfg)
+        if hasattr(chassis_private, 'nb_cfg_timestamp') and isinstance(
+                chassis_private.nb_cfg_timestamp, mock.Mock):
+            del chassis_private.nb_cfg_timestamp
+        chassis_private.external_ids = {}
+        if updated_at:
+            chassis_private.external_ids = {
+                ovn_const.OVN_LIVENESS_CHECK_EXT_ID_KEY:
+                    datetime.datetime.isoformat(updated_at)}
+        if agent_type == ovn_const.OVN_METADATA_AGENT:
+            chassis_private.external_ids.update({
+                ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY: nb_cfg,
+                ovn_const.OVN_AGENT_METADATA_ID_KEY: str(uuid.uuid4())})
+        chassis_private.chassis = [chassis_private]
+        return neutron_agent.AgentCache().update(agent_type, chassis_private,
+                                                 updated_at)
+
+    def _add_agent(self, name, alive=True):
+        nb_cfg = 5
+        now = timeutils.utcnow(with_timezone=True)
+        if not alive:
+            updated_at = now - datetime.timedelta(cfg.CONF.agent_down_time + 1)
+            self.mech_driver.nb_ovn.nb_global.nb_cfg = nb_cfg
+        else:
+            updated_at = now
+            self.mech_driver.nb_ovn.nb_global.nb_cfg = nb_cfg + 2
+        chassis = self._add_chassis(nb_cfg, name=name)
+        return self._add_chassis_agent(
+            nb_cfg, ovn_const.OVN_CONTROLLER_AGENT, chassis, updated_at)
 
 
 class TestOVNMechanismDriverBase(MechDriverSetupBase,
@@ -113,10 +159,6 @@ class TestOVNMechanismDriverBase(MechDriverSetupBase,
         cfg.CONF.set_override('ovsdb_connection_timeout', 30, group='ovn')
         mock.patch.object(impl_idl_ovn.Backend, 'schema_helper').start()
         super().setUp()
-        neutron_agent.AgentCache(self.mech_driver)
-        # Because AgentCache is a singleton and we get a new mech_driver each
-        # setUp(), override the AgentCache driver.
-        neutron_agent.AgentCache().driver = self.mech_driver
 
         self.nb_ovn = self.mech_driver.nb_ovn
         self.sb_ovn = self.mech_driver.sb_ovn
@@ -1189,7 +1231,7 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
             attrs={'binding:vnic_type': 'unknown'}).info()
         fake_port_context = fakes.FakePortContext(fake_port, 'host', [])
         self.mech_driver.bind_port(fake_port_context)
-        self.sb_ovn.get_chassis_data_for_ml2_bind_port.assert_not_called()
+        neutron_agent.AgentCache().get_agents.assert_not_called()
         fake_port_context.set_binding.assert_not_called()
 
     def _test_bind_port_failed(self, fake_segments):
@@ -1198,13 +1240,12 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
         fake_port_context = fakes.FakePortContext(
             fake_port, fake_host, fake_segments)
         self.mech_driver.bind_port(fake_port_context)
-        self.sb_ovn.get_chassis_data_for_ml2_bind_port.assert_called_once_with(
-            fake_host)
+        neutron_agent.AgentCache().get_agents.assert_called_once_with(
+            {'host': fake_host})
         fake_port_context.set_binding.assert_not_called()
 
     def test_bind_port_host_not_found(self):
-        self.sb_ovn.get_chassis_data_for_ml2_bind_port.side_effect = \
-            RuntimeError
+        neutron_agent.AgentCache().get_agents.return_value = []
         self._test_bind_port_failed([])
 
     def test_bind_port_no_segments_to_bind(self):
@@ -1218,14 +1259,19 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
             [fakes.FakeSegment.create_one_segment(attrs=segment_attrs).info()]
         self._test_bind_port_failed(fake_segments)
 
+    def test_bind_port_host_not_alive(self):
+        agent = self._add_agent('agent_no_alive', False)
+        neutron_agent.AgentCache().get_agents.return_value = [agent]
+        self._test_bind_port_failed([])
+
     def _test_bind_port(self, fake_segments):
         fake_port = fakes.FakePort.create_one_port().info()
         fake_host = 'host'
         fake_port_context = fakes.FakePortContext(
             fake_port, fake_host, fake_segments)
         self.mech_driver.bind_port(fake_port_context)
-        self.sb_ovn.get_chassis_data_for_ml2_bind_port.assert_called_once_with(
-            fake_host)
+        neutron_agent.AgentCache().get_agents.assert_called_once_with(
+            {'host': fake_host})
         fake_port_context.set_binding.assert_called_once_with(
             fake_segments[0]['id'],
             portbindings.VIF_TYPE_OVS,
@@ -1241,8 +1287,8 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
         fake_port_context = fakes.FakePortContext(
             fake_port, fake_host, fake_segments)
         self.mech_driver.bind_port(fake_port_context)
-        self.sb_ovn.get_chassis_data_for_ml2_bind_port.assert_called_once_with(
-            fake_host)
+        neutron_agent.AgentCache().get_agents.assert_called_once_with(
+            {'host': fake_host})
         fake_port_context.set_binding.assert_called_once_with(
             fake_segments[0]['id'],
             portbindings.VIF_TYPE_OVS,
@@ -1271,8 +1317,8 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
         fake_port_context = fakes.FakePortContext(
             fake_port, fake_host, fake_segments)
         self.mech_driver.bind_port(fake_port_context)
-        self.sb_ovn.get_chassis_data_for_ml2_bind_port.assert_called_once_with(
-            fake_smartnic_dpu)
+        neutron_agent.AgentCache().get_agents.assert_called_once_with(
+            {'host': fake_smartnic_dpu})
         fake_port_context.set_binding.assert_called_once_with(
             fake_segments[0]['id'],
             portbindings.VIF_TYPE_OVS,
@@ -1292,8 +1338,8 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
         fake_port_context = fakes.FakePortContext(
             fake_port, fake_host, fake_segments)
         self.mech_driver.bind_port(fake_port_context)
-        self.sb_ovn.get_chassis_data_for_ml2_bind_port.assert_called_once_with(
-            fake_host)
+        neutron_agent.AgentCache().get_agents.assert_called_once_with(
+            {'host': fake_host})
         fake_port_context.set_binding.assert_called_once_with(
             fake_segments[0]['id'],
             portbindings.VIF_TYPE_OVS,
@@ -2072,32 +2118,6 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase):
         self.assertEqual(1, mock_update_port.call_count)
         mock_notify_dhcp.assert_called_with(fake_port['id'])
 
-    def _add_chassis(self, nb_cfg):
-        chassis_private = mock.Mock()
-        chassis_private.nb_cfg = nb_cfg
-        chassis_private.uuid = uuid.uuid4()
-        chassis_private.name = str(uuid.uuid4())
-        return chassis_private
-
-    def _add_chassis_agent(self, nb_cfg, agent_type, chassis_private=None,
-                           updated_at=None):
-        chassis_private = chassis_private or self._add_chassis(nb_cfg)
-        if hasattr(chassis_private, 'nb_cfg_timestamp') and isinstance(
-                chassis_private.nb_cfg_timestamp, mock.Mock):
-            del chassis_private.nb_cfg_timestamp
-        chassis_private.external_ids = {}
-        if updated_at:
-            chassis_private.external_ids[
-                ovn_const.OVN_LIVENESS_CHECK_EXT_ID_KEY] = \
-                datetime.datetime.isoformat(updated_at)
-        if agent_type == ovn_const.OVN_METADATA_AGENT:
-            chassis_private.external_ids.update({
-                ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY: nb_cfg,
-                ovn_const.OVN_AGENT_METADATA_ID_KEY: str(uuid.uuid4())})
-        chassis_private.chassis = [chassis_private]
-        return neutron_agent.AgentCache().update(agent_type, chassis_private,
-                                                 updated_at)
-
     def test_agent_alive_true(self):
         chassis_private = self._add_chassis(5)
         for agent_type in (ovn_const.OVN_CONTROLLER_AGENT,
diff --git a/releasenotes/notes/ovn-refuse-bind-dead-agent-2310f9f64c2a99de.yaml b/releasenotes/notes/ovn-refuse-bind-dead-agent-2310f9f64c2a99de.yaml
new file mode 100644
index 00000000000..4bb77a61e18
--- /dev/null
+++ b/releasenotes/notes/ovn-refuse-bind-dead-agent-2310f9f64c2a99de.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    OVN mechanism driver refuses to bind a port to a dead agent.
\ No newline at end of file