From 391aa4677f394f1581df17fe74da968f19981e9d Mon Sep 17 00:00:00 2001
From: Alex-Welsh
Date: Fri, 28 Oct 2022 14:41:20 +0100
Subject: [PATCH] Adding optional delay between l3 agent restarts

This change serialises the neutron l3 agent restart process and adds
a user configurable delay between restarts. This can prevent
connectivity loss due to all agents being restarted at the same time.
Routers increase the recovery time, making this issue more prevalent.

Change-Id: I3be0ebfa12965e6ae32d1b5f13f8fd23c3f52b8c
---
 ansible/roles/neutron/defaults/main.yml       |  4 ++
 ansible/roles/neutron/handlers/main.yml       | 53 ++++++++++++++++++-
 doc/source/reference/networking/neutron.rst   | 30 +++++++++++
 ...t-restart-with-delay-7c2ec5875dbb760e.yaml | 11 ++++
 4 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 releasenotes/notes/serial-l3-agent-restart-with-delay-7c2ec5875dbb760e.yaml

diff --git a/ansible/roles/neutron/defaults/main.yml b/ansible/roles/neutron/defaults/main.yml
index 6960112025..b819d8d74d 100644
--- a/ansible/roles/neutron/defaults/main.yml
+++ b/ansible/roles/neutron/defaults/main.yml
@@ -567,6 +567,10 @@ neutron_tls_proxy_extra_volumes: "{{ neutron_extra_volumes }}"
 dhcp_agents_per_network: 2
 max_l3_agents_per_router: 3
 
+# Adds a delay (in seconds) to the serial neutron_l3_agent container restart
+# process, allowing routers to fail over without loss of connectivity.
+neutron_l3_agent_failover_delay: 0
+
 ovsdb_timeout: 10
 
 neutron_logging_debug: "{{ openstack_logging_debug }}"
diff --git a/ansible/roles/neutron/handlers/main.yml b/ansible/roles/neutron/handlers/main.yml
index 20981506a2..5a3609eb16 100644
--- a/ansible/roles/neutron/handlers/main.yml
+++ b/ansible/roles/neutron/handlers/main.yml
@@ -87,7 +87,23 @@
   when:
     - kolla_action != "config"
 
-- name: Restart neutron-l3-agent container
+- name: Get container facts
+  become: true
+  kolla_container_facts:
+    container_engine: "{{ kolla_container_engine }}"
+    name:
+      - neutron_l3_agent
+  register: container_facts
+  when:
+    - kolla_action != "config"
+  listen: Restart neutron-l3-agent container
+
+- name: Group hosts
+  group_by:
+    key: neutron_l3_agent_running_{{ container_facts['neutron_l3_agent'] is defined }}
+  listen: Restart neutron-l3-agent container
+
+- name: Start stopped neutron-l3-agent container
   vars:
     service_name: "neutron-l3-agent"
     service: "{{ neutron_services[service_name] }}"
@@ -104,6 +120,41 @@
     healthcheck: "{{ service.healthcheck | default(omit) }}"
   when:
     - kolla_action != "config"
+    - groups['neutron_l3_agent_running_False'] is defined
+    - inventory_hostname in groups['neutron_l3_agent_running_False']
+  listen: Restart neutron-l3-agent container
+
+- name: Wait if container starting
+  wait_for:
+    timeout: "{{ neutron_l3_agent_failover_delay }}"
+  when:
+    - groups['neutron_l3_agent_running_False'] is defined
+    - groups['neutron_l3_agent_running_True'] is defined
+    - kolla_action != "config"
+  listen: Restart neutron-l3-agent container
+
+- name: Restart running neutron-l3-agent container
+  vars:
+    service_name: "neutron-l3-agent"
+    service: "{{ neutron_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "recreate_or_restart_container"
+    common_options: "{{ docker_common_options }}"
+    name: "{{ service.container_name }}"
+    image: "{{ service.image }}"
+    environment: "{{ service.environment }}"
+    volumes: "{{ service.volumes | reject('equalto', '') | list }}"
+    dimensions: "{{ service.dimensions }}"
+    privileged: "{{ service.privileged | default(False) }}"
+    healthcheck: "{{ service.healthcheck | default(omit) }}"
+  when:
+    - kolla_action != "config"
+    - inventory_hostname == item
+  loop: "{{ groups['neutron_l3_agent_running_True'] | default([]) }}"
+  loop_control:
+    pause: "{{ neutron_l3_agent_failover_delay }}"
+  listen: Restart neutron-l3-agent container
 
 - name: Restart neutron-sriov-agent container
   vars:
diff --git a/doc/source/reference/networking/neutron.rst b/doc/source/reference/networking/neutron.rst
index eedbf786b6..cac383bf92 100644
--- a/doc/source/reference/networking/neutron.rst
+++ b/doc/source/reference/networking/neutron.rst
@@ -109,6 +109,36 @@ to using the native OVS firewall driver by employing a configuration override
    [securitygroup]
    firewall_driver = openvswitch
 
+L3 agent high availability
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+L3 and DHCP agents can be created in a high availability (HA) state with:
+
+.. code-block:: yaml
+
+   enable_neutron_agent_ha: "yes"
+
+This allows networking to fail over across controllers if the active agent is
+stopped. If this option is enabled, it can be advantageous to also set:
+
+.. code-block:: yaml
+
+   neutron_l3_agent_failover_delay:
+
+Agents sometimes need to be restarted. This delay (in seconds) is invoked
+between the restart operations of each agent. When set properly, it will stop
+network outages caused by all agents restarting at the same time. The exact
+length of time it takes to restart is dependent on hardware and the number of
+routers present. A general rule of thumb is to set the value to ``40 + 3n``
+where ``n`` is the number of routers. For example, with 5 routers,
+``40 + (3 * 5) = 65`` so the value could be set to 65. A much better approach
+however would be to first time how long an outage lasts, then set the value
+accordingly.
+
+The default value is 0. A nonzero starting value would only result in
+outages if the failover time was greater than the delay, which would be more
+difficult to diagnose than consistent behaviour.
+
 OVN (ml2/ovn)
 ~~~~~~~~~~~~~
 
diff --git a/releasenotes/notes/serial-l3-agent-restart-with-delay-7c2ec5875dbb760e.yaml b/releasenotes/notes/serial-l3-agent-restart-with-delay-7c2ec5875dbb760e.yaml
new file mode 100644
index 0000000000..e5f796337f
--- /dev/null
+++ b/releasenotes/notes/serial-l3-agent-restart-with-delay-7c2ec5875dbb760e.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    In HA mode, parallel restart of neutron-l3-agent containers will cause
+    a network outage. Adding routers increases the recovery time. This
+    release makes restarts serial and adds a user-configurable delay to ensure
+    each agent is returned to operation before the next one is restarted.
+
+    The default value is 0. A nonzero starting value would only result in
+    outages if the failover time was greater than the delay, which would be
+    more difficult to diagnose than consistent behaviour.