diff --git a/ansible/roles/neutron/defaults/main.yml b/ansible/roles/neutron/defaults/main.yml
index 09f6f91bf7..a7d71395b8 100644
--- a/ansible/roles/neutron/defaults/main.yml
+++ b/ansible/roles/neutron/defaults/main.yml
@@ -594,6 +594,10 @@ neutron_tls_proxy_extra_volumes: "{{ neutron_extra_volumes }}"
 dhcp_agents_per_network: 2
 max_l3_agents_per_router: 3
 
+# Adds a delay (in seconds) to the serial neutron_l3_agent container restart
+# process, allowing routers to fail over without loss of connectivity.
+neutron_l3_agent_failover_delay: 0
+
 ovsdb_timeout: 10
 
 neutron_logging_debug: "{{ openstack_logging_debug }}"
diff --git a/ansible/roles/neutron/handlers/main.yml b/ansible/roles/neutron/handlers/main.yml
index 20981506a2..5a3609eb16 100644
--- a/ansible/roles/neutron/handlers/main.yml
+++ b/ansible/roles/neutron/handlers/main.yml
@@ -87,7 +87,29 @@
   when:
     - kolla_action != "config"
 
-- name: Restart neutron-l3-agent container
+# All handlers below listen on "Restart neutron-l3-agent container".
+# Hosts where kolla_container_facts reports no neutron_l3_agent container
+# start it right away; the remaining hosts restart it one at a time,
+# neutron_l3_agent_failover_delay seconds apart.
+- name: Get container facts
+  become: true
+  kolla_container_facts:
+    container_engine: "{{ kolla_container_engine }}"
+    name:
+      - neutron_l3_agent
+  register: container_facts
+  when:
+    - kolla_action != "config"
+  listen: Restart neutron-l3-agent container
+
+- name: Group hosts
+  # Creates the dynamic groups neutron_l3_agent_running_True and/or
+  # neutron_l3_agent_running_False that the following handlers test.
+  group_by:
+    key: "neutron_l3_agent_running_{{ container_facts['neutron_l3_agent'] is defined }}"
+  listen: Restart neutron-l3-agent container
+
+- name: Start stopped neutron-l3-agent container
   vars:
     service_name: "neutron-l3-agent"
     service: "{{ neutron_services[service_name] }}"
@@ -104,6 +126,45 @@
     healthcheck: "{{ service.healthcheck | default(omit) }}"
   when:
     - kolla_action != "config"
+    - groups['neutron_l3_agent_running_False'] is defined
+    - inventory_hostname in groups['neutron_l3_agent_running_False']
+  listen: Restart neutron-l3-agent container
+
+# Runs only when both groups exist, i.e. some agents were just started and
+# others still have to be restarted.
+- name: Wait if container starting
+  wait_for:
+    timeout: "{{ neutron_l3_agent_failover_delay }}"
+  when:
+    - groups['neutron_l3_agent_running_False'] is defined
+    - groups['neutron_l3_agent_running_True'] is defined
+    - kolla_action != "config"
+  listen: Restart neutron-l3-agent container
+
+- name: Restart running neutron-l3-agent container
+  vars:
+    service_name: "neutron-l3-agent"
+    service: "{{ neutron_services[service_name] }}"
+  become: true
+  kolla_docker:
+    action: "recreate_or_restart_container"
+    common_options: "{{ docker_common_options }}"
+    name: "{{ service.container_name }}"
+    image: "{{ service.image }}"
+    environment: "{{ service.environment }}"
+    volumes: "{{ service.volumes | reject('equalto', '') | list }}"
+    dimensions: "{{ service.dimensions }}"
+    privileged: "{{ service.privileged | default(False) }}"
+    healthcheck: "{{ service.healthcheck | default(omit) }}"
+  # Every host walks the same list but acts only on its own entry, so the
+  # pause between iterations staggers the restarts across hosts.
+  when:
+    - kolla_action != "config"
+    - inventory_hostname == item
+  loop: "{{ groups['neutron_l3_agent_running_True'] | default([]) }}"
+  loop_control:
+    pause: "{{ neutron_l3_agent_failover_delay }}"
+  listen: Restart neutron-l3-agent container
 
 - name: Restart neutron-sriov-agent container
   vars:
diff --git a/doc/source/reference/networking/neutron.rst b/doc/source/reference/networking/neutron.rst
index eedbf786b6..cac383bf92 100644
--- a/doc/source/reference/networking/neutron.rst
+++ b/doc/source/reference/networking/neutron.rst
@@ -109,6 +109,36 @@ to using the native OVS firewall driver by employing a configuration override
    [securitygroup]
    firewall_driver = openvswitch
 
+L3 agent high availability
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+L3 and DHCP agents can be created in a high availability (HA) state with:
+
+.. code-block:: yaml
+
+   enable_neutron_agent_ha: "yes"
+
+This allows networking to fail over across controllers if the active agent is
+stopped. If this option is enabled, it can be advantageous to also set:
+
+.. code-block:: yaml
+
+   neutron_l3_agent_failover_delay: 65
+
+Agents sometimes need to be restarted. This delay (in seconds) is applied
+between the restart operations of each agent. When set properly, it prevents
+network outages caused by all agents restarting at the same time. The exact
+length of time it takes to restart is dependent on hardware and the number of
+routers present. A general rule of thumb is to set the value to ``40 + 3n``
+where ``n`` is the number of routers. For example, with 5 routers,
+``40 + (3 * 5) = 65`` so the value could be set to 65. A much better approach,
+however, is to first measure how long an outage lasts, then set the value
+accordingly.
+
+The default value is 0, meaning no delay. A nonzero default would result in
+outages only when the failover time exceeded the delay, which would be more
+difficult to diagnose than consistent behaviour.
+
 OVN (ml2/ovn)
 ~~~~~~~~~~~~~
 
diff --git a/releasenotes/notes/serial-l3-agent-restart-with-delay-7c2ec5875dbb760e.yaml b/releasenotes/notes/serial-l3-agent-restart-with-delay-7c2ec5875dbb760e.yaml
new file mode 100644
index 0000000000..e5f796337f
--- /dev/null
+++ b/releasenotes/notes/serial-l3-agent-restart-with-delay-7c2ec5875dbb760e.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    In HA mode, parallel restart of neutron-l3-agent containers will cause
+    a network outage. Adding routers increases the recovery time. This
+    release makes restarts serial and adds a user-configurable delay to ensure
+    each agent is returned to operation before the next one is restarted.
+
+    The default value is 0, meaning no delay. A nonzero default would result
+    in outages only when the failover time exceeded the delay, which would be
+    more difficult to diagnose than consistent behaviour.