Merge "Adding optional delay between l3 agent restarts"
This commit is contained in:
commit
99d1e3c710
@ -594,6 +594,10 @@ neutron_tls_proxy_extra_volumes: "{{ neutron_extra_volumes }}"
|
||||
dhcp_agents_per_network: 2
|
||||
max_l3_agents_per_router: 3
|
||||
|
||||
# Adds a delay (in seconds) to the serial neutron_l3_agent container restart
|
||||
# process, allowing routers to fail over without loss of connectivity.
|
||||
neutron_l3_agent_failover_delay: 0
|
||||
|
||||
ovsdb_timeout: 10
|
||||
|
||||
neutron_logging_debug: "{{ openstack_logging_debug }}"
|
||||
|
@ -87,7 +87,23 @@
|
||||
when:
|
||||
- kolla_action != "config"
|
||||
|
||||
- name: Restart neutron-l3-agent container
|
||||
- name: Get container facts
|
||||
become: true
|
||||
kolla_container_facts:
|
||||
container_engine: "{{ kolla_container_engine }}"
|
||||
name:
|
||||
- neutron_l3_agent
|
||||
register: container_facts
|
||||
when:
|
||||
- kolla_action != "config"
|
||||
listen: Restart neutron-l3-agent container
|
||||
|
||||
- name: Group hosts
|
||||
group_by:
|
||||
key: neutron_l3_agent_running_{{ container_facts['neutron_l3_agent'] is defined }}
|
||||
listen: Restart neutron-l3-agent container
|
||||
|
||||
- name: Start stopped neutron-l3-agent container
|
||||
vars:
|
||||
service_name: "neutron-l3-agent"
|
||||
service: "{{ neutron_services[service_name] }}"
|
||||
@ -104,6 +120,41 @@
|
||||
healthcheck: "{{ service.healthcheck | default(omit) }}"
|
||||
when:
|
||||
- kolla_action != "config"
|
||||
- groups['neutron_l3_agent_running_False'] is defined
|
||||
- inventory_hostname in groups['neutron_l3_agent_running_False']
|
||||
listen: Restart neutron-l3-agent container
|
||||
|
||||
- name: Wait if container starting
|
||||
wait_for:
|
||||
timeout: "{{ neutron_l3_agent_failover_delay }}"
|
||||
when:
|
||||
- groups['neutron_l3_agent_running_False'] is defined
|
||||
- groups['neutron_l3_agent_running_True'] is defined
|
||||
- kolla_action != "config"
|
||||
listen: Restart neutron-l3-agent container
|
||||
|
||||
- name: Restart running neutron-l3-agent container
|
||||
vars:
|
||||
service_name: "neutron-l3-agent"
|
||||
service: "{{ neutron_services[service_name] }}"
|
||||
become: true
|
||||
kolla_docker:
|
||||
action: "recreate_or_restart_container"
|
||||
common_options: "{{ docker_common_options }}"
|
||||
name: "{{ service.container_name }}"
|
||||
image: "{{ service.image }}"
|
||||
environment: "{{ service.environment }}"
|
||||
volumes: "{{ service.volumes | reject('equalto', '') | list }}"
|
||||
dimensions: "{{ service.dimensions }}"
|
||||
privileged: "{{ service.privileged | default(False) }}"
|
||||
healthcheck: "{{ service.healthcheck | default(omit) }}"
|
||||
when:
|
||||
- kolla_action != "config"
|
||||
- inventory_hostname == item
|
||||
loop: "{{ groups['neutron_l3_agent_running_True'] | default([]) }}"
|
||||
loop_control:
|
||||
pause: "{{ neutron_l3_agent_failover_delay }}"
|
||||
listen: Restart neutron-l3-agent container
|
||||
|
||||
- name: Restart neutron-sriov-agent container
|
||||
vars:
|
||||
|
@ -109,6 +109,36 @@ to using the native OVS firewall driver by employing a configuration override
|
||||
[securitygroup]
|
||||
firewall_driver = openvswitch
|
||||
|
||||
L3 agent high availability
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
L3 and DHCP agents can be created in a high availability (HA) state with:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
enable_neutron_agent_ha: "yes"
|
||||
|
||||
This allows networking to fail over across controllers if the active agent is
|
||||
stopped. If this option is enabled, it can be advantageous to also set:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
neutron_l3_agent_failover_delay: <delay in seconds>
|
||||
|
||||
Agents sometimes need to be restarted. This delay (in seconds) is applied
|
||||
between the restarts of successive agents. When set properly, it will stop
|
||||
network outages caused by all agents restarting at the same time. The exact
|
||||
length of time it takes to restart is dependent on hardware and the number of
|
||||
routers present. A general rule of thumb is to set the value to ``40 + 3n``
|
||||
where ``n`` is the number of routers. For example, with 5 routers,
|
||||
``40 + (3 * 5) = 65`` so the value could be set to 65. A much better approach
|
||||
however would be to first time how long an outage lasts, then set the value
|
||||
accordingly.
|
||||
|
||||
The default value is 0, meaning no delay. A nonzero default would still allow
|
||||
outages whenever the failover time exceeded the delay, and such intermittent
|
||||
outages would be more difficult to diagnose than consistent behaviour.
|
||||
|
||||
OVN (ml2/ovn)
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
|
@ -0,0 +1,11 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
In HA mode, parallel restart of neutron-l3-agent containers will cause
|
||||
a network outage. Adding routers increases the recovery time. This
|
||||
release makes restarts serial and adds a user-configurable delay to ensure
|
||||
each agent is returned to operation before the next one is restarted.
|
||||
|
||||
The default value is 0, meaning no delay. A nonzero default would still
|
||||
allow outages whenever the failover time exceeded the delay, and such
|
||||
intermittent outages would be more difficult to diagnose than consistent behaviour.
|
Loading…
Reference in New Issue
Block a user