Add support for Keepalived VRRP health check
Adds functionality to generate a bash script which verifies the health of the current keepalived instance by pinging all available and configured GW addresses. This functionality supports both IPv4 and IPv6 by detecting the needed ping version using netaddr. DocImpact: Added a new parameter to 'l3_agent.ini' named 'ha_vrrp_health_check_interval', which is set to 0 (disabled) by default. Values > 0 designate that the health check functionality should be enabled. Requires ICMP ECHO_REQUEST to be allowed, because it is disabled by default. Co-Authored-By: Artur Korzeniewski <artur.korzeniewski@intel.com> Change-Id: Ib4d0691f432830357ea3f113036719645bc59a62 Closes-Bug: #1365461
This commit is contained in:
parent
8d3f216e24
commit
185d6cbc64
@ -129,7 +129,10 @@ class HaRouter(router.RouterInfo):
|
||||
ha_port_cidrs,
|
||||
nopreempt=True,
|
||||
advert_int=self.agent_conf.ha_vrrp_advert_int,
|
||||
priority=self.ha_priority)
|
||||
priority=self.ha_priority,
|
||||
vrrp_health_check_interval=(
|
||||
self.agent_conf.ha_vrrp_health_check_interval),
|
||||
ha_conf_dir=self.keepalived_manager.get_conf_dir())
|
||||
instance.track_interfaces.append(interface_name)
|
||||
|
||||
if self.agent_conf.ha_vrrp_auth_password:
|
||||
|
@ -15,6 +15,7 @@
|
||||
import errno
|
||||
import itertools
|
||||
import os
|
||||
import six
|
||||
|
||||
import netaddr
|
||||
from neutron_lib import exceptions
|
||||
@ -35,6 +36,7 @@ KEEPALIVED_SERVICE_NAME = 'keepalived'
|
||||
KEEPALIVED_EMAIL_FROM = 'neutron@openstack.local'
|
||||
KEEPALIVED_ROUTER_ID = 'neutron'
|
||||
GARP_MASTER_DELAY = 60
|
||||
HEALTH_CHECK_NAME = 'ha_health_check'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
@ -160,7 +162,9 @@ class KeepalivedInstance(object):
|
||||
def __init__(self, state, interface, vrouter_id, ha_cidrs,
|
||||
priority=HA_DEFAULT_PRIORITY, advert_int=None,
|
||||
mcast_src_ip=None, nopreempt=False,
|
||||
garp_master_delay=GARP_MASTER_DELAY):
|
||||
garp_master_delay=GARP_MASTER_DELAY,
|
||||
vrrp_health_check_interval=0,
|
||||
ha_conf_dir=None):
|
||||
self.name = 'VR_%s' % vrouter_id
|
||||
|
||||
if state not in VALID_STATES:
|
||||
@ -178,12 +182,17 @@ class KeepalivedInstance(object):
|
||||
self.vips = []
|
||||
self.virtual_routes = KeepalivedInstanceRoutes()
|
||||
self.authentication = None
|
||||
self.track_script = None
|
||||
self.primary_vip_range = get_free_range(
|
||||
parent_range=constants.PRIVATE_CIDR_RANGE,
|
||||
excluded_ranges=[constants.METADATA_CIDR,
|
||||
constants.DVR_FIP_LL_CIDR] + ha_cidrs,
|
||||
size=PRIMARY_VIP_RANGE_SIZE)
|
||||
|
||||
if vrrp_health_check_interval > 0:
|
||||
self.track_script = KeepalivedTrackScript(
|
||||
vrrp_health_check_interval, ha_conf_dir, self.vrouter_id)
|
||||
|
||||
def set_authentication(self, auth_type, password):
|
||||
if auth_type not in VALID_AUTH_TYPES:
|
||||
raise InvalidAuthenticationTypeException(auth_type=auth_type)
|
||||
@ -267,12 +276,19 @@ class KeepalivedInstance(object):
|
||||
[' }'])
|
||||
|
||||
def build_config(self):
|
||||
config = ['vrrp_instance %s {' % self.name,
|
||||
' state %s' % self.state,
|
||||
' interface %s' % self.interface,
|
||||
' virtual_router_id %s' % self.vrouter_id,
|
||||
' priority %s' % self.priority,
|
||||
' garp_master_delay %s' % self.garp_master_delay]
|
||||
if self.track_script:
|
||||
config = self.track_script.build_config_preamble()
|
||||
self.track_script.routes = self.virtual_routes.gateway_routes
|
||||
self.track_script.vips = self.vips
|
||||
else:
|
||||
config = []
|
||||
|
||||
config.extend(['vrrp_instance %s {' % self.name,
|
||||
' state %s' % self.state,
|
||||
' interface %s' % self.interface,
|
||||
' virtual_router_id %s' % self.vrouter_id,
|
||||
' priority %s' % self.priority,
|
||||
' garp_master_delay %s' % self.garp_master_delay])
|
||||
|
||||
if self.nopreempt:
|
||||
config.append(' nopreempt')
|
||||
@ -299,6 +315,9 @@ class KeepalivedInstance(object):
|
||||
if len(self.virtual_routes):
|
||||
config.extend(self.virtual_routes.build_config())
|
||||
|
||||
if self.track_script:
|
||||
config.extend(self.track_script.build_config())
|
||||
|
||||
config.append('}')
|
||||
|
||||
return config
|
||||
@ -406,6 +425,10 @@ class KeepalivedManager(object):
|
||||
|
||||
keepalived_pm.enable(reload_cfg=True)
|
||||
|
||||
for key, instance in six.iteritems(self.config.instances):
|
||||
if instance.track_script:
|
||||
instance.track_script.write_check_script()
|
||||
|
||||
self.process_monitor.register(uuid=self.resource_id,
|
||||
service_name=KEEPALIVED_SERVICE_NAME,
|
||||
monitored_process=keepalived_pm)
|
||||
@ -453,3 +476,81 @@ class KeepalivedManager(object):
|
||||
return cmd
|
||||
|
||||
return callback
|
||||
|
||||
|
||||
class KeepalivedTrackScript(KeepalivedConf):
    """Track script generator for Keepalived.

    Renders the ``vrrp_script`` and ``track_script`` configuration sections
    for one VRRP instance and generates the bash health check script those
    sections refer to. The script pings the next hop of every entry in
    ``routes`` and exits with status 1 as soon as one ping fails.
    """

    def __init__(self, interval, conf_dir, vr_id):
        """Store the settings used to render the config and the script.

        :param interval: value written after the ``interval`` keyword of
                         the ``vrrp_script`` section
        :param conf_dir: directory in which the check script is created
        :param vr_id: virtual router ID; it is embedded in both the
                      ``vrrp_script`` name and the script file name
        """
        self.interval = interval
        self.conf_dir = conf_dir
        self.vr_id = vr_id
        # Both lists start empty and are expected to be filled in by the
        # owner of this object before the config or script is rendered.
        self.routes = []
        self.vips = []

    def build_config_preamble(self):
        """Return the ``vrrp_script`` definition as a list of lines.

        The definition is wrapped in empty strings so that it is separated
        from neighbouring sections by blank lines once the lines are joined.

        :return: list of configuration lines
        """
        return [
            '',
            'vrrp_script %s_%s {' % (HEALTH_CHECK_NAME, self.vr_id),
            ' script "%s"' % self._get_script_location(),
            ' interval %s' % self.interval,
            ' fall 2',
            ' rise 2',
            '}',
            '',
        ]

    def _is_needed(self):
        """Check if track script is needed by checking amount of routes.

        :return: True/False
        """
        return bool(self.routes)

    def build_config(self):
        """Return the ``track_script`` section as a list of lines.

        An empty list is returned when there are no routes to check, so the
        result is always a list that can be passed to ``list.extend()`` or
        ``str.join()``.

        :return: list of configuration lines (possibly empty)
        """
        if not self._is_needed():
            return []

        return [
            ' track_script {',
            ' %s_%s' % (HEALTH_CHECK_NAME, self.vr_id),
            ' }',
        ]

    def build_script(self):
        """Return an iterator over the lines of the health check script.

        The script consists of the shebang line, the VIP guard produced by
        ``_check_ip_assigned()`` (an empty line when there are no VIPs) and
        one ping command per route that has a next hop.

        :return: iterator of script lines
        """
        ping_lines = (self._add_ip_addr(route.nexthop)
                      for route in self.routes if route.nexthop)
        return itertools.chain(['#!/bin/bash -eu'],
                               [self._check_ip_assigned()],
                               ping_lines)

    def _add_ip_addr(self, ip_addr):
        """Return the shell line that pings ``ip_addr`` exactly once.

        ``ping`` is used for IPv4 addresses and ``ping6`` for IPv6 ones;
        the generated line makes the script exit with status 1 when the
        ping fails.

        :param ip_addr: address to ping, in any form netaddr.IPAddress
                        accepts
        :return: shell command line
        """
        ping_by_version = {4: 'ping', 6: 'ping6'}
        cmd = ping_by_version.get(netaddr.IPAddress(ip_addr).version)

        return '%s -c 1 -w 1 %s 1>/dev/null || exit 1' % (cmd, ip_addr)

    def _check_ip_assigned(self):
        """Return the shell line that guards the pings with a VIP check.

        The line greps the output of ``ip a`` for the address of the first
        VIP and makes the script exit with status 0 when it is not found,
        so the pings only run where that VIP is configured.

        :return: shell command line, or '' when there are no VIPs
        """
        if not self.vips:
            return ''

        first_vip = netaddr.IPNetwork(self.vips[0].ip_address).ip
        return 'ip a | grep %s || exit 0' % first_vip

    def _get_script_str(self):
        """Generates and returns bash script to verify connectivity.

        :return: Bash script code
        """
        return '\n'.join(self.build_script())

    def _get_script_location(self):
        """Return the path of the check script inside ``conf_dir``."""
        script_name = 'ha_check_script_%s.sh' % self.vr_id
        return os.path.join(self.conf_dir, script_name)

    def write_check_script(self):
        """Write the health check script to disk if it is needed.

        Nothing is written when there are no routes to check. The file is
        created with mode 0o520.
        """
        if not self._is_needed():
            return

        file_utils.replace_file(
            self._get_script_location(), self._get_script_str(), 0o520)
|
||||
|
@ -43,6 +43,18 @@ OPTS = [
|
||||
'keepalived server connection requests. '
|
||||
'More threads create a higher CPU load '
|
||||
'on the agent node.')),
|
||||
# Interval, in seconds, between runs of the keepalived gateway health check;
# the default of 0 leaves the health check disabled.
cfg.IntOpt('ha_vrrp_health_check_interval',
           default=0,
           help=_('The VRRP health check interval in seconds. Values > 0 '
                  'enable VRRP health checks. Setting it to 0 disables '
                  'VRRP health checks. Recommended value is 5. '
                  'This will cause pings to be sent to the gateway '
                  'IP address(es) - requires ICMP_ECHO_REQUEST '
                  'to be enabled on the gateway. '
                  'If gateway fails, all routers will be reported '
                  'as master, and master election will be repeated '
                  'in round-robin fashion, until one of the routers '
                  'restores the gateway connection.')),
|
||||
]
|
||||
|
||||
|
||||
|
@ -591,6 +591,16 @@ class L3AgentTestFramework(base.BaseSudoTestCase):
|
||||
ha_device = ip_lib.IPDevice(device_name, router.ha_namespace)
|
||||
ha_device.link.set_down()
|
||||
|
||||
@staticmethod
def fail_gw_router_port(router):
    """Set the link of the router's external network bridge down.

    The bridge name is read from
    ``router.driver.conf.external_network_bridge``.
    """
    bridge_name = router.driver.conf.external_network_bridge
    ip_lib.IPDevice(bridge_name).link.set_down()
|
||||
|
||||
@staticmethod
def restore_gw_router_port(router):
    """Set the link of the router's external network bridge up.

    The bridge name is read from
    ``router.driver.conf.external_network_bridge``.
    """
    bridge_name = router.driver.conf.external_network_bridge
    ip_lib.IPDevice(bridge_name).link.set_up()
|
||||
|
||||
@classmethod
|
||||
def _get_addresses_on_device(cls, namespace, interface):
|
||||
return [address['cidr'] for address in
|
||||
|
@ -336,6 +336,54 @@ class L3HATestFailover(framework.L3AgentTestFramework):
|
||||
self.assertEqual(master_router, new_slave)
|
||||
self.assertEqual(slave_router, new_master)
|
||||
|
||||
def test_ha_router_lost_gw_connection(self):
    """Roles must swap once the master's gateway bridge goes down."""
    # Enable the VRRP health check (5 second interval) on both agents.
    for l3_agent in (self.agent, self.failover_agent):
        l3_agent.conf.set_override('ha_vrrp_health_check_interval', 5)

    first_router, second_router = self.create_ha_routers()
    master_router, slave_router = self._get_master_and_slave_routers(
        first_router, second_router)

    self.fail_gw_router_port(master_router)

    # NOTE: passing slave_router as first argument, because we expect
    # that this router should be the master
    new_master, new_slave = self._get_master_and_slave_routers(
        slave_router, master_router)

    self.assertEqual(master_router, new_slave)
    self.assertEqual(slave_router, new_master)
|
||||
|
||||
def test_both_ha_router_lost_gw_connection(self):
    """Both routers report master while both gateway bridges are down.

    After the original master's bridge is brought back up, the original
    master/slave roles are expected again.
    """
    # Enable the VRRP health check (5 second interval) on both agents.
    for l3_agent in (self.agent, self.failover_agent):
        l3_agent.conf.set_override('ha_vrrp_health_check_interval', 5)

    first_router, second_router = self.create_ha_routers()
    master_router, slave_router = self._get_master_and_slave_routers(
        first_router, second_router)

    # Take the gateway bridge down for the master first, then the slave.
    for ha_router in (master_router, slave_router):
        self.fail_gw_router_port(ha_router)

    # With no reachable gateway on either side, each router is expected
    # to end up reporting the 'master' state.
    common_utils.wait_until_true(
        lambda: master_router.ha_state == 'master')
    common_utils.wait_until_true(
        lambda: slave_router.ha_state == 'master')

    self.restore_gw_router_port(master_router)

    new_master, new_slave = self._get_master_and_slave_routers(
        master_router, slave_router)

    self.assertEqual(master_router, new_master)
    self.assertEqual(slave_router, new_slave)
|
||||
|
||||
|
||||
class LinuxBridgeL3HATestCase(L3HATestCase):
|
||||
INTERFACE_DRIVER = 'neutron.agent.linux.interface.BridgeInterfaceDriver'
|
||||
|
@ -11,11 +11,16 @@
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
|
||||
from neutron_lib import constants as n_consts
|
||||
import os
|
||||
|
||||
import mock
|
||||
import testtools
|
||||
import textwrap
|
||||
|
||||
from neutron_lib import constants as n_consts
|
||||
|
||||
from neutron.agent.linux import keepalived
|
||||
from neutron.tests import base
|
||||
|
||||
@ -29,6 +34,8 @@ KEEPALIVED_GLOBAL_CONFIG = textwrap.dedent("""\
|
||||
}""") % dict(
|
||||
email_from=keepalived.KEEPALIVED_EMAIL_FROM,
|
||||
router_id=keepalived.KEEPALIVED_ROUTER_ID)
|
||||
VRRP_ID = 1
|
||||
VRRP_INTERVAL = 5
|
||||
|
||||
|
||||
class KeepalivedGetFreeRangeTestCase(base.BaseTestCase):
|
||||
@ -316,7 +323,32 @@ class KeepalivedInstanceTestCase(base.BaseTestCase,
|
||||
}
|
||||
}""")
|
||||
instance = keepalived.KeepalivedInstance(
|
||||
'MASTER', 'eth0', 1, ['169.254.192.0/18'])
|
||||
'MASTER', 'eth0', VRRP_ID, ['169.254.192.0/18'])
|
||||
self.assertEqual(expected, os.linesep.join(instance.build_config()))
|
||||
|
||||
def test_build_config_no_vips_track_script(self):
|
||||
expected = """
|
||||
vrrp_script ha_health_check_1 {
|
||||
script "/etc/ha_confs/qrouter-x/ha_check_script_1.sh"
|
||||
interval 5
|
||||
fall 2
|
||||
rise 2
|
||||
}
|
||||
|
||||
vrrp_instance VR_1 {
|
||||
state MASTER
|
||||
interface eth0
|
||||
virtual_router_id 1
|
||||
priority 50
|
||||
garp_master_delay 60
|
||||
virtual_ipaddress {
|
||||
169.254.0.1/24 dev eth0
|
||||
}
|
||||
}"""
|
||||
instance = keepalived.KeepalivedInstance(
|
||||
'MASTER', 'eth0', VRRP_ID, ['169.254.192.0/18'])
|
||||
instance.track_script = keepalived.KeepalivedTrackScript(
|
||||
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
|
||||
self.assertEqual(expected, '\n'.join(instance.build_config()))
|
||||
|
||||
|
||||
@ -346,3 +378,74 @@ class KeepalivedVirtualRouteTestCase(base.BaseTestCase):
|
||||
def test_virtual_route_without_dev(self):
|
||||
route = keepalived.KeepalivedVirtualRoute('50.0.0.0/8', '1.2.3.4')
|
||||
self.assertEqual('50.0.0.0/8 via 1.2.3.4', route.build_config())
|
||||
|
||||
|
||||
class KeepalivedTrackScriptTestCase(base.BaseTestCase):
|
||||
|
||||
def test_build_config_preamble(self):
|
||||
exp_conf = [
|
||||
'',
|
||||
'vrrp_script ha_health_check_1 {',
|
||||
' script "/etc/ha_confs/qrouter-x/ha_check_script_1.sh"',
|
||||
' interval 5',
|
||||
' fall 2',
|
||||
' rise 2',
|
||||
'}',
|
||||
'']
|
||||
ts = keepalived.KeepalivedTrackScript(
|
||||
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
|
||||
self.assertEqual(exp_conf, ts.build_config_preamble())
|
||||
|
||||
def test_get_config_str(self):
|
||||
ts = keepalived.KeepalivedTrackScript(
|
||||
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
|
||||
ts.routes = [
|
||||
keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), ]
|
||||
self.assertEqual(''' track_script {
|
||||
ha_health_check_1
|
||||
}''',
|
||||
ts.get_config_str())
|
||||
|
||||
def test_get_script_str(self):
|
||||
ts = keepalived.KeepalivedTrackScript(
|
||||
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
|
||||
ts.routes = [
|
||||
keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'), ]
|
||||
ts.vips = [
|
||||
keepalived.KeepalivedVipAddress('192.168.0.3/18', 'ha-xxx'), ]
|
||||
|
||||
self.assertEqual("""#!/bin/bash -eu
|
||||
ip a | grep 192.168.0.3 || exit 0
|
||||
ping -c 1 -w 1 10.0.0.1 1>/dev/null || exit 1""",
|
||||
ts._get_script_str())
|
||||
|
||||
def test_get_script_str_no_routes(self):
|
||||
ts = keepalived.KeepalivedTrackScript(
|
||||
VRRP_INTERVAL, '/etc/ha_confs/qrouter-x', VRRP_ID)
|
||||
|
||||
self.assertEqual('#!/bin/bash -eu\n', ts._get_script_str())
|
||||
|
||||
def test_write_check_script(self):
|
||||
conf_dir = '/etc/ha_confs/qrouter-x'
|
||||
ts = keepalived.KeepalivedTrackScript(VRRP_INTERVAL, conf_dir, VRRP_ID)
|
||||
ts.routes = [
|
||||
keepalived.KeepalivedVirtualRoute('12.0.0.0/24', '10.0.0.1'),
|
||||
keepalived.KeepalivedVirtualRoute('2001:db8::1', '2001:db8::1'), ]
|
||||
with mock.patch.object(keepalived, 'file_utils') as patched_utils:
|
||||
ts.write_check_script()
|
||||
patched_utils.replace_file.assert_called_with(
|
||||
os.path.join(conf_dir, 'ha_check_script_1.sh'),
|
||||
"""#!/bin/bash -eu
|
||||
|
||||
ping -c 1 -w 1 10.0.0.1 1>/dev/null || exit 1
|
||||
ping6 -c 1 -w 1 2001:db8::1 1>/dev/null || exit 1""",
|
||||
0o520
|
||||
)
|
||||
|
||||
def test_write_check_script_no_routes(self):
|
||||
conf_dir = '/etc/ha_confs/qrouter-x'
|
||||
ts = keepalived.KeepalivedTrackScript(
|
||||
VRRP_INTERVAL, conf_dir, VRRP_ID)
|
||||
with mock.patch.object(keepalived, 'file_utils') as patched_utils:
|
||||
ts.write_check_script()
|
||||
patched_utils.replace_file.assert_not_called()
|
||||
|
@ -0,0 +1,11 @@
|
||||
---
|
||||
prelude: >
|
||||
Keepalived VRRP health check functionality to enable verification of
|
||||
connectivity from the "master" router to all gateways.
|
||||
features:
|
||||
- Activation of this feature enables gateway connectivity validation and
|
||||
rescheduling of the "master" router to another node when connectivity
|
||||
is lost. If all routers lose connectivity to the gateways, the election
|
||||
process will be repeated round-robin until one of the routers restores
|
||||
its gateway connection. In the meantime, all of the routers will be
|
||||
reported as "master".
|
Loading…
Reference in New Issue
Block a user