Burn-in: Dynamic network pairing

Pair nodes dynamically via a distributed coordination backend for
network burn-in. The algorithm uses a group to pair nodes: after
acquiring a lock, a first node joins the group, releases the lock,
waits for a second node, then they both leave, and release the lock
for the next pair.

Story: #2007523
Task: #42796

Change-Id: I572093b144bc90a49cd76929c7e8685ed45d9f6e
This commit is contained in:
Arne Wiebalck 2021-12-06 15:23:21 +01:00
parent fa5cccd137
commit 7f15455d8d
6 changed files with 293 additions and 16 deletions

View File

@ -11,11 +11,13 @@
# limitations under the License.
import json
import socket
import time
from ironic_lib import utils
from oslo_concurrency import processutils
from oslo_log import log
from tooz import coordination
from ironic_python_agent import errors
from ironic_python_agent import hardware
@ -252,45 +254,160 @@ def _do_fio_network(writer, runtime, partner, outputfile):
raise errors.CommandExecutionError(error_msg)
def _find_network_burnin_partner_and_role(backend_url, group_name, timeout):
"""Find a partner node for network burn-in and get our role.
:param backend_url: The tooz backend url.
:param group_name: The tooz group name for pairing.
:param timeout:Timeout in seconds for a node to wait for a partner.
:returns: A set with the partner node and the role of the local node.
"""
member_id = socket.gethostname()
coordinator = coordination.get_coordinator(backend_url, member_id)
coordinator.start(start_heart=True)
groups = coordinator.get_groups()
for group in groups.get():
if group_name == group.decode('utf-8'):
LOG.debug("Found group %s", group_name)
break
else:
LOG.info("Creating group %s", group_name)
coordinator.create_group(group_name)
def join_group(group_name):
request = coordinator.join_group(group_name)
request.get()
def leave_group(group_name):
request = coordinator.leave_group(group_name)
request.get()
# Attempt to get the pairing lock. The lock is released when:
# a) a node enters the group and is the first to join, or
# b) a node enters second, finished pairing, sees
# the pairing node exiting, and left itself.
# The lock 'walls' all nodes willing to pair.
group_lock = coordinator.get_lock("group_lock")
with group_lock:
# we need the initial members in order to know the first
# node (which may leave quickly when we join)
init_members = coordinator.get_members(group_name)
LOG.info("Original group members are %s", init_members.get())
members_cnt = len(init_members.get())
join_group(group_name)
# we assign the first node the writer role since it will
# leave the group first, it may be ready once the second
# node leaves the group, and we save one wait cycle
if not members_cnt:
first = True
role = "writer"
group_lock.release() # allow second node to enter
else:
first = False
role = "reader"
partner = None
start_pairing = time.time()
while time.time() - start_pairing < timeout:
if first:
# we are the first and therefore need to wait
# for another node to join
members = coordinator.get_members(group_name)
members_cnt = len(members.get())
else:
# use the initial members in case the other
# node leaves before we get an updated list
members = init_members
assert members_cnt < 3
if members_cnt == 2 or not first:
LOG.info("Two members, start pairing...")
for member in members.get():
node = member.decode('utf-8')
if node != member_id:
partner = node
if not partner:
error_msg = ("fio (network) no partner to pair found")
raise errors.CleaningError(error_msg)
# if you are the second to enter, wait for the first to exit
if not first:
members = coordinator.get_members(group_name)
while (len(members.get()) == 2):
time.sleep(0.2)
members = coordinator.get_members(group_name)
leave_group(group_name)
group_lock.release()
else:
leave_group(group_name)
break
else:
LOG.info("One member, waiting for second node to join ...")
time.sleep(1)
else:
leave_group(group_name)
error_msg = ("fio (network) timed out to find partner")
raise errors.CleaningError(error_msg)
return (partner, role)
def fio_network(node):
"""Burn-in the network with fio
Run an fio network job for a pair of nodes for a configurable
amount of time. The pair is statically defined in driver_info
via 'agent_burnin_fio_network_config'.
amount of time. The pair is either statically defined in
driver_info via 'agent_burnin_fio_network_config' or the role
and partner is found dynamically via a tooz backend.
The writer will wait for the reader to connect, then write to the
network. Upon completion, the roles are swapped.
Note (arne_wiebalck): Initial version. The plan is to make the
match making dynamic by posting availability
on a distributed backend, e.g. via tooz.
:param node: Ironic node object
:raises: CommandExecutionError if the execution of fio fails.
:raises: CleaningError if the configuration is incomplete.
"""
info = node.get('driver_info', {})
runtime = info.get('agent_burnin_fio_network_runtime', 21600)
outputfile = info.get('agent_burnin_fio_network_outputfile', None)
# get our role and identify our partner
config = info.get('agent_burnin_fio_network_config')
if not config:
error_msg = ("fio (network) failed to find "
"'agent_burnin_fio_network_config' in driver_info")
raise errors.CleaningError(error_msg)
LOG.debug("agent_burnin_fio_network_config is %s", str(config))
if config:
LOG.debug("static agent_burnin_fio_network_config is %s",
config)
role = config.get('role')
partner = config.get('partner')
else:
timeout = info.get(
'agent_burnin_fio_network_pairing_timeout', 900)
group_name = info.get(
'agent_burnin_fio_network_pairing_group_name',
'ironic.network-burnin')
backend_url = info.get(
'agent_burnin_fio_network_pairing_backend_url', None)
if not backend_url:
msg = ('fio (network): dynamic pairing config is missing '
'agent_burnin_fio_network_pairing_backend_url')
raise errors.CleaningError(msg)
LOG.info("dynamic pairing for network burn-in ...")
(partner, role) = _find_network_burnin_partner_and_role(
backend_url=backend_url,
group_name=group_name,
timeout=timeout)
if role not in NETWORK_BURNIN_ROLES:
error_msg = "fio (network) found an unknown role: %s" % role
raise errors.CleaningError(error_msg)
partner = config.get('partner')
if not partner:
error_msg = ("fio (network) failed to find partner")
error_msg = "fio (network) failed to find partner"
raise errors.CleaningError(error_msg)
LOG.info("fio (network): partner %s, role is %s", partner, role)
logfilename = None
if outputfile:

View File

@ -14,6 +14,7 @@ from unittest import mock
from ironic_lib import utils
from oslo_concurrency import processutils
from tooz import coordination
from ironic_python_agent import burnin
from ironic_python_agent import errors
@ -379,3 +380,138 @@ class TestBurnin(base.IronicAgentTest):
# we loop 3 times, then do the 2 fio calls
self.assertEqual(5, mock_execute.call_count)
self.assertEqual(3, mock_time.call_count)
def test_fio_network_dynamic_pairing_raise_missing_config(self,
mock_execute):
node = {'driver_info': {}}
self.assertRaises(errors.CleaningError, burnin.fio_network, node)
def test_fio_network_dynamic_pairing_raise_wrong_config(self,
mock_execute):
node = {'driver_info': {
'backend_url': 'zookeeper://zookeeper-host-01:2181',
'group_name': 'ironic.dynamic-network-burnin',
'timeout': 600}}
self.assertRaises(errors.CleaningError, burnin.fio_network, node)
@mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
autospec=True)
def test_fio_network_dynamic_pairing_defaults(self, mock_find,
mock_execute):
node = {'driver_info': {
'agent_burnin_fio_network_pairing_backend_url':
'zookeeper://zookeeper-host-01:2181'}}
mock_find.return_value = ['partner-host', 'reader']
mock_execute.return_value = (['out', 'err'])
burnin.fio_network(node)
mock_find.assert_called_once_with(
backend_url='zookeeper://zookeeper-host-01:2181',
group_name='ironic.network-burnin',
timeout=900)
@mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
autospec=True)
def test_fio_network_dynamic_pairing_no_defaults(self, mock_find,
mock_execute):
node = {'driver_info': {
'agent_burnin_fio_network_pairing_backend_url':
'zookeeper://zookeeper-host-01:2181',
'agent_burnin_fio_network_pairing_group_name':
'ironic.special-group',
'agent_burnin_fio_network_pairing_timeout': 600}}
mock_find.return_value = ['partner-host', 'reader']
mock_execute.return_value = (['out', 'err'])
burnin.fio_network(node)
mock_find.assert_called_once_with(
backend_url='zookeeper://zookeeper-host-01:2181',
group_name='ironic.special-group',
timeout=600)
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
def test_fio_network_dynamic_find_timeout(self, mock_get_coordinator,
mock_execute):
mock_coordinator = mock.MagicMock()
mock_get_coordinator.return_value = mock_coordinator
# timeout since no other node is joining
self.assertRaises(errors.CleaningError,
burnin._find_network_burnin_partner_and_role,
"zk://xyz", 'group', 2)
# group did not exist, so we created it
mock_coordinator.create_group.assert_called_once_with('group')
mock_coordinator.join_group.assert_called_once()
# get_members is called initially, then every second
# up to the timeout
self.assertEqual(3, mock_coordinator.get_members.call_count)
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
def test_fio_network_dynamic_find_pair_1st(self, mock_get_coordinator,
mock_execute):
mock_coordinator = mock.MagicMock()
mock_get_coordinator.return_value = mock_coordinator
class Members:
def __init__(self, members=[]):
self.members = members
def get(self):
return self.members
# we are the first node to enter, so no other host
# initially until the second one appears after some
# interations
mock_coordinator.get_members.side_effect = \
[Members(), Members([b'host1']), Members([b'host1']),
Members([b'host1']), Members([b'host1', b'host2'])]
(partner, role) = \
burnin._find_network_burnin_partner_and_role("zk://xyz",
"group", 10)
# ... so we will leave first and be the writer
self.assertEqual((partner, role), ("host2", "writer"))
# group did not exist, so we created it
mock_coordinator.create_group.assert_called_once_with('group')
mock_coordinator.join_group.assert_called_once()
# get_members is called initially, then every second
# up to the timeout
self.assertEqual(5, mock_coordinator.get_members.call_count)
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
def test_fio_network_dynamic_find_pair_2nd(self, mock_get_coordinator,
mock_execute):
mock_coordinator = mock.MagicMock()
mock_get_coordinator.return_value = mock_coordinator
class Members:
def __init__(self, members=[]):
self.members = members
def get(self):
return self.members
# we are the second node to enter, host1 is there before us ...
mock_coordinator.get_members.side_effect = \
[Members([b'host1']),
Members([b'host1', b'host2']),
Members([b'host2'])]
(partner, role) = \
burnin._find_network_burnin_partner_and_role("zk://xyz",
"group", 10)
# ... so we will leave second and be the reader
self.assertEqual((partner, role), ("host1", "reader"))
# group did not exist, so we created it
mock_coordinator.create_group.assert_called_once_with('group')
mock_coordinator.join_group.assert_called_once()
# get_members is called initially, then every second until the
# other node appears
self.assertEqual(3, mock_coordinator.get_members.call_count)

View File

@ -7,6 +7,7 @@ dogpile.cache==0.9.2
eventlet==0.18.2
importlib_metadata==1.7.0;python_version<'3.8'
ironic-lib==5.1.0
kazoo==2.8.0
netifaces==0.10.4
openstacksdk==0.49.0
oslo.concurrency==3.26.0
@ -24,3 +25,4 @@ stestr==1.0.0
stevedore==1.20.0
tenacity==6.2.0
testtools==2.2.0
tooz==2.7.2

View File

@ -0,0 +1,17 @@
---
features:
- |
For network burn-in, nodes can now be paired dynamically via a
distributed coordination backend (as an alternative to a static
configuration). This allows burn-in to proceed on a 'first come
first served' basis with the nodes available, rather than a node
being blocked since the static partner is currently delayed.
In order to configure this dynamic pairing, the nodes will need
at least 'agent_burnin_fio_network_pairing_backend_url' in their
driver-info (the URL for the coordination backend). In order to
separate different hardware types, which may be using different
networks and shall be burnt-in separately, the nodes can in
addition define 'agent_burnin_fio_network_pairing_group_name' to
have pairing only happening between nodes in the same group. An
additional 'agent_burnin_fio_network_pairing_timeout' allows to
limit the time given to the nodes to wait for a partner.

View File

@ -20,3 +20,4 @@ tenacity>=6.2.0 # Apache-2.0
ironic-lib>=5.1.0 # Apache-2.0
Werkzeug>=1.0.1 # BSD License
cryptography>=2.3 # BSD/Apache-2.0
tooz>=2.7.2 # Apache-2.0

View File

@ -64,3 +64,7 @@ api_doc_dir = contributor/api
tag_build =
tag_date = 0
tag_svn_revision = 0
[extras]
burnin-network-kazoo =
kazoo>=2.8.0 # Apache-2.0