Burn-in: Dynamic network pairing
Pair nodes dynamically via a distributed coordination backend for network burn-in. The algorithm uses a group to pair nodes: after acquiring a lock, a first node joins the group, releases the lock, waits for a second node, then they both leave, and release the lock for the next pair. Story: #2007523 Task: #42796 Change-Id: I572093b144bc90a49cd76929c7e8685ed45d9f6e
This commit is contained in:
parent
fa5cccd137
commit
7f15455d8d
@ -11,11 +11,13 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import socket
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from ironic_lib import utils
|
from ironic_lib import utils
|
||||||
from oslo_concurrency import processutils
|
from oslo_concurrency import processutils
|
||||||
from oslo_log import log
|
from oslo_log import log
|
||||||
|
from tooz import coordination
|
||||||
|
|
||||||
from ironic_python_agent import errors
|
from ironic_python_agent import errors
|
||||||
from ironic_python_agent import hardware
|
from ironic_python_agent import hardware
|
||||||
@ -252,45 +254,160 @@ def _do_fio_network(writer, runtime, partner, outputfile):
|
|||||||
raise errors.CommandExecutionError(error_msg)
|
raise errors.CommandExecutionError(error_msg)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_network_burnin_partner_and_role(backend_url, group_name, timeout):
|
||||||
|
"""Find a partner node for network burn-in and get our role.
|
||||||
|
|
||||||
|
:param backend_url: The tooz backend url.
|
||||||
|
:param group_name: The tooz group name for pairing.
|
||||||
|
:param timeout:Timeout in seconds for a node to wait for a partner.
|
||||||
|
:returns: A set with the partner node and the role of the local node.
|
||||||
|
"""
|
||||||
|
|
||||||
|
member_id = socket.gethostname()
|
||||||
|
coordinator = coordination.get_coordinator(backend_url, member_id)
|
||||||
|
coordinator.start(start_heart=True)
|
||||||
|
|
||||||
|
groups = coordinator.get_groups()
|
||||||
|
for group in groups.get():
|
||||||
|
if group_name == group.decode('utf-8'):
|
||||||
|
LOG.debug("Found group %s", group_name)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
LOG.info("Creating group %s", group_name)
|
||||||
|
coordinator.create_group(group_name)
|
||||||
|
|
||||||
|
def join_group(group_name):
|
||||||
|
request = coordinator.join_group(group_name)
|
||||||
|
request.get()
|
||||||
|
|
||||||
|
def leave_group(group_name):
|
||||||
|
request = coordinator.leave_group(group_name)
|
||||||
|
request.get()
|
||||||
|
|
||||||
|
# Attempt to get the pairing lock. The lock is released when:
|
||||||
|
# a) a node enters the group and is the first to join, or
|
||||||
|
# b) a node enters second, finished pairing, sees
|
||||||
|
# the pairing node exiting, and left itself.
|
||||||
|
# The lock 'walls' all nodes willing to pair.
|
||||||
|
group_lock = coordinator.get_lock("group_lock")
|
||||||
|
with group_lock:
|
||||||
|
# we need the initial members in order to know the first
|
||||||
|
# node (which may leave quickly when we join)
|
||||||
|
init_members = coordinator.get_members(group_name)
|
||||||
|
LOG.info("Original group members are %s", init_members.get())
|
||||||
|
members_cnt = len(init_members.get())
|
||||||
|
|
||||||
|
join_group(group_name)
|
||||||
|
|
||||||
|
# we assign the first node the writer role since it will
|
||||||
|
# leave the group first, it may be ready once the second
|
||||||
|
# node leaves the group, and we save one wait cycle
|
||||||
|
if not members_cnt:
|
||||||
|
first = True
|
||||||
|
role = "writer"
|
||||||
|
group_lock.release() # allow second node to enter
|
||||||
|
else:
|
||||||
|
first = False
|
||||||
|
role = "reader"
|
||||||
|
|
||||||
|
partner = None
|
||||||
|
start_pairing = time.time()
|
||||||
|
while time.time() - start_pairing < timeout:
|
||||||
|
if first:
|
||||||
|
# we are the first and therefore need to wait
|
||||||
|
# for another node to join
|
||||||
|
members = coordinator.get_members(group_name)
|
||||||
|
members_cnt = len(members.get())
|
||||||
|
else:
|
||||||
|
# use the initial members in case the other
|
||||||
|
# node leaves before we get an updated list
|
||||||
|
members = init_members
|
||||||
|
|
||||||
|
assert members_cnt < 3
|
||||||
|
|
||||||
|
if members_cnt == 2 or not first:
|
||||||
|
LOG.info("Two members, start pairing...")
|
||||||
|
for member in members.get():
|
||||||
|
node = member.decode('utf-8')
|
||||||
|
if node != member_id:
|
||||||
|
partner = node
|
||||||
|
if not partner:
|
||||||
|
error_msg = ("fio (network) no partner to pair found")
|
||||||
|
raise errors.CleaningError(error_msg)
|
||||||
|
|
||||||
|
# if you are the second to enter, wait for the first to exit
|
||||||
|
if not first:
|
||||||
|
members = coordinator.get_members(group_name)
|
||||||
|
while (len(members.get()) == 2):
|
||||||
|
time.sleep(0.2)
|
||||||
|
members = coordinator.get_members(group_name)
|
||||||
|
leave_group(group_name)
|
||||||
|
group_lock.release()
|
||||||
|
else:
|
||||||
|
leave_group(group_name)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
LOG.info("One member, waiting for second node to join ...")
|
||||||
|
time.sleep(1)
|
||||||
|
else:
|
||||||
|
leave_group(group_name)
|
||||||
|
error_msg = ("fio (network) timed out to find partner")
|
||||||
|
raise errors.CleaningError(error_msg)
|
||||||
|
|
||||||
|
return (partner, role)
|
||||||
|
|
||||||
|
|
||||||
def fio_network(node):
|
def fio_network(node):
|
||||||
"""Burn-in the network with fio
|
"""Burn-in the network with fio
|
||||||
|
|
||||||
Run an fio network job for a pair of nodes for a configurable
|
Run an fio network job for a pair of nodes for a configurable
|
||||||
amount of time. The pair is statically defined in driver_info
|
amount of time. The pair is either statically defined in
|
||||||
via 'agent_burnin_fio_network_config'.
|
driver_info via 'agent_burnin_fio_network_config' or the role
|
||||||
|
and partner is found dynamically via a tooz backend.
|
||||||
|
|
||||||
The writer will wait for the reader to connect, then write to the
|
The writer will wait for the reader to connect, then write to the
|
||||||
network. Upon completion, the roles are swapped.
|
network. Upon completion, the roles are swapped.
|
||||||
|
|
||||||
Note (arne_wiebalck): Initial version. The plan is to make the
|
|
||||||
match making dynamic by posting availability
|
|
||||||
on a distributed backend, e.g. via tooz.
|
|
||||||
|
|
||||||
:param node: Ironic node object
|
:param node: Ironic node object
|
||||||
:raises: CommandExecutionError if the execution of fio fails.
|
:raises: CommandExecutionError if the execution of fio fails.
|
||||||
:raises: CleaningError if the configuration is incomplete.
|
:raises: CleaningError if the configuration is incomplete.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
info = node.get('driver_info', {})
|
info = node.get('driver_info', {})
|
||||||
runtime = info.get('agent_burnin_fio_network_runtime', 21600)
|
runtime = info.get('agent_burnin_fio_network_runtime', 21600)
|
||||||
outputfile = info.get('agent_burnin_fio_network_outputfile', None)
|
outputfile = info.get('agent_burnin_fio_network_outputfile', None)
|
||||||
|
|
||||||
# get our role and identify our partner
|
# get our role and identify our partner
|
||||||
config = info.get('agent_burnin_fio_network_config')
|
config = info.get('agent_burnin_fio_network_config')
|
||||||
if not config:
|
if config:
|
||||||
error_msg = ("fio (network) failed to find "
|
LOG.debug("static agent_burnin_fio_network_config is %s",
|
||||||
"'agent_burnin_fio_network_config' in driver_info")
|
config)
|
||||||
raise errors.CleaningError(error_msg)
|
|
||||||
LOG.debug("agent_burnin_fio_network_config is %s", str(config))
|
|
||||||
|
|
||||||
role = config.get('role')
|
role = config.get('role')
|
||||||
|
partner = config.get('partner')
|
||||||
|
else:
|
||||||
|
timeout = info.get(
|
||||||
|
'agent_burnin_fio_network_pairing_timeout', 900)
|
||||||
|
group_name = info.get(
|
||||||
|
'agent_burnin_fio_network_pairing_group_name',
|
||||||
|
'ironic.network-burnin')
|
||||||
|
backend_url = info.get(
|
||||||
|
'agent_burnin_fio_network_pairing_backend_url', None)
|
||||||
|
if not backend_url:
|
||||||
|
msg = ('fio (network): dynamic pairing config is missing '
|
||||||
|
'agent_burnin_fio_network_pairing_backend_url')
|
||||||
|
raise errors.CleaningError(msg)
|
||||||
|
LOG.info("dynamic pairing for network burn-in ...")
|
||||||
|
(partner, role) = _find_network_burnin_partner_and_role(
|
||||||
|
backend_url=backend_url,
|
||||||
|
group_name=group_name,
|
||||||
|
timeout=timeout)
|
||||||
|
|
||||||
if role not in NETWORK_BURNIN_ROLES:
|
if role not in NETWORK_BURNIN_ROLES:
|
||||||
error_msg = "fio (network) found an unknown role: %s" % role
|
error_msg = "fio (network) found an unknown role: %s" % role
|
||||||
raise errors.CleaningError(error_msg)
|
raise errors.CleaningError(error_msg)
|
||||||
|
|
||||||
partner = config.get('partner')
|
|
||||||
if not partner:
|
if not partner:
|
||||||
error_msg = ("fio (network) failed to find partner")
|
error_msg = "fio (network) failed to find partner"
|
||||||
raise errors.CleaningError(error_msg)
|
raise errors.CleaningError(error_msg)
|
||||||
|
LOG.info("fio (network): partner %s, role is %s", partner, role)
|
||||||
|
|
||||||
logfilename = None
|
logfilename = None
|
||||||
if outputfile:
|
if outputfile:
|
||||||
|
@ -14,6 +14,7 @@ from unittest import mock
|
|||||||
|
|
||||||
from ironic_lib import utils
|
from ironic_lib import utils
|
||||||
from oslo_concurrency import processutils
|
from oslo_concurrency import processutils
|
||||||
|
from tooz import coordination
|
||||||
|
|
||||||
from ironic_python_agent import burnin
|
from ironic_python_agent import burnin
|
||||||
from ironic_python_agent import errors
|
from ironic_python_agent import errors
|
||||||
@ -379,3 +380,138 @@ class TestBurnin(base.IronicAgentTest):
|
|||||||
# we loop 3 times, then do the 2 fio calls
|
# we loop 3 times, then do the 2 fio calls
|
||||||
self.assertEqual(5, mock_execute.call_count)
|
self.assertEqual(5, mock_execute.call_count)
|
||||||
self.assertEqual(3, mock_time.call_count)
|
self.assertEqual(3, mock_time.call_count)
|
||||||
|
|
||||||
|
def test_fio_network_dynamic_pairing_raise_missing_config(self,
|
||||||
|
mock_execute):
|
||||||
|
node = {'driver_info': {}}
|
||||||
|
self.assertRaises(errors.CleaningError, burnin.fio_network, node)
|
||||||
|
|
||||||
|
def test_fio_network_dynamic_pairing_raise_wrong_config(self,
|
||||||
|
mock_execute):
|
||||||
|
node = {'driver_info': {
|
||||||
|
'backend_url': 'zookeeper://zookeeper-host-01:2181',
|
||||||
|
'group_name': 'ironic.dynamic-network-burnin',
|
||||||
|
'timeout': 600}}
|
||||||
|
self.assertRaises(errors.CleaningError, burnin.fio_network, node)
|
||||||
|
|
||||||
|
@mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
|
||||||
|
autospec=True)
|
||||||
|
def test_fio_network_dynamic_pairing_defaults(self, mock_find,
|
||||||
|
mock_execute):
|
||||||
|
node = {'driver_info': {
|
||||||
|
'agent_burnin_fio_network_pairing_backend_url':
|
||||||
|
'zookeeper://zookeeper-host-01:2181'}}
|
||||||
|
mock_find.return_value = ['partner-host', 'reader']
|
||||||
|
mock_execute.return_value = (['out', 'err'])
|
||||||
|
|
||||||
|
burnin.fio_network(node)
|
||||||
|
|
||||||
|
mock_find.assert_called_once_with(
|
||||||
|
backend_url='zookeeper://zookeeper-host-01:2181',
|
||||||
|
group_name='ironic.network-burnin',
|
||||||
|
timeout=900)
|
||||||
|
|
||||||
|
@mock.patch.object(burnin, '_find_network_burnin_partner_and_role',
|
||||||
|
autospec=True)
|
||||||
|
def test_fio_network_dynamic_pairing_no_defaults(self, mock_find,
|
||||||
|
mock_execute):
|
||||||
|
node = {'driver_info': {
|
||||||
|
'agent_burnin_fio_network_pairing_backend_url':
|
||||||
|
'zookeeper://zookeeper-host-01:2181',
|
||||||
|
'agent_burnin_fio_network_pairing_group_name':
|
||||||
|
'ironic.special-group',
|
||||||
|
'agent_burnin_fio_network_pairing_timeout': 600}}
|
||||||
|
mock_find.return_value = ['partner-host', 'reader']
|
||||||
|
mock_execute.return_value = (['out', 'err'])
|
||||||
|
|
||||||
|
burnin.fio_network(node)
|
||||||
|
|
||||||
|
mock_find.assert_called_once_with(
|
||||||
|
backend_url='zookeeper://zookeeper-host-01:2181',
|
||||||
|
group_name='ironic.special-group',
|
||||||
|
timeout=600)
|
||||||
|
|
||||||
|
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
|
||||||
|
def test_fio_network_dynamic_find_timeout(self, mock_get_coordinator,
|
||||||
|
mock_execute):
|
||||||
|
mock_coordinator = mock.MagicMock()
|
||||||
|
mock_get_coordinator.return_value = mock_coordinator
|
||||||
|
|
||||||
|
# timeout since no other node is joining
|
||||||
|
self.assertRaises(errors.CleaningError,
|
||||||
|
burnin._find_network_burnin_partner_and_role,
|
||||||
|
"zk://xyz", 'group', 2)
|
||||||
|
|
||||||
|
# group did not exist, so we created it
|
||||||
|
mock_coordinator.create_group.assert_called_once_with('group')
|
||||||
|
mock_coordinator.join_group.assert_called_once()
|
||||||
|
# get_members is called initially, then every second
|
||||||
|
# up to the timeout
|
||||||
|
self.assertEqual(3, mock_coordinator.get_members.call_count)
|
||||||
|
|
||||||
|
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
|
||||||
|
def test_fio_network_dynamic_find_pair_1st(self, mock_get_coordinator,
|
||||||
|
mock_execute):
|
||||||
|
mock_coordinator = mock.MagicMock()
|
||||||
|
mock_get_coordinator.return_value = mock_coordinator
|
||||||
|
|
||||||
|
class Members:
|
||||||
|
def __init__(self, members=[]):
|
||||||
|
self.members = members
|
||||||
|
|
||||||
|
def get(self):
|
||||||
|
return self.members
|
||||||
|
|
||||||
|
# we are the first node to enter, so no other host
|
||||||
|
# initially until the second one appears after some
|
||||||
|
# interations
|
||||||
|
mock_coordinator.get_members.side_effect = \
|
||||||
|
[Members(), Members([b'host1']), Members([b'host1']),
|
||||||
|
Members([b'host1']), Members([b'host1', b'host2'])]
|
||||||
|
|
||||||
|
(partner, role) = \
|
||||||
|
burnin._find_network_burnin_partner_and_role("zk://xyz",
|
||||||
|
"group", 10)
|
||||||
|
|
||||||
|
# ... so we will leave first and be the writer
|
||||||
|
self.assertEqual((partner, role), ("host2", "writer"))
|
||||||
|
|
||||||
|
# group did not exist, so we created it
|
||||||
|
mock_coordinator.create_group.assert_called_once_with('group')
|
||||||
|
mock_coordinator.join_group.assert_called_once()
|
||||||
|
# get_members is called initially, then every second
|
||||||
|
# up to the timeout
|
||||||
|
self.assertEqual(5, mock_coordinator.get_members.call_count)
|
||||||
|
|
||||||
|
@mock.patch.object(coordination, 'get_coordinator', autospec=True)
|
||||||
|
def test_fio_network_dynamic_find_pair_2nd(self, mock_get_coordinator,
|
||||||
|
mock_execute):
|
||||||
|
mock_coordinator = mock.MagicMock()
|
||||||
|
mock_get_coordinator.return_value = mock_coordinator
|
||||||
|
|
||||||
|
class Members:
|
||||||
|
def __init__(self, members=[]):
|
||||||
|
self.members = members
|
||||||
|
|
||||||
|
def get(self):
|
||||||
|
return self.members
|
||||||
|
|
||||||
|
# we are the second node to enter, host1 is there before us ...
|
||||||
|
mock_coordinator.get_members.side_effect = \
|
||||||
|
[Members([b'host1']),
|
||||||
|
Members([b'host1', b'host2']),
|
||||||
|
Members([b'host2'])]
|
||||||
|
|
||||||
|
(partner, role) = \
|
||||||
|
burnin._find_network_burnin_partner_and_role("zk://xyz",
|
||||||
|
"group", 10)
|
||||||
|
|
||||||
|
# ... so we will leave second and be the reader
|
||||||
|
self.assertEqual((partner, role), ("host1", "reader"))
|
||||||
|
|
||||||
|
# group did not exist, so we created it
|
||||||
|
mock_coordinator.create_group.assert_called_once_with('group')
|
||||||
|
mock_coordinator.join_group.assert_called_once()
|
||||||
|
# get_members is called initially, then every second until the
|
||||||
|
# other node appears
|
||||||
|
self.assertEqual(3, mock_coordinator.get_members.call_count)
|
||||||
|
@ -7,6 +7,7 @@ dogpile.cache==0.9.2
|
|||||||
eventlet==0.18.2
|
eventlet==0.18.2
|
||||||
importlib_metadata==1.7.0;python_version<'3.8'
|
importlib_metadata==1.7.0;python_version<'3.8'
|
||||||
ironic-lib==5.1.0
|
ironic-lib==5.1.0
|
||||||
|
kazoo==2.8.0
|
||||||
netifaces==0.10.4
|
netifaces==0.10.4
|
||||||
openstacksdk==0.49.0
|
openstacksdk==0.49.0
|
||||||
oslo.concurrency==3.26.0
|
oslo.concurrency==3.26.0
|
||||||
@ -24,3 +25,4 @@ stestr==1.0.0
|
|||||||
stevedore==1.20.0
|
stevedore==1.20.0
|
||||||
tenacity==6.2.0
|
tenacity==6.2.0
|
||||||
testtools==2.2.0
|
testtools==2.2.0
|
||||||
|
tooz==2.7.2
|
||||||
|
@ -0,0 +1,17 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
For network burn-in, nodes can now be paired dynamically via a
|
||||||
|
distributed coordination backend (as an alternative to a static
|
||||||
|
configuration). This allows burn-in to proceed on a 'first come
|
||||||
|
first served' basis with the nodes available, rather than a node
|
||||||
|
being blocked since the static partner is currently delayed.
|
||||||
|
In order to configure this dynamic pairing, the nodes will need
|
||||||
|
at least 'agent_burnin_fio_network_pairing_backend_url' in their
|
||||||
|
driver-info (the URL for the coordination backend). In order to
|
||||||
|
separate different hardware types, which may be using different
|
||||||
|
networks and shall be burnt-in separately, the nodes can in
|
||||||
|
addition define 'agent_burnin_fio_network_pairing_group_name' to
|
||||||
|
have pairing only happening between nodes in the same group. An
|
||||||
|
additional 'agent_burnin_fio_network_pairing_timeout' allows to
|
||||||
|
limit the time given to the nodes to wait for a partner.
|
@ -20,3 +20,4 @@ tenacity>=6.2.0 # Apache-2.0
|
|||||||
ironic-lib>=5.1.0 # Apache-2.0
|
ironic-lib>=5.1.0 # Apache-2.0
|
||||||
Werkzeug>=1.0.1 # BSD License
|
Werkzeug>=1.0.1 # BSD License
|
||||||
cryptography>=2.3 # BSD/Apache-2.0
|
cryptography>=2.3 # BSD/Apache-2.0
|
||||||
|
tooz>=2.7.2 # Apache-2.0
|
||||||
|
Loading…
Reference in New Issue
Block a user