Fix node exclusive lock not released on console start/restart.
This patch forces kill of console process with SIGKILL if it did not terminate on SIGTERM within the CONF.console.kill_timeout and reads the shellinabox subprocess stdout/stderr after CONF.console.subprocess_timeout or if subprocess exited with non zero code. Change-Id: I55a112d877d94f31d27487846ff59fe27f602f8b Closes-Bug: 1587313 Story: 1587313 Task: 9654
This commit is contained in:
parent
37f272436f
commit
0256e1e69f
@ -45,6 +45,10 @@ opts = [
|
|||||||
default=10,
|
default=10,
|
||||||
help=_('Time (in seconds) to wait for the console subprocess '
|
help=_('Time (in seconds) to wait for the console subprocess '
|
||||||
'to start.')),
|
'to start.')),
|
||||||
|
cfg.IntOpt('kill_timeout',
|
||||||
|
default=1,
|
||||||
|
help=_('Time (in seconds) to wait for the shellinabox console '
|
||||||
|
'subprocess to exit before sending SIGKILL signal.')),
|
||||||
cfg.IPOpt('socat_address',
|
cfg.IPOpt('socat_address',
|
||||||
default='$my_ip',
|
default='$my_ip',
|
||||||
help=_('IP address of Socat service running on the host of '
|
help=_('IP address of Socat service running on the host of '
|
||||||
|
@ -101,6 +101,23 @@ def _stop_console(node_uuid):
|
|||||||
console_pid = _get_console_pid(node_uuid)
|
console_pid = _get_console_pid(node_uuid)
|
||||||
|
|
||||||
os.kill(console_pid, signal.SIGTERM)
|
os.kill(console_pid, signal.SIGTERM)
|
||||||
|
|
||||||
|
# make sure the process gets killed hard if required
|
||||||
|
attempt = 0
|
||||||
|
max_attempts = CONF.console.kill_timeout // 0.2
|
||||||
|
|
||||||
|
while attempt < max_attempts:
|
||||||
|
if psutil.pid_exists(console_pid):
|
||||||
|
if attempt == max_attempts - 1:
|
||||||
|
os.kill(console_pid, signal.SIGKILL)
|
||||||
|
LOG.debug("Waiting for the console process with PID %(pid)s "
|
||||||
|
"to exit. Node: %(node)s.",
|
||||||
|
{'pid': console_pid, 'node': node_uuid})
|
||||||
|
time.sleep(0.2)
|
||||||
|
attempt += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
except OSError as exc:
|
except OSError as exc:
|
||||||
if exc.errno != errno.ESRCH:
|
if exc.errno != errno.ESRCH:
|
||||||
msg = (_("Could not stop the console for node '%(node)s'. "
|
msg = (_("Could not stop the console for node '%(node)s'. "
|
||||||
@ -198,28 +215,24 @@ def start_shellinabox_console(node_uuid, port, console_cmd):
|
|||||||
# check if the console pid is created and the process is running.
|
# check if the console pid is created and the process is running.
|
||||||
# if it is, then the shellinaboxd is invoked successfully as a daemon.
|
# if it is, then the shellinaboxd is invoked successfully as a daemon.
|
||||||
# otherwise check the error.
|
# otherwise check the error.
|
||||||
if locals['returncode'] is not None:
|
if (locals['returncode'] == 0 and os.path.exists(pid_file)
|
||||||
if (locals['returncode'] == 0 and os.path.exists(pid_file)
|
and psutil.pid_exists(_get_console_pid(node_uuid))):
|
||||||
and psutil.pid_exists(_get_console_pid(node_uuid))):
|
raise loopingcall.LoopingCallDone()
|
||||||
raise loopingcall.LoopingCallDone()
|
|
||||||
else:
|
|
||||||
(stdout, stderr) = popen_obj.communicate()
|
|
||||||
locals['errstr'] = _(
|
|
||||||
"Command: %(command)s.\n"
|
|
||||||
"Exit code: %(return_code)s.\n"
|
|
||||||
"Stdout: %(stdout)r\n"
|
|
||||||
"Stderr: %(stderr)r") % {
|
|
||||||
'command': ' '.join(args),
|
|
||||||
'return_code': locals['returncode'],
|
|
||||||
'stdout': stdout,
|
|
||||||
'stderr': stderr}
|
|
||||||
LOG.warning(locals['errstr'])
|
|
||||||
raise loopingcall.LoopingCallDone()
|
|
||||||
|
|
||||||
if (time.time() > expiration):
|
if (time.time() > expiration or locals['returncode'] is not None):
|
||||||
locals['errstr'] = (_("Timeout while waiting for console "
|
(stdout, stderr) = popen_obj.communicate()
|
||||||
"subprocess to start for node %s.") %
|
locals['errstr'] = _(
|
||||||
node_uuid)
|
"Timeout or error while waiting for console "
|
||||||
|
"subprocess to start for node: %(node)s.\n"
|
||||||
|
"Command: %(command)s.\n"
|
||||||
|
"Exit code: %(return_code)s.\n"
|
||||||
|
"Stdout: %(stdout)r\n"
|
||||||
|
"Stderr: %(stderr)r") % {
|
||||||
|
'node': node_uuid,
|
||||||
|
'command': ' '.join(args),
|
||||||
|
'return_code': locals['returncode'],
|
||||||
|
'stdout': stdout,
|
||||||
|
'stderr': stderr}
|
||||||
LOG.warning(locals['errstr'])
|
LOG.warning(locals['errstr'])
|
||||||
raise loopingcall.LoopingCallDone()
|
raise loopingcall.LoopingCallDone()
|
||||||
|
|
||||||
|
@ -145,8 +145,28 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
|
|||||||
console_utils._stop_console(self.info['uuid'])
|
console_utils._stop_console(self.info['uuid'])
|
||||||
|
|
||||||
mock_pid.assert_called_once_with(self.info['uuid'])
|
mock_pid.assert_called_once_with(self.info['uuid'])
|
||||||
mock_kill.assert_called_once_with(mock_pid.return_value,
|
|
||||||
signal.SIGTERM)
|
# a check if process still exist (signal 0) in a loop
|
||||||
|
mock_kill.assert_any_call(mock_pid.return_value, signal.SIG_DFL)
|
||||||
|
# and that it receives the SIGTERM
|
||||||
|
mock_kill.assert_any_call(mock_pid.return_value, signal.SIGTERM)
|
||||||
|
mock_unlink.assert_called_once_with(pid_file)
|
||||||
|
|
||||||
|
@mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True)
|
||||||
|
@mock.patch.object(os, 'kill', autospec=True)
|
||||||
|
@mock.patch.object(psutil, 'pid_exists', autospec=True, return_value=True)
|
||||||
|
@mock.patch.object(console_utils, '_get_console_pid', autospec=True)
|
||||||
|
def test__stop_console_forced_kill(self, mock_pid, mock_psutil, mock_kill,
|
||||||
|
mock_unlink):
|
||||||
|
pid_file = console_utils._get_console_pid_file(self.info['uuid'])
|
||||||
|
mock_pid.return_value = 12345
|
||||||
|
|
||||||
|
console_utils._stop_console(self.info['uuid'])
|
||||||
|
|
||||||
|
mock_pid.assert_called_once_with(self.info['uuid'])
|
||||||
|
|
||||||
|
# Make sure console process receives hard SIGKILL
|
||||||
|
mock_kill.assert_any_call(mock_pid.return_value, signal.SIGKILL)
|
||||||
mock_unlink.assert_called_once_with(pid_file)
|
mock_unlink.assert_called_once_with(pid_file)
|
||||||
|
|
||||||
@mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True)
|
@mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True)
|
||||||
@ -322,7 +342,7 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
|
|||||||
mock_popen.assert_called_once_with(mock.ANY,
|
mock_popen.assert_called_once_with(mock.ANY,
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE)
|
stderr=subprocess.PIPE)
|
||||||
mock_popen.return_value.poll.assert_called_once_with()
|
mock_popen.return_value.poll.assert_called_with()
|
||||||
|
|
||||||
@mock.patch.object(console_utils, 'open',
|
@mock.patch.object(console_utils, 'open',
|
||||||
mock.mock_open(read_data='12345\n'))
|
mock.mock_open(read_data='12345\n'))
|
||||||
@ -350,11 +370,11 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
|
|||||||
|
|
||||||
mock_stop.assert_called_once_with(self.info['uuid'])
|
mock_stop.assert_called_once_with(self.info['uuid'])
|
||||||
mock_dir_exists.assert_called_once_with()
|
mock_dir_exists.assert_called_once_with()
|
||||||
mock_pid_exists.assert_called_once_with(12345)
|
mock_pid_exists.assert_called_with(12345)
|
||||||
mock_popen.assert_called_once_with(mock.ANY,
|
mock_popen.assert_called_once_with(mock.ANY,
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE)
|
stderr=subprocess.PIPE)
|
||||||
mock_popen.return_value.poll.assert_called_once_with()
|
mock_popen.return_value.poll.assert_called_with()
|
||||||
|
|
||||||
@mock.patch.object(subprocess, 'Popen', autospec=True)
|
@mock.patch.object(subprocess, 'Popen', autospec=True)
|
||||||
@mock.patch.object(console_utils, '_ensure_console_pid_dir_exists',
|
@mock.patch.object(console_utils, '_ensure_console_pid_dir_exists',
|
||||||
|
@ -0,0 +1,8 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Fixes a locking issue where ``ipmitool-shellinabox`` console
|
||||||
|
interface users may encounter a situation where the bare metal node is
|
||||||
|
locked until the conductor is restarted.
|
||||||
|
See story `1587313 <https://storyboard.openstack.org/#!/story/1587313>`_
|
||||||
|
for additional information.
|
Loading…
Reference in New Issue
Block a user