Fix node exclusive lock not released on console start/restart.

This patch forces kill of console process with SIGKILL if it did not
terminate on SIGTERM within the CONF.console.kill_timeout and reads the
shellinabox subprocess stdout/stderr after CONF.console.subprocess_timeout
or if subprocess exited with non zero code.

Change-Id: I55a112d877d94f31d27487846ff59fe27f602f8b
Closes-Bug: 1587313
Story: 1587313
Task: 9654
This commit is contained in:
Dmitry Galkin 2018-09-19 14:02:48 +00:00
parent 37f272436f
commit 0256e1e69f
4 changed files with 71 additions and 26 deletions

View File

@ -45,6 +45,10 @@ opts = [
default=10, default=10,
help=_('Time (in seconds) to wait for the console subprocess ' help=_('Time (in seconds) to wait for the console subprocess '
'to start.')), 'to start.')),
cfg.IntOpt('kill_timeout',
default=1,
help=_('Time (in seconds) to wait for the shellinabox console '
'subprocess to exit before sending SIGKILL signal.')),
cfg.IPOpt('socat_address', cfg.IPOpt('socat_address',
default='$my_ip', default='$my_ip',
help=_('IP address of Socat service running on the host of ' help=_('IP address of Socat service running on the host of '

View File

@ -101,6 +101,23 @@ def _stop_console(node_uuid):
console_pid = _get_console_pid(node_uuid) console_pid = _get_console_pid(node_uuid)
os.kill(console_pid, signal.SIGTERM) os.kill(console_pid, signal.SIGTERM)
# make sure the process gets killed hard if required
attempt = 0
max_attempts = CONF.console.kill_timeout // 0.2
while attempt < max_attempts:
if psutil.pid_exists(console_pid):
if attempt == max_attempts - 1:
os.kill(console_pid, signal.SIGKILL)
LOG.debug("Waiting for the console process with PID %(pid)s "
"to exit. Node: %(node)s.",
{'pid': console_pid, 'node': node_uuid})
time.sleep(0.2)
attempt += 1
else:
break
except OSError as exc: except OSError as exc:
if exc.errno != errno.ESRCH: if exc.errno != errno.ESRCH:
msg = (_("Could not stop the console for node '%(node)s'. " msg = (_("Could not stop the console for node '%(node)s'. "
@ -198,28 +215,24 @@ def start_shellinabox_console(node_uuid, port, console_cmd):
# check if the console pid is created and the process is running. # check if the console pid is created and the process is running.
# if it is, then the shellinaboxd is invoked successfully as a daemon. # if it is, then the shellinaboxd is invoked successfully as a daemon.
# otherwise check the error. # otherwise check the error.
if locals['returncode'] is not None: if (locals['returncode'] == 0 and os.path.exists(pid_file)
if (locals['returncode'] == 0 and os.path.exists(pid_file) and psutil.pid_exists(_get_console_pid(node_uuid))):
and psutil.pid_exists(_get_console_pid(node_uuid))): raise loopingcall.LoopingCallDone()
raise loopingcall.LoopingCallDone()
else:
(stdout, stderr) = popen_obj.communicate()
locals['errstr'] = _(
"Command: %(command)s.\n"
"Exit code: %(return_code)s.\n"
"Stdout: %(stdout)r\n"
"Stderr: %(stderr)r") % {
'command': ' '.join(args),
'return_code': locals['returncode'],
'stdout': stdout,
'stderr': stderr}
LOG.warning(locals['errstr'])
raise loopingcall.LoopingCallDone()
if (time.time() > expiration): if (time.time() > expiration or locals['returncode'] is not None):
locals['errstr'] = (_("Timeout while waiting for console " (stdout, stderr) = popen_obj.communicate()
"subprocess to start for node %s.") % locals['errstr'] = _(
node_uuid) "Timeout or error while waiting for console "
"subprocess to start for node: %(node)s.\n"
"Command: %(command)s.\n"
"Exit code: %(return_code)s.\n"
"Stdout: %(stdout)r\n"
"Stderr: %(stderr)r") % {
'node': node_uuid,
'command': ' '.join(args),
'return_code': locals['returncode'],
'stdout': stdout,
'stderr': stderr}
LOG.warning(locals['errstr']) LOG.warning(locals['errstr'])
raise loopingcall.LoopingCallDone() raise loopingcall.LoopingCallDone()

View File

@ -145,8 +145,28 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
console_utils._stop_console(self.info['uuid']) console_utils._stop_console(self.info['uuid'])
mock_pid.assert_called_once_with(self.info['uuid']) mock_pid.assert_called_once_with(self.info['uuid'])
mock_kill.assert_called_once_with(mock_pid.return_value,
signal.SIGTERM) # a check if process still exist (signal 0) in a loop
mock_kill.assert_any_call(mock_pid.return_value, signal.SIG_DFL)
# and that it receives the SIGTERM
mock_kill.assert_any_call(mock_pid.return_value, signal.SIGTERM)
mock_unlink.assert_called_once_with(pid_file)
@mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True)
@mock.patch.object(os, 'kill', autospec=True)
@mock.patch.object(psutil, 'pid_exists', autospec=True, return_value=True)
@mock.patch.object(console_utils, '_get_console_pid', autospec=True)
def test__stop_console_forced_kill(self, mock_pid, mock_psutil, mock_kill,
mock_unlink):
pid_file = console_utils._get_console_pid_file(self.info['uuid'])
mock_pid.return_value = 12345
console_utils._stop_console(self.info['uuid'])
mock_pid.assert_called_once_with(self.info['uuid'])
# Make sure console process receives hard SIGKILL
mock_kill.assert_any_call(mock_pid.return_value, signal.SIGKILL)
mock_unlink.assert_called_once_with(pid_file) mock_unlink.assert_called_once_with(pid_file)
@mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True) @mock.patch.object(ironic_utils, 'unlink_without_raise', autospec=True)
@ -322,7 +342,7 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
mock_popen.assert_called_once_with(mock.ANY, mock_popen.assert_called_once_with(mock.ANY,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE) stderr=subprocess.PIPE)
mock_popen.return_value.poll.assert_called_once_with() mock_popen.return_value.poll.assert_called_with()
@mock.patch.object(console_utils, 'open', @mock.patch.object(console_utils, 'open',
mock.mock_open(read_data='12345\n')) mock.mock_open(read_data='12345\n'))
@ -350,11 +370,11 @@ class ConsoleUtilsTestCase(db_base.DbTestCase):
mock_stop.assert_called_once_with(self.info['uuid']) mock_stop.assert_called_once_with(self.info['uuid'])
mock_dir_exists.assert_called_once_with() mock_dir_exists.assert_called_once_with()
mock_pid_exists.assert_called_once_with(12345) mock_pid_exists.assert_called_with(12345)
mock_popen.assert_called_once_with(mock.ANY, mock_popen.assert_called_once_with(mock.ANY,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE) stderr=subprocess.PIPE)
mock_popen.return_value.poll.assert_called_once_with() mock_popen.return_value.poll.assert_called_with()
@mock.patch.object(subprocess, 'Popen', autospec=True) @mock.patch.object(subprocess, 'Popen', autospec=True)
@mock.patch.object(console_utils, '_ensure_console_pid_dir_exists', @mock.patch.object(console_utils, '_ensure_console_pid_dir_exists',

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes a locking issue where ``ipmitool-shellinabox`` console
interface users may encounter a situation where the bare metal node is
locked until the conductor is restarted.
See story `1587313 <https://storyboard.openstack.org/#!/story/1587313>`_
for additional information.