swift-init: New option kill-after-timeout

This option sends SIGKILL to the daemon after the kill_wait period.
When a daemon hangs and doesn't respond to SIGTERM/SIGHUP,
there is currently no way to stop it using swift-init. Classic
init scripts in Linux kill a hung process after a grace
period, and this patch adds the same behaviour. This is most
useful when using "restart" on a hung daemon.

Change-Id: I8c932b673a0f51e52132df87ea2f4396f4bba9d8
This commit is contained in:
Ondřej Nový 2015-09-30 19:08:09 +02:00
parent dafeea6322
commit 3b1591f235
4 changed files with 120 additions and 6 deletions

View File

@ -74,6 +74,11 @@ def main():
help="Return zero status code even if some config is " help="Return zero status code even if some config is "
"missing. Default mode if any server is a glob or " "missing. Default mode if any server is a glob or "
"one of aliases `all`, `main` or `rest`.") "one of aliases `all`, `main` or `rest`.")
# SIGKILL daemon after kill_wait period
parser.add_option('--kill-after-timeout', dest='kill_after_timeout',
action='store_true',
help="Kill daemon and all childs after kill-wait "
"period.")
options, args = parser.parse_args() options, args = parser.parse_args()

View File

@ -111,6 +111,7 @@ allows one to use the keywords such as "all", "main" and "rest" for the <server>
.IP "-r RUN_DIR, --run-dir=RUN_DIR directory where the pids will be stored (default /var/run/swift) .IP "-r RUN_DIR, --run-dir=RUN_DIR directory where the pids will be stored (default /var/run/swift)
.IP "--strict return non-zero status code if some config is missing. Default mode if server is explicitly named." .IP "--strict return non-zero status code if some config is missing. Default mode if server is explicitly named."
.IP "--non-strict return zero status code even if some config is missing. Default mode if server is one of aliases `all`, `main` or `rest`." .IP "--non-strict return zero status code even if some config is missing. Default mode if server is one of aliases `all`, `main` or `rest`."
.IP "--kill-after-timeout kill daemon and all children after kill-wait period."
.PD .PD
.RE .RE

View File

@ -162,6 +162,16 @@ def safe_kill(pid, sig, name):
os.kill(pid, sig) os.kill(pid, sig)
def kill_group(pid, sig):
    """Send a signal to every process in a process group.

    :param pid: id of the process group to signal; must be greater than 1
    :param sig: signal to send
    :raises ValueError: if pid is not greater than 1
    :raises OSError: if the underlying os.kill call fails (for example
                     with errno.ESRCH when the group no longer exists)
    """
    # Negating 0 or 1 does not address a single group: kill(0, sig) signals
    # the caller's own process group and kill(-1, sig) signals every process
    # the caller is permitted to signal.  A negative pid would be negated
    # into a plain process id.  Refuse all of them rather than deliver the
    # signal to the wrong target.
    if pid <= 1:
        raise ValueError('invalid process group id: %r' % (pid,))
    # Negative PID means process group
    os.kill(-pid, sig)
class UnknownCommandError(Exception): class UnknownCommandError(Exception):
pass pass
@ -285,11 +295,27 @@ class Manager(object):
return 0 return 0
# reached interval n watch_pids w/o killing all servers # reached interval n watch_pids w/o killing all servers
kill_after_timeout = kwargs.get('kill_after_timeout', False)
for server, pids in server_pids.items(): for server, pids in server_pids.items():
if not killed_pids.issuperset(pids): if not killed_pids.issuperset(pids):
# some pids of this server were not killed # some pids of this server were not killed
print(_('Waited %s seconds for %s to die; giving up') % ( if kill_after_timeout:
kill_wait, server)) print(_('Waited %s seconds for %s to die; killing') % (
kill_wait, server))
# Send SIGKILL to all remaining pids
for pid in set(pids.keys()) - killed_pids:
print(_('Signal %s pid: %s signal: %s') % (
server, pid, signal.SIGKILL))
# Send SIGKILL to process group
try:
kill_group(pid, signal.SIGKILL)
except OSError as e:
# PID died before kill_group can take action?
if e.errno != errno.ESRCH:
raise e
else:
print(_('Waited %s seconds for %s to die; giving up') % (
kill_wait, server))
return 1 return 1
@command @command

View File

@ -1916,13 +1916,18 @@ class TestManager(unittest.TestCase):
continue continue
yield server, pid yield server, pid
def mock_kill_group(pid, sig):
self.fail('kill_group should not be called')
_orig_server = manager.Server _orig_server = manager.Server
_orig_watch_server_pids = manager.watch_server_pids _orig_watch_server_pids = manager.watch_server_pids
_orig_kill_group = manager.kill_group
try: try:
manager.watch_server_pids = mock_watch_server_pids manager.watch_server_pids = mock_watch_server_pids
manager.kill_group = mock_kill_group
# test stop one server # test stop one server
server_pids = { server_pids = {
'test': [1] 'test': {1: "dummy.pid"}
} }
manager.Server = MockServerFactory(server_pids) manager.Server = MockServerFactory(server_pids)
m = manager.Manager(['test']) m = manager.Manager(['test'])
@ -1930,7 +1935,7 @@ class TestManager(unittest.TestCase):
self.assertEqual(status, 0) self.assertEqual(status, 0)
# test not running # test not running
server_pids = { server_pids = {
'test': [] 'test': {}
} }
manager.Server = MockServerFactory(server_pids) manager.Server = MockServerFactory(server_pids)
m = manager.Manager(['test']) m = manager.Manager(['test'])
@ -1938,7 +1943,7 @@ class TestManager(unittest.TestCase):
self.assertEqual(status, 1) self.assertEqual(status, 1)
# test kill not running # test kill not running
server_pids = { server_pids = {
'test': [] 'test': {}
} }
manager.Server = MockServerFactory(server_pids) manager.Server = MockServerFactory(server_pids)
m = manager.Manager(['test']) m = manager.Manager(['test'])
@ -1946,7 +1951,7 @@ class TestManager(unittest.TestCase):
self.assertEqual(status, 0) self.assertEqual(status, 0)
# test won't die # test won't die
server_pids = { server_pids = {
'test': [None] 'test': {None: None}
} }
manager.Server = MockServerFactory(server_pids) manager.Server = MockServerFactory(server_pids)
m = manager.Manager(['test']) m = manager.Manager(['test'])
@ -1956,6 +1961,83 @@ class TestManager(unittest.TestCase):
finally: finally:
manager.Server = _orig_server manager.Server = _orig_server
manager.watch_server_pids = _orig_watch_server_pids manager.watch_server_pids = _orig_watch_server_pids
manager.kill_group = _orig_kill_group
def test_stop_kill_after_timeout(self):
    """Check Manager.stop(kill_after_timeout=True) against a stuck server.

    Three scenarios are run against a server whose only pid never shows
    up as dead:

    * kill_group succeeds: stop() returns 1 and kill_group was called
      once with the surviving pid and SIGKILL;
    * kill_group raises an OSError without errno: it propagates;
    * kill_group raises OSError(ESRCH): it is ignored and stop()
      returns 1.
    """
    # Local import so the test does not depend on a module-level import.
    import signal

    class MockServerFactory(object):
        class MockServer(object):
            def __init__(self, pids, run_dir=manager.RUN_DIR):
                self.pids = pids

            def stop(self, **kwargs):
                return self.pids

            def status(self, **kwargs):
                return not self.pids

        def __init__(self, server_pids, run_dir=manager.RUN_DIR):
            self.server_pids = server_pids

        def __call__(self, server, run_dir=manager.RUN_DIR):
            return MockServerFactory.MockServer(self.server_pids[server])

    def mock_watch_server_pids(server_pids, **kwargs):
        # A pid of None is never yielded, i.e. it is never reported as
        # having died.
        for server, pids in server_pids.items():
            for pid in pids:
                if pid is None:
                    continue
                yield server, pid

    mock_kill_group_called = []

    def mock_kill_group(*args):
        mock_kill_group_called.append(args)

    def mock_kill_group_oserr(*args):
        raise OSError()

    def mock_kill_group_oserr_ESRCH(*args):
        raise OSError(errno.ESRCH, 'No such process')

    def stop_stuck_server():
        # Fresh factory and manager for every scenario, so that no state
        # leaks from one scenario into the next.
        manager.Server = MockServerFactory({'test': {None: None}})
        return manager.Manager(['test']).stop(kill_after_timeout=True)

    _orig_server = manager.Server
    _orig_watch_server_pids = manager.watch_server_pids
    _orig_kill_group = manager.kill_group
    try:
        manager.watch_server_pids = mock_watch_server_pids

        # test stop one server
        manager.kill_group = mock_kill_group
        self.assertEqual(stop_stuck_server(), 1)
        self.assertEqual(mock_kill_group_called,
                         [(None, signal.SIGKILL)])

        # test stop one server - OSError
        manager.kill_group = mock_kill_group_oserr
        with self.assertRaises(OSError):
            stop_stuck_server()

        # test stop one server - OSError: No such process
        manager.kill_group = mock_kill_group_oserr_ESRCH
        self.assertEqual(stop_stuck_server(), 1)
    finally:
        manager.Server = _orig_server
        manager.watch_server_pids = _orig_watch_server_pids
        manager.kill_group = _orig_kill_group
# TODO(clayg): more tests # TODO(clayg): more tests
def test_shutdown(self): def test_shutdown(self):