From 3b1591f235f4b85796917507be5e7fd80365ff9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Nov=C3=BD?= Date: Wed, 30 Sep 2015 19:08:09 +0200 Subject: [PATCH] swift-init: New option kill-after-timeout This option sends SIGKILL to the daemon after the kill_wait period. When a daemon hangs and doesn't respond to SIGTERM/SIGHUP there is currently no way to stop it using swift-init. Classic init scripts in Linux kill a hung process after a grace period, and this patch adds the same behaviour. This is most useful when using "restart" on a hung daemon. Change-Id: I8c932b673a0f51e52132df87ea2f4396f4bba9d8 --- bin/swift-init | 5 ++ doc/manpages/swift-init.1 | 1 + swift/common/manager.py | 30 ++++++++++- test/unit/common/test_manager.py | 90 ++++++++++++++++++++++++++++++-- 4 files changed, 120 insertions(+), 6 deletions(-) diff --git a/bin/swift-init b/bin/swift-init index 3fe18cdaa6..0fcbff5708 100755 --- a/bin/swift-init +++ b/bin/swift-init @@ -74,6 +74,11 @@ def main(): help="Return zero status code even if some config is " "missing. Default mode if any server is a glob or " "one of aliases `all`, `main` or `rest`.") + # SIGKILL daemon after kill_wait period + parser.add_option('--kill-after-timeout', dest='kill_after_timeout', + action='store_true', + help="Kill daemon and all childs after kill-wait " + "period.") options, args = parser.parse_args() diff --git a/doc/manpages/swift-init.1 b/doc/manpages/swift-init.1 index 3a0e112659..7a1ac42ab5 100644 --- a/doc/manpages/swift-init.1 +++ b/doc/manpages/swift-init.1 @@ -111,6 +111,7 @@ allows one to use the keywords such as "all", "main" and "rest" for the .IP "-r RUN_DIR, --run-dir=RUN_DIR directory where the pids will be stored (default /var/run/swift) .IP "--strict return non-zero status code if some config is missing. Default mode if server is explicitly named." .IP "--non-strict return zero status code even if some config is missing. Default mode if server is one of aliases `all`, `main` or `rest`."
+.IP "--kill-after-timeout kill daemon and all childs after kill-wait period." .PD .RE diff --git a/swift/common/manager.py b/swift/common/manager.py index 03eb0479e9..e67f8a32f7 100644 --- a/swift/common/manager.py +++ b/swift/common/manager.py @@ -162,6 +162,16 @@ def safe_kill(pid, sig, name): os.kill(pid, sig) +def kill_group(pid, sig): + """Send signal to process group + + : param pid: process id + : param sig: signal to send + """ + # Negative PID means process group + os.kill(-pid, sig) + + class UnknownCommandError(Exception): pass @@ -285,11 +295,27 @@ class Manager(object): return 0 # reached interval n watch_pids w/o killing all servers + kill_after_timeout = kwargs.get('kill_after_timeout', False) for server, pids in server_pids.items(): if not killed_pids.issuperset(pids): # some pids of this server were not killed - print(_('Waited %s seconds for %s to die; giving up') % ( - kill_wait, server)) + if kill_after_timeout: + print(_('Waited %s seconds for %s to die; killing') % ( + kill_wait, server)) + # Send SIGKILL to all remaining pids + for pid in set(pids.keys()) - killed_pids: + print(_('Signal %s pid: %s signal: %s') % ( + server, pid, signal.SIGKILL)) + # Send SIGKILL to process group + try: + kill_group(pid, signal.SIGKILL) + except OSError as e: + # PID died before kill_group can take action? 
+ if e.errno != errno.ESRCH: + raise e + else: + print(_('Waited %s seconds for %s to die; giving up') % ( + kill_wait, server)) return 1 @command diff --git a/test/unit/common/test_manager.py b/test/unit/common/test_manager.py index e0d1bdb924..3280e444f0 100644 --- a/test/unit/common/test_manager.py +++ b/test/unit/common/test_manager.py @@ -1916,13 +1916,18 @@ class TestManager(unittest.TestCase): continue yield server, pid + def mock_kill_group(pid, sig): + self.fail('kill_group should not be called') + _orig_server = manager.Server _orig_watch_server_pids = manager.watch_server_pids + _orig_kill_group = manager.kill_group try: manager.watch_server_pids = mock_watch_server_pids + manager.kill_group = mock_kill_group # test stop one server server_pids = { - 'test': [1] + 'test': {1: "dummy.pid"} } manager.Server = MockServerFactory(server_pids) m = manager.Manager(['test']) @@ -1930,7 +1935,7 @@ class TestManager(unittest.TestCase): self.assertEqual(status, 0) # test not running server_pids = { - 'test': [] + 'test': {} } manager.Server = MockServerFactory(server_pids) m = manager.Manager(['test']) @@ -1938,7 +1943,7 @@ class TestManager(unittest.TestCase): self.assertEqual(status, 1) # test kill not running server_pids = { - 'test': [] + 'test': {} } manager.Server = MockServerFactory(server_pids) m = manager.Manager(['test']) @@ -1946,7 +1951,7 @@ class TestManager(unittest.TestCase): self.assertEqual(status, 0) # test won't die server_pids = { - 'test': [None] + 'test': {None: None} } manager.Server = MockServerFactory(server_pids) m = manager.Manager(['test']) @@ -1956,6 +1961,83 @@ class TestManager(unittest.TestCase): finally: manager.Server = _orig_server manager.watch_server_pids = _orig_watch_server_pids + manager.kill_group = _orig_kill_group + + def test_stop_kill_after_timeout(self): + class MockServerFactory(object): + class MockServer(object): + def __init__(self, pids, run_dir=manager.RUN_DIR): + self.pids = pids + + def stop(self, **kwargs): + 
return self.pids + + def status(self, **kwargs): + return not self.pids + + def __init__(self, server_pids, run_dir=manager.RUN_DIR): + self.server_pids = server_pids + + def __call__(self, server, run_dir=manager.RUN_DIR): + return MockServerFactory.MockServer(self.server_pids[server]) + + def mock_watch_server_pids(server_pids, **kwargs): + for server, pids in server_pids.items(): + for pid in pids: + if pid is None: + continue + yield server, pid + + mock_kill_group_called = [] + + def mock_kill_group(*args): + mock_kill_group_called.append(args) + + def mock_kill_group_oserr(*args): + raise OSError() + + def mock_kill_group_oserr_ESRCH(*args): + raise OSError(errno.ESRCH, 'No such process') + + _orig_server = manager.Server + _orig_watch_server_pids = manager.watch_server_pids + _orig_kill_group = manager.kill_group + try: + manager.watch_server_pids = mock_watch_server_pids + manager.kill_group = mock_kill_group + # test stop one server + server_pids = { + 'test': {None: None} + } + manager.Server = MockServerFactory(server_pids) + m = manager.Manager(['test']) + status = m.stop(kill_after_timeout=True) + self.assertEqual(status, 1) + self.assertEqual(mock_kill_group_called, [(None, 9)]) + + manager.kill_group = mock_kill_group_oserr + # test stop one server - OSError + server_pids = { + 'test': {None: None} + } + manager.Server = MockServerFactory(server_pids) + m = manager.Manager(['test']) + with self.assertRaises(OSError): + status = m.stop(kill_after_timeout=True) + + manager.kill_group = mock_kill_group_oserr_ESRCH + # test stop one server - OSError: No such process + server_pids = { + 'test': {None: None} + } + manager.Server = MockServerFactory(server_pids) + m = manager.Manager(['test']) + status = m.stop(kill_after_timeout=True) + self.assertEqual(status, 1) + finally: + manager.Server = _orig_server + manager.watch_server_pids = _orig_watch_server_pids + manager.kill_group = _orig_kill_group # TODO(clayg): more tests def test_shutdown(self):