From 2c58ab3703675b43d9f197f609fc576e850ac0da Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Wed, 26 Jan 2022 14:59:34 +0100 Subject: [PATCH] Wait for conductor start before notifying systemd Currently, the launcher first notifies systemd, then starts checking the services (RPC and WSGI). So any failures will be reported, but only after systemd declares the service ready. This change adds a polling loop to make sure RpcService.start() finishes successfully. Change-Id: Ib460622d69a9cb1cb82e796a6ab294bbbb40c359 --- ironic/cmd/conductor.py | 6 ++++ ironic/cmd/singleprocess.py | 5 +++ ironic/common/rpc_service.py | 22 +++++++++++++ ironic/tests/unit/common/test_rpc_service.py | 31 +++++++++++++++++++ .../notes/service-wait-e85cbe7978f61764.yaml | 5 +++ 5 files changed, 69 insertions(+) create mode 100644 releasenotes/notes/service-wait-e85cbe7978f61764.yaml diff --git a/ironic/cmd/conductor.py b/ironic/cmd/conductor.py index 5fa4c84899..a932cb22fb 100644 --- a/ironic/cmd/conductor.py +++ b/ironic/cmd/conductor.py @@ -67,6 +67,12 @@ def main(): issue_startup_warnings(CONF) launcher = service.launch(CONF, mgr, restart_method='mutate') + + # NOTE(dtantsur): handling start-up failures before launcher.wait() helps + # notify systemd about them. Otherwise the launcher will report successful + # service start-up before checking the threads. + mgr.wait_for_start() + sys.exit(launcher.wait()) diff --git a/ironic/cmd/singleprocess.py b/ironic/cmd/singleprocess.py index 20a348ae5c..675bd1bc29 100644 --- a/ironic/cmd/singleprocess.py +++ b/ironic/cmd/singleprocess.py @@ -49,4 +49,9 @@ def main(): wsgi = wsgi_service.WSGIService('ironic_api', CONF.api.enable_ssl_api) launcher.launch_service(wsgi) + # NOTE(dtantsur): handling start-up failures before launcher.wait() helps + # notify systemd about them. Otherwise the launcher will report successful + # service start-up before checking the threads. + mgr.wait_for_start() + sys.exit(launcher.wait()) diff --git a/ironic/common/rpc_service.py b/ironic/common/rpc_service.py index 78379c9817..b0eec7758b 100644 --- a/ironic/common/rpc_service.py +++ b/ironic/common/rpc_service.py @@ -15,6 +15,8 @@ # under the License. import signal +import sys +import time from ironic_lib.json_rpc import server as json_rpc from oslo_config import cfg @@ -42,9 +44,29 @@ class RPCService(service.Service): self.topic = self.manager.topic self.rpcserver = None self.deregister = True + self._failure = None + self._started = False + + def wait_for_start(self): + while not self._started and not self._failure: + time.sleep(0.1) + if self._failure: + LOG.critical(self._failure) + sys.exit(self._failure) def start(self): + self._failure = None + self._started = False super(RPCService, self).start() + try: + self._real_start() + except Exception as exc: + self._failure = f"{exc.__class__.__name__}: {exc}" + raise + else: + self._started = True + + def _real_start(self): admin_context = context.get_admin_context() serializer = objects_base.IronicObjectSerializer(is_server=True) diff --git a/ironic/tests/unit/common/test_rpc_service.py b/ironic/tests/unit/common/test_rpc_service.py index 4e190f5e6f..8483bfb224 100644 --- a/ironic/tests/unit/common/test_rpc_service.py +++ b/ironic/tests/unit/common/test_rpc_service.py @@ -46,6 +46,8 @@ class TestRPCService(base.TestCase): mock_rpc, mock_ios, mock_target, mock_prepare_method): mock_rpc.return_value.start = mock.MagicMock() self.rpc_svc.handle_signal = mock.MagicMock() + self.assertFalse(self.rpc_svc._started) + self.assertFalse(self.rpc_svc._failure) self.rpc_svc.start() mock_ctx.assert_called_once_with() mock_target.assert_called_once_with(topic=self.rpc_svc.topic, @@ -55,6 +57,9 @@ class TestRPCService(base.TestCase): mock_init_method.assert_called_once_with(self.rpc_svc.manager, mock_ctx.return_value) self.assertIs(rpc.GLOBAL_MANAGER, self.rpc_svc.manager) + self.assertTrue(self.rpc_svc._started) + self.assertFalse(self.rpc_svc._failure) + self.rpc_svc.wait_for_start() # should be no-op @mock.patch.object(manager.ConductorManager, 'prepare_host', autospec=True) @mock.patch.object(oslo_messaging, 'Target', autospec=True) @@ -77,3 +82,29 @@ class TestRPCService(base.TestCase): mock_init_method.assert_called_once_with(self.rpc_svc.manager, mock_ctx.return_value) self.assertIs(rpc.GLOBAL_MANAGER, self.rpc_svc.manager) + + @mock.patch.object(manager.ConductorManager, 'prepare_host', autospec=True) + @mock.patch.object(oslo_messaging, 'Target', autospec=True) + @mock.patch.object(objects_base, 'IronicObjectSerializer', autospec=True) + @mock.patch.object(rpc, 'get_server', autospec=True) + @mock.patch.object(manager.ConductorManager, 'init_host', autospec=True) + @mock.patch.object(context, 'get_admin_context', autospec=True) + def test_start_failure(self, mock_ctx, mock_init_method, mock_rpc, + mock_ios, mock_target, mock_prepare_method): + mock_rpc.return_value.start = mock.MagicMock() + self.rpc_svc.handle_signal = mock.MagicMock() + mock_init_method.side_effect = RuntimeError("boom") + self.assertFalse(self.rpc_svc._started) + self.assertFalse(self.rpc_svc._failure) + self.assertRaises(RuntimeError, self.rpc_svc.start) + mock_ctx.assert_called_once_with() + mock_target.assert_called_once_with(topic=self.rpc_svc.topic, + server="fake_host") + mock_ios.assert_called_once_with(is_server=True) + mock_prepare_method.assert_called_once_with(self.rpc_svc.manager) + mock_init_method.assert_called_once_with(self.rpc_svc.manager, + mock_ctx.return_value) + self.assertIsNone(rpc.GLOBAL_MANAGER) + self.assertFalse(self.rpc_svc._started) + self.assertIn("boom", self.rpc_svc._failure) + self.assertRaises(SystemExit, self.rpc_svc.wait_for_start) diff --git a/releasenotes/notes/service-wait-e85cbe7978f61764.yaml b/releasenotes/notes/service-wait-e85cbe7978f61764.yaml new file mode 100644 index 0000000000..c2fff601d1 --- /dev/null +++ b/releasenotes/notes/service-wait-e85cbe7978f61764.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + The ``ironic`` and ``ironic-conductor`` services now wait for the conductor + manager to start before notifying systemd about the successful start-up.