distcloud/distributedcloud/dcmanager/orchestrator/service.py
Raphael Lima 0bfa018ec1 Enable dcmanager orchestrator's worker service
This commit enables the dcmanager orchestrator's worker service for the
System Controller and implements its communication with the orchestrator
manager process.

Test plan:
1. PASS: Build an ISO containing the changes for the implementation of
   the orchestrator worker and successfully deploy a new DC system.
2. PASS: Verify that the deployed system has the new service running as
   expected and logging to the orchestrator.log file.
3. PASS: Verify that the orchestrator-worker service can be managed by
   SM and that restarting dcmanager-manager also restarts the
   orchestrator worker.
4. PASS: Create a strategy with a max parallel subclouds value smaller
   than the total number of subclouds being orchestrated and verify that
   the deletion of the subcloud steps is properly handled by multiple
   workers, respecting the parameter. Once the steps are removed, verify
   that the strategy is deleted by the manager process.
5. PASS: Create a strategy with a max parallel subclouds value smaller
   than the total number of subclouds being orchestrated, apply it and
   verify that only the number of subclouds specified by the parameter
   is processed at a time (a simplified sketch of this dispatch logic
   follows the test plan).
6. PASS: Start applying a strategy, restart dcmanager-manager service,
   verify that the orchestrator's manager identify that there was a
   strategy being applied and, once the orchestration interval has
   passed, it retrieves the steps that were being processed and send
   them to the workers.
7. PASS: Start applying a strategy, delete one of the worker processes
   that were handling steps, wait for the orchestration interval and
   verify that the manager sends the steps that were pending to a new
   worker.
8. PASS: When applying a strategy, verify that a step that is in the
   applying state for more than a minute has its updated_at field
   updated to the current time to avoid it being detected by the manager
   as a step that is not in process.
9. PASS: Create a strategy and verify that, when a step fails, the
   strategy's state is also set to failed.
10. PASS: Create a strategy with stop-on-failure set and verify that once
    a step reaches a failed state, all workers stop processing new steps.
11. PASS: Abort an applying strategy and verify that all steps in the
    initial state are updated to aborted, while the ones that were being
    processed continue until they reach a finished state.
12. PASS: Verify that the manager's strategy monitoring thread only runs
    while a strategy is being applied, stopping once the strategy
    reaches a final state (complete, failed or aborted) or its deletion
    is requested.
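
The hand-off exercised by items 4 to 7 can be pictured with the minimal
sketch below. It is illustrative only, not the actual dcmanager code: the
dispatch_pending_steps helper, the rpc_client handle and the num_workers
argument are assumptions; only the orchestrate(context, steps_id,
strategy_type) worker endpoint is taken from the service file.

    def dispatch_pending_steps(
        rpc_client, ctxt, pending_step_ids, strategy_type, max_parallel, num_workers
    ):
        # Only max-parallel-subclouds steps may be in flight at a time, so
        # take at most that many of the pending (or stale) step IDs.
        in_flight = pending_step_ids[:max_parallel]
        if not in_flight:
            return
        # Spread the selected steps across the worker processes; each cast
        # is an asynchronous RPC on the orchestrator-worker topic, and a
        # worker picks it up and processes the steps it received.
        chunk_size = -(-len(in_flight) // num_workers)  # ceiling; num_workers >= 1
        for i in range(0, len(in_flight), chunk_size):
            rpc_client.orchestrate(
                ctxt,
                steps_id=in_flight[i : i + chunk_size],
                strategy_type=strategy_type,
            )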

Story: 2011311
Task: 51695

Change-Id: I3ad086fa3222eda6edd1d3a314b13cb0bde161ef
Signed-off-by: Raphael Lima <Raphael.Lima@windriver.com>
2025-04-17 12:34:15 -03:00

219 lines
7.9 KiB
Python

# Copyright (c) 2020-2021, 2024-2025 Wind River Systems, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
import functools

from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging
from oslo_service import service

from dccommon.subprocess_cleanup import SubprocessCleanup
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common import messaging as rpc_messaging
from dcmanager.common import scheduler
from dcmanager.common import utils
from dcmanager.orchestrator.orchestrator_manager import OrchestratorManager
from dcmanager.orchestrator.orchestrator_worker import OrchestratorWorker

CONF = cfg.CONF
LOG = logging.getLogger(__name__)


# Decorator for the RPC endpoints below: ensures the incoming context is a
# dcmanager RequestContext and converts DCManagerException into
# oslo.messaging's ExpectedException, so anticipated failures are returned
# to the caller instead of being logged as unexpected server-side errors.
def request_context(func):
    @functools.wraps(func)
    def wrapped(self, ctx, *args, **kwargs):
        if ctx is not None and not isinstance(ctx, context.RequestContext):
            ctx = context.RequestContext.from_dict(ctx.to_dict())
        try:
            return func(self, ctx, *args, **kwargs)
        except exceptions.DCManagerException:
            raise oslo_messaging.rpc.dispatcher.ExpectedException()

    return wrapped
class DCManagerOrchestratorService(service.Service):
    """Lifecycle manager for a running orchestrator service."""

    def __init__(self):
        super().__init__()
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_DC_MANAGER_ORCHESTRATOR
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.TG = None
        self.target = None
        self._rpc_server = None
        self.orchestrator_manager = None

    def start(self):
        utils.set_open_file_limit(cfg.CONF.orchestrator_worker_rlimit_nofile)
        self.init_tgm()
        self.init_manager()
        target = oslo_messaging.Target(
            version=self.rpc_api_version, server=self.host, topic=self.topic
        )
        self.target = target
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()
        super().start()

    def init_tgm(self):
        self.TG = scheduler.ThreadGroupManager()

    def init_manager(self):
        self.orchestrator_manager = OrchestratorManager()

    def _stop_rpc_server(self):
        # Stop RPC connection to prevent new requests
        LOG.debug("Attempting to stop RPC service...")
        if self._rpc_server is not None:
            try:
                self._rpc_server.stop()
                self._rpc_server.wait()
                self._rpc_server = None
                LOG.info("RPC service stopped successfully")
            except Exception as ex:
                LOG.error("Failed to stop engine service: %s", str(ex))

    def stop(self):
        """Stop anything initiated by start"""
        SubprocessCleanup.shutdown_cleanup(origin="service")
        self._stop_rpc_server()
        if self.TG is not None:
            self.TG.stop()
            self.TG = None
        if self.orchestrator_manager is not None:
            self.orchestrator_manager.stop()
            self.orchestrator_manager = None
        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine")
        super().stop()

    @request_context
    def create_sw_update_strategy(self, context, payload):
        # Creates a software update strategy
        LOG.info(
            "Handling create_sw_update_strategy request of type %s"
            % payload.get("type")
        )
        return self.orchestrator_manager.create_sw_update_strategy(context, payload)

    @request_context
    def delete_sw_update_strategy(self, context, update_type=None):
        # Deletes the software update strategy
        LOG.info("Handling delete_sw_update_strategy request")
        return self.orchestrator_manager.delete_sw_update_strategy(
            context, update_type=update_type
        )

    @request_context
    def apply_sw_update_strategy(self, context, update_type=None):
        # Applies the software update strategy
        LOG.info("Handling apply_sw_update_strategy request")
        return self.orchestrator_manager.apply_sw_update_strategy(
            context, update_type=update_type
        )

    @request_context
    def abort_sw_update_strategy(self, context, update_type=None):
        # Aborts the software update strategy
        LOG.info("Handling abort_sw_update_strategy request")
        return self.orchestrator_manager.abort_sw_update_strategy(
            context, update_type=update_type
        )

    @request_context
    def stop_strategy(self, context, strategy_type):
        LOG.info("Handling stop_strategy request")
        return self.orchestrator_manager.stop_strategy(strategy_type)
class DCManagerOrchestratorWorkerService(service.Service):
    """Lifecycle manager for a running orchestrator-worker service."""

    def __init__(self):
        super().__init__()
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_DC_MANAGER_ORCHESTRATOR_WORKER
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.TG = None
        self.target = None
        self._rpc_server = None
        self.orchestrator_worker = None

    def start(self):
        utils.set_open_file_limit(cfg.CONF.orchestrator_worker_rlimit_nofile)
        self.init_tgm()
        self.init_worker()
        target = oslo_messaging.Target(
            version=self.rpc_api_version, server=self.host, topic=self.topic
        )
        self.target = target
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()
        super().start()

    def init_tgm(self):
        self.TG = scheduler.ThreadGroupManager()

    def init_worker(self):
        self.orchestrator_worker = OrchestratorWorker()

    def _stop_rpc_server(self):
        # Stop RPC connection to prevent new requests
        LOG.debug("Attempting to stop RPC service...")
        if self._rpc_server is not None:
            try:
                self._rpc_server.stop()
                self._rpc_server.wait()
                self._rpc_server = None
                LOG.info("RPC service stopped successfully")
            except Exception as ex:
                LOG.error("Failed to stop engine service: %s", str(ex))

    def stop(self):
        """Stop anything initiated by start"""
        SubprocessCleanup.shutdown_cleanup(origin="service")
        self._stop_rpc_server()
        if self.TG is not None:
            self.TG.stop()
            self.TG = None
        if self.orchestrator_worker is not None:
            self.orchestrator_worker.stop()
            self.orchestrator_worker = None
        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine")
        super().stop()

    @request_context
    def orchestrate(self, context, steps_id, strategy_type):
        LOG.info(f"Handling orchestrate request for strategy of type {strategy_type}")
        return self.orchestrator_worker.orchestrate(steps_id, strategy_type)

    @request_context
    def stop_processing(self, context):
        LOG.info("Handling stop_processing")
        return self.orchestrator_worker.stop_processing()
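
The console-script entry points that launch these services are not part of
this file. As a rough sketch, assuming the usual oslo_service launcher
pattern (the worker count of 4, the option handling and the log domain name
are placeholders rather than the real dcmanager entry point):

    import sys

    from oslo_config import cfg
    from oslo_log import log as logging
    from oslo_service import service

    from dcmanager.orchestrator.service import DCManagerOrchestratorWorkerService


    def main():
        logging.register_options(cfg.CONF)
        cfg.CONF(sys.argv[1:], project="dcmanager")
        logging.setup(cfg.CONF, "dcmanager-orchestrator-worker")
        # With workers > 1, oslo_service forks that many child processes and
        # calls start() in each child, which is why the RPC server and the
        # OrchestratorWorker are created in start() rather than in __init__().
        launcher = service.launch(
            cfg.CONF, DCManagerOrchestratorWorkerService(), workers=4
        )
        launcher.wait()


    if __name__ == "__main__":
        main()

The manager side (DCManagerOrchestratorService) would be launched the same
way by its own entry point, presumably as a single process, so that only one
strategy monitoring thread runs at a time.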