# update/software/software/software_controller.py
"""
Copyright (c) 2023-2025 Wind River Systems, Inc.
SPDX-License-Identifier: Apache-2.0
"""
import sys
# prevent software_controller from importing osprofiler
sys.modules['osprofiler'] = None
import configparser
import gc
import json
import logging
import os
from packaging import version
import re
import select
import sh
import shutil
import socket
import subprocess
import tempfile
import threading
import time
import typing
from wsgiref import simple_server
from fm_api import fm_api
from fm_api import constants as fm_constants
from oslo_config import cfg as oslo_cfg
import software.apt_utils as apt_utils
import software.lvm_snapshot as lvm_snapshot
import software.ostree_utils as ostree_utils
from software.api import app
from software.authapi import app as auth_app
from software.constants import CONTROLLER_0_HOSTNAME
from software.constants import CONTROLLER_1_HOSTNAME
from software.constants import INSTALL_LOCAL_FLAG
from software.states import DEPLOY_HOST_STATES
from software.states import DEPLOY_STATES
from software.states import INTERRUPTION_RECOVERY_STATES
from software.base import PatchService
from software.dc_utils import get_subcloud_groupby_version
from software.deploy_state import require_deploy_state
from software.exceptions import APTOSTreeCommandFail
from software.exceptions import HostNotFound
from software.exceptions import InternalError
from software.exceptions import MetadataFail
from software.exceptions import UpgradeNotSupported
from software.exceptions import OSTreeCommandFail
from software.exceptions import OSTreeTarFail
from software.exceptions import SoftwareError
from software.exceptions import SoftwareFail
from software.exceptions import ReleaseInvalidRequest
from software.exceptions import ReleaseValidationFailure
from software.exceptions import ReleaseIsoDeleteFailure
from software.exceptions import SoftwareServiceError
from software.exceptions import InvalidOperation
from software.exceptions import HostAgentUnreachable
from software.exceptions import HostIpNotFound
from software.exceptions import MaxReleaseExceeded
from software.exceptions import ServiceParameterNotFound
from software.plugin import DeployPluginRunner
from software.release_data import reload_release_data
from software.release_data import get_SWReleaseCollection
from software.software_functions import collect_current_load_for_hosts
from software.software_functions import copy_pxeboot_update_file
from software.software_functions import copy_pxeboot_cfg_files
from software.software_functions import create_deploy_hosts
from software.software_functions import deploy_host_validations
from software.software_functions import validate_host_deploy_order
from software.software_functions import parse_release_metadata
from software.software_functions import configure_logging
from software.software_functions import mount_iso_load
from software.software_functions import unmount_iso_load
from software.software_functions import read_upgrade_support_versions
from software.software_functions import get_to_release_from_metadata_file
from software.software_functions import BasePackageData
from software.software_functions import PatchFile
from software.software_functions import package_dir
from software.software_functions import repo_dir
from software.software_functions import root_scripts_dir
from software.software_functions import SW_VERSION
from software.software_functions import audit_log_info
from software.software_functions import repo_root_dir
from software.software_functions import is_deploy_state_in_sync
from software.software_functions import is_deployment_in_progress
from software.software_functions import get_release_from_patch
from software.software_functions import run_remove_temporary_data_script
from software.software_functions import to_bool
from software.release_state import ReleaseState
from software.utilities.deploy_set_failed import start_set_fail
from software.deploy_host_state import DeployHostState
from software.deploy_state import DeployState
from software.release_verify import verify_files
import software.config as cfg
import software.utils as utils
from software.sysinv_utils import get_k8s_ver
from software.sysinv_utils import is_system_controller
from software.sysinv_utils import update_host_sw_version
from software.sysinv_utils import are_all_hosts_unlocked_and_online
from software.sysinv_utils import get_system_info
from software.sysinv_utils import get_oot_drivers
from software.sysinv_utils import trigger_evaluate_apps_reapply
from software.sysinv_utils import trigger_vim_host_audit
from software.db.api import get_instance
import software.messages as messages
import software.constants as constants
from software import states
from tsconfig.tsconfig import INITIAL_CONFIG_COMPLETE_FLAG
from tsconfig.tsconfig import VOLATILE_CONTROLLER_CONFIG_COMPLETE
import xml.etree.ElementTree as ET
CONF = oslo_cfg.CONF
LOG = logging.getLogger('main_logger')
pidfile_path = "/var/run/patch_controller.pid"
sc = None
state_file = "%s/.controller.state" % constants.SOFTWARE_STORAGE_DIR
app_dependency_basename = "app_dependencies.json"
app_dependency_filename = "%s/%s" % (constants.SOFTWARE_STORAGE_DIR, app_dependency_basename)
insvc_patch_restart_controller = "/run/software/.restart.software-controller"
ETC_HOSTS_FILE_PATH = "/etc/hosts"
ETC_HOSTS_BACKUP_FILE_PATH = "/etc/hosts.patchbak"
PATCH_MIGRATION_SCRIPT_DIR = "/etc/update.d"
SOFTWARE_LOG_FILE = "/var/log/software.log"
stale_hosts = []
pending_queries = []
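# stale_hosts: agents whose cached detailed-query data is out of date and
# need a fresh detailed query; pending_queries: agents with a detailed
# query already in flight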
thread_death = None
keep_running = True
system_mode = None
# Limit socket blocking to 5 seconds to allow for thread to shutdown
api_socket_timeout = 5.0
def is_simplex():
global system_mode
if system_mode is None:
_, system_mode = get_system_info()
return system_mode == constants.SYSTEM_MODE_SIMPLEX
class ControllerNeighbour(object):
def __init__(self):
self.last_ack = 0
self.synced = False
def rx_ack(self):
self.last_ack = time.time()
def get_age(self):
return int(time.time() - self.last_ack)
def rx_synced(self):
self.synced = True
def clear_synced(self):
self.synced = False
def get_synced(self):
return self.synced
class AgentNeighbour(object):
def __init__(self, ip):
self.ip = ip
self.last_ack = 0
self.last_query_id = 0
self.out_of_date = False
self.hostname = "n/a"
self.requires_reboot = False
self.patch_failed = False
self.stale = False
self.pending_query = False
self.latest_sysroot_commit = None
self.nodetype = None
self.sw_version = "unknown"
self.subfunctions = []
self.state = None
self._is_alive = False
@property
def is_alive(self):
return self._is_alive
@is_alive.setter
def is_alive(self, value):
self._is_alive = value
def rx_ack(self,
hostname,
out_of_date,
requires_reboot,
query_id,
patch_failed,
sw_version,
state):
self.last_ack = time.time()
self.hostname = hostname
self.patch_failed = patch_failed
self.sw_version = sw_version
self.state = state
if out_of_date != self.out_of_date or requires_reboot != self.requires_reboot:
self.out_of_date = out_of_date
self.requires_reboot = requires_reboot
LOG.info("Agent %s (%s) reporting out_of_date=%s, requires_reboot=%s",
self.hostname,
self.ip,
self.out_of_date,
self.requires_reboot)
if self.last_query_id != query_id:
self.last_query_id = query_id
self.stale = True
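            # a changed query id means the cached detailed info for this
            # agent is out of date; queue it for a detailed re-query unless
            # one is already pending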
if self.ip not in stale_hosts and self.ip not in pending_queries:
stale_hosts.append(self.ip)
def get_age(self):
return int(time.time() - self.last_ack)
def handle_query_detailed_resp(self,
latest_sysroot_commit,
nodetype,
sw_version,
subfunctions,
state):
self.latest_sysroot_commit = latest_sysroot_commit
self.nodetype = nodetype
self.stale = False
self.pending_query = False
self.sw_version = sw_version
self.subfunctions = subfunctions
self.state = state
if self.ip in pending_queries:
pending_queries.remove(self.ip)
if self.ip in stale_hosts:
stale_hosts.remove(self.ip)
def get_dict(self):
d = {"ip": self.ip,
"hostname": self.hostname,
"deployed": not self.out_of_date,
"secs_since_ack": self.get_age(),
"patch_failed": self.patch_failed,
"stale_details": self.stale,
"latest_sysroot_commit": self.latest_sysroot_commit,
"nodetype": self.nodetype,
"subfunctions": self.subfunctions,
"sw_version": self.sw_version,
"state": self.state}
return d
class PatchMessageHello(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_HELLO)
self.patch_op_counter = 0
def decode(self, data):
messages.PatchMessage.decode(self, data)
if 'patch_op_counter' in data:
self.patch_op_counter = data['patch_op_counter']
def encode(self):
global sc
messages.PatchMessage.encode(self)
self.message['patch_op_counter'] = sc.patch_op_counter
def handle(self, sock, addr):
global sc
host = addr[0]
if host == cfg.get_mgmt_ip():
# Ignore messages from self
return
# Send response
if self.patch_op_counter > 0:
sc.handle_nbr_patch_op_counter(host, self.patch_op_counter)
resp = PatchMessageHelloAck()
resp.send(sock)
def send(self, sock):
global sc
if sc.install_local:
return
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.controller_address, cfg.controller_port))
class PatchMessageHelloAck(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_HELLO_ACK)
def encode(self):
# Nothing to add, so just call the super class
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
global sc
sc.controller_neighbours_lock.acquire()
if not addr[0] in sc.controller_neighbours:
sc.controller_neighbours[addr[0]] = ControllerNeighbour()
sc.controller_neighbours[addr[0]].rx_ack()
sc.controller_neighbours_lock.release()
def send(self, sock):
global sc
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.controller_address, cfg.controller_port))
class PatchMessageSyncReq(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_SYNC_REQ)
def encode(self):
# Nothing to add to the SYNC_REQ, so just call the super class
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
global sc
host = addr[0]
if host == cfg.get_mgmt_ip():
# Ignore messages from self
return
# We may need to do this in a separate thread, so that we continue to process hellos
LOG.info("Handling sync req")
# NOTE(bqian) sync_from_nbr returns "False" if sync operations failed.
# need to think of reattempt to deal w/ the potential failure.
sc.sync_from_nbr(host)
resp = PatchMessageSyncComplete()
resp.send(sock)
def send(self, sock):
global sc
LOG.info("sending sync req")
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.controller_address, cfg.controller_port))
class PatchMessageSyncComplete(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_SYNC_COMPLETE)
def encode(self):
# Nothing to add to the SYNC_COMPLETE, so just call the super class
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
global sc
LOG.info("Handling sync complete")
sc.controller_neighbours_lock.acquire()
if not addr[0] in sc.controller_neighbours:
sc.controller_neighbours[addr[0]] = ControllerNeighbour()
sc.controller_neighbours[addr[0]].rx_synced()
sc.controller_neighbours_lock.release()
def send(self, sock):
global sc
LOG.info("sending sync complete")
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.controller_address, cfg.controller_port))
class PatchMessageHelloAgent(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_HELLO_AGENT)
def encode(self):
global sc
messages.PatchMessage.encode(self)
self.message['patch_op_counter'] = sc.patch_op_counter
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock):
global sc
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.agent_address, cfg.agent_port))
if not sc.install_local:
local_hostname = utils.ip_to_versioned_localhost(cfg.agent_mcast_group)
sock.sendto(str.encode(message), (local_hostname, cfg.agent_port))
class PatchMessageSendLatestFeedCommit(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_SEND_LATEST_FEED_COMMIT)
def encode(self):
global sc
messages.PatchMessage.encode(self)
self.message['latest_feed_commit'] = sc.latest_feed_commit
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock):
global sc
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.agent_address, cfg.agent_port))
if not sc.install_local:
local_hostname = utils.ip_to_versioned_localhost(cfg.agent_mcast_group)
sock.sendto(str.encode(message), (local_hostname, cfg.agent_port))
class PatchMessageHelloAgentAck(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_HELLO_AGENT_ACK)
self.query_id = 0
self.agent_out_of_date = False
self.agent_hostname = "n/a"
self.agent_requires_reboot = False
self.agent_patch_failed = False
self.agent_sw_version = "unknown"
self.agent_state = "unknown"
def decode(self, data):
messages.PatchMessage.decode(self, data)
if 'query_id' in data:
self.query_id = data['query_id']
if 'out_of_date' in data:
self.agent_out_of_date = data['out_of_date']
if 'hostname' in data:
self.agent_hostname = data['hostname']
if 'requires_reboot' in data:
self.agent_requires_reboot = data['requires_reboot']
if 'patch_failed' in data:
self.agent_patch_failed = data['patch_failed']
if 'sw_version' in data:
self.agent_sw_version = data['sw_version']
if 'state' in data:
self.agent_state = data['state']
def encode(self):
# Nothing to add, so just call the super class
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
global sc
sc.hosts_lock.acquire()
if not addr[0] in sc.hosts:
sc.hosts[addr[0]] = AgentNeighbour(addr[0])
sc.hosts[addr[0]].rx_ack(self.agent_hostname,
self.agent_out_of_date,
self.agent_requires_reboot,
self.query_id,
self.agent_patch_failed,
self.agent_sw_version,
self.agent_state)
sc.hosts_lock.release()
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class PatchMessageQueryDetailed(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_QUERY_DETAILED)
def encode(self):
# Nothing to add to the message, so just call the super class
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock):
self.encode()
message = json.dumps(self.message)
sock.sendall(str.encode(message))
class PatchMessageQueryDetailedResp(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_QUERY_DETAILED_RESP)
self.agent_sw_version = "unknown"
self.latest_sysroot_commit = "unknown"
self.subfunctions = []
self.nodetype = "unknown"
self.agent_sw_version = "unknown"
self.agent_state = "unknown"
def decode(self, data):
messages.PatchMessage.decode(self, data)
if 'latest_sysroot_commit' in data:
self.latest_sysroot_commit = data['latest_sysroot_commit']
if 'nodetype' in data:
self.nodetype = data['nodetype']
if 'sw_version' in data:
self.agent_sw_version = data['sw_version']
if 'subfunctions' in data:
self.subfunctions = data['subfunctions']
if 'state' in data:
self.agent_state = data['state']
def encode(self):
LOG.error("Should not get here")
def handle(self, sock, addr):
global sc
ip = addr[0]
sc.hosts_lock.acquire()
if ip in sc.hosts:
sc.hosts[ip].handle_query_detailed_resp(self.latest_sysroot_commit,
self.nodetype,
self.agent_sw_version,
self.subfunctions,
self.agent_state)
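            # a fresh detailed response means this host has reported in since
            # the last patch operation; drop it from the per-patch interim
            # ("Partial") state tracking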
for patch_id in list(sc.interim_state):
if ip in sc.interim_state[patch_id]:
sc.interim_state[patch_id].remove(ip)
if len(sc.interim_state[patch_id]) == 0:
del sc.interim_state[patch_id]
sc.hosts_lock.release()
else:
sc.hosts_lock.release()
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class PatchMessageAgentInstallReq(messages.PatchMessage):
def __init__(self, additional_data=None):
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_REQ)
self.ip = None
self.force = False
self.major_release = None
self.commit_id = None
self.additional_data = additional_data
def encode(self):
global sc
messages.PatchMessage.encode(self)
self.message['force'] = self.force
self.message['major_release'] = self.major_release
self.message['commit_id'] = self.commit_id
if self.additional_data:
self.message['additional_data'] = self.additional_data.copy()
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock):
self.encode()
message = json.dumps(self.message)
msg = f"sending install request to node: {self.ip} with {message}"
LOG.info(msg)
sock.sendto(str.encode(message), (self.ip, cfg.agent_port))
class PatchMessageAgentInstallResp(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_AGENT_INSTALL_RESP)
self.status = False
self.reject_reason = None
self.reboot_required = False
reload_release_data()
def decode(self, data):
messages.PatchMessage.decode(self, data)
if 'status' in data:
self.status = data['status']
if 'reject_reason' in data:
self.reject_reason = data['reject_reason']
if 'reboot_required' in data:
self.reboot_required = data['reboot_required']
def encode(self):
# Nothing to add, so just call the super class
messages.PatchMessage.encode(self)
def _set_host_install_completed(self, host):
global sc
sc.hosts_lock.acquire()
try:
host.install_status = self.status
host.install_pending = False
host.install_reject_reason = self.reject_reason
finally:
sc.hosts_lock.release()
def handle(self, sock, addr):
LOG.info("Handling install resp from %s", addr[0])
global sc
ip = addr[0]
sc.hosts_lock.acquire()
try:
# NOTE(bqian) seems like trying to tolerate a failure situation
# that a host is directed to install a patch but during the installation
# software-controller-daemon gets restarted
# should remove the sc.hosts which is in memory volatile storage and replaced with
# permanent deploy-host entity
if ip not in sc.hosts:
sc.hosts[ip] = AgentNeighbour(ip)
host = sc.hosts[ip]
hostname = host.hostname
finally:
sc.hosts_lock.release()
dbapi = get_instance()
deploy = dbapi.get_deploy_all()
if len(deploy) == 0:
LOG.info("No deploy in progress. ignore install resp from %s", addr[0])
return
deploy = deploy[0]
success = False
deploy_host_state = DeployHostState(hostname)
try:
if self.status:
deploying = ReleaseState(release_state=states.DEPLOYING)
if deploying.is_major_release_deployment():
# For major release deployment, update sysinv ihost.sw_version
# so that right manifest can be generated.
sw_version = utils.get_major_release_version(deploy.get("to_release"))
msg = f"Update {hostname} to {sw_version}"
LOG.info(msg)
try:
update_host_sw_version(hostname, sw_version)
except Exception:
# Failed a step, fail the host deploy for reattempt
return
success = True
finally:
if success:
deploy_host_state.deployed()
if self.reboot_required:
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_SUCCESS_RR,
fm_constants.FM_ALARM_STATE_SET,
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
else:
deploy_host_state.deploy_failed()
sc.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
fm_constants.FM_ALARM_STATE_SET,
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname))
self._set_host_install_completed(host)
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class PatchMessageDropHostReq(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_DROP_HOST_REQ)
self.ip = None
def encode(self):
messages.PatchMessage.encode(self)
self.message['ip'] = self.ip
def decode(self, data):
messages.PatchMessage.decode(self, data)
if 'ip' in data:
self.ip = data['ip']
def handle(self, sock, addr):
global sc
host = addr[0]
if host == cfg.get_mgmt_ip():
# Ignore messages from self
return
if self.ip is None:
LOG.error("Received PATCHMSG_DROP_HOST_REQ with no ip: %s", json.dumps(self.data))
return
sc.drop_host(self.ip, sync_nbr=False)
return
def send(self, sock):
global sc
if sc.install_local:
return
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.controller_address, cfg.controller_port))
class SoftwareMessageDeployStateUpdate(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_DEPLOY_STATE_UPDATE)
self.data = None
def decode(self, data):
messages.PatchMessage.decode(self, data)
self.data = data
def encode(self):
global sc
messages.PatchMessage.encode(self)
filesystem_data = utils.get_software_filesystem_data()
deploys_state = {"deploy_host": filesystem_data.get("deploy_host", {}),
"deploy": filesystem_data.get("deploy", {})}
self.message["deploy_state"] = deploys_state
def handle(self, sock, addr):
global sc
if sc.mgmt_ip == addr[0]:
# update from localhost, ignore
return
filesystem_data = utils.get_software_filesystem_data()
synced_filesystem_data = utils.get_synced_software_filesystem_data()
actual_state = {"deploy_host": filesystem_data.get("deploy_host", {}),
"deploy": filesystem_data.get("deploy", {})}
synced_state = {"deploy_host": synced_filesystem_data.get("deploy_host", {}),
"deploy": synced_filesystem_data.get("deploy", {})}
peer_state = {"deploy_host": self.data.get("deploy_state").get("deploy_host", {}),
"deploy": self.data.get("deploy_state").get("deploy", {})}
result = "diverged"
if actual_state == peer_state:
result = messages.MSG_ACK_SUCCESS
elif actual_state == synced_state:
result = messages.MSG_ACK_SUCCESS
utils.save_to_json_file(constants.SOFTWARE_JSON_FILE, peer_state)
if result == messages.MSG_ACK_SUCCESS:
utils.save_to_json_file(constants.SYNCED_SOFTWARE_JSON_FILE, peer_state)
resp = SoftwareMessageDeployStateUpdateAck()
resp.send(sock, result)
def send(self, sock):
global sc
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.controller_address, cfg.controller_port))
class SoftwareMessageDeployStateUpdateAck(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_DEPLOY_STATE_UPDATE_ACK)
self.peer_state_data = {}
def decode(self, data):
messages.PatchMessage.decode(self, data)
self.peer_state_data = data
def encode(self, result): # pylint: disable=arguments-differ
messages.PatchMessage.encode(self)
synced_data = utils.get_synced_software_filesystem_data()
self.message["result"] = result
self.message["deploy_state"] = synced_data
def handle(self, sock, addr):
global sc
if sc.mgmt_ip == addr[0]:
# update from localhost, ignore
return
if self.peer_state_data["result"] == messages.MSG_ACK_SUCCESS:
LOG.debug("Peer controller is synced with value: %s",
self.peer_state_data["deploy_state"])
utils.save_to_json_file(constants.SYNCED_SOFTWARE_JSON_FILE,
self.peer_state_data["deploy_state"])
else:
LOG.error("Peer controller deploy state has diverged.")
def send(self, sock, result):
self.encode(result)
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.controller_address, cfg.controller_port))
class SWMessageDeployStateChanged(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_DEPLOY_STATE_CHANGED)
self.valid = False
self.agent = None
self.deploy_state = None
self.hostname = None
self.host_state = None
def decode(self, data):
"""
The message is a serialized json object:
{
"msgtype": "deploy-state-changed",
"msgversion": 1,
"agent": "<a valid agent>",
"deploy-state": "<deploy-state>",
"hostname": "<hostname>",
"host-state": "<host-deploy-substate>"
}
"""
messages.PatchMessage.decode(self, data)
self.valid = True
self.agent = None
valid_agents = ['deploy-start', 'deploy-activate', 'deploy-activate-rollback', 'admin']
if 'agent' in data:
self.agent = data['agent']
else:
self.agent = 'unknown'
if self.agent not in valid_agents:
# ignore msg from unknown senders
LOG.info("%s received from unknown agent %s" %
(messages.PATCHMSG_DEPLOY_STATE_CHANGED, self.agent))
self.valid = False
valid_state = {
DEPLOY_STATES.START_DONE.value: DEPLOY_STATES.START_DONE,
DEPLOY_STATES.START_FAILED.value: DEPLOY_STATES.START_FAILED,
DEPLOY_STATES.ACTIVATE_FAILED.value: DEPLOY_STATES.ACTIVATE_FAILED,
DEPLOY_STATES.ACTIVATE_DONE.value: DEPLOY_STATES.ACTIVATE_DONE,
DEPLOY_STATES.ACTIVATE_ROLLBACK_DONE.value: DEPLOY_STATES.ACTIVATE_ROLLBACK_DONE,
DEPLOY_STATES.ACTIVATE_ROLLBACK_FAILED.value: DEPLOY_STATES.ACTIVATE_ROLLBACK_FAILED,
DEPLOY_STATES.HOST_FAILED.value: DEPLOY_STATES.HOST_FAILED
}
if 'deploy-state' in data and data['deploy-state']:
deploy_state = data['deploy-state']
if deploy_state in valid_state:
self.deploy_state = valid_state[deploy_state]
LOG.info("%s received from %s with deploy-state %s" %
(messages.PATCHMSG_DEPLOY_STATE_CHANGED, self.agent, deploy_state))
else:
self.valid = False
LOG.error("%s received from %s with invalid deploy-state %s" %
(messages.PATCHMSG_DEPLOY_STATE_CHANGED, self.agent, deploy_state))
if 'hostname' in data and data['hostname']:
self.hostname = data['hostname']
if 'host-state' in data and data['host-state']:
host_state = states.DEPLOY_HOST_STATES(data['host-state'])
if host_state not in states.VALID_HOST_DEPLOY_STATE:
LOG.error("%s received from %s with invalid host-state %s" %
(messages.PATCHMSG_DEPLOY_STATE_CHANGED, self.agent, host_state))
self.valid = False
else:
self.host_state = host_state
if self.valid:
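            # exactly one of (hostname + host-state) or deploy-state must be
            # set; a message carrying both, or neither, is invalid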
self.valid = (bool(self.host_state and self.hostname) != bool(self.deploy_state))
if not self.valid:
LOG.error("%s received from %s as invalid %s" %
(messages.PATCHMSG_DEPLOY_STATE_CHANGED, self.agent, data))
def handle(self, sock, addr):
global sc
if not self.valid:
# nothing to do
return
if self.deploy_state:
LOG.info("Received deploy state changed to %s, agent %s" %
(self.deploy_state, self.agent))
try:
sc.deploy_state_changed(self.deploy_state)
except Exception as e:
LOG.error("Deploy state change failed: %s" % str(e))
else:
LOG.info("Received %s deploy host state changed to %s, agent %s" %
(self.hostname, self.host_state, self.agent))
sc.host_deploy_state_changed(self.hostname, self.host_state)
sock.sendto(str.encode("OK"), addr)
def send(self, sock):
global sc
LOG.info("sending sync req")
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.controller_address, cfg.controller_port))
class SoftwareMessageDeployDeleteCleanupReq(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_DEPLOY_DELETE_CLEANUP_REQ)
self.ip = None
self.major_release = None
def encode(self):
messages.PatchMessage.encode(self)
self.message["major_release"] = self.major_release
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock):
global sc
LOG.info("Sending deploy delete cleanup request to all nodes.")
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (sc.agent_address, cfg.agent_port))
class SoftwareMessageDeployDeleteCleanupResp(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_DEPLOY_DELETE_CLEANUP_RESP)
self.success = None
def decode(self, data):
messages.PatchMessage.decode(self, data)
if 'success' in data:
self.success = data['success']
def encode(self):
# Nothing to add, so just call the super class
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
ip = addr[0]
LOG.info("Handling deploy delete cleanup resp from %s", ip)
global sc
if self.success:
LOG.info("Host %s sucessfully executed deploy delete "
"cleanup tasks." % sc.hosts[ip].hostname)
return
LOG.error("Host %s failed executing deploy delete "
"cleanup tasks." % sc.hosts[ip].hostname)
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class SoftwareMessageCheckAgentAliveReq(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_CHECK_AGENT_ALIVE_REQ)
self.ip = None
def encode(self):
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
LOG.error("Should not get here")
def send(self, sock):
LOG.info("Sending check agent alive to %s", self.ip)
self.encode()
message = json.dumps(self.message)
sock.sendto(str.encode(message), (self.ip, cfg.agent_port))
class SoftwareMessageCheckAgentAliveResp(messages.PatchMessage):
def __init__(self):
messages.PatchMessage.__init__(self, messages.PATCHMSG_CHECK_AGENT_ALIVE_RESP)
self.status = False
def decode(self, data):
messages.PatchMessage.decode(self, data)
def encode(self):
# Nothing to add, so just call the super class
messages.PatchMessage.encode(self)
def handle(self, sock, addr):
LOG.info("Handling check agent alive resp from %s", addr[0])
global sc
ip = addr[0]
sc.hosts_lock.acquire()
sc.hosts[ip].is_alive = True
sc.hosts_lock.release()
LOG.info("Agent from %s is reachable and alive." % ip)
def send(self, sock): # pylint: disable=unused-argument
LOG.error("Should not get here")
class PatchController(PatchService):
def __init__(self):
PatchService.__init__(self)
# Locks
self.socket_lock = threading.RLock()
self.controller_neighbours_lock = threading.RLock()
self.hosts_lock = threading.RLock()
self.hosts = {}
self.controller_neighbours = {}
self.host_mgmt_ip = []
self.db_api_instance = get_instance()
self.ignore_errors = 'False'
# interim_state is used to track hosts that have not responded
# with fresh queries since a patch was applied or removed, on
# a per-patch basis. This allows the patch controller to move
# patches immediately into a "Partial" state until all nodes
# have responded.
#
self.interim_state = {}
self.sock_out = None
self.sock_in = None
self.controller_address = None
self.agent_address = None
self.patch_op_counter = 1
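        # patch_op_counter is persisted in the state file and advertised in
        # hello messages; the controller with the lower counter syncs from
        # its neighbour (see handle_nbr_patch_op_counter)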
reload_release_data()
try:
self.latest_feed_commit = ostree_utils.get_feed_latest_commit(SW_VERSION)
except OSTreeCommandFail:
LOG.exception("Failure to fetch the feed ostree latest log while "
"initializing Patch Controller")
self.latest_feed_commit = None
self.base_pkgdata = BasePackageData()
        # Alarm cache: stores the id of the most recently raised alarm
self.usm_alarm = {constants.LAST_IN_SYNC: False}
self.hostname = socket.gethostname()
self.fm_api = fm_api.FaultAPIs()
self.allow_insvc_patching = True
if os.path.exists(app_dependency_filename):
try:
with open(app_dependency_filename, 'r') as f:
self.app_dependencies = json.loads(f.read())
except Exception:
LOG.exception("Failed to read app dependencies: %s", app_dependency_filename)
else:
self.app_dependencies = {}
if os.path.isfile(state_file):
self.read_state_file()
else:
self.write_state_file()
# Create patch activation scripts folder
if not os.path.exists(PATCH_MIGRATION_SCRIPT_DIR):
os.makedirs(PATCH_MIGRATION_SCRIPT_DIR, 0o755)
self.register_deploy_state_change_listeners()
def _state_changed_sync(self, *args): # pylint: disable=unused-argument
if is_simplex():
# ensure the in-sync state for SX
# treat it as SX for deploy before bootstrap
shutil.copyfile(constants.SOFTWARE_JSON_FILE, constants.SYNCED_SOFTWARE_JSON_FILE)
else:
self._update_state_to_peer()
def _notify_vim_on_state_change(self, target_state):
"""Notify VIM of state change.
This method will notify VIM when one of the following state changes is made:
- start-done
- start-failed
- activate-done
- activate-failed
- activate-rollback-done
- activate-rollback-failed
If new async states are added they should be added here.
Args:
target_state: The new deployment state to notify VIM about
"""
if self.pre_bootstrap:
return
if target_state not in [
DEPLOY_STATES.START_DONE,
DEPLOY_STATES.START_FAILED,
DEPLOY_STATES.ACTIVATE_DONE,
DEPLOY_STATES.ACTIVATE_FAILED,
DEPLOY_STATES.ACTIVATE_ROLLBACK_DONE,
DEPLOY_STATES.ACTIVATE_ROLLBACK_FAILED,
]:
return
# Get local hostname
LOG.info("Notifying VIM of state change: %s", target_state)
trigger_vim_host_audit(socket.gethostname())
def register_deploy_state_change_listeners(self):
# data sync listener
DeployState.register_event_listener(self._state_changed_sync)
DeployHostState.register_event_listener(self._state_changed_sync)
DeployHostState.register_event_listener(DeployState.host_deploy_updated)
DeployState.register_event_listener(ReleaseState.deploy_updated)
DeployState.register_event_listener(self.create_clean_up_deployment_alarm)
# VIM notifications
DeployState.register_event_listener(self._notify_vim_on_state_change)
# TODO(jkraitbe): Add host-deploy when that becomes async
@property
def release_collection(self):
swrc = get_SWReleaseCollection()
return swrc
def update_config(self):
cfg.read_config()
if self.port != cfg.controller_port:
self.port = cfg.controller_port
# Loopback interface does not support multicast messaging, therefore
# revert to using unicast messaging when configured against the
# loopback device
if self.pre_bootstrap:
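            # before bootstrap there is no management interface yet, so use
            # the pre-bootstrap hostname for both controller and agent
            # addressing and disable multicast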
mgmt_ip = utils.gethostbyname(constants.PREBOOTSTRAP_HOSTNAME)
self.mcast_addr = None
self.controller_address = mgmt_ip
self.agent_address = mgmt_ip
elif cfg.get_mgmt_iface() == constants.LOOPBACK_INTERFACE_NAME:
mgmt_ip = cfg.get_mgmt_ip()
self.mcast_addr = None
self.controller_address = mgmt_ip
self.agent_address = mgmt_ip
else:
self.mcast_addr = cfg.controller_mcast_group
self.controller_address = cfg.controller_mcast_group
self.agent_address = cfg.agent_mcast_group
def socket_lock_acquire(self):
self.socket_lock.acquire()
def socket_lock_release(self):
try:
self.socket_lock.release()
except Exception:
pass
def write_state_file(self):
config = configparser.ConfigParser(strict=False)
cfgfile = open(state_file, 'w')
config.add_section('runtime')
config.set('runtime', 'patch_op_counter', str(self.patch_op_counter))
config.write(cfgfile)
cfgfile.close()
def read_state_file(self):
config = configparser.ConfigParser(strict=False)
config.read(state_file)
try:
counter = config.getint('runtime', 'patch_op_counter')
self.patch_op_counter = counter
LOG.info("patch_op_counter is: %d", self.patch_op_counter)
except configparser.Error:
LOG.exception("Failed to read state info")
def handle_nbr_patch_op_counter(self, host, nbr_patch_op_counter):
if self.patch_op_counter >= nbr_patch_op_counter:
return
# NOTE(bqian) sync_from_nbr returns "False" if sync operations failed.
# need to think of reattempt to deal w/ the potential failure.
self.sync_from_nbr(host)
def sync_from_nbr(self, host):
# Sync the software repo
host_url = utils.ip_to_url(host)
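        # rsync flags: -a (archive), -c (skip based on checksum rather than
        # mtime/size), -v (verbose); --delete removes destination files
        # absent on the mate so the copy is exact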
try:
output = subprocess.check_output(["rsync",
"-acv",
"--delete",
"--exclude", "tmp",
"--exclude", "software.json",
"rsync://%s/software/" % host_url,
"%s/" % constants.SOFTWARE_STORAGE_DIR],
stderr=subprocess.STDOUT)
LOG.info("Synced to mate software via rsync: %s", output)
except subprocess.CalledProcessError as e:
LOG.error("Failed to rsync: %s", e.output)
return False
try:
output = subprocess.check_output(["rsync",
"-acv",
"--delete",
"rsync://%s/repo/" % host_url,
"%s/" % repo_root_dir],
stderr=subprocess.STDOUT)
LOG.info("Synced to mate repo via rsync: %s", output)
        except subprocess.CalledProcessError as e:
            LOG.error("Failed to rsync: %s", e.output)
return False
try:
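            # the feed ostree repos and migration scripts only need to be
            # pulled when the sync source is the mate controller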
for neighbour in list(self.hosts):
if (self.hosts[neighbour].nodetype == "controller" and
self.hosts[neighbour].ip == host):
LOG.info("Starting sync controllers")
# The output is a string that lists the directories
# Example output:
# >>> dir_names = sh.ls("/var/www/pages/feed/")
# >>> dir_names.stdout
# b'rel-22.12 rel-22.5\n'
dir_names = sh.ls(constants.FEED_OSTREE_BASE_DIR)
# Convert the output above into a list that can be iterated
# >>> list_of_dirs = dir_names.stdout.decode().rstrip().split()
# >>> print(list_of_dirs)
# ['rel-22.12', 'rel-22.5']
list_of_dirs = dir_names.stdout.decode("utf-8").rstrip().split()
for rel_dir in list_of_dirs:
                        # todo(lvieira): The sync of N-1 feed folders is
                        # filtered out here. Recheck this if applying an N-1
                        # patch on the system controller is supported in the
                        # future.
rel_version = rel_dir.split("rel-")[-1]
if rel_version < SW_VERSION:
LOG.info("Skip syncing %s inactive release", rel_dir)
continue
feed_repo = "%s/%s/ostree_repo/" % (constants.FEED_OSTREE_BASE_DIR, rel_dir)
if not os.path.isdir(feed_repo):
LOG.info("Skipping feed dir %s", feed_repo)
continue
LOG.info("Syncing %s", feed_repo)
output = subprocess.check_output(["ostree",
"--repo=%s" % feed_repo,
"pull",
"--depth=-1",
"--mirror",
"starlingx"],
stderr=subprocess.STDOUT)
output = subprocess.check_output(["ostree",
"summary",
"--update",
"--repo=%s" % feed_repo],
stderr=subprocess.STDOUT)
LOG.info("Synced to mate feed via ostree pull: %s", output)
output = subprocess.check_output(["rsync",
"-acv",
"--delete",
"rsync://%s/update_scripts/" % host_url,
"%s/" % PATCH_MIGRATION_SCRIPT_DIR],
stderr=subprocess.STDOUT)
LOG.info("Synced %s folder between controllers: %s"
% (PATCH_MIGRATION_SCRIPT_DIR, output))
except subprocess.CalledProcessError as e:
LOG.error("Failed during controllers sync tasks: %s", e.output)
return False
except Exception as e:
LOG.error("Exception while syncing controllers: %s", e)
return False
self.read_state_file()
self.interim_state = {}
reload_release_data()
if os.path.exists(app_dependency_filename):
try:
with open(app_dependency_filename, 'r') as f:
self.app_dependencies = json.loads(f.read())
except Exception:
LOG.exception("Failed to read app dependencies: %s", app_dependency_filename)
else:
self.app_dependencies = {}
return True
def inc_patch_op_counter(self):
self.patch_op_counter += 1
self.write_state_file()
def get_release_dependency_list(self, release_id, preinstalled_patches=None):
"""
Returns a list of software releases that are required by this release.
Example: If R5 requires R4 and R1, R4 requires R3 and R1, R3 requires R1
then for input param release_id='R5', it will return ['R4', 'R1', 'R3']
        :param release_id: The software release ID
        :param preinstalled_patches: A list containing all pre-installed patches
"""
def get_dependencies(release_id, visited):
release = self.release_collection.get_release_by_id(release_id)
if release is None:
error = f"Not all required releases are uploaded, missing {release_id}"
raise SoftwareServiceError(error=error)
dependencies = []
for req_release in release.requires_release_ids:
if req_release not in visited:
visited.add(req_release)
dependencies.append(req_release)
if req_release not in preinstalled_patches:
dependencies.extend(get_dependencies(req_release, visited))
return dependencies
if preinstalled_patches is None:
preinstalled_patches = []
return get_dependencies(release_id, set())
def get_ostree_tar_filename(self, patch_sw_version, patch_id):
'''
Returns the path of the ostree tarball
:param patch_sw_version: sw version this patch must be applied to
:param patch_id: The patch ID
'''
ostree_tar_dir = package_dir[patch_sw_version]
ostree_tar_filename = "%s/%s-software.tar" % (ostree_tar_dir, patch_id)
return ostree_tar_filename
def delete_start_install_script(self, patch_id):
'''
Deletes the start and install scripts associated with the patch
:param patch_id: The patch ID
'''
release = self.release_collection.get_release_by_id(patch_id)
scripts = ["pre_start", "post_start", "pre_install", "post_install"]
for script in scripts:
script_name = getattr(release, script, None)
if script_name:
script_path = os.path.join(root_scripts_dir, f"{patch_id}_{script_name}")
try:
os.remove(script_path)
LOG.info("Removed %s script" % script_path)
except OSError:
msg = "Failed to remove start/install script for %s" % patch_id
LOG.warning(msg)
def delete_patch_activate_scripts(self, patch_id):
'''
Deletes the activate scripts associated with the patch
:param patch_id: The patch ID
'''
release = self.release_collection.get_release_by_id(patch_id)
activate_scripts_list = release.activation_scripts
for script in activate_scripts_list:
full_name_file = "%s_%s" % (patch_id, script)
script_path = "%s/%s" % (root_scripts_dir, full_name_file)
try:
os.remove(script_path)
except OSError:
msg = "Failed to remove the activate script for %s" % patch_id
LOG.warning(msg)
def run_semantic_check(self, action, patch_list):
if not os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG):
# Skip semantic checks if initial configuration isn't complete
return
# Pass the current patch state to the semantic check as a series of args
patch_state_args = []
for release in self.release_collection.iterate_releases():
patch_state = '%s=%s' % (release.id, release.state)
patch_state_args += ['-p', patch_state]
# Run semantic checks, if any
for patch_id in patch_list:
semchk = os.path.join(constants.SEMANTICS_DIR, action, patch_id)
if os.path.exists(semchk):
try:
LOG.info("Running semantic check: %s", semchk)
subprocess.check_output([semchk] + patch_state_args,
stderr=subprocess.STDOUT)
LOG.info("Semantic check %s passed", semchk)
except subprocess.CalledProcessError as e:
msg = "Semantic check failed for %s:\n%s" % (patch_id, e.output)
LOG.exception(msg)
raise SoftwareFail(msg)
def software_install_local_api(self, delete):
"""
Enable patch installation to local controller
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
dbapi = get_instance()
deploy = dbapi.get_deploy_all()
if len(deploy) > 0:
msg_info += "Software Deploy operation is in progress.\n"
msg_info += "Please finish current deploy before modifying install local mode.\n"
return dict(info=msg_info, warning=msg_warning, error=msg_error)
        if os.path.isfile(INSTALL_LOCAL_FLAG) and delete:
            try:
                os.remove(INSTALL_LOCAL_FLAG)
            except Exception:
                LOG.exception("Failed to clear %s flag", INSTALL_LOCAL_FLAG)
msg = "Software deployment in local installation mode is stopped"
msg_info += f"{msg}.\n"
LOG.info(msg)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
elif not delete and not os.path.isfile(INSTALL_LOCAL_FLAG):
open(INSTALL_LOCAL_FLAG, 'a').close()
msg = "Software deployment in local installation mode is started"
msg_info += f"{msg}.\n"
LOG.info(msg)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
else:
mode = 'disabled' if delete else 'enabled'
msg_info += f"Software deployment in local installation mode is already {mode}.\n"
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def major_release_upload_check(self):
"""
major release upload semantic check
"""
valid_controllers = ['controller-0']
if socket.gethostname() not in valid_controllers:
msg = f"Upload rejected, major release must be uploaded to {valid_controllers}"
LOG.info(msg)
raise SoftwareServiceError(error=msg)
max_major_releases = 2
major_releases = []
for rel in self.release_collection.iterate_releases():
major_rel = rel.sw_version
if major_rel not in major_releases:
major_releases.append(major_rel)
# Only system controller can have 2 major releases (N+1 and N-1)
max_releases = max_major_releases + 1 if is_system_controller() else max_major_releases
if len(major_releases) >= max_releases:
msg = f"Major releases {major_releases} have already been uploaded{' in system controller' if is_system_controller() else ''}. " + \
f"Max major releases is {max_releases}"
LOG.info(msg)
raise MaxReleaseExceeded(msg)
def _run_load_import(self, from_release, to_release, iso_mount_dir, upgrade_files):
"""
Run load and import
:param from_release: From release
:param to_release: To release
:param iso_mount_dir: ISO mount directory
:return: info, warning, error messages, dict of release metadata info
"""
local_info = ""
local_warning = ""
local_error = ""
release_meta_info = {}
def run_script_command(cmd):
LOG.info("Running load import command: %s", " ".join(cmd))
result = subprocess.run(cmd, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, check=True, text=True)
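            # with check=True a non-zero exit raises CalledProcessError,
            # which propagates to the caller's except block; the error
            # branch of the return below is a defensive fallback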
return (result.stdout, None) if result.returncode == 0 else (None, result.stdout)
# Check if major-release-upload script exists in the iso
has_release_upload_script = os.path.isfile(os.path.join(
iso_mount_dir, 'upgrades', 'software-deploy', constants.MAJOR_RELEASE_UPLOAD_SCRIPT))
if has_release_upload_script:
# major-release-upload script is found. This iso supports upgrade from USM
try:
# Copy iso /upgrades/software-deploy/ to /opt/software/rel-<rel>/bin/
to_release_bin_dir = os.path.join(
constants.SOFTWARE_STORAGE_DIR, ("rel-%s" % to_release), "bin")
if os.path.exists(to_release_bin_dir):
shutil.rmtree(to_release_bin_dir)
shutil.copytree(os.path.join(iso_mount_dir, "upgrades", constants.SOFTWARE_DEPLOY_FOLDER),
to_release_bin_dir, symlinks=True)
# Run major-release-upload script
import_script = os.path.join(to_release_bin_dir, constants.MAJOR_RELEASE_UPLOAD_SCRIPT)
load_import_cmd = [
str(import_script),
f"--from-release={from_release}",
f"--to-release={to_release}",
f"--iso-dir={iso_mount_dir}"
]
load_import_info, load_import_error = run_script_command(load_import_cmd)
local_info += load_import_info or ""
local_error += load_import_error or ""
# Copy metadata.xml to /opt/software/rel-<rel>/
to_file = os.path.join(constants.SOFTWARE_STORAGE_DIR,
("rel-%s" % to_release), "metadata.xml")
metadata_file = os.path.join(iso_mount_dir, "upgrades", "metadata.xml")
shutil.copyfile(metadata_file, to_file)
# Update the release metadata
# metadata files have been copied over to the metadata/available directory
reload_release_data()
LOG.info("Updated release metadata for %s", to_release)
release_meta_info = self.get_release_meta_info(iso_mount_dir, upgrade_files)
return local_info, local_warning, local_error, release_meta_info
except Exception as e:
LOG.exception("Error occurred while running load import: %s", str(e))
raise
# At this step, major-release-upload script is not found in the iso
# Therefore, we run the local major-release-upload script which supports importing the N-1 iso
# that doesn't support USM feature.
# This is the special case where *only* DC system controller can import this iso
# TODO(ShawnLi): remove the code below when this special case is not supported
try:
local_import_script = os.path.join(
"/usr/sbin/software-deploy/", constants.MAJOR_RELEASE_UPLOAD_SCRIPT)
load_import_cmd = [local_import_script,
"--from-release=%s" % from_release,
"--to-release=%s" % to_release,
"--iso-dir=%s" % iso_mount_dir,
"--is-usm-iso=False"]
load_import_info, load_import_error = run_script_command(load_import_cmd)
local_info += load_import_info or ""
local_error += load_import_error or ""
# Update the release metadata
# metadata files have been copied over to the metadata/available directory
reload_release_data()
LOG.info("Updated release metadata for %s", to_release)
release_meta_info = {
os.path.basename(upgrade_files[constants.ISO_EXTENSION]): {
"id": constants.RELEASE_GA_NAME % to_release,
"sw_release": to_release,
},
os.path.basename(upgrade_files[constants.SIG_EXTENSION]): {
"id": None,
"sw_release": None,
}
}
return local_info, local_warning, local_error, release_meta_info
except Exception as e:
LOG.exception("Error occurred while running local load import script: %s", str(e))
raise
def get_release_meta_info(self, iso_mount_dir, upgrade_files) -> dict:
"""
Get release metadata information from metadata.xml
:param iso_mount_dir: ISO mount directory
:param upgrade_files: dict of upgrade files
:return: dict of release metadata info
"""
# Get release metadata
# NOTE(bqian) to_release is sw_version (MM.mm), the path isn't correct
# also prepatched iso needs to be handled.
# should go through the release_data to find the latest release of major release
# to_release
abs_meta_file_dir = os.path.join(iso_mount_dir, 'upgrades')
release_metadata_file_list = utils.find_file_by_regex(
abs_meta_file_dir, r'^([a-zA-Z]+)-([\d.]+)-metadata\.xml$')
if len(release_metadata_file_list) == 0:
raise SoftwareServiceError("No release metadata file found in %s" % abs_meta_file_dir)
release_metadata_file = release_metadata_file_list[0]
abs_stx_release_metadata_file = os.path.join(
iso_mount_dir, 'upgrades', release_metadata_file)
all_release_meta_info = parse_release_metadata(abs_stx_release_metadata_file)
return {
os.path.basename(upgrade_files[constants.ISO_EXTENSION]): {
"id": all_release_meta_info.get("id"),
"sw_release": all_release_meta_info.get("sw_version"),
},
os.path.basename(upgrade_files[constants.SIG_EXTENSION]): {
"id": None,
"sw_release": None,
}
}
def _clean_up_load_import(
self, iso_mount_dir, to_release, iso_file, is_import_completed, is_max_rel_exceeded):
"""
Clean up load and import
:param iso_mount_dir: ISO mount directory
:param to_release: To release
:param iso_file: ISO file
:param is_import_completed: Is import completed
:param is_max_rel_exceeded: Is max release exceeded
"""
# Unmount the iso file
if iso_mount_dir:
unmount_iso_load(iso_mount_dir)
LOG.info("Unmounted iso file %s", iso_file)
# remove upload leftover in case of failure
if to_release and not is_import_completed and not is_max_rel_exceeded:
to_release_dir = os.path.join(constants.SOFTWARE_STORAGE_DIR, "rel-%s" % to_release)
shutil.rmtree(to_release_dir, ignore_errors=True)
def _clean_up_inactive_load_import(self, release_version):
"""
Clean up inactive load and import
:param release_version: Release version
"""
dirs_to_remove = [
f"{constants.DC_VAULT_PLAYBOOK_DIR}/{release_version}",
f"{constants.DC_VAULT_LOADS_DIR}/{release_version}"
]
for dir_path in dirs_to_remove:
if os.path.exists(dir_path):
shutil.rmtree(dir_path, ignore_errors=True)
LOG.info("Removed %s", dir_path)
        # TODO(ShawnLi): the code below only cleans up files that were created by the usm_load_import script
# delete 22.12 iso metadata in /opt/software/metadata/unavailable
# delete 22.12 patches in /opt/software/metadata/committed
file_patterns = [
(states.UNAVAILABLE_DIR, fr'^([a-zA-Z]+)-({release_version})-metadata\.xml$'),
(states.COMMITTED_DIR, fr'^([a-zA-Z]+)_({release_version})_PATCH_([0-9]+)-metadata\.xml$')
]
# Remove files matching patterns
for directory, pattern in file_patterns:
matched_file_names = utils.find_file_by_regex(directory, pattern)
for filename in matched_file_names:
abs_filename = os.path.join(directory, filename)
try:
os.remove(abs_filename)
LOG.info("Removed: %s", abs_filename)
except OSError:
LOG.warning("Failed to remove: %s", abs_filename)
def _process_upload_upgrade_files(
self, from_release, to_release, iso_mount_dir, supported_from_releases, upgrade_files):
"""
Process the uploaded upgrade files
:param from_release: From release
:param to_release: To release
:param iso_mount_dir: ISO mount directory
:param supported_from_releases: List of supported releases
:param upgrade_files: dict of upgrade files
:return: info, warning, error messages, dict of release metadata info
"""
# validate this major release upload
self.major_release_upload_check()
try:
# Validate that the current release is supported to upgrade to the new release
supported_versions = [v.get("version") for v in supported_from_releases]
if SW_VERSION not in supported_versions:
raise UpgradeNotSupported("Current release %s not supported to upgrade to %s"
% (SW_VERSION, to_release))
# Run major-release-upload script
LOG.info("Starting load import from %s", upgrade_files[constants.ISO_EXTENSION])
return self._run_load_import(from_release, to_release, iso_mount_dir, upgrade_files)
except Exception as e:
LOG.exception("Error occurred while processing upload upgrade files: %s", str(e))
raise
def _process_inactive_upgrade_files(
self, from_release, to_release, iso_mount_dir, upgrade_files):
"""
Process the uploaded inactive upgrade files, aka N-1 release
:param from_release: From release
:param to_release: To release
:param iso_mount_dir: ISO mount directory
:param upgrade_files: dict of upgrade files
:return: info, warning, error messages, dict of release metadata info
"""
# validate this major release upload
self.major_release_upload_check()
to_release_maj_ver = utils.get_major_release_version(to_release)
try:
# Validate the N-1 release from the iso file is supported to upgrade to the current N release
current_upgrade_supported_versions = read_upgrade_support_versions(
"/usr/rootdirs/opt/")
supported_versions = [v.get("version") for v in current_upgrade_supported_versions]
# to_release is N-1 release in here
if to_release_maj_ver not in supported_versions:
raise UpgradeNotSupported(
"ISO file release version %s not supported to upgrade to %s" %
(to_release_maj_ver, SW_VERSION))
# iso validation completed
LOG.info("Starting load import from %s", upgrade_files[constants.ISO_EXTENSION])
# from_release is set to None when uploading N-1 load
return self._run_load_import(from_release, to_release, iso_mount_dir, upgrade_files)
except Exception as e:
LOG.exception("Error occurred while processing inactive upgrade files: %s", str(e))
raise
def _checkout_commit_to_dc_vault_playbook_dir(self, release_version):
"""
Checkout commit to dc-vault playbook dir
:param release_version: release version
:return: None
"""
dc_vault_playbook_dir = f"{constants.DC_VAULT_PLAYBOOK_DIR}/{release_version}"
os.makedirs(dc_vault_playbook_dir, exist_ok=True)
ostree_repo = os.path.join(constants.FEED_DIR,
"rel-%s/ostree_repo" % release_version)
try:
latest_commit = ostree_utils.get_feed_latest_commit(release_version)
LOG.info("Getting latest commit for %s: %s", release_version, latest_commit)
except OSTreeCommandFail as e:
LOG.exception("Error occurred while getting latest commit for %s: %s",
release_version, str(e))
raise
try:
LOG.info("Checking out commit %s to %s", latest_commit, dc_vault_playbook_dir)
ostree_utils.checkout_commit_to_dir(
ostree_repo, latest_commit, dc_vault_playbook_dir, sub_path=constants.PLAYBOOKS_PATH)
except Exception:
if os.path.exists(dc_vault_playbook_dir):
shutil.rmtree(dc_vault_playbook_dir)
raise
def _process_upload_patch_files(self, patch_files):
"""
Process the uploaded patch files
:param patch_files: list of patch files
:return: info, warning, error messages
"""
local_info = ""
local_warning = ""
local_error = ""
upload_patch_info = []
try:
# Create the directories
for state_dir in states.DEPLOY_STATE_METADATA_DIR:
os.makedirs(state_dir, exist_ok=True)
except os.error:
msg = "Failed to create directories"
LOG.exception(msg)
raise SoftwareFail(msg)
for patch_file in patch_files:
base_patch_filename = os.path.basename(patch_file)
# Get the release_id from the patch's metadata
# and check to see if it's already uploaded
release_id = get_release_from_patch(patch_file, 'id')
release = self.release_collection.get_release_by_id(release_id)
patch_id = None
thispatch = None
try:
if release:
if release.state == states.COMMITTED:
msg = "%s is committed. Metadata not updated" % release_id
LOG.info(msg)
local_info += msg + "\n"
elif release.state != states.AVAILABLE:
msg = "%s is not currently in available state to be deployed." % release_id
LOG.info(msg)
local_info += msg + "\n"
else:
# todo(abailey) PatchFile / extract_patch should be renamed
patch_id, thispatch, error_msg = PatchFile.extract_patch(
patch_file,
metadata_dir=states.AVAILABLE_DIR,
metadata_only=True,
existing_content=release.contents,
base_pkgdata=self.base_pkgdata)
if error_msg:
raise ReleaseValidationFailure(error=error_msg)
PatchFile.unpack_patch(patch_file)
reload_release_data()
msg = "%s is already uploaded. Updated metadata only" % release_id
LOG.info(msg)
local_info += msg + "\n"
else:
patch_id, thispatch, error_msg = PatchFile.extract_patch(
patch_file,
metadata_dir=states.AVAILABLE_DIR,
base_pkgdata=self.base_pkgdata)
if error_msg:
raise ReleaseValidationFailure(error=error_msg)
PatchFile.unpack_patch(patch_file)
local_info += "%s is now uploaded\n" % release_id
reload_release_data()
# NOTE(bqian) Below check an exception raise should be revisit,
# if applicable, should be applied to the beginning of all requests.
if len(self.hosts) == 0:
msg = "service is running in incorrect state. No registered host"
raise InternalError(msg)
except Exception as e:
msg = "Failed to upload release %s" % release_id
LOG.exception("%s: %s" % (msg, e))
local_error += msg + "\n"
if patch_id and thispatch:
PatchFile.delete_extracted_patch(patch_id, thispatch)
try:
release_sw_version = thispatch.metadata[patch_id]["sw_version"]
pkg_feed_dir = "%s/rel-%s" % (constants.PACKAGE_FEED_DIR, release_sw_version)
apt_utils.component_remove(pkg_feed_dir, release_sw_version)
except Exception:
LOG.info("Could not delete apt-ostree component, does not exist")
continue
release = self.release_collection.get_release_by_id(release_id)
if release:
upload_patch_info.append({
base_patch_filename: {
"id": release_id,
"sw_release": release.sw_release, # MM.mm.pp release version
}
})
# create versioned precheck for uploaded patches
for patch in upload_patch_info:
filename, values = list(patch.items())[0]
LOG.info("Creating precheck for release %s..." % values.get("id"))
for pf in patch_files:
if filename in pf:
patch_file = pf
sw_release = values.get("sw_release")
required_patches = []
for dep_id in self.release_collection.get_release_by_id(values.get("id")).requires_release_ids:
required_patches.append(version.parse(dep_id))
# sort the required patches list and get the latest, if available
req_patch_version = None
if len(required_patches) > 0:
req_patch = str(sorted(required_patches)[-1])
_, req_patch_version, _, _ = utils.get_component_and_versions(req_patch)
if self.release_collection.get_release_by_id(req_patch) is None:
LOG.warning("Required patch '%s' is not uploaded." % req_patch)
PatchFile.create_versioned_precheck(patch_file, sw_release, req_patch_version=req_patch_version)
return local_info, local_warning, local_error, upload_patch_info
def software_release_upload(self, release_files):
"""
Upload software release files
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
upload_info = []
is_importing_inactive_load = False
# Refresh data, if needed
self.base_pkgdata.loaddirs()
msg = "Uploading files: %s" % ",".join(release_files)
audit_log_info(msg)
# We now need to put the files in the category (patch or upgrade)
patch_files = []
upgrade_files = {}
for uploaded_file in release_files:
(_, ext) = os.path.splitext(uploaded_file)
if ext in [constants.PATCH_EXTENSION]:
patch_files.append(uploaded_file)
elif ext == constants.ISO_EXTENSION:
upgrade_files[constants.ISO_EXTENSION] = uploaded_file
elif ext == constants.SIG_EXTENSION:
upgrade_files[constants.SIG_EXTENSION] = uploaded_file
else:
msg = "The file extension is not supported. Supported extensions include .patch, .iso and .sig"
LOG.exception(msg)
raise ReleaseValidationFailure(error=msg)
if len(upgrade_files) == 1: # Only one upgrade file uploaded
msg = "Missing upgrade file or signature file"
LOG.error(msg)
msg_error += msg + "\n"
elif upgrade_files.get(constants.ISO_EXTENSION, None) and self.hostname != constants.CONTROLLER_0_HOSTNAME:
raise SoftwareServiceError("Upload can only be performed on controller-0.")
elif len(upgrade_files) == 2: # Two upgrade files uploaded
tmp_info = ""
tmp_error = ""
tmp_warning = ""
tmp_release_meta_info = {}
is_import_completed = True
is_max_rel_exceeded = False
iso = upgrade_files[constants.ISO_EXTENSION]
sig = upgrade_files[constants.SIG_EXTENSION]
if not verify_files([iso], sig):
msg = "Software %s:%s signature validation failed" % (iso, sig)
raise ReleaseValidationFailure(error=msg)
LOG.info("iso and signature files upload completed.")
try:
# Mount the iso file after signature verification
iso_mount_dir = mount_iso_load(iso, constants.TMP_DIR)
LOG.info("Mounted iso file %s to %s", iso, iso_mount_dir)
# Read the metadata from the iso file to get to-release and supported-from-releases
supported_from_releases = read_upgrade_support_versions(iso_mount_dir)
to_release = get_to_release_from_metadata_file(iso_mount_dir)
to_release_maj_ver = utils.get_major_release_version(to_release)
LOG.info("Reading metadata from iso file %s completed. \nto_release: %s", iso, to_release_maj_ver)
# Same release is uploaded, return the metadata info from the iso file
if to_release_maj_ver == SW_VERSION:
tmp_info = f"Uploaded release {to_release} is the same as current release on the controller"
tmp_release_meta_info = self.get_release_meta_info(iso_mount_dir, upgrade_files)
elif to_release > SW_VERSION:
# N + 1 release is uploaded, process it regardless
tmp_info, tmp_warning, tmp_error, tmp_release_meta_info = self._process_upload_upgrade_files(
SW_VERSION, to_release, iso_mount_dir, supported_from_releases, upgrade_files)
elif to_release < SW_VERSION and is_system_controller():
# N - 1 release is uploaded, process it only if the region is system controller
is_importing_inactive_load = True
tmp_info, tmp_warning, tmp_error, tmp_release_meta_info = self._process_inactive_upgrade_files(
None, to_release, iso_mount_dir, upgrade_files)
# Checkout commit to dc-vault/playbooks directory
self._checkout_commit_to_dc_vault_playbook_dir(to_release_maj_ver)
except MaxReleaseExceeded:
is_max_rel_exceeded = True
raise
except Exception as e:
LOG.error("Error occurred while processing software release upload: %s", str(e))
is_import_completed = False
raise
finally:
self._clean_up_load_import(iso_mount_dir, to_release, iso,
is_import_completed, is_max_rel_exceeded)
if is_importing_inactive_load and not is_import_completed:
self._clean_up_inactive_load_import(to_release)
msg_info += tmp_info
msg_warning += tmp_warning
msg_error += tmp_error
upload_info.append(tmp_release_meta_info)
if len(patch_files) > 0:
tmp_info, tmp_warning, tmp_error, tmp_patch_meta_info = self._process_upload_patch_files(
patch_files)
msg_info += tmp_info
msg_warning += tmp_warning
msg_error += tmp_error
upload_info += tmp_patch_meta_info
reload_release_data()
return dict(info=msg_info, warning=msg_warning, error=msg_error, upload_info=upload_info)
def release_apply_order(self, release_id, running_release_sw_version):
"""
Determines the order of releases for applying.
:param release_id: The applying release id
:param running_release_sw_version: The running release major version
:return: List of releases in the order for applying
"""
deployed_releases_id = []
preinstalled_patches = []
for rel in self.release_collection.iterate_releases():
if rel.state == states.DEPLOYED:
deployed_releases_id.append(rel.id)
if rel.prepatched_iso:
preinstalled_patches = rel.preinstalled_patches
release_dependencies = self.get_release_dependency_list(release_id, preinstalled_patches)
release_dependencies.append(release_id)
# filter release_dependencies to include only releases
# that matches the major running release version
# and remove all releases already deployed, including prepatched
to_apply_releases = [
rel_id for rel_id in release_dependencies
if f"-{running_release_sw_version}." in rel_id and
rel_id not in deployed_releases_id + preinstalled_patches
]
to_apply_releases.sort()
return to_apply_releases
def release_remove_order(self, target_release_id, running_release_id, running_release_sw_version):
"""
Determines the order of releases for removing based on the feed commit order.
:param target_release_id: The target release id
:param running_release_id: The running release id
:param running_release_sw_version: The running release major version
:return: List of releases in the order for removing
"""
# if the release being removed is not from the running major version, it cannot be removed
if f"-{running_release_sw_version}." not in target_release_id:
return []
releases = list(self.release_collection.iterate_releases_by_state(states.DEPLOYED))
release_map = {release.id: release for release in releases}
to_remove_releases = []
current = running_release_id
while current != target_release_id:
to_remove_releases.append(current)
current_release = release_map.get(current)
if not current_release:
error = f"Release {current} not found in releases map"
raise SoftwareServiceError(error=error)
next_release = next((r for r in releases if r.commit_id == current_release.base_commit_id), None)
if not next_release:
error = f"Release with commit id {current_release.base_commit_id} not found"
raise SoftwareServiceError(error=error)
current = next_release.id
return to_remove_releases
def reset_feed_commit(self, release):
commit_id = release.commit_id
if commit_id is None:
LOG.warning("Unable to find the commit id in metadata")
return
LOG.info("Reset feed to commit %s" % commit_id)
try:
feed_ostree_dir = "%s/rel-%s/ostree_repo" % \
(constants.FEED_OSTREE_BASE_DIR, release.sw_version)
apt_utils.run_rollback(feed_ostree_dir, commit_id)
self.latest_feed_commit = commit_id
except APTOSTreeCommandFail:
msg = "Failure when reseting commit %s" % commit_id
LOG.exception(msg)
raise APTOSTreeCommandFail(msg)
def software_release_delete_api(self, release_ids):
"""
Delete release(s)
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
# Protect against duplications
full_list = sorted(list(set(release_ids)))
not_founds = []
cannot_del = []
used_by_subcloud = []
release_list = []
for rel_id in full_list:
rel = self.release_collection.get_release_by_id(rel_id)
if rel is None:
not_founds.append(rel_id)
else:
if not rel.is_deletable:
cannot_del.append(rel_id)
elif rel.is_ga_release and is_system_controller():
subcloud_by_sw_version = get_subcloud_groupby_version()
if rel.sw_version in subcloud_by_sw_version:
used_by_subcloud.append(rel_id)
else:
release_list.append(rel_id)
else:
release_list.append(rel_id)
err_msg = ""
if not_founds:
list_str = ','.join(not_founds)
err_msg = f"Release{'' if len(not_founds) == 1 else 's'} {list_str} can not be found\n"
if cannot_del:
list_str = ','.join(cannot_del)
err_msg += (f"Release{'' if len(cannot_del) == 1 else 's'} {list_str} "
f"{'is' if len(cannot_del) == 1 else 'are'} not ready to be deleted\n")
if used_by_subcloud:
list_str = ','.join(used_by_subcloud)
err_msg += f"Release{'' if len(used_by_subcloud) == 1 else 's'} {list_str} still used by subcloud(s)"
if len(err_msg) > 0:
raise SoftwareServiceError(error=err_msg)
msg = "Deleting releases: %s" % ",".join(release_list)
LOG.info(msg)
audit_log_info(msg)
# Handle operation
for release_id in release_list:
release = self.release_collection.get_release_by_id(release_id)
release_sw_version = release.sw_version
# Delete ostree content if it exists.
# RPM based patches (from upgrades) will not have ostree contents
ostree_tar_filename = self.get_ostree_tar_filename(release_sw_version, release_id)
if os.path.isfile(ostree_tar_filename):
try:
os.remove(ostree_tar_filename)
except OSError:
msg = "Failed to remove ostree tarball %s" % ostree_tar_filename
LOG.exception(msg)
raise OSTreeTarFail(msg)
is_major_release = ReleaseState(release_ids=[release.id]).is_major_release_deployment()
if not is_major_release:
package_repo_dir = "%s/rel-%s" % (constants.PACKAGE_FEED_DIR, release_sw_version)
apt_utils.component_remove(package_repo_dir, release.sw_release)
# Delete upgrade iso file in folder
# TODO(heitormatsui): treat the prepatched iso scenario
metadata_file = "%s-metadata.xml" % release_id
delete_feed = False
to_release_iso_dir = os.path.join(constants.FEED_OSTREE_BASE_DIR, ("rel-%s" % release_sw_version))
if os.path.isdir(to_release_iso_dir):
# check if the release being deleted is related to this feed
if os.path.isfile("%s/upgrades/%s" % (to_release_iso_dir, metadata_file)):
delete_feed = True
if delete_feed:
try:
shutil.rmtree(to_release_iso_dir)
except OSError:
msg = "Failed to remove release iso %s folder" % to_release_iso_dir
LOG.exception(msg)
raise ReleaseIsoDeleteFailure(msg)
msg = "Deleted feed directory %s" % to_release_iso_dir
LOG.info(msg)
msg_info += msg + "\n"
# TODO(lbonatti): treat the upcoming versioning changes
PatchFile.delete_versioned_directory(release.sw_release)
# Delete N-1 load on system controller
if is_system_controller():
self._clean_up_inactive_load_import(release_sw_version)
try:
# Delete the metadata
metadata_dir = states.RELEASE_STATE_TO_DIR_MAP[release.state]
os.remove("%s/%s" % (metadata_dir, metadata_file))
except OSError:
# When deleting the load from a system controller, the unavailable
# and committed directories are cleaned up and, if the metadata file
# is located in one of those, it will result in an exception since
# it would have been already removed by the
# _clean_up_inactive_load_import method
if (
is_system_controller() and
(
metadata_dir == states.UNAVAILABLE_DIR or
metadata_dir == states.COMMITTED_DIR
)
):
msg = (
f"Metadata file already removed: {metadata_dir}/{metadata_file}"
)
LOG.warning(msg)
else:
msg = "Failed to remove metadata for %s" % release_id
LOG.exception(msg)
raise MetadataFail(msg)
self.delete_start_install_script(release_id)
self.delete_patch_activate_scripts(release_id)
reload_release_data()
msg = "%s has been deleted" % release_id
LOG.info(msg)
msg_info += msg + "\n"
# Refresh data, if needed
self.base_pkgdata.loaddirs()
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def in_sync_controller_api(self):
"""
Check if both controllers are in sync
by checking the database JSON file
"""
is_in_sync = is_deploy_state_in_sync()
return {"in_sync": is_in_sync}
def patch_init_release_api(self, release_id):
"""
Create an empty repo for a new release_id
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
msg = "Initializing repo for: %s" % release_id
LOG.info(msg)
audit_log_info(msg)
if release_id == SW_VERSION:
msg = "Rejected: Requested release %s is running release" % release_id
msg_error += msg + "\n"
LOG.info(msg)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
# Refresh data
self.base_pkgdata.loaddirs()
reload_release_data()
repo_dir[release_id] = "%s/rel-%s" % (repo_root_dir, release_id)
# Verify the release doesn't already exist
if os.path.exists(repo_dir[release_id]):
msg = "Patch repository for %s already exists" % release_id
msg_info += msg + "\n"
LOG.info(msg)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
# Create the repo
try:
# todo(jcasteli) determine if ostree change needs a createrepo equivalent
output = "UNDER CONSTRUCTION for OSTREE"
LOG.info("Repo[%s] updated:\n%s", release_id, output)
except Exception:
msg = "Failed to update the repo for %s" % release_id
LOG.exception(msg)
# Wipe out what was created
shutil.rmtree(repo_dir[release_id])
del repo_dir[release_id]
raise SoftwareFail(msg)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def patch_query_what_requires(self, patch_ids):
"""
Query the known patches to see which have dependencies on the specified patches
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
msg = "Querying what requires patches: %s" % ",".join(patch_ids)
LOG.info(msg)
audit_log_info(msg)
# First, verify that all specified patches exist
id_verification = True
for patch_id in patch_ids:
release = self.release_collection.get_release_by_id(patch_id)
if release is None:
msg = "Patch %s does not exist" % patch_id
LOG.error(msg)
msg_error += msg + "\n"
id_verification = False
if not id_verification:
return dict(info=msg_info, warning=msg_warning, error=msg_error)
required_patches = {}
for release in self.release_collection.iterate_releases():
for req_patch in release.requires_release_ids:
if req_patch not in patch_ids:
continue
if req_patch not in required_patches:
required_patches[req_patch] = []
required_patches[req_patch].append(release.id)
for patch_id in patch_ids:
if patch_id in required_patches:
iter_patch_list = required_patches[patch_id]
msg_info += "%s is required by: %s\n" % (patch_id, ", ".join(sorted(iter_patch_list)))
else:
msg_info += "%s is not required by any patches.\n" % patch_id
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def send_latest_feed_commit_to_agent(self):
"""
Notify the patch agent that the latest commit on the feed
repo has been updated
"""
# Skip sending messages if host not yet provisioned
if self.sock_out is None:
LOG.info("Skipping send feed commit to agent")
return
send_commit_to_agent = PatchMessageSendLatestFeedCommit()
self.socket_lock.acquire()
send_commit_to_agent.send(self.sock_out)
self.socket_lock.release()
def software_sync(self):
# Increment the software_op_counter here
self.inc_patch_op_counter()
if self.sock_out is None or self.install_local:
return True
# Send the sync requests
self.controller_neighbours_lock.acquire()
for n in self.controller_neighbours:
self.controller_neighbours[n].clear_synced()
self.controller_neighbours_lock.release()
msg = PatchMessageSyncReq()
self.socket_lock.acquire()
msg.send(self.sock_out)
self.socket_lock.release()
# Now we wait, up to two minutes. Future enhancement: wait on a condition
my_ip = cfg.get_mgmt_ip()
sync_rc = False
max_time = time.time() + 120
while time.time() < max_time:
all_done = True
self.controller_neighbours_lock.acquire()
for n in self.controller_neighbours:
if n != my_ip and not self.controller_neighbours[n].get_synced():
all_done = False
self.controller_neighbours_lock.release()
if all_done:
LOG.info("Sync complete")
sync_rc = True
break
time.sleep(0.5)
# Send hellos to the hosts now, to get queries performed
hello_agent = PatchMessageHelloAgent()
self.socket_lock.acquire()
hello_agent.send(self.sock_out)
self.socket_lock.release()
if not sync_rc:
LOG.info("Timed out waiting for sync completion")
return sync_rc
def software_release_query_cached(self, **kwargs):
query_state = None
if "show" in kwargs:
valid_query_states = [
states.AVAILABLE,
states.UNAVAILABLE,
states.DEPLOYED,
states.REMOVING,
states.COMMITTED,
states.DEPLOYING
]
if kwargs["show"] in valid_query_states:
query_state = kwargs["show"]
query_release = None
if "release" in kwargs:
query_release = kwargs["release"]
results = []
def filter_by_version():
for r in self.release_collection.iterate_releases():
if r.sw_version in query_release:
yield r
def filter_by_state():
for rel in self.release_collection.iterate_releases_by_state(query_state):
yield rel
if query_state is not None:
iterator = filter_by_state
elif query_release is not None:
iterator = filter_by_version
else:
iterator = self.release_collection.iterate_releases
for i in iterator():
data = i.to_query_dict()
results.append(data)
return results
def software_release_query_specific_cached(self, release_ids):
LOG.info("software release show")
results = []
for release_id in release_ids:
release = self.release_collection.get_release_by_id(release_id)
if release is not None:
results.append(release.to_query_dict())
return results
def get_dependencies(self, patch_ids, recursive):
dependencies = set()
patch_added = False
# Add patches to workset
for patch_id in sorted(patch_ids):
dependencies.add(patch_id)
patch_added = True
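# Worklist expansion: each pass adds the 'requires' of every release in
# the set; with recursive=False only a single expansion pass runs, since
# patch_added is reset to the 'recursive' flag.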
while patch_added:
patch_added = False
for patch_id in sorted(dependencies):
release = self.release_collection.get_release_by_id(patch_id)
for req in release.requires:
if req not in dependencies:
dependencies.add(req)
patch_added = recursive
return sorted(dependencies)
def patch_query_dependencies(self, patch_ids, **kwargs):
msg = "Patch query-dependencies %s" % patch_ids
LOG.info(msg)
audit_log_info(msg)
failure = False
results = {"patches": [],
"error": ""}
recursive = False
if kwargs.get("recursive") == "yes":
recursive = True
# Verify patch IDs
for patch_id in sorted(patch_ids):
release = self.release_collection.get_release_by_id(patch_id)
if release is None:
errormsg = "%s is unrecognized\n" % patch_id
LOG.info("patch_query_dependencies: %s", errormsg)
results["error"] += errormsg
failure = True
if failure:
LOG.info("patch_query_dependencies failed")
return results
results["patches"] = self.get_dependencies(patch_ids, recursive)
return results
def patch_commit(self, patch_ids, dry_run=False):
msg = "Patch commit %s" % patch_ids
LOG.info(msg)
audit_log_info(msg)
try:
if not os.path.exists(states.COMMITTED_DIR):
os.makedirs(states.COMMITTED_DIR)
except os.error:
msg = "Failed to create %s" % states.COMMITTED_DIR
LOG.exception(msg)
raise SoftwareFail(msg)
failure = False
recursive = True
cleanup_files = set()
results = {"info": "",
"error": ""}
# Ensure there are only REL patches
non_rel_list = []
for release in self.release_collection.iterate_releases():
if release.status != constants.STATUS_RELEASED:
non_rel_list.append(release.id)
if len(non_rel_list) > 0:
errormsg = "A commit cannot be performed with non-REL status patches in the system:\n"
for patch_id in non_rel_list:
errormsg += " %s\n" % patch_id
LOG.info("patch_commit rejected: %s", errormsg)
results["error"] += errormsg
return results
# Verify Release IDs
for patch_id in sorted(patch_ids):
release = self.release_collection.get_release_by_id(patch_id)
if release is None:
errormsg = "%s is unrecognized\n" % patch_id
LOG.info("patch_commit: %s", errormsg)
results["error"] += errormsg
failure = True
if failure:
LOG.info("patch_commit: Failed patch ID check")
return results
commit_list = self.get_dependencies(patch_ids, recursive)
# Check patch states
avail_list = []
for patch_id in commit_list:
release = self.release_collection.get_release_by_id(patch_id)
if release.state not in [states.DEPLOYED, states.COMMITTED]:
avail_list.append(patch_id)
if len(avail_list) > 0:
errormsg = "The following patches are not applied and cannot be committed:\n"
for patch_id in avail_list:
errormsg += " %s\n" % patch_id
LOG.info("patch_commit rejected: %s", errormsg)
results["error"] += errormsg
return results
# TODO(ShawnLi): Comment out for 24.09 release. This is gated to 25.03
# NOTE(lviera): Must include start scripts, refactor like self.delete_start_install_script(patch_id)
# for patch_id in commit_list:
# # Fetch file paths that need to be cleaned up to
# # free patch storage disk space
# pre_install_filename = self.release_data.metadata[patch_id].get("pre_install")
# post_install_filename = self.release_data.metadata[patch_id].get("post_install")
# if pre_install_filename:
# pre_install_script_path = "%s/%s_%s" % (root_scripts_dir, patch_id, pre_install_filename)
# post_install_script_path = "%s/%s_%s" % (root_scripts_dir, patch_id, post_install_filename)
# if os.path.exists(pre_install_script_path):
# cleanup_files.add(pre_install_script_path)
# if os.path.exists(post_install_script_path):
# cleanup_files.add(post_install_script_path)
# patch_sw_version = utils.get_major_release_version(
# self.release_data.metadata[patch_id]["sw_version"])
# abs_ostree_tar_dir = package_dir[patch_sw_version]
# software_tar_path = "%s/%s-software.tar" % (abs_ostree_tar_dir, patch_id)
# if os.path.exists(software_tar_path):
# cleanup_files.add(software_tar_path)
# Calculate disk space
disk_space = 0
for file in cleanup_files:
statinfo = os.stat(file)
disk_space += statinfo.st_size
if dry_run:
results["info"] = "This commit operation would free %0.2f MiB" % (disk_space / (1024.0 * 1024.0))
return results
# Do the commit
# Move the metadata to the committed dir
for patch_id in commit_list:
metadata_fname = "%s-metadata.xml" % patch_id
deployed_fname = os.path.join(states.DEPLOYED_DIR, metadata_fname)
committed_fname = os.path.join(states.COMMITTED_DIR, metadata_fname)
if os.path.exists(deployed_fname):
try:
shutil.move(deployed_fname, committed_fname)
except shutil.Error:
msg = "Failed to move the metadata for %s" % patch_id
LOG.exception(msg)
raise MetadataFail(msg)
# Delete the files
for file in cleanup_files:
try:
os.remove(file)
except OSError:
msg = "Failed to remove: %s" % file
LOG.exception(msg)
raise MetadataFail(msg)
reload_release_data()
results["info"] = "The releases have been committed."
return results
def query_host_cache(self):
output = []
self.hosts_lock.acquire()
for nbr in list(self.hosts):
host = self.hosts[nbr].get_dict()
host["interim_state"] = False
for patch_id in list(sc.interim_state):
if nbr in sc.interim_state[patch_id]:
host["interim_state"] = True
output.append(host)
self.hosts_lock.release()
return output
def any_patch_host_installing(self):
rc = False
with self.hosts_lock:
for host in self.hosts.values():
if host.state == constants.PATCH_AGENT_STATE_INSTALLING:
rc = True
break
return rc
def copy_install_scripts(self):
applying_states = [states.DEPLOYING, states.REMOVING]
for release in self.release_collection.iterate_releases():
pre_install = release.pre_install
post_install = release.post_install
folder = ["preinstall", "postinstall"]
if release.state in applying_states:
try:
for i, file in enumerate([pre_install, post_install]):
if file:
full_name_file = "%s_%s" % (release.id, file)
script_path = "%s/%s" % (root_scripts_dir, full_name_file)
dest_path = constants.PATCH_SCRIPTS_STAGING_DIR + "/" + folder[i]
dest_script_file = "%s/%s" % (dest_path, full_name_file)
if not os.path.exists(dest_path):
os.makedirs(dest_path, 0o700)
shutil.copyfile(script_path, dest_script_file)
os.chmod(dest_script_file, 0o700)
msg = "Creating install script %s for %s" % (full_name_file, release.id)
LOG.info(msg)
except shutil.Error:
msg = "Failed to copy the install script %s for %s" % (full_name_file, release.id)
LOG.exception(msg)
raise SoftwareError(msg)
else:
try:
# enumerate the full (pre, post) pair so folder[i] stays aligned
for i, file in enumerate([pre_install, post_install]):
    if not file:
        continue
    full_name_file = "%s_%s" % (release.id, file)
    script_path = "%s/%s/%s" % (constants.PATCH_SCRIPTS_STAGING_DIR, folder[i], full_name_file)
    if os.path.exists(script_path):
        os.remove(script_path)
        msg = "Removing install script %s for %s" % (full_name_file, release.id)
        LOG.info(msg)
except shutil.Error:
msg = "Failed to delete the install script %s for %s" % (full_name_file, release.id)
LOG.exception(msg)
def _update_state_to_peer(self):
self.socket_lock.acquire()
try:
state_update_msg = SoftwareMessageDeployStateUpdate()
state_update_msg.send(self.sock_out)
finally:
self.socket_lock.release()
def _sanitize_extra_options(self, value):
"""
Make sure the value has only allowed characters.
"""
# Only letters, numbers, space, -, and _ are allowed.
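# e.g. "region one" or "ignore_health-check" pass; values containing
# characters such as ';', '=' or quotes are rejected (examples hypothetical).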
if not re.match(r'^[\w\s\-]+$', value):
msg_error = f"Invalid value: '{value}'."
raise SoftwareServiceError(msg_error)
return value
def _parse_and_sanitize_extra_options(self, options_list):
"""
Validate, sanitize and convert a list of 'key=value' strings to a dictionary.
"""
for item in options_list:
if item.count('=') != 1:
msg_error = f"Invalid format: '{item}'. Expected format is key=value"
raise SoftwareServiceError(msg_error)
options = {}
for item in options_list:
key, value = item.split('=', 1)
key = self._sanitize_extra_options(key.strip())
value = self._sanitize_extra_options(value.strip())
if key in constants.RESERVED_WORDS_SET:
msg_error = f"{key} is a reserved word and can't be used."
raise SoftwareServiceError(msg_error)
options[key] = value
return options
def _release_basic_checks(self, deployment):
"""
Does basic sanity checks on the release data
:param deployment: release to be checked
:return: release object (if exists),
bool with success output,
strings with info, warning and error messages
"""
# We need to verify that the software release exists
release = self.release_collection.get_release_by_id(deployment)
if not release:
msg = "Software release version corresponding to the specified release " \
"%s does not exist." % deployment
LOG.error(msg)
msg = msg + " Try deleting and re-uploading the software for recovery."
raise SoftwareServiceError(error=msg)
return release
def _deploy_precheck(self, release_version: str, force: bool = False,
region_name: typing.Optional[str] = None, patch: bool = False,
**kwargs) -> dict:
"""
Verify if the system satisfies the prerequisites to upgrade to a specified deployment.
:param release_version: full release name, e.g. starlingx-MM.mm.pp
:param force: if True will ignore minor alarms during precheck
:param region_name: region_name
:param patch: if True, indicates the precheck is for a patch release
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
if region_name is None:
region_name = utils.get_local_region_name()
precheck_script = utils.get_precheck_script(release_version)
if not os.path.isfile(precheck_script) and patch:
# Precheck script may not be available for some patches
# In that case, report system as healthy with info message to proceed
self._save_precheck_result(release_version, healthy=True)
msg_info = f"No deploy-precheck script available for patch version {release_version}"
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=True)
if not os.path.isfile(precheck_script):
msg = "Release files for deployment %s are not present on the system, " \
"cannot proceed with the precheck." % release_version
LOG.error(msg)
msg_error = "Fail to perform deploy precheck. " \
"Uploaded release may have been damaged. " \
"Try delete and re-upload the release.\n"
self._save_precheck_result(release_version, healthy=False)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
if self.pre_bootstrap and not patch:
# Deploy precheck should be avoided in case of a major release.
msg_info = "Major release precheck is not valid in a pre-bootstrap scenario.\n"
self._save_precheck_result(release_version, healthy=True)
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=True)
if self.pre_bootstrap and not force:
# Deploy precheck may not be supported in prebootstrap environment if
# script access any of services like sysinv, keystone, etc.
msg_warning = "Pre-bootstrap environment may not support deploy precheck.\n" \
"Use --force option to execute deploy precheck script.\n"
self._save_precheck_result(release_version, healthy=True)
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=True)
deploy_in_progress = self._get_software_upgrade()
# parse local config file to pass parameters to precheck script
try:
cp = configparser.ConfigParser(interpolation=None)
cp.read(constants.SOFTWARE_CONFIG_FILE_LOCAL)
ks_section = dict(cp["keystone_authtoken"]) if cp.has_section("keystone_authtoken") else {}
auth_url = ks_section.get("auth_url")
username = ks_section.get("username")
password = ks_section.get("password")
project_name = ks_section.get("project_name")
user_domain_name = ks_section.get("user_domain_name")
project_domain_name = ks_section.get("project_domain_name")
except Exception as e:
msg = "Error parsing config file: %s." % str(e)
LOG.error(msg)
msg_error = "Fail to perform deploy precheck. Internal error has occured." \
"Try lock and unlock the controller for recovery.\n"
self._save_precheck_result(release_version, healthy=False)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
# Get releases info required for precheck
releases = self.software_release_query_cached()
preinstalled_patches = []
for release in releases:
if release['prepatched_iso']:
preinstalled_patches = release.get('preinstalled_patches', [])
break
for release in releases:
keys_to_delete = ['packages', 'summary', 'description',
'install_instructions', 'warnings', 'component']
for key in keys_to_delete:
del release[key]
# remove patch from requires if present in preinstalled_patches
if preinstalled_patches:
requires = release.get('requires', [])
common = set(requires) & set(preinstalled_patches)
if common:
release['requires'] = [id for id in requires if id not in common]
LOG.info("Removed %s from %s requires list, since these are prepatched"
% (common, release['release_id']))
cmd = [precheck_script,
"--auth_url=%s" % auth_url,
"--username=%s" % username,
"--password=%s" % password,
"--project_name=%s" % project_name,
"--user_domain_name=%s" % user_domain_name,
"--project_domain_name=%s" % project_domain_name,
"--region_name=%s" % region_name,
"--releases=%s" % json.dumps(releases),
"--options=%s" % json.dumps(kwargs.get("options", {})),
"--deploy_in_progress=%s" % json.dumps(deploy_in_progress)]
if force:
cmd.append("--force")
if patch:
cmd.append("--patch")
# Call precheck from the deployment files
precheck_return = subprocess.run(
cmd,
stderr=subprocess.STDOUT,
stdout=subprocess.PIPE,
check=False,
text=True,
)
system_healthy = None
if precheck_return.returncode in [constants.RC_SUCCESS, constants.RC_UNHEALTHY]:
system_healthy = precheck_return.returncode == constants.RC_SUCCESS
self._save_precheck_result(release_version, healthy=system_healthy)
msg_info += precheck_return.stdout
else:
self._save_precheck_result(release_version, healthy=False)
msg_error += precheck_return.stdout
return dict(info=msg_info, warning=msg_warning, error=msg_error, system_healthy=system_healthy)
def _get_release_additional_info(self, release):
"""
Get additional information related to the release in the precheck API.
:return: dict with release info.
"""
release_info = {}
running_release = self.release_collection.running_release
release_info["major_release"] = utils.is_upgrade_deploy(SW_VERSION, release.sw_release)
release_info["reboot_required"] = release.reboot_required
release_info["prepatched_iso"] = release.prepatched_iso
release_info["apply_operation"] = release > running_release
return release_info
def software_deploy_precheck_api(self, deployment: str, force: bool = False, region_name=None,
**kwargs) -> dict:
"""
Verify if the system satisfies the prerequisites to upgrade to a specified deployment.
:param deployment: full release name, e.g. starlingx-MM.mm.pp
:param force: if True will ignore minor alarms during precheck
:return: dict of info, warning and error messages
"""
release = self._release_basic_checks(deployment)
release_version = release.sw_release
# Check fields (MM.mm) of release_version to set patch flag
is_patch = (not utils.is_upgrade_deploy(SW_VERSION, release_version))
if not is_patch and socket.gethostname() != constants.CONTROLLER_0_HOSTNAME:
raise SoftwareServiceError(f"Deploy precheck for major releases needs to be executed in"
f" {constants.CONTROLLER_0_HOSTNAME} host.")
if kwargs.get("options"):
kwargs["options"] = self._parse_and_sanitize_extra_options(kwargs.get("options"))
ret = self._deploy_precheck(release_version, force, region_name, is_patch, **kwargs)
if ret:
if ret.get("system_healthy") is None:
ret["error"] = "Fail to perform deploy precheck. Internal error has occurred.\n" + \
ret.get("error")
elif not ret.get("system_healthy"):
ret["error"] = "The following issues have been detected, which prevent " \
"deploying %s\n" % deployment + ret.get("info")
release_info = self._get_release_additional_info(release)
ret.update(release_info)
return ret
def _deploy_upgrade_start(self, to_release, commit_id, **kwargs):
LOG.info("start deploy upgrade to %s from %s" % (to_release, SW_VERSION))
deploy_script_name = constants.DEPLOY_START_SCRIPT
cmd_path = utils.get_software_deploy_script(to_release, deploy_script_name)
if not os.path.isfile(cmd_path):
msg = f"{deploy_script_name} was not found"
LOG.error(msg)
raise SoftwareServiceError(f"{deploy_script_name} was not found. "
"The uploaded software could have been damaged. "
"Please delete the software and re-upload it")
major_to_release = utils.get_major_release_version(to_release)
k8s_ver = get_k8s_ver()
postgresql_port = str(cfg.alt_postgresql_port)
feed = os.path.join(constants.FEED_DIR,
"rel-%s/ostree_repo" % major_to_release)
LOG.info("k8s version %s" % k8s_ver)
upgrade_start_cmd = [cmd_path, SW_VERSION, major_to_release, k8s_ver, postgresql_port,
feed]
upgrade_start_cmd.append(commit_id if commit_id is not None else "0")  # argv entries must be strings
upgrade_start_cmd.append(json.dumps(kwargs.get("options")) if kwargs.get("options") is not None else "")
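# Resulting argv sketch (values hypothetical):
#   [cmd_path, SW_VERSION, "25.03", k8s_ver, postgresql_port,
#    "<FEED_DIR>/rel-25.03/ostree_repo", commit_id or "0", '{"option": "value"}']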
# pass in keystone auth through environment variables
# OS_AUTH_URL, OS_USERNAME, OS_PASSWORD, OS_PROJECT_NAME, OS_USER_DOMAIN_NAME,
# OS_PROJECT_DOMAIN_NAME, OS_REGION_NAME are in env variables.
keystone_auth = CONF.get('keystone_authtoken')
env = {}
env["OS_AUTH_URL"] = keystone_auth["auth_url"] + '/v3'
env["OS_USERNAME"] = keystone_auth["username"]
env["OS_PASSWORD"] = keystone_auth["password"]
env["OS_PROJECT_NAME"] = keystone_auth["project_name"]
env["OS_USER_DOMAIN_NAME"] = keystone_auth["user_domain_name"]
env["OS_PROJECT_DOMAIN_NAME"] = keystone_auth["project_domain_name"]
env["OS_REGION_NAME"] = keystone_auth["region_name"]
env["IGNORE_ERRORS"] = self.ignore_errors
try:
LOG.info("starting subprocess %s" % ' '.join(upgrade_start_cmd))
subprocess.Popen(upgrade_start_cmd, start_new_session=True, shell=False, env=env)
LOG.info("subprocess started")
return True
except subprocess.SubprocessError as e:
LOG.error("Failed to start command: %s. Error %s" % (' '.join(upgrade_start_cmd), e))
return False
def deploy_state_changed(self, new_state):
'''Handle 'deploy state change' event, invoked when operations complete. '''
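# Dispatch sketch: deploy_state_changed(DEPLOY_STATES.START_DONE) invokes
# DeployState.get_instance().start_done(); states outside the table are
# logged as errors rather than raised.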
deploy_state = DeployState.get_instance()
state_event = {
DEPLOY_STATES.START_DONE: deploy_state.start_done,
DEPLOY_STATES.START_FAILED: deploy_state.start_failed,
DEPLOY_STATES.ACTIVATE_DONE: deploy_state.activate_done,
DEPLOY_STATES.ACTIVATE_FAILED: deploy_state.activate_failed,
DEPLOY_STATES.ACTIVATE_ROLLBACK_DONE: deploy_state.activate_rollback_done,
DEPLOY_STATES.ACTIVATE_ROLLBACK_FAILED: deploy_state.activate_rollback_failed,
DEPLOY_STATES.HOST_FAILED: deploy_state.deploy_host_failed
}
if new_state in state_event:
state_event[new_state]()
else:
msg = f"Received invalid deploy state update {deploy_state}"
LOG.error(msg)
def host_deploy_state_changed(self, hostname, host_deploy_state):
'''Handle 'host deploy state change' event. '''
deploy_host_state = DeployHostState(hostname)
state_event = {
DEPLOY_HOST_STATES.FAILED: deploy_host_state.failed
}
if host_deploy_state in state_event:
state_event[host_deploy_state]()
else:
msg = f"Received invalid deploy host state update {host_deploy_state}"
LOG.error(msg)
def add_text_tag_to_xml(self, parent, tag, text):
'''Add text to tag. Create it if it does not exist'''
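# e.g. add_text_tag_to_xml(base, "commit", "abc123") yields
# <base><commit>abc123</commit></base>, reusing an existing <commit>
# element if present (tag/text values hypothetical).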
element = parent.find(tag)
if element is None:
element = ET.SubElement(parent, tag)
element.text = text
return element
def is_deployment_list_reboot_required(self, deployment_list):
"""Check if any deploy in deployment list is reboot required"""
for release_id in deployment_list:
release = self.release_collection.get_release_by_id(release_id)
if release.reboot_required:
return True
return False
def copy_patch_activate_scripts(self, release_id, activate_scripts_list):
"""Copy patch activate scripts to /etc/update.d"""
try:
existing_scripts = list(os.listdir(PATCH_MIGRATION_SCRIPT_DIR))
for script in activate_scripts_list:
full_name_file = "%s_%s" % (release_id, script)
script_path = "%s/%s" % (root_scripts_dir, full_name_file)
dest_script_file = "%s/%s" % (PATCH_MIGRATION_SCRIPT_DIR, script)
# Do not copy if script already exists in folder
if script in existing_scripts:
msg = "Script %s already exists in %s. Skipping copy" \
% (script, PATCH_MIGRATION_SCRIPT_DIR)
LOG.info(msg)
continue
shutil.copyfile(script_path, dest_script_file)
os.chmod(dest_script_file, 0o755)
msg = "Creating patch activate script %s for %s" \
% (full_name_file, release_id)
LOG.info(msg)
except shutil.Error:
msg = "Failed to copy patch activate script %s for %s" \
% (full_name_file, release_id)
LOG.exception(msg)
raise SoftwareError(msg)
def delete_all_patch_activate_scripts(self):
"""Delete all patch activate scripts in /etc/update.d"""
if os.path.exists(PATCH_MIGRATION_SCRIPT_DIR):
for script_name in os.listdir(PATCH_MIGRATION_SCRIPT_DIR):
script_path = os.path.join(PATCH_MIGRATION_SCRIPT_DIR, script_name)
try:
os.remove(script_path)
msg = "Deleted patch script: %s" % script_path
LOG.info(msg)
except Exception as e:
msg = "Failed to delete patch script %s. Reason: %s" % (script_path, e)
LOG.error(msg)
def _run_start_script(self, script_name, release_id, operation):
"""Run pre_start or post_start scripts"""
script_path = os.path.join(root_scripts_dir, f"{release_id}_{script_name}")
if os.path.isfile(script_path):
LOG.info("Running %s script", script_name)
try:
output = subprocess.check_output(
["sudo", script_path, f"--operation={operation}"],
stderr=subprocess.STDOUT,
text=True
)
LOG.info("%s output:\n%s" % (script_name, output.strip()))
except subprocess.CalledProcessError as e:
msg = "Failed to execute %s for release %s." % (script_name, release_id)
LOG.exception(msg)
LOG.error("Command output: %s", e.output)
raise SoftwareError(msg)
else:
LOG.warning("Script %s not found", script_name)
def cleanup_old_releases(self, target_commit, all_commits):
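# Assumes all_commits is ordered newest-first (elsewhere in this module
# all_commits[0] is taken as the latest feed commit); every release whose
# commit is newer than target_commit is collected for deletion.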
index = 0
to_delete_releases = []
while index < len(all_commits) and target_commit != all_commits[index]:
to_delete_release = self.release_collection.get_release_by_commit_id(all_commits[index])
if to_delete_release:
to_delete_releases.append(to_delete_release.id)
LOG.info("Deleting %s not used after prestage" % to_delete_release.id)
index += 1
# Delete metadata and all associated release files
self.software_release_delete_api(to_delete_releases)
def install_releases_thread(self, deployment_list, feed_repo, upgrade=False, **kwargs):
"""
Runs in a separate thread.
Install the debian packages, create the commit and update the metadata.
If it's an upgrade, also run the upgrade script.
"""
def run():
LOG.info("Installing releases on repo: %s" % feed_repo)
try:
deploy_sw_version = None
for release_id in deployment_list:
msg = "Starting deployment for: %s" % release_id
LOG.info(msg)
audit_log_info(msg)
deploy_release = self._release_basic_checks(release_id)
self.copy_patch_activate_scripts(release_id, deploy_release.activation_scripts)
# Run pre_start script
self._run_start_script(deploy_release.pre_start, release_id, constants.APPLY)
# Reload release in case pre_start script made some change
reload_release_data()
deploy_release = self._release_basic_checks(release_id)
deploy_sw_version = deploy_release.sw_version
all_commits = ostree_utils.get_all_feed_commits(deploy_release.sw_version)
latest_commit = all_commits[0]
target_commit = deploy_release.commit_id
if target_commit in all_commits:
# This case is for node with prestaged data where ostree
# commits have been pulled from system controller
LOG.info("Commit %s already exists in feed repo for release %s"
% (deploy_release.commit_id, release_id))
# If this is the last deployment and its commit is not the latest in the feed,
# delete the newer commits until this one is reached, and delete their metadata
if release_id == deployment_list[-1] and target_commit != latest_commit:
self.cleanup_old_releases(target_commit, all_commits)
# Reset feed to last deployment release
self.reset_feed_commit(deploy_release)
continue
packages = [pkg.split("_")[0] for pkg in deploy_release.packages]
if not packages:  # the list comprehension never yields None; treat empty as an error
msg = "Unable to determine packages to install"
LOG.error(msg)
raise MetadataFail(msg)
# Install debian package through apt-ostree
try:
apt_utils.run_install(
feed_repo,
deploy_release.sw_version,
deploy_release.sw_release,
packages)
except APTOSTreeCommandFail:
msg = "Failed to install Debian packages."
LOG.exception(msg)
raise APTOSTreeCommandFail(msg)
# Get the latest commit after performing "apt-ostree install".
self.latest_feed_commit = \
ostree_utils.get_feed_latest_commit(deploy_release.sw_version)
deploystate = deploy_release.state
metadata_dir = states.RELEASE_STATE_TO_DIR_MAP[deploystate]
metadata_file = "%s/%s-metadata.xml" % (metadata_dir, release_id)
reload_release_data()
# NOTE(bqian) The check and exception raise below should be revisited; if applicable,
# it should be applied at the beginning of all requests.
if len(self.hosts) == 0:
msg = "service is running in incorrect state. No registered host"
raise InternalError(msg)
with self.hosts_lock:
self.interim_state[release_id] = list(self.hosts)
self.latest_feed_commit = \
ostree_utils.get_feed_latest_commit(deploy_release.sw_version)
# Update metadata
tree = ET.parse(metadata_file)
root = tree.getroot()
contents = ET.SubElement(root, constants.CONTENTS_TAG)
ostree = ET.SubElement(contents, constants.OSTREE_TAG)
self.add_text_tag_to_xml(ostree, constants.NUMBER_OF_COMMITS_TAG, "1")
base = ET.SubElement(ostree, constants.BASE_TAG)
self.add_text_tag_to_xml(base, constants.COMMIT_TAG, latest_commit)
self.add_text_tag_to_xml(base, constants.CHECKSUM_TAG, "")
commit1 = ET.SubElement(ostree, constants.COMMIT1_TAG)
self.add_text_tag_to_xml(commit1, constants.COMMIT_TAG, self.latest_feed_commit)
self.add_text_tag_to_xml(commit1, constants.CHECKSUM_TAG, "")
ET.indent(tree, ' ')
with open(metadata_file, "wb") as outfile:
tree = ET.tostring(root)
outfile.write(tree)
LOG.info("Latest feed commit: %s added to metadata file" % self.latest_feed_commit)
# Run post_start script
self._run_start_script(deploy_release.post_start, release_id, constants.APPLY)
# If the ISO is prepatched, add a tombstone commit
ostree_utils.add_tombstone_commit_if_prepatched(constants.OSTREE_REF, feed_repo)
# Update the feed ostree summary
ostree_utils.update_repo_summary_file(feed_repo)
self.latest_feed_commit = ostree_utils.get_feed_latest_commit(deploy_sw_version)
self.send_latest_feed_commit_to_agent()
self.software_sync()
if upgrade:
base_deployment = deployment_list[0]
base_release = self._release_basic_checks(base_deployment)
upgrade_commit_id = base_release.commit_id
if self._deploy_upgrade_start(base_release.sw_release, upgrade_commit_id, **kwargs):
LOG.info("Finished releases %s deploy start" % deployment_list)
else:
raise ValueError("_deploy_upgrade_start failed")
else:
# move the deploy state to start-done
deploy_state = DeployState.get_instance()
deploy_state.start_done(self.latest_feed_commit)
LOG.info("Finished releases %s deploy start" % deployment_list)
except Exception as e:
msg = "Deploy start applying failed: %s" % str(e)
LOG.exception(msg)
audit_log_info(msg)
try:
# set state to failed
deploy_state = DeployState.get_instance()
deploy_state.start_failed()
except Exception as e:
msg = "Unable to set deploy failed: %s" % str(e)
LOG.exception(msg)
audit_log_info(msg)
thread = threading.Thread(target=run)
thread.start()
def _precheck_before_start(self, deployment, release_version, is_patch, force=False, **kwargs):
LOG.info("Running deploy precheck.")
precheck_result = self._deploy_precheck(release_version, patch=is_patch, force=force, **kwargs)
if precheck_result.get('system_healthy') is None:
precheck_result["error"] = (
f"Fail to perform deploy precheck. Internal error has occurred.\n"
f"{precheck_result['error']}"
)
return precheck_result
elif precheck_result.get('system_healthy') is False:
precheck_result["error"] = (
f"The following issues have been detected, which prevent deploying {deployment}\n"
f"{precheck_result['info']}\n"
"Please fix above issues then retry the deploy.\n"
)
return precheck_result
return None
def _get_precheck_result_file_path(self, release_version):
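# e.g. "/opt/software/rel-24.09.1/precheck-result.json" (version hypothetical)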
return os.path.join("/opt/software/", f"rel-{release_version}", "precheck-result.json")
def _safe_remove_precheck_result_file(self, release_version):
precheck_result_file = self._get_precheck_result_file_path(release_version)
if os.path.isfile(precheck_result_file):
os.remove(precheck_result_file)
def _save_precheck_result(self, release_version, healthy):
precheck_result_file = self._get_precheck_result_file_path(release_version)
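# Content sketch: {"healthy": true, "timestamp": 1718000000.0} (values
# hypothetical); read back by _should_run_precheck_prior_deploy_start.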
with open(precheck_result_file, "w") as f:
json.dump({"healthy": healthy, "timestamp": time.time()}, f)
def _should_run_precheck_prior_deploy_start(self, release_version, force, is_patch, **kwargs):
# there is no precheck script available in this state
if self.pre_bootstrap:
return False
# we should be able to patch an unhealthy system ignoring the unhealthy state
if is_patch and force:
return False
file_path = self._get_precheck_result_file_path(release_version)
if not os.path.isfile(file_path):
LOG.info("The precheck result file %s does not exist." % file_path)
return True
if kwargs:
return True
with open(file_path) as f:
last_result = json.load(f)
if time.time() - last_result["timestamp"] > constants.PRECHECK_RESULT_VALID_PERIOD:
LOG.info("The precheck result expired.")
return True
return not last_result["healthy"]
@require_deploy_state([None],
"There is already a deployment in progress ({state.value}). "
"Please complete/delete the current deployment.")
def software_deploy_start_api(self, deployment: str, force: bool, **kwargs) -> dict:
"""
Start deploy of a specified release.
The operation implies deploying all undeployed dependency releases of
the specified release. i.e., to deploy release 24.09.1, it implies
deploying 24.09.0 and 24.09.1 when 24.09.0 has not been deployed.
The operation includes steps:
1. find all undeployed dependency releases
2. ensure all releases (dependency and specified release) are ready to be deployed
3. precheck, if the last precheck was not executed, or it failed, or its result has expired
4. transform all involved releases to deploying state
5. start the deploy subprocess
"""
msg_info = ""
msg_warning = ""
msg_error = ""
deploy_release = self._release_basic_checks(deployment)
running_release = self.release_collection.running_release
deploy_sw_version = deploy_release.sw_version # MM.mm
is_patch = (not utils.is_upgrade_deploy(SW_VERSION, deploy_sw_version))
# pre-bootstrap patch removal case
if not self.pre_bootstrap:
if (not is_patch) and socket.gethostname() != constants.CONTROLLER_0_HOSTNAME:
raise SoftwareServiceError(f"Deploy start for major releases needs to be executed in "
f"{constants.CONTROLLER_0_HOSTNAME} host.")
feed_repo = "%s/rel-%s/ostree_repo" % (constants.FEED_OSTREE_BASE_DIR, deploy_sw_version)
commit_id = deploy_release.commit_id
# Set hostname in case of local install
hostname = None
if self.pre_bootstrap:
hostname = constants.PREBOOTSTRAP_HOSTNAME
elif self.install_local:
hostname = socket.gethostname()
valid_hostnames = [constants.CONTROLLER_0_HOSTNAME, constants.CONTROLLER_1_HOSTNAME]
if hostname not in valid_hostnames:
LOG.warning("Using unknown hostname for local install: %s", hostname)
to_release = deploy_release.sw_release
if kwargs.get("options"):
kwargs["options"] = self._parse_and_sanitize_extra_options(kwargs.get("options"))
if self._should_run_precheck_prior_deploy_start(to_release, force, is_patch, **kwargs):
LOG.info("Executing software deploy precheck prior to software deploy start")
if precheck_result := self._precheck_before_start(
deployment,
to_release,
is_patch=is_patch,
force=force,
**kwargs
):
return precheck_result
self._safe_remove_precheck_result_file(to_release)
# Patch operation: 'deploy release' major version equals 'running release' major version (MM.mm)
# TODO(bqian) update references of sw_release (string) to SWRelease object
if deploy_release > running_release:
operation = constants.APPLY
elif running_release > deploy_release:
operation = constants.REMOVE
else:
# NOTE(bqian) The error message doesn't seem right. software version format
# or any metadata semantic check should be done during upload. If data
# invalid found subsequently, data is considered damaged, should recommend
# delete and re-upload
msg_error += "The software version format for this release is not correct.\n"
return dict(info=msg_info, warning=msg_warning, error=msg_error)
# NOTE(bqian) shouldn't that patch release deploy and remove are doing the same thing
# in terms of ostree commit, that it deploy to a commit specified by the commit-id that
# associated to the release from the deploy start command?
# If releases are such that:
# R2 requires R1, R3 requires R2, R4 requires R3
# If current running release is R2 and command issued is "software deploy start R4"
# operation is "apply" with order [R3, R4]
# If current running release is R4 and command issued is "software deploy start R2"
# operation is "remove" with order [R4, R3]
if operation == constants.APPLY:
deployment_list = self.release_apply_order(deployment, deploy_sw_version)
collect_current_load_for_hosts(deploy_sw_version, hostname=hostname)
create_deploy_hosts(hostname=hostname)
msg = "Deploy start order for apply operation: %s" % ",".join(deployment_list)
LOG.info(msg)
audit_log_info(msg)
# todo(jcasteli) Do we need this block below?
# Check for patches that can't be applied during an upgrade
upgrade_check = True
for release_id in deployment_list:
release = self.release_collection.get_release_by_id(release_id)
if release.sw_version != SW_VERSION and release.apply_active_release_only == "Y":
msg = "%s cannot be created during an upgrade" % release_id
LOG.error(msg)
msg_error += msg + "\n"
upgrade_check = False
if not upgrade_check:
return dict(info=msg_info, warning=msg_warning, error=msg_error)
if kwargs.get("skip-semantic") != "yes":
self.run_semantic_check(constants.SEMANTIC_PREAPPLY, deployment_list)
running_release = self.release_collection.running_release
to_deploy_release_id = deployment_list[-1]
to_deploy_release = self.release_collection.get_release_by_id(to_deploy_release_id)
reboot_required = self.is_deployment_list_reboot_required(deployment_list)
collect_current_load_for_hosts(to_deploy_release.sw_version, hostname=hostname)
release_state = ReleaseState(release_ids=deployment_list)
release_state.start_deploy()
# Setting deploy state to start, so that it can transition to start-done or start-failed
deploy_state = DeployState.get_instance()
to_release = to_deploy_release.sw_release
if is_patch:
deploy_state.start(running_release, to_release, feed_repo, None, reboot_required)
else:
deploy_state.start(running_release, to_release, feed_repo, commit_id,
reboot_required, **kwargs)
# Start applying the releases
upgrade = not is_patch
self.install_releases_thread(deployment_list, feed_repo, upgrade, **kwargs)
msg_info += "%s is now starting, await for the states: " \
"[deploy-start-done | deploy-start-failed] in " \
"'software deploy show'\n" % deployment_list
elif operation == constants.REMOVE:
collect_current_load_for_hosts(deploy_sw_version, hostname=hostname)
create_deploy_hosts(hostname=hostname)
deployment_list = self.release_remove_order(deployment, running_release.id, running_release.sw_version)
msg = "Deploy start order for remove operation: %s" % ",".join(deployment_list)
LOG.info(msg)
audit_log_info(msg)
remove_unremovable = False
if kwargs.get("removeunremovable") == "yes":
remove_unremovable = True
# See if any of the patches are marked as unremovable
unremovable_verification = True
for release_id in deployment_list:
release = self.release_collection.get_release_by_id(release_id)
if release.unremovable:
if remove_unremovable:
msg = "Unremovable release %s being removed" % release_id
LOG.warning(msg)
msg_warning = msg + "\n"
else:
msg = "Release %s is not removable" % release_id
LOG.error(msg)
msg_error += msg + "\n"
unremovable_verification = False
elif release.state == states.COMMITTED:
msg = "Release %s is committed and cannot be removed" % release_id
LOG.error(msg)
msg_error += msg + "\n"
unremovable_verification = False
if not unremovable_verification:
return dict(info=msg_info, warning=msg_warning, error=msg_error)
if kwargs.get("skipappcheck") != "yes":
# Check application dependencies before removing
required_releases = {}
for release in deployment_list:
for appname, iter_release_list in self.app_dependencies.items():
if release in iter_release_list:
if release not in required_releases:
required_releases[release] = []
required_releases[release].append(appname)
if len(required_releases) > 0:
for req_release, app_list in required_releases.items():
msg = "%s is required by application(s): %s" % (req_release, ", ".join(sorted(app_list)))
msg_error += msg + "\n"
LOG.info(msg)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
if kwargs.get("skip-semantic") != "yes":
self.run_semantic_check(constants.SEMANTIC_PREREMOVE, deployment_list)
collect_current_load_for_hosts(deploy_sw_version, hostname=hostname)
release_state = ReleaseState(release_ids=deployment_list)
release_state.start_remove()
reboot_required = self.is_deployment_list_reboot_required(deployment_list)
deploy_state = DeployState.get_instance()
to_release = deploy_release.sw_release
deploy_state.start(running_release, to_release, feed_repo, commit_id, reboot_required)
try:
for release_id in deployment_list:
release = self.release_collection.get_release_by_id(release_id)
msg = "Removing release: %s" % release_id
LOG.info(msg)
audit_log_info(msg)
# Run pre_start script
self._run_start_script(release.pre_start, release_id, constants.REMOVE)
# Reload release in case pre_start script made some change
reload_release_data()
release = self.release_collection.get_release_by_id(release_id)
if release.state == states.AVAILABLE:
msg = "The deployment for %s has not been created" % release_id
LOG.info(msg)
msg_info += msg + "\n"
continue
self.copy_patch_activate_scripts(release_id, release.activation_scripts)
major_release_sw_version = release.sw_version
# this is an ostree patch
# Base commit is fetched from the patch metadata.
base_commit = release.base_commit_id
feed_repo = "%s/rel-%s/ostree_repo" % (constants.FEED_OSTREE_BASE_DIR, major_release_sw_version)
try:
# Reset the ostree HEAD
ostree_utils.reset_ostree_repo_head(base_commit, feed_repo)
# Delete all commits that belong to this release
# NOTE(bqian) there should be just one commit per release.
commit_to_delete = release.commit_id
ostree_utils.delete_ostree_repo_commit(commit_to_delete, feed_repo)
# Update the feed ostree summary
ostree_utils.update_repo_summary_file(feed_repo)
except OSTreeCommandFail:
LOG.exception("Failure while removing release %s.", release_id)
# Remove contents tag from metadata xml
self.remove_tags_from_metadata(release, constants.CONTENTS_TAG)
try:
# Move the metadata to the removing dir
self.release_collection.update_state([release_id], states.REMOVING)
msg_info += "%s has been removed from the repo\n" % release_id
except shutil.Error:
msg = "Failed to move the metadata for %s" % release_id
LOG.error(msg)
raise MetadataFail(msg)
if len(self.hosts) == 0:
msg = "service is running in incorrect state. No registered host"
raise InternalError(msg)
# only update latest_feed_commit if it is an ostree patch
if release.base_commit_id is not None:
# Base Commit in this release's metadata.xml file represents the latest commit
# after this release has been removed from the feed repo
self.latest_feed_commit = release.base_commit_id
with self.hosts_lock:
self.interim_state[release_id] = list(self.hosts)
# Run post_start script
self._run_start_script(release.post_start, release_id, constants.REMOVE)
# If the ISO is prepatched, add a tombstone commit
if ostree_utils.add_tombstone_commit_if_prepatched(constants.OSTREE_REF, feed_repo):
ostree_utils.update_repo_summary_file(feed_repo)
# There is no defined behavior for deploy start for patching releases, so
# move the deploy state to start-done
deploy_state = DeployState.get_instance()
deploy_state.start_done(self.latest_feed_commit)
self.send_latest_feed_commit_to_agent()
self.software_sync()
except Exception as e:
msg_error = "Deploy start removing failed"
msg = "%s: %s" % (msg_error, e)
LOG.exception(msg)
audit_log_info(msg)
# set state to failed
deploy_state = DeployState.get_instance()
deploy_state.start_failed()
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def remove_tags_from_metadata(self, release, tag):
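"""Remove the given top-level tag from the release's metadata xml file."""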
LOG.info("Removing %s tag from %s metadata" % (tag, release.id))
metadata_dir = states.RELEASE_STATE_TO_DIR_MAP[release.state]
metadata_path = "%s/%s-metadata.xml" % (metadata_dir, release.id)
tree = ET.parse(metadata_path)
root = tree.getroot()
metadata_tag = root.find(tag)
if metadata_tag is not None:
root.remove(metadata_tag)
ET.indent(tree, ' ')
with open(metadata_path, "wb") as outfile:
outdata = ET.tostring(root)
outfile.write(outdata)
def execute_delete_actions(self):
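"""Run the software-deploy-delete plugin command against the current deployment."""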
deploy = self.db_api_instance.get_current_deploy()
to_release = deploy.get("to_release")
from_release = deploy.get("from_release")
delete_cmd = f"/usr/bin/software-deploy-delete {from_release} {to_release} --is_major_release"
runner = DeployPluginRunner(deploy)
runner.execute(delete_cmd)
@require_deploy_state([DEPLOY_STATES.HOST_ROLLBACK_DONE, DEPLOY_STATES.COMPLETED, DEPLOY_STATES.START_DONE,
DEPLOY_STATES.START_FAILED],
"Deploy must be in the following states to be able to delete: %s, %s, %s, %s" % (
DEPLOY_STATES.HOST_ROLLBACK_DONE.value, DEPLOY_STATES.COMPLETED.value,
DEPLOY_STATES.START_DONE.value, DEPLOY_STATES.START_FAILED.value))
def software_deploy_delete_api(self) -> dict:
"""
Delete deployment and the data generated during the deploy.
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
deploy = self.db_api_instance.get_current_deploy()
to_release = deploy.get("to_release")
from_release = deploy.get("from_release")
deploy_state_instance = DeployState.get_instance()
# Except in the early stages of the deployment (e.g. deploy start),
# hosts must be unlocked and online, since during deploy delete a
# request is sent to all hosts to clear flags and temporary data
# created during the deployment procedure
if not self.pre_bootstrap:
if (deploy_state_instance.get_deploy_state() not in [DEPLOY_STATES.START_DONE,
DEPLOY_STATES.START_FAILED] and
not are_all_hosts_unlocked_and_online()):
msg = f"Hosts must be {constants.ADMIN_UNLOCKED} and {constants.AVAILABILITY_ONLINE}."
raise SoftwareServiceError(error=msg)
is_major_release = False
deploy_state = deploy_state_instance.get_deploy_state()
deploying_release_state = ReleaseState(release_state=states.DEPLOYING)
is_applying = deploying_release_state.has_release_id()
if deploy_state in [
DEPLOY_STATES.START_DONE, DEPLOY_STATES.START_FAILED, DEPLOY_STATES.COMPLETED]:
is_major_release = deploying_release_state.is_major_release_deployment() if is_applying else False
elif deploy_state == DEPLOY_STATES.HOST_ROLLBACK_DONE:
is_major_release = ReleaseState(
release_state=states.DEPLOYING).is_major_release_deployment()
# Only major release is required to be deleted on controller-0
# Patch deletion can take place on either controller
if is_major_release and self.hostname != constants.CONTROLLER_0_HOSTNAME:
raise SoftwareServiceError("Deploy delete can only be performed on controller-0.")
if DEPLOY_STATES.COMPLETED == deploy_state:
if is_applying:
major_release = utils.get_major_release_version(from_release)
# In case of a major release deployment set all the releases related to from_release to unavailable
if is_major_release:
unavailable_releases = []
for release in self.release_collection.iterate_releases():
if release.sw_version == major_release:
unavailable_releases.append(release.id)
ReleaseState(release_ids=unavailable_releases).replaced()
# Set deploying releases to deployed state.
deploying_release_state.deploy_completed()
else:
removing_release_state = ReleaseState(release_state=states.REMOVING)
removing_release_state.available()
elif DEPLOY_STATES.HOST_ROLLBACK_DONE == deploy_state:
major_release = utils.get_major_release_version(from_release)
release_state = ReleaseState(release_state=states.DEPLOYING)
release_state.available()
elif deploy_state in [DEPLOY_STATES.START_DONE, DEPLOY_STATES.START_FAILED]:
# TODO(bqian): this check is redundant; there should be no host deployed/deploying
# when the deploy is in START_DONE or START_FAILED states
hosts_states = []
for host in self.db_api_instance.get_deploy_host():
hosts_states.append(host.get("state"))
if (DEPLOY_HOST_STATES.DEPLOYED.value in hosts_states or
DEPLOY_HOST_STATES.DEPLOYING.value in hosts_states):
raise SoftwareServiceError(f"There are hosts already {DEPLOY_HOST_STATES.DEPLOYED.value} "
f"or in {DEPLOY_HOST_STATES.DEPLOYING.value} process")
if is_applying:
major_release = utils.get_major_release_version(to_release)
if is_major_release:
try:
# TODO(bqian) Move below function to a delete action
run_remove_temporary_data_script(to_release)
except subprocess.CalledProcessError as e:
msg_error = "Failed to delete deploy"
LOG.error("%s: %s" % (msg_error, e))
raise SoftwareServiceError(msg_error)
else:
deployment_list = deploying_release_state.get_release_ids()
for release in self.release_collection.iterate_releases():
if release.sw_release == from_release:
self.reset_feed_commit(release)
if release.id in deployment_list:
self.remove_tags_from_metadata(release, constants.CONTENTS_TAG)
deploying_release_state.available()
else:
msg_error = "Delete is not supported while removing a release"
LOG.error(msg_error)
raise SoftwareServiceError(msg_error)
if os.path.isfile(INSTALL_LOCAL_FLAG):
# Remove install local flag if enabled
try:
os.remove(INSTALL_LOCAL_FLAG)
except Exception:
msg_error = "Failed to clear install-local mode flag"
LOG.error(msg_error)
raise SoftwareServiceError(msg_error)
LOG.info("Software deployment in local installation mode is stopped")
if is_major_release:
if SW_VERSION == major_release:
msg_error = (
f"Deploy {major_release} can't be deleted as it is still the "
"current running software. An error may have occurred during the deploy.")
LOG.error(msg_error)
raise SoftwareServiceError(msg_error)
# Send message to agents to clean up their ostree environment
# if the deployment has completed or rolled-back successfully
finished_deploy_states = [DEPLOY_STATES.COMPLETED, DEPLOY_STATES.HOST_ROLLBACK_DONE]
if deploy_state in finished_deploy_states:
cleanup_req = SoftwareMessageDeployDeleteCleanupReq()
cleanup_req.major_release = utils.get_major_release_version(to_release)
cleanup_req.encode()
self.socket_lock.acquire()
cleanup_req.send(self.sock_out)
self.socket_lock.release()
self.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_CLEANUP_DEPLOYMENT_DATA,
fm_constants.FM_ALARM_STATE_CLEAR,
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, constants.CONTROLLER_FLOATING_HOSTNAME))
# execute deploy delete plugins
# NOTE(bqian) implemented for major release deploy delete only, as the delete action
# for patching is undefined, i.e., when a patch is applied, both the from and
# to releases are applied.
self.execute_delete_actions()
else:
self.delete_all_patch_activate_scripts()
msg_info += "Deploy deleted with success"
self.db_api_instance.delete_deploy_host_all()
self.db_api_instance.delete_deploy()
LOG.info("Deploy is deleted")
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def _deploy_complete(self):
is_all_hosts_in_deployed_state = all(host_state.get("state") == DEPLOY_HOST_STATES.DEPLOYED.value
for host_state in self.db_api_instance.get_deploy_host())
if not is_all_hosts_in_deployed_state:
raise SoftwareServiceError(f"Complete not allowed because there are hosts not"
f" in {DEPLOY_HOST_STATES.DEPLOYED.value} state.")
return True
@require_deploy_state([DEPLOY_STATES.ACTIVATE_DONE],
"Deploy must be in %s state to be able to complete." % DEPLOY_STATES.ACTIVATE_DONE.value)
def software_deploy_complete_api(self) -> dict:
"""
Completes a deployment associated with the release
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
deploy_state = DeployState.get_instance()
if self._deploy_complete():
deploy_state.completed()
msg_info += "Deployment has been completed\n"
try:
# the sysinv evaluate_apps_reapply function needs to
# be triggered after the deploy complete.
trigger_evaluate_apps_reapply({"type": "usm-upgrade-complete"})
except Exception as e:
LOG.error("The attempt to trigger the evaluate apps reapply \
failed with message: %s", e)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def _activate(self):
deploy = self.db_api_instance.get_deploy_all()
if deploy:
deploy = deploy[0]
else:
msg = "Deployment is missing unexpectedly"
raise InvalidOperation(msg)
cmd_path = "/usr/bin/software-deploy-activate"
from_release = deploy.get("from_release")
to_release = deploy.get("to_release")
if self.pre_bootstrap:
activate_cmd = [cmd_path, from_release, to_release]
else:
activate_cmd = ["source", "/etc/platform/openrc;", cmd_path, from_release, to_release]
deploying = ReleaseState(release_state=states.DEPLOYING)
if deploying.is_major_release_deployment():
activate_cmd.append('--is_major_release')
env = os.environ.copy()
env["ANSIBLE_LOG_PATH"] = SOFTWARE_LOG_FILE
if not self.pre_bootstrap:
token, endpoint = utils.get_endpoints_token()
env["OS_AUTH_TOKEN"] = token
env["SYSTEM_URL"] = re.sub('/v[1,9]$', '', endpoint) # remove ending /v1
env["IGNORE_ERRORS"] = self.ignore_errors
try:
LOG.info("starting subprocess %s" % ' '.join(activate_cmd))
subprocess.Popen(' '.join(activate_cmd), start_new_session=True, shell=True, env=env)
LOG.info("subprocess started")
except subprocess.SubprocessError as e:
LOG.error("Failed to start command: %s. Error %s" % (' '.join(activate_cmd), e))
return False
return True
def _check_pre_activate(self):
if not self.pre_bootstrap:
if not are_all_hosts_unlocked_and_online():
msg = f"Hosts must be {constants.ADMIN_UNLOCKED} and {constants.AVAILABILITY_ONLINE}."
raise SoftwareServiceError(error=msg)
# check current deployment, deploy to all hosts have completed,
# the deploy state is host-done, or
# activate-failed' as reattempt from a previous failed activate
deploy_state = DeployState.get_deploy_state()
if deploy_state not in [DEPLOY_STATES.HOST_DONE, DEPLOY_STATES.ACTIVATE_FAILED]:
msg = "Must complete deploying all hosts before activating the deployment"
raise InvalidOperation(msg)
deploy_hosts = self.db_api_instance.get_deploy_host()
invalid_hosts = []
for deploy_host in deploy_hosts:
if deploy_host['state'] not in [states.DEPLOYED]:
invalid_hosts.append(deploy_host)
if len(invalid_hosts) > 0:
msg = "All hosts must have completed deployment before activating the deployment"
for invalid_host in invalid_hosts:
msg += "%s: %s\n" % (invalid_host["hostname"], invalid_host["state"])
raise InvalidOperation(msg)
@require_deploy_state([DEPLOY_STATES.ACTIVATE, DEPLOY_STATES.ACTIVATE_DONE, DEPLOY_STATES.ACTIVATE_FAILED,
DEPLOY_STATES.COMPLETED, DEPLOY_STATES.HOST, DEPLOY_STATES.HOST_DONE,
DEPLOY_STATES.HOST_FAILED],
"Deploy must be in the following states to be able to abort: %s, %s, %s, %s, %s, %s, %s" %
(DEPLOY_STATES.ACTIVATE.value, DEPLOY_STATES.ACTIVATE_DONE.value,
DEPLOY_STATES.ACTIVATE_FAILED.value, DEPLOY_STATES.COMPLETED.value, DEPLOY_STATES.HOST.value,
DEPLOY_STATES.HOST_DONE.value, DEPLOY_STATES.HOST_FAILED.value))
def software_deploy_abort_api(self) -> dict:
"""
Aborts the deployment associated with the release
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
deploy = self.db_api_instance.get_current_deploy()
from_release = deploy.get("from_release")
to_release = deploy.get("to_release")
from_release_deployment = self.release_collection.get_release_id_by_sw_release(from_release)
to_release_deployment = self.release_collection.get_release_id_by_sw_release(to_release)
try:
is_major_release = ReleaseState(release_state=states.DEPLOYING).is_major_release_deployment()
except AttributeError:
release = self.release_collection.get_release_by_id(to_release_deployment)
is_major_release = ReleaseState(release_ids=[release.id]).is_major_release_deployment()
if not is_major_release:
removing_release_state = ReleaseState(release_state=states.REMOVING)
is_removing = removing_release_state.has_release_id()
if is_removing:
raise SoftwareServiceError("Abort operation is not supported in patch removal")
from_deployment = self.release_collection.get_release_by_id(from_release_deployment)
self.reset_feed_commit(from_deployment)
self.send_latest_feed_commit_to_agent()
self.software_sync()
major_from_release = utils.get_major_release_version(from_release)
feed_repo = "%s/rel-%s/ostree_repo" % (constants.FEED_OSTREE_BASE_DIR, major_from_release)
deploy_release = self._release_basic_checks(from_release_deployment)
commit_id = deploy_release.commit_id
# TODO(lbonatti): remove this condition when commit-id is built into GA metadata.
if is_major_release and commit_id in [constants.COMMIT_DEFAULT_VALUE, None]:
commit_id = ostree_utils.get_feed_latest_commit(deploy_release.sw_version)
# Update the deployment
deploy_state = DeployState.get_instance()
deploy_state.abort(feed_repo, commit_id)
# Update the host deployment
deploy_host = self.db_api_instance.get_deploy_host()
for host in deploy_host:
hostname = host.get("hostname")
deploy_host_state = DeployHostState(hostname)
deploy_host_state.abort()
msg_info += "Deployment has been aborted\n"
return dict(info=msg_info, warning=msg_warning, error=msg_error)
@require_deploy_state([DEPLOY_STATES.HOST_DONE, DEPLOY_STATES.ACTIVATE_FAILED],
"Activate deployment only when current deployment state is {require_states}")
def software_deploy_activate_api(self) -> dict:
"""
Activates the deployment associated with the release
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
self._check_pre_activate()
deploy_state = DeployState.get_instance()
deploy_state.activate()
try:
self._activate()
msg_info = "Deploy activate has started"
except Exception:
deploy_state.activate_failed()
raise
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def _activate_rollback_major_release(self, deploy):
cmd_path = "/usr/bin/software-deploy-activate-rollback"
from_release = utils.get_major_release_version(deploy.get("from_release"))
to_release = utils.get_major_release_version(deploy.get("to_release"))
token, endpoint = utils.get_endpoints_token()
env = os.environ.copy()
env["ANSIBLE_LOG_PATH"] = SOFTWARE_LOG_FILE
env["OS_AUTH_TOKEN"] = token
env["SYSTEM_URL"] = re.sub('/v[1,9]$', '', endpoint) # remove ending /v1
env["IGNORE_ERRORS"] = self.ignore_errors
upgrade_activate_rollback_cmd = [
"source", "/etc/platform/openrc;", cmd_path, from_release, to_release]
# check if LVM snapshots are enabled and try to restore them
# TODO(heitormatsui): we don't really need to verify the system mode
# as LVM snapshots will only be allowed if the system is AIO-SX
system_mode = utils.get_platform_conf("system_mode")
if system_mode == constants.SYSTEM_MODE_SIMPLEX:
deploy = self.db_api_instance.get_deploy_all()[0]
options = deploy.get("options", {})
enabled_lvm_snapshots = to_bool(options.get("snapshot"))
if enabled_lvm_snapshots:
LOG.info("LVM snapshots are enabled")
manager = lvm_snapshot.LVMSnapshotManager()
success = manager.restore_snapshots()
if success:
LOG.info("LVM snapshots were restored, upgrade scripts with "
"action=activate-rollback will be skipped")
deploy_state = DeployState.get_instance()
deploy_state.activate_rollback_done()
return
else:
LOG.warning("Failure restoring LVM snapshots, falling back "
"to standard activate-rollback procedure")
try:
LOG.info("starting subprocess %s" % ' '.join(upgrade_activate_rollback_cmd))
subprocess.Popen(' '.join(upgrade_activate_rollback_cmd), start_new_session=True, shell=True, env=env)
LOG.info("subprocess started")
except subprocess.SubprocessError as e:
LOG.error("Failed to start command: %s. Error %s" % (' '.join(upgrade_activate_rollback_cmd), e))
raise
def _activate_rollback_patching_release(self):
deploy_state = DeployState.get_instance()
# patching release activate-rollback operations go here
deploy_state.activate_rollback_done()
def _activate_rollback(self):
deploy = self.db_api_instance.get_current_deploy()
if not deploy:
msg = "Deployment is missing unexpectedly"
raise InvalidOperation(msg)
deploying = ReleaseState(release_state=states.DEPLOYING)
if deploying.is_major_release_deployment():
self._activate_rollback_major_release(deploy)
else:
self._activate_rollback_patching_release()
@require_deploy_state([DEPLOY_STATES.ACTIVATE_ROLLBACK_PENDING, DEPLOY_STATES.ACTIVATE_ROLLBACK_FAILED],
"Activate-rollback deployment only when current deployment state is {require_states}")
def software_deploy_activate_rollback_api(self) -> dict:
"""
Rolls back the activation of the deployment associated with the release
:return: dict of info, warning and error messages
"""
msg_info = ""
msg_warning = ""
msg_error = ""
deploy_state = DeployState.get_instance()
deploy_state.activate_rollback()
try:
self._activate_rollback()
msg_info = "Deploy activate-rollback has started"
except Exception:
deploy_state.activate_rollback_failed()
raise
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def software_deploy_show_api(self, from_release=None, to_release=None):
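"""Retrieve the deployment data, augmented with additional release info."""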
# Retrieve deploy state from db
if from_release and to_release:
deploy_data = self.db_api_instance.get_deploy(from_release, to_release)
if not deploy_data:
return deploy_data
release_deployment = deploy_data["to_release"]
else:
# Retrieve deploy state from db in list format
deploy_data = self.db_api_instance.get_deploy_all()
if not deploy_data:
return deploy_data
release_deployment = deploy_data[0]["to_release"]
release_id = self.release_collection.get_release_id_by_sw_release(release_deployment)
release = self._release_basic_checks(release_id)
release_info = self._get_release_additional_info(release)
if isinstance(deploy_data, list):
deploy_data[0].update(release_info)
else:
deploy_data.update(release_info)
return deploy_data
def _deploy_host(self, hostname, force, async_req=False, rollback=False):
msg_info = ""
msg_warning = ""
msg_error = ""
try:
ip = utils.gethostbyname(hostname)
except socket.gaierror:
msg_error += "Host %s not found\n" % hostname
return dict(info=msg_info, warning=msg_warning, error=msg_error)
# NOTE(bqian) Get IP address to fulfill the needs of the patching structure.
# Need to review the design.
# ensure ip is in table as in some cases the host is aged out from the hosts table
if ip not in self.hosts:
raise HostIpNotFound(hostname)
# check if host agent is reachable via message
self.hosts[ip].is_alive = False
check_alive_req = SoftwareMessageCheckAgentAliveReq()
check_alive_req.ip = ip
self.socket_lock.acquire()
check_alive_req.send(self.sock_out)
self.socket_lock.release()
time.sleep(5) # sleep 5 seconds for agent to reply
if not self.hosts[ip].is_alive:
raise HostAgentUnreachable(hostname)
is_major_release = self.check_upgrade_in_progress()
deploy_host = self.db_api_instance.get_deploy_host_by_hostname(hostname)
if deploy_host is None:
raise HostNotFound(hostname)
deploy = self.db_api_instance.get_deploy_all()[0]
# Determine reboot required from deployment info
self.allow_insvc_patching = True
is_reboot_req = deploy.get(constants.REBOOT_REQUIRED, False)
if is_reboot_req:
self.allow_insvc_patching = False
# for rr patch in pre bootstrap
if self.pre_bootstrap:
self.allow_insvc_patching = True
commit_id = deploy.get("commit_id")
if not self.install_local:
deploy_host_validations(
hostname,
is_major_release=is_major_release,
rollback=rollback
)
deploy_state = DeployState.get_instance()
deploy_host_state = DeployHostState(hostname)
deploy_state.deploy_host()
deploy_host_state.deploy_started()
# if in a 'deploy host' reentrant scenario, i.e. retrying after
# a failure, then clear the failure alarm before retrying
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, hostname)
self.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
fm_constants.FM_ALARM_STATE_CLEAR,
entity_instance_id)
msg = "Running software deploy host for %s (%s), force=%s, async_req=%s" % (
hostname, ip, force, async_req)
LOG.info(msg)
audit_log_info(msg)
if not is_major_release and self.allow_insvc_patching:
LOG.info("Allowing in-service patching")
force = True
self.copy_install_scripts()
# Check if there is a major release deployment in progress
# and set agent request parameters accordingly
major_release = None
additional_data = {}
if is_major_release:
upgrade_release = self.get_software_upgrade()
major_release = upgrade_release["to_release"]
force = False
async_req = False
msg = "Running major release deployment, major_release=%s, force=%s, async_req=%s, commit_id=%s" % (
major_release, force, async_req, commit_id)
msg_info += msg + "\n"
LOG.info(msg)
try:
copy_pxeboot_update_file(major_release, rollback=rollback)
copy_pxeboot_cfg_files(major_release)
except Exception:
LOG.error("Fail to start deploy host")
deploy_host_state.deploy_failed()
raise
# TODO(bqian) The code below is for upgrading to stx-10. Besides being specific to
# that upgrade path, the solution is also temporary. Need a better design with smooth
# support of host deploy with predetermined parameters
impacted_upgrade = ["24.09", "22.12"]
if upgrade_release["to_release"] in impacted_upgrade and \
upgrade_release["from_release"] in impacted_upgrade:
if rollback:
oot_drivers = ""
else:
try:
oot_drivers = get_oot_drivers()
except ServiceParameterNotFound:
# the oot_drivers should be identical to the new default service parameter declared in
# config/controllerconfig/controllerconfig/upgrade-scripts/26-add-service-parameter.py#L52
oot_drivers = "ice,i40e,iavf"
additional_data.update({"out-of-tree-drivers": oot_drivers})
self.hosts_lock.acquire()
self.hosts[ip].install_pending = True
self.hosts[ip].install_status = False
self.hosts[ip].install_reject_reason = None
self.hosts_lock.release()
installreq = PatchMessageAgentInstallReq(additional_data)
installreq.ip = ip
installreq.force = force
installreq.major_release = major_release
installreq.commit_id = commit_id
installreq.encode()
self.socket_lock.acquire()
installreq.send(self.sock_out)
self.socket_lock.release()
if async_req:
# async_req install requested, so return now
msg = "Host deployment request sent to %s." % self.hosts[ip].hostname
msg_info += msg + "\n"
LOG.info("host-install async_req: %s", msg)
# TODO(bqian) update deploy state to deploy-host
# Now we wait, up to ten minutes. Future enhancement: wait on a condition
resp_rx = False
max_time = time.time() + 600
success = True
# NOTE(bqian) loop below blocks REST API service (slow thread)
# Consider remove.
while time.time() < max_time:
self.hosts_lock.acquire()
if ip not in self.hosts:
# The host aged out while we were waiting
self.hosts_lock.release()
success = False
msg = "Agent expired while waiting: %s" % ip
msg_error += msg + "\n"
LOG.error("Error in host-install: %s", msg)
break
if not self.hosts[ip].install_pending:
# We got a response
resp_rx = True
if self.hosts[ip].install_status:
msg = "Host deployment was successful on %s." % self.hosts[ip].hostname
msg_info += msg + "\n"
LOG.info("host-install: %s", msg)
elif self.hosts[ip].install_reject_reason:
msg = "Host deployment rejected by %s. %s" % (
self.hosts[ip].hostname,
self.hosts[ip].install_reject_reason)
msg_error += msg + "\n"
LOG.error("Error in host-install: %s", msg)
success = False
else:
msg = "Host deployment failed on %s." % self.hosts[ip].hostname
msg_error += msg + "\n"
LOG.error("Error in host-install: %s", msg)
success = False
self.hosts_lock.release()
break
self.hosts_lock.release()
time.sleep(0.5)
if not resp_rx:
msg = "Timeout occurred while waiting response from %s." % ip
msg_error += msg + "\n"
LOG.error("Error in host-install: %s", msg)
success = False
if not success:
deploy_host_state.deploy_failed()
return dict(info=msg_info, warning=msg_warning, error=msg_error)
@require_deploy_state([DEPLOY_STATES.START_DONE, DEPLOY_STATES.HOST, DEPLOY_STATES.HOST_FAILED],
"Current deployment ({state.value}) is not ready to deploy host")
def software_deploy_host_api(self, hostname, force, async_req=False):
return self._deploy_host(hostname, force, async_req)
@require_deploy_state([DEPLOY_STATES.ACTIVATE_ROLLBACK_DONE,
DEPLOY_STATES.HOST_ROLLBACK, DEPLOY_STATES.HOST_ROLLBACK_FAILED],
"Current deployment ({state.value}) is not ready to rollback host")
def software_deploy_host_rollback_api(self, hostname, force, async_req=False):
return self._deploy_host(hostname, force, async_req, rollback=True)
def drop_host(self, host_ip, sync_nbr=True):
msg_info = ""
msg_warning = ""
msg_error = ""
ip = host_ip
self.hosts_lock.acquire()
# If not in hosts table, maybe a hostname was used instead
if host_ip not in self.hosts:
try:
# Because the host may be getting dropped due to deletion,
# we may be unable to do a hostname lookup. Instead, we'll
# iterate through the table here.
for host in list(self.hosts):
if host_ip == self.hosts[host].hostname:
ip = host
break
if ip not in self.hosts:
# No matching hostname was found, or the IP isn't in the table.
# Raise an exception to drop out to the failure handling
raise SoftwareError("Host IP (%s) not in table" % ip)
except Exception:
self.hosts_lock.release()
msg = "Unknown host specified: %s" % host_ip
msg_error += msg + "\n"
LOG.error("Error in drop-host: %s", msg)
return dict(info=msg_info, warning=msg_warning, error=msg_error)
msg = "Running drop-host for %s (%s)" % (host_ip, ip)
LOG.info(msg)
audit_log_info(msg)
del self.hosts[ip]
for patch_id in list(self.interim_state):
if ip in self.interim_state[patch_id]:
self.interim_state[patch_id].remove(ip)
self.hosts_lock.release()
if sync_nbr:
sync_msg = PatchMessageDropHostReq()
sync_msg.ip = ip
self.socket_lock.acquire()
sync_msg.send(self.sock_out)
self.socket_lock.release()
return dict(info=msg_info, warning=msg_warning, error=msg_error)
def check_releases_state(self, release_ids, state):
"""check all releases to be in the specified state"""
all_matched = True
for release_id in release_ids:
release = self.release_collection.get_release_by_id(release_id)
if release is None:
all_matched = False
break
if release.state != state:
all_matched = False
break
return all_matched
def is_available(self, release_ids):
return self.check_releases_state(release_ids, states.AVAILABLE)
def is_deployed(self, release_ids):
return self.check_releases_state(release_ids, states.DEPLOYED)
def is_committed(self, release_ids):
return self.check_releases_state(release_ids, states.COMMITTED)
# NOTE(bqian) report_app_dependencies function not being called?
# which means self.app_dependencies will always be empty and file
# app_dependency_filename will never exist?
def report_app_dependencies(self, patch_ids, **kwargs):
"""
Handle report of application dependencies
"""
if "app" not in kwargs:
raise ReleaseInvalidRequest
appname = kwargs.get("app")
LOG.info("Handling app dependencies report: app=%s, patch_ids=%s",
appname, ','.join(patch_ids))
if len(patch_ids) == 0:
if appname in self.app_dependencies:
del self.app_dependencies[appname]
else:
self.app_dependencies[appname] = patch_ids
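# Persist the dependency map atomically: write to a temp file in the same
# directory, then rename over the destination so readers never observe a
# partially written file.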
try:
tmpfile, tmpfname = tempfile.mkstemp(
prefix=app_dependency_basename,
dir=constants.SOFTWARE_STORAGE_DIR)
os.write(tmpfile, json.dumps(self.app_dependencies).encode())
os.close(tmpfile)
os.rename(tmpfname, app_dependency_filename)
except Exception:
LOG.exception("Failed in report_app_dependencies")
raise SoftwareFail("Internal failure")
return True
# NOTE(bqian) unused function query_app_dependencies
def query_app_dependencies(self):
"""
Query application dependencies
"""
data = self.app_dependencies
return dict(data)
def is_host_next_to_be_deployed_api(self, hostname):
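"""Return True if the given host is the next one allowed to be deployed."""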
is_major_release = ReleaseState(release_state=states.DEPLOYING).is_major_release_deployment()
deploy_state = DeployState.get_deploy_state()
# If there's no deploy in progress return False
if deploy_state is None:
return False
is_rollback_action = deploy_state in [DEPLOY_STATES.HOST_ROLLBACK, DEPLOY_STATES.ACTIVATE_ROLLBACK_PENDING,
DEPLOY_STATES.HOST_ROLLBACK_FAILED, DEPLOY_STATES.ACTIVATE_ROLLBACK_DONE,
DEPLOY_STATES.ACTIVATE_ROLLBACK_FAILED]
try:
validate_host_deploy_order(hostname, is_major_release, is_rollback_action)
return True
except SoftwareServiceError:
return False
except Exception as err:
msg_error = "Error to check deploy order"
LOG.exception("%s: %s" % (msg_error, err))
return False
def deploy_host_list(self):
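"""Return the list of deploy hosts with release and state details."""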
deploy_hosts = self.db_api_instance.get_deploy_host()
deploy = self.db_api_instance.get_deploy_all()
if not deploy:
return []
deploy = deploy[0]
deploy_host_list = []
for host in deploy_hosts:
state = host.get("state")
deploy_host = {
"hostname": host.get("hostname"),
"software_release": deploy.get("from_release"),
"target_release": deploy.get("to_release") if state else None,
"reboot_required": deploy.get("reboot_required") if state else None,
"host_state": state
}
deploy_host_list.append(deploy_host)
return deploy_host_list
def manage_software_alarm(self, alarm_id, alarm_state, entity_instance_id, **kwargs):
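"""Raise or clear a software alarm through fm_api, based on alarm_state."""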
try:
if alarm_id not in constants.SOFTWARE_ALARMS:
raise Exception("Unknown software alarm '%s'." % alarm_id)
# deal with the alarm clear scenario
if alarm_state == fm_constants.FM_ALARM_STATE_CLEAR:
LOG.info("Clearing alarm: %s for %s" % (alarm_id, entity_instance_id))
self.fm_api.clear_fault(alarm_id, entity_instance_id)
return
# if not clear alarm scenario, create the alarm
alarm_data = constants.SOFTWARE_ALARMS.get(alarm_id)
# update the alarm_data if it is present in kwargs
if kwargs:
for data in alarm_data:
if data in kwargs.keys():
alarm_data[data] = kwargs[data]
alarm = fm_api.Fault(
alarm_id=alarm_id,
alarm_state=alarm_state,
entity_type_id=alarm_data.get("entity_type_id"),
entity_instance_id=entity_instance_id,
severity=alarm_data.get("severity"),
reason_text=alarm_data.get("reason_text"),
alarm_type=alarm_data.get("alarm_type"),
probable_cause=alarm_data.get("probable_cause"),
proposed_repair_action=alarm_data.get("proposed_repair_action"),
service_affecting=alarm_data.get("service_affecting"),
)
LOG.info("Raising alarm: %s for %s" % (alarm_id, entity_instance_id))
self.fm_api.set_fault(alarm)
except Exception as e:
LOG.exception("Failed to manage alarm %s with action %s: %s" % (
alarm_id, alarm_state, str(e)
))
def get_out_of_sync_alarm(self):
"""Get the out-of-sync alarm instance from fm_api"""
return self.fm_api.get_fault(fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC,
constants.ALARM_INSTANCE_ID_OUT_OF_SYNC)
def create_clean_up_deployment_alarm(self, target_state):
"""
Creates the 900.022 alarm to warn the user to clean up the deployment data remaining for the specified release
version.
"""
if target_state in [DEPLOY_STATES.COMPLETED, DEPLOY_STATES.HOST_ROLLBACK_DONE]:
is_major_release = ReleaseState(release_state=states.DEPLOYING).is_major_release_deployment()
# Do not create in case of patch release.
if not is_major_release:
return
reason_text = constants.SOFTWARE_ALARMS[fm_constants.FM_ALARM_ID_USM_CLEANUP_DEPLOYMENT_DATA]["reason_text"]
self.manage_software_alarm(fm_constants.FM_ALARM_ID_USM_CLEANUP_DEPLOYMENT_DATA,
fm_constants.FM_ALARM_STATE_SET,
"%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, constants.CONTROLLER_FLOATING_HOSTNAME),
reason_text=reason_text)
def handle_deploy_state_sync(self):
"""
Handle the deploy state sync.
If deploy state is in sync, clear the alarm.
If not, raise the alarm.
"""
is_in_sync = is_deploy_state_in_sync()
# Deploy in sync state is not changed, no need to update the alarm
if is_in_sync == self.usm_alarm.get(constants.LAST_IN_SYNC):
return
try:
LOG.info("software.json in sync: %s", is_in_sync)
out_of_sync_alarm_fault = self.get_out_of_sync_alarm()
if out_of_sync_alarm_fault and is_in_sync:
# There was an out of sync alarm raised, but local software.json is in sync,
# we clear the alarm
self.manage_software_alarm(
alarm_id=fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC,
alarm_state=fm_constants.FM_ALARM_STATE_CLEAR,
entity_instance_id=constants.ALARM_INSTANCE_ID_OUT_OF_SYNC
)
# Deploy in sync state is changed, update the cache
self.usm_alarm[constants.LAST_IN_SYNC] = is_in_sync
elif (not out_of_sync_alarm_fault) and (not is_in_sync):
# There was no out of sync alarm raised, but local software.json is not in sync,
# we raise the alarm
self.manage_software_alarm(
alarm_id=fm_constants.FM_ALARM_ID_SW_UPGRADE_DEPLOY_STATE_OUT_OF_SYNC,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_instance_id=constants.ALARM_INSTANCE_ID_OUT_OF_SYNC
)
# Deploy in sync state is changed, update the cache
self.usm_alarm[constants.LAST_IN_SYNC] = is_in_sync
else:
# Shouldn't get here
LOG.error("Unexpected case in handling deploy state sync.")
except Exception as ex:
LOG.exception("Failed in handling deploy state sync. Error: %s" % str(ex))
def _get_software_upgrade(self):
"""
Get the current software upgrade from/to versions and state
:return: dict of from_release, to_release and state
"""
all_deploy = self.db_api_instance.get_deploy_all()
if not all_deploy:
return None
deploy = all_deploy[0]
from_maj_min_release = utils.get_major_release_version(deploy.get("from_release"))
to_maj_min_release = utils.get_major_release_version(deploy.get("to_release"))
state = deploy.get("state")
return {
"from_release": from_maj_min_release,
"to_release": to_maj_min_release,
"state": state
}
def check_upgrade_in_progress(self):
"""
Check if major release upgrade is in progress
"""
_upgrade_in_progress = False
upgrade_release = self._get_software_upgrade()
if not upgrade_release:
return _upgrade_in_progress
from_release = version.Version(upgrade_release["from_release"])
to_release = version.Version(upgrade_release["to_release"])
if (from_release.major != to_release.major) or (from_release.minor != to_release.minor):
_upgrade_in_progress = True
return _upgrade_in_progress
def get_software_upgrade(self):
return self._get_software_upgrade()
def get_all_software_host_upgrade(self):
"""
Get all software host upgrade from/to versions and state
:return: list of dict of hostname, current_sw_version, target_sw_version and host_state
"""
deploy = self._get_software_upgrade()
deploy_hosts = self.db_api_instance.get_deploy_host()
if deploy is None or deploy_hosts is None:
return None
from_maj_min_release = deploy.get("from_release")
to_maj_min_release = deploy.get("to_release")
all_host_upgrades = []
for deploy_host in deploy_hosts:
all_host_upgrades.append({
"hostname": deploy_host.get("hostname"),
"current_sw_version": to_maj_min_release if deploy_host.get(
"state") == states.DEPLOYED else from_maj_min_release,
"target_sw_version": to_maj_min_release,
"host_state": deploy_host.get("state")
})
return all_host_upgrades
def get_one_software_host_upgrade(self, hostname):
"""
Get the given software host upgrade from/to versions and state
:param hostname: hostname
:return: array of dict of hostname, current_sw_version, target_sw_version and host_state
"""
all_host_upgrades = self.get_all_software_host_upgrade()
if not all_host_upgrades:
return None
for host_upgrade in all_host_upgrades:
if host_upgrade.get("hostname") == hostname:
return [host_upgrade]
return None
def is_host_active_controller(self):
"""
Check if current host is active controller by checking if floating ip is assigned
to the host
:return: True if it is active controller, False otherwise
"""
if not os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG):
return False
floating_mgmt_ip = utils.gethostbyname(constants.CONTROLLER_FLOATING_HOSTNAME)
if not floating_mgmt_ip:
return False
ip_family = utils.get_management_family()
mgmt_iface = cfg.get_mgmt_iface()
host_mgmt_ip_list = utils.get_iface_ip(mgmt_iface, ip_family)
return floating_mgmt_ip in host_mgmt_ip_list if host_mgmt_ip_list else False
def set_interruption_fail_state(self):
"""
Set the host failed state after an interruption based on current deployment state
"""
upgrade_status = self.get_software_upgrade()
if self.is_host_active_controller() and os.path.exists(INITIAL_CONFIG_COMPLETE_FLAG) and upgrade_status:
if upgrade_status.get('state') == DEPLOY_STATES.HOST.value and not is_simplex():
to_fail_hostname = CONTROLLER_0_HOSTNAME if self.hostname == CONTROLLER_1_HOSTNAME else \
CONTROLLER_1_HOSTNAME
# In DX, when it is in deploy-host state, we can only set the standby controller to fail
start_set_fail(True, to_fail_hostname)
elif upgrade_status.get('state') in INTERRUPTION_RECOVERY_STATES:
# The deployment was interrupted. We need to update the deployment state first
start_set_fail(True, self.hostname)
class PatchControllerApiThread(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.wsgi = None
self.name = "PatchControllerApiThread"
def run(self):
global thread_death
host = "127.0.0.1"
port = cfg.api_port
try:
# In order to support IPv6, server_class.address_family must be
# set to the correct address family. Because the unauthenticated
# API always uses IPv4 for the loopback address, the address_family
# variable cannot be set directly in the WSGIServer class, so a
# local subclass needs to be created for the call to make_server,
# where the correct address_family can be specified.
class server_class(simple_server.WSGIServer):
pass
server_class.address_family = socket.AF_INET
self.wsgi = simple_server.make_server(
host, port,
app.VersionSelectorApplication(),
server_class=server_class)
self.wsgi.socket.settimeout(api_socket_timeout)
global keep_running
while keep_running:
self.wsgi.handle_request()
if thread_death.is_set():
LOG.info("%s exits as thread death is detected.", self.name)
return
# Call garbage collect after wsgi request is handled,
# to ensure any open file handles are closed in the case
# of an upload.
gc.collect()
except Exception as ex:
# Log all exceptions
LOG.exception("%s: error occurred during request processing: %s" % (self.name, str(ex)))
thread_death.set()
def kill(self):
# Must be called from another thread
if self.wsgi is not None:
self.wsgi.shutdown()
class PatchControllerAuthApiThread(threading.Thread):
def __init__(self, port):
threading.Thread.__init__(self)
# LOG.info ("Initializing Authenticated API thread")
self.wsgi = None
self.port = port
self.name = f"PatchControllerAuthApiThread_{port}"
def run(self):
global thread_death
host = CONF.auth_api_bind_ip
if host is None:
host = utils.get_versioned_address_all()
try:
# Can only launch authenticated server post-config
while not os.path.exists(VOLATILE_CONTROLLER_CONFIG_COMPLETE):
LOG.info("Authorized API: Waiting for controller config complete.")
time.sleep(5)
LOG.info("Authorized API: Initializing")
# In order to support IPv6, server_class.address_family must be
# set to the correct address family. Because the unauthenticated
# API always uses IPv4 for the loopback address, the address_family
# variable cannot be set directly in the WSGIServer class, so a
# local subclass needs to be created for the call to make_server,
# where the correct address_family can be specified.
class server_class(simple_server.WSGIServer):
pass
server_class.address_family = utils.get_management_family()
self.wsgi = simple_server.make_server(
host, self.port,
auth_app.VersionSelectorApplication(),
server_class=server_class)
# self.wsgi.serve_forever()
self.wsgi.socket.settimeout(api_socket_timeout)
global keep_running
while keep_running:
self.wsgi.handle_request()
if thread_death.is_set():
LOG.info("%s exits as thread death is detected.", self.name)
return
# Call garbage collect after wsgi request is handled,
# to ensure any open file handles are closed in the case
# of an upload.
gc.collect()
except Exception as ex:
# Log all exceptions
LOG.exception("%s: error occurred during request processing: %s" % (self.name, str(ex)))
thread_death.set()
def kill(self):
# Must be called from another thread
if self.wsgi is not None:
self.wsgi.shutdown()
LOG.info("%s exits as requested", self.name)
global thread_death
thread_death.set()
class PatchControllerMainThread(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
# LOG.info ("Initializing Main thread")
self.name = "PatchControllerMainThread"
def run(self):
global sc
global thread_death
# TODO(jvazhapp) Fix following temporary workaround
# for eventlet issue resulting in error message:
# 'Resolver configuration could not be read or
# specified no nameservers eventlet fix version'
with open('/etc/resolv.conf', 'a+') as f:
f.seek(0)
data = f.read()
if "nameserver" not in data:
f.writelines("nameserver 8.8.8.8")
# Send periodic messages to the agents
# We can only use one interval
SEND_MSG_INTERVAL_IN_SECONDS = 30.0
sc.ignore_errors = os.environ.get('IGNORE_ERRORS', 'False')
LOG.info("IGNORE_ERRORS execution flag is set: %s", sc.ignore_errors)
LOG.info("software-controller-daemon is starting")
LOG.info("%s is active controller: %s", sc.hostname, sc.is_host_active_controller())
sc.set_interruption_fail_state()
try:
if sc.pre_bootstrap and cfg.get_mgmt_ip():
sc.pre_bootstrap = False
if sc.pre_bootstrap or os.path.isfile(INSTALL_LOCAL_FLAG):
sc.install_local = True
else:
sc.install_local = False
# Update the out of sync alarm cache when the thread starts
out_of_sync_alarm_fault = sc.get_out_of_sync_alarm()
sc.usm_alarm[constants.LAST_IN_SYNC] = not out_of_sync_alarm_fault
sock_in = sc.setup_socket()
while sock_in is None:
# Check every thirty seconds?
# Once we've got a conf file, tied into packstack,
# we'll get restarted when the file is updated,
# and this should be unnecessary.
time.sleep(30)
sock_in = sc.setup_socket()
# Ok, now we've got our socket. Let's start with a hello!
sc.socket_lock.acquire()
hello = PatchMessageHello()
hello.send(sc.sock_out)
hello_agent = PatchMessageHelloAgent()
hello_agent.send(sc.sock_out)
sc.socket_lock.release()
# Send hello every thirty seconds
hello_timeout = time.time() + SEND_MSG_INTERVAL_IN_SECONDS
# Send deploy state update every thirty seconds
deploy_state_update_timeout = time.time() + SEND_MSG_INTERVAL_IN_SECONDS
remaining = int(SEND_MSG_INTERVAL_IN_SECONDS)
agent_query_conns = []
while True:
# Check to see if any other thread has died
if thread_death.is_set():
LOG.info("%s exits as thread death is detected.", self.name)
return
# Check for in-service patch restart flag
if os.path.exists(insvc_patch_restart_controller):
LOG.info("In-service patch restart flag detected. Exiting.")
global keep_running
keep_running = False
os.remove(insvc_patch_restart_controller)
return
# If bootstrap is completed re-initialize sockets
if sc.pre_bootstrap and cfg.get_mgmt_ip():
sc.pre_bootstrap = False
sock_in = sc.setup_socket()
while sock_in is None:
time.sleep(30)
sock_in = sc.setup_socket()
sc.socket_lock.acquire()
hello = PatchMessageHello()
hello.send(sc.sock_out)
hello_agent = PatchMessageHelloAgent()
hello_agent.send(sc.sock_out)
sc.socket_lock.release()
for s in agent_query_conns.copy():
agent_query_conns.remove(s)
s.shutdown(socket.SHUT_RDWR)
s.close()
local_mode = sc.pre_bootstrap or os.path.isfile(INSTALL_LOCAL_FLAG)
if local_mode and not sc.install_local:
sc.install_local = True
elif not local_mode and sc.install_local:
sc.install_local = False
inputs = [sc.sock_in] + agent_query_conns
outputs = []
rlist, wlist, xlist = select.select(
inputs, outputs, inputs, SEND_MSG_INTERVAL_IN_SECONDS)
if (len(rlist) == 0 and
len(wlist) == 0 and
len(xlist) == 0):
# Timeout hit
sc.audit_socket()
for s in rlist:
data = ''
addr = None
msg = None
if s == sc.sock_in:
# Receive from UDP
sc.socket_lock.acquire()
data, addr = s.recvfrom(1024)
sc.socket_lock.release()
else:
# Receive from TCP
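# The peer sends one JSON document per connection; accumulate
# 1024-byte chunks until the buffer parses as complete JSON, or
# stop when the peer closes the connection.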
while True:
try:
packet = s.recv(1024)
except socket.error:
LOG.exception("Socket error on recv")
data = ''
break
if packet:
data += packet.decode()
if data == '':
break
try:
json.loads(data)
break
except ValueError:
# Message is incomplete
continue
else:
LOG.info('End of TCP message received')
break
if data == '':
# Connection dropped
agent_query_conns.remove(s)
s.close()
continue
# Get the TCP endpoint address
addr = s.getpeername()
msgdata = json.loads(data)
# For now, discard any messages that are not msgversion==1
if 'msgversion' in msgdata and msgdata['msgversion'] != 1:
continue
if 'msgtype' in msgdata:
if msgdata['msgtype'] == messages.PATCHMSG_HELLO:
msg = PatchMessageHello()
elif msgdata['msgtype'] == messages.PATCHMSG_HELLO_ACK:
msg = PatchMessageHelloAck()
elif msgdata['msgtype'] == messages.PATCHMSG_SYNC_REQ:
msg = PatchMessageSyncReq()
elif msgdata['msgtype'] == messages.PATCHMSG_SYNC_COMPLETE:
msg = PatchMessageSyncComplete()
elif msgdata['msgtype'] == messages.PATCHMSG_HELLO_AGENT_ACK:
msg = PatchMessageHelloAgentAck()
elif msgdata['msgtype'] == messages.PATCHMSG_QUERY_DETAILED_RESP:
msg = PatchMessageQueryDetailedResp()
elif msgdata['msgtype'] == messages.PATCHMSG_AGENT_INSTALL_RESP:
msg = PatchMessageAgentInstallResp()
elif msgdata['msgtype'] == messages.PATCHMSG_DROP_HOST_REQ:
msg = PatchMessageDropHostReq()
elif msgdata['msgtype'] == messages.PATCHMSG_DEPLOY_STATE_UPDATE:
msg = SoftwareMessageDeployStateUpdate()
elif msgdata['msgtype'] == messages.PATCHMSG_DEPLOY_STATE_UPDATE_ACK:
msg = SoftwareMessageDeployStateUpdateAck()
elif msgdata['msgtype'] == messages.PATCHMSG_DEPLOY_STATE_CHANGED:
msg = SWMessageDeployStateChanged()
elif msgdata['msgtype'] == messages.PATCHMSG_DEPLOY_DELETE_CLEANUP_RESP:
msg = SoftwareMessageDeployDeleteCleanupResp()
elif msgdata['msgtype'] == messages.PATCHMSG_CHECK_AGENT_ALIVE_RESP:
msg = SoftwareMessageCheckAgentAliveResp()
if msg is None:
msg = messages.PatchMessage()
msg.decode(msgdata)
if s == sc.sock_in:
msg.handle(sc.sock_out, addr)
else:
msg.handle(s, addr)
# We can drop the connection after a query response
if msg.msgtype == messages.PATCHMSG_QUERY_DETAILED_RESP and s != sc.sock_in:
agent_query_conns.remove(s)
s.shutdown(socket.SHUT_RDWR)
s.close()
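# Re-query stale hosts for detailed status over TCP, keeping the number
# of concurrent agent connections small; unreachable hosts are requeued
# for a later pass.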
while len(stale_hosts) > 0 and len(agent_query_conns) <= 5:
ip = stale_hosts.pop()
try:
agent_sock = socket.create_connection((ip, cfg.agent_port))
query = PatchMessageQueryDetailed()
query.send(agent_sock)
agent_query_conns.append(agent_sock)
except Exception:
# Put it back on the list
stale_hosts.append(ip)
remaining = int(hello_timeout - time.time())
if remaining <= 0 or remaining > int(SEND_MSG_INTERVAL_IN_SECONDS):
hello_timeout = time.time() + SEND_MSG_INTERVAL_IN_SECONDS
remaining = int(SEND_MSG_INTERVAL_IN_SECONDS)
sc.socket_lock.acquire()
hello = PatchMessageHello()
hello.send(sc.sock_out)
hello_agent = PatchMessageHelloAgent()
hello_agent.send(sc.sock_out)
sc.socket_lock.release()
# Age out neighbours
sc.controller_neighbours_lock.acquire()
nbrs = list(sc.controller_neighbours)
for n in nbrs:
# Age out controllers after 2 minutes
if sc.controller_neighbours[n].get_age() >= 120:
LOG.info("Aging out controller %s from table", n)
del sc.controller_neighbours[n]
sc.controller_neighbours_lock.release()
sc.hosts_lock.acquire()
nbrs = list(sc.hosts)
for n in nbrs:
# Age out hosts after 1 hour
if sc.hosts[n].get_age() >= 3600:
LOG.info("Aging out host %s from table", n)
del sc.hosts[n]
for patch_id in list(sc.interim_state):
if n in sc.interim_state[patch_id]:
sc.interim_state[patch_id].remove(n)
sc.hosts_lock.release()
deploy_state_update_remaining = int(deploy_state_update_timeout - time.time())
# Only send the deploy state update from the active controller
if deploy_state_update_remaining <= 0 or deploy_state_update_remaining > int(
SEND_MSG_INTERVAL_IN_SECONDS):
deploy_state_update_timeout = time.time() + SEND_MSG_INTERVAL_IN_SECONDS
deploy_state_update_remaining = int(
SEND_MSG_INTERVAL_IN_SECONDS)
if not is_simplex():
# Get out-of-sync alarm to request peer sync even if no deployment in progress
out_of_sync_alarm_fault = sc.get_out_of_sync_alarm()
# data sync always start only from the active controller
if utils.is_active_controller():
if out_of_sync_alarm_fault or is_deployment_in_progress():
sc.socket_lock.acquire()
try:
deploy_state_update = SoftwareMessageDeployStateUpdate()
deploy_state_update.send(sc.sock_out)
except Exception as e:
LOG.exception("Failed to send deploy state update. Error: %s", str(e))
finally:
sc.socket_lock.release()
if not sc.pre_bootstrap:
sc.handle_deploy_state_sync()
except Exception as ex:
# Log all exceptions
LOG.exception("%s: error occurred during request processing: %s" % (self.name, str(ex)))
thread_death.set()
def main():
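# Refresh the package_feed setting in the local software.conf so it points
# at this release's Debian update repo before the daemon threads start.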
software_conf = constants.SOFTWARE_CONFIG_FILE_LOCAL
pkg_feed = ('"http://controller:8080/updates/debian/rel-%s/ %s updates"'
% (constants.STARLINGX_RELEASE, constants.DEBIAN_RELEASE))
config = configparser.ConfigParser()
config.read(software_conf)
config.set("runtime", "package_feed", pkg_feed)
with open(software_conf, "w+") as configfile:
config.write(configfile)
# The following call to CONF is to ensure the oslo config
# has been called to specify a valid config dir.
# Otherwise oslo_policy will fail when it looks for its files.
CONF(
(), # Required to load an anonymous configuration
default_config_files=['/etc/software/software.conf', ]
)
cfg.read_config()
configure_logging()
# daemon.pidlockfile.write_pid_to_pidfile(pidfile_path)
global thread_death
thread_death = threading.Event()
# Set the TMPDIR environment variable to /scratch so that any modules
# that create directories with tempfile will not use /tmp
os.environ['TMPDIR'] = '/scratch'
global sc
sc = PatchController()
LOG.info("launching")
api_thread = PatchControllerApiThread()
auth_api_thread = PatchControllerAuthApiThread(CONF.auth_api_port)
auth_api_alt_thread = PatchControllerAuthApiThread(CONF.auth_api_alt_port)
main_thread = PatchControllerMainThread()
api_thread.start()
auth_api_thread.start()
auth_api_alt_thread.start()
main_thread.start()
thread_death.wait()
global keep_running
keep_running = False
api_thread.join()
auth_api_thread.join()
auth_api_alt_thread.join()
main_thread.join()