Upgrade improvements for kube-apiserver port change

Include upgrade improvements in the activation stage for kube-
apiserver port update. Changes:
- Split the portieris webhook backup/disable to a separate script;
- Add a script to update the kube-apiserver server certificate
  with the management IPs in the controller hosts;
- Move the script that handles the parameter updates for
  kubernetes as well as the port update from a beginning position
  (#04) to be executed more towards the end of the activation
  (#120).

Test plan:
PASS: Upgrade activation - stx10->11 - AIO-SX* and AIO-DX
PASS: Upgrade activation rollback - stx11->10 - AIO-SX* and AIO-DX

*AIO-SX upgrade included portieris (to validate the split script)

Story: 2011399
Task: 52560

Depends-on: https://review.opendev.org/c/starlingx/config/+/955534

Change-Id: I2e17e036badf418555ac6c024f3c3dc0d84e5470
Signed-off-by: Marcelo de Castro Loebens <Marcelo.DeCastroLoebens@windriver.com>
This commit is contained in:
Marcelo de Castro Loebens
2025-07-21 16:49:33 -04:00
parent 9523a58fff
commit 35649efd45
3 changed files with 326 additions and 97 deletions

View File

@@ -0,0 +1,107 @@
#!/usr/bin/python
# Copyright (c) 2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script uses puppet to include the management IP on kube
# apiserver certificate SANs on upgrades from stx10 to stx11.
#
import logging
import os
import sys
import time
from oslo_config import cfg
from oslo_context import context as mycontext
from six.moves import configparser
from software.utilities.utils import configure_logging
from sysinv.conductor import rpcapiproxy as conductor_rpcapi
# Logger used by this script; presumably the one configure_logging()
# sets up -- TODO confirm.
LOG = logging.getLogger('main_logger')
# Values returned by main(), which become the process exit status
SUCCESS = 0
ERROR = 1
# Maximum number of attempts made by main() before giving up
RETRIES = 3
# Global oslo.config object; rpc_zeromq_conductor_bind_ip is set on it
# right before the conductor RPC call is issued
CONF = cfg.CONF
# File parsed to find the conductor RPC bind IP
SYSINV_CONFIG_FILE = '/etc/sysinv/sysinv.conf'
# Flag file whose presence makes this script skip the certificate update.
# NOTE(review): it is not created here -- presumably written by the
# conductor once the SANs update is done; confirm.
KUBE_CERT_SANS_UPDATE_FLAG = '/etc/platform/.upgrade_kube_apiserver_cert_sans_updated'
def get_conductor_rpc_bind_ip():
    """Return the conductor RPC bind IP configured in sysinv.conf.

    The content of SYSINV_CONFIG_FILE is parsed as INI text with a
    '[DEFAULT]' header prepended, so options that appear before any
    section header in the file are read as defaults.

    :returns: the value of 'rpc_zeromq_conductor_bind_ip', or None when
              the option is not set.
    :raises OSError: if SYSINV_CONFIG_FILE cannot be opened or read.
    """
    option = 'rpc_zeromq_conductor_bind_ip'

    # Close the file deterministically instead of leaving the anonymous
    # file object to the garbage collector.
    with open(SYSINV_CONFIG_FILE, 'r') as config_file:
        ini_str = '[DEFAULT]\n' + config_file.read()

    config_applied = configparser.RawConfigParser()
    config_applied.read_string(ini_str)

    if config_applied.has_option('DEFAULT', option):
        return config_applied.get('DEFAULT', option)
    return None
def update_kube_apiserver_cert_rpc():
    """Request the kube-apiserver certificate SANs update over conductor RPC.

    The conductor bind IP read from sysinv.conf is stored in the global
    CONF before the RPC client is created (presumably the client reads it
    from there -- confirm against rpcapiproxy).
    """
    bind_ip = get_conductor_rpc_bind_ip()
    CONF.rpc_zeromq_conductor_bind_ip = bind_ip

    admin_context = mycontext.get_admin_context()
    conductor = conductor_rpcapi.ConductorAPI(
        topic=conductor_rpcapi.MANAGER_TOPIC)
    conductor.update_kube_apiserver_cert_sans(admin_context)
def check_kube_apiserver_cert_updated():
    """Tell whether the certificate SANs update flag file is present.

    :returns: True when KUBE_CERT_SANS_UPDATE_FLAG exists, False otherwise.
    """
    flag_present = os.path.exists(KUBE_CERT_SANS_UPDATE_FLAG)
    return flag_present
def main():
    """Entry point of the kube-apiserver certificate SANs upgrade script.

    Positional arguments (sys.argv): from_release, to_release, action and
    an optional fourth argument (port) that is accepted but ignored. Any
    further argument is rejected.

    On "activate" from release 24.09 the certificate SANs update is
    requested, unless the flag file shows it was already done. In every
    other case a "Nothing to do" message is logged. The attempt is made up
    to RETRIES times, sleeping 5 seconds after each failure.

    :returns: SUCCESS, or ERROR on invalid arguments or when every attempt
              failed.
    """
    args = sys.argv[1:]
    if len(args) > 4:
        # Report the first unexpected argument, before logging is set up
        print(f"Invalid option {args[4]}.")
        return ERROR

    # Arguments that were not supplied stay None
    padded = args + [None] * (3 - len(args))
    from_release, to_release, action = padded[:3]

    configure_logging()
    LOG.info(
        "%s invoked from_release = %s invoked to_release = %s action = %s",
        sys.argv[0], from_release, to_release, action
    )

    for retry in range(RETRIES):
        try:
            if (action == "activate" and from_release == "24.09"
                    and not check_kube_apiserver_cert_updated()):
                update_kube_apiserver_cert_rpc()
            else:
                LOG.info("Nothing to do. "
                         "Skipping kube-apiserver certificate update.")
        except Exception as ex:
            # Record the traceback on every attempt, including the last
            # one: the final failure is the one operators need to see.
            LOG.exception(ex)
            if retry == RETRIES - 1:
                LOG.error("Error in kube-apiserver certificate update. "
                          "Please verify logs.")
                return ERROR
            LOG.error("Exception ocurred during script execution, "
                      "retrying after 5 seconds.")
            time.sleep(5)
        else:
            return SUCCESS
# Run as a script: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -8,33 +8,32 @@
# (which need to be executed before this one), and monitors the
# kube-apiserver PID restart in the active controller.
#
# An update of kube-apiserver port (6443 -> 16443) relies on this
# procedure to reconfigure the k8s control plane.
#
# Scripts that should be executed before this one:
# - k8s-disable-sched-controllermanager-leader-election.sh
#
# ** An update of kube-apiserver port (6443 -> 16443) also relies on this
# script.
#
import logging as LOG
import logging
import subprocess
import sys
import os
import tempfile
import time
import yaml
from oslo_config import cfg
from oslo_context import context as mycontext
from six.moves import configparser
from software.utilities.utils import configure_logging
from sysinv.common.kubernetes import k8s_wait_for_endpoints_health
from sysinv.conductor import rpcapiproxy as conductor_rpcapi
LOG = logging.getLogger('main_logger')
SUCCESS = 0
ERROR = 1
RETRIES = 3
CONFIG_DIR_PREFIX = '/opt/platform/config/'
PORTIERIS_BACKUP_FILENAME = 'portieris_backup.yml'
PORTIERIS_WEBHOOK_CRD = 'mutatingwebhookconfigurations image-admission-config'
KUBE_PORT_UPDATED_FLAG = '/etc/platform/.upgrade_kube_apiserver_port_updated'
CONF = cfg.CONF
@@ -49,21 +48,17 @@ class ServiceParametersApplier(object):
The command: "system service-parameters-apply kubernetes" will trigger
many system events including the restart of kube-apiserver process.
"""
def __init__(self, from_side_release) -> None:
self.KUBE_CMD = 'kubectl --kubeconfig=/etc/kubernetes/admin.conf '
def __init__(self) -> None:
self.SP_APPLY_CMD = 'system service-parameter-apply kubernetes'
self.initial_kube_apiserver_pid = -1
# Backup in old config folder, it will be erased when upgrade ends
self.PORTIERIS_BACKUP_FILE = CONFIG_DIR_PREFIX + from_side_release + \
'/' + PORTIERIS_BACKUP_FILENAME
def __system_cmd(self, command: str) -> str:
sub = subprocess.Popen(["bash", "-c", command],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, _ = sub.communicate()
stdout, stderr = sub.communicate()
if sub.returncode != 0:
return ''
raise Exception(stderr.decode('utf-8'))
return stdout.decode('utf-8')
def __service_parameter_apply(self) -> None:
@@ -80,7 +75,7 @@ class ServiceParametersApplier(object):
def __register_kube_apiserver_pid(self):
self.initial_kube_apiserver_pid = self.__get_kube_apiserver_pid()
def __wait_kube_apiserver_ready(self):
def __wait_kube_apiserver_pid_ready(self):
LOG.info("Waiting kube-apiserver PID to restart")
for _ in range(0, 300):
if check_kube_apiserver_port_updated():
@@ -94,88 +89,17 @@ class ServiceParametersApplier(object):
LOG.error("Timeout restarting kube-apiserver.")
sys.exit(ERROR)
def __get_portieris_webhook_data(self):
get_cmd = self.KUBE_CMD + "get " + PORTIERIS_WEBHOOK_CRD + \
" -o yaml --ignore-not-found"
return self.__system_cmd(get_cmd)
def __create_portieris_webhook_backup(self, yaml_data):
if (os.path.isfile(self.PORTIERIS_BACKUP_FILE) and
os.path.getsize(self.PORTIERIS_BACKUP_FILE) > 0):
LOG.info("Backup for portieris webhook already present.")
return
with open(self.PORTIERIS_BACKUP_FILE, 'w') as backup_file:
yaml.safe_dump(yaml_data, backup_file, default_flow_style=False)
LOG.info("Backup created for portieris webhook.")
def __modify_portieris_webhook(self, yaml_data):
delete_cmd = self.KUBE_CMD + "delete " + PORTIERIS_WEBHOOK_CRD
apply_cmd = self.KUBE_CMD + "apply -f "
with tempfile.NamedTemporaryFile(delete=True) as tmp_file_obj:
with open(tmp_file_obj.name, 'w') as tmp_file:
yaml.safe_dump(yaml_data, tmp_file, default_flow_style=False)
self.__system_cmd(delete_cmd)
self.__system_cmd(apply_cmd + tmp_file_obj.name)
def __disable_portieris_webhook(self):
result = self.__get_portieris_webhook_data()
if result != '':
yaml_data = yaml.safe_load(result)
self.__create_portieris_webhook_backup(yaml_data)
yaml_data['webhooks'][0]['failurePolicy'] = 'Ignore'
self.__modify_portieris_webhook(yaml_data)
else:
LOG.info("No webhook from portieris.")
def __remove_portieris_webhook_backup(self):
try:
os.remove(self.PORTIERIS_BACKUP_FILE)
LOG.info("Deleted portieris webhook backup file.")
except OSError:
pass
def __restore_portieris_webhook(self):
if (not os.path.isfile(self.PORTIERIS_BACKUP_FILE) or
not os.path.getsize(self.PORTIERIS_BACKUP_FILE) > 0):
LOG.info("No backup content for portieris webhook. Nothing to do.")
self.__remove_portieris_webhook_backup()
return
result = self.__get_portieris_webhook_data()
current_data = {}
if result != '':
current_data = yaml.safe_load(result)
with open(self.PORTIERIS_BACKUP_FILE, 'r') as backup_file:
backup_data = yaml.safe_load(backup_file)
current_value = current_data.get(
'webhooks', [{}])[0].get('failurePolicy', None)
backup_value = backup_data['webhooks'][0]['failurePolicy']
if current_value != backup_value:
LOG.info("Using backup data to restore portieris webhook.")
# Drop caBundle, cert-manager ca-injector will recreate it
backup_data['webhooks'][0]['clientConfig'].pop('caBundle',
None)
self.__modify_portieris_webhook(backup_data)
self.__remove_portieris_webhook_backup()
def apply(self):
# Disable portieris webhook to avoid issues while restarting pods
self.__disable_portieris_webhook()
# Perform service parameter apply and wait kube-apiserver restart
self.__register_kube_apiserver_pid()
self.__service_parameter_apply()
self.__wait_kube_apiserver_ready()
self.__wait_kube_apiserver_pid_ready()
def rollback(self):
# Perform service parameter apply and wait kube-apiserver restart
self.__register_kube_apiserver_pid()
self.__service_parameter_apply()
self.__wait_kube_apiserver_ready()
# Restore portieris webhook
self.__restore_portieris_webhook()
self.__wait_kube_apiserver_pid_ready()
def check_kube_apiserver_port_updated():
@@ -238,11 +162,6 @@ def wait_conductor_restarted():
def main():
log_format = ('%(asctime)s: [%(process)s]: %(filename)s(%(lineno)s): '
'%(levelname)s: %(message)s')
LOG.basicConfig(filename="/var/log/software.log",
format=log_format, level=LOG.INFO, datefmt="%FT%T")
# Initialize variables
action = None
from_release = None
@@ -265,6 +184,7 @@ def main():
return ERROR
arg += 1
configure_logging()
LOG.info(
"%s invoked from_release = %s invoked to_release = %s action = %s"
% (sys.argv[0], from_release, to_release, action)
@@ -274,21 +194,31 @@ def main():
try:
if action == "activate" and from_release == "24.09":
if not check_kube_apiserver_port_updated():
ServiceParametersApplier(from_release).apply()
ServiceParametersApplier().apply()
wait_kube_apiserver_port_update(True)
if not wait_conductor_restarted():
# No point in retrying without sysinv-conductor
LOG.error("Conductor is unhealthy, check sysinv logs")
return ERROR
if not k8s_wait_for_endpoints_health():
# k8s_wait_for_endpoints_health already has retries
LOG.error("K8s is unhealthy, aborting. "
"Please check logs.")
return ERROR
elif action == "activate-rollback" and to_release == "24.09":
if check_kube_apiserver_port_updated():
create_kube_apiserver_port_rollback_flag_rpc()
ServiceParametersApplier(to_release).rollback()
ServiceParametersApplier().rollback()
wait_kube_apiserver_port_update(False)
if not wait_conductor_restarted():
# No point in retrying without sysinv-conductor
LOG.error("Conductor is unhealthy, check sysinv logs")
return ERROR
if not k8s_wait_for_endpoints_health():
# k8s_wait_for_endpoints_health already has retries
LOG.error("K8s is unhealthy, aborting. "
"Please check logs.")
return ERROR
else:
LOG.info("Nothing to do. "
"Skipping K8s service parameter apply.")

View File

@@ -0,0 +1,192 @@
#!/usr/bin/python
# Copyright (c) 2024-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script backs up the portieris webhook and changes the failurePolicy
# to ignore failures. This is required to upgrade portieris during platform
# upgrades, since by default portieris will fail to create the new pods
# when the webhook is down.
#
# *** THIS SCRIPT NEEDS TO BE EXECUTED BEFORE '#-k8s-app-upgrade.sh',
# in any platform upgrade where portieris is being upversioned. When the
# upgrade ends, a lifecycle hook on portieris should restore the failurePolicy
# for the webhook.
#
import logging
import subprocess
import sys
import os
import tempfile
import time
import yaml
from sysinv.common.kubernetes import test_k8s_health
from software.utilities.utils import configure_logging
# Logger used by this script; presumably the one configure_logging()
# sets up -- TODO confirm.
LOG = logging.getLogger('main_logger')
# Values returned by main(), which become the process exit status
SUCCESS = 0
ERROR = 1
# Maximum number of attempts made by main() before giving up
RETRIES = 3
# Base folder of the per-release platform config directories
CONFIG_DIR_PREFIX = '/opt/platform/config/'
# Name of the webhook backup file, stored under CONFIG_DIR_PREFIX/<release>/
PORTIERIS_BACKUP_FILENAME = 'portieris_backup.yml'
# kubectl resource type and object name of the portieris webhook
PORTIERIS_WEBHOOK_CRD = 'mutatingwebhookconfigurations image-admission-config'
class PortierisWebhookDisabler(object):
    """
    Back up the portieris mutating webhook and relax its failurePolicy.

    apply() saves the current webhook definition to a backup file and
    recreates the webhook with the failurePolicy of its first entry set
    to 'Ignore'.
    rollback() recreates the webhook from that backup when the current
    failurePolicy differs from the saved one, then deletes the backup.
    """

    # Seconds a kubectl command may run before it is killed
    CMD_TIMEOUT = 10

    def __init__(self, from_side_release) -> None:
        """
        :param from_side_release: release whose config folder
            (CONFIG_DIR_PREFIX + release) holds the backup file.
        """
        self.KUBE_CMD = 'kubectl --kubeconfig=/etc/kubernetes/admin.conf '
        # Backup in old config folder, it will be erased when upgrade ends
        self.PORTIERIS_BACKUP_FILE = CONFIG_DIR_PREFIX + from_side_release + \
            '/' + PORTIERIS_BACKUP_FILENAME

    def __system_cmd(self, command: str) -> str:
        """
        Run a shell command and return its stdout decoded as UTF-8.

        :raises subprocess.TimeoutExpired: the command exceeded CMD_TIMEOUT.
        :raises Exception: the command exited non-zero; the message is its
            stderr output.
        """
        # subprocess.run() kills and reaps the child when the timeout
        # expires; Popen.communicate(timeout=...) alone would leave the
        # process running after raising.
        sub = subprocess.run(["bash", "-c", command],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             timeout=self.CMD_TIMEOUT)
        if sub.returncode != 0:
            raise Exception(sub.stderr.decode('utf-8'))
        return sub.stdout.decode('utf-8')

    # NOTE(review): test_k8s_health comes from sysinv.common.kubernetes;
    # presumably it checks cluster health around the call -- confirm.
    @test_k8s_health
    def __get_portieris_webhook_data(self):
        """Return the webhook as YAML text, or '' when it does not exist."""
        get_cmd = self.KUBE_CMD + "get " + PORTIERIS_WEBHOOK_CRD + \
            " -o yaml --ignore-not-found"
        return self.__system_cmd(get_cmd)

    def __create_portieris_webhook_backup(self, yaml_data):
        """Dump yaml_data to the backup file unless a non-empty one exists."""
        if (os.path.isfile(self.PORTIERIS_BACKUP_FILE) and
                os.path.getsize(self.PORTIERIS_BACKUP_FILE) > 0):
            # Keep the first backup: a later run would save the already
            # modified webhook instead of the original one.
            LOG.info("Backup for portieris webhook already present.")
            return
        with open(self.PORTIERIS_BACKUP_FILE, 'w') as backup_file:
            yaml.safe_dump(yaml_data, backup_file, default_flow_style=False)
        LOG.info("Backup created for portieris webhook.")

    @test_k8s_health
    def __modify_portieris_webhook(self, yaml_data):
        """Delete the webhook and recreate it from yaml_data."""
        delete_cmd = self.KUBE_CMD + "delete " + PORTIERIS_WEBHOOK_CRD + \
            " --ignore-not-found"
        apply_cmd = self.KUBE_CMD + "apply -f "
        with tempfile.NamedTemporaryFile(mode='w', delete=True) as tmp_file:
            yaml.safe_dump(yaml_data, tmp_file, default_flow_style=False)
            # kubectl reads the file by name, so the content must be on
            # disk before the commands run
            tmp_file.flush()
            self.__system_cmd(delete_cmd)
            self.__system_cmd(apply_cmd + tmp_file.name)

    def __disable_portieris_webhook(self):
        """Back up the webhook and set its first failurePolicy to 'Ignore'."""
        result = self.__get_portieris_webhook_data()
        # Whitespace-only output parses to None; treat it as "not found"
        yaml_data = yaml.safe_load(result) if result.strip() else None
        if not yaml_data:
            LOG.info("No webhook from portieris.")
            return
        # The backup must be written before the policy is changed below
        self.__create_portieris_webhook_backup(yaml_data)
        yaml_data['webhooks'][0]['failurePolicy'] = 'Ignore'
        self.__modify_portieris_webhook(yaml_data)

    def __remove_portieris_webhook_backup(self):
        """Delete the backup file (best effort)."""
        try:
            os.remove(self.PORTIERIS_BACKUP_FILE)
            LOG.info("Deleted portieris webhook backup file.")
        except FileNotFoundError:
            # Nothing to clean up
            pass
        except OSError as ex:
            # Still best effort, but leave a trace of the failure
            LOG.warning("Could not delete portieris webhook backup file: %s",
                        ex)

    def __restore_portieris_webhook(self):
        """Recreate the webhook from the backup if its policy was changed."""
        if (not os.path.isfile(self.PORTIERIS_BACKUP_FILE) or
                not os.path.getsize(self.PORTIERIS_BACKUP_FILE) > 0):
            LOG.info("No backup content for portieris webhook. Nothing to do.")
            self.__remove_portieris_webhook_backup()
            return

        result = self.__get_portieris_webhook_data()
        current_data = {}
        if result.strip():
            # safe_load gives None for an empty document
            current_data = yaml.safe_load(result) or {}

        with open(self.PORTIERIS_BACKUP_FILE, 'r') as backup_file:
            backup_data = yaml.safe_load(backup_file)

        # 'or' also covers a present-but-empty 'webhooks' list
        current_webhooks = current_data.get('webhooks') or [{}]
        current_value = current_webhooks[0].get('failurePolicy', None)
        backup_value = backup_data['webhooks'][0]['failurePolicy']

        if current_value != backup_value:
            LOG.info("Using backup data to restore portieris webhook.")
            # Drop caBundle, cert-manager ca-injector will recreate it
            backup_data['webhooks'][0]['clientConfig'].pop('caBundle',
                                                           None)
            self.__modify_portieris_webhook(backup_data)

        self.__remove_portieris_webhook_backup()

    def apply(self):
        """Back up the webhook and make it ignore failures."""
        self.__disable_portieris_webhook()

    def rollback(self):
        """Restore the webhook from the backup, then delete the backup."""
        self.__restore_portieris_webhook()
def main():
    """Entry point of the portieris webhook disable script.

    Positional arguments (sys.argv): from_release, to_release, action and
    an optional fourth argument (port) that is accepted but ignored. Any
    further argument is rejected.

    "activate" from release 24.09 backs up and relaxes the webhook;
    "activate-rollback" to release 24.09 restores it. Anything else only
    logs a "Nothing to do" message. The action is attempted up to RETRIES
    times, sleeping 5 seconds after each failure.

    :returns: SUCCESS, or ERROR on invalid arguments or when every attempt
              failed.
    """
    args = sys.argv[1:]
    if len(args) > 4:
        # Report the first unexpected argument, before logging is set up
        print(f"Invalid option {args[4]}.")
        return ERROR

    # Arguments that were not supplied stay None
    padded = args + [None] * (3 - len(args))
    from_release, to_release, action = padded[:3]

    configure_logging()
    LOG.info(
        "%s invoked from_release = %s invoked to_release = %s action = %s",
        sys.argv[0], from_release, to_release, action
    )

    for retry in range(RETRIES):
        try:
            if action == "activate" and from_release == "24.09":
                PortierisWebhookDisabler(from_release).apply()
            elif action == "activate-rollback" and to_release == "24.09":
                PortierisWebhookDisabler(to_release).rollback()
            else:
                LOG.info("Nothing to do. "
                         "Skipping portieris webhook disable script.")
        except Exception as ex:
            # Record the traceback on every attempt, including the last
            # one: the final failure is the one operators need to see.
            LOG.exception(ex)
            if retry == RETRIES - 1:
                LOG.error("Error modifying portieris webhook. "
                          "Please verify logs.")
                return ERROR
            LOG.error("Exception ocurred during script execution, "
                      "retrying after 5 seconds.")
            time.sleep(5)
        else:
            return SUCCESS
# Run as a script: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())