Modify kube-rootca audit to alarm first

On previous installations, the subcloud kube-rootca certificate is
different from the one on the system controller. Currently, the audit
compares cert_ids and declares out-of-sync when they don't match,
which leaves the endpoint out-of-sync after an upgrade. This commit
changes the audit logic to audit by alarms first, so upgraded
subclouds can remain in-sync. Auditing by cert_id still happens, but
only if the subcloud was rehomed.
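As a minimal sketch of the new method selection (simplified names,
not the production code; the real logic lives in
KubeRootcaUpdateAudit.get_subcloud_audit_data):

    # Illustrative only: rehomed subclouds are compared by cert_id
    # when the cert ID API is available; everything else is judged by
    # the presence of kube-rootca alarms.
    def resolve_audit_method(rehomed: bool, cert_id_available: bool) -> str:
        if rehomed and cert_id_available:
            return "cert_based"
        return "alarm_based"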

Additionally, the force parameter was re-introduced for kube-rootca
update orchestration. Since an alarm-based audit can report in-sync
even when the cert_ids differ, the user may still want to update the
subcloud cert to match the system controller's, and the force
parameter is necessary to allow this.
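To illustrate what force changes, the kube-rootca validator widens
the sync-status filter so in-sync subclouds are also selected (a
sketch of build_sync_status_filter; the literal strings stand in for
the dccommon sync-status constants):

    def build_sync_status_filter(force: bool) -> list:
        # With force, in-sync subclouds are eligible for the strategy too.
        if force:
            return ["in-sync", "out-of-sync"]
        return ["out-of-sync"]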

Note: Dcagent did not previously allow extra_args to be sent in the
payload. To avoid breaking the audit with previous versions of
dcagent, where an unknown key in the payload would throw an error,
extra_args are sent in the request header with the key
"X-DCAGENT-HEADERS". Support for extra_args in the payload was also
added, but it can only be used once all supported dcagent versions
have this option.
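A minimal sketch of the header packing (mirroring the new
_build_dcagent_request_headers helper; "rehomed" is currently the
only extra arg sent this way, and the function name here is
illustrative):

    import json

    def build_dcagent_headers(rehomed: bool) -> dict:
        # An unrecognized header is simply never read by older dcagent
        # versions, whereas an unknown payload key would raise an error.
        extra_args = {"rehomed": rehomed} if rehomed else {}
        return {"X-DCAGENT-HEADERS": json.dumps(extra_args)}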

Note: Due to the current issue that blocks upgrade testing, this
commit was not verified against a subcloud upgrade, but that scenario
would follow a path similar to the second test case below, where
updating a subcloud root CA to a cert different from the system
controller's results in an in-sync endpoint status.

Test plan:
  - PASS: Deploy a subcloud and verify kube-rootca_sync_status is
          in-sync.
  - PASS: Perform a kube-rootca update orchestration directly on the
          subcloud without passing a cert, so one is auto-generated,
          and verify kube-rootca_sync_status is still in-sync.
  - PASS: Rehome the subcloud from the previous test and verify
          kube-rootca_sync_status is out-of-sync.
  - PASS: Perform a kube-rootca update orchestration using dcmanager
          on an out-of-sync subcloud, providing the system controller
          certs, and verify the final sync status is in-sync.
  - PASS: Perform a kube-rootca update orchestration using dcmanager
          on an in-sync subcloud with the force parameter, without
          providing certs, and verify the final sync status is in-sync.
  - PASS: Install an N-1 release and verify kube-rootca_sync_status is
          in-sync.

Closes-bug: 2092069

Change-Id: If0cc002d0d4970730771ae90d80dc50c7daf4d4c
Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>

@@ -1257,6 +1257,7 @@ serviceUnavailable (503)
    - subcloud-apply-type: subcloud_apply_type
    - type: sw_update_strategy_type
    - upload-only: patch_strategy_upload_only
+   - force: force_sync_status
 
 Request Example
 ----------------

@@ -308,6 +308,13 @@ force:
   in: body
   required: false
   type: boolean
+force_sync_status:
+  description: |
+    Indicates whether to disregard subcloud endpoint sync status
+    or management alarm condition depending on strategy type.
+  in: body
+  required: false
+  type: boolean
 group_id:
   description: |
     The ID of a subcloud group. Default is 1.

@@ -3,5 +3,6 @@
     "type": "patch",
     "stop-on-failure": "true",
     "max-parallel-subclouds": 2,
-    "upload-only": "true"
+    "upload-only": "true",
+    "force": "true"
 }

@@ -43,7 +43,21 @@ class AuditController(object):
         if not payload:
             pecan.abort(http.client.BAD_REQUEST, _("Body required"))
-        LOG.debug(f"Payload sent by system controller: {payload}")
+        # TODO(vgluzrom): Remove extra_args from header and keep it only in payload
+        # once all supported dcagent versions have this possibility. If system
+        # controller sends extra_args in payload to a dcagent that doesn't support it,
+        # it will raise an UnsupportedAudit exception.
+        try:
+            headers = json.loads(request.headers.get("X-DCAGENT-HEADERS", "{}"))
+        except ValueError:
+            pecan.abort(http.client.BAD_REQUEST, _("Request headers decoding error"))
+        extra_args = payload.pop("extra_args", {})
+        extra_args = {**extra_args, **headers}
+        LOG.debug(
+            f"Payload sent by system controller: {payload}. Extra args: {extra_args}"
+        )
         try:
             # Delete "use_cache" from payload so it doesn't get passed as an audit
@@ -52,7 +66,7 @@ class AuditController(object):
             requested_audit = RequestedAudit(
                 request_token=context.auth_token, use_cache=use_cache
             )
-            return requested_audit.get_sync_status(payload)
+            return requested_audit.get_sync_status(payload, extra_args)
         except UnsupportedAudit as ex:
             LOG.exception(ex)

@@ -102,9 +102,16 @@ class PeriodicAudit(utils.BaseAuditManager):
             (get_subcloud_base_audit, lambda: [self.sysinv_client, self.fm_client]),
             (FirmwareAudit.get_subcloud_audit_data, lambda: [self.sysinv_client]),
             (KubernetesAudit.get_subcloud_audit_data, lambda: [self.sysinv_client]),
+            # Need to call kube rootca function two times as it has a different
+            # response if the subcloud was rehomed or not and we want to cache both
+            # results
             (
                 KubeRootcaUpdateAudit.get_subcloud_audit_data,
-                lambda: [self.sysinv_client, self.fm_client],
+                lambda: [self.sysinv_client, self.fm_client, False],
             ),
+            (
+                KubeRootcaUpdateAudit.get_subcloud_audit_data,
+                lambda: [self.sysinv_client, self.fm_client, True],
+            ),
             (SoftwareAudit.get_subcloud_audit_data, lambda: [self.software_client]),
         ]
@@ -119,7 +126,7 @@ class RequestedAudit(utils.BaseAuditManager):
         self.request_token = request_token
         self.use_cache = use_cache
 
-    def get_single_audit_status(self, audit_type, regionone_audit_data):
+    def get_single_audit_status(self, audit_type, regionone_audit_data, extra_args):
         # Since this run in parallel, we need to initialize the clients
         # here to not use the same socket in every call
         sysinv_client, fm_client, software_client = self.initialize_clients(
@@ -139,8 +146,9 @@ class RequestedAudit(utils.BaseAuditManager):
                 sysinv_client, regionone_audit_data
             )
         elif audit_type == dccommon_consts.KUBE_ROOTCA_AUDIT:
+            rehomed = extra_args.get("rehomed", False)
             resp = KubeRootcaUpdateAudit.get_subcloud_sync_status(
-                sysinv_client, fm_client, regionone_audit_data
+                sysinv_client, fm_client, regionone_audit_data, rehomed
             )
         elif audit_type == dccommon_consts.KUBERNETES_AUDIT:
             resp = KubernetesAudit.get_subcloud_sync_status(
@@ -161,11 +169,16 @@ class RequestedAudit(utils.BaseAuditManager):
             raise exceptions.AuditStatusFailure(audit=audit_type)
         return audit_type, resp
 
-    def get_sync_status(self, payload):
+    def get_sync_status(self, payload, extra_args):
         sync_resp = {}
         pool = GreenPool(size=10)
         jobs = [
-            pool.spawn(self.get_single_audit_status, audit_type, regionone_audit_data)
+            pool.spawn(
+                self.get_single_audit_status,
+                audit_type,
+                regionone_audit_data,
+                extra_args,
+            )
             for audit_type, regionone_audit_data in payload.items()
         ]

@@ -13,7 +13,7 @@ from dccommon.drivers import base
 
 LOG = log.getLogger(__name__)
 
-DCAGENT_REST_DEFAULT_TIMEOUT = 900
+DCAGENT_REST_DEFAULT_TIMEOUT = 30
 
 
 class DcagentClient(base.DriverBase):
@@ -37,11 +37,17 @@ class DcagentClient(base.DriverBase):
         else:
             self.endpoint = endpoint
 
-    def audit(self, audit_data, timeout=DCAGENT_REST_DEFAULT_TIMEOUT):
+    def audit(
+        self,
+        audit_data: dict,
+        headers: dict = None,
+        timeout: int = DCAGENT_REST_DEFAULT_TIMEOUT,
+    ):
         """Audit subcloud"""
         url = self.endpoint + "/v1/dcaudit"
+        headers = headers or {}
         response = self.session.patch(
-            url, json=audit_data, timeout=timeout, raise_exc=False
+            url, json=audit_data, headers=headers, timeout=timeout, raise_exc=False
         )
         if response.status_code == 200:

@@ -87,18 +87,22 @@ class KubeRootcaUpdateAudit(object):
         cls,
         sysinv_client: SysinvClient,
         fm_client: FmClient,
+        rehomed: bool = False,
         subcloud_name: str = None,
     ) -> tuple:
         skip_audit = 2 * [dccommon_consts.SKIP_AUDIT]
-        try:
-            success, subcloud_cert_data = sysinv_client.get_kube_rootca_cert_id()
-        except Exception:
-            msg = f"Failed to get Kubernetes root CA status, skip {AUDIT_TYPE} audit."
-            log_subcloud_msg(LOG.exception, msg, subcloud_name)
-            return skip_audit
+        if rehomed:
+            try:
+                success, subcloud_cert_data = sysinv_client.get_kube_rootca_cert_id()
+            except Exception:
+                msg = (
+                    f"Failed to get Kubernetes root CA status, skip {AUDIT_TYPE} audit."
+                )
+                log_subcloud_msg(LOG.exception, msg, subcloud_name)
+                return skip_audit
 
-        if success:
-            return CERT_BASED, subcloud_cert_data
+            if success:
+                return CERT_BASED, subcloud_cert_data
 
         try:
             detected_alarms = fm_client.get_alarms_by_ids(KUBE_ROOTCA_ALARM_LIST)
@@ -114,12 +118,13 @@ class KubeRootcaUpdateAudit(object):
         sysinv_client: SysinvClient,
         fm_client: FmClient,
         regionone_rootca_certid: str,
+        rehomed: bool = False,
         subcloud_name: str = None,
     ):
         """Get the sync status of the subcloud's kube root CA cert."""
         audit_method, subcloud_audit_data = cls.get_subcloud_audit_data(
-            sysinv_client, fm_client, subcloud_name
+            sysinv_client, fm_client, rehomed, subcloud_name
         )
 
         sync_status = None
@@ -147,8 +152,10 @@ class KubeRootcaUpdateAudit(object):
         The audit logic is as follow:
         No region one cert ID -> skip audit
-        Subcloud doesn't have the API to get cert ID -> alarm based
-        Subcloud has the API to get cert ID -> cert based
         Failure to get alarms or subcloud cert ID -> skip audit
+        Subcloud was not rehomed -> alarm based
+        Subcloud was rehomed and doesn't have the API to get cert ID -> alarm based
+        Subcloud was rehomed and has the API to get cert ID -> cert based
 
         :param sysinv_client: the sysinv client object
         :param fm_client: the fm client object
@@ -165,7 +172,11 @@ class KubeRootcaUpdateAudit(object):
             return dccommon_consts.SYNC_STATUS_IN_SYNC
 
         sync_status = self.get_subcloud_sync_status(
-            sysinv_client, fm_client, regionone_rootca_certid, subcloud.name
+            sysinv_client,
+            fm_client,
+            regionone_rootca_certid,
+            subcloud.rehomed,
+            subcloud.name,
         )
 
         if sync_status:

@@ -15,6 +15,7 @@
 #
 
 import copy
+import json
 import os
 import threading
 import time
@@ -357,6 +358,13 @@ class SubcloudAuditWorkerManager(manager.Manager):
         audit_payload["use_cache"] = use_cache
         return audit_payload
 
+    def _build_dcagent_request_headers(self, subcloud: models.Subcloud):
+        dc_agent_headers = {}
+        if subcloud.rehomed:
+            dc_agent_headers["rehomed"] = subcloud.rehomed
+        header = {"X-DCAGENT-HEADERS": json.dumps(dc_agent_headers)}
+        return header
+
     def _update_sw_sync_status_from_deploy_status(self, subcloud, audit_results):
         # If the subcloud deploy_status is in any of the following states,
         # the sync_status should be set to out-of-sync for software audit.
@@ -541,9 +549,10 @@ class SubcloudAuditWorkerManager(manager.Manager):
             do_software_audit,
             use_cache,
         )
+        headers = self._build_dcagent_request_headers(subcloud)
         audit_results = {}
         try:
-            audit_results = dcagent_client.audit(audit_payload)
+            audit_results = dcagent_client.audit(audit_payload, headers)
         except Exception:
             LOG.exception(failmsg % (subcloud.name, "dcagent"))
             failures.append("dcagent")

@@ -248,6 +248,7 @@ class SwUpdateManager(manager.Manager):
             max_parallel_subclouds = int(max_parallel_subclouds_str)
 
         stop_on_failure = payload.get("stop-on-failure") in ["true"]
+        force = payload.get(consts.EXTRA_ARGS_FORCE) in ["true"]
 
         # Has the user specified a specific subcloud?
         cloud_name = payload.get("cloud_name")
@@ -257,7 +258,6 @@ class SwUpdateManager(manager.Manager):
         # Has the user specified for_sw_deploy flag for prestage strategy?
         if strategy_type == consts.SW_UPDATE_TYPE_PRESTAGE:
             for_sw_deploy = payload.get(consts.PRESTAGE_FOR_SW_DEPLOY) in ["true"]
-            force = payload.get(consts.EXTRA_ARGS_FORCE) in ["true"]
 
         if cloud_name:
             # Make sure subcloud exists
@@ -282,7 +282,7 @@ class SwUpdateManager(manager.Manager):
                 raise exceptions.BadRequest(resource="strategy", msg=str(ex))
         else:
             self.strategy_validators[strategy_type].validate_strategy_requirements(
-                context, subcloud.id, subcloud.name
+                context, subcloud.id, subcloud.name, force
             )
 
         extra_args = None
@@ -328,7 +328,7 @@ class SwUpdateManager(manager.Manager):
             single_group.id if subcloud_group else None,
             cloud_name,
             self.strategy_validators[strategy_type].build_availability_status_filter(),
-            self.strategy_validators[strategy_type].build_sync_status_filter(),
+            self.strategy_validators[strategy_type].build_sync_status_filter(force),
         )
 
         # TODO(rlima): move this step to validators

@@ -20,17 +20,24 @@ class StrategyValidationBase(object):
     """Base class for strategy validation"""
 
     def __init__(self):
+        self.accepts_force = False
         self.endpoint_type = None
 
-    def validate_strategy_requirements(self, context, subcloud_id, subcloud_name):
+    def validate_strategy_requirements(
+        self, context, subcloud_id, subcloud_name, force=False
+    ):
         """Validates the requirements for a strategy
 
         :param context: request context object
         :param subcloud_id: subcloud's id
         :param subcloud_name: subcloud's name
+        :param force: if the strategy should be forced to execute
         :raises BadRequest: if the requirements for the strategy are not met
         """
+        if self.accepts_force and force:
+            return
+
         subcloud_status = db_api.subcloud_status_get(
             context, subcloud_id, self.endpoint_type
         )
@@ -64,9 +71,10 @@ class StrategyValidationBase(object):
 
         return dccommon_consts.AVAILABILITY_ONLINE
 
-    def build_sync_status_filter(self):
+    def build_sync_status_filter(self, force):
         """Builds the sync status filter for valid subclouds
 
+        :param force: if the strategy should be forced to execute
         :return: sync status to filter
         :rtype: list
         """

@@ -26,6 +26,7 @@ class KubeRootCaStrategyValidator(StrategyValidationBase):
         super().__init__()
         self.endpoint_type = dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA
+        self.accepts_force = True
 
     def build_extra_args(self, payload):
         """Builds the extra args for a strategy
@@ -40,3 +41,18 @@ class KubeRootCaStrategyValidator(StrategyValidationBase):
             consts.EXTRA_ARGS_SUBJECT: payload.get(consts.EXTRA_ARGS_SUBJECT),
             consts.EXTRA_ARGS_CERT_FILE: payload.get(consts.EXTRA_ARGS_CERT_FILE),
         }
+
+    def build_sync_status_filter(self, force):
+        """Builds the sync status filter for valid subclouds
+
+        :param force: if the strategy should be forced to execute
+        :return: sync status to filter
+        :rtype: list
+        """
+        if force:
+            return [
+                dccommon_consts.SYNC_STATUS_IN_SYNC,
+                dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
+            ]
+        return [dccommon_consts.SYNC_STATUS_OUT_OF_SYNC]

@@ -44,9 +44,10 @@ class PatchStrategyValidator(StrategyValidationBase):
             consts.EXTRA_ARGS_REMOVE: remove_bool,
         }
 
-    def build_sync_status_filter(self):
+    def build_sync_status_filter(self, force):
         """Builds the sync status filter for valid subclouds
 
+        :param force: if the strategy should be forced to execute
         :return: sync status to filter
         :rtype: list
         """

@@ -30,7 +30,7 @@ class PrestageStrategyValidator(StrategyValidationBase):
         self.endpoint_type = dccommon_consts.AUDIT_TYPE_SOFTWARE
 
     # TODO(rlima): move prestage validations here
-    def build_sync_status_filter(self):
+    def build_sync_status_filter(self, force):
         """Builds the sync status filter for valid subclouds
 
+        :param force: if the strategy should be forced to execute

@@ -53,37 +53,37 @@ SUBCLOUD_1 = {
     "name": "subcloud1",
     "region_name": "2ec93dfb654846909efe61d1b39dd2ce",
     "rehomed": True,
-    "software_version": "22.12",
+    "software_version": "24.09",
 }
 SUBCLOUD_2 = {
     "name": "subcloud2",
     "region_name": "ca2761ee7aa34cbe8415ec9a3c86854f",
     "rehomed": True,
-    "software_version": "22.12",
+    "software_version": "24.09",
 }
 SUBCLOUD_3 = {
     "name": "subcloud3",
     "region_name": "659e12e5f7ad411abfcd83f5cedca0bf",
     "rehomed": True,
-    "software_version": "21.12",
+    "software_version": "22.12",
 }
 SUBCLOUD_4 = {
     "name": "subcloud4",
     "region_name": "c25f3b0553384104b664789bd93a2ba8",
     "rehomed": False,
-    "software_version": "21.12",
+    "software_version": "22.12",
 }
 SUBCLOUD_5 = {
     "name": "subcloud5",
     "region_name": "809581dc2d154e008480bac1f43b7aff",
     "rehomed": False,
-    "software_version": "21.12",
+    "software_version": "22.12",
 }
 SUBCLOUD_6 = {
     "name": "subcloud6",
     "region_name": "8c60b99f3e1245b7bc5a049802ade8d2",
     "rehomed": False,
-    "software_version": "22.12",
+    "software_version": "24.09",
 }
 SUBCLOUD_7 = {"name": "subcloud7", "region_name": "9fde6dca22fa422bb1e8cf03bedc18e4"}
 SUBCLOUD_8 = {"name": "subcloud8", "region_name": "f3cb0b109c4543fda3ed50ed5783279d"}

@@ -26,6 +26,7 @@ class FakeSubcloudObj(object):
     def __init__(self, subcloud_dict):
         self.name = subcloud_dict["name"]
         self.region_name = subcloud_dict["region_name"]
+        self.rehomed = subcloud_dict["rehomed"]
         self.software_version = subcloud_dict["software_version"]
@@ -225,6 +226,9 @@ class TestKubeRootcaUpdateAudit(base.DCManagerTestCase):
         self.mock_sysinv_client().get_kube_rootca_cert_id.return_value = (
             base.FakeException("API cert ID request failed")
         )
+        self.mock_fm_client().get_alarms_by_ids.side_effect = base.FakeException(
+            "get_alarms_by_ids failed"
+        )
 
         response = self.audit.subcloud_kube_rootca_audit(
             self.mock_sysinv_client(),
@@ -234,3 +238,36 @@ class TestKubeRootcaUpdateAudit(base.DCManagerTestCase):
         )
         self.assertEqual(response, None)
+
+    def test_kube_rootca_update_audit_method(self):
+        """Test if kube-rootca is auditing correctly based on alarm or cert_id"""
+        # Set the region one data
+        self.kube_rootca_cert_id.return_value = (
+            True,
+            FakeKubeRootcaData("cert1", ""),
+        )
+        kube_rootca_update_audit_data = self.get_rootca_audit_data()
+
+        subclouds = [base.SUBCLOUD_1, base.SUBCLOUD_2]
+        for subcloud_dict in subclouds:
+            subcloud = FakeSubcloudObj(subcloud_dict)
+            self.kube_rootca_cert_id.return_value = True, FakeKubeRootcaData(
+                "cert1", ""
+            )
+            self.mock_sysinv_client().get_kube_rootca_cert_id.return_value = (
+                True,
+                FakeKubeRootcaData("cert1", ""),
+            )
+            self.mock_fm_client().get_alarms_by_ids.return_value = None
+            self.audit.subcloud_kube_rootca_audit(
+                self.mock_sysinv_client(),
+                self.mock_fm_client(),
+                subcloud,
+                kube_rootca_update_audit_data,
+            )
+            if subcloud.rehomed:
+                self.mock_sysinv_client().get_kube_rootca_cert_id.assert_called()
+            else:
+                self.mock_fm_client().get_alarms_by_ids.assert_called()