Modify kube-rootca audit to alarm first
On previous installations, the subcloud kube-rootca certificate is different from the one on the system controller. Currently, the audit compares cert_ids and declares the endpoint out-of-sync if they don't match, which leaves the endpoint out-of-sync after an upgrade. This commit changes the audit logic to first audit by alarms so upgraded subclouds can remain in-sync. Audit by cert_id still happens, but only if the subcloud was rehomed. Additionally, the force parameter was re-introduced in kube-rootca update orchestration. Since, with the alarm-based audit, subclouds with different cert_ids can still report an in-sync status, the user might want to update the subcloud cert to match the system controller, so the force parameter is necessary to allow this. Note: Dcagent did not previously allow extra_args to be sent in the payload. To avoid breaking the audit against previous versions of dcagent by sending an unknown key in the payload (which would raise an error), extra_args are sent in a request header with the key "X-DCAGENT-HEADERS". Support for extra_args in the payload was added, but it can only be used once all supported dcagent versions have this option. Note: Due to the current issue that blocks upgrade testing, this commit did not test subcloud upgrade, but the scenario would follow a path similar to the second test case below, where updating a subcloud rootca to a cert different from the system controller's results in an in-sync endpoint status. Test plan: - PASS: Deploy a subcloud and verify kube-rootca_sync_status is in-sync. - PASS: Perform a kube-rootca update orchestration directly in the subcloud without passing a cert so it will auto-generate one and verify kube-rootca_sync_status is still in-sync. - PASS: Rehome the subcloud from the previous test and verify kube-rootca_sync_status is out-of-sync. - PASS: Perform a kube-rootca update orchestration using dcmanager in an out-of-sync subcloud providing system controller certs and verify the final sync status is in-sync. 
- PASS: Perform a kube-rootca update orchestration using dcmanager in an in-sync subcloud with the force parameter and without providing certs, and verify the final sync status is in-sync. - PASS: Install an N-1 release and verify kube-rootca_sync_status is in-sync. Closes-bug: 2092069 Change-Id: If0cc002d0d4970730771ae90d80dc50c7daf4d4c Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
This commit is contained in:
parent
ddb0c2451c
commit
e00c7223b5
@ -1257,6 +1257,7 @@ serviceUnavailable (503)
|
||||
- subcloud-apply-type: subcloud_apply_type
|
||||
- type: sw_update_strategy_type
|
||||
- upload-only: patch_strategy_upload_only
|
||||
- force: force_sync_status
|
||||
|
||||
Request Example
|
||||
----------------
|
||||
|
@ -308,6 +308,13 @@ force:
|
||||
in: body
|
||||
required: false
|
||||
type: boolean
|
||||
force_sync_status:
|
||||
description: |
|
||||
Indicates whether to disregard subcloud endpoint sync status
|
||||
or management alarm condition depending on strategy type.
|
||||
in: body
|
||||
required: false
|
||||
type: boolean
|
||||
group_id:
|
||||
description: |
|
||||
The ID of a subcloud group. Default is 1.
|
||||
|
@ -3,5 +3,6 @@
|
||||
"type": "patch",
|
||||
"stop-on-failure": "true",
|
||||
"max-parallel-subclouds": 2,
|
||||
"upload-only": "true"
|
||||
"upload-only": "true",
|
||||
"force": "true"
|
||||
}
|
||||
|
@ -43,7 +43,21 @@ class AuditController(object):
|
||||
if not payload:
|
||||
pecan.abort(http.client.BAD_REQUEST, _("Body required"))
|
||||
|
||||
LOG.debug(f"Payload sent by system controller: {payload}")
|
||||
# TODO(vgluzrom): Remove extra_args from header and keep it only in payload
|
||||
# once all supported dcagent versions have this possibility. If system
|
||||
# controller sends extra_args in payload to a dcagent that doesn't support it,
|
||||
# it will raise an UnsupportedAudit exception.
|
||||
try:
|
||||
headers = json.loads(request.headers.get("X-DCAGENT-HEADERS", "{}"))
|
||||
except ValueError:
|
||||
pecan.abort(http.client.BAD_REQUEST, _("Request headers decoding error"))
|
||||
|
||||
extra_args = payload.pop("extra_args", {})
|
||||
extra_args = {**extra_args, **headers}
|
||||
|
||||
LOG.debug(
|
||||
f"Payload sent by system controller: {payload}. Extra args: {extra_args}"
|
||||
)
|
||||
|
||||
try:
|
||||
# Delete "use_cache" from payload so it doesn't get passed as an audit
|
||||
@ -52,7 +66,7 @@ class AuditController(object):
|
||||
requested_audit = RequestedAudit(
|
||||
request_token=context.auth_token, use_cache=use_cache
|
||||
)
|
||||
return requested_audit.get_sync_status(payload)
|
||||
return requested_audit.get_sync_status(payload, extra_args)
|
||||
|
||||
except UnsupportedAudit as ex:
|
||||
LOG.exception(ex)
|
||||
|
@ -102,9 +102,16 @@ class PeriodicAudit(utils.BaseAuditManager):
|
||||
(get_subcloud_base_audit, lambda: [self.sysinv_client, self.fm_client]),
|
||||
(FirmwareAudit.get_subcloud_audit_data, lambda: [self.sysinv_client]),
|
||||
(KubernetesAudit.get_subcloud_audit_data, lambda: [self.sysinv_client]),
|
||||
# Need to call kube rootca function two times as it has a different
|
||||
# response if the subcloud was rehomed or not and we want to cache both
|
||||
# results
|
||||
(
|
||||
KubeRootcaUpdateAudit.get_subcloud_audit_data,
|
||||
lambda: [self.sysinv_client, self.fm_client],
|
||||
lambda: [self.sysinv_client, self.fm_client, False],
|
||||
),
|
||||
(
|
||||
KubeRootcaUpdateAudit.get_subcloud_audit_data,
|
||||
lambda: [self.sysinv_client, self.fm_client, True],
|
||||
),
|
||||
(SoftwareAudit.get_subcloud_audit_data, lambda: [self.software_client]),
|
||||
]
|
||||
@ -119,7 +126,7 @@ class RequestedAudit(utils.BaseAuditManager):
|
||||
self.request_token = request_token
|
||||
self.use_cache = use_cache
|
||||
|
||||
def get_single_audit_status(self, audit_type, regionone_audit_data):
|
||||
def get_single_audit_status(self, audit_type, regionone_audit_data, extra_args):
|
||||
# Since this runs in parallel, we need to initialize the clients
|
||||
# here to not use the same socket in every call
|
||||
sysinv_client, fm_client, software_client = self.initialize_clients(
|
||||
@ -139,8 +146,9 @@ class RequestedAudit(utils.BaseAuditManager):
|
||||
sysinv_client, regionone_audit_data
|
||||
)
|
||||
elif audit_type == dccommon_consts.KUBE_ROOTCA_AUDIT:
|
||||
rehomed = extra_args.get("rehomed", False)
|
||||
resp = KubeRootcaUpdateAudit.get_subcloud_sync_status(
|
||||
sysinv_client, fm_client, regionone_audit_data
|
||||
sysinv_client, fm_client, regionone_audit_data, rehomed
|
||||
)
|
||||
elif audit_type == dccommon_consts.KUBERNETES_AUDIT:
|
||||
resp = KubernetesAudit.get_subcloud_sync_status(
|
||||
@ -161,11 +169,16 @@ class RequestedAudit(utils.BaseAuditManager):
|
||||
raise exceptions.AuditStatusFailure(audit=audit_type)
|
||||
return audit_type, resp
|
||||
|
||||
def get_sync_status(self, payload):
|
||||
def get_sync_status(self, payload, extra_args):
|
||||
sync_resp = {}
|
||||
pool = GreenPool(size=10)
|
||||
jobs = [
|
||||
pool.spawn(self.get_single_audit_status, audit_type, regionone_audit_data)
|
||||
pool.spawn(
|
||||
self.get_single_audit_status,
|
||||
audit_type,
|
||||
regionone_audit_data,
|
||||
extra_args,
|
||||
)
|
||||
for audit_type, regionone_audit_data in payload.items()
|
||||
]
|
||||
|
||||
|
@ -13,7 +13,7 @@ from dccommon.drivers import base
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
DCAGENT_REST_DEFAULT_TIMEOUT = 900
|
||||
DCAGENT_REST_DEFAULT_TIMEOUT = 30
|
||||
|
||||
|
||||
class DcagentClient(base.DriverBase):
|
||||
@ -37,11 +37,17 @@ class DcagentClient(base.DriverBase):
|
||||
else:
|
||||
self.endpoint = endpoint
|
||||
|
||||
def audit(self, audit_data, timeout=DCAGENT_REST_DEFAULT_TIMEOUT):
|
||||
def audit(
|
||||
self,
|
||||
audit_data: dict,
|
||||
headers: dict = None,
|
||||
timeout: int = DCAGENT_REST_DEFAULT_TIMEOUT,
|
||||
):
|
||||
"""Audit subcloud"""
|
||||
url = self.endpoint + "/v1/dcaudit"
|
||||
headers = headers or {}
|
||||
response = self.session.patch(
|
||||
url, json=audit_data, timeout=timeout, raise_exc=False
|
||||
url, json=audit_data, headers=headers, timeout=timeout, raise_exc=False
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
|
@ -87,13 +87,17 @@ class KubeRootcaUpdateAudit(object):
|
||||
cls,
|
||||
sysinv_client: SysinvClient,
|
||||
fm_client: FmClient,
|
||||
rehomed: bool = False,
|
||||
subcloud_name: str = None,
|
||||
) -> tuple:
|
||||
skip_audit = 2 * [dccommon_consts.SKIP_AUDIT]
|
||||
if rehomed:
|
||||
try:
|
||||
success, subcloud_cert_data = sysinv_client.get_kube_rootca_cert_id()
|
||||
except Exception:
|
||||
msg = f"Failed to get Kubernetes root CA status, skip {AUDIT_TYPE} audit."
|
||||
msg = (
|
||||
f"Failed to get Kubernetes root CA status, skip {AUDIT_TYPE} audit."
|
||||
)
|
||||
log_subcloud_msg(LOG.exception, msg, subcloud_name)
|
||||
return skip_audit
|
||||
|
||||
@ -114,12 +118,13 @@ class KubeRootcaUpdateAudit(object):
|
||||
sysinv_client: SysinvClient,
|
||||
fm_client: FmClient,
|
||||
regionone_rootca_certid: str,
|
||||
rehomed: bool = False,
|
||||
subcloud_name: str = None,
|
||||
):
|
||||
"""Get the sync status of the subcloud's kube root CA cert."""
|
||||
|
||||
audit_method, subcloud_audit_data = cls.get_subcloud_audit_data(
|
||||
sysinv_client, fm_client, subcloud_name
|
||||
sysinv_client, fm_client, rehomed, subcloud_name
|
||||
)
|
||||
|
||||
sync_status = None
|
||||
@ -147,8 +152,10 @@ class KubeRootcaUpdateAudit(object):
|
||||
|
||||
The audit logic is as follows:
|
||||
No region one cert ID -> skip audit
|
||||
Subcloud doesn't have the API to get cert ID -> alarm based
|
||||
Subcloud has the API to get cert ID -> cert based
|
||||
Failure to get alarms or subcloud cert ID -> skip audit
|
||||
Subcloud was not rehomed -> alarm based
|
||||
Subcloud was rehomed and doesn't have the API to get cert ID -> alarm based
|
||||
Subcloud was rehomed and has the API to get cert ID -> cert based
|
||||
|
||||
:param sysinv_client: the sysinv client object
|
||||
:param fm_client: the fm client object
|
||||
@ -165,7 +172,11 @@ class KubeRootcaUpdateAudit(object):
|
||||
return dccommon_consts.SYNC_STATUS_IN_SYNC
|
||||
|
||||
sync_status = self.get_subcloud_sync_status(
|
||||
sysinv_client, fm_client, regionone_rootca_certid, subcloud.name
|
||||
sysinv_client,
|
||||
fm_client,
|
||||
regionone_rootca_certid,
|
||||
subcloud.rehomed,
|
||||
subcloud.name,
|
||||
)
|
||||
|
||||
if sync_status:
|
||||
|
@ -15,6 +15,7 @@
|
||||
#
|
||||
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
@ -357,6 +358,13 @@ class SubcloudAuditWorkerManager(manager.Manager):
|
||||
audit_payload["use_cache"] = use_cache
|
||||
return audit_payload
|
||||
|
||||
def _build_dcagent_request_headers(self, subcloud: models.Subcloud):
|
||||
dc_agent_headers = {}
|
||||
if subcloud.rehomed:
|
||||
dc_agent_headers["rehomed"] = subcloud.rehomed
|
||||
header = {"X-DCAGENT-HEADERS": json.dumps(dc_agent_headers)}
|
||||
return header
|
||||
|
||||
def _update_sw_sync_status_from_deploy_status(self, subcloud, audit_results):
|
||||
# If the subcloud deploy_status is in any of the following states,
|
||||
# the sync_status should be set to out-of-sync for software audit.
|
||||
@ -541,9 +549,10 @@ class SubcloudAuditWorkerManager(manager.Manager):
|
||||
do_software_audit,
|
||||
use_cache,
|
||||
)
|
||||
headers = self._build_dcagent_request_headers(subcloud)
|
||||
audit_results = {}
|
||||
try:
|
||||
audit_results = dcagent_client.audit(audit_payload)
|
||||
audit_results = dcagent_client.audit(audit_payload, headers)
|
||||
except Exception:
|
||||
LOG.exception(failmsg % (subcloud.name, "dcagent"))
|
||||
failures.append("dcagent")
|
||||
|
@ -248,6 +248,7 @@ class SwUpdateManager(manager.Manager):
|
||||
max_parallel_subclouds = int(max_parallel_subclouds_str)
|
||||
|
||||
stop_on_failure = payload.get("stop-on-failure") in ["true"]
|
||||
force = payload.get(consts.EXTRA_ARGS_FORCE) in ["true"]
|
||||
|
||||
# Has the user specified a specific subcloud?
|
||||
cloud_name = payload.get("cloud_name")
|
||||
@ -257,7 +258,6 @@ class SwUpdateManager(manager.Manager):
|
||||
# Has the user specified for_sw_deploy flag for prestage strategy?
|
||||
if strategy_type == consts.SW_UPDATE_TYPE_PRESTAGE:
|
||||
for_sw_deploy = payload.get(consts.PRESTAGE_FOR_SW_DEPLOY) in ["true"]
|
||||
force = payload.get(consts.EXTRA_ARGS_FORCE) in ["true"]
|
||||
|
||||
if cloud_name:
|
||||
# Make sure subcloud exists
|
||||
@ -282,7 +282,7 @@ class SwUpdateManager(manager.Manager):
|
||||
raise exceptions.BadRequest(resource="strategy", msg=str(ex))
|
||||
else:
|
||||
self.strategy_validators[strategy_type].validate_strategy_requirements(
|
||||
context, subcloud.id, subcloud.name
|
||||
context, subcloud.id, subcloud.name, force
|
||||
)
|
||||
|
||||
extra_args = None
|
||||
@ -328,7 +328,7 @@ class SwUpdateManager(manager.Manager):
|
||||
single_group.id if subcloud_group else None,
|
||||
cloud_name,
|
||||
self.strategy_validators[strategy_type].build_availability_status_filter(),
|
||||
self.strategy_validators[strategy_type].build_sync_status_filter(),
|
||||
self.strategy_validators[strategy_type].build_sync_status_filter(force),
|
||||
)
|
||||
|
||||
# TODO(rlima): move this step to validators
|
||||
|
@ -20,17 +20,24 @@ class StrategyValidationBase(object):
|
||||
"""Base class for strategy validation"""
|
||||
|
||||
def __init__(self):
|
||||
self.accepts_force = False
|
||||
self.endpoint_type = None
|
||||
|
||||
def validate_strategy_requirements(self, context, subcloud_id, subcloud_name):
|
||||
def validate_strategy_requirements(
|
||||
self, context, subcloud_id, subcloud_name, force=False
|
||||
):
|
||||
"""Validates the requirements for a strategy
|
||||
|
||||
:param context: request context object
|
||||
:param subcloud_id: subcloud's id
|
||||
:param subcloud_name: subcloud's name
|
||||
:param force: if the strategy should be forced to execute
|
||||
:raises BadRequest: if the requirements for the strategy are not met
|
||||
"""
|
||||
|
||||
if self.accepts_force and force:
|
||||
return
|
||||
|
||||
subcloud_status = db_api.subcloud_status_get(
|
||||
context, subcloud_id, self.endpoint_type
|
||||
)
|
||||
@ -64,9 +71,10 @@ class StrategyValidationBase(object):
|
||||
|
||||
return dccommon_consts.AVAILABILITY_ONLINE
|
||||
|
||||
def build_sync_status_filter(self):
|
||||
def build_sync_status_filter(self, force):
|
||||
"""Builds the sync status filter for valid subclouds
|
||||
|
||||
:param force: if the strategy should be forced to execute
|
||||
:return: sync status to filter
|
||||
:rtype: list
|
||||
"""
|
||||
|
@ -26,6 +26,7 @@ class KubeRootCaStrategyValidator(StrategyValidationBase):
|
||||
super().__init__()
|
||||
|
||||
self.endpoint_type = dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA
|
||||
self.accepts_force = True
|
||||
|
||||
def build_extra_args(self, payload):
|
||||
"""Builds the extra args for a strategy
|
||||
@ -40,3 +41,18 @@ class KubeRootCaStrategyValidator(StrategyValidationBase):
|
||||
consts.EXTRA_ARGS_SUBJECT: payload.get(consts.EXTRA_ARGS_SUBJECT),
|
||||
consts.EXTRA_ARGS_CERT_FILE: payload.get(consts.EXTRA_ARGS_CERT_FILE),
|
||||
}
|
||||
|
||||
def build_sync_status_filter(self, force):
|
||||
"""Builds the sync status filter for valid subclouds
|
||||
|
||||
:param force: if the strategy should be forced to execute
|
||||
:return: sync status to filter
|
||||
:rtype: list
|
||||
"""
|
||||
|
||||
if force:
|
||||
return [
|
||||
dccommon_consts.SYNC_STATUS_IN_SYNC,
|
||||
dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
|
||||
]
|
||||
return [dccommon_consts.SYNC_STATUS_OUT_OF_SYNC]
|
||||
|
@ -44,9 +44,10 @@ class PatchStrategyValidator(StrategyValidationBase):
|
||||
consts.EXTRA_ARGS_REMOVE: remove_bool,
|
||||
}
|
||||
|
||||
def build_sync_status_filter(self):
|
||||
def build_sync_status_filter(self, force):
|
||||
"""Builds the sync status filter for valid subclouds
|
||||
|
||||
:param force: if the strategy should be forced to execute
|
||||
:return: sync status to filter
|
||||
:rtype: list
|
||||
"""
|
||||
|
@ -30,7 +30,7 @@ class PrestageStrategyValidator(StrategyValidationBase):
|
||||
self.endpoint_type = dccommon_consts.AUDIT_TYPE_SOFTWARE
|
||||
|
||||
# TODO(rlima): move prestage validations here
|
||||
def build_sync_status_filter(self):
|
||||
def build_sync_status_filter(self, force):
|
||||
"""Builds the sync status filter for valid subclouds
|
||||
|
||||
:param force: if the strategy should be forced to execute
|
||||
|
@ -53,37 +53,37 @@ SUBCLOUD_1 = {
|
||||
"name": "subcloud1",
|
||||
"region_name": "2ec93dfb654846909efe61d1b39dd2ce",
|
||||
"rehomed": True,
|
||||
"software_version": "22.12",
|
||||
"software_version": "24.09",
|
||||
}
|
||||
SUBCLOUD_2 = {
|
||||
"name": "subcloud2",
|
||||
"region_name": "ca2761ee7aa34cbe8415ec9a3c86854f",
|
||||
"rehomed": True,
|
||||
"software_version": "22.12",
|
||||
"software_version": "24.09",
|
||||
}
|
||||
SUBCLOUD_3 = {
|
||||
"name": "subcloud3",
|
||||
"region_name": "659e12e5f7ad411abfcd83f5cedca0bf",
|
||||
"rehomed": True,
|
||||
"software_version": "21.12",
|
||||
"software_version": "22.12",
|
||||
}
|
||||
SUBCLOUD_4 = {
|
||||
"name": "subcloud4",
|
||||
"region_name": "c25f3b0553384104b664789bd93a2ba8",
|
||||
"rehomed": False,
|
||||
"software_version": "21.12",
|
||||
"software_version": "22.12",
|
||||
}
|
||||
SUBCLOUD_5 = {
|
||||
"name": "subcloud5",
|
||||
"region_name": "809581dc2d154e008480bac1f43b7aff",
|
||||
"rehomed": False,
|
||||
"software_version": "21.12",
|
||||
"software_version": "22.12",
|
||||
}
|
||||
SUBCLOUD_6 = {
|
||||
"name": "subcloud6",
|
||||
"region_name": "8c60b99f3e1245b7bc5a049802ade8d2",
|
||||
"rehomed": False,
|
||||
"software_version": "22.12",
|
||||
"software_version": "24.09",
|
||||
}
|
||||
SUBCLOUD_7 = {"name": "subcloud7", "region_name": "9fde6dca22fa422bb1e8cf03bedc18e4"}
|
||||
SUBCLOUD_8 = {"name": "subcloud8", "region_name": "f3cb0b109c4543fda3ed50ed5783279d"}
|
||||
|
@ -26,6 +26,7 @@ class FakeSubcloudObj(object):
|
||||
def __init__(self, subcloud_dict):
|
||||
self.name = subcloud_dict["name"]
|
||||
self.region_name = subcloud_dict["region_name"]
|
||||
self.rehomed = subcloud_dict["rehomed"]
|
||||
self.software_version = subcloud_dict["software_version"]
|
||||
|
||||
|
||||
@ -225,6 +226,9 @@ class TestKubeRootcaUpdateAudit(base.DCManagerTestCase):
|
||||
self.mock_sysinv_client().get_kube_rootca_cert_id.return_value = (
|
||||
base.FakeException("API cert ID request failed")
|
||||
)
|
||||
self.mock_fm_client().get_alarms_by_ids.side_effect = base.FakeException(
|
||||
"get_alarms_by_ids failed"
|
||||
)
|
||||
|
||||
response = self.audit.subcloud_kube_rootca_audit(
|
||||
self.mock_sysinv_client(),
|
||||
@ -234,3 +238,36 @@ class TestKubeRootcaUpdateAudit(base.DCManagerTestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(response, None)
|
||||
|
||||
def test_kube_rootca_update_audit_method(self):
|
||||
"""Test that kube-rootca audits by alarm or cert_id as appropriate"""
|
||||
# Set the region one data
|
||||
self.kube_rootca_cert_id.return_value = (
|
||||
True,
|
||||
FakeKubeRootcaData("cert1", ""),
|
||||
)
|
||||
kube_rootca_update_audit_data = self.get_rootca_audit_data()
|
||||
|
||||
subclouds = [base.SUBCLOUD_1, base.SUBCLOUD_2]
|
||||
for subcloud_dict in subclouds:
|
||||
subcloud = FakeSubcloudObj(subcloud_dict)
|
||||
|
||||
self.kube_rootca_cert_id.return_value = True, FakeKubeRootcaData(
|
||||
"cert1", ""
|
||||
)
|
||||
self.mock_sysinv_client().get_kube_rootca_cert_id.return_value = (
|
||||
True,
|
||||
FakeKubeRootcaData("cert1", ""),
|
||||
)
|
||||
self.mock_fm_client().get_alarms_by_ids.return_value = None
|
||||
|
||||
self.audit.subcloud_kube_rootca_audit(
|
||||
self.mock_sysinv_client(),
|
||||
self.mock_fm_client(),
|
||||
subcloud,
|
||||
kube_rootca_update_audit_data,
|
||||
)
|
||||
if subcloud.rehomed:
|
||||
self.mock_sysinv_client().get_kube_rootca_cert_id.assert_called()
|
||||
else:
|
||||
self.mock_fm_client().get_alarms_by_ids.assert_called()
|
||||
|
Loading…
x
Reference in New Issue
Block a user