From 036c80f31097f70f28b53a43cac378efe491adde Mon Sep 17 00:00:00 2001 From: Enzo Candotti Date: Thu, 25 Sep 2025 14:36:22 -0300 Subject: [PATCH] Update SEL Events codes to avoid collisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, factory-install and enrollment are using the OEM-specific sensor type ID F0h, which falls within the reserved range (C0h–FFh). On Supermicro BMCs, this value collides with an existing OEM sensor, causing the custom events to be misinterpreted as BIOS OEM memory errors. This change updates the SEL events to use sensor type ID 12h (System Event), which is a standard value and should be interpreted consistently across different BMC implementations. This avoids collisions with vendor-specific OEM ranges and ensures correct event handling. In addition, the Factory Setup Complete and Failed events were moved from the enroll-init-reconfigure script to the run-cloud-init-from-seed.sh script. This fixes an issue in which the monitoring script times out if factory install was not completed, since run-cloud-init-from-seed.sh exits when the completion flag is not present, preventing enroll-init-reconfigure from running. Test Plan: PASS: Build a seed ISO including these changes. Run a factory install that completes successfully and verify that each stage sends the success event with the expected event data value. PASS: Run a factory install forcing a failure in one stage. Repeat for all stages and verify that the failed stage sends a failure event with the corresponding event data value. PASS: Run an enrollment and verify that each stage sends the success event with the expected event data value. Repeat, inducing a failure for each stage, and verify that the events correspond to the expected event data value. PASS: After the previous tests, verify that these events are not detected as errors by the Supermicro BMC. 
PASS: Run an enrollment with IPMI monitoring enabled when the factory install has not completed yet. Verify that the failure event sent by the run-cloud-init-from-seed.sh script is detected and that the failure message is logged in the System Controller. Story: 2011455 Task: 52855 Change-Id: I08ccc43f949f47a3dc36ae67740ece3b9a37a6fb Signed-off-by: Enzo Candotti --- .../systemd/utils/send-factory-sel-event | 22 ++++---- .../seed-config/run-cloud-init-from-seed.sh | 34 ++++++++++- .../scripts/enroll-init-reconfigure | 56 +++++++------------ 3 files changed, 64 insertions(+), 48 deletions(-) diff --git a/tools/nocloud-factory-install/factory-install/systemd/utils/send-factory-sel-event b/tools/nocloud-factory-install/factory-install/systemd/utils/send-factory-sel-event index 7c196577..eedbf82d 100644 --- a/tools/nocloud-factory-install/factory-install/systemd/utils/send-factory-sel-event +++ b/tools/nocloud-factory-install/factory-install/systemd/utils/send-factory-sel-event @@ -11,17 +11,17 @@ log_info() { echo "$(date '+%F %H:%M:%S') INFO: $*"; } log_warn() { echo "$(date '+%F %H:%M:%S') WARN: $*"; } declare -Ar CODES=( - [bootstrap.ok]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xd1" - [bootstrap.err]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xd2" - [config.ok]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xd3" - [config.err]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xd4" - [setup.ok]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xd5" - [setup.err]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xd6" - [tests.ok]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xd7" - [tests.err]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xd8" - [backup.ok]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xd9" - [backup.err]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xda" - [finished.ok]="0x04 0xF0 0x01 0x6f 0xff 0xff 0xe0" + [bootstrap.ok]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF6" + [bootstrap.err]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF7" + [config.ok]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF8" + [config.err]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF9" + [setup.ok]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xFA" + [setup.err]="0x04 0x12 0xCC 
0x63 0xCC 0x10 0xFB" + [tests.ok]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xFC" + [tests.err]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xFD" + [backup.ok]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xFE" + [backup.err]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xFF" + [finished.ok]="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE0" ) stage="${1:-}"; status="${2:-}" diff --git a/tools/nocloud-factory-install/seed-config/run-cloud-init-from-seed.sh b/tools/nocloud-factory-install/seed-config/run-cloud-init-from-seed.sh index d904aab3..de9bd569 100755 --- a/tools/nocloud-factory-install/seed-config/run-cloud-init-from-seed.sh +++ b/tools/nocloud-factory-install/seed-config/run-cloud-init-from-seed.sh @@ -17,6 +17,10 @@ SEED_SERVICE="/etc/systemd/system/cloud-init-seed.service" SEED_NETWORK_CFG="network-config" NETWORK_CFG_FILE="/run/.$SEED_NETWORK_CFG" CLOUD_INIT_IF_FILE="/etc/network/interfaces.d/50-cloud-init" +readonly EVENT_FACTORY_SETUP_COMPLETE="factory_setup_complete" +readonly EVENT_FACTORY_SETUP_FAILED="factory_setup_failed" +readonly DATA_FACTORY_SETUP_COMPLETE="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE0 # \"Factory Setup Complete\"" +readonly DATA_FACTORY_SETUP_FAILED="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE1 # \"Factory Setup Failed\"" function check_rc_die { local -i rc=${1} @@ -59,6 +63,32 @@ flock -n 200 || { exit 0 } +function send_ipmi_event { + local event_type="$1" + local event_data + case "$event_type" in + "$EVENT_FACTORY_SETUP_COMPLETE") event_data="$DATA_FACTORY_SETUP_COMPLETE" ;; + "$EVENT_FACTORY_SETUP_FAILED") event_data="$DATA_FACTORY_SETUP_FAILED" ;; + *) + log_warn "Unknown IPMI event type: $event_type" + return 1 + ;; + esac + + local temp_file=$(mktemp /tmp/ipmi_event_XXXXXX.txt) + echo "$event_data" > "$temp_file" + + if ipmitool sel add "$temp_file" 2>/dev/null; then + log_info "IPMI event sent successfully: $event_type" + rm -f "$temp_file" + return 0 + else + log_warn "Failed to send IPMI event: $event_type" + rm -f "$temp_file" + return 1 + fi +} + # If clean is passed as an argument, remove the udev 
rule and service, # the custom cloud.cfg file, and the script itself. # This is to ensure that the cloud-init-seed service is not triggered @@ -78,8 +108,10 @@ log_info "Starting cloud-init using seed ISO..." # Checks if factory-install has been completed. This is required to be able # to run cloud-init from a seed ISO. if [[ ! -f "$FACTORY_INSTALL_COMPLETE_FILE" ]]; then - log_fatal "Cloud-init from factory-install has not been completed yet. Exiting." + send_ipmi_event "$EVENT_FACTORY_SETUP_FAILED" + log_fatal "/var/lib/factory-install/stage/complete does not exist. Ensure factory-install was successful." fi +send_ipmi_event "$EVENT_FACTORY_SETUP_COMPLETE" # Finds the first device found with the label CIDATA or cidata. # If the device is not found, exit the script. diff --git a/utilities/platform-util/scripts/enroll-init-reconfigure b/utilities/platform-util/scripts/enroll-init-reconfigure index 4fbb3546..eac4d62e 100755 --- a/utilities/platform-util/scripts/enroll-init-reconfigure +++ b/utilities/platform-util/scripts/enroll-init-reconfigure @@ -72,36 +72,36 @@ readonly EVENT_PLATFORM_CLOUDINIT_UPDATE_COMPLETE="platform_cloudinit_update_com readonly EVENT_PLATFORM_CLOUDINIT_UPDATE_FAILED="platform_cloudinit_update_failed" # IPMI payloads (data bytes) -readonly DATA_FACTORY_SETUP_COMPLETE="0x04 0xF0 0x01 0x6f 0xff 0xff 0xe0 # \"Factory Setup Complete\"" -readonly DATA_FACTORY_SETUP_FAILED="0x04 0xF0 0x01 0x6f 0xff 0xff 0xe1 # \"Factory Setup Failed\"" +readonly DATA_FACTORY_SETUP_COMPLETE="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE0 # \"Factory Setup Complete\"" +readonly DATA_FACTORY_SETUP_FAILED="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE1 # \"Factory Setup Failed\"" -readonly DATA_APISERVER_CERT_OK="0x04 0xF0 0x01 0x6f 0xff 0xff 0xe2 # \"ApiServer Cert Valid\"" -readonly DATA_LEAF_CERTS_RENEW_FAIL="0x04 0xF0 0x01 0x6f 0xff 0xff 0xe3 # \"K8S Leaf Certs Renew Failed\"" -readonly DATA_RENEW_FAIL_PODS="0x04 0xF0 0x01 0x6f 0xff 0xff 0xe8 # \"Pods Cert Renew Failed\"" -readonly 
DATA_CERTMANAGER_CERTS_FAIL="0x04 0xF0 0x01 0x6f 0xff 0xff 0xef # \"Cert-manager Secrets Renew Failed\"" -readonly DATA_RENEW_FAIL_KUBECTL="0x04 0xF0 0x01 0x6f 0xff 0xff 0xf0 # \"Kubectl Cert Renew Failed\"" +readonly DATA_APISERVER_CERT_OK="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE2 # \"ApiServer Cert Valid\"" +readonly DATA_LEAF_CERTS_RENEW_FAIL="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE3 # \"K8S Leaf Certs Renew Failed\"" +readonly DATA_RENEW_FAIL_PODS="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE4 # \"Pods Cert Renew Failed\"" +readonly DATA_CERTMANAGER_CERTS_FAIL="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE5 # \"Cert-manager Secrets Renew Failed\"" +readonly DATA_RENEW_FAIL_KUBECTL="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE6 # \"Kubectl Cert Renew Failed\"" -readonly DATA_MANUAL_CAS_OK="0x04 0xF0 0x01 0x6f 0xff 0xff 0xe9 # \"Manual CA Certs Valid\"" -readonly DATA_MANUAL_CA_K8S_FP_FAIL="0x04 0xF0 0x01 0x6f 0xff 0xff 0xec # \"K8S/Front-proxy Cert Expired\"" -readonly DATA_MANUAL_CA_ETCD_FAIL="0x04 0xF0 0x01 0x6f 0xff 0xff 0xed # \"ETCD CA Cert Expired\"" -readonly DATA_MANUAL_CAS_BOTH_FAIL="0x04 0xF0 0x01 0x6f 0xff 0xff 0xee # \"ETCD CA and K8S/Front-proxy Certs Expired\"" +readonly DATA_MANUAL_CAS_OK="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE7 # \"Manual CA Certs Valid\"" +readonly DATA_MANUAL_CA_K8S_FP_FAIL="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE8 # \"K8S/Front-proxy Cert Expired\"" +readonly DATA_MANUAL_CA_ETCD_FAIL="0x04 0x12 0xCC 0x63 0xCC 0x10 0xE9 # \"ETCD CA Cert Expired\"" +readonly DATA_MANUAL_CAS_BOTH_FAIL="0x04 0x12 0xCC 0x63 0xCC 0x10 0xEA # \"ETCD CA and K8S/Front-proxy Certs Expired\"" -readonly DATA_SLOCAL_CA_OK="0x04 0xF0 0x01 0x6f 0xff 0xff 0xea # \"System-local-ca Cert Valid\"" -readonly DATA_SLOCAL_CA_FAIL="0x04 0xF0 0x01 0x6f 0xff 0xff 0xeb # \"System-local-ca Cert Expired\"" +readonly DATA_SLOCAL_CA_OK="0x04 0x12 0xCC 0x63 0xCC 0x10 0xEB # \"System-local-ca Cert Valid\"" +readonly DATA_SLOCAL_CA_FAIL="0x04 0x12 0xCC 0x63 0xCC 0x10 0xEC # \"System-local-ca Cert Expired\"" -readonly 
DATA_PLATFORM_CLOUDINIT_UPDATE_COMPLETE="0x04 0xF0 0x01 0x6f 0xff 0xff 0xe4 # \"Platform Cloud-init Update Complete\"" -readonly DATA_PLATFORM_CLOUDINIT_UPDATE_FAILED="0x04 0xF0 0x01 0x6f 0xff 0xff 0xe5 # \"Platform Cloud-init Update Failed\"" +readonly DATA_PLATFORM_CLOUDINIT_UPDATE_COMPLETE="0x04 0x12 0xCC 0x63 0xCC 0x10 0xED # \"Platform Cloud-init Update Complete\"" +readonly DATA_PLATFORM_CLOUDINIT_UPDATE_FAILED="0x04 0x12 0xCC 0x63 0xCC 0x10 0xEE # \"Platform Cloud-init Update Failed\"" # IPMI SEL event format reference: # [EvM Revision] [Sensor Type] [Sensor Number] [Event Dir / Event Type Code] # [Event Data 1] [Event Data 2] [Event Data 3] # -# Example: 0x04 0xF0 0x01 0x6f 0xff 0xff 0xe4 +# Example: 0x04 0x12 0xCC 0x63 0xCC 0x10 0xE0 # 0x04 = EvM Revision (IPMI v2.0) -# 0xF0 = Sensor Type (vendor-defined / OEM-specific) -# 0x01 = Sensor Number (firmware-defined) -# 0x6f = Event direction (vendor-specific encoding) -# 0xff 0xff 0xe4 = Event Data bytes (3 bytes, OEM-specific payload) +# 0x12 = Sensor Type (System Event, a standard IPMI sensor type) +# 0xCC = Sensor Number (firmware-defined) +# 0x63 = Event direction (vendor-specific encoding) +# 0xCC 0x10 0xE0 = Event Data bytes (3 bytes, OEM-specific payload) # # For our usage: # - The third byte (Sensor Number) is set to 0x01, corresponding to a sensor type "Unknown". @@ -155,18 +155,6 @@ function send_ipmi_event { fi } -function verify_factory_install { - log_info "Checking factory-install..." - - if [ ! -f /var/lib/factory-install/stage/final ]; then - send_ipmi_event "$EVENT_FACTORY_SETUP_FAILED" - log_fatal "/var/lib/factory-install/stage/final does not exist. Ensure factory-install was successful." - fi - - send_ipmi_event "$EVENT_FACTORY_SETUP_COMPLETE" - log_info "factory-install check successful." -} - # The enroll-init reconfigure script runs during startup via cloud-init while # system services may not be settled. This timing can lead to intermittent errors # for early system commands. 
This function is used to mitigate these problems by @@ -639,11 +627,7 @@ while [[ "$#" -gt 0 ]]; do esac done -# This script can only be run if the factory install is complete, so we check that first. -# It's important that we fail due to an invalid factory install before any other -# type of failure, as that's the IPMI SEL event the system controller monitors first. # Main execution flow -verify_factory_install # Ensure all required arguments are provided if [ -z "$OAM_SUBNET" ] || [ -z "$OAM_GATEWAY_IP" ] || [ -z "$OAM_IP" ] || [ -z "$NEW_PASSWORD" ]; then