From 86d3e8632db1e5682f25a02cac9df6f28cb38d27 Mon Sep 17 00:00:00 2001 From: Gustavo Herzmann Date: Thu, 31 Jul 2025 11:05:31 -0300 Subject: [PATCH] Rework IPMI SEL monitoring to remove delays during enrollment This commit updates the IPMI SEL monitoring script and the 99-completion-event enrollment script to eliminate the required delays. The monitoring script now supports a parameter to retrieve the last SEL event ID and another to specify the initial ID to use during monitoring. With these changes, the playbook can determine the initial ID before mounting the seed ISO and pass it to the monitoring task, removing the need to sleep between each enrollment stage. Test Plan: 01. PASS - Run subcloud enrollment and verify that the custom cloud-init complete IPMI SEL event is sent and that the enroll-init completes successfully. 02. PASS - Add a failing custom script before 99-completion-event, verify the completion script does not execute due to run-parts --exit-on-error. 03. PASS - Set ipmi_sel_event_monitoring to false inside the install values and run the subcloud enrollment. Verify that the 99-completion-event script is not created. 04. PASS - Run subcloud enrollment and verify that the last event is successfully obtained, and the consequent events are received correctly. Story: 2011455 Task: 52656 Change-Id: I14917188623dc93ceefa3a77cdd1ee4f8b17ca9d Signed-off-by: Gustavo Herzmann --- .../dccommon/subcloud_enrollment.py | 5 - .../scripts/ipmi_sel_event_monitor.py | 128 ++++++++++++++---- 2 files changed, 101 insertions(+), 32 deletions(-) diff --git a/distributedcloud/dccommon/subcloud_enrollment.py b/distributedcloud/dccommon/subcloud_enrollment.py index 3e1b4c237..19216b426 100644 --- a/distributedcloud/dccommon/subcloud_enrollment.py +++ b/distributedcloud/dccommon/subcloud_enrollment.py @@ -198,15 +198,10 @@ class SubcloudEnrollmentInit(object): enroll_overrides = iso_values["install_values"].get("enroll_overrides", {}) if enroll_overrides.get("ipmi_sel_event_monitoring", True) is not False: completion_script = os.path.join(scripts_dir, "99-completion-event") - # The sleep is necessary so that the system controller has time to - # start monitoring the custom cloud-init update event. If another - # event is sent too soon, the system controller would not be able to - # detect it. with open(completion_script, "w") as f: f.write( """#!/bin/bash echo "$(date '+%F %H:%M:%S'): INFO: All custom scripts completed successfully" - sleep 60s tmp_file=$(mktemp /tmp/ipmi_event_XXXXXX.txt) echo "0x04 0xF0 0x01 0x6f 0xff 0xff 0xe6 # \"Custom complete\"" > "$tmp_file" ipmitool sel add "$tmp_file" 2>/dev/null diff --git a/distributedcloud/scripts/ipmi_sel_event_monitor.py b/distributedcloud/scripts/ipmi_sel_event_monitor.py index 6308cb556..4df108cc3 100644 --- a/distributedcloud/scripts/ipmi_sel_event_monitor.py +++ b/distributedcloud/scripts/ipmi_sel_event_monitor.py @@ -7,14 +7,17 @@ """ Monitors the IPMI System Event Log (SEL) for a specific target event and -associated event data. +associated event data, or retrieves the last event ID. -The script periodically checks the SEL for the desired event. If the event -is found, the script returns a return code of 0 and the matched event in -json format. If the event is not found after the maximum number of checks, -a non-zero return code is returned. +The script operates in two modes. In monitoring mode, it periodically checks +the SEL for the desired event. If the event is found, the script returns a +return code of 0 and the matched event in json format. If the event is not +found after the maximum number of checks, a non-zero return code is returned. +It only monitors new events, existing events are ignored unless --initial-event-id +is specified to start monitoring from a specific point. -It only monitors new events, existing events are ignored. +Alternatively, using --get-last-event makes the script simply return the ID of +the most recent event in the SEL and exit. If the SEL is empty, it returns -1. """ import argparse @@ -168,22 +171,42 @@ class IpmiTool: return None +def get_last_event_only(ipmi_tool: IpmiTool) -> None: + """Get the last event ID and exit""" + last_event_id = ipmi_tool.get_last_event_id() + if last_event_id is not None: + message = f"Last event ID: {last_event_id}" + result = {"success": True, "message": message, "last_event_id": last_event_id} + else: + # SEL is empty, return -1 as the last event ID + message = "SEL is empty, returning -1 as last event ID" + result = {"success": True, "message": message, "last_event_id": -1} + + print(json.dumps(result)) + sys.exit(0) + + def monitor_events( ipmi_tool: IpmiTool, target_pattern: str, event_data_values: list[str], interval: float, timeout: int, + initial_event_id: Optional[int] = None, ) -> tuple[bool, str, Optional[str]]: """Monitor IPMI SEL for target events""" - last_event_id = ipmi_tool.get_last_event_id() - if not last_event_id: - message = "Failed to get initial event ID, SEL might be empty" - logging.warning(message) - # If the SEL is empty, we set the starting event ID to -1 because - # the first event will always be >= 0 - last_event_id = -1 + if initial_event_id is not None: + last_event_id = initial_event_id + logging.info(f"Using provided initial event ID: {last_event_id}") + else: + last_event_id = ipmi_tool.get_last_event_id() + if last_event_id is None: + message = "Failed to get initial event ID, SEL might be empty" + logging.warning(message) + # If the SEL is empty, we set the starting event ID to -1 because + # the first event will always be >= 0 + last_event_id = -1 logging.info(f"Starting monitoring from event ID: {last_event_id}") logging.info( @@ -278,20 +301,59 @@ def monitor_events( def main(): - parser = argparse.ArgumentParser(description="Monitor IPMI SEL for target events") + parser = argparse.ArgumentParser( + description=( + "Monitor new IPMI SEL entries for a pattern + data (monitoring mode) " + "or quickly query the current last event id (query mode)." + ), + epilog="""\ +Examples: + + 1) Query mode: get the last SEL event id (or -1 if empty) and exit. + ipmi_sel_event_monitor.py \\ + --config-file rvmc-config.yaml \\ + --get-last-event + + 2) Monitoring mode: wait for a specific pattern + data up to 5 minutes, + starting after event id 10. Checks every 15s for new events. On match, + exits 0 and prints JSON with "matched_data"; on timeout, exits non-zero. + ipmi_sel_event_monitor.py \\ + --config-file rvmc-config.yaml \\ + --pattern "Unknown #0x01 | | Asserted" \\ + --data-values "ffffe6,ffffe7" \\ + --interval 15 \\ + --timeout 300 \\ + --initial-event-id 10 +""", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) parser.add_argument( "--config-file", required=True, help="Path to BMC configuration file containing host, username, and password", ) parser.add_argument( - "--pattern", required=True, help="Target event pattern to match" + "--get-last-event", + action="store_true", + help="Get the last event ID and exit (ignores other monitoring parameters)", + ) + parser.add_argument( + "--pattern", help="Target event pattern to match (required for monitoring mode)" ) - parser.add_argument( "--data-values", - required=True, - help="Comma-separated list of event data values to match", + help=( + "Comma-separated list of event data values to match " + "(required for monitoring mode)" + ), + ) + parser.add_argument( + "--initial-event-id", + type=int, + help=( + "Initial event ID to start monitoring from " + "(only monitor events after this ID)" + ), ) parser.add_argument( "--interval", type=int, default=30, help="Check interval in seconds" @@ -300,19 +362,31 @@ def main(): args = parser.parse_args() - event_data_values = [d.strip() for d in args.data_values.split(",")] + # Validate arguments for monitoring mode + if not args.get_last_event: + if not args.pattern: + parser.error("--pattern is required when not using --get-last-event") + if not args.data_values: + parser.error("--data-values is required when not using --get-last-event") + try: ipmi_tool = IpmiTool.from_config(args.config_file) - success, message, matched_data = monitor_events( - ipmi_tool, - args.pattern, - event_data_values, - args.interval, - args.timeout, - ) + if args.get_last_event: + get_last_event_only(ipmi_tool) + else: + event_data_values = [d.strip() for d in args.data_values.split(",")] - exit_script(success, message, matched_data) + success, message, matched_data = monitor_events( + ipmi_tool, + args.pattern, + event_data_values, + args.interval, + args.timeout, + args.initial_event_id, + ) + + exit_script(success, message, matched_data) except Exception as e: exit_script(