Fix driver task pattern to reduce periodic db load
Previously, a pattern of periodic tasks existed where nodes, in many cases every node that was neither actively locked nor in a maintenance state, were pulled in by a periodic task. These periodic tasks would then create node tasks, which generated additional database queries in order to populate each task object. With the task object populated, the driver would evaluate whether the node's driver matched the driver interface in question and *then* evaluate whether work had to be performed. However, the field indicating whether work needed to be performed was often already available from the very first query, the one that generated the list of nodes to evaluate.

In essence, we've moved this check up in the sequence so the field in question is evaluated prior to creating the task, potentially across every conductor, depending on the query and ultimately on which drivers are enabled. This potentially saves hundreds of thousands of needless database queries per day on a medium-sized deployment, depending on which drivers and driver interfaces are in use.

Change-Id: I409e87de2808d442d39e4d0ae6e995668230cbba
parent fa1c60cbce
commit 4fc1abf91f
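To make the reordering concrete, here is a rough, self-contained sketch of the old and new loop shapes. The names are illustrative stand-ins, not Ironic's actual task_manager code: acquire_task() stands in for building a populated node task, which is what costs the extra database queries.

# Hypothetical sketch only; not Ironic code.
db_queries = 0

def acquire_task(node_uuid):
    """Stand-in for creating a populated node task (extra DB queries)."""
    global db_queries
    db_queries += 2  # e.g. re-fetching the node and related records
    return node_uuid

# The initial periodic query already returns driver_internal_info.
node_list = [('uuid-%d' % i, 'idrac', '', {}) for i in range(1000)]
node_list[7] = ('uuid-7', 'idrac', '', {'raid_config_job_ids': ['42']})

# Old pattern: build a task for every node, then check for work.
db_queries = 0
for (node_uuid, driver, group, dii) in node_list:
    task = acquire_task(node_uuid)
    if not dii.get('raid_config_job_ids'):
        continue  # the task object was created for nothing
print('old pattern:', db_queries, 'extra queries')  # 2000

# New pattern: check the already-fetched field before creating the task.
db_queries = 0
for (node_uuid, driver, group, dii) in node_list:
    if not dii.get('raid_config_job_ids'):
        continue  # skip with no further DB access
    task = acquire_task(node_uuid)
print('new pattern:', db_queries, 'extra queries')  # 2

With 1,000 unlocked nodes and one node with pending work, the old pattern pays the task-creation cost on every node each period, while the new pattern pays it once.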
@@ -169,6 +169,14 @@ class DracWSManBIOS(base.BIOSInterface):
         for (node_uuid, driver, conductor_group,
              driver_internal_info) in node_list:
             try:
+                # NOTE(TheJulia): Evaluate if work is actually required before
+                # creating a task for every node in the deployment which does
+                # not have a lock and is not in maintenance mode.
+                if (not driver_internal_info.get("bios_config_job_ids")
+                        and not driver_internal_info.get(
+                            "factory_reset_time_before_reboot")):
+                    continue
+
                 lock_purpose = 'checking async bios configuration jobs'
                 # Performing read-only/non-destructive work with shared lock
                 with task_manager.acquire(context, node_uuid,
@@ -502,6 +502,17 @@ class DracRedfishManagement(redfish_management.RedfishManagement):
         for (node_uuid, driver, conductor_group,
              driver_internal_info) in node_list:
             try:
+
+                task_monitor_url = driver_internal_info.get(
+                    'import_task_monitor_url')
+                # NOTE(TheJulia): Evaluate if a task monitor URL exists
+                # based upon our initial DB query before pulling a task for
+                # every node in the deployment, which reduces the overall
+                # number of DB queries triggering in the background where
+                # no work is required.
+                if not task_monitor_url:
+                    continue
+
                 lock_purpose = 'checking async import configuration task'
                 with task_manager.acquire(context, node_uuid,
                                           purpose=lock_purpose,
@@ -509,10 +520,6 @@ class DracRedfishManagement(redfish_management.RedfishManagement):
                     if not isinstance(task.driver.management,
                                       DracRedfishManagement):
                         continue
-                    task_monitor_url = driver_internal_info.get(
-                        'import_task_monitor_url')
-                    if not task_monitor_url:
-                        continue
                     self._check_import_configuration_task(
                         task, task_monitor_url)
             except exception.NodeNotFound:
@@ -1482,6 +1482,14 @@ class DracWSManRAID(base.RAIDInterface):
         for (node_uuid, driver, conductor_group,
              driver_internal_info) in node_list:
             try:
+
+                job_ids = driver_internal_info.get('raid_config_job_ids')
+                # NOTE(TheJulia): Evaluate if there is work to be done
+                # based upon the original DB query's results so we don't
+                # proceed creating tasks for every node in the deployment.
+                if not job_ids:
+                    continue
+
                 lock_purpose = 'checking async raid configuration jobs'
                 with task_manager.acquire(context, node_uuid,
                                           purpose=lock_purpose,
@@ -1489,10 +1497,6 @@ class DracWSManRAID(base.RAIDInterface):
                     if not isinstance(task.driver.raid, DracWSManRAID):
                         continue

-                    job_ids = driver_internal_info.get('raid_config_job_ids')
-                    if not job_ids:
-                        continue
-
                     self._check_node_raid_jobs(task)

             except exception.NodeNotFound:
@@ -434,6 +434,13 @@ class IRMCRAID(base.RAIDInterface):
         node_list = manager.iter_nodes(fields=fields, filters=filters)
         for (node_uuid, driver, conductor_group, raid_config) in node_list:
             try:
+                # NOTE(TheJulia): Evaluate based upon presence of raid
+                # configuration before triggering a task, as opposed to after,
+                # so we don't create excess node task objects with related
+                # DB queries.
+                if not raid_config or raid_config.get('fgi_status'):
+                    continue
+
                 lock_purpose = 'checking async RAID configuration tasks'
                 with task_manager.acquire(context, node_uuid,
                                           purpose=lock_purpose,
@@ -444,8 +451,6 @@ class IRMCRAID(base.RAIDInterface):
                         continue
                     if task.node.target_raid_config is None:
                         continue
-                    if not raid_config or raid_config.get('fgi_status'):
-                        continue
                     task.upgrade_lock()
                     if node.provision_state != states.CLEANWAIT:
                         continue
@@ -872,6 +872,15 @@ class RedfishManagement(base.ManagementInterface):
         for (node_uuid, driver, conductor_group,
              driver_internal_info) in node_list:
             try:
+                firmware_updates = driver_internal_info.get(
+                    'firmware_updates')
+                # NOTE(TheJulia): If we don't have an entry upfront, we can
+                # safely skip past the node as we know work here is not
+                # required, otherwise minimizing the number of potential
+                # nodes to visit.
+                if not firmware_updates:
+                    continue
+
                 lock_purpose = 'checking async firmware update failed.'
                 with task_manager.acquire(context, node_uuid,
                                           purpose=lock_purpose,
@@ -880,11 +889,6 @@ class RedfishManagement(base.ManagementInterface):
                                       RedfishManagement):
                         continue

-                    firmware_updates = driver_internal_info.get(
-                        'firmware_updates')
-                    if not firmware_updates:
-                        continue
-
                     node = task.node

                     # A firmware update failed. Discard any remaining firmware
@@ -921,6 +925,14 @@ class RedfishManagement(base.ManagementInterface):
         for (node_uuid, driver, conductor_group,
              driver_internal_info) in node_list:
             try:
+                firmware_updates = driver_internal_info.get(
+                    'firmware_updates')
+                # NOTE(TheJulia): Check and skip upfront before creating a
+                # task so we don't generate additional tasks and db queries
+                # for every node in CLEANWAIT which is not locked.
+                if not firmware_updates:
+                    continue
+
                 lock_purpose = 'checking async firmware update tasks.'
                 with task_manager.acquire(context, node_uuid,
                                           purpose=lock_purpose,
@@ -929,11 +941,6 @@ class RedfishManagement(base.ManagementInterface):
                                       RedfishManagement):
                         continue

-                    firmware_updates = driver_internal_info.get(
-                        'firmware_updates')
-                    if not firmware_updates:
-                        continue
-
                     self._check_node_firmware_update(task)

             except exception.NodeNotFound:
@@ -1032,6 +1032,15 @@ class RedfishRAID(base.RAIDInterface):
         for (node_uuid, driver, conductor_group,
              driver_internal_info) in node_list:
             try:
+                raid_configs = driver_internal_info.get(
+                    'raid_configs')
+                # NOTE(TheJulia): Evaluate the presence of raid configuration
+                # activity before pulling the task, so we don't needlessly
+                # create database queries with tasks which would be skipped
+                # anyhow.
+                if not raid_configs:
+                    continue
+
                 lock_purpose = 'checking async RAID config failed.'
                 with task_manager.acquire(context, node_uuid,
                                           purpose=lock_purpose,
@@ -1039,11 +1048,6 @@ class RedfishRAID(base.RAIDInterface):
                     if not isinstance(task.driver.raid, RedfishRAID):
                         continue

-                    raid_configs = driver_internal_info.get(
-                        'raid_configs')
-                    if not raid_configs:
-                        continue
-
                     node = task.node

                     # A RAID config failed. Discard any remaining RAID
@@ -1080,6 +1084,14 @@ class RedfishRAID(base.RAIDInterface):
         for (node_uuid, driver, conductor_group,
              driver_internal_info) in node_list:
             try:
+                raid_configs = driver_internal_info.get(
+                    'raid_configs')
+                # NOTE(TheJulia): Skip to next record if we do not
+                # have raid configuration tasks, so we don't pull tasks
+                # for every unrelated node in CLEANWAIT.
+                if not raid_configs:
+                    continue
+
                 lock_purpose = 'checking async RAID config tasks.'
                 with task_manager.acquire(context, node_uuid,
                                           purpose=lock_purpose,
@@ -1087,11 +1099,6 @@ class RedfishRAID(base.RAIDInterface):
                     if not isinstance(task.driver.raid, RedfishRAID):
                         continue

-                    raid_configs = driver_internal_info.get(
-                        'raid_configs')
-                    if not raid_configs:
-                        continue
-
                     self._check_node_raid_config(task)

             except exception.NodeNotFound:
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Fixes the pattern of execution for periodic tasks such that the majority
+    of drivers now evaluate *if* work needs to be performed in advance of
+    creating a node task. Depending on the individual driver query pattern,
+    this prevents excess database queries from being triggered with every
+    task execution.