From b64f70d83424b7738f0bbe2331cca40e5d30c95a Mon Sep 17 00:00:00 2001
From: rabi
Date: Mon, 14 Jul 2025 18:02:56 +0530
Subject: [PATCH] Fix instance_group/autoscaling_group intermittent test
 failures

With convergence there is a chance that some leaf resources won't be
processed by workers if the stack is marked as failed due to the
failure of one resource and the stack traversal is set to an empty
string, i.e. the traversal is cancelled.

Also uses TestResource to simplify the tests.

Change-Id: I1a04853d42f519d9a14dd345ac8cb441b08c4d77
Signed-off-by: rabi
---
 devstack/upgrade/resources.sh                 |  5 +-
 .../functional/test_autoscaling.py            | 51 +++++++++----------
 .../functional/test_instance_group.py         | 51 +++++++++----------
 3 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/devstack/upgrade/resources.sh b/devstack/upgrade/resources.sh
index ad78c0becc..33d937e02f 100755
--- a/devstack/upgrade/resources.sh
+++ b/devstack/upgrade/resources.sh
@@ -103,7 +103,10 @@ function _run_heat_integrationtests {
 function create {
     if [ "${RUN_HEAT_INTEGRATION_TESTS}" == "True" ]; then
         # run heat integration tests instead of tempest smoke before create
-        _run_heat_integrationtests $BASE_DEVSTACK_DIR
+        # TODO(ramishra) switch to run tests from $BASE_DEVSTACK_DIR once
+        # https://review.opendev.org/c/openstack/heat/+/954938
+        # has been backported
+        _run_heat_integrationtests $TARGET_DEVSTACK_DIR
     fi
 
     source $TOP_DIR/openrc admin admin
diff --git a/heat_integrationtests/functional/test_autoscaling.py b/heat_integrationtests/functional/test_autoscaling.py
index 46da2a86e0..2b34a2b248 100644
--- a/heat_integrationtests/functional/test_autoscaling.py
+++ b/heat_integrationtests/functional/test_autoscaling.py
@@ -77,11 +77,9 @@ parameters:
 
 resources:
   random1:
-    type: OS::Heat::RandomString
-    properties:
-      salt: {get_param: UserData}
+    type: OS::Heat::TestResource
 outputs:
-  PublicIp: {value: {get_attr: [random1, value]}}
+  PublicIp: {value: {get_attr: [random1, output]}}
   AvailabilityZone: {value: 'not-used11'}
   PrivateDnsName: {value: 'not-used12'}
   PublicDnsName: {value: 'not-used13'}
@@ -100,18 +98,12 @@ parameters:
 
 resources:
   random1:
-    type: OS::Heat::RandomString
-    depends_on: waiter
-  ready_poster:
-    type: AWS::CloudFormation::WaitConditionHandle
-  waiter:
-    type: AWS::CloudFormation::WaitCondition
+    type: OS::Heat::TestResource
     properties:
-      Handle: {get_resource: ready_poster}
-      Timeout: 1
+      fail: true
 outputs:
   PublicIp:
-    value: {get_attr: [random1, value]}
+    value: {get_attr: [random1, output]}
 '''
 
     def setUp(self):
@@ -252,7 +244,16 @@ class AutoscalingGroupBasicTest(AutoscalingGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        self._assert_instance_state(nested_ident, 0, 2)
+        # Check that at least one resource is in *_FAILED, as there
+        # is a chance that, before other leaf resources are processed,
+        # the stack is marked as failed and the traversal is set to an
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
+        for res in self.client.resources.list(nested_ident):
+            if res.resource_status.endswith('CREATE_FAILED'):
+                break
+        else:
+            self.fail('No resource in CREATE_FAILED')
 
     def test_update_instance_error_causes_group_error(self):
         """Test update failing a resource in the instance group.
@@ -281,8 +282,6 @@ class AutoscalingGroupBasicTest(AutoscalingGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
         self._assert_instance_state(nested_ident, 2, 0)
-        initial_list = [res.resource_name
-                        for res in self.client.resources.list(nested_ident)]
 
         env['parameters']['size'] = 3
         files2 = {'provider.yaml': self.bad_instance_template}
@@ -296,20 +295,18 @@ class AutoscalingGroupBasicTest(AutoscalingGroupTest):
         )
         self._wait_for_stack_status(stack_identifier, 'UPDATE_FAILED')
 
-        # assert that there are 3 bad instances
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-
-        # 2 resources should be in update failed, and one create failed.
+        # Check that at least one resource is in *_FAILED, as there
+        # is a chance that, before other leaf resources are processed,
+        # the stack is marked as failed and the traversal is set to an
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
         for res in self.client.resources.list(nested_ident):
-            if res.resource_name in initial_list:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'UPDATE_FAILED')
-            else:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'CREATE_FAILED')
+            if res.resource_status.endswith('_FAILED'):
+                break
+        else:
+            self.fail('No resource in *_FAILED')
 
     def test_group_suspend_resume(self):
diff --git a/heat_integrationtests/functional/test_instance_group.py b/heat_integrationtests/functional/test_instance_group.py
index 44b3aa95f0..0ae8800989 100644
--- a/heat_integrationtests/functional/test_instance_group.py
+++ b/heat_integrationtests/functional/test_instance_group.py
@@ -70,12 +70,10 @@ parameters:
 
 resources:
   random1:
-    type: OS::Heat::RandomString
-    properties:
-      salt: {get_param: UserData}
+    type: OS::Heat::TestResource
 outputs:
   PublicIp:
-    value: {get_attr: [random1, value]}
+    value: {get_attr: [random1, output]}
 '''
 
 # This is designed to fail.
@@ -90,18 +88,12 @@ parameters:
 
 resources:
   random1:
-    type: OS::Heat::RandomString
-    depends_on: waiter
-  ready_poster:
-    type: AWS::CloudFormation::WaitConditionHandle
-  waiter:
-    type: AWS::CloudFormation::WaitCondition
+    type: OS::Heat::TestResource
     properties:
-      Handle: {Ref: ready_poster}
-      Timeout: 1
+      fail: true
 outputs:
   PublicIp:
-    value: {get_attr: [random1, value]}
+    value: {get_attr: [random1, output]}
 '''
 
     def setUp(self):
@@ -242,7 +234,16 @@ class InstanceGroupBasicTest(InstanceGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        self._assert_instance_state(nested_ident, 0, 2)
+        # Check that at least one resource is in *_FAILED, as there
+        # is a chance that, before other leaf resources are processed,
+        # the stack is marked as failed and the traversal is set to an
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
+        for res in self.client.resources.list(nested_ident):
+            if res.resource_status.endswith('CREATE_FAILED'):
+                break
+        else:
+            self.fail('No resource in CREATE_FAILED')
 
     def test_update_instance_error_causes_group_error(self):
         """Test update failing a resource in the instance group.
@@ -271,8 +272,6 @@ class InstanceGroupBasicTest(InstanceGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
         self._assert_instance_state(nested_ident, 2, 0)
-        initial_list = [res.resource_name
-                        for res in self.client.resources.list(nested_ident)]
 
         env['parameters']['size'] = 3
         files2 = {'provider.yaml': self.bad_instance_template}
@@ -285,20 +284,18 @@ class InstanceGroupBasicTest(InstanceGroupTest):
             environment=env
         )
         self._wait_for_stack_status(stack_identifier, 'UPDATE_FAILED')
-
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        # assert that there are 3 bad instances
-        # 2 resources should be in update failed, and one create failed.
+        # Check that at least one resource is in *_FAILED, as there
+        # is a chance that, before other leaf resources are processed,
+        # the stack is marked as failed and the traversal is set to an
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
         for res in self.client.resources.list(nested_ident):
-            if res.resource_name in initial_list:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'UPDATE_FAILED')
-            else:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'CREATE_FAILED')
+            if res.resource_status.endswith('_FAILED'):
+                break
+        else:
+            self.fail('No resource in *_FAILED')
 
 
 class InstanceGroupUpdatePolicyTest(InstanceGroupTest):
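
A note on the assertion pattern the new test code relies on: Python's
for/else runs the else block only when the loop finishes without
hitting break, so each test fails only if no resource at all reached a
failed state. Below is a minimal standalone sketch of that check; the
Resource stub, the helper name assert_some_failed and the sample
statuses are illustrative stand-ins, not the heatclient API.

# Stand-in for a heatclient resource record; only the attributes the
# check reads are modelled here.
class Resource(object):
    def __init__(self, name, status):
        self.resource_name = name
        self.resource_status = status


def assert_some_failed(resources, suffix='_FAILED'):
    # Mirrors the test logic: pass if at least one resource ended up
    # in a *_FAILED state, fail otherwise (for/else: the else runs
    # only when the loop never hit break).
    for res in resources:
        if res.resource_status.endswith(suffix):
            break
    else:
        raise AssertionError('No resource in %s' % suffix)


# With convergence, the traversal may be cancelled before every leaf
# resource is processed, so one CREATE_FAILED among unprocessed peers
# is the expected outcome rather than a fixed failed/ok count.
assert_some_failed([
    Resource('job_server_1', 'CREATE_FAILED'),
    Resource('job_server_2', 'CREATE_IN_PROGRESS'),
])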