From b64f70d83424b7738f0bbe2331cca40e5d30c95a Mon Sep 17 00:00:00 2001
From: rabi
Date: Mon, 14 Jul 2025 18:02:56 +0530
Subject: [PATCH] Fix instance_group/autoscaling_group intermittent test
 failures

With convergence there is a chance that some leaf resources won't be
processed by workers if the stack is marked as failed due to the
failure of one resource and the stack traversal is set to an empty
string, i.e. the traversal is cancelled.

Also uses TestResource to simplify the tests.

Change-Id: I1a04853d42f519d9a14dd345ac8cb441b08c4d77
Signed-off-by: rabi
---
 devstack/upgrade/resources.sh                 |  5 +-
 .../functional/test_autoscaling.py            | 51 +++++++++----------
 .../functional/test_instance_group.py         | 51 +++++++++----------
 3 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/devstack/upgrade/resources.sh b/devstack/upgrade/resources.sh
index ad78c0becc..33d937e02f 100755
--- a/devstack/upgrade/resources.sh
+++ b/devstack/upgrade/resources.sh
@@ -103,7 +103,10 @@ function _run_heat_integrationtests {
 function create {
     if [ "${RUN_HEAT_INTEGRATION_TESTS}" == "True" ]; then
         # run heat integration tests instead of tempest smoke before create
-        _run_heat_integrationtests $BASE_DEVSTACK_DIR
+        # TODO(ramishra) switch to run tests from $BASE_DEVSTACK_DIR once
+        # https://review.opendev.org/c/openstack/heat/+/954938
+        # has been backported
+        _run_heat_integrationtests $TARGET_DEVSTACK_DIR
     fi
 
     source $TOP_DIR/openrc admin admin
diff --git a/heat_integrationtests/functional/test_autoscaling.py b/heat_integrationtests/functional/test_autoscaling.py
index 46da2a86e0..2b34a2b248 100644
--- a/heat_integrationtests/functional/test_autoscaling.py
+++ b/heat_integrationtests/functional/test_autoscaling.py
@@ -77,11 +77,9 @@ parameters:
 
 resources:
   random1:
-    type: OS::Heat::RandomString
-    properties:
-      salt: {get_param: UserData}
+    type: OS::Heat::TestResource
 outputs:
-  PublicIp: {value: {get_attr: [random1, value]}}
+  PublicIp: {value: {get_attr: [random1, output]}}
   AvailabilityZone: {value: 'not-used11'}
   PrivateDnsName: {value: 'not-used12'}
   PublicDnsName: {value: 'not-used13'}
@@ -100,18 +98,12 @@ parameters:
 
 resources:
   random1:
-    type: OS::Heat::RandomString
-    depends_on: waiter
-  ready_poster:
-    type: AWS::CloudFormation::WaitConditionHandle
-  waiter:
-    type: AWS::CloudFormation::WaitCondition
+    type: OS::Heat::TestResource
     properties:
-      Handle: {get_resource: ready_poster}
-      Timeout: 1
+      fail: true
 outputs:
   PublicIp:
-    value: {get_attr: [random1, value]}
+    value: {get_attr: [random1, output]}
 '''
 
     def setUp(self):
@@ -252,7 +244,16 @@ class AutoscalingGroupBasicTest(AutoscalingGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        self._assert_instance_state(nested_ident, 0, 2)
+        # Check that at least one resource is in *_FAILED, as there
+        # is a chance that, before other leaf resources are processed,
+        # the stack is marked as failed and the traversal is set to an
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
+        for res in self.client.resources.list(nested_ident):
+            if res.resource_status.endswith('CREATE_FAILED'):
+                break
+        else:
+            self.fail('No resource in CREATE_FAILED')
 
     def test_update_instance_error_causes_group_error(self):
         """Test update failing a resource in the instance group.
@@ -281,8 +282,6 @@ class AutoscalingGroupBasicTest(AutoscalingGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
         self._assert_instance_state(nested_ident, 2, 0)
-        initial_list = [res.resource_name
-                        for res in self.client.resources.list(nested_ident)]
 
         env['parameters']['size'] = 3
         files2 = {'provider.yaml': self.bad_instance_template}
@@ -296,20 +295,18 @@ class AutoscalingGroupBasicTest(AutoscalingGroupTest):
         )
         self._wait_for_stack_status(stack_identifier, 'UPDATE_FAILED')
 
-        # assert that there are 3 bad instances
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-
-        # 2 resources should be in update failed, and one create failed.
+        # Check that at least one resource is in *_FAILED, as there
+        # is a chance that, before other leaf resources are processed,
+        # the stack is marked as failed and the traversal is set to an
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
         for res in self.client.resources.list(nested_ident):
-            if res.resource_name in initial_list:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'UPDATE_FAILED')
-            else:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'CREATE_FAILED')
+            if res.resource_status.endswith('_FAILED'):
+                break
+        else:
+            self.fail('No resource in *_FAILED')
 
     def test_group_suspend_resume(self):
diff --git a/heat_integrationtests/functional/test_instance_group.py b/heat_integrationtests/functional/test_instance_group.py
index 44b3aa95f0..0ae8800989 100644
--- a/heat_integrationtests/functional/test_instance_group.py
+++ b/heat_integrationtests/functional/test_instance_group.py
@@ -70,12 +70,10 @@ parameters:
 
 resources:
   random1:
-    type: OS::Heat::RandomString
-    properties:
-      salt: {get_param: UserData}
+    type: OS::Heat::TestResource
 outputs:
   PublicIp:
-    value: {get_attr: [random1, value]}
+    value: {get_attr: [random1, output]}
 '''
 
 # This is designed to fail.
@@ -90,18 +88,12 @@ parameters:
 
 resources:
   random1:
-    type: OS::Heat::RandomString
-    depends_on: waiter
-  ready_poster:
-    type: AWS::CloudFormation::WaitConditionHandle
-  waiter:
-    type: AWS::CloudFormation::WaitCondition
+    type: OS::Heat::TestResource
     properties:
-      Handle: {Ref: ready_poster}
-      Timeout: 1
+      fail: true
 outputs:
   PublicIp:
-    value: {get_attr: [random1, value]}
+    value: {get_attr: [random1, output]}
 '''
 
     def setUp(self):
@@ -242,7 +234,16 @@ class InstanceGroupBasicTest(InstanceGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        self._assert_instance_state(nested_ident, 0, 2)
+        # Check that at least one resource is in *_FAILED, as there
+        # is a chance that, before other leaf resources are processed,
+        # the stack is marked as failed and the traversal is set to an
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
+        for res in self.client.resources.list(nested_ident):
+            if res.resource_status.endswith('CREATE_FAILED'):
+                break
+        else:
+            self.fail('No resource in CREATE_FAILED')
 
     def test_update_instance_error_causes_group_error(self):
         """Test update failing a resource in the instance group.
@@ -271,8 +272,6 @@ class InstanceGroupBasicTest(InstanceGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
         self._assert_instance_state(nested_ident, 2, 0)
-        initial_list = [res.resource_name
-                        for res in self.client.resources.list(nested_ident)]
 
         env['parameters']['size'] = 3
         files2 = {'provider.yaml': self.bad_instance_template}
@@ -285,20 +284,18 @@ class InstanceGroupBasicTest(InstanceGroupTest):
             environment=env
         )
         self._wait_for_stack_status(stack_identifier, 'UPDATE_FAILED')
-
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        # assert that there are 3 bad instances
-        # 2 resources should be in update failed, and one create failed.
+        # Check that at least one resource is in *_FAILED, as there
+        # is a chance that, before other leaf resources are processed,
+        # the stack is marked as failed and the traversal is set to an
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
         for res in self.client.resources.list(nested_ident):
-            if res.resource_name in initial_list:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'UPDATE_FAILED')
-            else:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'CREATE_FAILED')
+            if res.resource_status.endswith('_FAILED'):
+                break
+        else:
+            self.fail('No resource in *_FAILED')
 
 
 class InstanceGroupUpdatePolicyTest(InstanceGroupTest):
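
A note on the assertion pattern the new test code relies on: Python's
for/else runs the else block only when the loop finishes without
hitting break, so each test fails only if no resource at all reached a
failed state. Below is a minimal standalone sketch of that check; the
Resource stub, the helper name assert_some_failed and the sample
statuses are illustrative stand-ins, not the heatclient API.

# Stand-in for a heatclient resource record; only the attributes the
# check reads are modelled here.
class Resource(object):
    def __init__(self, name, status):
        self.resource_name = name
        self.resource_status = status


def assert_some_failed(resources, suffix='_FAILED'):
    # Mirrors the test logic: pass if at least one resource ended up
    # in a *_FAILED state, fail otherwise (for/else: the else runs
    # only when the loop never hit break).
    for res in resources:
        if res.resource_status.endswith(suffix):
            break
    else:
        raise AssertionError('No resource in %s' % suffix)


# With convergence, the traversal may be cancelled before every leaf
# resource is processed, so one CREATE_FAILED among unprocessed peers
# is the expected outcome rather than a fixed failed/ok count.
assert_some_failed([
    Resource('job_server_1', 'CREATE_FAILED'),
    Resource('job_server_2', 'CREATE_IN_PROGRESS'),
])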