Fix instance_group/autoscaling_group intermittent test failures

With convergence there is a chance that some leaf resources won't be
processed by workers: if the stack is marked as failed due to the
failure of one resource, the stack traversal is set to the empty
string, i.e. the traversal is cancelled.

Also uses TestResource to simplify the tests.

Change-Id: I1a04853d42f519d9a14dd345ac8cb441b08c4d77
Signed-off-by: rabi <ramishra@redhat.com>
rabi committed 2025-07-14 18:02:56 +05:30
parent 8a6585045c
commit b64f70d834
3 changed files with 52 additions and 55 deletions
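The relaxed checks added below lean on Python's for/else idiom: the
else branch runs only when the loop completes without hitting break,
i.e. when no resource matched. A minimal standalone sketch of the
pattern, using a made-up FakeResource stub in place of the records the
Heat client returns:

class FakeResource(object):
    # Stand-in for the resource records returned by the Heat client.
    def __init__(self, name, status):
        self.resource_name = name
        self.resource_status = status

# One instance failed; the other was never processed because the
# traversal was cancelled, so it never reached a *_FAILED state.
resources = [
    FakeResource('instance-0', 'CREATE_FAILED'),
    FakeResource('instance-1', 'INIT_COMPLETE'),
]

for res in resources:
    if res.resource_status.endswith('CREATE_FAILED'):
        break
else:
    # Reached only when the loop found no failed resource.
    raise AssertionError('No resource in CREATE_FAILED')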


@@ -103,7 +103,10 @@ function _run_heat_integrationtests {
 function create {
     if [ "${RUN_HEAT_INTEGRATION_TESTS}" == "True" ]; then
         # run heat integration tests instead of tempest smoke before create
-        _run_heat_integrationtests $BASE_DEVSTACK_DIR
+        # TODO(ramishra) switch to run tests from $BASE_DEVSTACK_DIR once
+        # https://review.opendev.org/c/openstack/heat/+/954938
+        # has been backported
+        _run_heat_integrationtests $TARGET_DEVSTACK_DIR
     fi
     source $TOP_DIR/openrc admin admin


@@ -77,11 +77,9 @@ parameters:
 resources:
   random1:
-    type: OS::Heat::RandomString
-    properties:
-      salt: {get_param: UserData}
+    type: OS::Heat::TestResource
 outputs:
-  PublicIp: {value: {get_attr: [random1, value]}}
+  PublicIp: {value: {get_attr: [random1, output]}}
   AvailabilityZone: {value: 'not-used11'}
   PrivateDnsName: {value: 'not-used12'}
   PublicDnsName: {value: 'not-used13'}
@@ -100,18 +98,12 @@ parameters:
 resources:
   random1:
-    type: OS::Heat::RandomString
-    depends_on: waiter
-  ready_poster:
-    type: AWS::CloudFormation::WaitConditionHandle
-  waiter:
-    type: AWS::CloudFormation::WaitCondition
+    type: OS::Heat::TestResource
     properties:
-      Handle: {get_resource: ready_poster}
-      Timeout: 1
+      fail: true
 outputs:
   PublicIp:
-    value: {get_attr: [random1, value]}
+    value: {get_attr: [random1, output]}
 '''

     def setUp(self):
@@ -252,7 +244,16 @@ class AutoscalingGroupBasicTest(AutoscalingGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        self._assert_instance_state(nested_ident, 0, 2)
+        # Check that at least one resource is in *_FAILED, as there is
+        # a chance that, before other leaf resources are processed, the
+        # stack is marked as failed and the traversal is set to the
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
+        for res in self.client.resources.list(nested_ident):
+            if res.resource_status.endswith('CREATE_FAILED'):
+                break
+        else:
+            self.fail('No resource in CREATE_FAILED')

     def test_update_instance_error_causes_group_error(self):
         """Test update failing a resource in the instance group.
@@ -281,8 +282,6 @@ class AutoscalingGroupBasicTest(AutoscalingGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
         self._assert_instance_state(nested_ident, 2, 0)
-        initial_list = [res.resource_name
-                        for res in self.client.resources.list(nested_ident)]
         env['parameters']['size'] = 3
         files2 = {'provider.yaml': self.bad_instance_template}
@@ -296,20 +295,18 @@ class AutoscalingGroupBasicTest(AutoscalingGroupTest):
         )
         self._wait_for_stack_status(stack_identifier, 'UPDATE_FAILED')
-        # assert that there are 3 bad instances
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        # 2 resources should be in update failed, and one create failed.
+        # Check that at least one resource is in *_FAILED, as there is
+        # a chance that, before other leaf resources are processed, the
+        # stack is marked as failed and the traversal is set to the
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
         for res in self.client.resources.list(nested_ident):
-            if res.resource_name in initial_list:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'UPDATE_FAILED')
-            else:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'CREATE_FAILED')
+            if res.resource_status.endswith('_FAILED'):
+                break
+        else:
+            self.fail('No resource in *_FAILED')

     def test_group_suspend_resume(self):
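Reassembled from the hunks above, the failing provider template now
boils down to a single OS::Heat::TestResource told to fail outright,
instead of an AWS::CloudFormation::WaitCondition that has to time out.
Roughly, as a sketch (the template's parameters section is elided
here):

bad_instance_template = '''
resources:
  random1:
    type: OS::Heat::TestResource
    properties:
      fail: true
outputs:
  PublicIp:
    value: {get_attr: [random1, output]}
'''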


@@ -70,12 +70,10 @@ parameters:
 resources:
   random1:
-    type: OS::Heat::RandomString
-    properties:
-      salt: {get_param: UserData}
+    type: OS::Heat::TestResource
 outputs:
   PublicIp:
-    value: {get_attr: [random1, value]}
+    value: {get_attr: [random1, output]}
 '''

     # This is designed to fail.
@@ -90,18 +88,12 @@ parameters:
 resources:
   random1:
-    type: OS::Heat::RandomString
-    depends_on: waiter
-  ready_poster:
-    type: AWS::CloudFormation::WaitConditionHandle
-  waiter:
-    type: AWS::CloudFormation::WaitCondition
+    type: OS::Heat::TestResource
     properties:
-      Handle: {Ref: ready_poster}
-      Timeout: 1
+      fail: true
 outputs:
   PublicIp:
-    value: {get_attr: [random1, value]}
+    value: {get_attr: [random1, output]}
 '''

     def setUp(self):
@@ -242,7 +234,16 @@ class InstanceGroupBasicTest(InstanceGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        self._assert_instance_state(nested_ident, 0, 2)
+        # Check that at least one resource is in *_FAILED, as there is
+        # a chance that, before other leaf resources are processed, the
+        # stack is marked as failed and the traversal is set to the
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
+        for res in self.client.resources.list(nested_ident):
+            if res.resource_status.endswith('CREATE_FAILED'):
+                break
+        else:
+            self.fail('No resource in CREATE_FAILED')

     def test_update_instance_error_causes_group_error(self):
         """Test update failing a resource in the instance group.
@@ -271,8 +272,6 @@ class InstanceGroupBasicTest(InstanceGroupTest):
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
         self._assert_instance_state(nested_ident, 2, 0)
-        initial_list = [res.resource_name
-                        for res in self.client.resources.list(nested_ident)]
         env['parameters']['size'] = 3
         files2 = {'provider.yaml': self.bad_instance_template}
@@ -285,20 +284,18 @@ class InstanceGroupBasicTest(InstanceGroupTest):
             environment=env
         )
         self._wait_for_stack_status(stack_identifier, 'UPDATE_FAILED')
         nested_ident = self.assert_resource_is_a_stack(stack_identifier,
                                                        'JobServerGroup')
-        # assert that there are 3 bad instances
-        # 2 resources should be in update failed, and one create failed.
+        # Check that at least one resource is in *_FAILED, as there is
+        # a chance that, before other leaf resources are processed, the
+        # stack is marked as failed and the traversal is set to the
+        # empty string, so that all other workers processing resources
+        # bail out and the traversal gets cancelled.
         for res in self.client.resources.list(nested_ident):
-            if res.resource_name in initial_list:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'UPDATE_FAILED')
-            else:
-                self._wait_for_resource_status(nested_ident,
-                                               res.resource_name,
-                                               'CREATE_FAILED')
+            if res.resource_status.endswith('_FAILED'):
+                break
+        else:
+            self.fail('No resource in *_FAILED')

 class InstanceGroupUpdatePolicyTest(InstanceGroupTest):