AWS: Add support for retrying image imports

AWS limits the number of image import tasks that can run
simultaneously.  On a busy system with large images, it is better to
wait for those limits to clear rather than delete the uploaded S3
object and upload it again from scratch.  To support this, we now
detect that condition and optionally retry for a specified amount of
time.

The default remains to bail on the first error.

Change-Id: I6aa7f79b2f73c4aa6743f11221907a731a82be34
James E. Blair 2023-08-10 15:58:42 -07:00
parent 426951af18
commit c2d9c45655
8 changed files with 100 additions and 33 deletions


@@ -186,6 +186,17 @@ Selecting the ``aws`` driver adds the following options to the
      ``ova``, ``vhd``, ``vhdx``, ``vmdk``, ``raw`` (not all of which
      are supported by diskimage-builder).

   .. attr:: image-import-timeout
      :type: int

      Generally there is no limit on the amount of time a successful
      image import can take. However, some import tasks may encounter
      temporary resource limitations from AWS. In these cases, if
      this value is set, Nodepool will retry the import tasks until
      the timeout is reached. If this is unset (the default), then
      the first resource limitation detected will result in an error.
      The value is in seconds.

   .. attr:: cloud-images
      :type: list

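For illustration, a provider configuration that enables this behavior might look like the following sketch; the provider name, diskimage name, and timeout value are placeholders rather than values taken from this change:

    providers:
      - name: example-aws              # hypothetical provider name
        driver: aws
        region-name: us-west-2
        object-storage:
          bucket-name: nodepool
        image-import-timeout: 3600     # seconds; retry import-task limit errors for up to an hour
        diskimages:
          - name: example-image        # hypothetical diskimage name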

@@ -434,6 +434,11 @@ class AwsAdapter(statemachine.Adapter):
                          bucket_name, object_filename):
         # Import snapshot
         self.log.debug(f"Importing {image_name} as snapshot")
         timeout = time.time()
         if self.provider.image_import_timeout:
             timeout += self.provider.image_import_timeout
         while True:
             try:
                 with self.rate_limiter:
                     import_snapshot_task = self.ec2_client.import_snapshot(
                         DiskContainer={
@@ -450,6 +455,16 @@ class AwsAdapter(statemachine.Adapter):
                             },
                         ]
                     )
                 break
             except botocore.exceptions.ClientError as error:
                 if (error.response['Error']['Code'] ==
                         'ResourceCountLimitExceeded'):
                     if time.time() < timeout:
                         self.log.warning("AWS error: '%s' will retry",
                                          str(error))
                         time.sleep(self.IMAGE_UPLOAD_SLEEP)
                         continue
                 raise
         task_id = import_snapshot_task['ImportTaskId']
         paginator = self.ec2_client.get_paginator(
@@ -527,6 +542,11 @@ class AwsAdapter(statemachine.Adapter):
                          bucket_name, object_filename):
         # Import image as AMI
         self.log.debug(f"Importing {image_name} as AMI")
         timeout = time.time()
         if self.provider.image_import_timeout:
             timeout += self.provider.image_import_timeout
         while True:
             try:
                 with self.rate_limiter:
                     import_image_task = self.ec2_client.import_image(
                         Architecture=provider_image.architecture,
@@ -544,6 +564,16 @@ class AwsAdapter(statemachine.Adapter):
                             },
                         ]
                     )
                 break
             except botocore.exceptions.ClientError as error:
                 if (error.response['Error']['Code'] ==
                         'ResourceCountLimitExceeded'):
                     if time.time() < timeout:
                         self.log.warning("AWS error: '%s' will retry",
                                          str(error))
                         time.sleep(self.IMAGE_UPLOAD_SLEEP)
                         continue
                 raise
         task_id = import_image_task['ImportTaskId']
         paginator = self.ec2_client.get_paginator(

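Stripped of the Nodepool plumbing, the retry-until-deadline pattern added above can be sketched with a plain boto3 client as follows; the region, bucket, key, disk format, timeout, and sleep interval are illustrative assumptions, not values from this change:

    import time

    import boto3
    import botocore.exceptions

    IMPORT_TIMEOUT = 300   # seconds; plays the role of image-import-timeout
    RETRY_SLEEP = 10       # hypothetical pause between attempts

    ec2_client = boto3.client('ec2', region_name='us-west-2')

    deadline = time.time() + IMPORT_TIMEOUT
    while True:
        try:
            task = ec2_client.import_snapshot(
                DiskContainer={
                    'Format': 'raw',
                    'UserBucket': {
                        'S3Bucket': 'example-bucket',    # placeholder bucket
                        'S3Key': 'example-image.raw',    # placeholder object key
                    },
                },
            )
            break
        except botocore.exceptions.ClientError as error:
            # AWS reports the per-account import task limit with this error
            # code; wait and retry until the deadline, otherwise re-raise.
            if error.response['Error']['Code'] == 'ResourceCountLimitExceeded':
                if time.time() < deadline:
                    time.sleep(RETRY_SLEEP)
                    continue
            raise

    print(task['ImportTaskId'])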

@@ -298,6 +298,8 @@ class AwsProviderConfig(ProviderConfig):
         self.object_storage = self.provider.get('object-storage')
         self.image_type = self.provider.get('image-format', 'raw')
         self.image_name_format = '{image_name}-{timestamp}'
         self.image_import_timeout = self.provider.get(
             'image-import-timeout', None)
         self.post_upload_hook = self.provider.get('post-upload-hook')
         self.max_servers = self.provider.get('max-servers', math.inf)
         self.max_cores = self.provider.get('max-cores', math.inf)
@@ -347,6 +349,7 @@
             'launch-retries': int,
             'object-storage': object_storage,
             'image-format': v.Any('ova', 'vhd', 'vhdx', 'vmdk', 'raw'),
             'image-import-timeout': int,
             'max-servers': int,
             'max-cores': int,
             'max-ram': int,


@@ -27,6 +27,7 @@ providers:
    region-name: us-west-2
    object-storage:
      bucket-name: nodepool
    image-import-timeout: 60
    diskimages:
      - name: fake-image
        tags:


@@ -27,6 +27,7 @@ providers:
    region-name: us-west-2
    object-storage:
      bucket-name: nodepool
    image-import-timeout: 60
    diskimages:
      - name: fake-image
        tags:


@@ -16,6 +16,7 @@
import logging
import uuid
import botocore
import boto3
@@ -136,8 +137,14 @@ class FakeAws:
         self.tasks = {}
         self.ec2 = boto3.resource('ec2', region_name='us-west-2')
         self.ec2_client = boto3.client('ec2', region_name='us-west-2')
         self.fail_import_count = 0

     def import_snapshot(self, *args, **kw):
         while self.fail_import_count:
             self.fail_import_count -= 1
             raise botocore.exceptions.ClientError(
                 {'Error': {'Code': 'ResourceCountLimitExceeded'}},
                 'ImportSnapshot')
         task_id = uuid.uuid4().hex
         task = make_import_snapshot_stage_1(
             task_id,
@@ -162,6 +169,11 @@ class FakeAws:
         return snap_id

     def import_image(self, *args, **kw):
         while self.fail_import_count:
             self.fail_import_count -= 1
             raise botocore.exceptions.ClientError(
                 {'Error': {'Code': 'ResourceCountLimitExceeded'}},
                 'ImportImage')
         task_id = uuid.uuid4().hex
         task = make_import_image_stage_1(
             task_id,


@@ -651,6 +651,7 @@ class TestDriverAws(tests.DBTestCase):
         self.assertTrue(response['EbsOptimized']['Value'])

     def test_aws_diskimage_snapshot(self):
         self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage.yaml')
         self.useBuilder(configfile)
@@ -693,6 +694,7 @@
             ['Throughput'], 200)

     def test_aws_diskimage_image(self):
         self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage-import-image.yaml')
         self.useBuilder(configfile)


@@ -0,0 +1,7 @@
---
features:
  - |
    The AWS driver now supports an
    :attr:`providers.[aws].image-import-timeout` option to control
    automatic retries and timeouts when AWS import task resource
    limits are reached.