AWS: Add support for retrying image imports
AWS limits the number of image import tasks that can run simultaneously. On a busy system with large images, it is better to wait for those limits to clear than to delete the uploaded S3 object and start the upload over. To support this, we now detect that condition and optionally retry for a configurable amount of time. The default behavior is still to fail on the first error.

Change-Id: I6aa7f79b2f73c4aa6743f11221907a731a82be34
parent 426951af18
commit c2d9c45655
@@ -186,6 +186,17 @@ Selecting the ``aws`` driver adds the following options to the
        ``ova``, ``vhd``, ``vhdx``, ``vmdk``, ``raw`` (not all of which
        are supported by diskimage-builder).

+   .. attr:: image-import-timeout
+      :type: int
+
+      Generally there is no limit on the amount of time a successful
+      image import can take. However, some import tasks may encounter
+      temporary resource limitations from AWS. In these cases, if
+      this value is set, Nodepool will retry the import tasks until
+      the timeout is reached. If this is unset (the default), then
+      the first resource limitation detected will result in an error.
+      The value is in seconds.
+
    .. attr:: cloud-images
       :type: list
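For reference, a complete provider stanza using the new option might look like the following; the provider name and the ten-minute value are illustrative, not taken from this change:

    providers:
      - name: ec2-us-west-2
        driver: aws
        region-name: us-west-2
        object-storage:
          bucket-name: nodepool
        # Retry imports for up to ten minutes when AWS reports resource
        # limits; omit the option to fail on the first error (the default).
        image-import-timeout: 600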
@@ -434,6 +434,11 @@ class AwsAdapter(statemachine.Adapter):
                                 bucket_name, object_filename):
         # Import snapshot
         self.log.debug(f"Importing {image_name} as snapshot")
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
                 with self.rate_limiter:
                     import_snapshot_task = self.ec2_client.import_snapshot(
                         DiskContainer={
@@ -450,6 +455,16 @@ class AwsAdapter(statemachine.Adapter):
                         },
                     ]
                 )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                    'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_snapshot_task['ImportTaskId']

         paginator = self.ec2_client.get_paginator(
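A subtlety worth noting: the code computes an absolute deadline rather than testing a flag. When image-import-timeout is unset, the deadline equals "now", so the time.time() < timeout check is already false on the first error and the exception propagates immediately. A minimal sketch of the idiom, with an illustrative stand-in for the provider setting:

    import time

    image_import_timeout = None  # stand-in for provider.image_import_timeout

    deadline = time.time()
    if image_import_timeout:
        deadline += image_import_timeout

    # With the option unset the deadline is already in the past, so a
    # retry check like "time.time() < deadline" fails and the first
    # ResourceCountLimitExceeded error would be re-raised.
    assert not time.time() < deadline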
@@ -527,6 +542,11 @@ class AwsAdapter(statemachine.Adapter):
                             bucket_name, object_filename):
         # Import image as AMI
         self.log.debug(f"Importing {image_name} as AMI")
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
                 with self.rate_limiter:
                     import_image_task = self.ec2_client.import_image(
                         Architecture=provider_image.architecture,
@@ -544,6 +564,16 @@ class AwsAdapter(statemachine.Adapter):
                         },
                     ]
                 )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                    'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_image_task['ImportTaskId']

         paginator = self.ec2_client.get_paginator(
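The AMI path repeats the snapshot path's retry loop line for line. Purely as an illustration of the shared pattern (this helper is hypothetical and not part of the change; the rate limiter is omitted for brevity), the loop could be factored out along these lines:

    import time

    import botocore.exceptions

    def call_with_import_retry(call, deadline, sleep_time, log, **kw):
        # Invoke an EC2 import call, retrying while AWS reports
        # ResourceCountLimitExceeded and the absolute deadline (a
        # time.time() value) has not passed.
        while True:
            try:
                return call(**kw)
            except botocore.exceptions.ClientError as error:
                if (error.response['Error']['Code'] ==
                        'ResourceCountLimitExceeded' and
                        time.time() < deadline):
                    log.warning("AWS error: '%s' will retry", str(error))
                    time.sleep(sleep_time)
                    continue
                raise

Each caller would then pass its own client method, e.g. call_with_import_retry(self.ec2_client.import_snapshot, deadline, self.IMAGE_UPLOAD_SLEEP, self.log, DiskContainer=...).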
@@ -298,6 +298,8 @@ class AwsProviderConfig(ProviderConfig):
         self.object_storage = self.provider.get('object-storage')
         self.image_type = self.provider.get('image-format', 'raw')
         self.image_name_format = '{image_name}-{timestamp}'
+        self.image_import_timeout = self.provider.get(
+            'image-import-timeout', None)
         self.post_upload_hook = self.provider.get('post-upload-hook')
         self.max_servers = self.provider.get('max-servers', math.inf)
         self.max_cores = self.provider.get('max-cores', math.inf)
@@ -347,6 +349,7 @@ class AwsProviderConfig(ProviderConfig):
             'launch-retries': int,
             'object-storage': object_storage,
             'image-format': v.Any('ova', 'vhd', 'vhdx', 'vmdk', 'raw'),
+            'image-import-timeout': int,
             'max-servers': int,
             'max-cores': int,
             'max-ram': int,
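The schema is voluptuous-based, so a non-integer value for the new option is rejected when the configuration is loaded. A minimal sketch of that validation in isolation (the schema here is cut down to the single key):

    import voluptuous as v

    schema = v.Schema({'image-import-timeout': int}, extra=v.ALLOW_EXTRA)

    schema({'image-import-timeout': 60})        # accepted
    try:
        schema({'image-import-timeout': '60'})  # rejected: not an int
    except v.Invalid as error:
        print(error)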
nodepool/tests/fixtures/aws/diskimage.yaml

@@ -27,6 +27,7 @@ providers:
     region-name: us-west-2
     object-storage:
       bucket-name: nodepool
+    image-import-timeout: 60
     diskimages:
       - name: fake-image
         tags:
@@ -16,6 +16,7 @@
 import logging
 import uuid

+import botocore
 import boto3

@@ -136,8 +137,14 @@ class FakeAws:
         self.tasks = {}
         self.ec2 = boto3.resource('ec2', region_name='us-west-2')
         self.ec2_client = boto3.client('ec2', region_name='us-west-2')
+        self.fail_import_count = 0

     def import_snapshot(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportSnapshot')
         task_id = uuid.uuid4().hex
         task = make_import_snapshot_stage_1(
             task_id,
@@ -162,6 +169,11 @@ class FakeAws:
         return snap_id

     def import_image(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportImage')
         task_id = uuid.uuid4().hex
         task = make_import_image_stage_1(
             task_id,
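The fake raises the same exception type the adapter's retry loop catches. For reference, botocore's ClientError is constructed from a parsed error response plus the operation name, and exposes that response afterwards:

    import botocore.exceptions

    error = botocore.exceptions.ClientError(
        {'Error': {'Code': 'ResourceCountLimitExceeded'}}, 'ImportSnapshot')
    # The adapter's handler matches on exactly this field:
    assert error.response['Error']['Code'] == 'ResourceCountLimitExceeded'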
@@ -651,6 +651,7 @@ class TestDriverAws(tests.DBTestCase):
         self.assertTrue(response['EbsOptimized']['Value'])

     def test_aws_diskimage_snapshot(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage.yaml')

         self.useBuilder(configfile)
@@ -693,6 +694,7 @@ class TestDriverAws(tests.DBTestCase):
                          ['Throughput'], 200)

     def test_aws_diskimage_image(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage-import-image.yaml')

         self.useBuilder(configfile)
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    The AWS driver now supports an
+    :attr:`providers.[aws].image-import-timeout` option to control
+    automatic retries and timeouts when AWS import task resource
+    limits are reached.