diff --git a/doc/source/aws.rst b/doc/source/aws.rst
index 71a61c9da..c53e708ce 100644
--- a/doc/source/aws.rst
+++ b/doc/source/aws.rst
@@ -186,6 +186,17 @@ Selecting the ``aws`` driver adds the following options to the
       ``ova``, ``vhd``, ``vhdx``, ``vmdk``, ``raw`` (not all of which
       are supported by diskimage-builder).
 
+   .. attr:: image-import-timeout
+      :type: int
+
+      Generally there is no limit on the amount of time a successful
+      image import can take. However, some import tasks may encounter
+      temporary resource limitations from AWS. In these cases, if
+      this value is set, Nodepool will retry the import tasks until
+      the timeout is reached. If this is unset (the default), then
+      the first resource limitation detected will result in an error.
+      The value is in seconds.
+
    .. attr:: cloud-images
       :type: list
 
diff --git a/nodepool/driver/aws/adapter.py b/nodepool/driver/aws/adapter.py
index 1b9c8ce1d..b7dffb96c 100644
--- a/nodepool/driver/aws/adapter.py
+++ b/nodepool/driver/aws/adapter.py
@@ -434,22 +434,37 @@ class AwsAdapter(statemachine.Adapter):
                           bucket_name, object_filename):
         # Import snapshot
         self.log.debug(f"Importing {image_name} as snapshot")
-        with self.rate_limiter:
-            import_snapshot_task = self.ec2_client.import_snapshot(
-                DiskContainer={
-                    'Format': image_format,
-                    'UserBucket': {
-                        'S3Bucket': bucket_name,
-                        'S3Key': object_filename,
-                    },
-                },
-                TagSpecifications=[
-                    {
-                        'ResourceType': 'import-snapshot-task',
-                        'Tags': tag_dict_to_list(metadata),
-                    },
-                ]
-            )
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
+                with self.rate_limiter:
+                    import_snapshot_task = self.ec2_client.import_snapshot(
+                        DiskContainer={
+                            'Format': image_format,
+                            'UserBucket': {
+                                'S3Bucket': bucket_name,
+                                'S3Key': object_filename,
+                            },
+                        },
+                        TagSpecifications=[
+                            {
+                                'ResourceType': 'import-snapshot-task',
+                                'Tags': tag_dict_to_list(metadata),
+                            },
+                        ]
+                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                        'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_snapshot_task['ImportTaskId']
 
         paginator = self.ec2_client.get_paginator(
@@ -527,23 +542,38 @@ class AwsAdapter(statemachine.Adapter):
                           bucket_name, object_filename):
         # Import image as AMI
         self.log.debug(f"Importing {image_name} as AMI")
-        with self.rate_limiter:
-            import_image_task = self.ec2_client.import_image(
-                Architecture=provider_image.architecture,
-                DiskContainers=[{
-                    'Format': image_format,
-                    'UserBucket': {
-                        'S3Bucket': bucket_name,
-                        'S3Key': object_filename,
-                    },
-                }],
-                TagSpecifications=[
-                    {
-                        'ResourceType': 'import-image-task',
-                        'Tags': tag_dict_to_list(metadata),
-                    },
-                ]
-            )
+        timeout = time.time()
+        if self.provider.image_import_timeout:
+            timeout += self.provider.image_import_timeout
+        while True:
+            try:
+                with self.rate_limiter:
+                    import_image_task = self.ec2_client.import_image(
+                        Architecture=provider_image.architecture,
+                        DiskContainers=[{
+                            'Format': image_format,
+                            'UserBucket': {
+                                'S3Bucket': bucket_name,
+                                'S3Key': object_filename,
+                            },
+                        }],
+                        TagSpecifications=[
+                            {
+                                'ResourceType': 'import-image-task',
+                                'Tags': tag_dict_to_list(metadata),
+                            },
+                        ]
+                    )
+                break
+            except botocore.exceptions.ClientError as error:
+                if (error.response['Error']['Code'] ==
+                        'ResourceCountLimitExceeded'):
+                    if time.time() < timeout:
+                        self.log.warning("AWS error: '%s' will retry",
+                                         str(error))
+                        time.sleep(self.IMAGE_UPLOAD_SLEEP)
+                        continue
+                raise
         task_id = import_image_task['ImportTaskId']
 
         paginator = self.ec2_client.get_paginator(
diff --git a/nodepool/driver/aws/config.py b/nodepool/driver/aws/config.py
index 997d3f283..9f5b01eba 100644
--- a/nodepool/driver/aws/config.py
+++ b/nodepool/driver/aws/config.py
@@ -298,6 +298,8 @@
         self.object_storage = self.provider.get('object-storage')
         self.image_type = self.provider.get('image-format', 'raw')
         self.image_name_format = '{image_name}-{timestamp}'
+        self.image_import_timeout = self.provider.get(
+            'image-import-timeout', None)
         self.post_upload_hook = self.provider.get('post-upload-hook')
         self.max_servers = self.provider.get('max-servers', math.inf)
         self.max_cores = self.provider.get('max-cores', math.inf)
@@ -347,6 +349,7 @@
             'launch-retries': int,
             'object-storage': object_storage,
             'image-format': v.Any('ova', 'vhd', 'vhdx', 'vmdk', 'raw'),
+            'image-import-timeout': int,
             'max-servers': int,
             'max-cores': int,
             'max-ram': int,
diff --git a/nodepool/tests/fixtures/aws/diskimage-import-image.yaml b/nodepool/tests/fixtures/aws/diskimage-import-image.yaml
index 4c55a93dc..14bf070b3 100644
--- a/nodepool/tests/fixtures/aws/diskimage-import-image.yaml
+++ b/nodepool/tests/fixtures/aws/diskimage-import-image.yaml
@@ -27,6 +27,7 @@ providers:
     region-name: us-west-2
     object-storage:
       bucket-name: nodepool
+    image-import-timeout: 60
     diskimages:
       - name: fake-image
         tags:
diff --git a/nodepool/tests/fixtures/aws/diskimage.yaml b/nodepool/tests/fixtures/aws/diskimage.yaml
index 14c9a9748..16508570f 100644
--- a/nodepool/tests/fixtures/aws/diskimage.yaml
+++ b/nodepool/tests/fixtures/aws/diskimage.yaml
@@ -27,6 +27,7 @@ providers:
     region-name: us-west-2
     object-storage:
       bucket-name: nodepool
+    image-import-timeout: 60
     diskimages:
       - name: fake-image
         tags:
diff --git a/nodepool/tests/unit/fake_aws.py b/nodepool/tests/unit/fake_aws.py
index 15617abcc..a7e2238d0 100644
--- a/nodepool/tests/unit/fake_aws.py
+++ b/nodepool/tests/unit/fake_aws.py
@@ -16,6 +16,7 @@
 import logging
 import uuid
 
+import botocore
 import boto3
 
 
@@ -136,8 +137,14 @@ class FakeAws:
         self.tasks = {}
         self.ec2 = boto3.resource('ec2', region_name='us-west-2')
         self.ec2_client = boto3.client('ec2', region_name='us-west-2')
+        self.fail_import_count = 0
 
     def import_snapshot(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportSnapshot')
         task_id = uuid.uuid4().hex
         task = make_import_snapshot_stage_1(
             task_id,
@@ -162,6 +169,11 @@
         return snap_id
 
     def import_image(self, *args, **kw):
+        while self.fail_import_count:
+            self.fail_import_count -= 1
+            raise botocore.exceptions.ClientError(
+                {'Error': {'Code': 'ResourceCountLimitExceeded'}},
+                'ImportImage')
         task_id = uuid.uuid4().hex
         task = make_import_image_stage_1(
             task_id,
diff --git a/nodepool/tests/unit/test_driver_aws.py b/nodepool/tests/unit/test_driver_aws.py
index 7a6a0ff97..8e6ed7226 100644
--- a/nodepool/tests/unit/test_driver_aws.py
+++ b/nodepool/tests/unit/test_driver_aws.py
@@ -651,6 +651,7 @@
         self.assertTrue(response['EbsOptimized']['Value'])
 
     def test_aws_diskimage_snapshot(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage.yaml')
         self.useBuilder(configfile)
 
@@ -693,6 +694,7 @@ class TestDriverAws(tests.DBTestCase):
             ['Throughput'], 200)
 
     def test_aws_diskimage_image(self):
+        self.fake_aws.fail_import_count = 1
         configfile = self.setup_config('aws/diskimage-import-image.yaml')
         self.useBuilder(configfile)
 
diff --git a/releasenotes/notes/aws-image-import-timeout-d53aaa1a921f7aa5.yaml b/releasenotes/notes/aws-image-import-timeout-d53aaa1a921f7aa5.yaml
new file mode 100644
index 000000000..2a83fdd4a
--- /dev/null
+++ b/releasenotes/notes/aws-image-import-timeout-d53aaa1a921f7aa5.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    The AWS driver now supports an
+    :attr:`providers.[aws].image-import-timeout` option to control
+    automatic retries and timeouts when AWS import task resource
+    limits are reached.
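
For context, a provider using the new option could look like the following. This is a minimal sketch modeled on the test fixtures above, not a complete configuration; the provider name, region, and bucket are illustrative, and 3600 is simply an example value (the option is expressed in seconds):

    providers:
      - name: ec2-us-west-2
        driver: aws
        region-name: us-west-2
        object-storage:
          bucket-name: nodepool
        # Illustrative value: retry import tasks for up to one hour.
        # Unset (the default) fails on the first resource limit error.
        image-import-timeout: 3600
        diskimages:
          - name: fake-image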
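The retry logic added to adapter.py above follows a deadline-based pattern that can also be read in isolation as the sketch below. This is a simplified illustration of the same technique, not the adapter code itself; do_import and IMAGE_UPLOAD_SLEEP are hypothetical stand-ins for the boto3 import call and the driver's retry interval:

    import time

    import botocore.exceptions

    # Hypothetical stand-in for the driver's retry interval, in seconds.
    IMAGE_UPLOAD_SLEEP = 10

    def import_with_timeout(do_import, image_import_timeout=None):
        # do_import is a hypothetical zero-argument callable wrapping
        # the boto3 import_snapshot/import_image request.
        #
        # The deadline is "now" plus the configured timeout; with no
        # timeout configured, the first limit error raises immediately.
        deadline = time.time()
        if image_import_timeout:
            deadline += image_import_timeout
        while True:
            try:
                return do_import()
            except botocore.exceptions.ClientError as error:
                # Retry only AWS resource-count limits while the
                # deadline has not passed; any other client error (and
                # a limit error past the deadline) propagates unchanged.
                if (error.response['Error']['Code'] ==
                        'ResourceCountLimitExceeded'):
                    if time.time() < deadline:
                        time.sleep(IMAGE_UPLOAD_SLEEP)
                        continue
                raise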