From ad206081e64616bac1dc35ed6916e6f49b8f6f1c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 30 Jan 2025 15:27:17 -0500 Subject: [PATCH] Tolerate not finding Flatcar releases in CI (#5201) * Use one outer retry loop and tolerate not finding Flatcar releases * Remove cruft and no-ops. * Remove cruft and no-ops. --------- Co-authored-by: DailyDreaming --- src/toil/lib/aws/ami.py | 84 ++++++++++++++++------------------- src/toil/test/lib/test_ec2.py | 36 ++++++++------- 2 files changed, 57 insertions(+), 63 deletions(-) diff --git a/src/toil/lib/aws/ami.py b/src/toil/lib/aws/ami.py index 126c4faf09..09ff90b9e1 100644 --- a/src/toil/lib/aws/ami.py +++ b/src/toil/lib/aws/ami.py @@ -1,19 +1,24 @@ import json import logging import os +import time import urllib.request from collections.abc import Iterator from typing import Optional, cast from urllib.error import HTTPError, URLError from botocore.client import BaseClient -from botocore.exceptions import ClientError +from botocore.exceptions import ClientError, EndpointConnectionError from toil.lib.retry import retry logger = logging.getLogger(__name__) +class ReleaseFeedUnavailableError(RuntimeError): + """Raised when a Flatcar releases can't be located.""" + pass +@retry(errors=[ReleaseFeedUnavailableError]) def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str: """ Retrieve the flatcar AMI image to use as the base for all Toil autoscaling instances. @@ -25,15 +30,13 @@ def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str: 2. Official AMI from stable.release.flatcar-linux.net 3. Search the AWS Marketplace - If all of these sources fail, we raise an error to complain. + :raises ReleaseFeedUnavailableError: if all of these sources fail. :param ec2_client: Boto3 EC2 Client :param architecture: The architecture type for the new AWS machine. Can be either amd64 or arm64 """ - # Take a user override ami = os.environ.get("TOIL_AWS_AMI") - try_number = 0 if not ami: logger.debug( "No AMI found in TOIL_AWS_AMI; checking stable Flatcar release feed" @@ -48,11 +51,6 @@ def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str: ami = aws_marketplace_flatcar_ami_search( ec2_client=ec2_client, architecture=architecture ) - if not ami: - logger.debug("No AMI found in marketplace; checking Toil Flatcar release feed") - ami = feed_flatcar_ami_release( - ec2_client=ec2_client, architecture=architecture, source="toil" - ) if not ami: logger.debug( "No AMI found in Toil project feed; checking beta Flatcar release feed" @@ -69,7 +67,7 @@ def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str: ) if not ami: logger.critical("No available Flatcar AMI in any source!") - raise RuntimeError( + raise ReleaseFeedUnavailableError( f"Unable to fetch the latest flatcar image. Upload " f"https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_image.bin.bz2 " f"to AWS as am AMI and set TOIL_AWS_AMI in the environment to its AMI ID." @@ -78,7 +76,6 @@ def get_flatcar_ami(ec2_client: BaseClient, architecture: str = "amd64") -> str: return ami -@retry(errors=[HTTPError]) def _fetch_flatcar_feed(architecture: str = "amd64", source: str = "stable") -> bytes: """ Get the binary data of the Flatcar release feed for the given architecture. @@ -94,9 +91,8 @@ def _fetch_flatcar_feed(architecture: str = "amd64", source: str = "stable") -> JSON_FEED_URL = { "stable": f"https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json", "beta": f"https://beta.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json", - "alpha": f"https://alpha.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json", - "archive": f"https://web.archive.org/web/20220625112618if_/https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json", - "toil": f"https://raw.githubusercontent.com/DataBiosphere/toil/master/contrib/flatcar/{architecture}-usr/current/flatcar_production_ami_all.json", + # "alpha": f"https://alpha.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json", + "archive": f"https://web.archive.org/web/20220625112618if_/https://stable.release.flatcar-linux.net/{architecture}-usr/current/flatcar_production_ami_all.json" }[source] return cast(bytes, urllib.request.urlopen(JSON_FEED_URL).read()) @@ -121,13 +117,15 @@ def flatcar_release_feed_amis( try_number = 0 while try_number < MAX_TRIES: + if try_number != 0: + time.sleep(1) try: feed = json.loads(_fetch_flatcar_feed(architecture, source)) break except HTTPError: # Flatcar servers did not return the feed logger.exception(f"Could not retrieve {source} Flatcar release feed JSON") - # Don't retry + # This is probably a permanent error, or at least unlikely to go away immediately. return except json.JSONDecodeError: # Feed is not JSON @@ -149,19 +147,12 @@ def flatcar_release_feed_amis( for ami_record in feed.get("amis", []): # Scan the list of regions - if ami_record.get("name", None) == region: - # When we find ours, return the AMI ID - if "hvm" in ami_record: - yield ami_record["hvm"] - # And stop, there should be one per region. - return + if ami_record.get("name") == region: + return ami_record.get("hvm") # We didn't find our region - logger.warning( - f"Flatcar {source} release feed does not have an image for region {region}" - ) + logger.warning(f"Flatcar {source} release feed does not have an image for region {region}") -@retry() # TODO: What errors do we get for timeout, JSON parse failure, etc? def feed_flatcar_ami_release( ec2_client: BaseClient, architecture: str = "amd64", source: str = "stable" ) -> Optional[str]: @@ -170,6 +161,8 @@ def feed_flatcar_ami_release( Verify it's on AWS. + Does not raise exceptions. + :param ec2_client: Boto3 EC2 Client :param architecture: The architecture type for the new AWS machine. Can be either amd64 or arm64 :param source: can be set to a Flatcar release channel ('stable', 'beta', @@ -186,43 +179,42 @@ def feed_flatcar_ami_release( # verify it exists on AWS try: response = ec2_client.describe_images(Filters=[{"Name": "image-id", "Values": [ami]}]) # type: ignore - if ( - len(response["Images"]) == 1 - and response["Images"][0]["State"] == "available" - ): + if (len(response["Images"]) == 1 and response["Images"][0]["State"] == "available"): return ami else: - logger.warning( - f"Flatcar release feed suggests image {ami} which does not exist on AWS in {region}" - ) - except ClientError: + logger.warning(f"Flatcar release feed suggests image {ami} which does not exist on AWS in {region}") + except (ClientError, EndpointConnectionError): # Sometimes we get back nonsense like: # botocore.exceptions.ClientError: An error occurred (AuthFailure) when calling the DescribeImages operation: AWS was not able to validate the provided access credentials # Don't hold that against the AMI. - logger.exception( - f"Unable to check if AMI {ami} exists on AWS in {region}; assuming it does" - ) + logger.exception(f"Unable to check if AMI {ami} exists on AWS in {region}; assuming it does") return ami # We didn't find it - logger.warning( - f"Flatcar release feed does not have an image for region {region} that exists on AWS" - ) - return None + logger.warning(f"Flatcar release feed does not have an image for region {region} that exists on AWS") -@retry() # TODO: What errors do we get for timeout, JSON parse failure, etc? def aws_marketplace_flatcar_ami_search( ec2_client: BaseClient, architecture: str = "amd64" ) -> Optional[str]: - """Query AWS for all AMI names matching ``Flatcar-stable-*`` and return the most recent one.""" + """ + Query AWS for all AMI names matching ``Flatcar-stable-*`` and return the most recent one. + + Does not raise exceptions. + + :returns: An AMI name, or None if no matching AMI was found or we could not talk to AWS. + """ # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images # Possible arch choices on AWS: 'i386'|'x86_64'|'arm64'|'x86_64_mac' architecture_mapping = {"amd64": "x86_64", "arm64": "arm64"} - response = ec2_client.describe_images( # type: ignore[attr-defined] - Owners=["aws-marketplace"], - Filters=[{"Name": "name", "Values": ["Flatcar-stable-*"]}], - ) + try: + response = ec2_client.describe_images( # type: ignore[attr-defined] + Owners=["aws-marketplace"], + Filters=[{"Name": "name", "Values": ["Flatcar-stable-*"]}], + ) + except (ClientError, EndpointConnectionError): + logger.exception("Unable to search AWS marketplace") + return None latest: dict[str, str] = {"CreationDate": "0lder than atoms."} for image in response["Images"]: if ( diff --git a/src/toil/test/lib/test_ec2.py b/src/toil/test/lib/test_ec2.py index 16efac4dee..0d53e41a27 100644 --- a/src/toil/test/lib/test_ec2.py +++ b/src/toil/test/lib/test_ec2.py @@ -14,13 +14,14 @@ import logging import os -import pytest +from unittest import mock from toil.lib.aws.ami import ( aws_marketplace_flatcar_ami_search, feed_flatcar_ami_release, flatcar_release_feed_amis, get_flatcar_ami, + ReleaseFeedUnavailableError ) from toil.test import ToilTest, needs_aws_ec2, needs_online @@ -69,15 +70,18 @@ def setUpClass(cls): def test_fetch_flatcar(self): with self.subTest("Test flatcar AMI from user is prioritized."): - os.environ["TOIL_AWS_AMI"] = "overridden" - ami = get_flatcar_ami(self.ec2_client) - self.assertEqual(ami, "overridden") - del os.environ["TOIL_AWS_AMI"] + with mock.patch.dict(os.environ, {"TOIL_AWS_AMI": "overridden"}): + ami = get_flatcar_ami(self.ec2_client) + self.assertEqual(ami, "overridden") with self.subTest("Test flatcar AMI returns an AMI-looking AMI."): - ami = get_flatcar_ami(self.ec2_client) - self.assertEqual(len(ami), len("ami-02b46c73fed689d1c")) - self.assertTrue(ami.startswith("ami-")) + try: + ami = get_flatcar_ami(self.ec2_client) + self.assertEqual(len(ami), len("ami-02b46c73fed689d1c")) + self.assertTrue(ami.startswith("ami-")) + except ReleaseFeedUnavailableError: + # Ignore any remote systems being down. + pass with self.subTest( "Test feed_flatcar_ami_release() returns an AMI-looking AMI." @@ -90,16 +94,14 @@ def test_fetch_flatcar(self): "Test aws_marketplace_flatcar_ami_search() returns an AMI-looking AMI." ): ami = aws_marketplace_flatcar_ami_search(self.ec2_client) - self.assertEqual(len(ami), len("ami-02b46c73fed689d1c")) - self.assertTrue(ami.startswith("ami-")) + self.assertTrue(ami is None or len(ami), len("ami-02b46c73fed689d1c")) + self.assertTrue(ami is None or ami.startswith("ami-")) - # TODO: This will fail until https://github.com/flatcar/Flatcar/issues/962 is fixed - @pytest.mark.xfail def test_fetch_arm_flatcar(self): """Test flatcar AMI finder architecture parameter.""" - amis = set() - for arch in ["amd64", "arm64"]: - ami = get_flatcar_ami(self.ec2_client, architecture=arch) + try: + ami = get_flatcar_ami(self.ec2_client, architecture="arm64") self.assertTrue(ami.startswith("ami-")) - amis.add(ami) - self.assertTrue(len(amis) == 2) + except ReleaseFeedUnavailableError: + # Ignore any remote systems being down. + pass