From 7e0321e0c4aa3aaeb6ac25e2b3e483684db29cf0 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Tue, 21 Jan 2025 14:38:36 -0500 Subject: [PATCH] Add ee cleanup tests * Adds cleanup tests to the live test. --- awx/main/tasks/receptor.py | 10 ++- awx/main/tasks/system.py | 87 ++++++++++++------- awx/main/tests/functional/test_tasks.py | 12 +-- awx/main/tests/live/tests/conftest.py | 18 ++++ .../tests/live/tests/test_cleanup_task.py | 44 +++++++++- 5 files changed, 127 insertions(+), 44 deletions(-) diff --git a/awx/main/tasks/receptor.py b/awx/main/tasks/receptor.py index 2fbf6791ed4c..f3fb91c57395 100644 --- a/awx/main/tasks/receptor.py +++ b/awx/main/tasks/receptor.py @@ -228,22 +228,24 @@ class RemoteJobError(RuntimeError): pass -def run_until_complete(node, timing_data=None, **kwargs): +def run_until_complete(node, timing_data=None, worktype='ansible-runner', ttl='20s', **kwargs): """ Runs an ansible-runner work_type on remote node, waits until it completes, then returns stdout. """ + config_data = read_receptor_config() receptor_ctl = get_receptor_ctl(config_data) use_stream_tls = getattr(get_conn_type(node, receptor_ctl), 'name', None) == "STREAMTLS" kwargs.setdefault('tlsclient', get_tls_client(config_data, use_stream_tls)) - kwargs.setdefault('ttl', '20s') + if ttl is not None: + kwargs['ttl'] = ttl kwargs.setdefault('payload', '') if work_signing_enabled(config_data): kwargs['signwork'] = True transmit_start = time.time() - result = receptor_ctl.submit_work(worktype='ansible-runner', node=node, **kwargs) + result = receptor_ctl.submit_work(worktype=worktype, node=node, **kwargs) unit_id = result['unitid'] run_start = time.time() @@ -371,7 +373,7 @@ def _convert_args_to_cli(vargs): return args -def worker_cleanup(node_name, vargs, timeout=300.0): +def worker_cleanup(node_name, vargs): args = _convert_args_to_cli(vargs) remote_command = ' '.join(args) diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py index 6d161d2ef8b5..e9facd55a894 100644 --- a/awx/main/tasks/system.py +++ b/awx/main/tasks/system.py @@ -25,6 +25,7 @@ from django.utils.translation import gettext_noop from django.core.cache import cache from django.core.exceptions import ObjectDoesNotExist +from django.db.models.query import QuerySet # Django-CRUM from crum import impersonate @@ -379,48 +380,68 @@ def purge_old_stdout_files(): logger.debug("Removing {}".format(os.path.join(settings.JOBOUTPUT_ROOT, f))) -def _cleanup_images_and_files(**kwargs): - if settings.IS_K8S: - return - this_inst = Instance.objects.me() - runner_cleanup_kwargs = this_inst.get_cleanup_task_kwargs(**kwargs) - if runner_cleanup_kwargs: - stdout = '' - with StringIO() as buffer: - with redirect_stdout(buffer): - ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs) - stdout = buffer.getvalue() - if '(changed: True)' in stdout: - logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}') - - # if we are the first instance alphabetically, then run cleanup on execution nodes - checker_instance = ( - Instance.objects.filter(node_type__in=['hybrid', 'control'], node_state=Instance.States.READY, enabled=True, capacity__gt=0) - .order_by('-hostname') - .first() - ) - if checker_instance and this_inst.hostname == checker_instance.hostname: - for inst in Instance.objects.filter(node_type='execution', node_state=Instance.States.READY, enabled=True, capacity__gt=0): - runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs) - if not runner_cleanup_kwargs: - continue - try: - stdout = worker_cleanup(inst.hostname, runner_cleanup_kwargs) - if '(changed: True)' in stdout: - logger.info(f'Performed cleanup on execution node {inst.hostname} with output:\n{stdout}') - except RuntimeError: - logger.exception(f'Error running cleanup on execution node {inst.hostname}') +class CleanupImagesAndFiles: + @classmethod + def get_first_control_instance(cls) -> Instance | None: + return ( + Instance.objects.filter(node_type__in=['hybrid', 'control'], node_state=Instance.States.READY, enabled=True, capacity__gt=0) + .order_by('-hostname') + .first() + ) + + @classmethod + def get_execution_instances(cls) -> QuerySet[Instance]: + return Instance.objects.filter(node_type='execution', node_state=Instance.States.READY, enabled=True, capacity__gt=0) + + @classmethod + def run_local(cls, this_inst: Instance, **kwargs): + if settings.IS_K8S: + return + runner_cleanup_kwargs = this_inst.get_cleanup_task_kwargs(**kwargs) + if runner_cleanup_kwargs: + stdout = '' + with StringIO() as buffer: + with redirect_stdout(buffer): + ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs) + stdout = buffer.getvalue() + if '(changed: True)' in stdout: + logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}') + + @classmethod + def run_remote(cls, this_inst: Instance, **kwargs): + # if we are the first instance alphabetically, then run cleanup on execution nodes + checker_instance = cls.get_first_control_instance() + + if checker_instance and this_inst.hostname == checker_instance.hostname: + for inst in cls.get_execution_instances(): + runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs) + if not runner_cleanup_kwargs: + continue + try: + stdout = worker_cleanup(inst.hostname, runner_cleanup_kwargs) + if '(changed: True)' in stdout: + logger.info(f'Performed cleanup on execution node {inst.hostname} with output:\n{stdout}') + except RuntimeError: + logger.exception(f'Error running cleanup on execution node {inst.hostname}') + + @classmethod + def run(cls, **kwargs): + if settings.IS_K8S: + return + this_inst = Instance.objects.me() + cls.run_local(this_inst, **kwargs) + cls.run_remote(this_inst, **kwargs) @task(queue='tower_broadcast_all') def handle_removed_image(remove_images=None): """Special broadcast invocation of this method to handle case of deleted EE""" - _cleanup_images_and_files(remove_images=remove_images, file_pattern='') + CleanupImagesAndFiles.run(remove_images=remove_images, file_pattern='') @task(queue=get_task_queuename) def cleanup_images_and_files(): - _cleanup_images_and_files(image_prune=True) + CleanupImagesAndFiles.run(image_prune=True) @task(queue=get_task_queuename) diff --git a/awx/main/tests/functional/test_tasks.py b/awx/main/tests/functional/test_tasks.py index ba252080c2ac..8e5305807f0f 100644 --- a/awx/main/tests/functional/test_tasks.py +++ b/awx/main/tests/functional/test_tasks.py @@ -4,7 +4,7 @@ import shutil from awx.main.tasks.jobs import RunJob -from awx.main.tasks.system import execution_node_health_check, _cleanup_images_and_files +from awx.main.tasks.system import CleanupImagesAndFiles, execution_node_health_check from awx.main.models import Instance, Job @@ -48,22 +48,22 @@ def mock_job_folder(job_folder_factory): @pytest.mark.django_db def test_folder_cleanup_stale_file(mock_job_folder, mock_me): - _cleanup_images_and_files() + CleanupImagesAndFiles.run() assert os.path.exists(mock_job_folder) # grace period should protect folder from deletion - _cleanup_images_and_files(grace_period=0) + CleanupImagesAndFiles.run(grace_period=0) assert not os.path.exists(mock_job_folder) # should be deleted @pytest.mark.django_db def test_folder_cleanup_running_job(mock_job_folder, me_inst): job = Job.objects.create(id=1234, controller_node=me_inst.hostname, status='running') - _cleanup_images_and_files(grace_period=0) + CleanupImagesAndFiles.run(grace_period=0) assert os.path.exists(mock_job_folder) # running job should prevent folder from getting deleted job.status = 'failed' job.save(update_fields=['status']) - _cleanup_images_and_files(grace_period=0) + CleanupImagesAndFiles.run(grace_period=0) assert not os.path.exists(mock_job_folder) # job is finished and no grace period, should delete @@ -78,7 +78,7 @@ def test_folder_cleanup_multiple_running_jobs(job_folder_factory, me_inst): dirs.append(job_folder_factory(job.id)) jobs.append(job) - _cleanup_images_and_files(grace_period=0) + CleanupImagesAndFiles.run(grace_period=0) assert [os.path.exists(d) for d in dirs] == [True for i in range(num_jobs)] diff --git a/awx/main/tests/live/tests/conftest.py b/awx/main/tests/live/tests/conftest.py index 7531e3ceae11..796e37cb0b54 100644 --- a/awx/main/tests/live/tests/conftest.py +++ b/awx/main/tests/live/tests/conftest.py @@ -1,3 +1,4 @@ +import subprocess import time import pytest @@ -59,3 +60,20 @@ def default_org(): def demo_inv(default_org): inventory, _ = Inventory.objects.get_or_create(name='Demo Inventory', defaults={'organization': default_org}) return inventory + + +@pytest.fixture +def podman_image_generator(): + """ + Generate a tagless podman image from awx base EE + """ + + def fn(): + dockerfile = """ + FROM quay.io/ansible/awx-ee:latest + RUN echo "Hello, Podman!" > /tmp/hello.txt + """ + cmd = ['podman', 'build', '-f', '-'] # Create an image without a tag + subprocess.run(cmd, capture_output=True, input=dockerfile, text=True, check=True) + + return fn diff --git a/awx/main/tests/live/tests/test_cleanup_task.py b/awx/main/tests/live/tests/test_cleanup_task.py index 137032b48cb1..d549e80ec1eb 100644 --- a/awx/main/tests/live/tests/test_cleanup_task.py +++ b/awx/main/tests/live/tests/test_cleanup_task.py @@ -1,11 +1,20 @@ import os +import json +import pytest import tempfile import subprocess -from awx.main.tasks.receptor import _convert_args_to_cli + +from awx.main.tasks.receptor import _convert_args_to_cli, run_until_complete +from awx.main.tasks.system import CleanupImagesAndFiles from awx.main.models import Instance, JobTemplate +def get_podman_images(): + cmd = ['podman', 'images', '--format', 'json'] + return json.loads((subprocess.run(cmd, capture_output=True, text=True, check=True)).stdout) + + def test_folder_cleanup_multiple_running_jobs_execution_node(request): demo_jt = JobTemplate.objects.get(name='Demo Job Template') @@ -37,3 +46,36 @@ def delete_jobs(): print('ansible-runner worker ' + remote_command) assert [os.path.exists(job_dir) for job_dir in job_dirs] == [True for i in range(3)] + + +@pytest.mark.parametrize( + 'worktype', + ('remote', 'local'), +) +def test_tagless_image(podman_image_generator, worktype: str): + """ + Ensure podman images on Control and Hybrid nodes are deleted during cleanup. + """ + podman_image_generator() + + dangling_image = next((image for image in get_podman_images() if image.get('Dangling', False)), None) + assert dangling_image + + instance_me = Instance.objects.me() + + match worktype: + case 'local': + CleanupImagesAndFiles.run_local(instance_me, image_prune=True) + case 'remote': + with ( + mock.patch( + 'awx.main.tasks.receptor.run_until_complete', lambda *args, **kwargs: run_until_complete(*args, worktype='local', ttl=None, **kwargs) + ), + mock.patch('awx.main.tasks.system.CleanupImagesAndFiles.get_execution_instances', lambda: [Instance.objects.me()]), + ): + CleanupImagesAndFiles.run_remote(instance_me, image_prune=True) + case _: + raise ValueError(f'worktype "{worktype}" not supported.') + + for image in get_podman_images(): + assert image['Id'] != dangling_image['Id']