From c6fb70789da46ba78e1863707a98d9c7ee9a5773 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 09:56:22 +0200 Subject: [PATCH 001/218] initial version of grouping tarballs in a single staging PR --- .../automated_ingestion.cfg.example | 43 +++++++++++++++ .../automated_ingestion.py | 55 ++++++++++++++++--- scripts/automated_ingestion/eessitarball.py | 39 +++++++++++++ 3 files changed, 130 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.cfg.example b/scripts/automated_ingestion/automated_ingestion.cfg.example index 68df3e4e..bdf40fa3 100644 --- a/scripts/automated_ingestion/automated_ingestion.cfg.example +++ b/scripts/automated_ingestion/automated_ingestion.cfg.example @@ -63,7 +63,33 @@ pr_body = A new tarball has been staged for {pr_url}. ``` + +
+    <details>
+    <summary>Overview of tarball contents</summary>
+
+    {tar_overview}
+
+    </details>
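For context on the grouped method configured below: tarballs are grouped by the `link2pr` section of their metadata files. A minimal sketch of how the grouping key is derived, with hypothetical field values:

```python
import json

# Hypothetical contents of a tarball's .meta.txt file; only the link2pr
# section is relevant for grouping (repo/pr values are illustrative).
metadata = json.loads('{"link2pr": {"repo": "EESSI/software-layer", "pr": "512"}}')

# find_tarball_groups() keys each group on the (repo, pr) tuple.
group_key = (metadata['link2pr']['repo'], metadata['link2pr']['pr'])
print(group_key)  # ('EESSI/software-layer', '512')
```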
+ +# Method for creating staging PRs: +# - 'individual': create one PR per tarball (old method) +# - 'grouped': group tarballs by link2pr and create one PR per group (new method) +staging_pr_method = individual +# Template for individual tarball PRs +individual_pr_body = A new tarball has been staged for {pr_url}. + Please review the contents of this tarball carefully. + Merging this PR will lead to automatic ingestion of the tarball to the repository {cvmfs_repo}. + +
+    <details>
+    <summary>Metadata of tarball</summary>
+
+    ```
+    {metadata}
+    ```
+
+    </details>
+
Overview of tarball contents @@ -71,6 +97,23 @@ pr_body = A new tarball has been staged for {pr_url}.
+# Template for grouped tarball PRs +grouped_pr_body = A group of tarballs has been staged for {pr_url}. + Please review the contents of these tarballs carefully. + Merging this PR will lead to automatic ingestion of the approved tarballs to the repository {cvmfs_repo}. + Unchecked tarballs will be marked as rejected. + + {tarballs} + +
+    <details>
+    <summary>Overview of tarball contents</summary>
+
+    {tar_overview}
+
+    </details>
+ + {metadata} + [slack] ingestion_notification = yes ingestion_message = Tarball `{tarball}` has been ingested into the CVMFS repository `{cvmfs_repo}`. diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 92dac552..41d928c7 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -51,6 +51,37 @@ def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt ] return tarballs +def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): + """Return a dictionary of tarball groups, keyed by (repo, pr_number).""" + tarballs = find_tarballs(s3, bucket, extension, metadata_extension) + groups = {} + + for tarball in tarballs: + # Download metadata to get link2pr info + metadata_file = tarball + metadata_extension + local_metadata = os.path.join(config['paths']['download_dir'], os.path.basename(metadata_file)) + + try: + s3.download_file(bucket, metadata_file, local_metadata) + with open(local_metadata, 'r') as meta: + metadata = json.load(meta) + repo = metadata['link2pr']['repo'] + pr = metadata['link2pr']['pr'] + group_key = (repo, pr) + + if group_key not in groups: + groups[group_key] = [] + groups[group_key].append(tarball) + except Exception as err: + logging.error(f"Failed to process metadata for {tarball}: {err}") + continue + finally: + # Clean up downloaded metadata file + if os.path.exists(local_metadata): + os.remove(local_metadata) + + return groups + def parse_config(path): """Parse the configuration file.""" @@ -102,14 +133,24 @@ def main(): buckets = json.loads(config['aws']['staging_buckets']) for bucket, cvmfs_repo in buckets.items(): - tarballs = find_tarballs(s3, bucket) - if args.list_only: - for num, tarball in enumerate(tarballs): - print(f'[{bucket}] {num}: {tarball}') + if config['github'].get('staging_pr_method', 'individual') == 'grouped': + # use new grouped PR method + tarball_groups = find_tarball_groups(s3, bucket, config) + for (repo, pr_id), tarballs in tarball_groups.items(): + if tarballs: + # Create a group handler for these tarballs + group_handler = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) + group_handler.process_group(tarballs) else: - for tarball in tarballs: - tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) - tar.run_handler() + # use old individual PR method + tarballs = find_tarballs(s3, bucket) + if args.list_only: + for num, tarball in enumerate(tarballs): + print(f'[{bucket}] {num}: {tarball}') + else: + for tarball in tarballs: + tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) + tar.run_handler() if __name__ == '__main__': diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 40ac6fa1..127ea2db 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -434,3 +434,42 @@ def issue_exists(self, title, state='open'): return True else: return False + + def get_link2pr_info(self): + """Get the link2pr information from the metadata file.""" + with open(self.local_metadata_path, 'r') as meta: + metadata = json.load(meta) + return metadata['link2pr']['repo'], metadata['link2pr']['pr'] + +class EessiTarballGroup: + """Class to handle a group of tarballs that share the same link2pr information.""" + + def __init__(self, first_tarball, config, git_staging_repo, s3, 
bucket, cvmfs_repo): + """Initialize with the first tarball in the group.""" + self.first_tar = EessiTarball(first_tarball, config, git_staging_repo, s3, bucket, cvmfs_repo) + self.config = config + self.git_repo = git_staging_repo + self.s3 = s3 + self.bucket = bucket + self.cvmfs_repo = cvmfs_repo + + def process_group(self, tarballs): + """Process a group of tarballs together.""" + # Verify all tarballs have the same link2pr info + if not self.verify_group_consistency(tarballs): + logging.error("Tarballs in group have inconsistent link2pr information") + return + + # Process the group + self.first_tar.make_approval_request(tarballs) + + def verify_group_consistency(self, tarballs): + """Verify all tarballs in the group have the same link2pr information.""" + first_repo, first_pr = self.first_tar.get_link2pr_info() + + for tarball in tarballs[1:]: # Skip first tarball as we already have its info + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + repo, pr = temp_tar.get_link2pr_info() + if repo != first_repo or pr != first_pr: + return False + return True From aa32e71c7b0eaa6dac089846ac3b034309e5ff9b Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 10:05:39 +0200 Subject: [PATCH 002/218] add code linting check --- .flake8 | 14 +++++++++++ .github/workflows/check-flake8.yml | 37 ++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 .flake8 create mode 100644 .github/workflows/check-flake8.yml diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..b6b309e3 --- /dev/null +++ b/.flake8 @@ -0,0 +1,14 @@ +# This file is part of the EESSI filesystem layer, +# see https://github.com/EESSI/filesystem-layer +# +# author: Thomas Roeblitz (@trz42) +# +# license: GPLv2 +# + +[flake8] +max-line-length = 120 + +# ignore "Black would make changes" produced by flake8-black +# see also https://github.com/houndci/hound/issues/1769 +extend-ignore = BLK100 diff --git a/.github/workflows/check-flake8.yml b/.github/workflows/check-flake8.yml new file mode 100644 index 00000000..2a3a425b --- /dev/null +++ b/.github/workflows/check-flake8.yml @@ -0,0 +1,37 @@ +# This file is part of the EESSI filesystem layer, +# see https://github.com/EESSI/filesystem-layer +# +# author: Thomas Roeblitz (@trz42) +# +# license: GPLv2 +# + +name: Run tests +on: [push, pull_request] +# Declare default permissions as read only. 
+permissions: read-all +jobs: + test: + runs-on: ubuntu-22.04 + strategy: + matrix: + python: [3.7, 3.8, 3.9, '3.10', '3.11', '3.12'] + fail-fast: false + steps: + - name: checkout + uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + + - name: set up Python + uses: actions/setup-python@13ae5bb136fac2878aff31522b9efb785519f984 # v4.3.0 + with: + python-version: ${{matrix.python}} + + - name: Install required Python packages + pytest + flake8 + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + python -m pip install --upgrade flake8 + + - name: Run flake8 to verify PEP8-compliance of Python code + run: | + flake8 From 88ba094206bd359b89f07f2e3221de2c1f1f09f1 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 10:39:53 +0200 Subject: [PATCH 003/218] revise make_approval_request and fix flake8 issues --- scripts/automated_ingestion/eessitarball.py | 179 ++++++++++++-------- 1 file changed, 110 insertions(+), 69 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 127ea2db..4bfccbf9 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -2,7 +2,6 @@ from pathlib import PurePosixPath -import boto3 import github import json import logging @@ -64,25 +63,23 @@ def download(self, force=False): # and may be optional or required. try: self.s3.download_file(self.bucket, sig_object, local_sig_file) - except: + except Exception as err: + log_msg = 'Failed to download signature file %s for %s from %s to %s.' if self.config['signatures'].getboolean('signatures_required', True): - logging.error( - f'Failed to download signature file {sig_object} for {object} from {self.bucket} to {local_sig_file}.' - ) + log_msg += '\nException: %s' + logging.error(log_msg, sig_object, object, self.bucket, local_sig_file, err) skip = True break else: - logging.warning( - f'Failed to download signature file {sig_object} for {object} from {self.bucket} to {local_sig_file}. ' + - 'Ignoring this, because signatures are not required with the current configuration.' - ) + log_msg += ' Ignoring this, because signatures are not required with the current configuration.' + log_msg += '\nException: %s' + logging.warning(log_msg, sig_object, object, self.bucket, local_sig_file, err) # Now we download the file itself. try: self.s3.download_file(self.bucket, object, local_file) - except: - logging.error( - f'Failed to download {object} from {self.bucket} to {local_file}.' - ) + except Exception as err: + log_msg = 'Failed to download %s from %s to %s.\nException: %s' + logging.error(log_msg, object, self.bucket, local_file, err) skip = True break # If any required download failed, make sure to skip this tarball completely. @@ -100,13 +97,14 @@ def find_state(self): except github.UnknownObjectException: # no metadata file found in this state's directory, so keep searching... continue - except github.GithubException as e: - if e.status == 404: + except github.GithubException as err: + if err.status == 404: # no metadata file found in this state's directory, so keep searching... continue else: # if there was some other (e.g. connection) issue, abort the search for this tarball - logging.warning(f'Unable to determine the state of {self.object}, the GitHub API returned status {e.status}!') + log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' 
+ logging.warning(log_msg, self.object, err.status) return "unknown" else: # if no state was found, we assume this is a new tarball that was ingested to the bucket @@ -128,7 +126,9 @@ def get_contents_overview(self): # determine prefix after filtering out '/init' subdirectory, # to get actual prefix for specific CPU target (like '2023.06/software/linux/aarch64/neoverse_v1') init_subdir = os.path.join('*', 'init') - non_init_paths = sorted([p for p in paths if not any(x.match(init_subdir) for x in PurePosixPath(p).parents)]) + non_init_paths = sorted( + [p for p in paths if not any(x.match(init_subdir) for x in PurePosixPath(p).parents)] + ) if non_init_paths: prefix = os.path.commonprefix(non_init_paths) else: @@ -148,8 +148,8 @@ def get_contents_overview(self): other = [ # anything that is not in /software nor /modules m.path for m in members - if not PurePosixPath(prefix).joinpath('software') in PurePosixPath(m.path).parents - and not PurePosixPath(prefix).joinpath('modules') in PurePosixPath(m.path).parents + if (not PurePosixPath(prefix).joinpath('software') in PurePosixPath(m.path).parents + and not PurePosixPath(prefix).joinpath('modules') in PurePosixPath(m.path).parents) # if not fnmatch.fnmatch(m.path, os.path.join(prefix, 'software', '*')) # and not fnmatch.fnmatch(m.path, os.path.join(prefix, 'modules', '*')) ] @@ -204,16 +204,20 @@ def verify_signatures(self): verify_script = self.config['signatures']['signature_verification_script'] allowed_signers_file = self.config['signatures']['allowed_signers_file'] if not os.path.exists(verify_script): - logging.error(f'Unable to verify signatures, the specified signature verification script does not exist!') + logging.error('Unable to verify signatures, the specified signature verification script does not exist!') return False if not os.path.exists(allowed_signers_file): - logging.error(f'Unable to verify signatures, the specified allowed signers file does not exist!') + logging.error('Unable to verify signatures, the specified allowed signers file does not exist!') return False - for (file, sig_file) in [(self.local_path, self.local_sig_path), (self.local_metadata_path, self.local_metadata_sig_path)]: + for (file, sig_file) in [ + (self.local_path, self.local_sig_path), + (self.local_metadata_path, self.local_metadata_sig_path) + ]: verify_cmd = subprocess.run( - [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, '--file', file, '--signature-file', sig_file], + [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, + '--file', file, '--signature-file', sig_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) if verify_cmd.returncode == 0: @@ -237,7 +241,7 @@ def verify_checksum(self): def ingest(self): """Process a tarball that is ready to be ingested by running the ingestion script.""" - #TODO: check if there is an open issue for this tarball, and if there is, skip it. + # TODO: check if there is an open issue for this tarball, and if there is, skip it. 
logging.info(f'Tarball {self.object} is ready to be ingested.') self.download() logging.info('Verifying its signature...') @@ -273,7 +277,9 @@ def ingest(self): if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): send_slack_message( self.config['secrets']['slack_webhook'], - self.config['slack']['ingestion_message'].format(tarball=os.path.basename(self.object), cvmfs_repo=self.cvmfs_repo) + self.config['slack']['ingestion_message'].format( + tarball=os.path.basename(self.object), + cvmfs_repo=self.cvmfs_repo) ) else: issue_title = f'Failed to ingest {self.object}' @@ -314,7 +320,7 @@ def mark_new_tarball_as_staged(self): logging.info(f'Adding tarball\'s metadata to the "{next_state}" folder of the git repository.') file_path_staged = next_state + '/' + self.metadata_file - new_file = self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch='main') + self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch='main') self.state = next_state self.run_handler() @@ -328,35 +334,39 @@ def print_unknown(self): """Process a tarball which has an unknown state.""" logging.info("The state of this tarball could not be determined, so we're skipping it.") - def make_approval_request(self): + def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - file_path_staged = self.state + '/' + self.metadata_file - file_path_to_ingest = next_state + '/' + self.metadata_file - + # file_path_staged = self.state + '/' + self.metadata_file filename = os.path.basename(self.object) - tarball_metadata = self.git_repo.get_contents(file_path_staged) - git_branch = filename + '_' + next_state - self.download() + # Get link2pr info from metadata + with open(self.local_metadata_path, 'r') as meta: + metadata = meta.read() + meta_dict = json.loads(metadata) + repo, pr_id = meta_dict['link2pr']['repo'], meta_dict['link2pr']['pr'] + pr_url = f"https://github.com/{repo}/pull/{pr_id}" + + # Create branch name based on whether we're handling a group + if tarballs_in_group is None: + # Individual tarball + git_branch = filename + '_' + next_state + else: + # Group of tarballs + sequence = self.find_next_sequence_number(repo, pr_id) + git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + + # Check for existing branch and PR main_branch = self.git_repo.get_branch('main') if git_branch in [branch.name for branch in self.git_repo.get_branches()]: - # Existing branch found for this tarball, so we've run this step before. - # Try to find out if there's already a PR as well... - logging.info("Branch already exists for " + self.object) - # Filtering with only head= returns all prs if there's no match, so double-check - find_pr = [pr for pr in self.git_repo.get_pulls(head=git_branch, state='all') if pr.head.ref == git_branch] - logging.debug('Found PRs: ' + str(find_pr)) + find_pr = [pr for pr in self.git_repo.get_pulls(head=git_branch, state='all') + if pr.head.ref == git_branch] if find_pr: - # So, we have a branch and a PR for this tarball (if there are more, pick the first one)... pr = find_pr.pop(0) - logging.info(f'PR {pr.number} found for {self.object}') if pr.state == 'open': - # The PR is still open, so it hasn't been reviewed yet: ignore this tarball. logging.info('PR is still open, skipping this tarball...') return elif pr.state == 'closed' and not pr.merged: - # The PR was closed but not merged, i.e. 
it was rejected for ingestion. logging.info('PR was rejected') self.reject() return @@ -364,48 +374,78 @@ def make_approval_request(self): logging.warn(f'Warning, tarball {self.object} is in a weird state:') logging.warn(f'Branch: {git_branch}\nPR: {pr}\nPR state: {pr.state}\nPR merged: {pr.merged}') else: - # There is a branch, but no PR for this tarball. - # This is weird, so let's remove the branch and reprocess the tarball. logging.info(f'Tarball {self.object} has a branch, but no PR.') - logging.info(f'Removing existing branch...') + logging.info('Removing existing branch...') ref = self.git_repo.get_git_ref(f'heads/{git_branch}') ref.delete() - logging.info(f'Making pull request to get ingestion approval for {self.object}.') - # Create a new branch + + # Create new branch self.git_repo.create_git_ref(ref='refs/heads/' + git_branch, sha=main_branch.commit.sha) - # Move the file to the directory of the next stage in this branch - self.move_metadata_file(self.state, next_state, branch=git_branch) - # Get metadata file contents - metadata = '' - with open(self.local_metadata_path, 'r') as meta: - metadata = meta.read() - meta_dict = json.loads(metadata) - repo, pr_id = meta_dict['link2pr']['repo'], meta_dict['link2pr']['pr'] - pr_url = f"https://github.com/{repo}/pull/{pr_id}" - # Try to get the tarball contents and open a PR to get approval for the ingestion + + # Move metadata file(s) to staged directory + if tarballs_in_group is None: + self.move_metadata_file(self.state, next_state, branch=git_branch) + else: + for tarball in tarballs_in_group: + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar.move_metadata_file('new', 'staged', branch=git_branch) + + # Create PR with appropriate template try: - tarball_contents = self.get_contents_overview() - pr_body = self.config['github']['pr_body'].format( - cvmfs_repo=self.cvmfs_repo, - pr_url=pr_url, - tar_overview=self.get_contents_overview(), - metadata=metadata, - ) - pr_title = '[%s] Ingest %s' % (self.cvmfs_repo, filename) + if tarballs_in_group is None: + # Individual tarball + tarball_contents = self.get_contents_overview() + pr_body = self.config['github']['individual_pr_body'].format( + cvmfs_repo=self.cvmfs_repo, + pr_url=pr_url, + tar_overview=tarball_contents, + metadata=metadata, + ) + pr_title = f'[{self.cvmfs_repo}] Ingest {filename}' + else: + # Group of tarballs + tar_overviews = [] + for tarball in tarballs_in_group: + try: + temp_tar = EessiTarball( + tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar.download() + overview = temp_tar.get_contents_overview() + tar_details_tpl = "
<details>\n<summary>Contents of %s</summary>\n\n%s\n</details>
\n" + tar_overviews.append(tar_details_tpl % (tarball, overview)) + except Exception as err: + logging.error(f"Failed to get contents overview for {tarball}: {err}") + tar_details_tpl = "
<details>\n<summary>Contents of %s</summary>\n\n"
                        tar_details_tpl += "Failed to get contents overview: %s\n</details>
\n" + tar_overviews.append(tar_details_tpl % (tarball, err)) + + pr_body = self.config['github']['grouped_pr_body'].format( + cvmfs_repo=self.cvmfs_repo, + pr_url=pr_url, + tarballs=self.format_tarball_list(tarballs_in_group), + metadata=self.format_metadata_list(tarballs_in_group), + tar_overview="\n".join(tar_overviews) + ) + pr_title = f'[{self.cvmfs_repo}] Staging PR #{sequence} for {repo}#{pr_id}' + + # Add signature verification status if applicable if self.sig_verified: - pr_body += "\n\n:heavy_check_mark: :closed_lock_with_key: The signature of this tarball has been successfully verified." + pr_body += "\n\n:heavy_check_mark: :closed_lock_with_key: " + pr_body += "The signature of this tarball has been successfully verified." pr_title += ' :closed_lock_with_key:' + self.git_repo.create_pull(title=pr_title, body=pr_body, head=git_branch, base='main') + except Exception as err: issue_title = f'Failed to get contents of {self.object}' issue_body = self.config['github']['failed_tarball_overview_issue_body'].format( tarball=self.object, error=err ) - if len([i for i in self.git_repo.get_issues(state='open') if i.title == issue_title]) == 0: + if not self.issue_exists(issue_title, state='open'): self.git_repo.create_issue(title=issue_title, body=issue_body) else: - logging.info(f'Failed to create tarball overview, but an issue already exists.') + logging.info('Failed to create tarball overview, but an issue already exists.') def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" @@ -441,6 +481,7 @@ def get_link2pr_info(self): metadata = json.load(meta) return metadata['link2pr']['repo'], metadata['link2pr']['pr'] + class EessiTarballGroup: """Class to handle a group of tarballs that share the same link2pr information.""" From dd9b71a05e52276d0f54dd780f0ebf283c60dd6b Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 10:45:32 +0200 Subject: [PATCH 004/218] requirements.txt is not needed --- .github/workflows/check-flake8.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/check-flake8.yml b/.github/workflows/check-flake8.yml index 2a3a425b..f0ebe250 100644 --- a/.github/workflows/check-flake8.yml +++ b/.github/workflows/check-flake8.yml @@ -29,7 +29,6 @@ jobs: - name: Install required Python packages + pytest + flake8 run: | python -m pip install --upgrade pip - python -m pip install -r requirements.txt python -m pip install --upgrade flake8 - name: Run flake8 to verify PEP8-compliance of Python code From 2cfde96141e919b24770702b765079e92c894e71 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 10:53:23 +0200 Subject: [PATCH 005/218] fix flake8 issues in automated_ingestion.py --- .../automated_ingestion.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 41d928c7..5983abe3 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -1,12 +1,11 @@ #!/usr/bin/env python3 -from eessitarball import EessiTarball -from pid.decorator import pidfile +from eessitarball import EessiTarball, EessiTarballGroup +from pid.decorator import pidfile # noqa: F401 from pid import PidFileError import argparse import boto3 -import botocore import configparser import github import json @@ -38,7 +37,10 @@ def error(msg, code=1): def 
find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt'): - """Return a list of all tarballs in an S3 bucket that have a metadata file with the given extension (and same filename).""" + """ + Return a list of all tarballs in an S3 bucket that have a metadata file with + the given extension (and same filename). + """ # TODO: list_objects_v2 only returns up to 1000 objects s3_objects = s3.list_objects_v2(Bucket=bucket).get('Contents', []) files = [obj['Key'] for obj in s3_objects] @@ -46,11 +48,11 @@ def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt tarballs = [ file for file in files - if file.endswith(extension) - and file + metadata_extension in files + if file.endswith(extension) and file + metadata_extension in files ] return tarballs + def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): """Return a dictionary of tarball groups, keyed by (repo, pr_number).""" tarballs = find_tarballs(s3, bucket, extension, metadata_extension) @@ -88,15 +90,15 @@ def parse_config(path): config = configparser.ConfigParser() try: config.read(path) - except: - error(f'Unable to read configuration file {path}!') + except Exception as err: + error(f'Unable to read configuration file {path}!\nException: {err}') # Check if all required configuration parameters/sections can be found. for section in REQUIRED_CONFIG.keys(): - if not section in config: + if section not in config: error(f'Missing section "{section}" in configuration file {path}.') for item in REQUIRED_CONFIG[section]: - if not item in config[section]: + if item not in config[section]: error(f'Missing configuration item "{item}" in section "{section}" of configuration file {path}.') return config From 0cb2622b8adac2283258181c016e6ead10098ef6 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 11:02:35 +0200 Subject: [PATCH 006/218] fix flake8 issues in check-stratum-servers.py --- scripts/check-stratum-servers.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/check-stratum-servers.py b/scripts/check-stratum-servers.py index de4270d9..4e35b09e 100755 --- a/scripts/check-stratum-servers.py +++ b/scripts/check-stratum-servers.py @@ -9,7 +9,8 @@ import yaml # Default location for EESSI's Ansible group vars file containing the CVMFS settings. -DEFAULT_ANSIBLE_GROUP_VARS_LOCATION = 'https://raw.githubusercontent.com/EESSI/filesystem-layer/main/inventory/group_vars/all.yml' +DEFAULT_ANSIBLE_GROUP_VARS_LOCATION = \ + 'https://raw.githubusercontent.com/EESSI/filesystem-layer/main/inventory/group_vars/all.yml' # Default fully qualified CVMFS repository name DEFAULT_CVMFS_FQRN = 'software.eessi.io' # Maximum amount of time (in minutes) that a Stratum 1 is allowed to not having performed a snapshot. 
@@ -32,8 +33,8 @@ def find_stratum_urls(vars_file, fqrn): """Find all Stratum 0/1 URLs in a given Ansible YAML vars file that contains the EESSI CVMFS configuration.""" try: group_vars = urllib.request.urlopen(vars_file) - except: - error(f'Cannot read the file that contains the Stratum 1 URLs from {vars_file}!') + except Exception as err: + error(f'Cannot read the file that contains the Stratum 1 URLs from {vars_file}!\nException: {err}') try: group_vars_yaml = yaml.safe_load(group_vars) s1_urls = group_vars_yaml['eessi_cvmfs_server_urls'][0]['urls'] @@ -44,8 +45,8 @@ def find_stratum_urls(vars_file, fqrn): break else: error(f'Could not find Stratum 0 URL in {vars_file}!') - except: - error(f'Cannot parse the yaml file from {vars_file}!') + except Exception as err: + error(f'Cannot parse the yaml file from {vars_file}!\nException: {err}') return s0_url, s1_urls @@ -64,7 +65,7 @@ def check_revisions(stratum_urls, fqrn): revisions[stratum] = int(rev_matches[0]) else: errors.append(f'Could not find revision number for stratum {stratum}!') - except urllib.error.HTTPError as e: + except urllib.error.HTTPError: errors.append(f'Could not connect to {stratum}!') # Check if all revisions are the same. @@ -95,10 +96,11 @@ def check_snapshots(s1_urls, fqrn, max_snapshot_delay=DEFAULT_MAX_SNAPSHOT_DELAY # Stratum 1 servers are supposed to make a snapshot every few minutes, # so let's check if it is not too far behind. if now - last_snapshot_time > datetime.timedelta(minutes=max_snapshot_delay): + time_diff = (now - last_snapshot_time).seconds / 60 errors.append( - f'Stratum 1 {s1} has made its last snapshot {(now - last_snapshot_time).seconds / 60:.0f} minutes ago!') - except urllib.error.HTTPError as e: - errors.append(f'Could not connect to {s1_json}!') + f'Stratum 1 {s1} has made its last snapshot {time_diff:.0f} minutes ago!') + except urllib.error.HTTPError: + errors.append(f'Could not connect to {s1_snapshot_file}!') if last_snapshots: # Get the Stratum 1 with the most recent snapshot... 
From 7214d27526ae7a6d21ea7af27c2a26680426d284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 25 Apr 2025 21:37:39 +0200 Subject: [PATCH 007/218] incremental updates through testing --- .../automated_ingestion.py | 18 +++-- scripts/automated_ingestion/eessitarball.py | 67 +++++++++++++++++++ 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 5983abe3..ca059431 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -131,6 +131,8 @@ def main(): 's3', aws_access_key_id=config['secrets']['aws_access_key_id'], aws_secret_access_key=config['secrets']['aws_secret_access_key'], + endpoint_url=config['aws']['endpoint_url'], + verify=config['aws']['verify_cert_path'], ) buckets = json.loads(config['aws']['staging_buckets']) @@ -138,11 +140,17 @@ def main(): if config['github'].get('staging_pr_method', 'individual') == 'grouped': # use new grouped PR method tarball_groups = find_tarball_groups(s3, bucket, config) - for (repo, pr_id), tarballs in tarball_groups.items(): - if tarballs: - # Create a group handler for these tarballs - group_handler = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - group_handler.process_group(tarballs) + if args.list_only: + print(f"#tarball_groups: {len(tarball_groups)}") + for (repo, pr_id), tarballs in tarball_groups.items(): + print(f" {repo}#{pr_id}: #tarballs {len(tarballs)}") + else: + for (repo, pr_id), tarballs in tarball_groups.items(): + if tarballs: + # Create a group handler for these tarballs + group_handler = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) + print(f"group_handler created\n{group_handler.to_string()}") + group_handler.process_group(tarballs) else: # use old individual PR method tarballs = find_tarballs(s3, bucket) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 4bfccbf9..88bfc1df 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -181,6 +181,17 @@ def run_handler(self): handler = self.states[self.state]['handler'] handler() + def to_string(self): + """Serialize tarball info so it can be printed.""" + str = f"tarball: {self.object}" + str += f"\n metadt: {self.metadata_file}" + str += f"\n config: {self.config}" + str += f"\n s3....: {self.s3}" + str += f"\n bucket: {self.bucket}" + str += f"\n cvmfs.: {self.cvmfs_repo}" + str += f"\n GHrepo: {self.git_repo}" + return str + def verify_signatures(self): """Verify the signatures of the downloaded tarball and metadata file using the corresponding signature files.""" @@ -334,6 +345,35 @@ def print_unknown(self): """Process a tarball which has an unknown state.""" logging.info("The state of this tarball could not be determined, so we're skipping it.") + def find_next_sequence_number(self, repo, pr_id): + """Find the next available sequence number for staging PRs of a source PR.""" + # Search for existing branches for this source PR + base_branch = f'staging-{repo.replace("/", "-")}-{pr_id}' + existing_branches = [ + ref.ref for ref in self.git_repo.get_git_refs() + if ref.ref.startswith(f'refs/heads/{base_branch}') + ] + + if not existing_branches: + return 1 + + # Extract sequence numbers from existing branches + sequence_numbers = [] + for branch in existing_branches: + try: + # Extract the 
sequence number from branch name + # Format: staging-repo-pr_id-sequence + sequence = int(branch.split('-')[-1]) + sequence_numbers.append(sequence) + except (ValueError, IndexError): + continue + + if not sequence_numbers: + return 1 + + # Return next available sequence number + return max(sequence_numbers) + 1 + def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) @@ -494,8 +534,24 @@ def __init__(self, first_tarball, config, git_staging_repo, s3, bucket, cvmfs_re self.bucket = bucket self.cvmfs_repo = cvmfs_repo + def download_tarballs_and_more(self, tarballs): + """Download all files associated with this group of tarballs.""" + for tarball in tarballs: + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + print(f"downloading files for '{temp_tar.object}'") + temp_tar.download(force=True) + if not temp_tar.local_path or not temp_tar.local_metadata_path: + logging.warn(f"Skipping this tarball: {temp_tar.object}") + return False + return True + def process_group(self, tarballs): """Process a group of tarballs together.""" + # download tarballs, metadata files and their signatures + if not self.download_tarballs_and_more(tarballs): + logging.error("Downloading tarballs, metadata files and/or their signatures failed") + return + # Verify all tarballs have the same link2pr info if not self.verify_group_consistency(tarballs): logging.error("Tarballs in group have inconsistent link2pr information") @@ -504,12 +560,23 @@ def process_group(self, tarballs): # Process the group self.first_tar.make_approval_request(tarballs) + def to_string(self): + """Serialize tarball group info so it can be printed.""" + str = f"first tarball: {self.first_tar.to_string()}" + str += f"\n config: {self.config}" + str += f"\n GHrepo: {self.git_repo}" + str += f"\n s3....: {self.s3}" + str += f"\n bucket: {self.bucket}" + str += f"\n cvmfs.: {self.cvmfs_repo}" + return str + def verify_group_consistency(self, tarballs): """Verify all tarballs in the group have the same link2pr information.""" first_repo, first_pr = self.first_tar.get_link2pr_info() for tarball in tarballs[1:]: # Skip first tarball as we already have its info temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + print(f"temp tar: {temp_tar.to_string()}") repo, pr = temp_tar.get_link2pr_info() if repo != first_repo or pr != first_pr: return False From 946dd65e3222b2164f991cf108836c4305ac7874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 25 Apr 2025 23:52:44 +0200 Subject: [PATCH 008/218] add missing functions --- .../automated_ingestion.py | 12 ++++ scripts/automated_ingestion/eessitarball.py | 62 +++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index ca059431..285f2c0c 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -100,6 +100,18 @@ def parse_config(path): for item in REQUIRED_CONFIG[section]: if item not in config[section]: error(f'Missing configuration item "{item}" in section "{section}" of configuration file {path}.') + + # Validate staging_pr_method + staging_method = config['github'].get('staging_pr_method', 'individual') + if staging_method not in ['individual', 'grouped']: + error(f'Invalid 
staging_pr_method: "{staging_method}" in configuration file {path}. Must be either "individual" or "grouped".') + + # Validate PR body templates + if staging_method == 'individual' and 'individual_pr_body' not in config['github']: + error(f'Missing "individual_pr_body" in configuration file {path}.') + if staging_method == 'grouped' and 'grouped_pr_body' not in config['github']: + error(f'Missing "grouped_pr_body" in configuration file {path}.') + return config diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 88bfc1df..50f6c16f 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -487,6 +487,22 @@ def make_approval_request(self, tarballs_in_group=None): else: logging.info('Failed to create tarball overview, but an issue already exists.') + def format_tarball_list(self, tarballs): + """Format a list of tarballs with checkboxes for approval.""" + formatted = "### Tarballs to be ingested\n\n" + for tarball in tarballs: + formatted += f"- [ ] {tarball}\n" + return formatted + + def format_metadata_list(self, tarballs): + """Format metadata for all tarballs in collapsible sections.""" + formatted = "### Metadata\n\n" + for tarball in tarballs: + with open(self.get_metadata_path(tarball), 'r') as meta: + metadata = meta.read() + formatted += f"
<details>\n<summary>Metadata for {tarball}</summary>\n\n```\n{metadata}\n```\n</details>
\n\n" + return formatted + def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file @@ -499,6 +515,52 @@ def move_metadata_file(self, old_state, new_state, branch='main'): self.git_repo.create_file(file_path_new, 'move to ' + new_state, tarball_metadata.decoded_content, branch=branch) + def process_pr_merge(self, pr_number): + """Process a merged PR by handling the checkboxes and moving tarballs to appropriate states.""" + pr = self.git_repo.get_pull(pr_number) + + # Get the branch name + branch_name = pr.head.ref + + # Get the list of tarballs from the PR body + tarballs = self.extract_tarballs_from_pr_body(pr.body) + + # Get the checked status for each tarball + checked_tarballs = self.extract_checked_tarballs(pr.body) + + # Process each tarball + for tarball in tarballs: + if tarball in checked_tarballs: + # Move to approved state + self.move_metadata_file('staged', 'approved', branch=branch_name) + else: + # Move to rejected state + self.move_metadata_file('staged', 'rejected', branch=branch_name) + + # Delete the branch after processing + ref = self.git_repo.get_git_ref(f'heads/{branch_name}') + ref.delete() + + def extract_checked_tarballs(self, pr_body): + """Extract list of checked tarballs from PR body.""" + checked_tarballs = [] + for line in pr_body.split('\n'): + if line.strip().startswith('- [x] '): + tarball = line.strip()[6:] # Remove '- [x] ' prefix + checked_tarballs.append(tarball) + return checked_tarballs + + def extract_tarballs_from_pr_body(self, pr_body): + """Extract list of all tarballs from PR body.""" + tarballs = [] + for line in pr_body.split('\n'): + if line.strip().startswith('- ['): + tarball = line.strip()[6:] # Remove '- [ ] ' or '- [x] ' prefix + tarballs.append(tarball) + return tarballs + + def reject(self): + """Reject a tarball for ingestion.""" def reject(self): """Reject a tarball for ingestion.""" # Let's move the the tarball to the directory for rejected tarballs. 
From 51894ec71ec0dac855eedd14cb4deabe6b0a2aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 00:31:21 +0200 Subject: [PATCH 009/218] mark tarballs in group as new initially --- scripts/automated_ingestion/automated_ingestion.py | 8 ++++---- scripts/automated_ingestion/eessitarball.py | 9 ++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 285f2c0c..7668dfb0 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -159,10 +159,10 @@ def main(): else: for (repo, pr_id), tarballs in tarball_groups.items(): if tarballs: - # Create a group handler for these tarballs - group_handler = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - print(f"group_handler created\n{group_handler.to_string()}") - group_handler.process_group(tarballs) + # Create a group for these tarballs + group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) + print(f"group created\n{group.to_string()}") + group.process_group(tarballs) else: # use old individual PR method tarballs = find_tarballs(s3, bucket) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 50f6c16f..9e523376 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -559,8 +559,6 @@ def extract_tarballs_from_pr_body(self, pr_body): tarballs.append(tarball) return tarballs - def reject(self): - """Reject a tarball for ingestion.""" def reject(self): """Reject a tarball for ingestion.""" # Let's move the the tarball to the directory for rejected tarballs. 
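For reference while reading the following hunks: the series settles on group staging branches named `staging-<repo with '/' replaced by '-'>-<pr>-<sequence>`. A runnable sketch of the sequence-number selection implemented by `find_next_sequence_number` (branch names hypothetical):

```python
def next_sequence_number(existing_branches, repo, pr_id):
    """Return the next free sequence number for staging branches of one source PR."""
    base = f'staging-{repo.replace("/", "-")}-{pr_id}'
    numbers = []
    for branch in existing_branches:
        if branch.startswith(base):
            try:
                # branch format: staging-<repo>-<pr>-<sequence>
                numbers.append(int(branch.split('-')[-1]))
            except ValueError:
                continue
    return max(numbers) + 1 if numbers else 1

branches = ['staging-EESSI-software-layer-512-1', 'staging-EESSI-software-layer-512-2']
print(next_sequence_number(branches, 'EESSI/software-layer', '512'))  # 3
```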
@@ -619,7 +617,12 @@ def process_group(self, tarballs): logging.error("Tarballs in group have inconsistent link2pr information") return - # Process the group + # First mark all tarballs as staged by creating their metadata files in the GitHub repository + for tarball in tarballs: + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar.mark_new_tarball_as_staged() + + # Then process the group for approval self.first_tar.make_approval_request(tarballs) def to_string(self): From ea51a5e3eb3380fbc4672fc46a7769df01f97295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 00:48:00 +0200 Subject: [PATCH 010/218] always use group branch only --- scripts/automated_ingestion/eessitarball.py | 22 ++++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 9e523376..056666e9 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -310,7 +310,7 @@ def print_ingested(self): """Process a tarball that has already been ingested.""" logging.info(f'{self.object} has already been ingested, skipping...') - def mark_new_tarball_as_staged(self): + def mark_new_tarball_as_staged(self, branch=None): """Process a new tarball that was added to the staging bucket.""" next_state = self.next_state(self.state) logging.info(f'Found new tarball {self.object}, downloading it...') @@ -331,10 +331,14 @@ def mark_new_tarball_as_staged(self): logging.info(f'Adding tarball\'s metadata to the "{next_state}" folder of the git repository.') file_path_staged = next_state + '/' + self.metadata_file - self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch='main') + + # If no branch is provided, use the main branch + target_branch = branch if branch else 'main' + self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch=target_branch) self.state = next_state - self.run_handler() + if not branch: # Only run handler if we're not part of a group + self.run_handler() def print_rejected(self): """Process a (rejected) tarball for which the corresponding PR has been closed witout merging.""" @@ -377,7 +381,6 @@ def find_next_sequence_number(self, repo, pr_id): def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - # file_path_staged = self.state + '/' + self.metadata_file filename = os.path.basename(self.object) # Get link2pr info from metadata @@ -387,14 +390,9 @@ def make_approval_request(self, tarballs_in_group=None): repo, pr_id = meta_dict['link2pr']['repo'], meta_dict['link2pr']['pr'] pr_url = f"https://github.com/{repo}/pull/{pr_id}" - # Create branch name based on whether we're handling a group - if tarballs_in_group is None: - # Individual tarball - git_branch = filename + '_' + next_state - else: - # Group of tarballs - sequence = self.find_next_sequence_number(repo, pr_id) - git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + # Always use the consistent branch naming scheme + sequence = self.find_next_sequence_number(repo, pr_id) + git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' # Check for existing branch and PR main_branch = self.git_repo.get_branch('main') From 7db3dbee78a08c4539abd0255d55debaff56f574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= 
Date: Sat, 26 Apr 2025 01:09:52 +0200 Subject: [PATCH 011/218] add a bit debug output --- scripts/automated_ingestion/eessitarball.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 056666e9..833a66f7 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -92,6 +92,7 @@ def find_state(self): for state in list(self.states.keys()): # iterate through the state dirs and try to find the tarball's metadata file try: + print(f"Checking {state} for {self.metadata_file}") self.git_repo.get_contents(state + '/' + self.metadata_file) return state except github.UnknownObjectException: @@ -330,6 +331,7 @@ def mark_new_tarball_as_staged(self, branch=None): contents = meta.read() logging.info(f'Adding tarball\'s metadata to the "{next_state}" folder of the git repository.') + print(f'Adding tarball\'s metadata ({self.metadata_file}) to the "{next_state}" folder of the git repository.') file_path_staged = next_state + '/' + self.metadata_file # If no branch is provided, use the main branch From 9ef35355df9b18b9aecfde7a21c8c7bcb4833eb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 21:30:30 +0200 Subject: [PATCH 012/218] improve logging --- .../automated_ingestion.py | 83 ++++++++++++++++-- scripts/automated_ingestion/eessitarball.py | 84 +++++++++++-------- 2 files changed, 124 insertions(+), 43 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 7668dfb0..67d90a45 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -13,6 +13,7 @@ import os import pid import sys +from pathlib import Path REQUIRED_CONFIG = { 'secrets': ['aws_secret_access_key', 'aws_access_key_id', 'github_pat'], @@ -118,12 +119,81 @@ def parse_config(path): def parse_args(): """Parse the command-line arguments.""" parser = argparse.ArgumentParser() + + # Logging options + logging_group = parser.add_argument_group('Logging options') + logging_group.add_argument('--log-file', + help='Path to log file (overrides config file setting)') + logging_group.add_argument('--console-level', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logging level for console output (overrides config file setting)') + logging_group.add_argument('--file-level', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logging level for file output (overrides config file setting)') + logging_group.add_argument('--quiet', + action='store_true', + help='Suppress console output (overrides all other console settings)') + + # Existing arguments parser.add_argument('-c', '--config', type=str, help='path to configuration file', - default='automated_ingestion.cfg', dest='config') + default='automated_ingestion.cfg', dest='config') parser.add_argument('-d', '--debug', help='enable debug mode', action='store_true', dest='debug') parser.add_argument('-l', '--list', help='only list available tarballs', action='store_true', dest='list_only') - args = parser.parse_args() - return args + + return parser.parse_args() + + +def setup_logging(config, args): + """ + Configure logging based on configuration file and command line arguments. + Command line arguments take precedence over config file settings. 
+ + Args: + config: Configuration dictionary + args: Parsed command line arguments + """ + # Get settings from config file + log_file = config['logging'].get('filename') + log_format = config['logging'].get('format', '%(levelname)s:%(message)s') + config_console_level = LOG_LEVELS.get(config['logging'].get('level', 'INFO').upper(), logging.INFO) + config_file_level = LOG_LEVELS.get(config['logging'].get('file_level', 'DEBUG').upper(), logging.DEBUG) + + # Override with command line arguments if provided + log_file = args.log_file if args.log_file else log_file + console_level = getattr(logging, args.console_level) if args.console_level else config_console_level + file_level = getattr(logging, args.file_level) if args.file_level else config_file_level + + # Debug mode overrides console level + if args.debug: + console_level = logging.DEBUG + + # Create logger + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) # Set root logger to lowest level + + # Create formatters + console_formatter = logging.Formatter(log_format) + file_formatter = logging.Formatter('%(asctime)s - ' + log_format) + + # Console handler (only if not quiet) + if not args.quiet: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(console_level) + console_handler.setFormatter(console_formatter) + logger.addHandler(console_handler) + + # File handler (if log file is specified) + if log_file: + # Ensure log directory exists + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(file_level) + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + + return logger @pid.decorator.pidfile('automated_ingestion.pid') @@ -131,11 +201,8 @@ def main(): """Main function.""" args = parse_args() config = parse_config(args.config) - log_file = config['logging'].get('filename', None) - log_format = config['logging'].get('format', '%(levelname)s:%(message)s') - log_level = LOG_LEVELS.get(config['logging'].get('level', 'INFO').upper(), logging.WARN) - log_level = logging.DEBUG if args.debug else log_level - logging.basicConfig(filename=log_file, format=log_format, level=log_level) + setup_logging(config, args) + # TODO: check configuration: secrets, paths, permissions on dirs, etc gh_pat = config['secrets']['github_pat'] gh_staging_repo = github.Github(gh_pat).get_repo(config['github']['staging_repo']) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 833a66f7..f7749a94 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -89,11 +89,11 @@ def download(self, force=False): def find_state(self): """Find the state of this tarball by searching through the state directories in the git repository.""" + logging.debug(f"Find state for {self.object}") for state in list(self.states.keys()): - # iterate through the state dirs and try to find the tarball's metadata file try: - print(f"Checking {state} for {self.metadata_file}") self.git_repo.get_contents(state + '/' + self.metadata_file) + logging.info(f"Found metadata file {self.metadata_file} in state: {state}") return state except github.UnknownObjectException: # no metadata file found in this state's directory, so keep searching... @@ -107,9 +107,8 @@ def find_state(self): log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' 
logging.warning(log_msg, self.object, err.status) return "unknown" - else: - # if no state was found, we assume this is a new tarball that was ingested to the bucket - return "new" + logging.info(f"Tarball {self.metadata_file} is new") + return "new" def get_contents_overview(self): """Return an overview of what is included in the tarball.""" @@ -319,23 +318,22 @@ def mark_new_tarball_as_staged(self, branch=None): # Use force as it may be a new attempt for an existing tarball that failed before. self.download(force=True) if not self.local_path or not self.local_metadata_path: - logging.warn('Skipping this tarball...') + logging.warning(f"Skipping tarball {self.object} - download failed") return # Verify the signatures of the tarball and metadata file. if not self.verify_signatures(): - logging.warn('Signature verification of the tarball or its metadata failed, skipping this tarball...') + logging.warning(f"Skipping tarball {self.object} - signature verification failed") + return + + # If no branch is provided, use the main branch + target_branch = branch if branch else 'main' + logging.info(f"Adding metadata to '{next_state}' folder in {target_branch} branch") + file_path_staged = next_state + '/' + self.metadata_file contents = '' with open(self.local_metadata_path, 'r') as meta: contents = meta.read() - - logging.info(f'Adding tarball\'s metadata to the "{next_state}" folder of the git repository.') - print(f'Adding tarball\'s metadata ({self.metadata_file}) to the "{next_state}" folder of the git repository.') - file_path_staged = next_state + '/' + self.metadata_file - - # If no branch is provided, use the main branch - target_branch = branch if branch else 'main' self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch=target_branch) self.state = next_state @@ -383,22 +381,21 @@ def find_next_sequence_number(self, repo, pr_id): def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - filename = os.path.basename(self.object) - # Get link2pr info from metadata + # obtain link2pr information (repo and pr_id) from metadata file with open(self.local_metadata_path, 'r') as meta: metadata = meta.read() meta_dict = json.loads(metadata) repo, pr_id = meta_dict['link2pr']['repo'], meta_dict['link2pr']['pr'] - pr_url = f"https://github.com/{repo}/pull/{pr_id}" - # Always use the consistent branch naming scheme + # find next sequence number for staging PRs of this source PR sequence = self.find_next_sequence_number(repo, pr_id) git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' - # Check for existing branch and PR + # Check if git_branch exists and what the status of the corressponding PR is main_branch = self.git_repo.get_branch('main') if git_branch in [branch.name for branch in self.git_repo.get_branches()]: + logging.info(f"Branch {git_branch} already exists, checking the status of the corresponding PR...") find_pr = [pr for pr in self.git_repo.get_pulls(head=git_branch, state='all') if pr.head.ref == git_branch] if find_pr: @@ -413,6 +410,8 @@ def make_approval_request(self, tarballs_in_group=None): else: logging.warn(f'Warning, tarball {self.object} is in a weird state:') logging.warn(f'Branch: {git_branch}\nPR: {pr}\nPR state: {pr.state}\nPR merged: {pr.merged}') + # TODO: should we delete the branch or open an issue? 
+ return else: logging.info(f'Tarball {self.object} has a branch, but no PR.') logging.info('Removing existing branch...') @@ -424,16 +423,19 @@ def make_approval_request(self, tarballs_in_group=None): # Move metadata file(s) to staged directory if tarballs_in_group is None: + logging.info(f"Moving metadata for individual tarball to staged") self.move_metadata_file(self.state, next_state, branch=git_branch) else: + logging.info(f"Moving metadata for {len(tarballs_in_group)} tarballs to staged") for tarball in tarballs_in_group: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) temp_tar.move_metadata_file('new', 'staged', branch=git_branch) # Create PR with appropriate template try: + pr_url=f"https://github.com/{repo}/pull/{pr_id}", if tarballs_in_group is None: - # Individual tarball + logging.info(f"Creating PR for individual tarball: {self.object}") tarball_contents = self.get_contents_overview() pr_body = self.config['github']['individual_pr_body'].format( cvmfs_repo=self.cvmfs_repo, @@ -441,7 +443,7 @@ def make_approval_request(self, tarballs_in_group=None): tar_overview=tarball_contents, metadata=metadata, ) - pr_title = f'[{self.cvmfs_repo}] Ingest {filename}' + pr_title = f'[{self.cvmfs_repo}] Ingest {os.path.basename(self.object)}' else: # Group of tarballs tar_overviews = [] @@ -475,17 +477,18 @@ def make_approval_request(self, tarballs_in_group=None): pr_title += ' :closed_lock_with_key:' self.git_repo.create_pull(title=pr_title, body=pr_body, head=git_branch, base='main') + logging.info(f"Created PR: {pr_title}") except Exception as err: - issue_title = f'Failed to get contents of {self.object}' - issue_body = self.config['github']['failed_tarball_overview_issue_body'].format( - tarball=self.object, - error=err - ) - if not self.issue_exists(issue_title, state='open'): - self.git_repo.create_issue(title=issue_title, body=issue_body) - else: - logging.info('Failed to create tarball overview, but an issue already exists.') + logging.error(f"Failed to create PR: {err}") + if not self.issue_exists(f'Failed to get contents of {self.object}', state='open'): + self.git_repo.create_issue( + title=f'Failed to get contents of {self.object}', + body=self.config['github']['failed_tarball_overview_issue_body'].format( + tarball=self.object, + error=err + ) + ) def format_tarball_list(self, tarballs): """Format a list of tarballs with checkboxes for approval.""" @@ -607,22 +610,33 @@ def download_tarballs_and_more(self, tarballs): def process_group(self, tarballs): """Process a group of tarballs together.""" - # download tarballs, metadata files and their signatures + logging.info(f"Processing group of {len(tarballs)} tarballs") + if not self.download_tarballs_and_more(tarballs): logging.error("Downloading tarballs, metadata files and/or their signatures failed") return # Verify all tarballs have the same link2pr info if not self.verify_group_consistency(tarballs): - logging.error("Tarballs in group have inconsistent link2pr information") + logging.error("Tarballs have inconsistent link2pr information") return - # First mark all tarballs as staged by creating their metadata files in the GitHub repository + # Get branch name from first tarball + with open(self.first_tar.local_metadata_path, 'r') as meta: + metadata = json.load(meta) + repo, pr_id = metadata['link2pr']['repo'], metadata['link2pr']['pr'] + sequence = self.first_tar.find_next_sequence_number(repo, pr_id) + git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + + 
logging.info(f"Creating group branch: {git_branch}") + + # Mark all tarballs as staged in the group branch for tarball in tarballs: + logging.info(f"Processing tarball in group: {tarball}") temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - temp_tar.mark_new_tarball_as_staged() + temp_tar.mark_new_tarball_as_staged(branch=git_branch) - # Then process the group for approval + # Process the group for approval self.first_tar.make_approval_request(tarballs) def to_string(self): From d860d1e97de5a22aa9302c7b7a0716a32dbaebd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 22:04:12 +0200 Subject: [PATCH 013/218] more improvements to logging --- .../automated_ingestion.py | 8 ++--- scripts/automated_ingestion/eessitarball.py | 34 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 67d90a45..622f0386 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -220,22 +220,22 @@ def main(): # use new grouped PR method tarball_groups = find_tarball_groups(s3, bucket, config) if args.list_only: - print(f"#tarball_groups: {len(tarball_groups)}") + logging.info(f"#tarball_groups: {len(tarball_groups)}") for (repo, pr_id), tarballs in tarball_groups.items(): - print(f" {repo}#{pr_id}: #tarballs {len(tarballs)}") + logging.info(f" {repo}#{pr_id}: #tarballs {len(tarballs)}") else: for (repo, pr_id), tarballs in tarball_groups.items(): if tarballs: # Create a group for these tarballs group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - print(f"group created\n{group.to_string()}") + logging.info(f"group created\n{group.to_string()}") group.process_group(tarballs) else: # use old individual PR method tarballs = find_tarballs(s3, bucket) if args.list_only: for num, tarball in enumerate(tarballs): - print(f'[{bucket}] {num}: {tarball}') + logging.info(f'[{bucket}] {num}: {tarball}') else: for tarball in tarballs: tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index f7749a94..1284be5b 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -181,15 +181,16 @@ def run_handler(self): handler = self.states[self.state]['handler'] handler() - def to_string(self): + def to_string(self, oneline=False): """Serialize tarball info so it can be printed.""" str = f"tarball: {self.object}" - str += f"\n metadt: {self.metadata_file}" - str += f"\n config: {self.config}" - str += f"\n s3....: {self.s3}" - str += f"\n bucket: {self.bucket}" - str += f"\n cvmfs.: {self.cvmfs_repo}" - str += f"\n GHrepo: {self.git_repo}" + sep = "\n" if not oneline else "," + str += f"{sep} metadt: {self.metadata_file}" + str += f"{sep} config: {self.config}" + str += f"{sep} s3....: {self.s3}" + str += f"{sep} bucket: {self.bucket}" + str += f"{sep} cvmfs.: {self.cvmfs_repo}" + str += f"{sep} GHrepo: {self.git_repo}" return str def verify_signatures(self): @@ -601,7 +602,7 @@ def download_tarballs_and_more(self, tarballs): """Download all files associated with this group of tarballs.""" for tarball in tarballs: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - print(f"downloading 
files for '{temp_tar.object}'") + logging.info(f"downloading files for '{temp_tar.object}'") temp_tar.download(force=True) if not temp_tar.local_path or not temp_tar.local_metadata_path: logging.warn(f"Skipping this tarball: {temp_tar.object}") @@ -639,14 +640,15 @@ def process_group(self, tarballs): # Process the group for approval self.first_tar.make_approval_request(tarballs) - def to_string(self): + def to_string(self, oneline=False): """Serialize tarball group info so it can be printed.""" - str = f"first tarball: {self.first_tar.to_string()}" - str += f"\n config: {self.config}" - str += f"\n GHrepo: {self.git_repo}" - str += f"\n s3....: {self.s3}" - str += f"\n bucket: {self.bucket}" - str += f"\n cvmfs.: {self.cvmfs_repo}" + str = f"first tarball: {self.first_tar.to_string(oneline)}" + sep = "\n" if not oneline else "," + str += f"{sep} config: {self.config}" + str += f"{sep} GHrepo: {self.git_repo}" + str += f"{sep} s3....: {self.s3}" + str += f"{sep} bucket: {self.bucket}" + str += f"{sep} cvmfs.: {self.cvmfs_repo}" return str def verify_group_consistency(self, tarballs): @@ -655,7 +657,7 @@ def verify_group_consistency(self, tarballs): for tarball in tarballs[1:]: # Skip first tarball as we already have its info temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - print(f"temp tar: {temp_tar.to_string()}") + logging.debug(f"temp tar: {temp_tar.to_string()}") repo, pr = temp_tar.get_link2pr_info() if repo != first_repo or pr != first_pr: return False From 18cf44d064c65b1b7134e61a7ee45efb038a9ea1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 22:19:13 +0200 Subject: [PATCH 014/218] tweak logging --- scripts/automated_ingestion/automated_ingestion.py | 4 ++-- scripts/automated_ingestion/eessitarball.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 622f0386..9eb717cb 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -154,7 +154,7 @@ def setup_logging(config, args): """ # Get settings from config file log_file = config['logging'].get('filename') - log_format = config['logging'].get('format', '%(levelname)s:%(message)s') + log_format = config['logging'].get('format', '%(levelname)s: %(message)s') config_console_level = LOG_LEVELS.get(config['logging'].get('level', 'INFO').upper(), logging.INFO) config_file_level = LOG_LEVELS.get(config['logging'].get('file_level', 'DEBUG').upper(), logging.DEBUG) @@ -228,7 +228,7 @@ def main(): if tarballs: # Create a group for these tarballs group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - logging.info(f"group created\n{group.to_string()}") + logging.info(f"group created\n{group.to_string(oneline=True)}") group.process_group(tarballs) else: # use old individual PR method diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 1284be5b..b45a5e69 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -186,8 +186,6 @@ def to_string(self, oneline=False): str = f"tarball: {self.object}" sep = "\n" if not oneline else "," str += f"{sep} metadt: {self.metadata_file}" - str += f"{sep} config: {self.config}" - str += f"{sep} s3....: {self.s3}" str += f"{sep} bucket: {self.bucket}" str += f"{sep} cvmfs.: 
{self.cvmfs_repo}" str += f"{sep} GHrepo: {self.git_repo}" From 4a6fecc0caee0b59893bd8bc289e0301e30444c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 22:44:37 +0200 Subject: [PATCH 015/218] more logging output for downloads and signature verification --- scripts/automated_ingestion/eessitarball.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index b45a5e69..af22f4fa 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -56,12 +56,14 @@ def download(self, force=False): (self.object, self.local_path, self.object_sig, self.local_sig_path), (self.metadata_file, self.local_metadata_path, self.metadata_sig_file, self.local_metadata_sig_path), ] + logging.info(f"Downloading {files}") skip = False for (object, local_file, sig_object, local_sig_file) in files: if force or not os.path.exists(local_file): # First we try to download signature file, which may or may not be available # and may be optional or required. try: + logging.info(f"Downloading signature file {sig_object} to {local_sig_file}") self.s3.download_file(self.bucket, sig_object, local_sig_file) except Exception as err: log_msg = 'Failed to download signature file %s for %s from %s to %s.' @@ -76,6 +78,7 @@ def download(self, force=False): logging.warning(log_msg, sig_object, object, self.bucket, local_sig_file, err) # Now we download the file itself. try: + logging.info(f"Downloading file {object} to {local_file}") self.s3.download_file(self.bucket, object, local_file) except Exception as err: log_msg = 'Failed to download %s from %s to %s.\nException: %s' @@ -200,13 +203,16 @@ def verify_signatures(self): if not os.path.exists(sig_file): logging.warning(sig_missing_msg % sig_file) sig_missing = True + logging.info(f"Signature file {sig_file} is missing.") if sig_missing: # If signature files are missing, we return a failure, # unless the configuration specifies that signatures are not required. if self.config['signatures'].getboolean('signatures_required', True): + logging.error(f"Signature file {sig_file} is missing.") return False else: + logging.info(f"Signature file {sig_file} is missing, but signatures are not required.") return True # If signatures are provided, we should always verify them, regardless of the signatures_required. @@ -234,6 +240,8 @@ def verify_signatures(self): logging.debug(f'Signature for {file} successfully verified.') else: logging.error(f'Failed to verify signature for {file}.') + logging.error(f" stdout: {verify_cmd.stdout.decode('UTF-8')}") + logging.error(f" stderr: {verify_cmd.stderr.decode('UTF-8')}") return False self.sig_verified = True From e1b1ee67c1d3ef4637b094df96d72b7fc13071a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 07:46:03 +0200 Subject: [PATCH 016/218] add capability to run verify script in a container --- scripts/automated_ingestion/eessitarball.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index af22f4fa..795d10e5 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -217,6 +217,7 @@ def verify_signatures(self): # If signatures are provided, we should always verify them, regardless of the signatures_required. 
# In order to do so, we need the verification script and an allowed signers file. + verify_runenv = self.config['signatures']['signature_verification_runenv'].split() verify_script = self.config['signatures']['signature_verification_script'] allowed_signers_file = self.config['signatures']['allowed_signers_file'] if not os.path.exists(verify_script): @@ -231,9 +232,12 @@ def verify_signatures(self): (self.local_path, self.local_sig_path), (self.local_metadata_path, self.local_metadata_sig_path) ]: + command = verify_runenv + [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, + '--file', file, '--signature-file', sig_file] + logging.info(f"Running command: {' '.join(command)}") + verify_cmd = subprocess.run( - [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, - '--file', file, '--signature-file', sig_file], + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if verify_cmd.returncode == 0: From 56a9d682232bc8f7886f69618f89cd64dc389026 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 08:14:00 +0200 Subject: [PATCH 017/218] improve branch naming and only add files to staged dir in main branch --- scripts/automated_ingestion/eessitarball.py | 23 ++++++++++++--------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 795d10e5..61d24a7c 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -363,7 +363,7 @@ def print_unknown(self): def find_next_sequence_number(self, repo, pr_id): """Find the next available sequence number for staging PRs of a source PR.""" # Search for existing branches for this source PR - base_branch = f'staging-{repo.replace("/", "-")}-{pr_id}' + base_branch = f'staging-{repo.replace("/", "-")}-pr-{pr_id}-seq-' existing_branches = [ ref.ref for ref in self.git_repo.get_git_refs() if ref.ref.startswith(f'refs/heads/{base_branch}') @@ -377,7 +377,7 @@ def find_next_sequence_number(self, repo, pr_id): for branch in existing_branches: try: # Extract the sequence number from branch name - # Format: staging-repo-pr_id-sequence + # Format: staging--pr--seq- sequence = int(branch.split('-')[-1]) sequence_numbers.append(sequence) except (ValueError, IndexError): @@ -401,7 +401,7 @@ def make_approval_request(self, tarballs_in_group=None): # find next sequence number for staging PRs of this source PR sequence = self.find_next_sequence_number(repo, pr_id) - git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + git_branch = f'staging-{repo.replace("/", "-")}-pr-{pr_id}-seq-{sequence}' # Check if git_branch exists and what the status of the corressponding PR is main_branch = self.git_repo.get_branch('main') @@ -633,19 +633,22 @@ def process_group(self, tarballs): return # Get branch name from first tarball - with open(self.first_tar.local_metadata_path, 'r') as meta: - metadata = json.load(meta) - repo, pr_id = metadata['link2pr']['repo'], metadata['link2pr']['pr'] - sequence = self.first_tar.find_next_sequence_number(repo, pr_id) - git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + # with open(self.first_tar.local_metadata_path, 'r') as meta: + # metadata = json.load(meta) + # repo, pr_id = metadata['link2pr']['repo'], metadata['link2pr']['pr'] + # sequence = self.first_tar.find_next_sequence_number(repo, pr_id) + # git_branch = f'staging-{repo.replace("/", "-")}-pr-{pr_id}-seq-{sequence}' - 
logging.info(f"Creating group branch: {git_branch}") + # logging.info(f"Creating group branch: {git_branch}") # Mark all tarballs as staged in the group branch for tarball in tarballs: logging.info(f"Processing tarball in group: {tarball}") temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - temp_tar.mark_new_tarball_as_staged(branch=git_branch) + # temp_tar.mark_new_tarball_as_staged(branch=git_branch) + temp_tar.mark_new_tarball_as_staged('main') + + exit() # Process the group for approval self.first_tar.make_approval_request(tarballs) From 935025f3b586f923e35a7275ed2ab0fc8f166923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 08:22:03 +0200 Subject: [PATCH 018/218] don't stop after staging files --- scripts/automated_ingestion/eessitarball.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 61d24a7c..9b585067 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -648,8 +648,6 @@ def process_group(self, tarballs): # temp_tar.mark_new_tarball_as_staged(branch=git_branch) temp_tar.mark_new_tarball_as_staged('main') - exit() - # Process the group for approval self.first_tar.make_approval_request(tarballs) From 682ddbddee1fbb7b2c9d2ba6fc37634a3b7b551a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 08:31:20 +0200 Subject: [PATCH 019/218] add more log info when moving metadata files --- scripts/automated_ingestion/eessitarball.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 9b585067..1b722400 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -433,6 +433,7 @@ def make_approval_request(self, tarballs_in_group=None): self.git_repo.create_git_ref(ref='refs/heads/' + git_branch, sha=main_branch.commit.sha) # Move metadata file(s) to staged directory + logging.info(f"Moving metadata for {self.object} from {self.state} to {next_state} in branch {git_branch}") if tarballs_in_group is None: logging.info(f"Moving metadata for individual tarball to staged") self.move_metadata_file(self.state, next_state, branch=git_branch) @@ -521,7 +522,7 @@ def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file file_path_new = new_state + '/' + self.metadata_file - logging.debug(f'Moving metadata file {self.metadata_file} from {file_path_old} to {file_path_new}.') + logging.info(f'Moving metadata file {self.metadata_file} from {file_path_old} to {file_path_new} in branch {branch}') tarball_metadata = self.git_repo.get_contents(file_path_old) # Remove the metadata file from the old state's directory... 
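         # (the GitHub contents API applies this delete as a commit on the given branch)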
self.git_repo.delete_file(file_path_old, 'remove from ' + old_state, sha=tarball_metadata.sha, branch=branch) From 412cc464656ac7e241e0985109f8fd8cd9f8c2ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 08:56:10 +0200 Subject: [PATCH 020/218] need to handle first tarball differently --- scripts/automated_ingestion/eessitarball.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 1b722400..ea41f0b7 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -432,7 +432,7 @@ def make_approval_request(self, tarballs_in_group=None): # Create new branch self.git_repo.create_git_ref(ref='refs/heads/' + git_branch, sha=main_branch.commit.sha) - # Move metadata file(s) to staged directory + # Move metadata file(s) to approved directory logging.info(f"Moving metadata for {self.object} from {self.state} to {next_state} in branch {git_branch}") if tarballs_in_group is None: logging.info(f"Moving metadata for individual tarball to staged") @@ -633,23 +633,15 @@ def process_group(self, tarballs): logging.error("Tarballs have inconsistent link2pr information") return - # Get branch name from first tarball - # with open(self.first_tar.local_metadata_path, 'r') as meta: - # metadata = json.load(meta) - # repo, pr_id = metadata['link2pr']['repo'], metadata['link2pr']['pr'] - # sequence = self.first_tar.find_next_sequence_number(repo, pr_id) - # git_branch = f'staging-{repo.replace("/", "-")}-pr-{pr_id}-seq-{sequence}' - - # logging.info(f"Creating group branch: {git_branch}") - - # Mark all tarballs as staged in the group branch - for tarball in tarballs: + # Mark all tarballs as staged in the group branch, however need to handle first tarball differently + logging.info(f"Processing first tarball in group: {self.first_tar.object}") + self.first_tar.mark_new_tarball_as_staged('main') # this sets the state of the first tarball to 'staged' + for tarball in tarballs[1:]: logging.info(f"Processing tarball in group: {tarball}") temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - # temp_tar.mark_new_tarball_as_staged(branch=git_branch) temp_tar.mark_new_tarball_as_staged('main') - # Process the group for approval + # Process the group for approval, only works correctly if first tarball is already in state 'staged' self.first_tar.make_approval_request(tarballs) def to_string(self, oneline=False): From d0f0d260241d137d8b2e6d8e96bc3a5b0cab2d30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 09:09:05 +0200 Subject: [PATCH 021/218] move from staged to approval plus a little more logging --- scripts/automated_ingestion/eessitarball.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index ea41f0b7..7ed4ff0d 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -392,7 +392,7 @@ def find_next_sequence_number(self, repo, pr_id): def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - + logging.info(f"Making approval request for tarball {self.object} in state {self.state} to {next_state}") # obtain link2pr information (repo and 
pr_id) from metadata file with open(self.local_metadata_path, 'r') as meta: metadata = meta.read() @@ -441,7 +441,7 @@ def make_approval_request(self, tarballs_in_group=None): logging.info(f"Moving metadata for {len(tarballs_in_group)} tarballs to staged") for tarball in tarballs_in_group: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - temp_tar.move_metadata_file('new', 'staged', branch=git_branch) + temp_tar.move_metadata_file(self.state, next_state, branch=git_branch) # Create PR with appropriate template try: From c2c456d219b2325e7ad4173ce2d7489d193fca71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 09:25:36 +0200 Subject: [PATCH 022/218] fix missing function --- scripts/automated_ingestion/eessitarball.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 7ed4ff0d..e50840a9 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -513,7 +513,7 @@ def format_metadata_list(self, tarballs): """Format metadata for all tarballs in collapsible sections.""" formatted = "### Metadata\n\n" for tarball in tarballs: - with open(self.get_metadata_path(tarball), 'r') as meta: + with open(self.metadata_file, 'r') as meta: metadata = meta.read() formatted += f"
<details>\n<summary>Metadata for {tarball}</summary>\n\n```\n{metadata}\n```\n</details>
\n\n" return formatted From 5cad5eda71dfd8f76805acc531848200ecbc8f33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 09:38:42 +0200 Subject: [PATCH 023/218] improve get_metadata_path --- scripts/automated_ingestion/eessitarball.py | 25 ++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index e50840a9..8ad92fe5 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -513,11 +513,34 @@ def format_metadata_list(self, tarballs): """Format metadata for all tarballs in collapsible sections.""" formatted = "### Metadata\n\n" for tarball in tarballs: - with open(self.metadata_file, 'r') as meta: + with open(self.get_metadata_path(tarball), 'r') as meta: metadata = meta.read() formatted += f"
<details>\n<summary>Metadata for {tarball}</summary>\n\n```\n{metadata}\n```\n</details>
\n\n" return formatted + def get_metadata_path(self, tarball=None): + """ + Return the local path of the metadata file. + + Args: + tarball (str, optional): Name of the tarball to get metadata path for. + If None, use the current tarball's metadata file. + """ + if tarball is None: + # For single tarball, use the instance's metadata file + if not self.local_metadata_path: + self.local_metadata_path = os.path.join( + self.config['paths']['download_dir'], + os.path.basename(self.metadata_file) + ) + return self.local_metadata_path + else: + # For group of tarballs, construct path from tarball name + return os.path.join( + self.config['paths']['download_dir'], + os.path.basename(tarball) + self.config['paths']['metadata_file_extension'] + ) + def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file From d045aa54e90b2ea638c2768ad6ca580f2e4580d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 18:42:17 +0200 Subject: [PATCH 024/218] enhance logging capabilities --- .../automated_ingestion.py | 16 +++ scripts/automated_ingestion/eessitarball.py | 10 +- scripts/automated_ingestion/utils.py | 120 ++++++++++++++++++ 3 files changed, 145 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 9eb717cb..78310ea6 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -3,6 +3,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from pid.decorator import pidfile # noqa: F401 from pid import PidFileError +from utils import log_function_entry_exit import argparse import boto3 @@ -54,6 +55,7 @@ def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt return tarballs +@log_function_entry_exit() def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): """Return a dictionary of tarball groups, keyed by (repo, pr_number).""" tarballs = find_tarballs(s3, bucket, extension, metadata_extension) @@ -86,6 +88,7 @@ def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extens return groups +@log_function_entry_exit() def parse_config(path): """Parse the configuration file.""" config = configparser.ConfigParser() @@ -116,6 +119,7 @@ def parse_config(path): return config +@log_function_entry_exit() def parse_args(): """Parse the command-line arguments.""" parser = argparse.ArgumentParser() @@ -133,6 +137,11 @@ def parse_args(): logging_group.add_argument('--quiet', action='store_true', help='Suppress console output (overrides all other console settings)') + logging_group.add_argument('--log-scopes', + help='Comma-separated list of logging scopes using +/- syntax. ' + 'Examples: "+FUNC_ENTRY_EXIT" (enable only function entry/exit), ' + '"+ALL,-FUNC_ENTRY_EXIT" (enable all except function entry/exit), ' + '"+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" (enable function entry/exit but disable example)') # Existing arguments parser.add_argument('-c', '--config', type=str, help='path to configuration file', @@ -143,6 +152,7 @@ def parse_args(): return parser.parse_args() +@log_function_entry_exit() def setup_logging(config, args): """ Configure logging based on configuration file and command line arguments. 
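For example, a run that enables every scope except function tracing could look like this (the config file path is a placeholder):

```
./automated_ingestion.py -c automated_ingestion.cfg --log-scopes '+ALL,-FUNC_ENTRY_EXIT'
```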
@@ -167,6 +177,11 @@ def setup_logging(config, args): if args.debug: console_level = logging.DEBUG + # Set up logging scopes + if args.log_scopes: + from utils import set_logging_scopes + set_logging_scopes(args.log_scopes) + # Create logger logger = logging.getLogger() logger.setLevel(logging.DEBUG) # Set root logger to lowest level @@ -197,6 +212,7 @@ def setup_logging(config, args): @pid.decorator.pidfile('automated_ingestion.pid') +@log_function_entry_exit() def main(): """Main function.""" args = parse_args() diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 8ad92fe5..4885f665 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -1,4 +1,4 @@ -from utils import send_slack_message, sha256sum +from utils import send_slack_message, sha256sum, log_function_entry_exit from pathlib import PurePosixPath @@ -18,6 +18,7 @@ class EessiTarball: for which it interfaces with the S3 bucket, GitHub, and CVMFS. """ + @log_function_entry_exit() def __init__(self, object_name, config, git_staging_repo, s3, bucket, cvmfs_repo): """Initialize the tarball object.""" self.config = config @@ -48,6 +49,7 @@ def __init__(self, object_name, config, git_staging_repo, s3, bucket, cvmfs_repo # Find the initial state of this tarball. self.state = self.find_state() + @log_function_entry_exit() def download(self, force=False): """ Download this tarball and its corresponding metadata file, if this hasn't been already done. @@ -90,6 +92,7 @@ def download(self, force=False): self.local_path = None self.local_metadata_path = None + @log_function_entry_exit() def find_state(self): """Find the state of this tarball by searching through the state directories in the git repository.""" logging.debug(f"Find state for {self.object}") @@ -194,6 +197,7 @@ def to_string(self, oneline=False): str += f"{sep} GHrepo: {self.git_repo}" return str + @log_function_entry_exit() def verify_signatures(self): """Verify the signatures of the downloaded tarball and metadata file using the corresponding signature files.""" @@ -251,6 +255,7 @@ def verify_signatures(self): self.sig_verified = True return True + @log_function_entry_exit() def verify_checksum(self): """Verify the checksum of the downloaded tarball with the one in its metadata file.""" local_sha256 = sha256sum(self.local_path) @@ -261,6 +266,7 @@ def verify_checksum(self): logging.debug(f'Checksum stored in metadata file: {meta_sha256}') return local_sha256 == meta_sha256 + @log_function_entry_exit() def ingest(self): """Process a tarball that is ready to be ingested by running the ingestion script.""" # TODO: check if there is an open issue for this tarball, and if there is, skip it. 
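With the FUNC_ENTRY_EXIT scope enabled, each decorated method logs an entry line and a timed exit line, roughly like this (illustrative output, assuming the default log format):

```
DEBUG: Entering download
DEBUG: Exiting download (took 0.812s)
```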
@@ -321,6 +327,7 @@ def print_ingested(self): """Process a tarball that has already been ingested.""" logging.info(f'{self.object} has already been ingested, skipping...') + @log_function_entry_exit() def mark_new_tarball_as_staged(self, branch=None): """Process a new tarball that was added to the staging bucket.""" next_state = self.next_state(self.state) @@ -389,6 +396,7 @@ def find_next_sequence_number(self, repo, pr_id): # Return next available sequence number return max(sequence_numbers) + 1 + @log_function_entry_exit() def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 66843dd9..bed75469 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -1,7 +1,87 @@ import hashlib import json import requests +import logging +import functools +import time +from enum import IntFlag, auto +class LoggingScope(IntFlag): + """Enumeration of different logging scopes.""" + NONE = 0 + FUNC_ENTRY_EXIT = auto() # Function entry/exit logging + # Add more scopes here as needed + # EXAMPLE_SCOPE = auto() + # ANOTHER_SCOPE = auto() + ALL = FUNC_ENTRY_EXIT # Update this when adding new scopes + +# Global setting for logging scopes +ENABLED_LOGGING_SCOPES = LoggingScope.NONE + +def set_logging_scopes(scopes): + """ + Set the enabled logging scopes. + + Args: + scopes: Can be: + - A LoggingScope value + - A string with comma-separated values using +/- syntax: + - "+SCOPE" to enable a scope + - "-SCOPE" to disable a scope + - "ALL" or "+ALL" to enable all scopes + - "-ALL" to disable all scopes + Examples: + "+FUNC_ENTRY_EXIT" # Enable only function entry/exit + "+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" # Enable function entry/exit but disable example + "+ALL,-FUNC_ENTRY_EXIT" # Enable all scopes except function entry/exit + """ + global ENABLED_LOGGING_SCOPES + + if isinstance(scopes, LoggingScope): + ENABLED_LOGGING_SCOPES = scopes + return + + if isinstance(scopes, str): + # Start with no scopes enabled + ENABLED_LOGGING_SCOPES = LoggingScope.NONE + + # Split into individual scope specifications + scope_specs = [s.strip() for s in scopes.split(",")] + + for spec in scope_specs: + if not spec: + continue + + # Check for ALL special case + if spec.upper() in ["ALL", "+ALL"]: + ENABLED_LOGGING_SCOPES = LoggingScope.ALL + continue + elif spec.upper() == "-ALL": + ENABLED_LOGGING_SCOPES = LoggingScope.NONE + continue + + # Parse scope name and operation + operation = spec[0] + scope_name = spec[1:].strip().upper() + + try: + scope_enum = LoggingScope[scope_name] + if operation == '+': + ENABLED_LOGGING_SCOPES |= scope_enum + elif operation == '-': + ENABLED_LOGGING_SCOPES &= ~scope_enum + else: + logging.warning(f"Invalid operation '{operation}' in scope specification: {spec}") + except KeyError: + logging.warning(f"Unknown logging scope: {scope_name}") + + elif isinstance(scopes, list): + # Convert list to comma-separated string and process + set_logging_scopes(",".join(scopes)) + +def is_logging_scope_enabled(scope): + """Check if a specific logging scope is enabled.""" + return bool(ENABLED_LOGGING_SCOPES & scope) def send_slack_message(webhook, msg): """Send a Slack message.""" @@ -25,3 +105,43 @@ def sha256sum(path): for byte_block in iter(lambda: f.read(8192), b''): sha256_hash.update(byte_block) return sha256_hash.hexdigest() + + +def 
log_function_entry_exit(logger=None): + """ + Decorator that logs function entry and exit with timing information. + Only logs if function entry/exit logging is enabled. + + Args: + logger: Optional logger instance. If not provided, uses the root logger. + """ + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not is_logging_scope_enabled(LoggingScope.FUNC_ENTRY_EXIT): + return func(*args, **kwargs) + + # Use provided logger or get root logger + log = logger or logging.getLogger() + + # Log function entry + log.debug(f"Entering {func.__name__}") + start_time = time.time() + + try: + # Execute the function + result = func(*args, **kwargs) + + # Log successful exit + duration = time.time() - start_time + log.debug(f"Exiting {func.__name__} (took {duration:.3f}s)") + return result + + except Exception as e: + # Log error exit + duration = time.time() - start_time + log.error(f"Error in {func.__name__} after {duration:.3f}s: {str(e)}") + raise + + return wrapper + return decorator From f9fd34ec025b5ef9bbb868ecf574b19cc0e300a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 19:31:32 +0200 Subject: [PATCH 025/218] add more logging scopes, convert logging calls and improve code readability --- .../automated_ingestion.py | 9 +- scripts/automated_ingestion/eessitarball.py | 179 +++++++++++------- scripts/automated_ingestion/utils.py | 52 ++++- 3 files changed, 164 insertions(+), 76 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 78310ea6..71b8b61d 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -3,7 +3,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from pid.decorator import pidfile # noqa: F401 from pid import PidFileError -from utils import log_function_entry_exit +from utils import log_function_entry_exit, log_message, LoggingScope, set_logging_scopes import argparse import boto3 @@ -108,7 +108,10 @@ def parse_config(path): # Validate staging_pr_method staging_method = config['github'].get('staging_pr_method', 'individual') if staging_method not in ['individual', 'grouped']: - error(f'Invalid staging_pr_method: "{staging_method}" in configuration file {path}. Must be either "individual" or "grouped".') + error( + f'Invalid staging_pr_method: "{staging_method}" in configuration file {path}. ' + 'Must be either "individual" or "grouped".' 
+ ) # Validate PR body templates if staging_method == 'individual' and 'individual_pr_body' not in config['github']: @@ -179,8 +182,8 @@ def setup_logging(config, args): # Set up logging scopes if args.log_scopes: - from utils import set_logging_scopes set_logging_scopes(args.log_scopes) + log_message(LoggingScope.DEBUG, 'DEBUG', "Enabled logging scopes: %s", args.log_scopes) # Create logger logger = logging.getLogger() diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 4885f665..ab888964 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -1,4 +1,4 @@ -from utils import send_slack_message, sha256sum, log_function_entry_exit +from utils import send_slack_message, sha256sum, log_function_entry_exit, log_message, LoggingScope from pathlib import PurePosixPath @@ -58,33 +58,41 @@ def download(self, force=False): (self.object, self.local_path, self.object_sig, self.local_sig_path), (self.metadata_file, self.local_metadata_path, self.metadata_sig_file, self.local_metadata_sig_path), ] - logging.info(f"Downloading {files}") + log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading %s", files) skip = False for (object, local_file, sig_object, local_sig_file) in files: if force or not os.path.exists(local_file): # First we try to download signature file, which may or may not be available # and may be optional or required. try: - logging.info(f"Downloading signature file {sig_object} to {local_sig_file}") + log_msg = "Downloading signature file %s to %s" + log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, sig_object, local_sig_file) self.s3.download_file(self.bucket, sig_object, local_sig_file) except Exception as err: log_msg = 'Failed to download signature file %s for %s from %s to %s.' if self.config['signatures'].getboolean('signatures_required', True): log_msg += '\nException: %s' - logging.error(log_msg, sig_object, object, self.bucket, local_sig_file, err) + log_message( + LoggingScope.ERROR, 'ERROR', log_msg, + sig_object, object, self.bucket, local_sig_file, err + ) skip = True break else: - log_msg += ' Ignoring this, because signatures are not required with the current configuration.' + log_msg += ' Ignoring this, because signatures are not required' + log_msg += ' with the current configuration.' log_msg += '\nException: %s' - logging.warning(log_msg, sig_object, object, self.bucket, local_sig_file, err) + log_message( + LoggingScope.DOWNLOAD, 'WARNING', log_msg, + sig_object, object, self.bucket, local_sig_file, err + ) # Now we download the file itself. try: - logging.info(f"Downloading file {object} to {local_file}") + log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading file %s to %s", object, local_file) self.s3.download_file(self.bucket, object, local_file) except Exception as err: log_msg = 'Failed to download %s from %s to %s.\nException: %s' - logging.error(log_msg, object, self.bucket, local_file, err) + log_message(LoggingScope.ERROR, 'ERROR', log_msg, object, self.bucket, local_file, err) skip = True break # If any required download failed, make sure to skip this tarball completely. 
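Each `log_message` call above is gated on its logging scope; a minimal sketch of what the helper (added to utils.py later in this patch) boils down to:

```
import logging

from utils import LoggingScope, is_logging_scope_enabled


def log_message_sketch(scope, level, msg, *args):
    """Emit a log record only if the record's scope was enabled via --log-scopes."""
    if is_logging_scope_enabled(scope):
        getattr(logging.getLogger(), level.lower())(msg, *args)
```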
@@ -95,11 +103,12 @@ def download(self, force=False): @log_function_entry_exit() def find_state(self): """Find the state of this tarball by searching through the state directories in the git repository.""" - logging.debug(f"Find state for {self.object}") + log_message(LoggingScope.DEBUG, 'DEBUG', "Find state for %s", self.object) for state in list(self.states.keys()): try: self.git_repo.get_contents(state + '/' + self.metadata_file) - logging.info(f"Found metadata file {self.metadata_file} in state: {state}") + log_msg = "Found metadata file %s in state: %s" + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.metadata_file, state) return state except github.UnknownObjectException: # no metadata file found in this state's directory, so keep searching... @@ -111,9 +120,9 @@ def find_state(self): else: # if there was some other (e.g. connection) issue, abort the search for this tarball log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' - logging.warning(log_msg, self.object, err.status) + log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) return "unknown" - logging.info(f"Tarball {self.metadata_file} is new") + log_message(LoggingScope.STATE_CHANGE, 'INFO', "Tarball %s is new", self.metadata_file) return "new" def get_contents_overview(self): @@ -199,24 +208,27 @@ def to_string(self, oneline=False): @log_function_entry_exit() def verify_signatures(self): - """Verify the signatures of the downloaded tarball and metadata file using the corresponding signature files.""" - + """ + Verify the signatures of the downloaded tarball and metadata file + using the corresponding signature files. + """ sig_missing_msg = 'Signature file %s is missing.' sig_missing = False for sig_file in [self.local_sig_path, self.local_metadata_sig_path]: if not os.path.exists(sig_file): - logging.warning(sig_missing_msg % sig_file) + log_message(LoggingScope.VERIFICATION, 'WARNING', sig_missing_msg, sig_file) sig_missing = True - logging.info(f"Signature file {sig_file} is missing.") + log_message(LoggingScope.VERIFICATION, 'INFO', "Signature file %s is missing.", sig_file) if sig_missing: # If signature files are missing, we return a failure, # unless the configuration specifies that signatures are not required. if self.config['signatures'].getboolean('signatures_required', True): - logging.error(f"Signature file {sig_file} is missing.") + log_message(LoggingScope.ERROR, 'ERROR', "Signature file %s is missing.", sig_file) return False else: - logging.info(f"Signature file {sig_file} is missing, but signatures are not required.") + log_msg = "Signature file %s is missing, but signatures are not required." + log_message(LoggingScope.VERIFICATION, 'INFO', log_msg, sig_file) return True # If signatures are provided, we should always verify them, regardless of the signatures_required. @@ -225,11 +237,13 @@ def verify_signatures(self): verify_script = self.config['signatures']['signature_verification_script'] allowed_signers_file = self.config['signatures']['allowed_signers_file'] if not os.path.exists(verify_script): - logging.error('Unable to verify signatures, the specified signature verification script does not exist!') + log_msg = 'Unable to verify signatures, the specified signature verification script does not exist!' 
+ log_message(LoggingScope.ERROR, 'ERROR', log_msg) return False if not os.path.exists(allowed_signers_file): - logging.error('Unable to verify signatures, the specified allowed signers file does not exist!') + log_msg = 'Unable to verify signatures, the specified allowed signers file does not exist!' + log_message(LoggingScope.ERROR, 'ERROR', log_msg) return False for (file, sig_file) in [ @@ -238,18 +252,18 @@ def verify_signatures(self): ]: command = verify_runenv + [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, '--file', file, '--signature-file', sig_file] - logging.info(f"Running command: {' '.join(command)}") + log_message(LoggingScope.VERIFICATION, 'INFO', "Running command: %s", ' '.join(command)) verify_cmd = subprocess.run( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if verify_cmd.returncode == 0: - logging.debug(f'Signature for {file} successfully verified.') + log_message(LoggingScope.VERIFICATION, 'DEBUG', 'Signature for %s successfully verified.', file) else: - logging.error(f'Failed to verify signature for {file}.') - logging.error(f" stdout: {verify_cmd.stdout.decode('UTF-8')}") - logging.error(f" stderr: {verify_cmd.stderr.decode('UTF-8')}") + log_message(LoggingScope.ERROR, 'ERROR', 'Failed to verify signature for %s.', file) + log_message(LoggingScope.ERROR, 'ERROR', " stdout: %s", verify_cmd.stdout.decode('UTF-8')) + log_message(LoggingScope.ERROR, 'ERROR', " stderr: %s", verify_cmd.stderr.decode('UTF-8')) return False self.sig_verified = True @@ -262,39 +276,41 @@ def verify_checksum(self): meta_sha256 = None with open(self.local_metadata_path, 'r') as meta: meta_sha256 = json.load(meta)['payload']['sha256sum'] - logging.debug(f'Checksum of downloaded tarball: {local_sha256}') - logging.debug(f'Checksum stored in metadata file: {meta_sha256}') + log_message(LoggingScope.VERIFICATION, 'DEBUG', 'Checksum of downloaded tarball: %s', local_sha256) + log_message(LoggingScope.VERIFICATION, 'DEBUG', 'Checksum stored in metadata file: %s', meta_sha256) return local_sha256 == meta_sha256 @log_function_entry_exit() def ingest(self): """Process a tarball that is ready to be ingested by running the ingestion script.""" # TODO: check if there is an open issue for this tarball, and if there is, skip it. - logging.info(f'Tarball {self.object} is ready to be ingested.') + log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Tarball %s is ready to be ingested.', self.object) self.download() - logging.info('Verifying its signature...') + log_message(LoggingScope.VERIFICATION, 'INFO', 'Verifying its signature...') if not self.verify_signatures(): issue_msg = f'Failed to verify signatures for `{self.object}`' - logging.error(issue_msg) + log_message(LoggingScope.ERROR, 'ERROR', issue_msg) if not self.issue_exists(issue_msg, state='open'): self.git_repo.create_issue(title=issue_msg, body=issue_msg) return else: - logging.debug(f'Signatures of {self.object} and its metadata file successfully verified.') + log_msg = 'Signatures of %s and its metadata file successfully verified.' 
+ log_message(LoggingScope.VERIFICATION, 'DEBUG', log_msg, self.object) - logging.info('Verifying its checksum...') + log_message(LoggingScope.VERIFICATION, 'INFO', 'Verifying its checksum...') if not self.verify_checksum(): issue_msg = f'Failed to verify checksum for `{self.object}`' - logging.error(issue_msg) + log_message(LoggingScope.ERROR, 'ERROR', issue_msg) if not self.issue_exists(issue_msg, state='open'): self.git_repo.create_issue(title=issue_msg, body=issue_msg) return else: - logging.debug(f'Checksum of {self.object} matches the one in its metadata file.') + log_msg = 'Checksum of %s matches the one in its metadata file.' + log_message(LoggingScope.VERIFICATION, 'DEBUG', log_msg, self.object) script = self.config['paths']['ingestion_script'] sudo = ['sudo'] if self.config['cvmfs'].getboolean('ingest_as_root', True) else [] - logging.info(f'Running the ingestion script for {self.object}...') + log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Running the ingestion script for %s...', self.object) ingest_cmd = subprocess.run( sudo + [script, self.cvmfs_repo, self.local_path], stdout=subprocess.PIPE, @@ -319,34 +335,39 @@ def ingest(self): stderr=ingest_cmd.stderr.decode('UTF-8'), ) if self.issue_exists(issue_title, state='open'): - logging.info(f'Failed to ingest {self.object}, but an open issue already exists, skipping...') + log_msg = 'Failed to ingest %s, but an open issue already exists, skipping...' + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.object) else: self.git_repo.create_issue(title=issue_title, body=issue_body) def print_ingested(self): """Process a tarball that has already been ingested.""" - logging.info(f'{self.object} has already been ingested, skipping...') + log_message(LoggingScope.STATE_CHANGE, 'INFO', '%s has already been ingested, skipping...', self.object) @log_function_entry_exit() def mark_new_tarball_as_staged(self, branch=None): """Process a new tarball that was added to the staging bucket.""" next_state = self.next_state(self.state) - logging.info(f'Found new tarball {self.object}, downloading it...') + log_msg = 'Found new tarball %s, downloading it...' + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.object) # Download the tarball and its metadata file. # Use force as it may be a new attempt for an existing tarball that failed before. self.download(force=True) if not self.local_path or not self.local_metadata_path: - logging.warning(f"Skipping tarball {self.object} - download failed") + log_msg = "Skipping tarball %s - download failed" + log_message(LoggingScope.STATE_CHANGE, 'WARNING', log_msg, self.object) return # Verify the signatures of the tarball and metadata file. 
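         # (verification failures themselves are logged under the VERIFICATION and ERROR scopes)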
if not self.verify_signatures(): - logging.warning(f"Skipping tarball {self.object} - signature verification failed") + log_msg = "Skipping tarball %s - signature verification failed" + log_message(LoggingScope.STATE_CHANGE, 'WARNING', log_msg, self.object) return # If no branch is provided, use the main branch target_branch = branch if branch else 'main' - logging.info(f"Adding metadata to '{next_state}' folder in {target_branch} branch") + log_msg = "Adding metadata to '%s' folder in %s branch" + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, next_state, target_branch) file_path_staged = next_state + '/' + self.metadata_file contents = '' @@ -360,12 +381,14 @@ def mark_new_tarball_as_staged(self, branch=None): def print_rejected(self): """Process a (rejected) tarball for which the corresponding PR has been closed witout merging.""" - logging.info("This tarball was rejected, so we're skipping it.") + log_message(LoggingScope.STATE_CHANGE, 'INFO', "This tarball was rejected, so we're skipping it.") # Do we want to delete rejected tarballs at some point? def print_unknown(self): """Process a tarball which has an unknown state.""" - logging.info("The state of this tarball could not be determined, so we're skipping it.") + log_msg = "The state of this tarball could not be determined," + log_msg += " so we're skipping it." + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg) def find_next_sequence_number(self, repo, pr_id): """Find the next available sequence number for staging PRs of a source PR.""" @@ -400,7 +423,8 @@ def find_next_sequence_number(self, repo, pr_id): def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - logging.info(f"Making approval request for tarball {self.object} in state {self.state} to {next_state}") + log_msg = "Making approval request for tarball %s in state %s to %s" + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, self.object, self.state, next_state) # obtain link2pr information (repo and pr_id) from metadata file with open(self.local_metadata_path, 'r') as meta: metadata = meta.read() @@ -414,26 +438,31 @@ def make_approval_request(self, tarballs_in_group=None): # Check if git_branch exists and what the status of the corressponding PR is main_branch = self.git_repo.get_branch('main') if git_branch in [branch.name for branch in self.git_repo.get_branches()]: - logging.info(f"Branch {git_branch} already exists, checking the status of the corresponding PR...") + log_msg = "Branch %s already exists, checking the status of the corresponding PR..." 
+ log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, git_branch) find_pr = [pr for pr in self.git_repo.get_pulls(head=git_branch, state='all') if pr.head.ref == git_branch] if find_pr: pr = find_pr.pop(0) if pr.state == 'open': - logging.info('PR is still open, skipping this tarball...') + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'PR is still open, skipping this tarball...') return elif pr.state == 'closed' and not pr.merged: - logging.info('PR was rejected') + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'PR was rejected') self.reject() return else: - logging.warn(f'Warning, tarball {self.object} is in a weird state:') - logging.warn(f'Branch: {git_branch}\nPR: {pr}\nPR state: {pr.state}\nPR merged: {pr.merged}') + log_msg = 'Warning, tarball %s is in a weird state:' + log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, self.object) + log_msg = 'Branch: %s\nPR: %s\nPR state: %s\nPR merged: %s' + log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, + git_branch, pr, pr.state, pr.merged) # TODO: should we delete the branch or open an issue? return else: - logging.info(f'Tarball {self.object} has a branch, but no PR.') - logging.info('Removing existing branch...') + log_msg = 'Tarball %s has a branch, but no PR.' + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, self.object) + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'Removing existing branch...') ref = self.git_repo.get_git_ref(f'heads/{git_branch}') ref.delete() @@ -441,12 +470,15 @@ def make_approval_request(self, tarballs_in_group=None): self.git_repo.create_git_ref(ref='refs/heads/' + git_branch, sha=main_branch.commit.sha) # Move metadata file(s) to approved directory - logging.info(f"Moving metadata for {self.object} from {self.state} to {next_state} in branch {git_branch}") + log_msg = "Moving metadata for %s from %s to %s in branch %s" + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, + self.object, self.state, next_state, git_branch) if tarballs_in_group is None: - logging.info(f"Moving metadata for individual tarball to staged") + log_message(LoggingScope.GITHUB_OPS, 'INFO', "Moving metadata for individual tarball to staged") self.move_metadata_file(self.state, next_state, branch=git_branch) else: - logging.info(f"Moving metadata for {len(tarballs_in_group)} tarballs to staged") + log_msg = "Moving metadata for %d tarballs to staged" + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, len(tarballs_in_group)) for tarball in tarballs_in_group: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) temp_tar.move_metadata_file(self.state, next_state, branch=git_branch) @@ -455,7 +487,8 @@ def make_approval_request(self, tarballs_in_group=None): try: pr_url=f"https://github.com/{repo}/pull/{pr_id}", if tarballs_in_group is None: - logging.info(f"Creating PR for individual tarball: {self.object}") + log_msg = "Creating PR for individual tarball: %s" + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, self.object) tarball_contents = self.get_contents_overview() pr_body = self.config['github']['individual_pr_body'].format( cvmfs_repo=self.cvmfs_repo, @@ -476,7 +509,8 @@ def make_approval_request(self, tarballs_in_group=None): tar_details_tpl = "
<details>\n<summary>Contents of %s</summary>\n\n%s\n</details>\n"
                     tar_overviews.append(tar_details_tpl % (tarball, overview))
                 except Exception as err:
-                    logging.error(f"Failed to get contents overview for {tarball}: {err}")
+                    log_msg = "Failed to get contents overview for %s: %s"
+                    log_message(LoggingScope.ERROR, 'ERROR', log_msg, tarball, err)
                     tar_details_tpl = "<details>\n<summary>Contents of %s</summary>\n\n"
                     tar_details_tpl += "Failed to get contents overview: %s\n</details>
\n" tar_overviews.append(tar_details_tpl % (tarball, err)) @@ -497,10 +531,10 @@ def make_approval_request(self, tarballs_in_group=None): pr_title += ' :closed_lock_with_key:' self.git_repo.create_pull(title=pr_title, body=pr_body, head=git_branch, base='main') - logging.info(f"Created PR: {pr_title}") + log_message(LoggingScope.GITHUB_OPS, 'INFO', "Created PR: %s", pr_title) except Exception as err: - logging.error(f"Failed to create PR: {err}") + log_message(LoggingScope.ERROR, 'ERROR', "Failed to create PR: %s", err) if not self.issue_exists(f'Failed to get contents of {self.object}', state='open'): self.git_repo.create_issue( title=f'Failed to get contents of {self.object}', @@ -523,7 +557,10 @@ def format_metadata_list(self, tarballs): for tarball in tarballs: with open(self.get_metadata_path(tarball), 'r') as meta: metadata = meta.read() - formatted += f"
<details>\n<summary>Metadata for {tarball}</summary>\n\n```\n{metadata}\n```\n</details>\n\n"
+        formatted += (
+            f"<details>\n<summary>Metadata for {tarball}</summary>\n\n"
+            f"```\n{metadata}\n```\n</details>
\n\n" + ) return formatted def get_metadata_path(self, tarball=None): @@ -553,7 +590,8 @@ def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file file_path_new = new_state + '/' + self.metadata_file - logging.info(f'Moving metadata file {self.metadata_file} from {file_path_old} to {file_path_new} in branch {branch}') + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'Moving metadata file %s from %s to %s in branch %s', + self.metadata_file, file_path_old, file_path_new, branch) tarball_metadata = self.git_repo.get_contents(file_path_old) # Remove the metadata file from the old state's directory... self.git_repo.delete_file(file_path_old, 'remove from ' + old_state, sha=tarball_metadata.sha, branch=branch) @@ -608,7 +646,7 @@ def extract_tarballs_from_pr_body(self, pr_body): def reject(self): """Reject a tarball for ingestion.""" # Let's move the the tarball to the directory for rejected tarballs. - logging.info(f'Marking tarball {self.object} as rejected...') + log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Marking tarball %s as rejected...', self.object) next_state = 'rejected' self.move_metadata_file(self.state, next_state) @@ -644,31 +682,34 @@ def download_tarballs_and_more(self, tarballs): """Download all files associated with this group of tarballs.""" for tarball in tarballs: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - logging.info(f"downloading files for '{temp_tar.object}'") + log_message(LoggingScope.GROUP_OPS, 'INFO', "downloading files for '%s'", temp_tar.object) temp_tar.download(force=True) if not temp_tar.local_path or not temp_tar.local_metadata_path: - logging.warn(f"Skipping this tarball: {temp_tar.object}") + log_message(LoggingScope.GROUP_OPS, 'WARNING', "Skipping this tarball: %s", temp_tar.object) return False return True def process_group(self, tarballs): """Process a group of tarballs together.""" - logging.info(f"Processing group of {len(tarballs)} tarballs") + log_message(LoggingScope.GROUP_OPS, 'INFO', "Processing group of %d tarballs", len(tarballs)) if not self.download_tarballs_and_more(tarballs): - logging.error("Downloading tarballs, metadata files and/or their signatures failed") + log_msg = "Downloading tarballs, metadata files and/or their signatures failed" + log_message(LoggingScope.ERROR, 'ERROR', log_msg) return # Verify all tarballs have the same link2pr info if not self.verify_group_consistency(tarballs): - logging.error("Tarballs have inconsistent link2pr information") + log_message(LoggingScope.ERROR, 'ERROR', "Tarballs have inconsistent link2pr information") return # Mark all tarballs as staged in the group branch, however need to handle first tarball differently - logging.info(f"Processing first tarball in group: {self.first_tar.object}") + log_msg = "Processing first tarball in group: %s" + log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, self.first_tar.object) self.first_tar.mark_new_tarball_as_staged('main') # this sets the state of the first tarball to 'staged' for tarball in tarballs[1:]: - logging.info(f"Processing tarball in group: {tarball}") + log_msg = "Processing tarball in group: %s" + log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, tarball) temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) temp_tar.mark_new_tarball_as_staged('main') @@ -692,7 +733,7 @@ def 
verify_group_consistency(self, tarballs): for tarball in tarballs[1:]: # Skip first tarball as we already have its info temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - logging.debug(f"temp tar: {temp_tar.to_string()}") + log_message(LoggingScope.DEBUG, 'DEBUG', "temp tar: %s", temp_tar.to_string()) repo, pr = temp_tar.get_link2pr_info() if repo != first_repo or pr != first_pr: return False diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index bed75469..2c3aeb3c 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -10,10 +10,15 @@ class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" NONE = 0 FUNC_ENTRY_EXIT = auto() # Function entry/exit logging - # Add more scopes here as needed - # EXAMPLE_SCOPE = auto() - # ANOTHER_SCOPE = auto() - ALL = FUNC_ENTRY_EXIT # Update this when adding new scopes + DOWNLOAD = auto() # Logging related to file downloads + VERIFICATION = auto() # Logging related to signature and checksum verification + STATE_CHANGE = auto() # Logging related to tarball state changes + GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) + GROUP_OPS = auto() # Logging related to tarball group operations + ERROR = auto() # Error logging (separate from other scopes for easier filtering) + DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) + ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_CHANGE | + GITHUB_OPS | GROUP_OPS | ERROR | DEBUG) # Global setting for logging scopes ENABLED_LOGGING_SCOPES = LoggingScope.NONE @@ -145,3 +150,42 @@ def wrapper(*args, **kwargs): return wrapper return decorator + +def log_with_scope(scope, logger=None): + """ + Decorator that checks if a specific logging scope is enabled before logging. + + Args: + scope: LoggingScope value indicating which scope this logging belongs to + logger: Optional logger instance. If not provided, uses the root logger. + """ + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not is_logging_scope_enabled(scope): + return func(*args, **kwargs) + return func(*args, **kwargs) + return wrapper + return decorator + +def log_message(scope, level, msg, *args, logger=None, **kwargs): + """ + Log a message if the specified scope is enabled. + + Args: + scope: LoggingScope value indicating which scope this logging belongs to + level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + msg: Message to log + logger: Optional logger instance. If not provided, uses the root logger. 
+ *args, **kwargs: Additional arguments to pass to the logging function + """ + if not is_logging_scope_enabled(scope): + return + + log = logger or logging.getLogger() + log_func = getattr(log, level.lower()) + log_func(msg, *args, **kwargs) + +# Example usage: +# log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading file: %s", filename) +# log_message(LoggingScope.ERROR, 'ERROR', "Failed to download: %s", error_msg) From aab9f6a2539fc8c14b6884c7f443ac19de81bad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 19:42:26 +0200 Subject: [PATCH 026/218] show function entry/exit at info level --- scripts/automated_ingestion/utils.py | 32 +++++++++++----------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 2c3aeb3c..214673c0 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -115,10 +115,10 @@ def sha256sum(path): def log_function_entry_exit(logger=None): """ Decorator that logs function entry and exit with timing information. - Only logs if function entry/exit logging is enabled. + Only logs if the FUNC_ENTRY_EXIT scope is enabled. Args: - logger: Optional logger instance. If not provided, uses the root logger. + logger: Optional logger instance. If not provided, uses the module's logger. """ def decorator(func): @functools.wraps(func) @@ -126,28 +126,22 @@ def wrapper(*args, **kwargs): if not is_logging_scope_enabled(LoggingScope.FUNC_ENTRY_EXIT): return func(*args, **kwargs) - # Use provided logger or get root logger - log = logger or logging.getLogger() + if logger is None: + log = logging.getLogger(func.__module__) + else: + log = logger - # Log function entry - log.debug(f"Entering {func.__name__}") start_time = time.time() - + log.info(f"Entering {func.__name__}") try: - # Execute the function result = func(*args, **kwargs) - - # Log successful exit - duration = time.time() - start_time - log.debug(f"Exiting {func.__name__} (took {duration:.3f}s)") + end_time = time.time() + log.info(f"Exiting {func.__name__} (took {end_time - start_time:.2f}s)") return result - - except Exception as e: - # Log error exit - duration = time.time() - start_time - log.error(f"Error in {func.__name__} after {duration:.3f}s: {str(e)}") - raise - + except Exception as err: + end_time = time.time() + log.info(f"Exiting {func.__name__} with exception (took {end_time - start_time:.2f}s)") + raise err return wrapper return decorator From 678f0d755d780e4b24cd92122acc2eb4e579d337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 19:55:09 +0200 Subject: [PATCH 027/218] tweak func leave msg and add context info --- scripts/automated_ingestion/utils.py | 33 +++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 214673c0..aa841563 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -4,6 +4,7 @@ import logging import functools import time +import os from enum import IntFlag, auto class LoggingScope(IntFlag): @@ -131,16 +132,42 @@ def wrapper(*args, **kwargs): else: log = logger + # Get context information if available + context = "" + if len(args) > 0 and hasattr(args[0], 'object'): + # For EessiTarball methods, show the tarball name and state + tarball = args[0] + filename = os.path.basename(tarball.object) + + # Format filename to show 
important parts + if len(filename) > 30: + parts = filename.split('-') + if len(parts) >= 6: # Ensure we have all required parts + # Get version, component, last part of architecture, and epoch + version = parts[1] + component = parts[2] + arch_last = parts[-3].split('-')[-1] # Last part of architecture + epoch = parts[-2] + filename = f"{version}-{component}-{arch_last}-{epoch}.tar.gz" + else: + # Fallback to simple truncation if format doesn't match + filename = f"{filename[:15]}...{filename[-12:]}" + + context = f" [{filename}" + if hasattr(tarball, 'state'): + context += f" in {tarball.state}" + context += "]" + start_time = time.time() - log.info(f"Entering {func.__name__}") + log.info(f"Entering {func.__name__}{context}") try: result = func(*args, **kwargs) end_time = time.time() - log.info(f"Exiting {func.__name__} (took {end_time - start_time:.2f}s)") + log.info(f"Leaving {func.__name__}{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: end_time = time.time() - log.info(f"Exiting {func.__name__} with exception (took {end_time - start_time:.2f}s)") + log.info(f"Leaving {func.__name__}{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper return decorator From 7ecb9be140e87310d98f70bde13c1a0e3e254342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 20:05:22 +0200 Subject: [PATCH 028/218] fix shown file components and illustrate call stack depth --- scripts/automated_ingestion/utils.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index aa841563..0786e22f 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -5,6 +5,7 @@ import functools import time import os +import inspect from enum import IntFlag, auto class LoggingScope(IntFlag): @@ -24,6 +25,9 @@ class LoggingScope(IntFlag): # Global setting for logging scopes ENABLED_LOGGING_SCOPES = LoggingScope.NONE +# Global variable to track call stack depth +_call_stack_depth = 0 + def set_logging_scopes(scopes): """ Set the enabled logging scopes. 
@@ -124,6 +128,8 @@ def log_function_entry_exit(logger=None): def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): + global _call_stack_depth + if not is_logging_scope_enabled(LoggingScope.FUNC_ENTRY_EXIT): return func(*args, **kwargs) @@ -146,9 +152,9 @@ def wrapper(*args, **kwargs): # Get version, component, last part of architecture, and epoch version = parts[1] component = parts[2] - arch_last = parts[-3].split('-')[-1] # Last part of architecture - epoch = parts[-2] - filename = f"{version}-{component}-{arch_last}-{epoch}.tar.gz" + arch_last = parts[-2].split('-')[-1] # Last part of architecture + epoch = parts[-1] # includes file extension + filename = f"{version}-{component}-{arch_last}-{epoch}" else: # Fallback to simple truncation if format doesn't match filename = f"{filename[:15]}...{filename[-12:]}" @@ -158,16 +164,22 @@ def wrapper(*args, **kwargs): context += f" in {tarball.state}" context += "]" + # Create indentation based on call stack depth + indent = " " * _call_stack_depth + start_time = time.time() - log.info(f"Entering {func.__name__}{context}") + log.info(f"{indent}Entering {func.__name__}{context}") + _call_stack_depth += 1 try: result = func(*args, **kwargs) + _call_stack_depth -= 1 end_time = time.time() - log.info(f"Leaving {func.__name__}{context} (took {end_time - start_time:.2f}s)") + log.info(f"{indent}Leaving {func.__name__}{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: + _call_stack_depth -= 1 end_time = time.time() - log.info(f"Leaving {func.__name__}{context} with exception (took {end_time - start_time:.2f}s)") + log.info(f"{indent}Leaving {func.__name__}{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper return decorator From c65d1f7ae8c9d1c7345b38860284d5824ee651cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 20:09:38 +0200 Subject: [PATCH 029/218] convert logging calls in automated_ingestion.py --- scripts/automated_ingestion/automated_ingestion.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 71b8b61d..11113fd7 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -34,7 +34,7 @@ def error(msg, code=1): """Print an error and exit.""" - logging.error(msg) + log_message(LoggingScope.ERROR, 'ERROR', msg) sys.exit(code) @@ -78,7 +78,7 @@ def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extens groups[group_key] = [] groups[group_key].append(tarball) except Exception as err: - logging.error(f"Failed to process metadata for {tarball}: {err}") + log_message(LoggingScope.ERROR, 'ERROR', "Failed to process metadata for %s: %s", tarball, err) continue finally: # Clean up downloaded metadata file @@ -239,22 +239,22 @@ def main(): # use new grouped PR method tarball_groups = find_tarball_groups(s3, bucket, config) if args.list_only: - logging.info(f"#tarball_groups: {len(tarball_groups)}") + log_message(LoggingScope.GROUP_OPS, 'INFO', "#tarball_groups: %d", len(tarball_groups)) for (repo, pr_id), tarballs in tarball_groups.items(): - logging.info(f" {repo}#{pr_id}: #tarballs {len(tarballs)}") + log_message(LoggingScope.GROUP_OPS, 'INFO', " %s#%s: #tarballs %d", repo, pr_id, len(tarballs)) else: for (repo, pr_id), tarballs in tarball_groups.items(): if tarballs: # Create a group for these 
tarballs group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - logging.info(f"group created\n{group.to_string(oneline=True)}") + log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True)) group.process_group(tarballs) else: # use old individual PR method tarballs = find_tarballs(s3, bucket) if args.list_only: for num, tarball in enumerate(tarballs): - logging.info(f'[{bucket}] {num}: {tarball}') + log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, tarball) else: for tarball in tarballs: tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) From 0206968c0114ee8239a69db6c07a229207efe6ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 08:39:45 +0200 Subject: [PATCH 030/218] introducing task-based deployments --- .../automated_ingestion.py | 102 ++++++++++++++---- 1 file changed, 81 insertions(+), 21 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 11113fd7..f7d0db89 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -15,6 +15,7 @@ import pid import sys from pathlib import Path +from typing import List, Dict REQUIRED_CONFIG = { 'secrets': ['aws_secret_access_key', 'aws_access_key_id', 'github_pat'], @@ -150,7 +151,8 @@ def parse_args(): parser.add_argument('-c', '--config', type=str, help='path to configuration file', default='automated_ingestion.cfg', dest='config') parser.add_argument('-d', '--debug', help='enable debug mode', action='store_true', dest='debug') - parser.add_argument('-l', '--list', help='only list available tarballs', action='store_true', dest='list_only') + parser.add_argument('-l', '--list', help='only list available tarballs or tasks', action='store_true', dest='list_only') + parser.add_argument('--task-based', help='use task-based ingestion instead of tarball-based', action='store_true') return parser.parse_args() @@ -235,30 +237,88 @@ def main(): buckets = json.loads(config['aws']['staging_buckets']) for bucket, cvmfs_repo in buckets.items(): - if config['github'].get('staging_pr_method', 'individual') == 'grouped': - # use new grouped PR method - tarball_groups = find_tarball_groups(s3, bucket, config) + if args.task_based: + # Task-based listing + tasks = find_deployment_tasks(s3, bucket) if args.list_only: - log_message(LoggingScope.GROUP_OPS, 'INFO', "#tarball_groups: %d", len(tarball_groups)) - for (repo, pr_id), tarballs in tarball_groups.items(): - log_message(LoggingScope.GROUP_OPS, 'INFO', " %s#%s: #tarballs %d", repo, pr_id, len(tarballs)) + log_message(LoggingScope.GROUP_OPS, 'INFO', "#tasks: %d", len(tasks)) + for num, task in enumerate(tasks): + log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, task) else: - for (repo, pr_id), tarballs in tarball_groups.items(): - if tarballs: - # Create a group for these tarballs - group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True)) - group.process_group(tarballs) + # TODO: Implement task processing + pass else: - # use old individual PR method - tarballs = find_tarballs(s3, bucket) - if args.list_only: - for num, tarball in enumerate(tarballs): - log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, tarball) + # Original tarball-based 
processing + if config['github'].get('staging_pr_method', 'individual') == 'grouped': + # use new grouped PR method + tarball_groups = find_tarball_groups(s3, bucket, config) + if args.list_only: + log_message(LoggingScope.GROUP_OPS, 'INFO', "#tarball_groups: %d", len(tarball_groups)) + for (repo, pr_id), tarballs in tarball_groups.items(): + log_message(LoggingScope.GROUP_OPS, 'INFO', " %s#%s: #tarballs %d", repo, pr_id, len(tarballs)) + else: + for (repo, pr_id), tarballs in tarball_groups.items(): + if tarballs: + # Create a group for these tarballs + group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) + log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True)) + group.process_group(tarballs) else: - for tarball in tarballs: - tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) - tar.run_handler() + # use old individual PR method + tarballs = find_tarballs(s3, bucket) + if args.list_only: + for num, tarball in enumerate(tarballs): + log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, tarball) + else: + for tarball in tarballs: + tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) + tar.run_handler() + + +@log_function_entry_exit() +def find_deployment_tasks(s3, bucket: str, extension='.task') -> List[str]: + """ + Return a list of all task files in an S3 bucket with the given extension, + but only if a corresponding payload file exists (same name without extension). + + Args: + s3: boto3 S3 client + bucket: Name of the S3 bucket to scan + extension: File extension to look for (default: '.task') + + Returns: + List of task filenames found in the bucket that have a corresponding payload + """ + files = [] + continuation_token = None + + while True: + # List objects with pagination + if continuation_token: + response = s3.list_objects_v2( + Bucket=bucket, + ContinuationToken=continuation_token + ) + else: + response = s3.list_objects_v2(Bucket=bucket) + + # Add files from this page + files.extend([obj['Key'] for obj in response.get('Contents', [])]) + + # Check if there are more pages + if response.get('IsTruncated'): + continuation_token = response.get('NextContinuationToken') + else: + break + + # Create a set of all files for faster lookup + file_set = set(files) + + # Return only task files that have a corresponding payload + return [ + file for file in files + if file.endswith(extension) and file[:-len(extension)] in file_set + ] if __name__ == '__main__': From 108bec3ef09794c23b747930899972ddc74dee6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 09:14:46 +0200 Subject: [PATCH 031/218] support providing extensions, e.g., . 
.meta.txt --- .../automated_ingestion.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index f7d0db89..0c6884fb 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -152,7 +152,9 @@ def parse_args(): default='automated_ingestion.cfg', dest='config') parser.add_argument('-d', '--debug', help='enable debug mode', action='store_true', dest='debug') parser.add_argument('-l', '--list', help='only list available tarballs or tasks', action='store_true', dest='list_only') - parser.add_argument('--task-based', help='use task-based ingestion instead of tarball-based', action='store_true') + parser.add_argument('--task-based', help='use task-based ingestion instead of tarball-based. ' + 'Optionally specify comma-separated list of extensions (default: .task)', + nargs='?', const='.task', default=False) return parser.parse_args() @@ -239,7 +241,8 @@ def main(): for bucket, cvmfs_repo in buckets.items(): if args.task_based: # Task-based listing - tasks = find_deployment_tasks(s3, bucket) + extensions = args.task_based.split(',') + tasks = find_deployment_tasks(s3, bucket, extensions) if args.list_only: log_message(LoggingScope.GROUP_OPS, 'INFO', "#tasks: %d", len(tasks)) for num, task in enumerate(tasks): @@ -276,19 +279,22 @@ def main(): @log_function_entry_exit() -def find_deployment_tasks(s3, bucket: str, extension='.task') -> List[str]: +def find_deployment_tasks(s3, bucket: str, extensions: List[str] = None) -> List[str]: """ - Return a list of all task files in an S3 bucket with the given extension, + Return a list of all task files in an S3 bucket with the given extensions, but only if a corresponding payload file exists (same name without extension). 
Args: s3: boto3 S3 client bucket: Name of the S3 bucket to scan - extension: File extension to look for (default: '.task') + extensions: List of file extensions to look for (default: ['.task']) Returns: List of task filenames found in the bucket that have a corresponding payload """ + if extensions is None: + extensions = ['.task'] + files = [] continuation_token = None @@ -315,10 +321,14 @@ def find_deployment_tasks(s3, bucket: str, extension='.task') -> List[str]: file_set = set(files) # Return only task files that have a corresponding payload - return [ - file for file in files - if file.endswith(extension) and file[:-len(extension)] in file_set - ] + result = [] + for file in files: + for ext in extensions: + if file.endswith(ext) and file[:-len(ext)] in file_set: + result.append(file) + break # Found a matching extension, no need to check others + + return result if __name__ == '__main__': From 4fce92ac22a3ef1b0f60be407ab43dac9743e5e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 16:35:38 +0200 Subject: [PATCH 032/218] model remote/local files, download them --- .../automated_ingestion.py | 27 ++- .../automated_ingestion/eessi_data_object.py | 190 ++++++++++++++++++ scripts/automated_ingestion/s3_client.py | 132 ++++++++++++ 3 files changed, 347 insertions(+), 2 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_data_object.py create mode 100644 scripts/automated_ingestion/s3_client.py diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 0c6884fb..ff628957 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from eessitarball import EessiTarball, EessiTarballGroup +from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode from pid.decorator import pidfile # noqa: F401 from pid import PidFileError from utils import log_function_entry_exit, log_message, LoggingScope, set_logging_scopes @@ -248,8 +249,30 @@ def main(): for num, task in enumerate(tasks): log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, task) else: - # TODO: Implement task processing - pass + # Process each task file + for task_path in tasks: + try: + # Create EESSIDataAndSignatureObject for the task file + task_obj = EESSIDataAndSignatureObject(config, task_path, s3) + + # Download the task file and its signature + task_obj.download(mode=DownloadMode.CHECK_REMOTE) + + # Log the ETags of the downloaded task file + file_etag, sig_etag = task_obj.get_etags() + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) + log_message(LoggingScope.GROUP_OPS, 'INFO', + "Task signature %s has ETag: %s", + task_obj.remote_sig_path, sig_etag) + + # TODO: Process the task file contents + # This would involve reading the task file, parsing its contents, + # and performing the required actions based on the task type + log_message(LoggingScope.GROUP_OPS, 'INFO', "Processing task file: %s", task_path) + + except Exception as err: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to process task %s: %s", task_path, str(err)) + continue else: # Original tarball-based processing if config['github'].get('staging_pr_method', 'individual') == 'grouped': diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py new file mode 100644 index 00000000..45d36308 --- /dev/null +++ 
b/scripts/automated_ingestion/eessi_data_object.py @@ -0,0 +1,190 @@ +import os +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Optional, Protocol, runtime_checkable + +import boto3 +import configparser + +from .utils import log_function_entry_exit, log_message, LoggingScope + +class DownloadMode(Enum): + """Enum defining different modes for downloading files.""" + FORCE = 'force' # Always download and overwrite + CHECK_REMOTE = 'check-remote' # Download if remote files have changed + CHECK_LOCAL = 'check-local' # Download if files don't exist locally (default) + + +@runtime_checkable +class RemoteStorageClient(Protocol): + """Protocol defining the interface for remote storage clients.""" + + def get_metadata(self, remote_path: str) -> dict: + """Get metadata about a remote object. + + Args: + remote_path: Path to the object in remote storage + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + ... + + def download(self, remote_path: str, local_path: str) -> None: + """Download a remote file to a local location. + + Args: + remote_path: Path to the object in remote storage + local_path: Local path where to save the file + """ + ... + + +@dataclass +class EESSIDataAndSignatureObject: + """Class representing an EESSI data file and its signature in remote storage and locally.""" + + # Configuration + config: configparser.ConfigParser + + # Remote paths + remote_file_path: str # Path to data file in remote storage + remote_sig_path: str # Path to signature file in remote storage + + # Local paths + local_file_path: Path # Path to local data file + local_sig_path: Path # Path to local signature file + + # Remote storage client + remote_client: RemoteStorageClient + + @log_function_entry_exit() + def __init__(self, config: configparser.ConfigParser, remote_file_path: str, remote_client: RemoteStorageClient): + """ + Initialize an EESSI data and signature object handler. 
+ + Args: + config: Configuration object containing remote storage and local directory information + remote_file_path: Path to data file in remote storage + remote_client: Remote storage client implementing the RemoteStorageClient protocol + """ + self.config = config + self.remote_file_path = remote_file_path + sig_ext = config['signatures']['signature_file_extension'] + self.remote_sig_path = remote_file_path + sig_ext + + # Set up local paths + local_dir = Path(config['paths']['download_dir']) + # Use the full remote path structure, removing any leading slashes + remote_path = remote_file_path.lstrip('/') + self.local_file_path = local_dir.joinpath(remote_path) + self.local_sig_path = local_dir.joinpath(remote_path + sig_ext) + self.remote_client = remote_client + + log_message(LoggingScope.DEBUG, 'DEBUG', "Initialized EESSIDataAndSignatureObject for %s", remote_file_path) + log_message(LoggingScope.DEBUG, 'DEBUG', "Local file path: %s", self.local_file_path) + log_message(LoggingScope.DEBUG, 'DEBUG', "Local signature path: %s", self.local_sig_path) + + def _get_etag_file_path(self, local_path: Path) -> Path: + """Get the path to the .etag file for a given local file.""" + return local_path.with_suffix('.etag') + + def _get_local_etag(self, local_path: Path) -> Optional[str]: + """Get the ETag of a local file from its .etag file.""" + etag_path = self._get_etag_file_path(local_path) + if etag_path.exists(): + try: + with open(etag_path, 'r') as f: + return f.read().strip() + except Exception as err: + log_message(LoggingScope.DEBUG, 'WARNING', "Failed to read ETag file %s: %s", etag_path, str(err)) + return None + return None + + def get_etags(self) -> tuple[Optional[str], Optional[str]]: + """ + Get the ETags of both the data file and its signature. + + Returns: + Tuple containing (data_file_etag, signature_file_etag) + """ + return ( + self._get_local_etag(self.local_file_path), + self._get_local_etag(self.local_sig_path) + ) + + @log_function_entry_exit() + def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: + """ + Download data file and signature based on the specified mode. 
+ + Args: + mode: Download mode to use + + Returns: + True if files were downloaded, False otherwise + """ + if mode == DownloadMode.FORCE: + should_download = True + log_message(LoggingScope.DOWNLOAD, 'INFO', "Forcing download of %s", self.remote_file_path) + elif mode == DownloadMode.CHECK_REMOTE: + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] + local_file_etag = self._get_local_etag(self.local_file_path) + local_sig_etag = self._get_local_etag(self.local_sig_path) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + log_msg = "Remote files have changed, downloading %s" + log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + else: + log_msg = "Remote files unchanged, skipping download of %s" + log_message(LoggingScope.DOWNLOAD, 'DEBUG', log_msg, self.remote_file_path) + else: # CHECK_LOCAL + should_download = ( + not self.local_file_path.exists() or + not self.local_sig_path.exists() + ) + if should_download: + log_msg = "Local files missing, downloading %s" + log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + else: + log_msg = "Local files exist, skipping download of %s" + log_message(LoggingScope.DOWNLOAD, 'DEBUG', log_msg, self.remote_file_path) + + if not should_download: + return False + + # Ensure local directory exists + self.local_file_path.parent.mkdir(parents=True, exist_ok=True) + + # Download files + try: + self.remote_client.download(self.remote_file_path, str(self.local_file_path)) + self.remote_client.download(self.remote_sig_path, str(self.local_sig_path)) + + # Log the ETags of downloaded files + file_etag = self._get_local_etag(self.local_file_path) + sig_etag = self._get_local_etag(self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", self.remote_file_path, file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", self.remote_sig_path, sig_etag) + + log_msg = "Successfully downloaded %s and its signature" + log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + return True + except Exception as err: + # Clean up partially downloaded files + if self.local_file_path.exists(): + self.local_file_path.unlink() + if self.local_sig_path.exists(): + self.local_sig_path.unlink() + log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", self.remote_file_path, str(err)) + raise + + def __str__(self) -> str: + """Return a string representation of the EESSI data and signature object.""" + return f"EESSIDataAndSignatureObject({self.remote_file_path})" diff --git a/scripts/automated_ingestion/s3_client.py b/scripts/automated_ingestion/s3_client.py new file mode 100644 index 00000000..e61a5ed7 --- /dev/null +++ b/scripts/automated_ingestion/s3_client.py @@ -0,0 +1,132 @@ +import boto3 +from typing import Dict, Optional +import os +from pathlib import Path + +from .utils import log_function_entry_exit, log_message, LoggingScope +from .eessi_data_object import RemoteStorageClient + +class EESSIS3Client(RemoteStorageClient): + """EESSI-specific S3 client implementation of the RemoteStorageClient protocol.""" + + @log_function_entry_exit() + def __init__(self, config, bucket_name: str): + """ + Initialize the EESSI S3 client. 
+ + Args: + config: Configuration object containing: + - aws.access_key_id: AWS access key ID (optional, can use AWS_ACCESS_KEY_ID env var) + - aws.secret_access_key: AWS secret access key (optional, can use AWS_SECRET_ACCESS_KEY env var) + - aws.endpoint_url: Custom endpoint URL for S3-compatible backends (optional) + - aws.verify: SSL verification setting (optional) + - True: Verify SSL certificates (default) + - False: Skip SSL certificate verification + - str: Path to CA bundle file + bucket_name: Name of the S3 bucket to use + """ + self.bucket = bucket_name + + # Get AWS credentials from environment or config + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('aws', 'access_key_id') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('aws', 'secret_access_key') + + # Configure boto3 client + client_config = {} + + # Add endpoint URL if specified in config + if config.has_option('aws', 'endpoint_url'): + client_config['endpoint_url'] = config['aws']['endpoint_url'] + log_message(LoggingScope.DEBUG, 'DEBUG', "Using custom endpoint URL: %s", client_config['endpoint_url']) + + # Add SSL verification if specified in config + if config.has_option('aws', 'verify'): + verify = config['aws']['verify'] + if verify.lower() == 'false': + client_config['verify'] = False + log_message(LoggingScope.DEBUG, 'WARNING', "SSL verification disabled") + elif verify.lower() == 'true': + client_config['verify'] = True + else: + client_config['verify'] = verify # Assume it's a path to CA bundle + log_message(LoggingScope.DEBUG, 'DEBUG', "Using custom CA bundle: %s", verify) + + self.client = boto3.client( + 's3', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + **client_config + ) + log_message(LoggingScope.DEBUG, 'INFO', "Initialized S3 client for bucket: %s", self.bucket) + + @log_function_entry_exit() + def get_metadata(self, remote_path: str) -> Dict: + """ + Get metadata about an S3 object. 
+ + Args: + remote_path: Path to the object in S3 + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + try: + log_message(LoggingScope.DEBUG, 'DEBUG', "Getting metadata for S3 object: %s", remote_path) + response = self.client.head_object(Bucket=self.bucket, Key=remote_path) + log_message(LoggingScope.DEBUG, 'DEBUG', "Retrieved metadata for %s: %s", remote_path, response) + return response + except ClientError as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to get metadata for %s: %s", remote_path, str(e)) + raise + + def _get_etag_file_path(self, local_path: str) -> Path: + """Get the path to the .etag file for a given local file.""" + return Path(local_path).with_suffix('.etag') + + def _read_etag(self, local_path: str) -> Optional[str]: + """Read the ETag from the .etag file if it exists.""" + etag_path = self._get_etag_file_path(local_path) + if etag_path.exists(): + try: + with open(etag_path, 'r') as f: + return f.read().strip() + except Exception as e: + log_message(LoggingScope.DEBUG, 'WARNING', "Failed to read ETag file %s: %s", etag_path, str(e)) + return None + return None + + def _write_etag(self, local_path: str, etag: str) -> None: + """Write the ETag to the .etag file.""" + etag_path = self._get_etag_file_path(local_path) + try: + with open(etag_path, 'w') as f: + f.write(etag) + log_message(LoggingScope.DEBUG, 'DEBUG', "Wrote ETag to %s", etag_path) + except Exception as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to write ETag file %s: %s", etag_path, str(e)) + # If we can't write the etag file, it's not critical + # The file will just be downloaded again next time + + @log_function_entry_exit() + def download(self, remote_path: str, local_path: str) -> None: + """ + Download an S3 object to a local location and store its ETag. 
+ + Args: + remote_path: Path to the object in S3 + local_path: Local path where to save the file + """ + try: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading %s to %s", remote_path, local_path) + self.client.download_file(Bucket=self.bucket, Key=remote_path, Filename=local_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s to %s", remote_path, local_path) + except ClientError as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", remote_path, str(e)) + raise + + # Get metadata first to obtain the ETag + metadata = self.get_metadata(remote_path) + etag = metadata['ETag'] + + # Store the ETag + self._write_etag(local_path, etag) From bb6351afe6f26f2521dee3b2fa009830df0e97b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 16:57:29 +0200 Subject: [PATCH 033/218] fix imports --- scripts/automated_ingestion/eessi_data_object.py | 2 +- scripts/automated_ingestion/s3_client.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 45d36308..40c87ffa 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -7,7 +7,7 @@ import boto3 import configparser -from .utils import log_function_entry_exit, log_message, LoggingScope +from utils import log_function_entry_exit, log_message, LoggingScope class DownloadMode(Enum): """Enum defining different modes for downloading files.""" diff --git a/scripts/automated_ingestion/s3_client.py b/scripts/automated_ingestion/s3_client.py index e61a5ed7..c1ea2a71 100644 --- a/scripts/automated_ingestion/s3_client.py +++ b/scripts/automated_ingestion/s3_client.py @@ -1,10 +1,11 @@ -import boto3 -from typing import Dict, Optional import os from pathlib import Path +from typing import Dict, Optional + +import boto3 -from .utils import log_function_entry_exit, log_message, LoggingScope -from .eessi_data_object import RemoteStorageClient +from utils import log_function_entry_exit, log_message, LoggingScope +from eessi_data_object import RemoteStorageClient class EESSIS3Client(RemoteStorageClient): """EESSI-specific S3 client implementation of the RemoteStorageClient protocol.""" From 8e64cc986e2353d28e3f267c4012a29ba140322b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:04:55 +0200 Subject: [PATCH 034/218] add more details to function entry/leave logging --- scripts/automated_ingestion/utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 0786e22f..a61fed7f 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -167,19 +167,28 @@ def wrapper(*args, **kwargs): # Create indentation based on call stack depth indent = " " * _call_stack_depth + # Get file name and line number + frame = inspect.currentframe() + while frame.f_back: # Walk up the call stack to find the caller + frame = frame.f_back + file_name = os.path.basename(frame.f_code.co_filename) + line_no = frame.f_lineno + start_time = time.time() - log.info(f"{indent}Entering {func.__name__}{context}") + log.info(f"{indent}Entering {func.__name__} at {file_name}:{line_no}{context}") _call_stack_depth += 1 try: result = func(*args, **kwargs) _call_stack_depth -= 1 end_time = time.time() - log.info(f"{indent}Leaving 
{func.__name__}{context} (took {end_time - start_time:.2f}s)") + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{line_no}" + f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: _call_stack_depth -= 1 end_time = time.time() - log.info(f"{indent}Leaving {func.__name__}{context} with exception (took {end_time - start_time:.2f}s)") + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{line_no}" + f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper return decorator From c7a0254259e2f8b7f3215beb5783f29f792de0a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:08:22 +0200 Subject: [PATCH 035/218] fix details to function entry/leave logging --- scripts/automated_ingestion/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index a61fed7f..cf5681b6 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -167,12 +167,12 @@ def wrapper(*args, **kwargs): # Create indentation based on call stack depth indent = " " * _call_stack_depth - # Get file name and line number + # Get file name and line number of the decorated function frame = inspect.currentframe() - while frame.f_back: # Walk up the call stack to find the caller - frame = frame.f_back - file_name = os.path.basename(frame.f_code.co_filename) - line_no = frame.f_lineno + # Get the frame of the decorated function (one level up from the wrapper) + func_frame = frame.f_back + file_name = os.path.basename(func_frame.f_code.co_filename) + line_no = func_frame.f_lineno start_time = time.time() log.info(f"{indent}Entering {func.__name__} at {file_name}:{line_no}{context}") From a0c8d4d788b2ecfb5e24c124740f45ef3ed7f365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:11:06 +0200 Subject: [PATCH 036/218] show actual function for entry/leave logging --- scripts/automated_ingestion/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index cf5681b6..38bcc68d 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -167,12 +167,9 @@ def wrapper(*args, **kwargs): # Create indentation based on call stack depth indent = " " * _call_stack_depth - # Get file name and line number of the decorated function - frame = inspect.currentframe() - # Get the frame of the decorated function (one level up from the wrapper) - func_frame = frame.f_back - file_name = os.path.basename(func_frame.f_code.co_filename) - line_no = func_frame.f_lineno + # Get file name and line number where the function is defined + file_name = os.path.basename(inspect.getsourcefile(func)) + line_no = inspect.getsourcelines(func)[1] start_time = time.time() log.info(f"{indent}Entering {func.__name__} at {file_name}:{line_no}{context}") From e958b49dafae7d472ffeba5ab3f3d7d4405f10eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:38:40 +0200 Subject: [PATCH 037/218] print actual lines of entry or leave --- scripts/automated_ingestion/utils.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 38bcc68d..913f883e 100644 --- a/scripts/automated_ingestion/utils.py +++ 
b/scripts/automated_ingestion/utils.py @@ -169,22 +169,31 @@ def wrapper(*args, **kwargs): # Get file name and line number where the function is defined file_name = os.path.basename(inspect.getsourcefile(func)) - line_no = inspect.getsourcelines(func)[1] + source_lines, start_line = inspect.getsourcelines(func) + # Find the line with the actual function definition + def_line = next(i for i, line in enumerate(source_lines) if line.strip().startswith('def ')) + def_line_no = start_line + def_line start_time = time.time() - log.info(f"{indent}Entering {func.__name__} at {file_name}:{line_no}{context}") + log.info(f"{indent}Entering {func.__name__} at {file_name}:{def_line_no}{context}") _call_stack_depth += 1 try: result = func(*args, **kwargs) _call_stack_depth -= 1 end_time = time.time() - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{line_no}" + # Get the actual line where the function returned + frame = inspect.currentframe() + return_line_no = frame.f_back.f_lineno + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{return_line_no}" f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: _call_stack_depth -= 1 end_time = time.time() - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{line_no}" + # Get the actual line where the exception occurred + frame = inspect.currentframe() + exception_line_no = frame.f_back.f_lineno + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exception_line_no}" f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper From 59c722b14d0b9de239cb42539c4a7940b783ba19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:48:21 +0200 Subject: [PATCH 038/218] determine lineno when leaving a function --- scripts/automated_ingestion/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 913f883e..9a6007c3 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -183,7 +183,10 @@ def wrapper(*args, **kwargs): end_time = time.time() # Get the actual line where the function returned frame = inspect.currentframe() - return_line_no = frame.f_back.f_lineno + # Walk up the stack to find the frame of the decorated function + while frame.f_back and frame.f_back.f_code.co_name != func.__name__: + frame = frame.f_back + return_line_no = frame.f_lineno log.info(f"{indent}Leaving {func.__name__} at {file_name}:{return_line_no}" f"{context} (took {end_time - start_time:.2f}s)") return result @@ -192,7 +195,10 @@ def wrapper(*args, **kwargs): end_time = time.time() # Get the actual line where the exception occurred frame = inspect.currentframe() - exception_line_no = frame.f_back.f_lineno + # Walk up the stack to find the frame of the decorated function + while frame.f_back and frame.f_back.f_code.co_name != func.__name__: + frame = frame.f_back + exception_line_no = frame.f_lineno log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exception_line_no}" f"{context} with exception (took {end_time - start_time:.2f}s)") raise err From 39f49340fecde2fe4283ec2b6c54a65f19758cc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 18:06:40 +0200 Subject: [PATCH 039/218] may only show approx line when leaving a function --- scripts/automated_ingestion/utils.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git 
a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 9a6007c3..277c6777 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -173,6 +173,9 @@ def wrapper(*args, **kwargs): # Find the line with the actual function definition def_line = next(i for i, line in enumerate(source_lines) if line.strip().startswith('def ')) def_line_no = start_line + def_line + # Find the last non-empty line of the function + last_line = next(i for i, line in enumerate(reversed(source_lines)) if line.strip()) + last_line_no = start_line + len(source_lines) - 1 - last_line start_time = time.time() log.info(f"{indent}Entering {func.__name__} at {file_name}:{def_line_no}{context}") @@ -181,25 +184,19 @@ def wrapper(*args, **kwargs): result = func(*args, **kwargs) _call_stack_depth -= 1 end_time = time.time() - # Get the actual line where the function returned - frame = inspect.currentframe() - # Walk up the stack to find the frame of the decorated function - while frame.f_back and frame.f_back.f_code.co_name != func.__name__: - frame = frame.f_back - return_line_no = frame.f_lineno - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{return_line_no}" + # For normal returns, show the last line of the function + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{last_line_no}" f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: _call_stack_depth -= 1 end_time = time.time() - # Get the actual line where the exception occurred - frame = inspect.currentframe() - # Walk up the stack to find the frame of the decorated function - while frame.f_back and frame.f_back.f_code.co_name != func.__name__: - frame = frame.f_back - exception_line_no = frame.f_lineno - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exception_line_no}" + # For exceptions, try to get the line number from the exception + try: + exc_line_no = err.__traceback__.tb_lineno + except AttributeError: + exc_line_no = last_line_no + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exc_line_no}" f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper From 12ea5ca62eea81b86ccee31adfad2bb90fd06f8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 21:39:35 +0200 Subject: [PATCH 040/218] always use own S3 bucket/client, code refactoring and improvements --- .../automated_ingestion.py | 43 ++++++++----------- .../automated_ingestion/eessi_data_object.py | 35 +-------------- scripts/automated_ingestion/eessitarball.py | 42 +++++++++--------- scripts/automated_ingestion/remote_storage.py | 34 +++++++++++++++ .../{s3_client.py => s3_bucket.py} | 30 +++++++++++-- 5 files changed, 101 insertions(+), 83 deletions(-) create mode 100644 scripts/automated_ingestion/remote_storage.py rename scripts/automated_ingestion/{s3_client.py => s3_bucket.py} (86%) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index ff628957..24799a54 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -2,6 +2,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 from pid import PidFileError from utils import log_function_entry_exit, log_message, LoggingScope, set_logging_scopes @@ -40,13 
+41,13 @@ def error(msg, code=1): sys.exit(code) -def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt'): +def find_tarballs(s3_bucket, extension='.tar.gz', metadata_extension='.meta.txt'): """ Return a list of all tarballs in an S3 bucket that have a metadata file with the given extension (and same filename). """ # TODO: list_objects_v2 only returns up to 1000 objects - s3_objects = s3.list_objects_v2(Bucket=bucket).get('Contents', []) + s3_objects = s3_bucket.list_objects_v2().get('Contents', []) files = [obj['Key'] for obj in s3_objects] tarballs = [ @@ -58,9 +59,9 @@ def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt @log_function_entry_exit() -def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): +def find_tarball_groups(s3_bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): """Return a dictionary of tarball groups, keyed by (repo, pr_number).""" - tarballs = find_tarballs(s3, bucket, extension, metadata_extension) + tarballs = find_tarballs(s3_bucket, extension, metadata_extension) groups = {} for tarball in tarballs: @@ -69,7 +70,7 @@ def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extens local_metadata = os.path.join(config['paths']['download_dir'], os.path.basename(metadata_file)) try: - s3.download_file(bucket, metadata_file, local_metadata) + s3_bucket.download_file(metadata_file, local_metadata) with open(local_metadata, 'r') as meta: metadata = json.load(meta) repo = metadata['link2pr']['repo'] @@ -230,20 +231,16 @@ def main(): # TODO: check configuration: secrets, paths, permissions on dirs, etc gh_pat = config['secrets']['github_pat'] gh_staging_repo = github.Github(gh_pat).get_repo(config['github']['staging_repo']) - s3 = boto3.client( - 's3', - aws_access_key_id=config['secrets']['aws_access_key_id'], - aws_secret_access_key=config['secrets']['aws_secret_access_key'], - endpoint_url=config['aws']['endpoint_url'], - verify=config['aws']['verify_cert_path'], - ) buckets = json.loads(config['aws']['staging_buckets']) for bucket, cvmfs_repo in buckets.items(): + # Create our custom S3 bucket for this bucket + s3_bucket = EESSIS3Bucket(config, bucket) + if args.task_based: # Task-based listing extensions = args.task_based.split(',') - tasks = find_deployment_tasks(s3, bucket, extensions) + tasks = find_deployment_tasks(s3_bucket, extensions) if args.list_only: log_message(LoggingScope.GROUP_OPS, 'INFO', "#tasks: %d", len(tasks)) for num, task in enumerate(tasks): @@ -253,7 +250,7 @@ def main(): for task_path in tasks: try: # Create EESSIDataAndSignatureObject for the task file - task_obj = EESSIDataAndSignatureObject(config, task_path, s3) + task_obj = EESSIDataAndSignatureObject(config, task_path, s3_bucket) # Download the task file and its signature task_obj.download(mode=DownloadMode.CHECK_REMOTE) @@ -277,7 +274,7 @@ def main(): # Original tarball-based processing if config['github'].get('staging_pr_method', 'individual') == 'grouped': # use new grouped PR method - tarball_groups = find_tarball_groups(s3, bucket, config) + tarball_groups = find_tarball_groups(s3_bucket, config) if args.list_only: log_message(LoggingScope.GROUP_OPS, 'INFO', "#tarball_groups: %d", len(tarball_groups)) for (repo, pr_id), tarballs in tarball_groups.items(): @@ -286,30 +283,29 @@ def main(): for (repo, pr_id), tarballs in tarball_groups.items(): if tarballs: # Create a group for these tarballs - group = EessiTarballGroup(tarballs[0], config, 
gh_staging_repo, s3, bucket, cvmfs_repo) + group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3_bucket, cvmfs_repo) log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True)) group.process_group(tarballs) else: # use old individual PR method - tarballs = find_tarballs(s3, bucket) + tarballs = find_tarballs(s3_bucket) if args.list_only: for num, tarball in enumerate(tarballs): log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, tarball) else: for tarball in tarballs: - tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) + tar = EessiTarball(tarball, config, gh_staging_repo, s3_bucket, cvmfs_repo) tar.run_handler() @log_function_entry_exit() -def find_deployment_tasks(s3, bucket: str, extensions: List[str] = None) -> List[str]: +def find_deployment_tasks(s3_bucket, extensions: List[str] = None) -> List[str]: """ Return a list of all task files in an S3 bucket with the given extensions, but only if a corresponding payload file exists (same name without extension). Args: - s3: boto3 S3 client - bucket: Name of the S3 bucket to scan + s3_bucket: EESSIS3Bucket instance extensions: List of file extensions to look for (default: ['.task']) Returns: @@ -324,12 +320,11 @@ def find_deployment_tasks(s3, bucket: str, extensions: List[str] = None) -> List while True: # List objects with pagination if continuation_token: - response = s3.list_objects_v2( - Bucket=bucket, + response = s3_bucket.list_objects_v2( ContinuationToken=continuation_token ) else: - response = s3.list_objects_v2(Bucket=bucket) + response = s3_bucket.list_objects_v2() # Add files from this page files.extend([obj['Key'] for obj in response.get('Contents', [])]) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 40c87ffa..e12e40c5 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -1,44 +1,13 @@ import os from dataclasses import dataclass -from enum import Enum from pathlib import Path -from typing import Optional, Protocol, runtime_checkable +from typing import Optional import boto3 import configparser from utils import log_function_entry_exit, log_message, LoggingScope - -class DownloadMode(Enum): - """Enum defining different modes for downloading files.""" - FORCE = 'force' # Always download and overwrite - CHECK_REMOTE = 'check-remote' # Download if remote files have changed - CHECK_LOCAL = 'check-local' # Download if files don't exist locally (default) - - -@runtime_checkable -class RemoteStorageClient(Protocol): - """Protocol defining the interface for remote storage clients.""" - - def get_metadata(self, remote_path: str) -> dict: - """Get metadata about a remote object. - - Args: - remote_path: Path to the object in remote storage - - Returns: - Dictionary containing object metadata, including 'ETag' key - """ - ... - - def download(self, remote_path: str, local_path: str) -> None: - """Download a remote file to a local location. - - Args: - remote_path: Path to the object in remote storage - local_path: Local path where to save the file - """ - ... 
+from remote_storage import RemoteStorageClient, DownloadMode @dataclass diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index ab888964..eca6b67b 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -1,4 +1,5 @@ from utils import send_slack_message, sha256sum, log_function_entry_exit, log_message, LoggingScope +from s3_bucket import EESSIS3Bucket from pathlib import PurePosixPath @@ -19,7 +20,7 @@ class EessiTarball: """ @log_function_entry_exit() - def __init__(self, object_name, config, git_staging_repo, s3, bucket, cvmfs_repo): + def __init__(self, object_name, config, git_staging_repo, s3_bucket, cvmfs_repo): """Initialize the tarball object.""" self.config = config self.git_repo = git_staging_repo @@ -27,15 +28,14 @@ def __init__(self, object_name, config, git_staging_repo, s3, bucket, cvmfs_repo self.metadata_sig_file = self.metadata_file + config['signatures']['signature_file_extension'] self.object = object_name self.object_sig = object_name + config['signatures']['signature_file_extension'] - self.s3 = s3 - self.bucket = bucket + self.s3_bucket = s3_bucket self.cvmfs_repo = cvmfs_repo self.local_path = os.path.join(config['paths']['download_dir'], os.path.basename(object_name)) self.local_sig_path = self.local_path + config['signatures']['signature_file_extension'] self.local_metadata_path = self.local_path + config['paths']['metadata_file_extension'] self.local_metadata_sig_path = self.local_metadata_path + config['signatures']['signature_file_extension'] self.sig_verified = False - self.url = f'https://{bucket}.s3.amazonaws.com/{object_name}' + self.url = f'https://{s3_bucket.bucket}.s3.amazonaws.com/{object_name}' self.states = { 'new': {'handler': self.mark_new_tarball_as_staged, 'next_state': 'staged'}, @@ -67,14 +67,14 @@ def download(self, force=False): try: log_msg = "Downloading signature file %s to %s" log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, sig_object, local_sig_file) - self.s3.download_file(self.bucket, sig_object, local_sig_file) + self.s3_bucket.download_file(self.s3_bucket.bucket, sig_object, local_sig_file) except Exception as err: log_msg = 'Failed to download signature file %s for %s from %s to %s.' if self.config['signatures'].getboolean('signatures_required', True): log_msg += '\nException: %s' log_message( LoggingScope.ERROR, 'ERROR', log_msg, - sig_object, object, self.bucket, local_sig_file, err + sig_object, object, self.s3_bucket.bucket, local_sig_file, err ) skip = True break @@ -84,15 +84,15 @@ def download(self, force=False): log_msg += '\nException: %s' log_message( LoggingScope.DOWNLOAD, 'WARNING', log_msg, - sig_object, object, self.bucket, local_sig_file, err + sig_object, object, self.s3_bucket.bucket, local_sig_file, err ) # Now we download the file itself. try: log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading file %s to %s", object, local_file) - self.s3.download_file(self.bucket, object, local_file) + self.s3_bucket.download_file(self.s3_bucket.bucket, object, local_file) except Exception as err: log_msg = 'Failed to download %s from %s to %s.\nException: %s' - log_message(LoggingScope.ERROR, 'ERROR', log_msg, object, self.bucket, local_file, err) + log_message(LoggingScope.ERROR, 'ERROR', log_msg, object, self.s3_bucket.bucket, local_file, err) skip = True break # If any required download failed, make sure to skip this tarball completely. 
@@ -201,7 +201,7 @@ def to_string(self, oneline=False): str = f"tarball: {self.object}" sep = "\n" if not oneline else "," str += f"{sep} metadt: {self.metadata_file}" - str += f"{sep} bucket: {self.bucket}" + str += f"{sep} bucket: {self.s3_bucket.bucket}" str += f"{sep} cvmfs.: {self.cvmfs_repo}" str += f"{sep} GHrepo: {self.git_repo}" return str @@ -480,7 +480,7 @@ def make_approval_request(self, tarballs_in_group=None): log_msg = "Moving metadata for %d tarballs to staged" log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, len(tarballs_in_group)) for tarball in tarballs_in_group: - temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) temp_tar.move_metadata_file(self.state, next_state, branch=git_branch) # Create PR with appropriate template @@ -502,8 +502,7 @@ def make_approval_request(self, tarballs_in_group=None): tar_overviews = [] for tarball in tarballs_in_group: try: - temp_tar = EessiTarball( - tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) temp_tar.download() overview = temp_tar.get_contents_overview() tar_details_tpl = "
\n<summary>Contents of %s</summary>\n\n%s\n</details>
\n" @@ -669,19 +668,18 @@ def get_link2pr_info(self): class EessiTarballGroup: """Class to handle a group of tarballs that share the same link2pr information.""" - def __init__(self, first_tarball, config, git_staging_repo, s3, bucket, cvmfs_repo): + def __init__(self, first_tarball, config, git_staging_repo, s3_bucket, cvmfs_repo): """Initialize with the first tarball in the group.""" - self.first_tar = EessiTarball(first_tarball, config, git_staging_repo, s3, bucket, cvmfs_repo) + self.first_tar = EessiTarball(first_tarball, config, git_staging_repo, s3_bucket, cvmfs_repo) self.config = config self.git_repo = git_staging_repo - self.s3 = s3 - self.bucket = bucket + self.s3_bucket = s3_bucket self.cvmfs_repo = cvmfs_repo def download_tarballs_and_more(self, tarballs): """Download all files associated with this group of tarballs.""" for tarball in tarballs: - temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) log_message(LoggingScope.GROUP_OPS, 'INFO', "downloading files for '%s'", temp_tar.object) temp_tar.download(force=True) if not temp_tar.local_path or not temp_tar.local_metadata_path: @@ -710,7 +708,7 @@ def process_group(self, tarballs): for tarball in tarballs[1:]: log_msg = "Processing tarball in group: %s" log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, tarball) - temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) temp_tar.mark_new_tarball_as_staged('main') # Process the group for approval, only works correctly if first tarball is already in state 'staged' @@ -722,8 +720,8 @@ def to_string(self, oneline=False): sep = "\n" if not oneline else "," str += f"{sep} config: {self.config}" str += f"{sep} GHrepo: {self.git_repo}" - str += f"{sep} s3....: {self.s3}" - str += f"{sep} bucket: {self.bucket}" + str += f"{sep} s3....: {self.s3_bucket}" + str += f"{sep} bucket: {self.s3_bucket.bucket}" str += f"{sep} cvmfs.: {self.cvmfs_repo}" return str @@ -732,7 +730,7 @@ def verify_group_consistency(self, tarballs): first_repo, first_pr = self.first_tar.get_link2pr_info() for tarball in tarballs[1:]: # Skip first tarball as we already have its info - temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) log_message(LoggingScope.DEBUG, 'DEBUG', "temp tar: %s", temp_tar.to_string()) repo, pr = temp_tar.get_link2pr_info() if repo != first_repo or pr != first_pr: diff --git a/scripts/automated_ingestion/remote_storage.py b/scripts/automated_ingestion/remote_storage.py new file mode 100644 index 00000000..ac005af8 --- /dev/null +++ b/scripts/automated_ingestion/remote_storage.py @@ -0,0 +1,34 @@ +from enum import Enum +from typing import Protocol, runtime_checkable + + +class DownloadMode(Enum): + """Enum defining different modes for downloading files.""" + FORCE = 'force' # Always download and overwrite + CHECK_REMOTE = 'check-remote' # Download if remote files have changed + CHECK_LOCAL = 'check-local' # Download if files don't exist locally (default) + + +@runtime_checkable +class RemoteStorageClient(Protocol): + """Protocol defining the interface for remote storage clients.""" + + def get_metadata(self, remote_path: str) -> dict: + """Get metadata about a remote 
object. + + Args: + remote_path: Path to the object in remote storage + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + ... + + def download(self, remote_path: str, local_path: str) -> None: + """Download a remote file to a local location. + + Args: + remote_path: Path to the object in remote storage + local_path: Local path where to save the file + """ + ... \ No newline at end of file diff --git a/scripts/automated_ingestion/s3_client.py b/scripts/automated_ingestion/s3_bucket.py similarity index 86% rename from scripts/automated_ingestion/s3_client.py rename to scripts/automated_ingestion/s3_bucket.py index c1ea2a71..52e9b0d2 100644 --- a/scripts/automated_ingestion/s3_client.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -5,15 +5,15 @@ import boto3 from utils import log_function_entry_exit, log_message, LoggingScope -from eessi_data_object import RemoteStorageClient +from remote_storage import RemoteStorageClient -class EESSIS3Client(RemoteStorageClient): - """EESSI-specific S3 client implementation of the RemoteStorageClient protocol.""" +class EESSIS3Bucket(RemoteStorageClient): + """EESSI-specific S3 bucket implementation of the RemoteStorageClient protocol.""" @log_function_entry_exit() def __init__(self, config, bucket_name: str): """ - Initialize the EESSI S3 client. + Initialize the EESSI S3 bucket. Args: config: Configuration object containing: @@ -60,6 +60,28 @@ def __init__(self, config, bucket_name: str): ) log_message(LoggingScope.DEBUG, 'INFO', "Initialized S3 client for bucket: %s", self.bucket) + def list_objects_v2(self, **kwargs): + """ + List objects in the bucket using the underlying boto3 client. + + Args: + **kwargs: Additional arguments to pass to boto3.client.list_objects_v2 + + Returns: + Response from boto3.client.list_objects_v2 + """ + return self.client.list_objects_v2(Bucket=self.bucket, **kwargs) + + def download_file(self, key: str, filename: str) -> None: + """ + Download a file from S3 to a local file. 
+ + Args: + key: The S3 key of the file to download + filename: The local path where the file should be saved + """ + self.client.download_file(self.bucket, key, filename) + @log_function_entry_exit() def get_metadata(self, remote_path: str) -> Dict: """ From fe6b29c879186e0b95e69ece275b24e38a037d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 21:45:28 +0200 Subject: [PATCH 041/218] fix aws config key names --- scripts/automated_ingestion/s3_bucket.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/s3_bucket.py b/scripts/automated_ingestion/s3_bucket.py index 52e9b0d2..0e91a925 100644 --- a/scripts/automated_ingestion/s3_bucket.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -29,8 +29,8 @@ def __init__(self, config, bucket_name: str): self.bucket = bucket_name # Get AWS credentials from environment or config - aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('aws', 'access_key_id') - aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('aws', 'secret_access_key') + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('aws', 'aws_access_key_id') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('aws', 'aws_secret_access_key') # Configure boto3 client client_config = {} From 01f73676becc0d0a3eeaee6947431e6658a2ce86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 21:47:48 +0200 Subject: [PATCH 042/218] fix section name for secrets --- scripts/automated_ingestion/s3_bucket.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/s3_bucket.py b/scripts/automated_ingestion/s3_bucket.py index 0e91a925..79b8a055 100644 --- a/scripts/automated_ingestion/s3_bucket.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -29,8 +29,8 @@ def __init__(self, config, bucket_name: str): self.bucket = bucket_name # Get AWS credentials from environment or config - aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('aws', 'aws_access_key_id') - aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('aws', 'aws_secret_access_key') + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('secrets', 'aws_access_key_id') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('secrets', 'aws_secret_access_key') # Configure boto3 client client_config = {} From 9e84165986326da62065c351825a034597dabc02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:16:57 +0200 Subject: [PATCH 043/218] optimize download and improve choosing log scope and level --- .../automated_ingestion/eessi_data_object.py | 60 ++++++++++++++----- scripts/automated_ingestion/utils.py | 11 +++- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index e12e40c5..902417aa 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -98,32 +98,62 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: should_download = True log_message(LoggingScope.DOWNLOAD, 'INFO', "Forcing download of %s", self.remote_file_path) elif mode == DownloadMode.CHECK_REMOTE: - remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] - remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] 
+ # First check if we have local ETags local_file_etag = self._get_local_etag(self.local_file_path) local_sig_etag = self._get_local_etag(self.local_sig_path) - should_download = ( - remote_file_etag != local_file_etag or - remote_sig_etag != local_sig_etag - ) - if should_download: - log_msg = "Remote files have changed, downloading %s" - log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + if local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") + if local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") + + # If we don't have local ETags, we need to download + if not local_file_etag or not local_sig_etag: + should_download = True + log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", + self.remote_file_path) else: - log_msg = "Remote files unchanged, skipping download of %s" - log_message(LoggingScope.DOWNLOAD, 'DEBUG', log_msg, self.remote_file_path) + # Get remote ETags and compare + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + if remote_file_etag != local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", + local_file_etag, remote_file_etag) + if remote_sig_etag != local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", + local_sig_etag, remote_sig_etag) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", + self.remote_file_path) else: # CHECK_LOCAL should_download = ( not self.local_file_path.exists() or not self.local_sig_path.exists() ) if should_download: - log_msg = "Local files missing, downloading %s" - log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + if not self.local_file_path.exists(): + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local file missing: %s", self.local_file_path) + if not self.local_sig_path.exists(): + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local signature missing: %s", self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, downloading %s", + self.remote_file_path) else: - log_msg = "Local files exist, skipping download of %s" - log_message(LoggingScope.DOWNLOAD, 'DEBUG', log_msg, self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files exist, skipping download of %s", + self.remote_file_path) if not should_download: return False diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 277c6777..e774eafe 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -221,7 +221,9 @@ def wrapper(*args, **kwargs): def log_message(scope, level, msg, *args, logger=None, **kwargs): """ - Log a message if the specified scope 
is enabled. + Log a message if either: + 1. The specified scope is enabled, OR + 2. The current log level is equal to or higher than the specified level Args: scope: LoggingScope value indicating which scope this logging belongs to @@ -230,10 +232,13 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): logger: Optional logger instance. If not provided, uses the root logger. *args, **kwargs: Additional arguments to pass to the logging function """ - if not is_logging_scope_enabled(scope): + log = logger or logging.getLogger() + log_level = getattr(logging, level.upper()) + + # Check if either condition is met + if not (is_logging_scope_enabled(scope) or log_level >= log.getEffectiveLevel()): return - log = logger or logging.getLogger() log_func = getattr(log, level.lower()) log_func(msg, *args, **kwargs) From 550de8881eac901c1fb4e6b2c7586a4cecc5265e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:36:55 +0200 Subject: [PATCH 044/218] fix logging logic and remove obsolete decorator --- scripts/automated_ingestion/utils.py | 39 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index e774eafe..e4070458 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -7,6 +7,7 @@ import os import inspect from enum import IntFlag, auto +import sys class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" @@ -202,23 +203,6 @@ def wrapper(*args, **kwargs): return wrapper return decorator -def log_with_scope(scope, logger=None): - """ - Decorator that checks if a specific logging scope is enabled before logging. - - Args: - scope: LoggingScope value indicating which scope this logging belongs to - logger: Optional logger instance. If not provided, uses the root logger. 
- """ - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - if not is_logging_scope_enabled(scope): - return func(*args, **kwargs) - return func(*args, **kwargs) - return wrapper - return decorator - def log_message(scope, level, msg, *args, logger=None, **kwargs): """ Log a message if either: @@ -239,8 +223,25 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): if not (is_logging_scope_enabled(scope) or log_level >= log.getEffectiveLevel()): return - log_func = getattr(log, level.lower()) - log_func(msg, *args, **kwargs) + # Create indentation based on call stack depth + indent = " " * _call_stack_depth + indented_msg = f"{indent}{msg}" + + # If scope is enabled, bypass the logger's level check + if is_logging_scope_enabled(scope): + # Create a temporary handler that accepts all levels + temp_handler = logging.StreamHandler(sys.stdout) + temp_handler.setLevel(logging.DEBUG) + log.addHandler(temp_handler) + try: + log_func = getattr(log, level.lower()) + log_func(indented_msg, *args, **kwargs) + finally: + log.removeHandler(temp_handler) + else: + # Use normal logging with level check + log_func = getattr(log, level.lower()) + log_func(indented_msg, *args, **kwargs) # Example usage: # log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading file: %s", filename) From dd5fcd4757c4a8a3d1321ff8f8fcadcd0a0cdc70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:40:46 +0200 Subject: [PATCH 045/218] make sure to use full log format incl level --- scripts/automated_ingestion/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index e4070458..4f784f95 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -232,6 +232,11 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) temp_handler.setLevel(logging.DEBUG) + # Use the same format as the root logger's handlers + if log.handlers: + temp_handler.setFormatter(log.handlers[0].formatter) + else: + temp_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) log.addHandler(temp_handler) try: log_func = getattr(log, level.lower()) From bc184d84a81cf06f46294532c20c1c4e2cbe128e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:48:13 +0200 Subject: [PATCH 046/218] use fixed length levelname --- scripts/automated_ingestion/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 4f784f95..d69d1530 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -232,11 +232,15 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) temp_handler.setLevel(logging.DEBUG) - # Use the same format as the root logger's handlers + # Use the same format as the root logger's handlers but with fixed-width level names if log.handlers: - temp_handler.setFormatter(log.handlers[0].formatter) + # Get the original format string + orig_format = log.handlers[0].formatter._fmt + # Replace %(levelname)s with %(levelname)-8s to make it fixed width + new_format = orig_format.replace('%(levelname)s', '%(levelname)-8s') + 
temp_handler.setFormatter(logging.Formatter(new_format)) else: - temp_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) + temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) log.addHandler(temp_handler) try: log_func = getattr(log, level.lower()) From 6ed3bdc3c8499c23814d87a7e9fdf1015f3dab90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:50:24 +0200 Subject: [PATCH 047/218] use fixed length levelname everywhere --- scripts/automated_ingestion/automated_ingestion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 24799a54..25bc733a 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -196,8 +196,8 @@ def setup_logging(config, args): logger.setLevel(logging.DEBUG) # Set root logger to lowest level # Create formatters - console_formatter = logging.Formatter(log_format) - file_formatter = logging.Formatter('%(asctime)s - ' + log_format) + console_formatter = logging.Formatter('%(levelname)-8s: %(message)s') + file_formatter = logging.Formatter('%(asctime)s - %(levelname)-8s: %(message)s') # Console handler (only if not quiet) if not args.quiet: From 28558b5a8ca99065db9f8e4282bf04a8ab8da76d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 23:50:34 +0200 Subject: [PATCH 048/218] improve handling of download errors and cleanup of partially downloaded files --- .../automated_ingestion/eessi_data_object.py | 144 ++++++++++++------ 1 file changed, 99 insertions(+), 45 deletions(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 902417aa..8b60099c 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -99,46 +99,51 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: log_message(LoggingScope.DOWNLOAD, 'INFO', "Forcing download of %s", self.remote_file_path) elif mode == DownloadMode.CHECK_REMOTE: # First check if we have local ETags - local_file_etag = self._get_local_etag(self.local_file_path) - local_sig_etag = self._get_local_etag(self.local_sig_path) + try: + local_file_etag = self._get_local_etag(self.local_file_path) + local_sig_etag = self._get_local_etag(self.local_sig_path) - if local_file_etag: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) - else: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") - if local_sig_etag: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) - else: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") + if local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") + if local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") - # If we don't have local ETags, we need to download - if not local_file_etag or not local_sig_etag: - should_download = True - log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", - self.remote_file_path) - else: - # Get remote ETags and compare - 
remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] - remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) - - should_download = ( - remote_file_etag != local_file_etag or - remote_sig_etag != local_sig_etag - ) - if should_download: - if remote_file_etag != local_file_etag: - log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", - local_file_etag, remote_file_etag) - if remote_sig_etag != local_sig_etag: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", - local_sig_etag, remote_sig_etag) - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", + # If we don't have local ETags, we need to download + if not local_file_etag or not local_sig_etag: + should_download = True + log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", self.remote_file_path) else: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", - self.remote_file_path) + # Get remote ETags and compare + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + if remote_file_etag != local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", + local_file_etag, remote_file_etag) + if remote_sig_etag != local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", + local_sig_etag, remote_sig_etag) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", + self.remote_file_path) + except Exception as etag_err: + # If we get any error with ETags, we'll just download the files + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", str(etag_err)) + should_download = True else: # CHECK_LOCAL should_download = ( not self.local_file_path.exists() or @@ -163,24 +168,73 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: # Download files try: + # Download the main file first self.remote_client.download(self.remote_file_path, str(self.local_file_path)) - self.remote_client.download(self.remote_sig_path, str(self.local_sig_path)) - # Log the ETags of downloaded files - file_etag = self._get_local_etag(self.local_file_path) - sig_etag = self._get_local_etag(self.local_sig_path) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", self.remote_file_path, file_etag) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", self.remote_sig_path, sig_etag) + # Get and log the ETag of the downloaded file + try: + file_etag = self._get_local_etag(self.local_file_path) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", + self.remote_file_path, file_etag) + except Exception as etag_err: 
+ log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s", + self.remote_file_path, str(etag_err)) + + # Try to download the signature file + try: + self.remote_client.download(self.remote_sig_path, str(self.local_sig_path)) + try: + sig_etag = self._get_local_etag(self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", + self.remote_sig_path, sig_etag) + except Exception as etag_err: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s", + self.remote_sig_path, str(etag_err)) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s and its signature", + self.remote_file_path) + except Exception as sig_err: + # Check if signatures are required + if self.config['signatures'].getboolean('signatures_required', True): + # If signatures are required, clean up everything since we can't proceed + if self.local_file_path.exists(): + self.local_file_path.unlink() + # Clean up etag files regardless of whether their data files exist + file_etag_path = self._get_etag_file_path(self.local_file_path) + if file_etag_path.exists(): + file_etag_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.ERROR, 'ERROR', "Failed to download required signature for %s: %s", + self.remote_file_path, str(sig_err)) + raise + else: + # If signatures are optional, just clean up any partial signature files + if self.local_sig_path.exists(): + self.local_sig_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.DOWNLOAD, 'WARNING', "Failed to download optional signature for %s: %s", + self.remote_file_path, str(sig_err)) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s (signature optional)", + self.remote_file_path) - log_msg = "Successfully downloaded %s and its signature" - log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) return True except Exception as err: - # Clean up partially downloaded files + # This catch block is only for errors in the main file download + # Clean up partially downloaded files and their etags if self.local_file_path.exists(): self.local_file_path.unlink() if self.local_sig_path.exists(): self.local_sig_path.unlink() + # Clean up etag files regardless of whether their data files exist + file_etag_path = self._get_etag_file_path(self.local_file_path) + if file_etag_path.exists(): + file_etag_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", self.remote_file_path, str(err)) raise From 7cf615ef3307b54915f3cd18ae3b82387e703b39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 00:47:20 +0200 Subject: [PATCH 049/218] add task description and verification of signatures --- .../automated_ingestion.py | 18 +++-- .../automated_ingestion/eessi_data_object.py | 69 ++++++++++++++++++ .../eessi_task_description.py | 71 +++++++++++++++++++ 3 files changed, 151 insertions(+), 7 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_task_description.py diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 25bc733a..85afdffd 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ 
b/scripts/automated_ingestion/automated_ingestion.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from eessitarball import EessiTarball, EessiTarballGroup -from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode, EESSITaskDescription from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 from pid import PidFileError @@ -249,18 +249,22 @@ def main(): # Process each task file for task_path in tasks: try: - # Create EESSIDataAndSignatureObject for the task file - task_obj = EESSIDataAndSignatureObject(config, task_path, s3_bucket) + # Create EESSITaskDescription for the task file + task_description = EESSITaskDescription( + EESSIDataAndSignatureObject(config, task_path, s3_bucket) + ) - # Download the task file and its signature - task_obj.download(mode=DownloadMode.CHECK_REMOTE) + # Log information about the task + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_description.task_object.local_file_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_description.task_object.local_sig_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", task_description.signature_verified) # Log the ETags of the downloaded task file - file_etag, sig_etag = task_obj.get_etags() + file_etag, sig_etag = task_description.task_object.get_etags() log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", - task_obj.remote_sig_path, sig_etag) + task_description.task_object.remote_sig_path, sig_etag) # TODO: Process the task file contents # This would involve reading the task file, parsing its contents, diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 8b60099c..97867402 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -1,4 +1,5 @@ import os +import subprocess from dataclasses import dataclass from pathlib import Path from typing import Optional @@ -83,6 +84,74 @@ def get_etags(self) -> tuple[Optional[str], Optional[str]]: self._get_local_etag(self.local_sig_path) ) + @log_function_entry_exit() + def verify_signature(self) -> bool: + """ + Verify the signature of the data file using the corresponding signature file. 
+ + Returns: + bool: True if the signature is valid or if signatures are not required, False otherwise + """ + # Check if signature file exists + if not self.local_sig_path.exists(): + log_message(LoggingScope.VERIFICATION, 'WARNING', "Signature file %s is missing", + self.local_sig_path) + + # If signatures are required, return failure + if self.config['signatures'].getboolean('signatures_required', True): + log_message(LoggingScope.ERROR, 'ERROR', "Signature file %s is missing and signatures are required", + self.local_sig_path) + return False + else: + log_message(LoggingScope.VERIFICATION, 'INFO', + "Signature file %s is missing, but signatures are not required", + self.local_sig_path) + return True + + # If signatures are provided, we should always verify them, regardless of the signatures_required setting + verify_runenv = self.config['signatures']['signature_verification_runenv'].split() + verify_script = self.config['signatures']['signature_verification_script'] + allowed_signers_file = self.config['signatures']['allowed_signers_file'] + + # Check if verification tools exist + if not Path(verify_script).exists(): + log_message(LoggingScope.ERROR, 'ERROR', + "Unable to verify signature: verification script %s does not exist", verify_script) + return False + + if not Path(allowed_signers_file).exists(): + log_message(LoggingScope.ERROR, 'ERROR', + "Unable to verify signature: allowed signers file %s does not exist", allowed_signers_file) + return False + + # Run the verification command with named parameters + cmd = verify_runenv + [ + verify_script, + '--verify', + '--allowed-signers-file', allowed_signers_file, + '--file', str(self.local_file_path), + '--signature-file', str(self.local_sig_path) + ] + log_message(LoggingScope.VERIFICATION, 'INFO', "Running command: %s", ' '.join(cmd)) + + try: + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + log_message(LoggingScope.VERIFICATION, 'INFO', + "Successfully verified signature for %s", self.local_file_path) + return True + else: + log_message(LoggingScope.ERROR, 'ERROR', + "Signature verification failed for %s", self.local_file_path) + log_message(LoggingScope.ERROR, 'ERROR', " stdout: %s", result.stdout) + log_message(LoggingScope.ERROR, 'ERROR', " stderr: %s", result.stderr) + return False + except Exception as e: + log_message(LoggingScope.ERROR, 'ERROR', + "Error during signature verification for %s: %s", + self.local_file_path, str(e)) + return False + @log_function_entry_exit() def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: """ diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py new file mode 100644 index 00000000..a958fa4d --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -0,0 +1,71 @@ +import json +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Optional + +from eessi_data_object import EESSIDataAndSignatureObject +from utils import log_function_entry_exit, log_message, LoggingScope + + +@dataclass +class EESSITaskDescription: + """Class representing an EESSI task to be performed, including its metadata and associated data files.""" + + # The EESSI data and signature object associated with this task + task_object: EESSIDataAndSignatureObject + + # Whether the signature was successfully verified + signature_verified: bool = False + + # Metadata from the task description file + metadata: Dict[str, Any] 
= None + + @log_function_entry_exit() + def __init__(self, task_object: EESSIDataAndSignatureObject): + """ + Initialize an EESSITaskDescription object. + + Args: + task_object: The EESSI data and signature object associated with this task + """ + self.task_object = task_object + self.metadata = {} + + # Verify signature and set initial state + self.signature_verified = self.task_object.verify_signature() + + # Try to read metadata (will only succeed if signature is verified) + try: + self._read_metadata() + except RuntimeError: + # Expected if signature is not verified yet + pass + + @log_function_entry_exit() + def _read_metadata(self) -> None: + """ + Internal method to read and parse the metadata from the task description file. + Only reads metadata if the signature has been verified. + """ + if not self.signature_verified: + log_message(LoggingScope.ERROR, 'ERROR', "Cannot read metadata: signature not verified for %s", + self.task_object.local_file_path) + raise RuntimeError("Cannot read metadata: signature not verified") + + try: + with open(self.task_object.local_file_path, 'r') as f: + self.metadata = json.load(f) + log_message(LoggingScope.DEBUG, 'DEBUG', "Successfully read metadata from %s", self.task_object.local_file_path) + except json.JSONDecodeError as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to parse JSON in task description file %s: %s", + self.task_object.local_file_path, str(e)) + raise + except Exception as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to read task description file %s: %s", + self.task_object.local_file_path, str(e)) + raise + + def __str__(self) -> str: + """Return a string representation of the EESSITaskDescription object.""" + return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" \ No newline at end of file From aa548dcd106da8a2585f423c4c9af0b3afebca96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 00:49:47 +0200 Subject: [PATCH 050/218] fix import of EESSITaskDescription --- scripts/automated_ingestion/automated_ingestion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 85afdffd..4a4d63ac 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 from eessitarball import EessiTarball, EessiTarballGroup -from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode, EESSITaskDescription +from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from eessi_task_description import EESSITaskDescription from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 from pid import PidFileError From 3e8f8bdd656b7a34a8c22047a52403efd6b191bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 00:57:55 +0200 Subject: [PATCH 051/218] add more log output for verification --- scripts/automated_ingestion/automated_ingestion.py | 2 +- scripts/automated_ingestion/eessi_data_object.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 4a4d63ac..f68a7373 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -270,7 +270,7 @@ def main(): # TODO: Process the 
task file contents # This would involve reading the task file, parsing its contents, # and performing the required actions based on the task type - log_message(LoggingScope.GROUP_OPS, 'INFO', "Processing task file: %s", task_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path) except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to process task %s: %s", task_path, str(err)) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 97867402..aca00adf 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -139,6 +139,8 @@ def verify_signature(self) -> bool: if result.returncode == 0: log_message(LoggingScope.VERIFICATION, 'INFO', "Successfully verified signature for %s", self.local_file_path) + log_message(LoggingScope.VERIFICATION, 'DEBUG', " stdout: %s", result.stdout) + log_message(LoggingScope.VERIFICATION, 'DEBUG', " stderr: %s", result.stderr) return True else: log_message(LoggingScope.ERROR, 'ERROR', From e51200d40f9df34f6159bb09318669a6a3512948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 01:05:58 +0200 Subject: [PATCH 052/218] add scopes to log messages and avoid message duplicates --- scripts/automated_ingestion/utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index d69d1530..18581d5c 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -179,14 +179,14 @@ def wrapper(*args, **kwargs): last_line_no = start_line + len(source_lines) - 1 - last_line start_time = time.time() - log.info(f"{indent}Entering {func.__name__} at {file_name}:{def_line_no}{context}") + log.info(f"{indent}[FUNC_ENTRY_EXIT] Entering {func.__name__} at {file_name}:{def_line_no}{context}") _call_stack_depth += 1 try: result = func(*args, **kwargs) _call_stack_depth -= 1 end_time = time.time() # For normal returns, show the last line of the function - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{last_line_no}" + log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{last_line_no}" f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: @@ -197,7 +197,7 @@ def wrapper(*args, **kwargs): exc_line_no = err.__traceback__.tb_lineno except AttributeError: exc_line_no = last_line_no - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exc_line_no}" + log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{exc_line_no}" f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper @@ -225,7 +225,9 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # Create indentation based on call stack depth indent = " " * _call_stack_depth - indented_msg = f"{indent}{msg}" + # Add scope to the message + scoped_msg = f"[{scope.name}] {msg}" + indented_msg = f"{indent}{scoped_msg}" # If scope is enabled, bypass the logger's level check if is_logging_scope_enabled(scope): @@ -247,7 +249,7 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): log_func(indented_msg, *args, **kwargs) finally: log.removeHandler(temp_handler) - else: + elif log_level >= log.getEffectiveLevel(): # Use normal logging with level check log_func = getattr(log, level.lower()) log_func(indented_msg, *args, **kwargs) From 
bc9453747780a270b14cd8ffd2c359c8a9297ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 01:10:01 +0200 Subject: [PATCH 053/218] next attempt to avoid message duplicates --- scripts/automated_ingestion/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 18581d5c..0cc4ba59 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -229,7 +229,7 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): scoped_msg = f"[{scope.name}] {msg}" indented_msg = f"{indent}{scoped_msg}" - # If scope is enabled, bypass the logger's level check + # If scope is enabled, use the temporary handler if is_logging_scope_enabled(scope): # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) @@ -249,7 +249,8 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): log_func(indented_msg, *args, **kwargs) finally: log.removeHandler(temp_handler) - elif log_level >= log.getEffectiveLevel(): + # Only use normal logging if scope is not enabled AND level is high enough + elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): # Use normal logging with level check log_func = getattr(log, level.lower()) log_func(indented_msg, *args, **kwargs) From f9aa559da268cb92a5412c0e00f32a039a61463c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 01:15:52 +0200 Subject: [PATCH 054/218] temporarily disable standard loggers --- .../automated_ingestion.py | 5 +++++ scripts/automated_ingestion/utils.py | 21 +++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index f68a7373..94cf9439 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -196,6 +196,9 @@ def setup_logging(config, args): logger = logging.getLogger() logger.setLevel(logging.DEBUG) # Set root logger to lowest level + # Store original handlers + logger._original_handlers = [] + # Create formatters console_formatter = logging.Formatter('%(levelname)-8s: %(message)s') file_formatter = logging.Formatter('%(asctime)s - %(levelname)-8s: %(message)s') @@ -206,6 +209,7 @@ def setup_logging(config, args): console_handler.setLevel(console_level) console_handler.setFormatter(console_formatter) logger.addHandler(console_handler) + logger._original_handlers.append(console_handler) # File handler (if log file is specified) if log_file: @@ -217,6 +221,7 @@ def setup_logging(config, args): file_handler.setLevel(file_level) file_handler.setFormatter(file_formatter) logger.addHandler(file_handler) + logger._original_handlers.append(file_handler) return logger diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 0cc4ba59..da0bf220 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -231,24 +231,27 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # If scope is enabled, use the temporary handler if is_logging_scope_enabled(scope): + # Remove all existing handlers + for handler in log.handlers[:]: + log.removeHandler(handler) + # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) temp_handler.setLevel(logging.DEBUG) - # Use 
the same format as the root logger's handlers but with fixed-width level names - if log.handlers: - # Get the original format string - orig_format = log.handlers[0].formatter._fmt - # Replace %(levelname)s with %(levelname)-8s to make it fixed width - new_format = orig_format.replace('%(levelname)s', '%(levelname)-8s') - temp_handler.setFormatter(logging.Formatter(new_format)) - else: - temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) + temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) log.addHandler(temp_handler) + try: log_func = getattr(log, level.lower()) log_func(indented_msg, *args, **kwargs) finally: log.removeHandler(temp_handler) + # Restore original handlers + for handler in log.handlers[:]: + log.removeHandler(handler) + if hasattr(log, '_original_handlers'): + for handler in log._original_handlers: + log.addHandler(handler) # Only use normal logging if scope is not enabled AND level is high enough elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): # Use normal logging with level check From 0f0dfca3e1d3726cbaa9bd8d6ac33038c186c246 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 07:52:53 +0200 Subject: [PATCH 055/218] change handling of temporary log handler for scopes --- .../automated_ingestion.py | 5 ----- scripts/automated_ingestion/utils.py | 21 ++++++++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 94cf9439..f68a7373 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -196,9 +196,6 @@ def setup_logging(config, args): logger = logging.getLogger() logger.setLevel(logging.DEBUG) # Set root logger to lowest level - # Store original handlers - logger._original_handlers = [] - # Create formatters console_formatter = logging.Formatter('%(levelname)-8s: %(message)s') file_formatter = logging.Formatter('%(asctime)s - %(levelname)-8s: %(message)s') @@ -209,7 +206,6 @@ def setup_logging(config, args): console_handler.setLevel(console_level) console_handler.setFormatter(console_formatter) logger.addHandler(console_handler) - logger._original_handlers.append(console_handler) # File handler (if log file is specified) if log_file: @@ -221,7 +217,6 @@ def setup_logging(config, args): file_handler.setLevel(file_level) file_handler.setFormatter(file_formatter) logger.addHandler(file_handler) - logger._original_handlers.append(file_handler) return logger diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index da0bf220..70fbd9de 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -231,27 +231,32 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # If scope is enabled, use the temporary handler if is_logging_scope_enabled(scope): - # Remove all existing handlers - for handler in log.handlers[:]: - log.removeHandler(handler) + # Save original handlers + original_handlers = list(log.handlers) # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) temp_handler.setLevel(logging.DEBUG) temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) - log.addHandler(temp_handler) try: + # Remove existing handlers temporarily + for handler in original_handlers: + log.removeHandler(handler) + + # Add temporary 
handler + log.addHandler(temp_handler) + + # Log the message log_func = getattr(log, level.lower()) log_func(indented_msg, *args, **kwargs) finally: log.removeHandler(temp_handler) # Restore original handlers - for handler in log.handlers[:]: - log.removeHandler(handler) - if hasattr(log, '_original_handlers'): - for handler in log._original_handlers: + for handler in original_handlers: + if handler not in log.handlers: log.addHandler(handler) + # Only use normal logging if scope is not enabled AND level is high enough elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): # Use normal logging with level check From d87b078a2cab6d373e91f02ded9b12b61e2e21bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 10:38:09 +0200 Subject: [PATCH 056/218] change default download mode and optimise fetching of etags --- .../automated_ingestion/eessi_data_object.py | 100 ++++++++++-------- 1 file changed, 57 insertions(+), 43 deletions(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index aca00adf..482ca6f3 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -155,7 +155,7 @@ def verify_signature(self) -> bool: return False @log_function_entry_exit() - def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: + def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: """ Download data file and signature based on the specified mode. @@ -165,56 +165,70 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: Returns: True if files were downloaded, False otherwise """ + # If mode is FORCE, we always download regardless of local or remote state if mode == DownloadMode.FORCE: should_download = True log_message(LoggingScope.DOWNLOAD, 'INFO', "Forcing download of %s", self.remote_file_path) + # For CHECK_REMOTE mode, check if we can optimize elif mode == DownloadMode.CHECK_REMOTE: - # First check if we have local ETags - try: - local_file_etag = self._get_local_etag(self.local_file_path) - local_sig_etag = self._get_local_etag(self.local_sig_path) + # Optimization: Check if local files exist first + local_files_exist = ( + self.local_file_path.exists() and + self.local_sig_path.exists() + ) - if local_file_etag: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) - else: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") - if local_sig_etag: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) - else: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") + # If files don't exist locally, we can skip ETag checks + if not local_files_exist: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, skipping ETag checks and downloading %s", + self.remote_file_path) + should_download = True + else: + # First check if we have local ETags + try: + local_file_etag = self._get_local_etag(self.local_file_path) + local_sig_etag = self._get_local_etag(self.local_sig_path) - # If we don't have local ETags, we need to download - if not local_file_etag or not local_sig_etag: - should_download = True - log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", - self.remote_file_path) - else: - # Get remote ETags and compare - remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] - remote_sig_etag = 
self.remote_client.get_metadata(self.remote_sig_path)['ETag'] - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) - - should_download = ( - remote_file_etag != local_file_etag or - remote_sig_etag != local_sig_etag - ) - if should_download: - if remote_file_etag != local_file_etag: - log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", - local_file_etag, remote_file_etag) - if remote_sig_etag != local_sig_etag: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", - local_sig_etag, remote_sig_etag) - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", - self.remote_file_path) + if local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") + if local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) else: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") + + # If we don't have local ETags, we need to download + if not local_file_etag or not local_sig_etag: + should_download = True + log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", self.remote_file_path) - except Exception as etag_err: - # If we get any error with ETags, we'll just download the files - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", str(etag_err)) - should_download = True + else: + # Get remote ETags and compare + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + if remote_file_etag != local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", + local_file_etag, remote_file_etag) + if remote_sig_etag != local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", + local_sig_etag, remote_sig_etag) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", + self.remote_file_path) + except Exception as etag_err: + # If we get any error with ETags, we'll just download the files + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", str(etag_err)) + should_download = True else: # CHECK_LOCAL should_download = ( not self.local_file_path.exists() or From 55aa5a1016c1953d8dabd2449822924845c469f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 10:49:22 +0200 Subject: [PATCH 057/218] download task object when necessary --- scripts/automated_ingestion/eessi_task_description.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py 
b/scripts/automated_ingestion/eessi_task_description.py index a958fa4d..c8f96ec9 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -31,7 +31,9 @@ def __init__(self, task_object: EESSIDataAndSignatureObject): """ self.task_object = task_object self.metadata = {} - + + self.task_object.download(mode=DownloadMode.CHECK_REMOTE) + # Verify signature and set initial state self.signature_verified = self.task_object.verify_signature() From 6838435d84f1cd824324f5e311b706f20de5917b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 10:51:47 +0200 Subject: [PATCH 058/218] import DownloadMode --- scripts/automated_ingestion/eessi_task_description.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index c8f96ec9..fcc5a68b 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -6,6 +6,7 @@ from eessi_data_object import EESSIDataAndSignatureObject from utils import log_function_entry_exit, log_message, LoggingScope +from remote_storage import DownloadMode @dataclass From 7afd28328d6b8b211265cfd57f3a4f0820a37377 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 15:56:26 +0200 Subject: [PATCH 059/218] initial EESSITask and support for handling states for different actions --- .../automated_ingestion.py | 29 +++-- scripts/automated_ingestion/eessi_task.py | 117 ++++++++++++++++++ .../automated_ingestion/eessi_task_action.py | 11 ++ .../eessi_task_description.py | 7 ++ 4 files changed, 154 insertions(+), 10 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_task.py create mode 100644 scripts/automated_ingestion/eessi_task_action.py diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index f68a7373..01606b01 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -2,6 +2,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from eessi_task import EESSITask from eessi_task_description import EESSITaskDescription from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 @@ -250,27 +251,35 @@ def main(): # Process each task file for task_path in tasks: try: - # Create EESSITaskDescription for the task file - task_description = EESSITaskDescription( - EESSIDataAndSignatureObject(config, task_path, s3_bucket) - ) - + # Create EESSITask for the task file + try: + task = EESSITask( + EESSITaskDescription( + EESSIDataAndSignatureObject(config, task_path, s3_bucket) + ) + ) + except Exception as err: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", task_path, str(err)) + continue + + # TODO: update the information shown below (what makes sense to show?) 
# Log information about the task - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_description.task_object.local_file_path) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_description.task_object.local_sig_path) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", task_description.signature_verified) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task.task_description.task_object.local_file_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task.task_description.task_object.local_sig_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", task.task_description.signature_verified) # Log the ETags of the downloaded task file - file_etag, sig_etag = task_description.task_object.get_etags() + file_etag, sig_etag = task.task_description.task_object.get_etags() log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", - task_description.task_object.remote_sig_path, sig_etag) + task.task_description.task_object.remote_sig_path, sig_etag) # TODO: Process the task file contents # This would involve reading the task file, parsing its contents, # and performing the required actions based on the task type log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path) + task.handle() except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to process task %s: %s", task_path, str(err)) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py new file mode 100644 index 00000000..875f1b4f --- /dev/null +++ b/scripts/automated_ingestion/eessi_task.py @@ -0,0 +1,117 @@ +from enum import Enum, auto + +from eessi_task_action import EESSITaskAction +from eessi_task_description import EESSITaskDescription + +class TaskState(Enum): + NEW = auto() # The task has been created but not yet processed + STAGED = auto() # The task has been staged to the Stratum-0 + PR_OPENED = auto() # The task has been opened as a PR in some staging repository + APPROVED = auto() # The task has been approved + REJECTED = auto() # The task has been rejected + INGESTED = auto() # The task has been ingested into the target CernVM-FS repository + + def __str__(self): + return self.name.lower() + +class EESSITask: + task_description: EESSITaskDescription + action: EESSITaskAction + state: TaskState + + def __init__(self, task_description: EESSITaskDescription): + self.task_description = task_description + self.action = self._determine_action() + self.state = TaskState.NEW + + # Define valid state transitions for all actions + self.valid_transitions = { + TaskState.NEW: [TaskState.STAGED], + TaskState.STAGED: [TaskState.PR_OPENED], + TaskState.PR_OPENED: [TaskState.APPROVED, TaskState.REJECTED], + TaskState.APPROVED: [TaskState.INGESTED], + TaskState.REJECTED: [], # Terminal state + TaskState.INGESTED: [] # Terminal state + } + + def _determine_action(self) -> EESSITaskAction: + """ + Determine the action type based on task description metadata. 
+ """ + if 'task' in self.task_description.metadata and 'action' in self.task_description.metadata['task']: + action_str = self.task_description.metadata['action'].lower() + if action_str == "nop": + return EESSITaskAction.NOP + elif action_str == "delete": + return EESSITaskAction.DELETE + elif action_str == "add": + return EESSITaskAction.ADD + elif action_str == "update": + return EESSITaskAction.UPDATE + return EESSITaskAction.UNKNOWN + + def handle(self): + """ + Dynamically find and execute the appropriate handler based on action and state. + """ + state_before_handle = self.state + + # Construct handler method name + handler_name = f"_handle_{self.action}_{self.state}" + + # Check if the handler exists + handler = getattr(self, handler_name, None) + + if handler and callable(handler): + # Execute the handler if it exists + handler() + # if state has changed, run handle() again; otherwise, do nothing + if self.state != state_before_handle: + print(f"handler {handler_name} changed state from {state_before_handle} to {self.state} ; running handle() again") + self.handle() + else: + # Default behavior for missing handlers + print(f"No handler for action {self.action} and state {self.state} implemented; nothing to be done") + + # Implement handlers for ADD action + def _handle_add_new(self): + """Handler for ADD action in NEW state""" + print("Handling ADD action in NEW state") + # Implementation for adding in NEW state + return True + + def _handle_add_staged(self): + """Handler for ADD action in STAGED state""" + print("Handling ADD action in STAGED state") + # Implementation for adding in STAGED state + return True + + def _handle_add_pr_opened(self): + """Handler for ADD action in PR_OPENED state""" + print("Handling ADD action in PR_OPENED state") + # Implementation for adding in PR_OPENED state + return True + + def _handle_add_approved(self): + """Handler for ADD action in APPROVED state""" + print("Handling ADD action in APPROVED state") + # Implementation for adding in APPROVED state + return True + + def _handle_add_ingested(self): + """Handler for ADD action in INGESTED state""" + print("Handling ADD action in INGESTED state") + # Implementation for adding in INGESTED state + return True + + def transition_to(self, new_state: TaskState): + """ + Transition the task to a new state if valid. 
+ """ + if new_state in self.valid_transitions[self.state]: + self.state = new_state + return True + return False + + def __str__(self): + return f"EESSITask(task_description={self.task_description})" \ No newline at end of file diff --git a/scripts/automated_ingestion/eessi_task_action.py b/scripts/automated_ingestion/eessi_task_action.py new file mode 100644 index 00000000..8f0ce599 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_action.py @@ -0,0 +1,11 @@ +from enum import Enum, auto + +class EESSITaskAction(Enum): + NOP = auto() # perform no action + DELETE = auto() # perform a delete operation + ADD = auto() # perform an add operation + UPDATE = auto() # perform an update operation + UNKNOWN = auto() # unknown action + + def __str__(self): + return self.name.lower() diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index fcc5a68b..b615b3df 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -45,6 +45,13 @@ def __init__(self, task_object: EESSIDataAndSignatureObject): # Expected if signature is not verified yet pass + # TODO: Process the task file contents + # check if the task file contains a task field and add that to self + if 'task' in self.metadata: + self.task = self.metadata['task'] + else: + self.task = None + @log_function_entry_exit() def _read_metadata(self) -> None: """ From 49738c51bfd616e24df0db9713483c2ba0c0ea3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 16:00:44 +0200 Subject: [PATCH 060/218] action is a field in task --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 875f1b4f..225e1c45 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -39,7 +39,7 @@ def _determine_action(self) -> EESSITaskAction: Determine the action type based on task description metadata. 
""" if 'task' in self.task_description.metadata and 'action' in self.task_description.metadata['task']: - action_str = self.task_description.metadata['action'].lower() + action_str = self.task_description.metadata['task']['action'].lower() if action_str == "nop": return EESSITaskAction.NOP elif action_str == "delete": From f265d7920565dd0d4e4318ecf571b73a6699d054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 20:17:45 +0200 Subject: [PATCH 061/218] determine metadata/task state from GH staging repo --- .../automated_ingestion.py | 5 +- scripts/automated_ingestion/eessi_task.py | 159 +++++++++++++++++- .../eessi_task_description.py | 32 ++++ 3 files changed, 186 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 01606b01..1706b4bb 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -256,12 +256,15 @@ def main(): task = EESSITask( EESSITaskDescription( EESSIDataAndSignatureObject(config, task_path, s3_bucket) - ) + ), + gh_staging_repo ) except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", task_path, str(err)) continue + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task: %s", task) + # TODO: update the information shown below (what makes sense to show?) # Log information about the task log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task.task_description.task_object.local_file_path) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 225e1c45..4bf122c0 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -11,18 +11,33 @@ class TaskState(Enum): REJECTED = auto() # The task has been rejected INGESTED = auto() # The task has been ingested into the target CernVM-FS repository + @classmethod + def from_string(cls, name, default=None, case_sensitive=False): + if case_sensitive: + return cls.__members__.get(name, default) + + try: + return next( + member for member_name, member in cls.__members__.items() + if member_name.lower() == name.lower() + ) + except StopIteration: + return default + def __str__(self): return self.name.lower() + class EESSITask: - task_description: EESSITaskDescription + description: EESSITaskDescription action: EESSITaskAction state: TaskState + git_repo: Github - def __init__(self, task_description: EESSITaskDescription): - self.task_description = task_description - self.action = self._determine_action() - self.state = TaskState.NEW + def __init__(self, description: EESSITaskDescription, git_repo: Github): + self.description = description + self.git_repo = git_repo + self.action = self._determine_task_action() # Define valid state transitions for all actions self.valid_transitions = { @@ -34,12 +49,14 @@ def __init__(self, task_description: EESSITaskDescription): TaskState.INGESTED: [] # Terminal state } - def _determine_action(self) -> EESSITaskAction: + self.state = self._find_state() + + def _determine_task_action(self) -> EESSITaskAction: """ Determine the action type based on task description metadata. 
""" - if 'task' in self.task_description.metadata and 'action' in self.task_description.metadata['task']: - action_str = self.task_description.metadata['task']['action'].lower() + if 'task' in self.description.metadata and 'action' in self.description.metadata['task']: + action_str = self.description.metadata['task']['action'].lower() if action_str == "nop": return EESSITaskAction.NOP elif action_str == "delete": @@ -50,6 +67,130 @@ def _determine_action(self) -> EESSITaskAction: return EESSITaskAction.UPDATE return EESSITaskAction.UNKNOWN + def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: + """ + Check if a file exists in a repository branch. + """ + if branch is None: + branch = self.git_repo.default_branch + try: + self.git_repo.get_contents(file_path, ref=branch) + log_msg = "Found file %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path, branch) + return True + except github.UnknownObjectException: + # file_path does not exist in branch + return False + except github.GithubException as err: + if err.status == 404: + # file_path does not exist in branch + return False + else: + # if there was some other (e.g. connection) issue, log message and return False + log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' + log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) + return False + return False + + def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: + """ + Determines in which sequence numbers the metadata/task file is included and in which it is not. + + Returns: + A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is included in that sequence number. + + Idea: + - The deployment for a single source PR could be split into multiple staging PRs each is assigned a unique + sequence number. + - For a given source PR (identified by the repo name and the PR number), a staging PR using a branch named + `REPO/PR_NUM/SEQ_NUM` is created. + - In the staging repo we create a corresponding directory `REPO/PR_NUM/SEQ_NUM`. + - If a metadata/task file is handled by the staging PR with sequence number, it is included in that directory. + - We iterate over all directories under `REPO/PR_NUM`: + - If the metadata/task file is available in the directory, we add the sequence number to the list. + + Note: this is a placeholder for now, as we do not know yet if we need to use a sequence number. + """ + sequence_numbers = {} + repo = self.description.metadata['task']['repo'] + pr = self.description.metadata['task']['pr'] + repo_pr_dir = f"{repo}/{pr}" + # iterate over all directories under repo_pr_dir + for dir in self._list_directory_contents(repo_pr_dir): + # check if the directory is a number + if dir.name.isdigit(): + remote_file_path = self.description.task_object.remote_file_path + if self._file_exists_in_repo_branch(f"{repo_pr_dir}/{dir.name}/{remote_file_path}"): + sequence_numbers[int(dir.name)] = True + else: + sequence_numbers[int(dir.name)] = False + else: + # directory is not a number, so we skip it + continue + return sequence_numbers + + def _find_state(self) -> TaskState: + """ + Determine the state of the task based on the task description metadata. + + Returns: + The state of the task. 
+ """ + # obtain repo and pr from metadata + repo = self.description.metadata['task']['repo'] + pr = self.description.metadata['task']['pr'] + + # iterate over all sequence numbers in repo/pr dir + sequence_numbers = self._determine_sequence_numbers_including_task_file() + for sequence_number in [key for key, value in sequence_numbers.items() if value]: + # create path to metadata file from repo, PR, repo, sequence number, metadata file name, state name + # format of the metadata file name is: + # eessi-VERSION-COMPONENT-OS-ARCHITECTURE-TIMESTAMP.SUFFIX + # all uppercase words are placeholders + # all placeholders (except ARCHITECTURE) do not include any hyphens + # ARCHITECTURE can include one to two hyphens + # The SUFFIX is composed of two parts: TARBALLSUFFIX and METADATASUFFIX + # TARBALLSUFFIX is defined by the task object or in the configuration file + # METADATASUFFIX is defined by the task object or in the configuration file + # Later, we may switch to using task action files instead of metadata files. The format of the + # SUFFIX would then be defined by the task action or the configuration file. + version, component, os, architecture, timestamp, suffix = self.description.get_metadata_file_components() + metadata_file_name = f"eessi-{version}-{component}-{os}-{architecture}-{timestamp}.{suffix}" + metadata_file_state_path = f"{repo}/{pr}/{sequence_number}/{metadata_file_name}" + # get the state from the file in the metadata_file_state_path + state = self._get_state_from_metadata_file(metadata_file_state_path) + return state + # did not find metadata file in staging repo on GitHub + return TaskState.NEW + + def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskState: + """ + Get the state from the file in the metadata_file_state_path. + """ + # get contents of metadata_file_state_path + contents = self.git_repo.get_contents(metadata_file_state_path) + try: + state = TaskState.from_string(contents.name) + return state + except ValueError: + return TaskState.NEW + + def _list_directory_contents(self, directory_path, branch=None): + try: + # Get contents of the directory + contents = self.git_repo.get_contents(directory_path, ref=branch) + + # If contents is a list, it means we successfully got directory contents + if isinstance(contents, list): + return contents + else: + # If it's not a list, it means the path is not a directory + raise ValueError(f"{directory_path} is not a directory") + except github.GithubException as err: + if err.status == 404: + raise FileNotFoundError(f"Directory not found: {directory_path}") + raise err + def handle(self): """ Dynamically find and execute the appropriate handler based on action and state. @@ -114,4 +255,4 @@ def transition_to(self, new_state: TaskState): return False def __str__(self): - return f"EESSITask(task_description={self.task_description})" \ No newline at end of file + return f"EESSITask(description={self.description}, action={self.action}, state={self.state})" \ No newline at end of file diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index b615b3df..866121ef 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -76,6 +76,38 @@ def _read_metadata(self) -> None: self.task_object.local_file_path, str(e)) raise + def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: + """ + Get the components of the metadata file name. 
+ + An example of the metadata file name is: + eessi-2023.06-software-linux-x86_64-amd-zen2-1745557626.tar.gz.meta.txt + + The components are: + eessi: some prefix + VERSION: 2023.06 + COMPONENT: software + OS: linux + ARCHITECTURE: x86_64-amd-zen2 + TIMESTAMP: 1745557626 + SUFFIX: tar.gz.meta.txt + + The ARCHITECTURE component can include one to two hyphens. + The SUFFIX is the part after the first dot (no other components should include dots). + """ + # obtain file name from local file path using basename + file_name = Path(self.task_object.local_file_path).name + # split file_name into part before suffix and the suffix + # from file_name_without_suffix determine VERSION (2nd element), COMPONENT (3rd element), OS (4th element), + # ARCHITECTURE (5th to second last elements) and TIMESTAMP (last element) + components = file_name_without_suffix.split('-') + version = components[1] + component = components[2] + os = components[3] + architecture = '-'.join(components[4:-1]) + timestamp = components[-1] + return version, component, os, architecture, timestamp, suffix + def __str__(self) -> str: """Return a string representation of the EESSITaskDescription object.""" return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" \ No newline at end of file From 3959780d4634d48da471b6e60c8f06babb62db74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 20:20:36 +0200 Subject: [PATCH 062/218] import missing Tuple --- scripts/automated_ingestion/eessi_task_description.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 866121ef..618b7968 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -2,7 +2,7 @@ import subprocess from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple from eessi_data_object import EESSIDataAndSignatureObject from utils import log_function_entry_exit, log_message, LoggingScope From c5e45a1a8e699c940d6cb36f787eeb504fa0071b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:09:19 +0200 Subject: [PATCH 063/218] flake8 improvements --- .../automated_ingestion.py | 60 +++++----- .../automated_ingestion/eessi_data_object.py | 107 +++++++++--------- scripts/automated_ingestion/eessi_task.py | 24 ++-- .../automated_ingestion/eessi_task_action.py | 1 + .../eessi_task_description.py | 33 +++--- scripts/automated_ingestion/eessitarball.py | 28 +++-- scripts/automated_ingestion/remote_storage.py | 2 +- scripts/automated_ingestion/s3_bucket.py | 11 +- scripts/automated_ingestion/utils.py | 14 ++- 9 files changed, 149 insertions(+), 131 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 1706b4bb..12429e4a 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from eessitarball import EessiTarball, EessiTarballGroup -from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from eessi_data_object import EESSIDataAndSignatureObject from eessi_task import EESSITask from eessi_task_description import EESSITaskDescription from s3_bucket import EESSIS3Bucket @@ -10,7 +10,6 @@ from 
utils import log_function_entry_exit, log_message, LoggingScope, set_logging_scopes import argparse -import boto3 import configparser import github import json @@ -19,7 +18,7 @@ import pid import sys from pathlib import Path -from typing import List, Dict +from typing import List REQUIRED_CONFIG = { 'secrets': ['aws_secret_access_key', 'aws_access_key_id', 'github_pat'], @@ -135,30 +134,31 @@ def parse_args(): # Logging options logging_group = parser.add_argument_group('Logging options') logging_group.add_argument('--log-file', - help='Path to log file (overrides config file setting)') + help='Path to log file (overrides config file setting)') logging_group.add_argument('--console-level', - choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], - help='Logging level for console output (overrides config file setting)') + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logging level for console output (overrides config file setting)') logging_group.add_argument('--file-level', - choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], - help='Logging level for file output (overrides config file setting)') + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logging level for file output (overrides config file setting)') logging_group.add_argument('--quiet', - action='store_true', - help='Suppress console output (overrides all other console settings)') + action='store_true', + help='Suppress console output (overrides all other console settings)') logging_group.add_argument('--log-scopes', - help='Comma-separated list of logging scopes using +/- syntax. ' - 'Examples: "+FUNC_ENTRY_EXIT" (enable only function entry/exit), ' - '"+ALL,-FUNC_ENTRY_EXIT" (enable all except function entry/exit), ' - '"+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" (enable function entry/exit but disable example)') + help='Comma-separated list of logging scopes using +/- syntax. ' + 'Examples: "+FUNC_ENTRY_EXIT" (enable only function entry/exit), ' + '"+ALL,-FUNC_ENTRY_EXIT" (enable all except function entry/exit), ' + '"+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" (enable function entry/exit but disable example)') # Existing arguments parser.add_argument('-c', '--config', type=str, help='path to configuration file', - default='automated_ingestion.cfg', dest='config') + default='automated_ingestion.cfg', dest='config') parser.add_argument('-d', '--debug', help='enable debug mode', action='store_true', dest='debug') - parser.add_argument('-l', '--list', help='only list available tarballs or tasks', action='store_true', dest='list_only') + parser.add_argument('-l', '--list', help='only list available tarballs or tasks', action='store_true', + dest='list_only') parser.add_argument('--task-based', help='use task-based ingestion instead of tarball-based. 
'
-                        'Optionally specify comma-separated list of extensions (default: .task)',
-                        nargs='?', const='.task', default=False)
+                       'Optionally specify comma-separated list of extensions (default: .task)',
+                       nargs='?', const='.task', default=False)
 
     return parser.parse_args()
 
@@ -175,7 +175,6 @@ def setup_logging(config, args):
     """
     # Get settings from config file
     log_file = config['logging'].get('filename')
-    log_format = config['logging'].get('format', '%(levelname)s: %(message)s')
     config_console_level = LOG_LEVELS.get(config['logging'].get('level', 'INFO').upper(), logging.INFO)
     config_file_level = LOG_LEVELS.get(config['logging'].get('file_level', 'DEBUG').upper(), logging.DEBUG)
 
@@ -256,27 +255,29 @@ def main():
                 task = EESSITask(
                     EESSITaskDescription(
                         EESSIDataAndSignatureObject(config, task_path, s3_bucket)
-                    ),
+                    ),
                     gh_staging_repo
                 )
             except Exception as err:
-                log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", task_path, str(err))
+                log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s",
+                            task_path, str(err))
                 continue
 
             log_message(LoggingScope.GROUP_OPS, 'INFO', "Task: %s", task)
 
             # TODO: update the information shown below (what makes sense to show?)
             # Log information about the task
-            log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task.task_description.task_object.local_file_path)
-            log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task.task_description.task_object.local_sig_path)
-            log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", task.task_description.signature_verified)
+            task_object = task.description.task_object
+            log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_object.local_file_path)
+            log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_object.local_sig_path)
+            log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s",
+                        task.description.signature_verified)
 
             # Log the ETags of the downloaded task file
-            file_etag, sig_etag = task.task_description.task_object.get_etags()
+            file_etag, sig_etag = task_object.get_etags()
             log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag)
-            log_message(LoggingScope.GROUP_OPS, 'INFO',
-                        "Task signature %s has ETag: %s",
-                        task.task_description.task_object.remote_sig_path, sig_etag)
+            log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s",
+                        task_object.remote_sig_path, sig_etag)
 
             # TODO: Process the task file contents
             # This would involve reading the task file, parsing its contents,
             # and performing the required actions based on the task type
             log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path)
@@ -301,7 +302,8 @@ def main():
                     if tarballs:
                         # Create a group for these tarballs
                         group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3_bucket, cvmfs_repo)
-                        log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True))
+                        log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s",
+                                    group.to_string(oneline=True))
                         group.process_group(tarballs)
                     else:
                         # use old individual PR method
diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py
index 482ca6f3..6e8189fe 100644
--- a/scripts/automated_ingestion/eessi_data_object.py
+++ b/scripts/automated_ingestion/eessi_data_object.py
@@ -1,10 +1,8 @@
-import os
 import subprocess
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
 
-import boto3
 import configparser
 
 from utils import log_function_entry_exit, log_message, LoggingScope
@@ -88,24 +86,24 @@ def 
get_etags(self) -> tuple[Optional[str], Optional[str]]: def verify_signature(self) -> bool: """ Verify the signature of the data file using the corresponding signature file. - + Returns: bool: True if the signature is valid or if signatures are not required, False otherwise """ # Check if signature file exists if not self.local_sig_path.exists(): - log_message(LoggingScope.VERIFICATION, 'WARNING', "Signature file %s is missing", - self.local_sig_path) - + log_message(LoggingScope.VERIFICATION, 'WARNING', "Signature file %s is missing", + self.local_sig_path) + # If signatures are required, return failure if self.config['signatures'].getboolean('signatures_required', True): - log_message(LoggingScope.ERROR, 'ERROR', "Signature file %s is missing and signatures are required", - self.local_sig_path) + log_message(LoggingScope.ERROR, 'ERROR', "Signature file %s is missing and signatures are required", + self.local_sig_path) return False else: - log_message(LoggingScope.VERIFICATION, 'INFO', - "Signature file %s is missing, but signatures are not required", - self.local_sig_path) + log_message(LoggingScope.VERIFICATION, 'INFO', + "Signature file %s is missing, but signatures are not required", + self.local_sig_path) return True # If signatures are provided, we should always verify them, regardless of the signatures_required setting @@ -115,13 +113,13 @@ def verify_signature(self) -> bool: # Check if verification tools exist if not Path(verify_script).exists(): - log_message(LoggingScope.ERROR, 'ERROR', - "Unable to verify signature: verification script %s does not exist", verify_script) + log_message(LoggingScope.ERROR, 'ERROR', + "Unable to verify signature: verification script %s does not exist", verify_script) return False if not Path(allowed_signers_file).exists(): - log_message(LoggingScope.ERROR, 'ERROR', - "Unable to verify signature: allowed signers file %s does not exist", allowed_signers_file) + log_message(LoggingScope.ERROR, 'ERROR', + "Unable to verify signature: allowed signers file %s does not exist", allowed_signers_file) return False # Run the verification command with named parameters @@ -133,25 +131,25 @@ def verify_signature(self) -> bool: '--signature-file', str(self.local_sig_path) ] log_message(LoggingScope.VERIFICATION, 'INFO', "Running command: %s", ' '.join(cmd)) - + try: result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode == 0: - log_message(LoggingScope.VERIFICATION, 'INFO', - "Successfully verified signature for %s", self.local_file_path) + log_message(LoggingScope.VERIFICATION, 'INFO', + "Successfully verified signature for %s", self.local_file_path) log_message(LoggingScope.VERIFICATION, 'DEBUG', " stdout: %s", result.stdout) log_message(LoggingScope.VERIFICATION, 'DEBUG', " stderr: %s", result.stderr) return True else: - log_message(LoggingScope.ERROR, 'ERROR', - "Signature verification failed for %s", self.local_file_path) + log_message(LoggingScope.ERROR, 'ERROR', + "Signature verification failed for %s", self.local_file_path) log_message(LoggingScope.ERROR, 'ERROR', " stdout: %s", result.stdout) log_message(LoggingScope.ERROR, 'ERROR', " stderr: %s", result.stderr) return False - except Exception as e: - log_message(LoggingScope.ERROR, 'ERROR', - "Error during signature verification for %s: %s", - self.local_file_path, str(e)) + except Exception as err: + log_message(LoggingScope.ERROR, 'ERROR', + "Error during signature verification for %s: %s", + self.local_file_path, str(err)) return False @log_function_entry_exit() @@ -179,8 
+177,9 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: # If files don't exist locally, we can skip ETag checks if not local_files_exist: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, skipping ETag checks and downloading %s", - self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', + "Local files missing, skipping ETag checks and downloading %s", + self.remote_file_path) should_download = True else: # First check if we have local ETags @@ -201,7 +200,7 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: if not local_file_etag or not local_sig_etag: should_download = True log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", - self.remote_file_path) + self.remote_file_path) else: # Get remote ETags and compare remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] @@ -216,18 +215,20 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: if should_download: if remote_file_etag != local_file_etag: log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", - local_file_etag, remote_file_etag) + local_file_etag, remote_file_etag) if remote_sig_etag != local_sig_etag: log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", - local_sig_etag, remote_sig_etag) + local_sig_etag, remote_sig_etag) log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", - self.remote_file_path) + self.remote_file_path) else: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", - self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', + "Remote files unchanged, skipping download of %s", + self.remote_file_path) except Exception as etag_err: # If we get any error with ETags, we'll just download the files - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", str(etag_err)) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", + str(etag_err)) should_download = True else: # CHECK_LOCAL should_download = ( @@ -239,11 +240,11 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: log_message(LoggingScope.DOWNLOAD, 'INFO', "Local file missing: %s", self.local_file_path) if not self.local_sig_path.exists(): log_message(LoggingScope.DOWNLOAD, 'INFO', "Local signature missing: %s", self.local_sig_path) - log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, downloading %s", - self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, downloading %s", + self.remote_file_path) else: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files exist, skipping download of %s", - self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files exist, skipping download of %s", + self.remote_file_path) if not should_download: return False @@ -259,24 +260,24 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: # Get and log the ETag of the downloaded file try: file_etag = self._get_local_etag(self.local_file_path) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", - self.remote_file_path, file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", + self.remote_file_path, file_etag) except Exception as etag_err: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s", - 
self.remote_file_path, str(etag_err))
+            log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s",
+                        self.remote_file_path, str(etag_err))
 
         # Try to download the signature file
         try:
             self.remote_client.download(self.remote_sig_path, str(self.local_sig_path))
             try:
                 sig_etag = self._get_local_etag(self.local_sig_path)
-                log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s",
-                    self.remote_sig_path, sig_etag)
+                log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s",
+                            self.remote_sig_path, sig_etag)
             except Exception as etag_err:
-                log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s",
-                    self.remote_sig_path, str(etag_err))
-            log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s and its signature",
-                self.remote_file_path)
+                log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s",
+                            self.remote_sig_path, str(etag_err))
+            log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s and its signature",
+                        self.remote_file_path)
         except Exception as sig_err:
             # Check if signatures are required
             if self.config['signatures'].getboolean('signatures_required', True):
@@ -290,8 +291,8 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool:
                     sig_etag_path = self._get_etag_file_path(self.local_sig_path)
                     if sig_etag_path.exists():
                         sig_etag_path.unlink()
-                log_message(LoggingScope.ERROR, 'ERROR', "Failed to download required signature for %s: %s",
-                    self.remote_file_path, str(sig_err))
+                log_message(LoggingScope.ERROR, 'ERROR', "Failed to download required signature for %s: %s",
+                            self.remote_file_path, str(sig_err))
                 raise
             else:
                 # If signatures are optional, just clean up any partial signature files
@@ -300,10 +301,10 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool:
                     sig_etag_path = self._get_etag_file_path(self.local_sig_path)
                     if sig_etag_path.exists():
                         sig_etag_path.unlink()
-                log_message(LoggingScope.DOWNLOAD, 'WARNING', "Failed to download optional signature for %s: %s",
-                    self.remote_file_path, str(sig_err))
-                log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s (signature optional)",
-                    self.remote_file_path)
+                log_message(LoggingScope.DOWNLOAD, 'WARNING', "Failed to download optional signature for %s: %s",
+                            self.remote_file_path, str(sig_err))
+                log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s (signature optional)",
+                            self.remote_file_path)
 
             return True
         except Exception as err:
diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 4bf122c0..6bbe498f 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -1,7 +1,10 @@
 from enum import Enum, auto
-
+from typing import Dict
 from eessi_task_action import EESSITaskAction
 from eessi_task_description import EESSITaskDescription
+from utils import log_message, LoggingScope
+# the Github class is only used in type annotations; the exception classes
+# (UnknownObjectException, GithubException) live at module level, so import both
+import github
+from github import Github
+
 
 class TaskState(Enum):
     NEW = auto()  # The task has been created but not yet processed
@@ -78,14 +81,14 @@ def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool:
             log_msg = "Found file %s in branch %s"
             log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path, branch)
             return True
         except github.UnknownObjectException:
             # file_path does not exist in branch
             return False
         except github.GithubException as err:
             if err.status == 404:
                 # file_path does not exist in 
branch
                 return False
-            else: 
+            else:
                 # if there was some other (e.g. connection) issue, log message and return False
                 log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!'
                 log_message(LoggingScope.ERROR, 'WARNING', log_msg, file_path, err.status)
@@ -97,13 +100,14 @@ def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]:
         Determines in which sequence numbers the metadata/task file is included and in which it is not.
 
         Returns:
-            A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is included in that sequence number.
+            A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is
+            included in that sequence number.
 
         Idea:
         - The deployment for a single source PR could be split into multiple staging PRs, each assigned a unique
           sequence number.
         - For a given source PR (identified by the repo name and the PR number), a staging PR using a branch named
-          `REPO/PR_NUM/SEQ_NUM` is created. 
+          `REPO/PR_NUM/SEQ_NUM` is created.
         - In the staging repo we create a corresponding directory `REPO/PR_NUM/SEQ_NUM`.
         - If a metadata/task file is handled by the staging PR with sequence number, it is included in that directory.
         - We iterate over all directories under `REPO/PR_NUM`:
@@ -208,7 +212,9 @@ def handle(self):
             handler()
             # if state has changed, run handle() again; otherwise, do nothing
             if self.state != state_before_handle:
-                print(f"handler {handler_name} changed state from {state_before_handle} to {self.state} ; running handle() again")
+                msg = f"handler {handler_name} changed state from {state_before_handle} to {self.state}"
+                msg += "; running handle() again"
+                print(msg)
                 self.handle()
         else:
             # Default behavior for missing handlers
@@ -255,4 +261,4 @@ def transition_to(self, new_state: TaskState):
         return False
 
     def __str__(self):
-        return f"EESSITask(description={self.description}, action={self.action}, state={self.state})" \ No newline at end of file
+        return f"EESSITask(description={self.description}, action={self.action}, state={self.state})"
diff --git a/scripts/automated_ingestion/eessi_task_action.py b/scripts/automated_ingestion/eessi_task_action.py
index 8f0ce599..6f141435 100644
--- a/scripts/automated_ingestion/eessi_task_action.py
+++ b/scripts/automated_ingestion/eessi_task_action.py
@@ -1,5 +1,6 @@
 from enum import Enum, auto
 
+
 class EESSITaskAction(Enum):
     NOP = auto()  # perform no action
     DELETE = auto()  # perform a delete operation
diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py
index 618b7968..271ff9a9 100644
--- a/scripts/automated_ingestion/eessi_task_description.py
+++ b/scripts/automated_ingestion/eessi_task_description.py
@@ -1,8 +1,7 @@
 import json
-import subprocess
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Tuple
 
 from eessi_data_object import EESSIDataAndSignatureObject
 from utils import log_function_entry_exit, log_message, LoggingScope
@@ -37,7 +36,7 @@ def __init__(self, task_object: 
EESSIDataAndSignatureObject):
         # Verify signature and set initial state
         self.signature_verified = self.task_object.verify_signature()
-    
+
         # Try to read metadata (will only succeed if signature is verified)
         try:
             self._read_metadata()
@@ -59,21 +58,22 @@ def _read_metadata(self) -> None:
         Only reads metadata if the signature has been verified.
         """
         if not self.signature_verified:
-            log_message(LoggingScope.ERROR, 'ERROR', "Cannot read metadata: signature not verified for %s",
-                self.task_object.local_file_path)
+            log_message(LoggingScope.ERROR, 'ERROR', "Cannot read metadata: signature not verified for %s",
+                        self.task_object.local_file_path)
             raise RuntimeError("Cannot read metadata: signature not verified")
 
         try:
-            with open(self.task_object.local_file_path, 'r') as f:
-                self.metadata = json.load(f)
-            log_message(LoggingScope.DEBUG, 'DEBUG', "Successfully read metadata from %s", self.task_object.local_file_path)
-        except json.JSONDecodeError as e:
-            log_message(LoggingScope.ERROR, 'ERROR', "Failed to parse JSON in task description file %s: %s",
-                self.task_object.local_file_path, str(e))
+            with open(self.task_object.local_file_path, 'r') as file:
+                self.metadata = json.load(file)
+            log_message(LoggingScope.DEBUG, 'DEBUG', "Successfully read metadata from %s",
+                        self.task_object.local_file_path)
+        except json.JSONDecodeError as err:
+            log_message(LoggingScope.ERROR, 'ERROR', "Failed to parse JSON in task description file %s: %s",
+                        self.task_object.local_file_path, str(err))
             raise
-        except Exception as e:
-            log_message(LoggingScope.ERROR, 'ERROR', "Failed to read task description file %s: %s",
-                self.task_object.local_file_path, str(e))
+        except Exception as err:
+            log_message(LoggingScope.ERROR, 'ERROR', "Failed to read task description file %s: %s",
+                        self.task_object.local_file_path, str(err))
             raise
 
     def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]:
@@ -98,6 +98,10 @@ def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]:
         # obtain file name from local file path using basename
         file_name = Path(self.task_object.local_file_path).name
         # split file_name into part before suffix and the suffix
+        # idea: split on last hyphen, then split on first dot
+        suffix = file_name.split('-')[-1].split('.', 1)[1]
+        # use removesuffix(): strip() takes a set of characters, not a suffix, and would mangle the name
+        file_name_without_suffix = file_name.removesuffix(f".{suffix}")
         # from file_name_without_suffix determine VERSION (2nd element), COMPONENT (3rd element), OS (4th element),
         # ARCHITECTURE (5th to second last elements) and TIMESTAMP (last element)
         components = file_name_without_suffix.split('-')
@@ -110,4 +114,4 @@ def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]:
 
     def __str__(self) -> str:
         """Return a string representation of the EESSITaskDescription object."""
-        return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" \ No newline at end of file
+        return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})"
diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py
index eca6b67b..cb4ae801 100644
--- a/scripts/automated_ingestion/eessitarball.py
+++ b/scripts/automated_ingestion/eessitarball.py
@@ -1,11 +1,9 @@
 from utils import send_slack_message, sha256sum, log_function_entry_exit, log_message, LoggingScope
-from s3_bucket import EESSIS3Bucket
 
 from pathlib import PurePosixPath
 
 import github
 import json
-import logging
 import os
 import subprocess
 import tarfile
@@ -251,11 +249,11 @@ def 
verify_signatures(self): (self.local_metadata_path, self.local_metadata_sig_path) ]: command = verify_runenv + [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, - '--file', file, '--signature-file', sig_file] + '--file', file, '--signature-file', sig_file] log_message(LoggingScope.VERIFICATION, 'INFO', "Running command: %s", ' '.join(command)) verify_cmd = subprocess.run( - command, + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if verify_cmd.returncode == 0: @@ -455,9 +453,9 @@ def make_approval_request(self, tarballs_in_group=None): log_msg = 'Warning, tarball %s is in a weird state:' log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, self.object) log_msg = 'Branch: %s\nPR: %s\nPR state: %s\nPR merged: %s' - log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, - git_branch, pr, pr.state, pr.merged) - # TODO: should we delete the branch or open an issue? + log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, + git_branch, pr, pr.state, pr.merged) + # TODO: should we delete the branch or open an issue? return else: log_msg = 'Tarball %s has a branch, but no PR.' @@ -471,8 +469,8 @@ def make_approval_request(self, tarballs_in_group=None): # Move metadata file(s) to approved directory log_msg = "Moving metadata for %s from %s to %s in branch %s" - log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, - self.object, self.state, next_state, git_branch) + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, + self.object, self.state, next_state, git_branch) if tarballs_in_group is None: log_message(LoggingScope.GITHUB_OPS, 'INFO', "Moving metadata for individual tarball to staged") self.move_metadata_file(self.state, next_state, branch=git_branch) @@ -485,7 +483,7 @@ def make_approval_request(self, tarballs_in_group=None): # Create PR with appropriate template try: - pr_url=f"https://github.com/{repo}/pull/{pr_id}", + pr_url = f"https://github.com/{repo}/pull/{pr_id}" if tarballs_in_group is None: log_msg = "Creating PR for individual tarball: %s" log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, self.object) @@ -589,8 +587,8 @@ def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file file_path_new = new_state + '/' + self.metadata_file - log_message(LoggingScope.GITHUB_OPS, 'INFO', 'Moving metadata file %s from %s to %s in branch %s', - self.metadata_file, file_path_old, file_path_new, branch) + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'Moving metadata file %s from %s to %s in branch %s', + self.metadata_file, file_path_old, file_path_new, branch) tarball_metadata = self.git_repo.get_contents(file_path_old) # Remove the metadata file from the old state's directory... 
self.git_repo.delete_file(file_path_old, 'remove from ' + old_state, sha=tarball_metadata.sha, branch=branch) @@ -629,7 +627,7 @@ def extract_checked_tarballs(self, pr_body): checked_tarballs = [] for line in pr_body.split('\n'): if line.strip().startswith('- [x] '): - tarball = line.strip()[6:] # Remove '- [x] ' prefix + tarball = line.strip()[6:] # Remove '- [x] ' prefix checked_tarballs.append(tarball) return checked_tarballs @@ -638,7 +636,7 @@ def extract_tarballs_from_pr_body(self, pr_body): tarballs = [] for line in pr_body.split('\n'): if line.strip().startswith('- ['): - tarball = line.strip()[6:] # Remove '- [ ] ' or '- [x] ' prefix + tarball = line.strip()[6:] # Remove '- [ ] ' or '- [x] ' prefix tarballs.append(tarball) return tarballs @@ -704,7 +702,7 @@ def process_group(self, tarballs): # Mark all tarballs as staged in the group branch, however need to handle first tarball differently log_msg = "Processing first tarball in group: %s" log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, self.first_tar.object) - self.first_tar.mark_new_tarball_as_staged('main') # this sets the state of the first tarball to 'staged' + self.first_tar.mark_new_tarball_as_staged('main') # this sets the state of the first tarball to 'staged' for tarball in tarballs[1:]: log_msg = "Processing tarball in group: %s" log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, tarball) diff --git a/scripts/automated_ingestion/remote_storage.py b/scripts/automated_ingestion/remote_storage.py index ac005af8..2a386a7d 100644 --- a/scripts/automated_ingestion/remote_storage.py +++ b/scripts/automated_ingestion/remote_storage.py @@ -31,4 +31,4 @@ def download(self, remote_path: str, local_path: str) -> None: remote_path: Path to the object in remote storage local_path: Local path where to save the file """ - ... \ No newline at end of file + ... 
diff --git a/scripts/automated_ingestion/s3_bucket.py b/scripts/automated_ingestion/s3_bucket.py index 79b8a055..ff62813f 100644 --- a/scripts/automated_ingestion/s3_bucket.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -3,10 +3,11 @@ from typing import Dict, Optional import boto3 - +from botocore.exceptions import ClientError from utils import log_function_entry_exit, log_message, LoggingScope from remote_storage import RemoteStorageClient + class EESSIS3Bucket(RemoteStorageClient): """EESSI-specific S3 bucket implementation of the RemoteStorageClient protocol.""" @@ -98,8 +99,8 @@ def get_metadata(self, remote_path: str) -> Dict: response = self.client.head_object(Bucket=self.bucket, Key=remote_path) log_message(LoggingScope.DEBUG, 'DEBUG', "Retrieved metadata for %s: %s", remote_path, response) return response - except ClientError as e: - log_message(LoggingScope.ERROR, 'ERROR', "Failed to get metadata for %s: %s", remote_path, str(e)) + except ClientError as err: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to get metadata for %s: %s", remote_path, str(err)) raise def _get_etag_file_path(self, local_path: str) -> Path: @@ -143,8 +144,8 @@ def download(self, remote_path: str, local_path: str) -> None: log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading %s to %s", remote_path, local_path) self.client.download_file(Bucket=self.bucket, Key=remote_path, Filename=local_path) log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s to %s", remote_path, local_path) - except ClientError as e: - log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", remote_path, str(e)) + except ClientError as err: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", remote_path, str(err)) raise # Get metadata first to obtain the ETag diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 70fbd9de..ab1e2b2f 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -9,6 +9,7 @@ from enum import IntFlag, auto import sys + class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" NONE = 0 @@ -20,15 +21,18 @@ class LoggingScope(IntFlag): GROUP_OPS = auto() # Logging related to tarball group operations ERROR = auto() # Error logging (separate from other scopes for easier filtering) DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) - ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_CHANGE | + ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_CHANGE | GITHUB_OPS | GROUP_OPS | ERROR | DEBUG) + # Global setting for logging scopes ENABLED_LOGGING_SCOPES = LoggingScope.NONE + # Global variable to track call stack depth _call_stack_depth = 0 + def set_logging_scopes(scopes): """ Set the enabled logging scopes. 
@@ -90,10 +94,12 @@ def set_logging_scopes(scopes): # Convert list to comma-separated string and process set_logging_scopes(",".join(scopes)) + def is_logging_scope_enabled(scope): """Check if a specific logging scope is enabled.""" return bool(ENABLED_LOGGING_SCOPES & scope) + def send_slack_message(webhook, msg): """Send a Slack message.""" slack_data = {'text': msg} @@ -187,7 +193,7 @@ def wrapper(*args, **kwargs): end_time = time.time() # For normal returns, show the last line of the function log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{last_line_no}" - f"{context} (took {end_time - start_time:.2f}s)") + f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: _call_stack_depth -= 1 @@ -198,11 +204,12 @@ def wrapper(*args, **kwargs): except AttributeError: exc_line_no = last_line_no log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{exc_line_no}" - f"{context} with exception (took {end_time - start_time:.2f}s)") + f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper return decorator + def log_message(scope, level, msg, *args, logger=None, **kwargs): """ Log a message if either: @@ -256,7 +263,6 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): for handler in original_handlers: if handler not in log.handlers: log.addHandler(handler) - # Only use normal logging if scope is not enabled AND level is high enough elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): # Use normal logging with level check From 67e74f3dcd663d5dcd4174187670b4560adc3e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:14:32 +0200 Subject: [PATCH 064/218] add func entry/exit logging to EESSITask --- scripts/automated_ingestion/eessi_task.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6bbe498f..c3bc2acb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -2,7 +2,7 @@ from typing import Dict from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription -from utils import log_message, LoggingScope +from utils import log_message, LoggingScope, log_function_entry_exit from github import Github @@ -37,6 +37,7 @@ class EESSITask: state: TaskState git_repo: Github + @log_function_entry_exit() def __init__(self, description: EESSITaskDescription, git_repo: Github): self.description = description self.git_repo = git_repo @@ -54,6 +55,7 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): self.state = self._find_state() + @log_function_entry_exit() def _determine_task_action(self) -> EESSITaskAction: """ Determine the action type based on task description metadata. @@ -70,6 +72,7 @@ def _determine_task_action(self) -> EESSITaskAction: return EESSITaskAction.UPDATE return EESSITaskAction.UNKNOWN + @log_function_entry_exit() def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: """ Check if a file exists in a repository branch. @@ -95,6 +98,7 @@ def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: return False return False + @log_function_entry_exit() def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: """ Determines in which sequence numbers the metadata/task file is included and in which it is not. 
@@ -133,6 +137,7 @@ def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: continue return sequence_numbers + @log_function_entry_exit() def _find_state(self) -> TaskState: """ Determine the state of the task based on the task description metadata. @@ -167,6 +172,7 @@ def _find_state(self) -> TaskState: # did not find metadata file in staging repo on GitHub return TaskState.NEW + @log_function_entry_exit() def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskState: """ Get the state from the file in the metadata_file_state_path. @@ -179,6 +185,7 @@ def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskSt except ValueError: return TaskState.NEW + @log_function_entry_exit() def _list_directory_contents(self, directory_path, branch=None): try: # Get contents of the directory @@ -195,6 +202,7 @@ def _list_directory_contents(self, directory_path, branch=None): raise FileNotFoundError(f"Directory not found: {directory_path}") raise err + @log_function_entry_exit() def handle(self): """ Dynamically find and execute the appropriate handler based on action and state. @@ -221,36 +229,42 @@ def handle(self): print(f"No handler for action {self.action} and state {self.state} implemented; nothing to be done") # Implement handlers for ADD action + @log_function_entry_exit() def _handle_add_new(self): """Handler for ADD action in NEW state""" print("Handling ADD action in NEW state") # Implementation for adding in NEW state return True + @log_function_entry_exit() def _handle_add_staged(self): """Handler for ADD action in STAGED state""" print("Handling ADD action in STAGED state") # Implementation for adding in STAGED state return True + @log_function_entry_exit() def _handle_add_pr_opened(self): """Handler for ADD action in PR_OPENED state""" print("Handling ADD action in PR_OPENED state") # Implementation for adding in PR_OPENED state return True + @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" print("Handling ADD action in APPROVED state") # Implementation for adding in APPROVED state return True + @log_function_entry_exit() def _handle_add_ingested(self): """Handler for ADD action in INGESTED state""" print("Handling ADD action in INGESTED state") # Implementation for adding in INGESTED state return True + @log_function_entry_exit() def transition_to(self, new_state: TaskState): """ Transition the task to a new state if valid. 
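        For example, with the transitions defined in __init__, a task in state NEW may move to STAGED, while a request such as NEW -> APPROVED is invalid and transition_to() returns False.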
@@ -260,5 +274,6 @@ def transition_to(self, new_state: TaskState): return True return False + @log_function_entry_exit() def __str__(self): return f"EESSITask(description={self.description}, action={self.action}, state={self.state})" From c28469283748251ab9019b2379573642e6254fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:24:25 +0200 Subject: [PATCH 065/218] change logging scope name and log info in _find_state --- scripts/automated_ingestion/eessi_task.py | 6 +++++ scripts/automated_ingestion/eessitarball.py | 26 ++++++++++----------- scripts/automated_ingestion/utils.py | 4 ++-- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index c3bc2acb..63c56f5f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -148,9 +148,11 @@ def _find_state(self) -> TaskState: # obtain repo and pr from metadata repo = self.description.metadata['task']['repo'] pr = self.description.metadata['task']['pr'] + log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) # iterate over all sequence numbers in repo/pr dir sequence_numbers = self._determine_sequence_numbers_including_task_file() + log_message(LoggingScope.TASK_OPS, 'INFO', "sequence_numbers: %s", sequence_numbers) for sequence_number in [key for key, value in sequence_numbers.items() if value]: # create path to metadata file from repo, PR, repo, sequence number, metadata file name, state name # format of the metadata file name is: @@ -164,12 +166,16 @@ def _find_state(self) -> TaskState: # Later, we may switch to using task action files instead of metadata files. The format of the # SUFFIX would then be defined by the task action or the configuration file. version, component, os, architecture, timestamp, suffix = self.description.get_metadata_file_components() + log_msg = "version: %s, component: %s, os: %s, architecture: %s, timestamp: %s, suffix: %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, version, component, os, architecture, timestamp, suffix) metadata_file_name = f"eessi-{version}-{component}-{os}-{architecture}-{timestamp}.{suffix}" metadata_file_state_path = f"{repo}/{pr}/{sequence_number}/{metadata_file_name}" # get the state from the file in the metadata_file_state_path state = self._get_state_from_metadata_file(metadata_file_state_path) + log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) return state # did not find metadata file in staging repo on GitHub + log_message(LoggingScope.TASK_OPS, 'INFO', "did not find metadata file in staging repo on GitHub, state: NEW") return TaskState.NEW @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index cb4ae801..cc3c4ae4 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -106,7 +106,7 @@ def find_state(self): try: self.git_repo.get_contents(state + '/' + self.metadata_file) log_msg = "Found metadata file %s in state: %s" - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.metadata_file, state) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg, self.metadata_file, state) return state except github.UnknownObjectException: # no metadata file found in this state's directory, so keep searching... @@ -120,7 +120,7 @@ def find_state(self): log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!'
log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) return "unknown" - log_message(LoggingScope.STATE_CHANGE, 'INFO', "Tarball %s is new", self.metadata_file) + log_message(LoggingScope.STATE_OPS, 'INFO', "Tarball %s is new", self.metadata_file) return "new" def get_contents_overview(self): @@ -282,7 +282,7 @@ def verify_checksum(self): def ingest(self): """Process a tarball that is ready to be ingested by running the ingestion script.""" # TODO: check if there is an open issue for this tarball, and if there is, skip it. - log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Tarball %s is ready to be ingested.', self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', 'Tarball %s is ready to be ingested.', self.object) self.download() log_message(LoggingScope.VERIFICATION, 'INFO', 'Verifying its signature...') if not self.verify_signatures(): @@ -308,7 +308,7 @@ def ingest(self): script = self.config['paths']['ingestion_script'] sudo = ['sudo'] if self.config['cvmfs'].getboolean('ingest_as_root', True) else [] - log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Running the ingestion script for %s...', self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', 'Running the ingestion script for %s...', self.object) ingest_cmd = subprocess.run( sudo + [script, self.cvmfs_repo, self.local_path], stdout=subprocess.PIPE, @@ -334,38 +334,38 @@ def ingest(self): ) if self.issue_exists(issue_title, state='open'): log_msg = 'Failed to ingest %s, but an open issue already exists, skipping...' - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg, self.object) else: self.git_repo.create_issue(title=issue_title, body=issue_body) def print_ingested(self): """Process a tarball that has already been ingested.""" - log_message(LoggingScope.STATE_CHANGE, 'INFO', '%s has already been ingested, skipping...', self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', '%s has already been ingested, skipping...', self.object) @log_function_entry_exit() def mark_new_tarball_as_staged(self, branch=None): """Process a new tarball that was added to the staging bucket.""" next_state = self.next_state(self.state) log_msg = 'Found new tarball %s, downloading it...' - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg, self.object) # Download the tarball and its metadata file. # Use force as it may be a new attempt for an existing tarball that failed before. self.download(force=True) if not self.local_path or not self.local_metadata_path: log_msg = "Skipping tarball %s - download failed" - log_message(LoggingScope.STATE_CHANGE, 'WARNING', log_msg, self.object) + log_message(LoggingScope.STATE_OPS, 'WARNING', log_msg, self.object) return # Verify the signatures of the tarball and metadata file. 
if not self.verify_signatures(): log_msg = "Skipping tarball %s - signature verification failed" - log_message(LoggingScope.STATE_CHANGE, 'WARNING', log_msg, self.object) + log_message(LoggingScope.STATE_OPS, 'WARNING', log_msg, self.object) return # If no branch is provided, use the main branch target_branch = branch if branch else 'main' log_msg = "Adding metadata to '%s' folder in %s branch" - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, next_state, target_branch) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg, next_state, target_branch) file_path_staged = next_state + '/' + self.metadata_file contents = '' @@ -379,14 +379,14 @@ def print_rejected(self): def print_rejected(self): """Process a (rejected) tarball for which the corresponding PR has been closed without merging.""" - log_message(LoggingScope.STATE_CHANGE, 'INFO', "This tarball was rejected, so we're skipping it.") + log_message(LoggingScope.STATE_OPS, 'INFO', "This tarball was rejected, so we're skipping it.") # Do we want to delete rejected tarballs at some point? def print_unknown(self): """Process a tarball which has an unknown state.""" log_msg = "The state of this tarball could not be determined," log_msg += " so we're skipping it." - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg) def find_next_sequence_number(self, repo, pr_id): """Find the next available sequence number for staging PRs of a source PR.""" @@ -643,7 +643,7 @@ def extract_tarballs_from_pr_body(self, pr_body): def reject(self): """Reject a tarball for ingestion.""" # Let's move the tarball to the directory for rejected tarballs. - log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Marking tarball %s as rejected...', self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', 'Marking tarball %s as rejected...', self.object) next_state = 'rejected' self.move_metadata_file(self.state, next_state) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index ab1e2b2f..c5a80f0c 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -16,12 +16,12 @@ class LoggingScope(IntFlag): FUNC_ENTRY_EXIT = auto() # Function entry/exit logging DOWNLOAD = auto() # Logging related to file downloads VERIFICATION = auto() # Logging related to signature and checksum verification - STATE_CHANGE = auto() # Logging related to tarball state changes + STATE_OPS = auto() # Logging related to tarball state changes GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.)
GROUP_OPS = auto() # Logging related to tarball group operations ERROR = auto() # Error logging (separate from other scopes for easier filtering) DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) - ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_CHANGE | + ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS | GITHUB_OPS | GROUP_OPS | ERROR | DEBUG) From fbfc1ab44c6c84fdfb0655dbc6af584c457200c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:27:22 +0200 Subject: [PATCH 066/218] add missing scope for task ops --- scripts/automated_ingestion/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index c5a80f0c..32471fcd 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -16,13 +16,14 @@ class LoggingScope(IntFlag): FUNC_ENTRY_EXIT = auto() # Function entry/exit logging DOWNLOAD = auto() # Logging related to file downloads VERIFICATION = auto() # Logging related to signature and checksum verification - STATE_OPS = auto() # Logging related to tarball state changes + STATE_OPS = auto() # Logging related to tarball state operations GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) GROUP_OPS = auto() # Logging related to tarball group operations + TASK_OPS = auto() # Logging related to task operations ERROR = auto() # Error logging (separate from other scopes for easier filtering) DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS | - GITHUB_OPS | GROUP_OPS | ERROR | DEBUG) + GITHUB_OPS | GROUP_OPS | TASK_OPS | ERROR | DEBUG) # Global setting for logging scopes From 28a4745565b1864f4fecda3b64b84376bb02a7c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:32:18 +0200 Subject: [PATCH 067/218] add a bit more logging --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 63c56f5f..b4c0ea00 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -146,6 +146,7 @@ def _find_state(self) -> TaskState: The state of the task. 
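        The state is determined from the task's metadata file in the staging repository on GitHub; if no such file is found there, the task is considered NEW.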
""" # obtain repo and pr from metadata + log_message(LoggingScope.TASK_OPS, 'INFO', "finding state of task %s", self.description.task_object) repo = self.description.metadata['task']['repo'] pr = self.description.metadata['task']['pr'] log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) From d2b275d863385033a4aff67d19f8b070e7032118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:38:40 +0200 Subject: [PATCH 068/218] obtain repo/pr from task or link2pr OR raise ValueError --- scripts/automated_ingestion/eessi_task.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b4c0ea00..b6d4f162 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -147,8 +147,16 @@ def _find_state(self) -> TaskState: """ # obtain repo and pr from metadata log_message(LoggingScope.TASK_OPS, 'INFO', "finding state of task %s", self.description.task_object) - repo = self.description.metadata['task']['repo'] - pr = self.description.metadata['task']['pr'] + task = self.description.metadata['task'] if 'task' in self.description.metadata else None + link2pr = self.description.metadata['link2pr'] if 'link2pr' in self.description.metadata else None + if task: + repo = task['repo'] + pr = task['pr'] + elif link2pr: + repo = link2pr['repo'] + pr = link2pr['pr'] + else: + raise ValueError("no repo or pr found in metadata") log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) # iterate over all sequence numbers in repo/pr dir From ad7cb11c4eda34aa93375dda497586d48bea3c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:40:51 +0200 Subject: [PATCH 069/218] add logging for obtaining repo/pr from metadata --- scripts/automated_ingestion/eessi_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b6d4f162..05a571d3 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -150,9 +150,11 @@ def _find_state(self) -> TaskState: task = self.description.metadata['task'] if 'task' in self.description.metadata else None link2pr = self.description.metadata['link2pr'] if 'link2pr' in self.description.metadata else None if task: + log_message(LoggingScope.TASK_OPS, 'INFO', "task found in metadata: %s", task) repo = task['repo'] pr = task['pr'] elif link2pr: + log_message(LoggingScope.TASK_OPS, 'INFO', "link2pr found in metadata: %s", link2pr) repo = link2pr['repo'] pr = link2pr['pr'] else: From 11a23a0aad23195335416868022e278046e7df62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:53:03 +0200 Subject: [PATCH 070/218] populate source from metadata (link2pr) and use it in _find_state --- scripts/automated_ingestion/eessi_task.py | 14 +++++++------- .../automated_ingestion/eessi_task_description.py | 12 ++++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 05a571d3..626f3fb4 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -147,16 +147,16 @@ def _find_state(self) -> TaskState: """ # obtain repo and pr from metadata log_message(LoggingScope.TASK_OPS, 'INFO', "finding state of task %s", 
self.description.task_object) - task = self.description.metadata['task'] if 'task' in self.description.metadata else None - link2pr = self.description.metadata['link2pr'] if 'link2pr' in self.description.metadata else None - if task: + task = self.description.task + source = self.description.source + if 'repo' in task and 'pr' in task: log_message(LoggingScope.TASK_OPS, 'INFO', "task found in metadata: %s", task) repo = task['repo'] pr = task['pr'] - elif link2pr: - log_message(LoggingScope.TASK_OPS, 'INFO', "link2pr found in metadata: %s", link2pr) - repo = link2pr['repo'] - pr = link2pr['pr'] + elif 'repo' in source and 'pr' in source: + log_message(LoggingScope.TASK_OPS, 'INFO', "link2pr found in metadata: %s", source) + repo = source['repo'] + pr = source['pr'] else: raise ValueError("no repo or pr found in metadata") log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 271ff9a9..43da6139 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -21,6 +21,12 @@ class EESSITaskDescription: # Metadata from the task description file metadata: Dict[str, Any] = None + # task element + task: Dict[str, Any] = None + + # source element + source: Dict[str, Any] = None + @log_function_entry_exit() def __init__(self, task_object: EESSIDataAndSignatureObject): """ @@ -51,6 +57,12 @@ def __init__(self, task_object: EESSIDataAndSignatureObject): else: self.task = None + # check if the task file contains a link2pr field and add that to source element + if 'link2pr' in self.metadata: + self.source = self.metadata['link2pr'] + else: + self.source = None + @log_function_entry_exit() def _read_metadata(self) -> None: """ From 52d7bd388c64973ae6cc6ab2f1f32a8ecff4c672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:57:16 +0200 Subject: [PATCH 071/218] hand over repo and pr number as argument --- scripts/automated_ingestion/eessi_task.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 626f3fb4..f07308d5 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -99,10 +99,14 @@ def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: return False @log_function_entry_exit() - def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: + def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> Dict[int, bool]: """ Determines in which sequence numbers the metadata/task file is included and in which it is not. + Args: + repo: the repository name + pr: the pull request number + Returns: A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is included in that sequence number. @@ -120,8 +124,6 @@ def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: Note: this is a placeholder for now, as we do not know yet if we need to use a sequence number. 
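        Example (illustrative): if the staging repo contains directories REPO/PR/0 and REPO/PR/1 and only directory 1 contains this task's file, the result is {0: False, 1: True}.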
""" sequence_numbers = {} - repo = self.description.metadata['task']['repo'] - pr = self.description.metadata['task']['pr'] repo_pr_dir = f"{repo}/{pr}" # iterate over all directories under repo_pr_dir for dir in self._list_directory_contents(repo_pr_dir): @@ -154,7 +156,7 @@ def _find_state(self) -> TaskState: repo = task['repo'] pr = task['pr'] elif 'repo' in source and 'pr' in source: - log_message(LoggingScope.TASK_OPS, 'INFO', "link2pr found in metadata: %s", source) + log_message(LoggingScope.TASK_OPS, 'INFO', "source found in metadata: %s", source) repo = source['repo'] pr = source['pr'] else: @@ -162,7 +164,7 @@ def _find_state(self) -> TaskState: log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) # iterate over all sequence numbers in repo/pr dir - sequence_numbers = self._determine_sequence_numbers_including_task_file() + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) log_message(LoggingScope.TASK_OPS, 'INFO', "sequence_numbers: %s", sequence_numbers) for sequence_number in [key for key, value in sequence_numbers.items() if value]: # create path to metadata file from repo, PR, repo, sequence number, metadata file name, state name From d0be2eb3fe5f684fd32c5f210d838224885c3b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:01:12 +0200 Subject: [PATCH 072/218] fix Github exception issues --- scripts/automated_ingestion/eessi_task.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index f07308d5..8173926e 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -3,7 +3,7 @@ from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription from utils import log_message, LoggingScope, log_function_entry_exit -from github import Github +from github import Github, GithubException, UnknownObjectException class TaskState(Enum): @@ -84,10 +84,10 @@ def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: log_msg = "Found file %s in branch %s" log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path, branch) return True - except Github.UnknownObjectException: + except UnknownObjectException: # file_path does not exist in branch return False - except Github.GithubException as err: + except GithubException as err: if err.status == 404: # file_path does not exist in branch return False @@ -216,7 +216,7 @@ def _list_directory_contents(self, directory_path, branch=None): else: # If it's not a list, it means the path is not a directory raise ValueError(f"{directory_path} is not a directory") - except Github.GithubException as err: + except GithubException as err: if err.status == 404: raise FileNotFoundError(f"Directory not found: {directory_path}") raise err From 37ea68409f1cd5c1ef83ac76a52419be30216165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:06:36 +0200 Subject: [PATCH 073/218] add logging when listing dir contents --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8173926e..8eb9df08 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -208,6 +208,7 @@ def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskSt def 
_list_directory_contents(self, directory_path, branch=None): try: # Get contents of the directory + log_message(LoggingScope.TASK_OPS, 'INFO', "listing contents of %s in branch %s", directory_path, branch) contents = self.git_repo.get_contents(directory_path, ref=branch) From f170a257676d4b52c742f9348df9b5be60e3dcc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:08:27 +0200 Subject: [PATCH 074/218] set branch to default if needed --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8eb9df08..4a4d5322 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -208,6 +208,7 @@ def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskSt def _list_directory_contents(self, directory_path, branch=None): try: # Get contents of the directory + branch = self.git_repo.default_branch if branch is None else branch log_message(LoggingScope.TASK_OPS, 'INFO', "listing contents of %s in branch %s", directory_path, branch) contents = self.git_repo.get_contents(directory_path, ref=branch) From a795710c371236c8dbfaae28bd5786b64cdaf8eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:15:45 +0200 Subject: [PATCH 075/218] handle file not found exception --- scripts/automated_ingestion/eessi_task.py | 29 +++++++++++++++-------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 4a4d5322..7d004707 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -126,17 +126,26 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> sequence_numbers = {} repo_pr_dir = f"{repo}/{pr}" # iterate over all directories under repo_pr_dir - for dir in self._list_directory_contents(repo_pr_dir): - # check if the directory is a number - if dir.name.isdigit(): - remote_file_path = self.description.task_object.remote_file_path - if self._file_exists_in_repo_branch(f"{repo_pr_dir}/{dir.name}/{remote_file_path}"): - sequence_numbers[int(dir.name)] = True + try: + directories = self._list_directory_contents(repo_pr_dir) + for dir in directories: + # check if the directory is a number + if dir.name.isdigit(): + remote_file_path = self.description.task_object.remote_file_path + if self._file_exists_in_repo_branch(f"{repo_pr_dir}/{dir.name}/{remote_file_path}"): + sequence_numbers[int(dir.name)] = True + else: + sequence_numbers[int(dir.name)] = False else: - sequence_numbers[int(dir.name)] = False - else: - # directory is not a number, so we skip it - continue + # directory is not a number, so we skip it + continue + except FileNotFoundError: + # repo_pr_dir does not exist, so we return an empty dictionary + return {} + except GithubException as err: + if err.status != 404: # 404 is caught by FileNotFoundError + # some other error than the directory not existing + return {} return sequence_numbers @log_function_entry_exit() From c63208f1b653eeef36dd6ab1b82c1c6697dcd501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:20:26 +0200 Subject: [PATCH 076/218] fix element naming --- scripts/automated_ingestion/automated_ingestion.py | 8 ++++---- 1 file changed, 4
insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 12429e4a..7ece86cc 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -267,17 +267,17 @@ def main(): # TODO: update the information shown below (what makes sense to show?) # Log information about the task - task_object = task.task_description.task_object + task_object = task.description.task_object log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_object.local_file_path) log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_object.local_sig_path) log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", - task.task_description.signature_verified) + task.description.signature_verified) # Log the ETags of the downloaded task file - file_etag, sig_etag = task_object.get_etags() + file_etag, sig_etag = task.description.task_object.get_etags() log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", - task_object.remote_sig_path, sig_etag) + task.description.task_object.remote_sig_path, sig_etag) # TODO: Process the task file contents # This would involve reading the task file, parsing its contents, From 2c0673f9631d47b7aa9f6287a544e93a1099dcda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 19:45:33 +0200 Subject: [PATCH 077/218] first step towards obtaining payload when handling new task --- scripts/automated_ingestion/eessi_task.py | 18 +++++++++ .../automated_ingestion/eessi_task_payload.py | 40 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 scripts/automated_ingestion/eessi_task_payload.py diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7d004707..802cdc4d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,9 +1,12 @@ from enum import Enum, auto from typing import Dict +from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription +from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit from github import Github, GithubException, UnknownObjectException +import os class TaskState(Enum): @@ -33,6 +36,7 @@ def __str__(self): class EESSITask: description: EESSITaskDescription + payload: EESSITaskPayload action: EESSITaskAction state: TaskState git_repo: Github @@ -264,6 +268,20 @@ def _handle_add_new(self): """Handler for ADD action in NEW state""" print("Handling ADD action in NEW state") # Implementation for adding in NEW state + # get name of payload from metadata + payload_name = self.description.metadata['payload']['filename'] + # get config and remote_client from self.description.task_object + config = self.description.task_object.config + remote_client = self.description.task_object.remote_client + # determine remote_file_path by replacing basename of remote_file_path in self.description.task_object + # with payload_name + description_remote_file_path = self.description.task_object.remote_file_path + payload_remote_file_path = os.path.join(os.path.dirname(description_remote_file_path), payload_name) + # initialize payload object + payload_object = 
EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) + self.payload = EESSITaskPayload(payload_object) + log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) + return True @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py new file mode 100644 index 00000000..bba630fe --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass + +from eessi_data_object import EESSIDataAndSignatureObject +from utils import log_function_entry_exit +from remote_storage import DownloadMode + + +@dataclass +class EESSITaskPayload: + """Class representing an EESSI task payload (tarball/artifact) and its signature.""" + + # The EESSI data and signature object associated with this payload + payload_object: EESSIDataAndSignatureObject + + # Whether the signature was successfully verified + signature_verified: bool = False + + # possibly at a later point in time, we will add inferred metadata here + # such as the prefix in a tarball, the main elements, or which software + # package it includes + + @log_function_entry_exit() + def __init__(self, payload_object: EESSIDataAndSignatureObject): + """ + Initialize an EESSITaskPayload object. + + Args: + payload_object: The EESSI data and signature object associated with this payload + """ + self.payload_object = payload_object + + # Download the payload and its signature + self.payload_object.download(mode=DownloadMode.CHECK_REMOTE) + + # Verify signature + self.signature_verified = self.payload_object.verify_signature() + + def __str__(self) -> str: + """Return a string representation of the EESSITaskPayload object.""" + return f"EESSITaskPayload({self.payload_object.local_file_path}, verified={self.signature_verified})" From 7bcf5eb724808dac1bcbc7f64b1786e1f9f65d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 19:46:46 +0200 Subject: [PATCH 078/218] add a bit of logging when creating payload instance --- scripts/automated_ingestion/eessi_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 802cdc4d..8271c39f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -270,6 +270,7 @@ def _handle_add_new(self): # Implementation for adding in NEW state # get name of payload from metadata payload_name = self.description.metadata['payload']['filename'] + log_message(LoggingScope.TASK_OPS, 'INFO', "payload_name: %s", payload_name) # get config and remote_client from self.description.task_object config = self.description.task_object.config remote_client = self.description.task_object.remote_client @@ -277,6 +278,7 @@ def _handle_add_new(self): # with payload_name description_remote_file_path = self.description.task_object.remote_file_path payload_remote_file_path = os.path.join(os.path.dirname(description_remote_file_path), payload_name) + log_message(LoggingScope.TASK_OPS, 'INFO', "payload_remote_file_path: %s", payload_remote_file_path) # initialize payload object payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) self.payload = EESSITaskPayload(payload_object) From 795da23839c29e1eae4d3186b127b8dd98bbeeed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 19:54:38 +0200 Subject: [PATCH 079/218] code formatting
improvements --- scripts/automated_ingestion/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 32471fcd..4b867764 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -16,12 +16,12 @@ class LoggingScope(IntFlag): FUNC_ENTRY_EXIT = auto() # Function entry/exit logging DOWNLOAD = auto() # Logging related to file downloads VERIFICATION = auto() # Logging related to signature and checksum verification - STATE_OPS = auto() # Logging related to tarball state operations + STATE_OPS = auto() # Logging related to tarball state operations GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) GROUP_OPS = auto() # Logging related to tarball group operations - TASK_OPS = auto() # Logging related to task operations - ERROR = auto() # Error logging (separate from other scopes for easier filtering) - DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) + TASK_OPS = auto() # Logging related to task operations + ERROR = auto() # Error logging (separate from other scopes for easier filtering) + DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS | GITHUB_OPS | GROUP_OPS | TASK_OPS | ERROR | DEBUG) From 4f148240b16c02d45fe4f11b665c17e1c2a86b6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 20:13:40 +0200 Subject: [PATCH 080/218] build up path to store task file in staging repo --- scripts/automated_ingestion/eessi_task.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8271c39f..3e6aa48a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -48,6 +48,7 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): self.action = self._determine_task_action() # Define valid state transitions for all actions + # NOTE, TaskState.APPROVED must be the first element or _next_state() will not work self.valid_transitions = { TaskState.NEW: [TaskState.STAGED], TaskState.STAGED: [TaskState.PR_OPENED], @@ -236,6 +237,16 @@ def _list_directory_contents(self, directory_path, branch=None): raise FileNotFoundError(f"Directory not found: {directory_path}") raise err + @log_function_entry_exit() + def _next_state(self) -> TaskState: + """ + Determine the next state based on the current state using the valid_transitions dictionary. + + NOTE, it assumes that function is only called for non-terminal states and that the next state is the first + element of the list returned by the valid_transitions dictionary. 
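+        For example, NEW -> STAGED and STAGED -> PR_OPENED; where a state has several valid successors, the first one listed is returned (hence the NOTE in __init__ that TaskState.APPROVED must come first in its list).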
+ """ + return self.valid_transitions[self.state][0] + @log_function_entry_exit() def handle(self): """ @@ -283,7 +294,13 @@ def _handle_add_new(self): payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) - + # determine next state (NEXT_STATE), put metadata/task file into GH staging repo in main branch under directory + # REPO/PR_NUM/SEQ_NUM/payload_name.NEXT_STATE + next_state = self._next_state() + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + repo_pr_dir = f"{self.description.task_object.repo}/{self.description.task_object.pr}" + staging_repo_path = f"{repo_pr_dir}/{payload_name}.{next_state}" + log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) return True @log_function_entry_exit() From 14762ef1c7aeb96814d1853e7f84ece15216e24a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 21:48:32 +0200 Subject: [PATCH 081/218] add functions to determine repo name and pr number --- .../eessi_task_description.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 43da6139..4e5d638e 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -88,6 +88,7 @@ def _read_metadata(self) -> None: self.task_object.local_file_path, str(err)) raise + @log_function_entry_exit() def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: """ Get the components of the metadata file name. @@ -123,6 +124,27 @@ def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: timestamp = components[-1] return version, component, os, architecture, timestamp, suffix + @log_function_entry_exit() + def get_pr_number(self) -> str: + """ + Get the PR number from the task description / metadata file. + """ + if self.source and 'pr' in self.source: + return self.source['pr'] + else: + return '0' + + @log_function_entry_exit() + def get_repo_name(self) -> str: + """ + Get the repository name from the task description / metadata file. 
+ """ + if self.source and 'repo' in self.source: + return self.source['repo'] + else: + return 'None' + + @log_function_entry_exit() def __str__(self) -> str: """Return a string representation of the EESSITaskDescription object.""" return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" From ffae1cfc75f9d0ff0b5ec66202ea13148ba07a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 21:50:56 +0200 Subject: [PATCH 082/218] use functions to determine repo name and pr number --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 3e6aa48a..b45c176e 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -298,7 +298,7 @@ def _handle_add_new(self): # REPO/PR_NUM/SEQ_NUM/payload_name.NEXT_STATE next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) - repo_pr_dir = f"{self.description.task_object.repo}/{self.description.task_object.pr}" + repo_pr_dir = f"{self.description.get_repo_name()}/{self.description.get_pr_number()}" staging_repo_path = f"{repo_pr_dir}/{payload_name}.{next_state}" log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) return True From 066ad16674eba6cac4ab0c60af040ec6fb7b82f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 23:55:52 +0200 Subject: [PATCH 083/218] add metadata / task file to GH staging repo --- scripts/automated_ingestion/eessi_task.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b45c176e..ae48a14c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -298,9 +298,15 @@ def _handle_add_new(self): # REPO/PR_NUM/SEQ_NUM/payload_name.NEXT_STATE next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) - repo_pr_dir = f"{self.description.get_repo_name()}/{self.description.get_pr_number()}" + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + repo_pr_dir = f"{repo_name}/{pr_number}" staging_repo_path = f"{repo_pr_dir}/{payload_name}.{next_state}" log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) + # contents of task description / metadata file + contents = self.description.get_contents() + self.git_repo.create_file(staging_repo_path, f"new task for {repo_name} PR {pr_number} add build for arch" , + contents) return True @log_function_entry_exit() From d671c827f8c9a8fa07f8adf9763f3606aa493b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 00:22:13 +0200 Subject: [PATCH 084/218] add function to return raw contents of metadata / task file --- scripts/automated_ingestion/eessi_task_description.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 4e5d638e..686f1b90 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -76,7 +76,8 @@ def _read_metadata(self) -> None: try: with open(self.task_object.local_file_path, 'r') as 
file: - self.metadata = json.load(file) + self.raw_contents = file.read() + self.metadata = json.loads(self.raw_contents) log_message(LoggingScope.DEBUG, 'DEBUG', "Successfully read metadata from %s", self.task_object.local_file_path) except json.JSONDecodeError as err: @@ -88,6 +89,13 @@ def _read_metadata(self) -> None: self.task_object.local_file_path, str(err)) raise + @log_function_entry_exit() + def get_contents(self) -> str: + """ + Get the contents of the task description / metadata file. + """ + return self.raw_contents + @log_function_entry_exit() def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: """ From 14d62e5ec896ca364bd5bac6edb4b6b0a6f61d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 09:39:04 +0200 Subject: [PATCH 085/218] include sequence number in path and various improvements --- scripts/automated_ingestion/eessi_task.py | 135 +++++++++++------- .../eessi_task_description.py | 45 ++++-- 2 files changed, 117 insertions(+), 63 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ae48a14c..352b7dae 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,5 +1,5 @@ from enum import Enum, auto -from typing import Dict +from typing import Dict, List from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription @@ -78,23 +78,44 @@ def _determine_task_action(self) -> EESSITaskAction: return EESSITaskAction.UNKNOWN @log_function_entry_exit() - def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: + def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch=None) -> bool: """ Check if a file exists in a repository branch. 
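+        More precisely, it checks whether any file in the directory part of file_path_prefix has a path that starts with the given prefix.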
+ + Args: + file_path_prefix: the prefix of the file path + branch: the branch to check + + Returns: + True if a file with the prefix exists in the branch, False otherwise """ if branch is None: branch = self.git_repo.default_branch try: - self.git_repo.get_contents(file_path, ref=branch) - log_msg = "Found file %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path, branch) - return True + # get all files in directory part of file_path_prefix + directory_part = os.path.dirname(file_path_prefix) + files = self.git_repo.get_contents(directory_part, ref=branch) + log_msg = "Found files %s in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch) + # check if any of the files has file_path_prefix as prefix + for file in files: + if file.path.startswith(file_path_prefix): + log_msg = "Found file %s in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch) + return True + log_msg = "No file with prefix %s found in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch) + return False except UnknownObjectException: # file_path does not exist in branch + log_msg = "Directory %s or file with prefix %s does not exist in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch) return False except GithubException as err: if err.status == 404: # file_path does not exist in branch + log_msg = "Directory %s or file with prefix %s does not exist in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch) return False else: # if there was some other (e.g. connection) issue, log message and return False @@ -136,8 +157,10 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> for dir in directories: # check if the directory is a number if dir.name.isdigit(): + # determin if a state file with prefix exists in the sequence number directory remote_file_path = self.description.task_object.remote_file_path - if self._file_exists_in_repo_branch(f"{repo_pr_dir}/{dir.name}/{remote_file_path}"): + state_file_name_prefix = f"{repo_pr_dir}/{dir.name}/{remote_file_path}" + if self._state_file_with_prefix_exists_in_repo_branch(state_file_name_prefix): sequence_numbers[int(dir.name)] = True else: sequence_numbers[int(dir.name)] = False @@ -153,6 +176,15 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> return {} return sequence_numbers + @log_function_entry_exit() + def _find_highest_number(self, str_list: List[str]) -> int: + """ + Find the highest number in a list of strings. 
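+        Example: _find_highest_number(["0", "2", "10"]) returns 10.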
+ """ + # Convert all strings to integers + int_list = [int(num) for num in str_list] + return max(int_list) + @log_function_entry_exit() def _find_state(self) -> TaskState: """ @@ -163,60 +195,46 @@ def _find_state(self) -> TaskState: """ # obtain repo and pr from metadata log_message(LoggingScope.TASK_OPS, 'INFO', "finding state of task %s", self.description.task_object) - task = self.description.task - source = self.description.source - if 'repo' in task and 'pr' in task: - log_message(LoggingScope.TASK_OPS, 'INFO', "task found in metadata: %s", task) - repo = task['repo'] - pr = task['pr'] - elif 'repo' in source and 'pr' in source: - log_message(LoggingScope.TASK_OPS, 'INFO', "source found in metadata: %s", source) - repo = source['repo'] - pr = source['pr'] - else: - raise ValueError("no repo or pr found in metadata") + repo = self.description.get_repo_name() + pr = self.description.get_pr_number() log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) - # iterate over all sequence numbers in repo/pr dir + # obtain all sequence numbers in repo/pr dir which include a state file for this task sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) - log_message(LoggingScope.TASK_OPS, 'INFO', "sequence_numbers: %s", sequence_numbers) - for sequence_number in [key for key, value in sequence_numbers.items() if value]: - # create path to metadata file from repo, PR, repo, sequence number, metadata file name, state name - # format of the metadata file name is: - # eessi-VERSION-COMPONENT-OS-ARCHITECTURE-TIMESTAMP.SUFFIX - # all uppercase words are placeholders - # all placeholders (except ARCHITECTURE) do not include any hyphens - # ARCHITECTURE can include one to two hyphens - # The SUFFIX is composed of two parts: TARBALLSUFFIX and METADATASUFFIX - # TARBALLSUFFIX is defined by the task object or in the configuration file - # METADATASUFFIX is defined by the task object or in the configuration file - # Later, we may switch to using task action files instead of metadata files. The format of the - # SUFFIX would then be defined by the task action or the configuration file. - version, component, os, architecture, timestamp, suffix = self.description.get_metadata_file_components() - log_msg = "version: %s, component: %s, os: %s, architecture: %s, timestamp: %s, suffix: %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, version, component, os, architecture, timestamp, suffix) - metadata_file_name = f"eessi-{version}-{component}-{os}-{architecture}-{timestamp}.{suffix}" - metadata_file_state_path = f"{repo}/{pr}/{sequence_number}/{metadata_file_name}" - # get the state from the file in the metadata_file_state_path - state = self._get_state_from_metadata_file(metadata_file_state_path) - log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) - return state - # did not find metadata file in staging repo on GitHub - log_message(LoggingScope.TASK_OPS, 'INFO', "did not find metadata file in staging repo on GitHub, state: NEW") - return TaskState.NEW + if len(sequence_numbers) == 0: + # no sequence numbers found, so we return NEW + log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence numbers found, state: NEW") + return TaskState.NEW + # because a new sequence number is only created after the previous staging PR has been approved or rejected, + # we need to check if the processing of the highest sequence number is finished. 
+ highest_sequence_number = self._find_highest_number(sequence_numbers.keys()) + # we obtain the state from the file in the highest sequence number directory + # TODO: verify if the state matches other information, e.g. the state of the staging PR + # for now, we assume that the state is correct + task_file_name = self.description.get_task_file_name() + metadata_file_state_path_prefix = f"{repo}/{pr}/{highest_sequence_number}/{task_file_name}." + state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix) + log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) + return state @log_function_entry_exit() - def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskState: + def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str) -> TaskState: """ - Get the state from the file in the metadata_file_state_path. + Get the state from the file in the metadata_file_state_path_prefix. """ - # get contents of metadata_file_state_path - contents = self.git_repo.get_contents(metadata_file_state_path) - try: - state = TaskState.from_string(contents.name) + # first get all files in directory part of metadata_file_state_path_prefix + directory_part = os.path.dirname(metadata_file_state_path_prefix) + files = self._list_directory_contents(directory_part) + # check if any of the files has metadata_file_state_path_prefix as prefix + for file in files: + if file.path.startswith(metadata_file_state_path_prefix): + # get state from file name taking only the suffix + state = TaskState.from_string(file.name.split('.')[-1]) return state - except ValueError: - return TaskState.NEW + # did not find any file with metadata_file_state_path_prefix as prefix + log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s", + metadata_file_state_path_prefix) + return TaskState.NEW @log_function_entry_exit() def _list_directory_contents(self, directory_path, branch=None): @@ -301,12 +319,19 @@ def _handle_add_new(self): repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() repo_pr_dir = f"{repo_name}/{pr_number}" - staging_repo_path = f"{repo_pr_dir}/{payload_name}.{next_state}" + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) + if len(sequence_numbers) == 0: + sequence_number = 0 + else: + sequence_number = self._find_highest_number(sequence_numbers.keys()) + staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{payload_name}.{next_state}" log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) # contents of task description / metadata file contents = self.description.get_contents() - self.git_repo.create_file(staging_repo_path, f"new task for {repo_name} PR {pr_number} add build for arch" , + self.git_repo.create_file(staging_repo_path, + f"new task for {repo_name} PR {pr_number} seq {sequence_number}: add build for arch", contents) + self.state = next_state return True @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 686f1b90..c8e627ab 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -132,25 +132,54 @@ def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: timestamp = components[-1] return version, component, os, architecture, timestamp, suffix + @log_function_entry_exit() + def get_metadata_value(self, 
key: str) -> str: + """ + Get the value of a key from the task description / metadata file. + """ + # check that key is defined and has a length > 0 + if not key or len(key) == 0: + raise ValueError("get_metadata_value: key is not defined or has a length of 0") + + value = None + task = self.description.task + source = self.description.source + if task and 'repo' in task and key in task['repo']: + value = task['repo'][key] + log_message(LoggingScope.TASK_OPS, 'INFO', + f"Value '{value}' for key {key} found in information from task metadata: {task}") + elif source and 'repo' in source and key in source['repo']: + value = source['repo'][key] + log_message(LoggingScope.TASK_OPS, 'INFO', + f"Value '{value}' for key {key} found in information from source metadata: {source}") + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + f"Value '{value}' for key {key} neither found in task metadata nor source metadata") + raise ValueError(f"Value '{value}' for key {key} neither found in task metadata nor source metadata") + return value + @log_function_entry_exit() def get_pr_number(self) -> str: """ Get the PR number from the task description / metadata file. """ - if self.source and 'pr' in self.source: - return self.source['pr'] - else: - return '0' + return self.get_metadata_value('pr') @log_function_entry_exit() def get_repo_name(self) -> str: """ Get the repository name from the task description / metadata file. """ - if self.source and 'repo' in self.source: - return self.source['repo'] - else: - return 'None' + return self.get_metadata_value('repo') + + @log_function_entry_exit() + def get_task_file_name(self) -> str: + """ + Get the file name from the task description / metadata file. + """ + # get file name from remote file path using basename + file_name = Path(self.task_object.remote_file_path).name + return file_name @log_function_entry_exit() def __str__(self) -> str: From 2357abc425f7a0ed9cb73b2b6b336d54ecc404f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 09:51:56 +0200 Subject: [PATCH 086/218] fix issue with non-existing class element --- scripts/automated_ingestion/eessi_task_description.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index c8e627ab..f847c29e 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -142,8 +142,8 @@ def get_metadata_value(self, key: str) -> str: raise ValueError("get_metadata_value: key is not defined or has a length of 0") value = None - task = self.description.task - source = self.description.source + task = self.task + source = self.source if task and 'repo' in task and key in task['repo']: value = task['repo'][key] log_message(LoggingScope.TASK_OPS, 'INFO', From 370b6a6759fcb31d077dffd9361277edb79828f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 10:07:25 +0200 Subject: [PATCH 087/218] add a bit more logging when obtaining value from metadata --- scripts/automated_ingestion/eessi_task_description.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index f847c29e..3bababe5 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -144,17 +144,19 @@ def 
get_metadata_value(self, key: str) -> str: value = None task = self.task source = self.source + log_message(LoggingScope.TASK_OPS, 'INFO', + f"checking if either task ({task}) or source ({source}) contains information for key '{key}'") if task and 'repo' in task and key in task['repo']: value = task['repo'][key] log_message(LoggingScope.TASK_OPS, 'INFO', - f"Value '{value}' for key {key} found in information from task metadata: {task}") + f"Value '{value}' for key '{key}' found in information from task metadata: {task}") elif source and 'repo' in source and key in source['repo']: value = source['repo'][key] log_message(LoggingScope.TASK_OPS, 'INFO', - f"Value '{value}' for key {key} found in information from source metadata: {source}") + f"Value '{value}' for key '{key}' found in information from source metadata: {source}") else: log_message(LoggingScope.TASK_OPS, 'INFO', - f"Value '{value}' for key {key} neither found in task metadata nor source metadata") + f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") raise ValueError(f"Value '{value}' for key {key} neither found in task metadata nor source metadata") return value From 6dcb675e64239c86e67233ac2ba25577d68c3765 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 10:14:04 +0200 Subject: [PATCH 088/218] show data types of task and source elements --- scripts/automated_ingestion/eessi_task_description.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 3bababe5..91691223 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -144,8 +144,9 @@ def get_metadata_value(self, key: str) -> str: value = None task = self.task source = self.source - log_message(LoggingScope.TASK_OPS, 'INFO', - f"checking if either task ({task}) or source ({source}) contains information for key '{key}'") + log_msg = f"checking if either task ({task}, type {type(task)}) or" + log_msg += f" source ({source}, type {type(source)}) contains information for key '{key}'" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg) if task and 'repo' in task and key in task['repo']: value = task['repo'][key] log_message(LoggingScope.TASK_OPS, 'INFO', @@ -157,7 +158,7 @@ def get_metadata_value(self, key: str) -> str: else: log_message(LoggingScope.TASK_OPS, 'INFO', f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") - raise ValueError(f"Value '{value}' for key {key} neither found in task metadata nor source metadata") + raise ValueError(f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") return value @log_function_entry_exit() From 057a9535ccb134ad9efc556f28eeb0f4321686a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 10:18:45 +0200 Subject: [PATCH 089/218] fix logic to obtain value for key from task or source --- .../eessi_task_description.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 91691223..5ff4c196 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -144,21 +144,19 @@ def get_metadata_value(self, key: str) -> str: value = None task = self.task source = 
self.source - log_msg = f"checking if either task ({task}, type {type(task)}) or" - log_msg += f" source ({source}, type {type(source)}) contains information for key '{key}'" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg) - if task and 'repo' in task and key in task['repo']: - value = task['repo'][key] + # check if key is in task or source + if task and key in task: + value = task[key] log_message(LoggingScope.TASK_OPS, 'INFO', f"Value '{value}' for key '{key}' found in information from task metadata: {task}") - elif source and 'repo' in source and key in source['repo']: - value = source['repo'][key] + elif source and key in source: + value = source[key] log_message(LoggingScope.TASK_OPS, 'INFO', f"Value '{value}' for key '{key}' found in information from source metadata: {source}") else: log_message(LoggingScope.TASK_OPS, 'INFO', - f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") - raise ValueError(f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") + f"Value for key '{key}' neither found in task metadata nor source metadata") + raise ValueError(f"Value for key '{key}' neither found in task metadata nor source metadata") return value @log_function_entry_exit() From 167cff29dc9c056c4dcd51a1666fdb1b66669531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 11:12:30 +0200 Subject: [PATCH 090/218] use basename of remote_file_path --- scripts/automated_ingestion/eessi_task.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 352b7dae..8830e34a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -157,9 +157,10 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> for dir in directories: # check if the directory is a number if dir.name.isdigit(): - # determin if a state file with prefix exists in the sequence number directory - remote_file_path = self.description.task_object.remote_file_path - state_file_name_prefix = f"{repo_pr_dir}/{dir.name}/{remote_file_path}" + # determine if a state file with prefix exists in the sequence number directory + # we need to use the basename of the remote file path + remote_file_path_basename = os.path.basename(self.description.task_object.remote_file_path) + state_file_name_prefix = f"{repo_pr_dir}/{dir.name}/{remote_file_path_basename}" if self._state_file_with_prefix_exists_in_repo_branch(state_file_name_prefix): sequence_numbers[int(dir.name)] = True else: From d700826bd16e28aef98873178aa9092748bcec36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 11:26:28 +0200 Subject: [PATCH 091/218] use task file name for storing state file in GH staging repo --- scripts/automated_ingestion/eessi_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8830e34a..703f87b0 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -325,7 +325,9 @@ def _handle_add_new(self): sequence_number = 0 else: sequence_number = self._find_highest_number(sequence_numbers.keys()) - staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{payload_name}.{next_state}" + # we use the basename of the remote file path for the task description file + task_file_name = 
self.description.get_task_file_name()
+        staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{task_file_name}.{next_state}"
         log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path)
         # contents of task description / metadata file
         contents = self.description.get_contents()

From a9dad6888e8aae54950b59daa3de7701e5c1c962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20R=C3=B6blitz?=
Date: Sat, 17 May 2025 12:02:49 +0200
Subject: [PATCH 092/218] fix indentation

---
 scripts/automated_ingestion/eessi_task.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 703f87b0..7806b508 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -231,7 +231,8 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s
             if file.path.startswith(metadata_file_state_path_prefix):
                 # get state from file name taking only the suffix
                 state = TaskState.from_string(file.name.split('.')[-1])
-                return state
+                log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state)
+                return state
         # did not find any file with metadata_file_state_path_prefix as prefix
         log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s",
                     metadata_file_state_path_prefix)

From bc6423b68523b81931b0db5b4a8b8253d86e2f1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20R=C3=B6blitz?=
Date: Sun, 18 May 2025 00:43:48 +0200
Subject: [PATCH 093/218] various improvements for determining the state of a task

---
 scripts/automated_ingestion/eessi_task.py | 94 +++++++++++++++++------
 1 file changed, 70 insertions(+), 24 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 7806b508..26760dea 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -128,6 +128,9 @@ def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, b
     def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> Dict[int, bool]:
         """
         Determines in which sequence numbers the metadata/task file is included and in which it is not.
+        NOTE, we only need to check the default branch of the repository, because for a new task a file
+        is added to the default branch and for the subsequent processing of the task we use a different branch.
+        Thus, until the PR is closed, the task file stays in the default branch.

         Args:
             repo: the repository name
@@ -206,37 +209,60 @@ def _find_state(self) -> TaskState:
             # no sequence numbers found, so we return NEW
             log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence numbers found, state: NEW")
             return TaskState.NEW
-        # because a new sequence number is only created after the previous staging PR has been approved or rejected,
-        # we need to check if the processing of the highest sequence number is finished.
-        highest_sequence_number = self._find_highest_number(sequence_numbers.keys())
-        # we obtain the state from the file in the highest sequence number directory
-        # TODO: verify if the state matches other information, e.g. 
the state of the staging PR - # for now, we assume that the state is correct + # we got at least one sequence number + # if one value for a sequence number is True, we can determine the state from the file in the directory + sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] + if len(sequence_including_task) == 0: + # no sequence number includes the task file, so we return NEW + log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence number includes the task file, state: NEW") + return TaskState.NEW + # we got at least one sequence number which includes the task file + # we can determine the state from the filename in the directory + # NOTE, we use the first element in sequence_including_task (there should be only one) + # we ignore other elements in sequence_including_task + sequence_number = sequence_including_task[0] task_file_name = self.description.get_task_file_name() - metadata_file_state_path_prefix = f"{repo}/{pr}/{highest_sequence_number}/{task_file_name}." - state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix) + metadata_file_state_path_prefix = f"{repo}/{pr}/{sequence_number}/{task_file_name}." + state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix, sequence_number) log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) return state @log_function_entry_exit() - def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str) -> TaskState: + def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str, + sequence_number: int) -> TaskState: """ Get the state from the file in the metadata_file_state_path_prefix. """ - # first get all files in directory part of metadata_file_state_path_prefix + # depending on the state of the deployment (NEW, STAGED, PR_OPENED, APPROVED, REJECTED, INGESTED) + # we need to check the task file in the default branch or in the branch corresponding to the sequence number directory_part = os.path.dirname(metadata_file_state_path_prefix) - files = self._list_directory_contents(directory_part) - # check if any of the files has metadata_file_state_path_prefix as prefix - for file in files: - if file.path.startswith(metadata_file_state_path_prefix): - # get state from file name taking only the suffix - state = TaskState.from_string(file.name.split('.')[-1]) - log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) - return state - # did not find any file with metadata_file_state_path_prefix as prefix - log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s", - metadata_file_state_path_prefix) - return TaskState.NEW + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + default_branch_name = self.git_repo.default_branch + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + all_branch_names = [branch.name for branch in self.git_repo.get_branches()] + states = [] + for branch in [default_branch_name, branch_name]: + if branch in all_branch_names: + # first get all files in directory part of metadata_file_state_path_prefix + files = self._list_directory_contents(directory_part, branch) + # check if any of the files has metadata_file_state_path_prefix as prefix + for file in files: + if file.path.startswith(metadata_file_state_path_prefix): + # get state from file name taking only the suffix + state = TaskState.from_string(file.name.split('.')[-1]) + log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) 
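+                        # illustrative note (hypothetical names): a state file such as
+                        #   'EESSI/software-layer/42/0/task-1234.meta.txt.STAGED'
+                        # encodes the task state purely in the suffix after the last '.',
+                        # which from_string() maps back to TaskState.STAGED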
+ states.append(state) + if len(states) == 0: + # did not find any file with metadata_file_state_path_prefix as prefix + log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s", + metadata_file_state_path_prefix) + return TaskState.NEW + # sort the states and return the last one + states.sort() + state = states[-1] + log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) + return state @log_function_entry_exit() def _list_directory_contents(self, directory_path, branch=None): @@ -298,7 +324,7 @@ def handle(self): def _handle_add_new(self): """Handler for ADD action in NEW state""" print("Handling ADD action in NEW state") - # Implementation for adding in NEW state + # Implementation for adding in NEW state: a task is only NEW if it was not processed yet # get name of of payload from metadata payload_name = self.description.metadata['payload']['filename'] log_message(LoggingScope.TASK_OPS, 'INFO', "payload_name: %s", payload_name) @@ -315,7 +341,7 @@ def _handle_add_new(self): self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) # determine next state (NEXT_STATE), put metadata/task file into GH staging repo in main branch under directory - # REPO/PR_NUM/SEQ_NUM/payload_name.NEXT_STATE + # REPO/PR_NUM/SEQ_NUM/task_file_name.NEXT_STATE next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) repo_name = self.description.get_repo_name() @@ -325,7 +351,18 @@ def _handle_add_new(self): if len(sequence_numbers) == 0: sequence_number = 0 else: + # we need to figure out the status of the last deployment (with the highest sequence number) + # if a PR exists and it is closed, we add the task to the *next* higher sequence number + # otherwise we add the task to the highest sequence number sequence_number = self._find_highest_number(sequence_numbers.keys()) + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + if branch_name in [branch.name for branch in self.git_repo.get_branches()]: + # branch exists, check if PR exists + find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state='all')] + if find_pr: + pr = find_pr.pop(0) + if pr.state == 'closed': + sequence_number += 1 # we use the basename of the remote file path for the task description file task_file_name = self.description.get_task_file_name() staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{task_file_name}.{next_state}" @@ -343,6 +380,15 @@ def _handle_add_staged(self): """Handler for ADD action in STAGED state""" print("Handling ADD action in STAGED state") # Implementation for adding in STAGED state + # construct supposed branch name + # check if branch exists + # - yes: check if corresponding PR exists + # - yes: check status of PR + # - open: rename file and add it to branch, set state, update PR contents, return + # - closed && !merged: rename file to rejected, set state + # - else: weird state, log message, return + # - no: delete branch + # create new branch, add task file to branch, set state, create PR, update PR contents, return return True @log_function_entry_exit() From 092f32b7e0a86b25acc6ea9d33f15c365eca7e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 22 May 2025 21:42:44 +0200 Subject: [PATCH 094/218] add functions to handle sequences of deployments --- scripts/automated_ingestion/eessi_task.py | 157 +++++++++++++++++++++- 1 file changed, 156 insertions(+), 1 deletion(-) diff --git 
a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 26760dea..8a65c84b 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -9,6 +9,12 @@
 import os
 
 
+class SequenceStatus(Enum):
+    DOES_NOT_EXIST = auto()
+    IN_PROGRESS = auto()
+    FINISHED = auto()
+
+
 class TaskState(Enum):
     NEW = auto()    # The task has been created but not yet processed
     STAGED = auto()  # The task has been staged to the Stratum-0
@@ -189,6 +195,112 @@ def _find_highest_number(self, str_list: List[str]) -> int:
         int_list = [int(num) for num in str_list]
         return max(int_list)
 
+    @log_function_entry_exit()
+    def _get_sequence_number_for_task_file(self) -> int:
+        """
+        Get the sequence number this task is assigned to at the moment.
+        NOTE, should only be called if the task is actually assigned to a sequence number.
+        """
+        repo_name = self.description.get_repo_name()
+        pr_number = self.description.get_pr_number()
+        sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number)
+        if len(sequence_numbers) == 0:
+            raise ValueError("Found no sequence numbers at all")
+        else:
+            # get all entries with value True, there should be only one, so we return the first one
+            sequence_numbers_true = [key for key, value in sequence_numbers.items() if value is True]
+            if len(sequence_numbers_true) == 0:
+                raise ValueError(f"Found no sequence numbers that include the task file for task {self.description}")
+            else:
+                return sequence_numbers_true[0]
+
+    @log_function_entry_exit()
+    def _get_current_sequence_number(self, sequence_numbers: Dict[int, bool] = None) -> int:
+        """
+        Get the current sequence number based on the sequence numbers.
+        If sequence_numbers is not provided, we determine the sequence numbers from the task description.
+        """
+        if sequence_numbers is None:
+            repo_name = self.description.get_repo_name()
+            pr_number = self.description.get_pr_number()
+            sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number)
+        if len(sequence_numbers) == 0:
+            return 0
+        return self._find_highest_number(sequence_numbers.keys())
+
+    @log_function_entry_exit()
+    def _determine_sequence_status(self, sequence_number: int = None) -> SequenceStatus:
+        """
+        Determine the status of the sequence number. It could be: DOES_NOT_EXIST, IN_PROGRESS, FINISHED
+        If sequence_number is not provided, we use the highest existing sequence number. 
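+        Example (illustrative): if sequence directories 0 and 1 exist and the staging PR for
+        sequence 1 is still open, sequence 1 is IN_PROGRESS while sequence 0, no longer being
+        the highest number, is FINISHED.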
+ """ + if sequence_number is None: + sequence_number = self._get_current_sequence_number() + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) + if len(sequence_numbers) == 0: + return SequenceStatus.DOES_NOT_EXIST + elif sequence_number not in sequence_numbers.keys(): + return SequenceStatus.DOES_NOT_EXIST + elif sequence_number < self._find_highest_number(sequence_numbers.keys()): + return SequenceStatus.FINISHED + else: + # check status of PR if it exists + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + if branch_name in [branch.name for branch in self.git_repo.get_branches()]: + find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state='all')] + if find_pr: + pr = find_pr.pop(0) + if pr.state == 'closed': + return SequenceStatus.FINISHED + return SequenceStatus.IN_PROGRESS + + @log_function_entry_exit() + def _find_staging_pr(self) -> Tuple[PullRequest, str, int]: + """ + Find the staging PR for the task. + TODO: arg sequence number --> make function simpler + """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + try: + sequence_number = self._get_sequence_number_for_task_file() + except ValueError: + # no sequence number found, so we return None + log_message(LoggingScope.ERROR, 'ERROR', "no sequence number found for task %s", self.description) + return None, None, None + except Exception as err: + # some other error + log_message(LoggingScope.ERROR, 'ERROR', "error finding staging PR for task %s: %s", + self.description, err) + return None, None, None + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + if branch_name in [branch.name for branch in self.git_repo.get_branches()]: + find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state='all')] + if find_pr: + pr = find_pr.pop(0) + return pr, branch_name, sequence_number + else: + return None, branch_name, sequence_number + else: + return None, None, None + + @log_function_entry_exit() + def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: + """ + Create a staging PR for the task. + NOTE, SHALL only be called if no staging PR for the task exists yet. 
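+        Example (illustrative): for source repository 'EESSI/software-layer', PR 42 and
+        sequence number 0, the PR is opened from head branch 'EESSI-software-layer-PR-42-SEQ-0'
+        onto the default branch of the staging repository.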
+ """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + head=branch_name, base=self.git_repo.default_branch) + return pr, branch_name + @log_function_entry_exit() def _find_state(self) -> TaskState: """ @@ -380,7 +492,50 @@ def _handle_add_staged(self): """Handler for ADD action in STAGED state""" print("Handling ADD action in STAGED state") # Implementation for adding in STAGED state - # construct supposed branch name + # - create or find PR + # - update PR contents + # determine PR + # - no PR -> create one + # - PR && closed -> create one (may require to move task file to different sequence number) + # - PR && open -> update PR contents, task file status, etc + # TODO: determine sequence number, then use it to find staging pr + # find staging PR + staging_pr, staging_branch = self._find_staging_pr(sequence_number) + # create PR if necessary + if staging_pr is None and sequence_number is None: + # no PR found, create one + staging_pr, staging_branch = self._create_staging_pr(sequence_number) + elif staging_pr is None and sequence_number is not None: + # no PR found, create one + staging_pr, staging_branch = self._create_staging_pr(sequence_number) + elif staging_pr.state == 'closed': + # PR closed, create new one + staging_pr, staging_branch = self._create_staging_pr(sequence_number + 1) + if staging_pr is None: + # something went wrong, we cannot continue + log_message(LoggingScope.ERROR, 'ERROR', "no staging PR found for task %s", self.description) + return False + # update PR contents + self._update_pr_contents(staging_pr) + # update task file status + self._update_task_file_status(staging_branch) + + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + # current sequence + sequence_number = self._get_current_sequence_number() + sequence_status = self._determine_sequence_status(sequence_number) + if sequence_status == SequenceStatus.FINISHED: + sequence_number += 1 + # re-determine sequence status + sequence_status = self._determine_sequence_status(sequence_number) + if sequence_status == SequenceStatus.DOES_NOT_EXIST: + # something is odd, the task file should already be in the default branch + log_message(LoggingScope.ERROR, 'ERROR', "sequence number %s does not exist", sequence_number) + return False + elif sequence_status == SequenceStatus.FINISHED: + # we need to figure out the status of the last deployment (with the highest sequence number) + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" # check if branch exists # - yes: check if corresponding PR exists # - yes: check status of PR From f44d80967b881593e921601e96d546e14a68fdd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 12:12:36 +0200 Subject: [PATCH 095/218] revise states of a task --- scripts/automated_ingestion/eessi_task.py | 67 ++++++++++++----------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8a65c84b..0d9ff203 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -16,12 +16,14 @@ class SequenceStatus(Enum): class TaskState(Enum): - NEW = 
auto() # The task has been created but not yet processed - STAGED = auto() # The task has been staged to the Stratum-0 - PR_OPENED = auto() # The task has been opened as a PR in some staging repository - APPROVED = auto() # The task has been approved - REJECTED = auto() # The task has been rejected - INGESTED = auto() # The task has been ingested into the target CernVM-FS repository + UNDETERMINED = auto() # The task state was not determined yet + NEW_TASK = auto() # The task has been created but not yet processed + PAYLOAD_STAGED = auto() # The task's payload has been staged to the Stratum-0 + PULL_REQUEST = auto() # A PR for the task has been created or updated in some staging repository + APPROVED = auto() # The PR for the task has been approved + REJECTED = auto() # The PR for the task has been rejected + INGESTED = auto() # The task's payload has been applied to the target CernVM-FS repository + DONE = auto() # The task has been completed @classmethod def from_string(cls, name, default=None, case_sensitive=False): @@ -56,12 +58,15 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): # Define valid state transitions for all actions # NOTE, TaskState.APPROVED must be the first element or _next_state() will not work self.valid_transitions = { - TaskState.NEW: [TaskState.STAGED], - TaskState.STAGED: [TaskState.PR_OPENED], - TaskState.PR_OPENED: [TaskState.APPROVED, TaskState.REJECTED], + TaskState.UNDETERMINED: [TaskState.NEW_TASK, TaskState.PAYLOAD_STAGED, TaskState.PULL_REQUEST, + TaskState.APPROVED, TaskState.REJECTED, TaskState.INGESTED, TaskState.DONE], + TaskState.NEW_TASK: [TaskState.PAYLOAD_STAGED], + TaskState.PAYLOAD_STAGED: [TaskState.PULL_REQUEST], + TaskState.PULL_REQUEST: [TaskState.APPROVED, TaskState.REJECTED], TaskState.APPROVED: [TaskState.INGESTED], - TaskState.REJECTED: [], # Terminal state - TaskState.INGESTED: [] # Terminal state + TaskState.REJECTED: [TaskState.DONE], + TaskState.INGESTED: [TaskState.DONE], + TaskState.DONE: [] # Terminal state } self.state = self._find_state() @@ -318,16 +323,16 @@ def _find_state(self) -> TaskState: # obtain all sequence numbers in repo/pr dir which include a state file for this task sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) if len(sequence_numbers) == 0: - # no sequence numbers found, so we return NEW - log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence numbers found, state: NEW") - return TaskState.NEW + # no sequence numbers found, so we return NEW_TASK + log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence numbers found, state: NEW_TASK") + return TaskState.NEW_TASK # we got at least one sequence number # if one value for a sequence number is True, we can determine the state from the file in the directory sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] if len(sequence_including_task) == 0: - # no sequence number includes the task file, so we return NEW - log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence number includes the task file, state: NEW") - return TaskState.NEW + # no sequence number includes the task file, so we return NEW_TASK + log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence number includes the task file, state: NEW_TASK") + return TaskState.NEW_TASK # we got at least one sequence number which includes the task file # we can determine the state from the filename in the directory # NOTE, we use the first element in sequence_including_task (there should be only one) @@ -345,7 +350,7 @@ def 
_get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s """ Get the state from the file in the metadata_file_state_path_prefix. """ - # depending on the state of the deployment (NEW, STAGED, PR_OPENED, APPROVED, REJECTED, INGESTED) + # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, INGESTED) # we need to check the task file in the default branch or in the branch corresponding to the sequence number directory_part = os.path.dirname(metadata_file_state_path_prefix) repo_name = self.description.get_repo_name() @@ -369,7 +374,7 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s # did not find any file with metadata_file_state_path_prefix as prefix log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s", metadata_file_state_path_prefix) - return TaskState.NEW + return TaskState.NEW_TASK # sort the states and return the last one states.sort() state = states[-1] @@ -433,10 +438,10 @@ def handle(self): # Implement handlers for ADD action @log_function_entry_exit() - def _handle_add_new(self): - """Handler for ADD action in NEW state""" - print("Handling ADD action in NEW state") - # Implementation for adding in NEW state: a task is only NEW if it was not processed yet + def _handle_add_new_task(self): + """Handler for ADD action in NEW_TASK state""" + print("Handling ADD action in NEW_TASK state") + # Implementation for adding in NEW_TASK state: a task is only NEW_TASK if it was not processed yet # get name of of payload from metadata payload_name = self.description.metadata['payload']['filename'] log_message(LoggingScope.TASK_OPS, 'INFO', "payload_name: %s", payload_name) @@ -488,10 +493,10 @@ def _handle_add_new(self): return True @log_function_entry_exit() - def _handle_add_staged(self): - """Handler for ADD action in STAGED state""" - print("Handling ADD action in STAGED state") - # Implementation for adding in STAGED state + def _handle_add_payload_staged(self): + """Handler for ADD action in PAYLOAD_STAGED state""" + print("Handling ADD action in PAYLOAD_STAGED state") + # Implementation for adding in PAYLOAD_STAGED state # - create or find PR # - update PR contents # determine PR @@ -547,10 +552,10 @@ def _handle_add_staged(self): return True @log_function_entry_exit() - def _handle_add_pr_opened(self): - """Handler for ADD action in PR_OPENED state""" - print("Handling ADD action in PR_OPENED state") - # Implementation for adding in PR_OPENED state + def _handle_add_pull_request(self): + """Handler for ADD action in PULL_REQUEST state""" + print("Handling ADD action in PULL_REQUEST state") + # Implementation for adding in PULL_REQUEST state return True @log_function_entry_exit() From 3f9279f7b242c04db96520cf5ef0c4894d445759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 12:49:41 +0200 Subject: [PATCH 096/218] start revising determining state --- .../automated_ingestion.py | 3 ++ scripts/automated_ingestion/eessi_task.py | 39 ++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 7ece86cc..fc0d6f72 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -258,6 +258,9 @@ def main(): ), gh_staging_repo ) + current_state = task.determine_state() + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", 
task_path, current_state) + except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", task_path, str(err)) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0d9ff203..ef96f10d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -69,7 +69,7 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): TaskState.DONE: [] # Terminal state } - self.state = self._find_state() + # self.state = self._find_state() @log_function_entry_exit() def _determine_task_action(self) -> EESSITaskAction: @@ -410,6 +410,43 @@ def _next_state(self) -> TaskState: """ return self.valid_transitions[self.state][0] + @log_function_entry_exit() + def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: + """ + Check if a path exists in a branch. + """ + try: + branch = self.git_repo.default_branch if branch is None else branch + contents = self._list_directory_contents(path, branch) + if isinstance(contents, list): + return True + else: + return False + return True + except FileNotFoundError: + return False + + @log_function_entry_exit() + def determine_state(self) -> TaskState: + """ + Determine the state of the task based on the state of the staging repository. + """ + # High-level logic: + # 1. Check if path representing the task file exists in the default branch + path_in_default_branch = self.description.task_object.remote_file_path + if self._path_exists_in_branch(path_in_default_branch, branch=self.git_repo.default_branch): + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", + path_in_default_branch) + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", + path_in_default_branch) + # check if path exists in any other branch + for branch in self.git_repo.get_branches(): + if self._path_exists_in_branch(path_in_default_branch, branch): + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", + exit(0) + return TaskState.UNDETERMINED + @log_function_entry_exit() def handle(self): """ From d0739f82ad87ac513b9fb81e6d3c33927485b749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:00:12 +0200 Subject: [PATCH 097/218] fix syntax --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ef96f10d..1031b860 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -445,7 +445,7 @@ def determine_state(self) -> TaskState: if self._path_exists_in_branch(path_in_default_branch, branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", exit(0) - return TaskState.UNDETERMINED + # return TaskState.UNDETERMINED @log_function_entry_exit() def handle(self): From a068b7296c9e7c6d8f7cf935778dea13b11826a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:07:06 +0200 Subject: [PATCH 098/218] fix various flake8 issues --- .../automated_ingestion.py | 3 ++- scripts/automated_ingestion/eessi_task.py | 22 +++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index fc0d6f72..29fe68ec 100755 --- 
a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -259,7 +259,8 @@ def main(): gh_staging_repo ) current_state = task.determine_state() - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", task_path, current_state) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", + task_path, current_state.name) except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 1031b860..8e07959c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,11 +1,11 @@ from enum import Enum, auto -from typing import Dict, List +from typing import Dict, List, Tuple from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit -from github import Github, GithubException, UnknownObjectException +from github import Github, GithubException, UnknownObjectException, PullRequest import os @@ -59,7 +59,7 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): # NOTE, TaskState.APPROVED must be the first element or _next_state() will not work self.valid_transitions = { TaskState.UNDETERMINED: [TaskState.NEW_TASK, TaskState.PAYLOAD_STAGED, TaskState.PULL_REQUEST, - TaskState.APPROVED, TaskState.REJECTED, TaskState.INGESTED, TaskState.DONE], + TaskState.APPROVED, TaskState.REJECTED, TaskState.INGESTED, TaskState.DONE], TaskState.NEW_TASK: [TaskState.PAYLOAD_STAGED], TaskState.PAYLOAD_STAGED: [TaskState.PULL_REQUEST], TaskState.PULL_REQUEST: [TaskState.APPROVED, TaskState.REJECTED], @@ -296,14 +296,14 @@ def _find_staging_pr(self) -> Tuple[PullRequest, str, int]: def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: """ Create a staging PR for the task. - NOTE, SHALL only be called if no staging PR for the task exists yet. + NOTE, SHALL only be called if no staging PR for the task exists yet. """ repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - head=branch_name, base=self.git_repo.default_branch) + body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + head=branch_name, base=self.git_repo.default_branch) return pr, branch_name @log_function_entry_exit() @@ -350,7 +350,8 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s """ Get the state from the file in the metadata_file_state_path_prefix. 
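         Example (illustrative): given the prefix 'EESSI/software-layer/42/0/task-1234.meta.txt.',
         a file named 'task-1234.meta.txt.APPROVED' in that directory maps to TaskState.APPROVED.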
""" - # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, INGESTED) + # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, + # INGESTED, DONE) # we need to check the task file in the default branch or in the branch corresponding to the sequence number directory_part = os.path.dirname(metadata_file_state_path_prefix) repo_name = self.description.get_repo_name() @@ -444,8 +445,9 @@ def determine_state(self) -> TaskState: for branch in self.git_repo.get_branches(): if self._path_exists_in_branch(path_in_default_branch, branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", + path_in_default_branch, branch) exit(0) - # return TaskState.UNDETERMINED + return TaskState.UNDETERMINED @log_function_entry_exit() def handle(self): @@ -542,6 +544,7 @@ def _handle_add_payload_staged(self): # - PR && open -> update PR contents, task file status, etc # TODO: determine sequence number, then use it to find staging pr # find staging PR + sequence_number = self._get_sequence_number_for_task_file() staging_pr, staging_branch = self._find_staging_pr(sequence_number) # create PR if necessary if staging_pr is None and sequence_number is None: @@ -564,7 +567,7 @@ def _handle_add_payload_staged(self): repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - # current sequence + # current sequence sequence_number = self._get_current_sequence_number() sequence_status = self._determine_sequence_status(sequence_number) if sequence_status == SequenceStatus.FINISHED: @@ -578,6 +581,7 @@ def _handle_add_payload_staged(self): elif sequence_status == SequenceStatus.FINISHED: # we need to figure out the status of the last deployment (with the highest sequence number) branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s exists", branch_name) # check if branch exists # - yes: check if corresponding PR exists # - yes: check status of PR From c4a60d29de07c471127a3a05bf59e1c8b3c4ef0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:12:35 +0200 Subject: [PATCH 099/218] fix typing issue --- scripts/automated_ingestion/eessi_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8e07959c..ab15bed3 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,5 +1,5 @@ from enum import Enum, auto -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Optional from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription @@ -263,7 +263,7 @@ def _determine_sequence_status(self, sequence_number: int = None) -> int: return SequenceStatus.IN_PROGRESS @log_function_entry_exit() - def _find_staging_pr(self) -> Tuple[PullRequest, str, int]: + def _find_staging_pr(self) -> Tuple[Optional[PullRequest], Optional[str], Optional[int]]: """ Find the staging PR for the task. 
TODO: arg sequence number --> make function simpler From 0121bed2b6cb4701d8a31075b6e24303b6d75624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:14:19 +0200 Subject: [PATCH 100/218] fix typing issue, take 2 --- scripts/automated_ingestion/eessi_task.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ab15bed3..90b3aa39 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -5,7 +5,8 @@ from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit -from github import Github, GithubException, UnknownObjectException, PullRequest +from github import Github, GithubException, UnknownObjectException +from github.PullRequest import PullRequest import os From a1b81e03576e2ffd816b3034d54f9ceefd59ef9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:53:30 +0200 Subject: [PATCH 101/218] print task to be processed --- scripts/automated_ingestion/automated_ingestion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 29fe68ec..d72554d5 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -249,6 +249,7 @@ def main(): else: # Process each task file for task_path in tasks: + log_message(LoggingScope.GROUP_OPS, 'INFO', "Processing task: %s", task_path) try: # Create EESSITask for the task file try: From 0801ee358e740f45ac8d64d82d9f460f9caad8e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:57:58 +0200 Subject: [PATCH 102/218] only check default branch --- scripts/automated_ingestion/eessi_task.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 90b3aa39..dab6fe3f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -442,11 +442,6 @@ def determine_state(self) -> TaskState: else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", path_in_default_branch) - # check if path exists in any other branch - for branch in self.git_repo.get_branches(): - if self._path_exists_in_branch(path_in_default_branch, branch): - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", - path_in_default_branch, branch) exit(0) return TaskState.UNDETERMINED From e41670a18b2f9eef9cab870250ba9bdbe3e4e2fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:09:23 +0200 Subject: [PATCH 103/218] add main processing loop --- .../automated_ingestion/automated_ingestion.py | 18 ++++++++++++++---- scripts/automated_ingestion/eessi_task.py | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index d72554d5..312582e7 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -2,7 +2,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from eessi_data_object import EESSIDataAndSignatureObject -from eessi_task import EESSITask +from eessi_task 
import EESSITask, TaskState from eessi_task_description import EESSITaskDescription from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 @@ -259,9 +259,6 @@ def main(): ), gh_staging_repo ) - current_state = task.determine_state() - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", - task_path, current_state.name) except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", @@ -270,6 +267,19 @@ def main(): log_message(LoggingScope.GROUP_OPS, 'INFO', "Task: %s", task) + previous_state = None + current_state = task.determine_state() + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", + task_path, current_state.name) + while (current_state is not None and + current_state != TaskState.DONE and + previous_state != current_state): + previous_state = current_state + current_state = task.handle() + log_message(LoggingScope.GROUP_OPS, 'INFO', + "Task '%s': previous state = '%s', current state = '%s'", + task_path, previous_state.name, current_state.name) + # TODO: update the information shown below (what makes sense to show?) # Log information about the task task_object = task.description.task_object diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index dab6fe3f..6ab6fa12 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -442,8 +442,8 @@ def determine_state(self) -> TaskState: else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", path_in_default_branch) + return TaskState.UNDETERMINED exit(0) - return TaskState.UNDETERMINED @log_function_entry_exit() def handle(self): From 9b0fb9a97d3c1e5654419e7d1c75fd659b1a35e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:12:15 +0200 Subject: [PATCH 104/218] fix missing state attribute --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6ab6fa12..16f6c7ae 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -621,4 +621,4 @@ def transition_to(self, new_state: TaskState): @log_function_entry_exit() def __str__(self): - return f"EESSITask(description={self.description}, action={self.action}, state={self.state})" + return f"EESSITask(description={self.description}, action={self.action}, state={self.determine_state()})" From 7fb88d30668c4513aa5e7f5ac0fcf5be2919f60d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:21:41 +0200 Subject: [PATCH 105/218] remove recursion from handle and return current state --- scripts/automated_ingestion/eessi_task.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 16f6c7ae..3d20b26a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -450,26 +450,23 @@ def handle(self): """ Dynamically find and execute the appropriate handler based on action and state. 
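         Example (illustrative, assuming the action and state enums render as their lowercase
         member names): an ADD action on a task in state PAYLOAD_STAGED is dispatched to
         _handle_add_payload_staged().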
""" - state_before_handle = self.state + state_before_handle = self.determine_state() # Construct handler method name - handler_name = f"_handle_{self.action}_{self.state}" + handler_name = f"_handle_{self.action}_{state_before_handle}" # Check if the handler exists handler = getattr(self, handler_name, None) if handler and callable(handler): # Execute the handler if it exists - handler() - # if state has changed, run handle() again; otherwise, do nothing - if self.state != state_before_handle: - msg = f"handler {handler_name} changed state from {state_before_handle} to {self.state}" - msg += " running handle() again" - print(msg) - self.handle() + return handler() else: # Default behavior for missing handlers - print(f"No handler for action {self.action} and state {self.state} implemented; nothing to be done") + log_message(LoggingScope.TASK_OPS, 'ERROR', + "No handler for action %s and state %s implemented; nothing to be done", + self.action, state_before_handle) + return state_before_handle # Implement handlers for ADD action @log_function_entry_exit() From 9b3bd002a23e664ef7e32b3d97f6d91de60413e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:26:55 +0200 Subject: [PATCH 106/218] commented out some logging and obsolete processing --- .../automated_ingestion.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 312582e7..146075b6 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -280,25 +280,25 @@ def main(): "Task '%s': previous state = '%s', current state = '%s'", task_path, previous_state.name, current_state.name) - # TODO: update the information shown below (what makes sense to show?) - # Log information about the task - task_object = task.description.task_object - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_object.local_file_path) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_object.local_sig_path) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", - task.description.signature_verified) - - # Log the ETags of the downloaded task file - file_etag, sig_etag = task.description.task_object.get_etags() - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", - task.description.task_object.remote_sig_path, sig_etag) - - # TODO: Process the task file contents - # This would involve reading the task file, parsing its contents, - # and performing the required actions based on the task type - log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path) - task.handle() + # # TODO: update the information shown below (what makes sense to show?) 
+ # # Log information about the task + # task_object = task.description.task_object + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_object.local_file_path) + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_object.local_sig_path) + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", + # task.description.signature_verified) + + # # Log the ETags of the downloaded task file + # file_etag, sig_etag = task.description.task_object.get_etags() + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", + # task.description.task_object.remote_sig_path, sig_etag) + + # # TODO: Process the task file contents + # # This would involve reading the task file, parsing its contents, + # # and performing the required actions based on the task type + # log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path) + # task.handle() except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to process task %s: %s", task_path, str(err)) From 7768a91611718af6983ca6bdb9d77542d7757b7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:54:15 +0200 Subject: [PATCH 107/218] first version of handler for undetermined task state --- .../automated_ingestion.py | 1 + scripts/automated_ingestion/eessi_task.py | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 146075b6..e0ca710b 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -279,6 +279,7 @@ def main(): log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s': previous state = '%s', current state = '%s'", task_path, previous_state.name, current_state.name) + exit(0) # run loop body only once # # TODO: update the information shown below (what makes sense to show?) # # Log information about the task diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 3d20b26a..113ad69b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -235,6 +235,13 @@ def _get_current_sequence_number(self, sequence_numbers: Dict[int, bool] = None) return 0 return self._find_highest_number(sequence_numbers.keys()) + @log_function_entry_exit() + def _get_fixed_sequence_number(self) -> int: + """ + Get a fixed sequence number. 
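+        For now this deliberately returns 0 for every task, i.e. all tasks of a source PR are
+        collected under sequence number 0, which corresponds to an open or yet to be created
+        staging PR.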
+ """ + return 0 + @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: """ @@ -469,6 +476,28 @@ def handle(self): return state_before_handle # Implement handlers for ADD action + @log_function_entry_exit() + def _handle_add_undetermined(self): + """Handler for ADD action in UNDETERMINED state""" + print("Handling ADD action in UNDETERMINED state") + # create symlink target directory (REPO/PR/SEQ/TASK_FILE_NAME/) + # create task file in target directory (TARGET_DIR/TaskDescription) + # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) + # create symlink from task file path to target directory (remote_file_path -> TARGET_DIR) + branch = self.git_repo.default_branch + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_number = self._get_fixed_sequence_number() + task_file_name = self.description.get_task_file_name() + target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}/" + self.git_repo.create_file(target_dir, "TaskDescription", + self.description.get_contents(), branch=branch) + self.git_repo.create_file(target_dir, f"TaskState.{TaskState.NEW_TASK.name}", + "", branch=branch) + self.git_repo.create_symlink(self.description.task_object.remote_file_path, + target_dir, branch=branch) + return TaskState.NEW_TASK + @log_function_entry_exit() def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" From e3ffa6cfa5ecd5e59ec84d416b26f513c04083e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:58:58 +0200 Subject: [PATCH 108/218] remove trailing / from target_dir name --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 113ad69b..2e60431c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -489,7 +489,7 @@ def _handle_add_undetermined(self): pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() task_file_name = self.description.get_task_file_name() - target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}/" + target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" self.git_repo.create_file(target_dir, "TaskDescription", self.description.get_contents(), branch=branch) self.git_repo.create_file(target_dir, f"TaskState.{TaskState.NEW_TASK.name}", From 0fa6afbe987668fbb4fb350cea0313718e4e10c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 15:12:00 +0200 Subject: [PATCH 109/218] fix target directory structure for NEW_TASK and skip creating symlink --- scripts/automated_ingestion/eessi_task.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 2e60431c..a92f8914 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -490,12 +490,16 @@ def _handle_add_undetermined(self): sequence_number = self._get_fixed_sequence_number() task_file_name = self.description.get_task_file_name() target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" - self.git_repo.create_file(target_dir, "TaskDescription", + task_description_file_path = f"{target_dir}/TaskDescription" + task_state_file_path = 
f"{target_dir}/TaskState.{TaskState.NEW_TASK.name}" + self.git_repo.create_file(task_description_file_path, + f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", self.description.get_contents(), branch=branch) - self.git_repo.create_file(target_dir, f"TaskState.{TaskState.NEW_TASK.name}", + self.git_repo.create_file(task_state_file_path, + f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", "", branch=branch) - self.git_repo.create_symlink(self.description.task_object.remote_file_path, - target_dir, branch=branch) + # self.git_repo.create_symlink(self.description.task_object.remote_file_path, + # target_dir, branch=branch) return TaskState.NEW_TASK @log_function_entry_exit() From 570babbc69433405a271e7f50cc88754405c9c0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 19:58:57 +0200 Subject: [PATCH 110/218] add creation of symlink --- scripts/automated_ingestion/eessi_task.py | 41 +++++++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a92f8914..d51c2c25 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -476,6 +476,40 @@ def handle(self): return state_before_handle # Implement handlers for ADD action + @log_function_entry_exit() + def _create_symlink(self, source_path: str, target_path: str, branch: str = None): + """Create a symlink in the given branch.""" + try: + branch = self.git_repo.default_branch if branch is None else branch + ref = self.git_repo.get_git_ref(f"heads/{branch}") + commit = self.git_repo.get_git_commit(ref.object.sha) + base_tree = self.git_repo.get_git_tree(commit.tree.sha) + + # Create blob for symlink target + blob = self.git_repo.create_git_blob(target_path, "utf-8") + + # Create tree element + tree_element = { + "path": source_path, + "mode": "120000", + "type": "blob", + "sha": blob.sha + } + + # Create new tree and commit + new_tree = self.git_repo.create_git_tree([tree_element], base_tree) + commit_message = f"Add symlink {source_path} -> {target_path}" + new_commit = self.git_repo.create_git_commit(commit_message, new_tree, [commit]) + + # Update reference + ref.edit(new_commit.sha) + + log_message(LoggingScope.TASK_OPS, 'INFO', "Symlink created: %s -> %s", + source_path, target_path) + + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) + @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" @@ -487,7 +521,7 @@ def _handle_add_undetermined(self): branch = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - sequence_number = self._get_fixed_sequence_number() + sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{target_dir}/TaskDescription" @@ -498,8 +532,9 @@ def _handle_add_undetermined(self): self.git_repo.create_file(task_state_file_path, f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", "", branch=branch) - # self.git_repo.create_symlink(self.description.task_object.remote_file_path, - # target_dir, branch=branch) + self._create_symlink(self.description.task_object.remote_file_path, 
target_dir, branch=branch) + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number is still open or + # yet to be created); if it is not valid, perform corrective actions return TaskState.NEW_TASK @log_function_entry_exit() From f1f813e1ff6b91f526099d76ae232214b42a0609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:09:00 +0200 Subject: [PATCH 111/218] add error handling and log messages --- scripts/automated_ingestion/eessi_task.py | 31 +++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d51c2c25..0ccc3e2c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -526,15 +526,30 @@ def _handle_add_undetermined(self): target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{target_dir}/TaskDescription" task_state_file_path = f"{target_dir}/TaskState.{TaskState.NEW_TASK.name}" - self.git_repo.create_file(task_description_file_path, - f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", - self.description.get_contents(), branch=branch) - self.git_repo.create_file(task_state_file_path, - f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", - "", branch=branch) + try: + self.git_repo.create_file(task_description_file_path, + f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", + self.description.get_contents(), branch=branch) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task description file: %s", err) + return TaskState.UNDETERMINED + log_message(LoggingScope.TASK_OPS, 'INFO', + "task description file created: %s", task_description_file_path) + + try: + self.git_repo.create_file(task_state_file_path, + f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", + "", branch=branch) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) + return TaskState.UNDETERMINED + log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) + self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch) - # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number is still open or - # yet to be created); if it is not valid, perform corrective actions + log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", + self.description.task_object.remote_file_path, target_dir) + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number is still + # open or yet to be created); if it is not valid, perform corrective actions return TaskState.NEW_TASK @log_function_entry_exit() From fc67e434a980f87531ea96430aaa3f8ec6a7487d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:25:57 +0200 Subject: [PATCH 112/218] add _safe_create_file and error handling --- scripts/automated_ingestion/eessi_task.py | 49 ++++++++++++++++------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0ccc3e2c..56dd9897 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -509,6 +509,22 @@ def _create_symlink(self, 
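For readers unfamiliar with the low-level Git Data API that `_create_symlink` relies on, here is a condensed, self-contained sketch of the same flow; the repository and paths are hypothetical. Note that the sketch already builds the tree element as an `InputGitTreeElement`, which a later patch in this series switches to, since PyGithub does not accept a plain dict there. Mode `120000` is what marks the entry as a symlink whose blob content is the link target:

```python
# A condensed sketch of the Git Data API flow used by _create_symlink,
# with hypothetical repository and paths.
from github import Github, InputGitTreeElement

repo = Github("<token>").get_repo("EESSI/staging")           # hypothetical staging repo
source_path = "metadata/task.tar.gz.task"                    # hypothetical symlink location
target_path = "EESSI/software-layer/42/0/task.tar.gz.task"   # hypothetical link target

ref = repo.get_git_ref(f"heads/{repo.default_branch}")
commit = repo.get_git_commit(ref.object.sha)
base_tree = repo.get_git_tree(commit.tree.sha)

blob = repo.create_git_blob(target_path, "utf-8")            # blob content is the link target
element = InputGitTreeElement(path=source_path, mode="120000", type="blob", sha=blob.sha)

new_tree = repo.create_git_tree([element], base_tree)
new_commit = repo.create_git_commit(f"Add symlink {source_path} -> {target_path}",
                                    new_tree, [commit])
ref.edit(new_commit.sha)                                     # fast-forward the branch
```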
source_path: str, target_path: str, branch: str = None except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) + raise err + + @log_function_entry_exit() + def _safe_create_file(self, path: str, message: str, content: str, branch: str = None): + """Create a file in the given branch.""" + try: + branch = self.git_repo.default_branch if branch is None else branch + existing_file = self.git_repo.get_contents(path, ref=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "File %s already exists", path) + return existing_file + except GithubException as err: + if err.status == 404: # File doesn't exist + # Safe to create + return self.git_repo.create_file(path, message, content, branch=branch) + else: + raise err # Some other error @log_function_entry_exit() def _handle_add_undetermined(self): @@ -527,29 +543,34 @@ def _handle_add_undetermined(self): task_description_file_path = f"{target_dir}/TaskDescription" task_state_file_path = f"{target_dir}/TaskState.{TaskState.NEW_TASK.name}" try: - self.git_repo.create_file(task_description_file_path, - f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", - self.description.get_contents(), branch=branch) + self._safe_create_file(task_description_file_path, + f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", + self.description.get_contents(), branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', + "task description file created: %s", task_description_file_path) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task description file: %s", err) return TaskState.UNDETERMINED - log_message(LoggingScope.TASK_OPS, 'INFO', - "task description file created: %s", task_description_file_path) try: - self.git_repo.create_file(task_state_file_path, - f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", - "", branch=branch) + self._safe_create_file(task_state_file_path, + f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", + "", branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) return TaskState.UNDETERMINED - log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) - self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch) - log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", - self.description.task_object.remote_file_path, target_dir) - # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number is still - # open or yet to be created); if it is not valid, perform corrective actions + try: + self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", + self.description.task_object.remote_file_path, target_dir) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) + return TaskState.UNDETERMINED + + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number + # is still open or yet to be created); if it is not valid, perform corrective actions return TaskState.NEW_TASK @log_function_entry_exit() From 467781c48f85c2abd1aa35b954bd5a8983cfe367 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:33:42 +0200 Subject: [PATCH 113/218] improve logging --- scripts/automated_ingestion/eessi_task.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 56dd9897..811b42e7 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -487,6 +487,7 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None # Create blob for symlink target blob = self.git_repo.create_git_blob(target_path, "utf-8") + log_message(LoggingScope.TASK_OPS, 'INFO', "blob created: %s", blob) # Create tree element tree_element = { @@ -498,18 +499,22 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None # Create new tree and commit new_tree = self.git_repo.create_git_tree([tree_element], base_tree) + log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) + commit_message = f"Add symlink {source_path} -> {target_path}" new_commit = self.git_repo.create_git_commit(commit_message, new_tree, [commit]) + log_message(LoggingScope.TASK_OPS, 'INFO', "new commit created: %s", new_commit) # Update reference ref.edit(new_commit.sha) log_message(LoggingScope.TASK_OPS, 'INFO', "Symlink created: %s -> %s", source_path, target_path) + return True except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) - raise err + return False @log_function_entry_exit() def _safe_create_file(self, path: str, message: str, content: str, branch: str = None): @@ -559,14 +564,14 @@ def _handle_add_undetermined(self): log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) + # TODO: rollback previous changes (task description file) return TaskState.UNDETERMINED - try: - self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch) + if self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch): log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", self.description.task_object.remote_file_path, target_dir) - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) + else: + # TODO: rollback previous changes (task description file, task state file) return TaskState.UNDETERMINED # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number From 09fb09215091e614b6724b593544ab3aee315695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:36:44 +0200 Subject: [PATCH 114/218] create new tree in try/except block --- scripts/automated_ingestion/eessi_task.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 811b42e7..227c9442 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -497,10 +497,15 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None "sha": blob.sha } - # Create new tree and commit - new_tree = self.git_repo.create_git_tree([tree_element], base_tree) - log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) + # Create new tree + try: 
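The `_safe_create_file` helper introduced above is a general idempotent-create idiom for PyGithub: treat an existing file as success, create on a 404, and re-raise anything else. A minimal standalone sketch, assuming a hypothetical staging repository and file:

```python
# Idempotent file creation with PyGithub; repository and arguments are hypothetical.
from github import Github, GithubException

repo = Github("<token>").get_repo("EESSI/staging")  # hypothetical

def safe_create_file(repo, path, message, content, branch):
    try:
        return repo.get_contents(path, ref=branch)  # already there: nothing to do
    except GithubException as err:
        if err.status == 404:                       # only create if it does not exist
            return repo.create_file(path, message, content, branch=branch)
        raise                                       # some other (e.g. connection) issue
```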
+ new_tree = self.git_repo.create_git_tree([tree_element], base_tree) + log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating new tree: %s", err) + return False + # Create new commit commit_message = f"Add symlink {source_path} -> {target_path}" new_commit = self.git_repo.create_git_commit(commit_message, new_tree, [commit]) log_message(LoggingScope.TASK_OPS, 'INFO', "new commit created: %s", new_commit) From 6c6d3dd39b970c103adb368b7e98d45aa3165b57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:41:37 +0200 Subject: [PATCH 115/218] debug issue creating git tree --- scripts/automated_ingestion/eessi_task.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 227c9442..5985b80f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -503,6 +503,10 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating new tree: %s", err) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Status Code: %s", err.status) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Error Message: %s", err.data) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Headers: %s", err.headers) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Raw Response: %s", err.response) return False # Create new commit From 031881673087db6d41f40d79964e404369d15f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:54:03 +0200 Subject: [PATCH 116/218] improve error handling and reporting --- scripts/automated_ingestion/eessi_task.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5985b80f..2619d2de 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -8,6 +8,7 @@ from github import Github, GithubException, UnknownObjectException from github.PullRequest import PullRequest import os +import traceback class SequenceStatus(Enum): @@ -501,13 +502,20 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None try: new_tree = self.git_repo.create_git_tree([tree_element], base_tree) log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) - except Exception as err: + except GithubException as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating new tree: %s", err) log_message(LoggingScope.TASK_OPS, 'ERROR', " Status Code: %s", err.status) log_message(LoggingScope.TASK_OPS, 'ERROR', " Error Message: %s", err.data) log_message(LoggingScope.TASK_OPS, 'ERROR', " Headers: %s", err.headers) log_message(LoggingScope.TASK_OPS, 'ERROR', " Raw Response: %s", err.response) return False + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "\n=== General Exception ===") + log_message(LoggingScope.TASK_OPS, 'ERROR', " Type: %s", type(err).__name__) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Message: %s", str(err)) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Traceback:") + log_message(LoggingScope.TASK_OPS, 'ERROR', " %s", traceback.format_exc()) + return False # Create new commit commit_message = f"Add 
symlink {source_path} -> {target_path}" From b842eba46ebeced4d830bc6cd61fcf694ccde3d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 21:00:35 +0200 Subject: [PATCH 117/218] use InputGitTreeElement instead of simple Dict --- scripts/automated_ingestion/eessi_task.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 2619d2de..db918efc 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -5,7 +5,7 @@ from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit -from github import Github, GithubException, UnknownObjectException +from github import Github, GithubException, InputGitTreeElement, UnknownObjectException from github.PullRequest import PullRequest import os import traceback @@ -491,12 +491,13 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None log_message(LoggingScope.TASK_OPS, 'INFO', "blob created: %s", blob) # Create tree element - tree_element = { - "path": source_path, - "mode": "120000", - "type": "blob", - "sha": blob.sha - } + tree_element = InputGitTreeElement( + path=source_path, + mode="120000", + type="blob", + sha=blob.sha + ) + log_message(LoggingScope.TASK_OPS, 'INFO', "tree element created: %s", tree_element) # Create new tree try: From 29fd410ea34be56eb2412815c3a1444044894006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 6 Jun 2025 22:31:55 +0200 Subject: [PATCH 118/218] use pointer file instead of symlink --- scripts/automated_ingestion/eessi_task.py | 25 +++++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index db918efc..abc9f476 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -447,11 +447,12 @@ def determine_state(self) -> TaskState: if self._path_exists_in_branch(path_in_default_branch, branch=self.git_repo.default_branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", path_in_default_branch) + # TODO: determine state + exit(0) else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", path_in_default_branch) return TaskState.UNDETERMINED - exit(0) @log_function_entry_exit() def handle(self): @@ -553,10 +554,10 @@ def _safe_create_file(self, path: str, message: str, content: str, branch: str = def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" print("Handling ADD action in UNDETERMINED state") - # create symlink target directory (REPO/PR/SEQ/TASK_FILE_NAME/) + # create target directory (REPO/PR/SEQ/TASK_FILE_NAME/) # create task file in target directory (TARGET_DIR/TaskDescription) # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) - # create symlink from task file path to target directory (remote_file_path -> TARGET_DIR) + # create pointer file from task file path to target directory (remote_file_path -> TARGET_DIR) branch = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() @@ -564,7 +565,7 @@ def _handle_add_undetermined(self): task_file_name = self.description.get_task_file_name() target_dir = 
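The extra diagnostics above make sense given how PyGithub reports the two failure modes. As far as I can tell, `create_git_tree()` validates its elements client-side, so a plain-dict tree element fails with an `AssertionError` rather than a `GithubException`, which would explain both the general-exception branch with a traceback and the switch to `InputGitTreeElement` in the next patch. A sketch of the distinction, reusing the hypothetical names from the symlink sketch earlier:

```python
# Distinguishing client-side failures from GitHub API errors when creating a tree.
# Assumes repo, element and base_tree as in the earlier symlink sketch.
import traceback
from github import GithubException

try:
    new_tree = repo.create_git_tree([element], base_tree)
except GithubException as err:
    # A real API error: structured fields worth logging.
    print(err.status, err.data, err.headers)
except Exception as err:
    # E.g. an AssertionError raised by PyGithub before any request is sent.
    print(type(err).__name__, str(err))
    print(traceback.format_exc())
```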
f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{target_dir}/TaskDescription" - task_state_file_path = f"{target_dir}/TaskState.{TaskState.NEW_TASK.name}" + task_state_file_path = f"{target_dir}/TaskState" try: self._safe_create_file(task_description_file_path, f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", @@ -578,17 +579,23 @@ def _handle_add_undetermined(self): try: self._safe_create_file(task_state_file_path, f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", - "", branch=branch) + f"{TaskState.NEW_TASK.name}", branch=branch) log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) # TODO: rollback previous changes (task description file) return TaskState.UNDETERMINED - if self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch): - log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", - self.description.task_object.remote_file_path, target_dir) - else: + try: + remote_file_path = self.description.task_object.remote_file_path + self._safe_create_file(remote_file_path, + f"pointer from task file {remote_file_path} to target {target_dir}", + f"remote_file_path = {remote_file_path}\ntarget_dir = {target_dir}", + branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "pointer file created: %s -> %s", + remote_file_path, target_dir) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating pointer file: %s", err) # TODO: rollback previous changes (task description file, task state file) return TaskState.UNDETERMINED From e6680d67e0ec72dcb7701fca9292c3282ef3e779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 12:14:06 +0200 Subject: [PATCH 119/218] add a couple of files with a single commit --- scripts/automated_ingestion/eessi_task.py | 111 ++++++++++++++++------ 1 file changed, 82 insertions(+), 29 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index abc9f476..afa7ee87 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,14 +1,18 @@ from enum import Enum, auto from typing import Dict, List, Tuple, Optional + +import os +import traceback +import base64 + from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit + from github import Github, GithubException, InputGitTreeElement, UnknownObjectException from github.PullRequest import PullRequest -import os -import traceback class SequenceStatus(Enum): @@ -550,6 +554,61 @@ def _safe_create_file(self, path: str, message: str, content: str, branch: str = else: raise err # Some other error + @log_function_entry_exit() + def _create_multi_file_commit(self, files_data, commit_message, branch=None): + """ + Create a commit with multiple file changes + + files_data: dict with structure: + { + "path/to/file1.txt": { + "content": "file content", + "mode": "100644" # optional, defaults to 100644 + }, + "path/to/file2.py": { + "content": "print('hello')", + "mode": "100644" + } + } + """ + branch = self.git_repo.default_branch if branch is None else 
branch + ref = self.git_repo.get_git_ref(f"heads/{branch}") + current_commit = self.git_repo.get_git_commit(ref.object.sha) + base_tree = current_commit.tree + + # Create tree elements + tree_elements = [] + for file_path, file_info in files_data.items(): + content = file_info["content"] + if isinstance(content, str): + content = content.encode('utf-8') + + blob = self.git_repo.create_git_blob( + base64.b64encode(content).decode('utf-8'), + "base64" + ) + tree_elements.append(InputGitTreeElement( + path=file_path, + mode=file_info.get("mode", "100644"), + type="blob", + sha=blob.sha + )) + + # Create new tree + new_tree = self.git_repo.create_git_tree(tree_elements, base_tree) + + # Create commit + new_commit = self.git_repo.create_git_commit( + commit_message, + new_tree, + [current_commit] + ) + + # Update branch reference + ref.edit(new_commit.sha) + + return new_commit + @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" @@ -566,36 +625,30 @@ def _handle_add_undetermined(self): target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{target_dir}/TaskDescription" task_state_file_path = f"{target_dir}/TaskState" - try: - self._safe_create_file(task_description_file_path, - f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", - self.description.get_contents(), branch=branch) - log_message(LoggingScope.TASK_OPS, 'INFO', - "task description file created: %s", task_description_file_path) - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task description file: %s", err) - return TaskState.UNDETERMINED - - try: - self._safe_create_file(task_state_file_path, - f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", - f"{TaskState.NEW_TASK.name}", branch=branch) - log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) - # TODO: rollback previous changes (task description file) - return TaskState.UNDETERMINED + remote_file_path = self.description.task_object.remote_file_path + + files_to_commit = { + task_description_file_path: { + "content": self.description.get_contents(), + "mode": "100644" + }, + task_state_file_path: { + "content": f"{TaskState.NEW_TASK.name}", + "mode": "100644" + }, + remote_file_path: { + "content": f"remote_file_path = {remote_file_path}\ntarget_dir = {target_dir}", + "mode": "100644" + } + } try: - remote_file_path = self.description.task_object.remote_file_path - self._safe_create_file(remote_file_path, - f"pointer from task file {remote_file_path} to target {target_dir}", - f"remote_file_path = {remote_file_path}\ntarget_dir = {target_dir}", - branch=branch) - log_message(LoggingScope.TASK_OPS, 'INFO', "pointer file created: %s -> %s", - remote_file_path, target_dir) + commit = self._create_multi_file_commit(files_to_commit, + f"new task for {repo_name} PR {pr_number} seq {sequence_number}", + branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "commit created: %s", commit) except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating pointer file: %s", err) + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating commit: %s", err) # TODO: rollback previous changes (task description file, task state file) return TaskState.UNDETERMINED From b63699ba62b3a9267ce5055ff81f6fec7140a14c Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 12:25:44 +0200 Subject: [PATCH 120/218] add new line to state file --- scripts/automated_ingestion/eessi_task.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index afa7ee87..a1c3614a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -633,7 +633,7 @@ def _handle_add_undetermined(self): "mode": "100644" }, task_state_file_path: { - "content": f"{TaskState.NEW_TASK.name}", + "content": f"{TaskState.NEW_TASK.name}\n", "mode": "100644" }, remote_file_path: { @@ -643,9 +643,11 @@ def _handle_add_undetermined(self): } try: - commit = self._create_multi_file_commit(files_to_commit, - f"new task for {repo_name} PR {pr_number} seq {sequence_number}", - branch=branch) + commit = self._create_multi_file_commit( + files_to_commit, + f"new task for {repo_name} PR {pr_number} seq {sequence_number}", + branch=branch + ) log_message(LoggingScope.TASK_OPS, 'INFO', "commit created: %s", commit) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating commit: %s", err) From 3a1d975ed51a42a8778350ab11367c5fd1cdb402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 13:29:45 +0200 Subject: [PATCH 121/218] determine task state from TaskState file --- scripts/automated_ingestion/eessi_task.py | 80 ++++++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a1c3614a..9af53cb3 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -440,6 +440,60 @@ def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: except FileNotFoundError: return False + @log_function_entry_exit() + def _read_dict_from_string(self, content: str) -> dict: + """ + Read the dictionary from the string. + """ + config_dict = {} + for line in content.strip().split('\n'): + if '=' in line and not line.strip().startswith('#'): # Skip comments + key, value = line.split('=', 1) # Split only on first '=' + config_dict[key.strip()] = value.strip() + return config_dict + + @log_function_entry_exit() + def _read_target_dir_from_file(self, path: str, branch: str = None) -> str: + """ + Read the target directory from the file in the given branch. + """ + branch = self.git_repo.default_branch if branch is None else branch + content = self.git_repo.get_contents(path, ref=branch) + + # Decode the content from base64 + content_str = content.decoded_content.decode('utf-8') + + # Parse into dictionary + config_dict = self._read_dict_from_string(content_str) + + return config_dict.get('target_dir', None) + + @log_function_entry_exit() + def _branch_exists(self, branch_name: str) -> bool: + """ + Check if a branch exists. + """ + try: + self.git_repo.get_branch(branch_name) + return True + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "error checking if branch %s exists: %s", + branch_name, err) + return False + + @log_function_entry_exit() + def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState: + """ + Read the task state from the file in the given branch. 
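The pointer-file format from the earlier "use pointer file instead of symlink" patch and the `key = value` parser added here are two halves of one round trip; a self-contained sketch with hypothetical values:

```python
# Round trip of the pointer-file format; paths are hypothetical.
pointer_content = (
    "remote_file_path = metadata/task.tar.gz.task\n"
    "target_dir = EESSI/software-layer/42/0/task.tar.gz.task\n"
)

config_dict = {}
for line in pointer_content.strip().split('\n'):
    if '=' in line and not line.strip().startswith('#'):  # skip comments
        key, value = line.split('=', 1)                   # split only on the first '='
        config_dict[key.strip()] = value.strip()

assert config_dict['target_dir'] == "EESSI/software-layer/42/0/task.tar.gz.task"
```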
+ """ + branch = self.git_repo.default_branch if branch is None else branch + content = self.git_repo.get_contents(path, ref=branch) + + # Decode the content from base64 + content_str = content.decoded_content.decode('utf-8') + + return TaskState.from_string(content_str) + @log_function_entry_exit() def determine_state(self) -> TaskState: """ @@ -448,11 +502,33 @@ def determine_state(self) -> TaskState: # High-level logic: # 1. Check if path representing the task file exists in the default branch path_in_default_branch = self.description.task_object.remote_file_path - if self._path_exists_in_branch(path_in_default_branch, branch=self.git_repo.default_branch): + default_branch = self.git_repo.default_branch + if self._path_exists_in_branch(path_in_default_branch, branch=default_branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", path_in_default_branch) # TODO: determine state - exit(0) + # - get state from task file in default branch + # - get target_dir from path_in_default_branch + target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch) + # read the TaskState file in target dir + task_state_file_path = f"{target_dir}/TaskState" + task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch) + # - if branch for sequence number exists, get state from task file in corresponding branch + # - branch name is of the form REPO-PR-SEQ + # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ + # - obtain repo, pr, seq from target dir + org, repo, pr, seq, _ = target_dir.split('/') + staging_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + if self._branch_exists(staging_branch_name): + # read the TaskState file in staging branch + task_state_staging_branch = self._read_task_state_from_file(task_state_file_path, staging_branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in staging branch %s: %s", + staging_branch_name, task_state_staging_branch) + return task_state_staging_branch + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in default branch: %s", + task_state_default_branch) + return task_state_default_branch else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", path_in_default_branch) From 6f2c92f8ca1b8248fb4a4790cd39106215810960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 14:38:51 +0200 Subject: [PATCH 122/218] fix check for path --- scripts/automated_ingestion/eessi_task.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 9af53cb3..a33dd4ab 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -429,16 +429,15 @@ def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: """ Check if a path exists in a branch. 
""" + branch = self.git_repo.default_branch if branch is None else branch try: - branch = self.git_repo.default_branch if branch is None else branch - contents = self._list_directory_contents(path, branch) - if isinstance(contents, list): - return True - else: - return False + self.git_repo.get_contents(path, ref=branch) return True - except FileNotFoundError: - return False + except GithubException as err: + if err.status == 404: + return False + else: + raise err @log_function_entry_exit() def _read_dict_from_string(self, content: str) -> dict: From 57b9da61dc407a5e1396ae2ef379d772bfb30601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 14:56:13 +0200 Subject: [PATCH 123/218] add log output when determining state --- scripts/automated_ingestion/eessi_task.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a33dd4ab..d18f4eca 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -490,8 +490,12 @@ def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState # Decode the content from base64 content_str = content.decoded_content.decode('utf-8') + log_message(LoggingScope.TASK_OPS, 'INFO', "content in TaskState file: %s", content_str) - return TaskState.from_string(content_str) + task_state = TaskState.from_string(content_str) + log_message(LoggingScope.TASK_OPS, 'INFO', "task state: %s", task_state) + + return task_state @log_function_entry_exit() def determine_state(self) -> TaskState: From c53729fec1c6a5d27ffee9f30109353fea0fb720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 15:18:12 +0200 Subject: [PATCH 124/218] fix from_string --- scripts/automated_ingestion/eessi_task.py | 27 +++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d18f4eca..d2e1a7c7 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -33,17 +33,30 @@ class TaskState(Enum): @classmethod def from_string(cls, name, default=None, case_sensitive=False): + log_message(LoggingScope.TASK_OPS, 'INFO', "from_string: %s", name) if case_sensitive: - return cls.__members__.get(name, default) + to_return = cls.__members__.get(name, default) + log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will return: %s", to_return) + return to_return try: - return next( - member for member_name, member in cls.__members__.items() - if member_name.lower() == name.lower() - ) - except StopIteration: + to_return = cls[name.upper()] + log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will return: %s", to_return) + return to_return + except KeyError: return default +# try: +# log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will iterate over: %s", cls.__members__) +# to_return = next( +# member for member_name, member in cls.__members__.items() +# if member_name.lower() == name.lower() +# ) +# log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will return: %s", to_return) +# return to_return +# except StopIteration: +# return default + def __str__(self): return self.name.lower() @@ -489,7 +502,7 @@ def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState content = self.git_repo.get_contents(path, ref=branch) # Decode the content from base64 - content_str = 
content.decoded_content.decode('utf-8') + content_str = content.decoded_content.decode('utf-8').strip() log_message(LoggingScope.TASK_OPS, 'INFO', "content in TaskState file: %s", content_str) task_state = TaskState.from_string(content_str) From af6e1d880122e40c2da6c053d57c0571d8ebe905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 15:25:15 +0200 Subject: [PATCH 125/218] return upper case state name --- scripts/automated_ingestion/eessi_task.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d2e1a7c7..1b17dee2 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -46,19 +46,8 @@ def from_string(cls, name, default=None, case_sensitive=False): except KeyError: return default -# try: -# log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will iterate over: %s", cls.__members__) -# to_return = next( -# member for member_name, member in cls.__members__.items() -# if member_name.lower() == name.lower() -# ) -# log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will return: %s", to_return) -# return to_return -# except StopIteration: -# return default - def __str__(self): - return self.name.lower() + return self.name.upper() class EESSITask: From c76b68b440cd6835ddec4d15c8abcffa2abf3840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 15:34:35 +0200 Subject: [PATCH 126/218] use lower state name to create handler name --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 1b17dee2..08c59948 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -547,7 +547,7 @@ def handle(self): state_before_handle = self.determine_state() # Construct handler method name - handler_name = f"_handle_{self.action}_{state_before_handle}" + handler_name = f"_handle_{self.action}_{state_before_handle.lower()}" # Check if the handler exists handler = getattr(self, handler_name, None) From ed75818b5a7629b55db5539fe8ecd1c42bc5fe94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 15:36:57 +0200 Subject: [PATCH 127/218] convert task state to str first --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 08c59948..7cf9689f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -547,7 +547,7 @@ def handle(self): state_before_handle = self.determine_state() # Construct handler method name - handler_name = f"_handle_{self.action}_{state_before_handle.lower()}" + handler_name = f"_handle_{self.action}_{str(state_before_handle).lower()}" # Check if the handler exists handler = getattr(self, handler_name, None) From bc61294744c86996ba5305185da81ca3ed5b9e0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 16:28:51 +0200 Subject: [PATCH 128/218] complete handler for state NEW_TASK --- scripts/automated_ingestion/eessi_task.py | 90 ++++++++++++----------- 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 
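Patches 124 to 127 together fix one round trip: the state name is stored upper-case in the TaskState file, parsed case-insensitively, and lowered again when the dispatcher builds the handler name. A self-contained sketch of that round trip (the `add` action string is a hypothetical stand-in for the task's action):

```python
# Round trip: file content -> TaskState -> handler method name.
from enum import Enum, auto

class TaskState(Enum):
    UNDETERMINED = auto()
    NEW_TASK = auto()

    @classmethod
    def from_string(cls, name, default=None):
        try:
            return cls[name.upper()]  # case-insensitive lookup
        except KeyError:
            return default

    def __str__(self):
        return self.name.upper()

state = TaskState.from_string("NEW_TASK\n".strip())  # as read from a TaskState file
handler_name = f"_handle_add_{str(state).lower()}"   # 'add' is a hypothetical action
assert handler_name == "_handle_add_new_task"
```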
7cf9689f..6d2f799c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -417,14 +417,15 @@ def _list_directory_contents(self, directory_path, branch=None): raise err @log_function_entry_exit() - def _next_state(self) -> TaskState: + def _next_state(self, state: TaskState = None) -> TaskState: """ Determine the next state based on the current state using the valid_transitions dictionary. NOTE, it assumes that function is only called for non-terminal states and that the next state is the first element of the list returned by the valid_transitions dictionary. """ - return self.valid_transitions[self.state][0] + the_state = state if state is not None else self.determine_state() + return self.valid_transitions[the_state][0] @log_function_entry_exit() def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: @@ -504,24 +505,22 @@ def determine_state(self) -> TaskState: """ Determine the state of the task based on the state of the staging repository. """ - # High-level logic: - # 1. Check if path representing the task file exists in the default branch + # check if path representing the task file exists in the default branch path_in_default_branch = self.description.task_object.remote_file_path default_branch = self.git_repo.default_branch if self._path_exists_in_branch(path_in_default_branch, branch=default_branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", path_in_default_branch) - # TODO: determine state - # - get state from task file in default branch - # - get target_dir from path_in_default_branch + # get state from task file in default branch + # - get target_dir from path_in_default_branch target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch) # read the TaskState file in target dir task_state_file_path = f"{target_dir}/TaskState" task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch) - # - if branch for sequence number exists, get state from task file in corresponding branch - # - branch name is of the form REPO-PR-SEQ - # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ - # - obtain repo, pr, seq from target dir + # if branch for sequence number exists, get state from task file in corresponding branch + # - branch name is of the form REPO-PR-SEQ + # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ + # - obtain repo, pr, seq from target dir org, repo, pr, seq, _ = target_dir.split('/') staging_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" if self._branch_exists(staging_branch_name): @@ -690,6 +689,31 @@ def _create_multi_file_commit(self, files_data, commit_message, branch=None): return new_commit + @log_function_entry_exit() + def _update_file(self, file_path, new_content, commit_message, branch=None): + try: + branch = self.git_repo.default_branch if branch is None else branch + + # Get the current file + file = self.git_repo.get_contents(file_path, ref=branch) + + # Update the file + result = self.git_repo.update_file( + path=file_path, + message=commit_message, + content=new_content, + sha=file.sha, + branch=branch + ) + + log_message(LoggingScope.TASK_OPS, 'INFO', + "File updated successfully. 
Commit SHA: %s", result['commit'].sha) + return result + + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error updating file: %s", err) + return None + @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" @@ -759,40 +783,24 @@ def _handle_add_new_task(self): payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) - # determine next state (NEXT_STATE), put metadata/task file into GH staging repo in main branch under directory - # REPO/PR_NUM/SEQ_NUM/task_file_name.NEXT_STATE + + # determine next state (NEXT_STATE), update TaskState file content next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + target_dir = self._read_target_dir_from_file(self.description.task_object.remote_file_path, + self.git_repo.default_branch) + task_state_file_path = f"{target_dir}/TaskState" + default_branch = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - repo_pr_dir = f"{repo_name}/{pr_number}" - sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) - if len(sequence_numbers) == 0: - sequence_number = 0 - else: - # we need to figure out the status of the last deployment (with the highest sequence number) - # if a PR exists and it is closed, we add the task to the *next* higher sequence number - # otherwise we add the task to the highest sequence number - sequence_number = self._find_highest_number(sequence_numbers.keys()) - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - if branch_name in [branch.name for branch in self.git_repo.get_branches()]: - # branch exists, check if PR exists - find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state='all')] - if find_pr: - pr = find_pr.pop(0) - if pr.state == 'closed': - sequence_number += 1 - # we use the basename of the remote file path for the task description file - task_file_name = self.description.get_task_file_name() - staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{task_file_name}.{next_state}" - log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) - # contents of task description / metadata file - contents = self.description.get_contents() - self.git_repo.create_file(staging_repo_path, - f"new task for {repo_name} PR {pr_number} seq {sequence_number}: add build for arch", - contents) - self.state = next_state - return True + seq_num = self._get_fixed_sequence_number() + commit_message = f"changing task state for repo {repo_name} PR {pr_number} seq {seq_num} to {next_state}" + self._update_file(task_state_file_path, + f"{next_state.name}\n", + commit_message, + branch=default_branch) + + return next_state @log_function_entry_exit() def _handle_add_payload_staged(self): From 2ac47b69ca13f8a8706b4a052ebf8b24d6c2f759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 16:34:31 +0200 Subject: [PATCH 129/218] add TODO about checking validity of sequence number and corresponding branch --- scripts/automated_ingestion/eessi_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6d2f799c..92ee380e 100644 --- 
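The `_update_file` helper follows the standard PyGithub update idiom: fetch the current file to obtain its blob SHA, then update it in a single commit. A minimal sketch with a hypothetical repository and path, assuming `PAYLOAD_STAGED` is the successor state being written:

```python
# Updating the TaskState file in place; repository and path are hypothetical.
from github import Github

repo = Github("<token>").get_repo("EESSI/staging")
path = "EESSI/software-layer/42/0/task.tar.gz.task/TaskState"

current = repo.get_contents(path, ref=repo.default_branch)  # needed for its blob SHA
repo.update_file(path=path,
                 message="change task state to PAYLOAD_STAGED",
                 content="PAYLOAD_STAGED\n",
                 sha=current.sha,
                 branch=repo.default_branch)
```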
a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -800,6 +800,8 @@ def _handle_add_new_task(self): commit_message, branch=default_branch) + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number + # is still open or yet to be created); if it is not valid, perform corrective actions return next_state @log_function_entry_exit() From 62bd006cae9e78d12bc0573d183dcf771d3aad80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 21:37:20 +0200 Subject: [PATCH 130/218] first part for handling task after payload got staged --- scripts/automated_ingestion/eessi_task.py | 126 +++++++++++----------- 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 92ee380e..868263eb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -13,6 +13,7 @@ from github import Github, GithubException, InputGitTreeElement, UnknownObjectException from github.PullRequest import PullRequest +from github.Branch import Branch class SequenceStatus(Enum): @@ -471,17 +472,21 @@ def _read_target_dir_from_file(self, path: str, branch: str = None) -> str: return config_dict.get('target_dir', None) @log_function_entry_exit() - def _branch_exists(self, branch_name: str) -> bool: + def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: """ - Check if a branch exists. + Get a branch object from its name. """ + if not branch_name: + return self.git_repo.default_branch + try: - self.git_repo.get_branch(branch_name) - return True + branch = self.git_repo.get_branch(branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s exists: %s", branch_name, branch) + return branch except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "error checking if branch %s exists: %s", branch_name, err) - return False + return None @log_function_entry_exit() def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState: @@ -523,7 +528,7 @@ def determine_state(self) -> TaskState: # - obtain repo, pr, seq from target dir org, repo, pr, seq, _ = target_dir.split('/') staging_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" - if self._branch_exists(staging_branch_name): + if self._get_branch_from_name(staging_branch_name): # read the TaskState file in staging branch task_state_staging_branch = self._read_task_state_from_file(task_state_file_path, staging_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "task state in staging branch %s: %s", @@ -767,18 +772,21 @@ def _handle_add_undetermined(self): def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" print("Handling ADD action in NEW_TASK state") - # Implementation for adding in NEW_TASK state: a task is only NEW_TASK if it was not processed yet + # get name of of payload from metadata payload_name = self.description.metadata['payload']['filename'] log_message(LoggingScope.TASK_OPS, 'INFO', "payload_name: %s", payload_name) + # get config and remote_client from self.description.task_object config = self.description.task_object.config remote_client = self.description.task_object.remote_client + # determine remote_file_path by replacing basename of remote_file_path in self.description.task_object # with payload_name description_remote_file_path = self.description.task_object.remote_file_path payload_remote_file_path = 
os.path.join(os.path.dirname(description_remote_file_path), payload_name) log_message(LoggingScope.TASK_OPS, 'INFO', "payload_remote_file_path: %s", payload_remote_file_path) + # initialize payload object payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) self.payload = EESSITaskPayload(payload_object) @@ -804,66 +812,60 @@ def _handle_add_new_task(self): # is still open or yet to be created); if it is not valid, perform corrective actions return next_state + @log_function_entry_exit() + def _determine_branch_name_from_sequence_number(self, sequence_number: int = None) -> str: + """Determine the branch name from the sequence number""" + sequence_number = self._get_fixed_sequence_number() if sequence_number is None else sequence_number + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + return f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + + @log_function_entry_exit() + def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: + """ + Find the single PR for the given branch in any state. + + Args: + repo: GitHub repository + branch_name: Name of the branch + + Returns: + PullRequest object if found, None otherwise + """ + try: + head_ref = f"{self.git_repo.owner.login}:{branch_name}" + prs = list(self.git_repo.get_pulls(state='all', head=head_ref)) + return prs[0] if prs else None + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) + return None + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" print("Handling ADD action in PAYLOAD_STAGED state") - # Implementation for adding in PAYLOAD_STAGED state - # - create or find PR - # - update PR contents - # determine PR - # - no PR -> create one - # - PR && closed -> create one (may require to move task file to different sequence number) - # - PR && open -> update PR contents, task file status, etc - # TODO: determine sequence number, then use it to find staging pr - # find staging PR - sequence_number = self._get_sequence_number_for_task_file() - staging_pr, staging_branch = self._find_staging_pr(sequence_number) - # create PR if necessary - if staging_pr is None and sequence_number is None: - # no PR found, create one - staging_pr, staging_branch = self._create_staging_pr(sequence_number) - elif staging_pr is None and sequence_number is not None: - # no PR found, create one - staging_pr, staging_branch = self._create_staging_pr(sequence_number) - elif staging_pr.state == 'closed': - # PR closed, create new one - staging_pr, staging_branch = self._create_staging_pr(sequence_number + 1) - if staging_pr is None: - # something went wrong, we cannot continue - log_message(LoggingScope.ERROR, 'ERROR', "no staging PR found for task %s", self.description) - return False - # update PR contents - self._update_pr_contents(staging_pr) - # update task file status - self._update_task_file_status(staging_branch) - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - # current sequence - sequence_number = self._get_current_sequence_number() - sequence_status = self._determine_sequence_status(sequence_number) - if sequence_status == SequenceStatus.FINISHED: - sequence_number += 1 - # re-determine sequence status - sequence_status = self._determine_sequence_status(sequence_number) - if sequence_status == SequenceStatus.DOES_NOT_EXIST: - # something is 
odd, the task file should already be in the default branch - log_message(LoggingScope.ERROR, 'ERROR', "sequence number %s does not exist", sequence_number) - return False - elif sequence_status == SequenceStatus.FINISHED: - # we need to figure out the status of the last deployment (with the highest sequence number) - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s exists", branch_name) - # check if branch exists - # - yes: check if corresponding PR exists - # - yes: check status of PR - # - open: rename file and add it to branch, set state, update PR contents, return - # - closed && !merged: rename file to rejected, set state - # - else: weird state, log message, return - # - no: delete branch - # create new branch, add task file to branch, set state, create PR, update PR contents, return - return True + branch_name = self._determine_branch_name_from_sequence_number() + branch = self._get_branch_from_name(branch_name) + if not branch: + # branch for sequence number does not exist + # TODO: could have been merged already --> check if sequence directory exists + # ASSUME: it has not existed before --> create it + branch = self.git_repo.create_git_ref(f"refs/heads/{branch_name}", self.git_repo.default_branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s created: %s", branch_name, branch) + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "found existing branch for %s: %s", branch_name, branch) + + pr = self._find_pr_for_branch(branch_name) + if not pr: + log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", branch_name) + # TODO: create PR + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "found existing PR for branch %s: %s", branch_name, pr) + # TODO: check if PR is open or closed + # TODO: if closed, create issue (PR already closed) + + return TaskState.PAYLOAD_STAGED @log_function_entry_exit() def _handle_add_pull_request(self): From 16f0a304300a3c880aca2283996c8f9fa2a00d3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 22:09:59 +0200 Subject: [PATCH 131/218] use sha for creating branch + make variable less ambiguous --- scripts/automated_ingestion/eessi_task.py | 105 +++++++++++----------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 868263eb..0080a48e 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -98,44 +98,44 @@ def _determine_task_action(self) -> EESSITaskAction: return EESSITaskAction.UNKNOWN @log_function_entry_exit() - def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch=None) -> bool: + def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch_name: str = None) -> bool: """ Check if a file exists in a repository branch. 
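One detail worth flagging in `_handle_add_payload_staged` above: `create_git_ref()` is called with the default branch name as its second argument, but the Git Data API expects a commit SHA there; the next patch's title, "use sha for creating branch", points at exactly that. A sketch of the corrected call, with hypothetical names:

```python
# Creating a staging branch from the default branch head; names are hypothetical.
from github import Github

repo = Github("<token>").get_repo("EESSI/staging")
default = repo.get_branch(repo.default_branch)
repo.create_git_ref(ref="refs/heads/EESSI-software-layer-PR-42-SEQ-0",
                    sha=default.commit.sha)  # SHA of the default branch head, not its name
```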
Args: file_path_prefix: the prefix of the file path - branch: the branch to check + branch_name: the branch to check Returns: True if a file with the prefix exists in the branch, False otherwise """ - if branch is None: - branch = self.git_repo.default_branch + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + # branch = self._get_branch_from_name(branch_name) try: # get all files in directory part of file_path_prefix directory_part = os.path.dirname(file_path_prefix) - files = self.git_repo.get_contents(directory_part, ref=branch) + files = self.git_repo.get_contents(directory_part, ref=branch_name) log_msg = "Found files %s in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch_name) # check if any of the files has file_path_prefix as prefix for file in files: if file.path.startswith(file_path_prefix): log_msg = "Found file %s in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch_name) return True log_msg = "No file with prefix %s found in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch_name) return False except UnknownObjectException: # file_path does not exist in branch log_msg = "Directory %s or file with prefix %s does not exist in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) return False except GithubException as err: if err.status == 404: # file_path does not exist in branch log_msg = "Directory %s or file with prefix %s does not exist in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) return False else: # if there was some other (e.g. 
connection) issue, log message and return False @@ -317,9 +317,10 @@ def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + default_branch_name = self.git_repo.default_branch pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - head=branch_name, base=self.git_repo.default_branch) + head=branch_name, base=default_branch_name) return pr, branch_name @log_function_entry_exit() @@ -399,12 +400,13 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s return state @log_function_entry_exit() - def _list_directory_contents(self, directory_path, branch=None): + def _list_directory_contents(self, directory_path, branch_name: str = None): try: # Get contents of the directory - branch = self.git_repo.default_branch if branch is None else branch - log_message(LoggingScope.TASK_OPS, 'INFO', "listing contents of %s in branch %s", directory_path, branch) - contents = self.git_repo.get_contents(directory_path, ref=branch) + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + log_message(LoggingScope.TASK_OPS, 'INFO', + "listing contents of %s in branch %s", directory_path, branch_name) + contents = self.git_repo.get_contents(directory_path, ref=branch_name) # If contents is a list, it means we successfully got directory contents if isinstance(contents, list): @@ -429,13 +431,13 @@ def _next_state(self, state: TaskState = None) -> TaskState: return self.valid_transitions[the_state][0] @log_function_entry_exit() - def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: + def _path_exists_in_branch(self, path: str, branch_name: str = None) -> bool: """ Check if a path exists in a branch. """ - branch = self.git_repo.default_branch if branch is None else branch + branch_name = self.git_repo.default_branch if branch_name is None else branch_name try: - self.git_repo.get_contents(path, ref=branch) + self.git_repo.get_contents(path, ref=branch_name) return True except GithubException as err: if err.status == 404: @@ -456,12 +458,12 @@ def _read_dict_from_string(self, content: str) -> dict: return config_dict @log_function_entry_exit() - def _read_target_dir_from_file(self, path: str, branch: str = None) -> str: + def _read_target_dir_from_file(self, path: str, branch_name: str = None) -> str: """ Read the target directory from the file in the given branch. """ - branch = self.git_repo.default_branch if branch is None else branch - content = self.git_repo.get_contents(path, ref=branch) + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + content = self.git_repo.get_contents(path, ref=branch_name) # Decode the content from base64 content_str = content.decoded_content.decode('utf-8') @@ -476,8 +478,7 @@ def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: """ Get a branch object from its name. 
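The lookup-or-None pattern used here, sketched independently of the class (again assuming a PyGithub `Repository` object `repo`):

```python
# Sketch: resolve a branch name to a Branch object, mapping 404 to None.
from typing import Optional

from github import GithubException
from github.Branch import Branch


def get_branch_or_none(repo, branch_name: str) -> Optional[Branch]:
    try:
        return repo.get_branch(branch_name)
    except GithubException as err:
        if err.status == 404:  # branch does not exist
            return None
        raise  # propagate other errors (e.g. connection issues)
```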
""" - if not branch_name: - return self.git_repo.default_branch + branch_name = self.git_repo.default_branch if branch_name is None else branch_name try: branch = self.git_repo.get_branch(branch_name) @@ -489,12 +490,12 @@ def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: return None @log_function_entry_exit() - def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState: + def _read_task_state_from_file(self, path: str, branch_name: str = None) -> TaskState: """ Read the task state from the file in the given branch. """ - branch = self.git_repo.default_branch if branch is None else branch - content = self.git_repo.get_contents(path, ref=branch) + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + content = self.git_repo.get_contents(path, ref=branch_name) # Decode the content from base64 content_str = content.decoded_content.decode('utf-8').strip() @@ -512,16 +513,16 @@ def determine_state(self) -> TaskState: """ # check if path representing the task file exists in the default branch path_in_default_branch = self.description.task_object.remote_file_path - default_branch = self.git_repo.default_branch - if self._path_exists_in_branch(path_in_default_branch, branch=default_branch): + default_branch_name = self.git_repo.default_branch + if self._path_exists_in_branch(path_in_default_branch, branch_name=default_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", path_in_default_branch) # get state from task file in default branch # - get target_dir from path_in_default_branch - target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch) + target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch_name) # read the TaskState file in target dir task_state_file_path = f"{target_dir}/TaskState" - task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch) + task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch_name) # if branch for sequence number exists, get state from task file in corresponding branch # - branch name is of the form REPO-PR-SEQ # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ @@ -568,11 +569,11 @@ def handle(self): # Implement handlers for ADD action @log_function_entry_exit() - def _create_symlink(self, source_path: str, target_path: str, branch: str = None): + def _create_symlink(self, source_path: str, target_path: str, branch_name: str = None): """Create a symlink in the given branch.""" try: - branch = self.git_repo.default_branch if branch is None else branch - ref = self.git_repo.get_git_ref(f"heads/{branch}") + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + ref = self.git_repo.get_git_ref(f"heads/{branch_name}") commit = self.git_repo.get_git_commit(ref.object.sha) base_tree = self.git_repo.get_git_tree(commit.tree.sha) @@ -625,22 +626,22 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None return False @log_function_entry_exit() - def _safe_create_file(self, path: str, message: str, content: str, branch: str = None): + def _safe_create_file(self, path: str, message: str, content: str, branch_name: str = None): """Create a file in the given branch.""" try: - branch = self.git_repo.default_branch if branch is None else branch - existing_file = self.git_repo.get_contents(path, ref=branch) + branch_name = self.git_repo.default_branch if branch_name 
is None else branch_name + existing_file = self.git_repo.get_contents(path, ref=branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "File %s already exists", path) return existing_file except GithubException as err: if err.status == 404: # File doesn't exist # Safe to create - return self.git_repo.create_file(path, message, content, branch=branch) + return self.git_repo.create_file(path, message, content, branch=branch_name) else: raise err # Some other error @log_function_entry_exit() - def _create_multi_file_commit(self, files_data, commit_message, branch=None): + def _create_multi_file_commit(self, files_data, commit_message, branch_name: str = None): """ Create a commit with multiple file changes @@ -656,8 +657,8 @@ def _create_multi_file_commit(self, files_data, commit_message, branch=None): } } """ - branch = self.git_repo.default_branch if branch is None else branch - ref = self.git_repo.get_git_ref(f"heads/{branch}") + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + ref = self.git_repo.get_git_ref(f"heads/{branch_name}") current_commit = self.git_repo.get_git_commit(ref.object.sha) base_tree = current_commit.tree @@ -695,12 +696,12 @@ def _create_multi_file_commit(self, files_data, commit_message, branch=None): return new_commit @log_function_entry_exit() - def _update_file(self, file_path, new_content, commit_message, branch=None): + def _update_file(self, file_path, new_content, commit_message, branch_name: str = None): try: - branch = self.git_repo.default_branch if branch is None else branch + branch_name = self.git_repo.default_branch if branch_name is None else branch_name # Get the current file - file = self.git_repo.get_contents(file_path, ref=branch) + file = self.git_repo.get_contents(file_path, ref=branch_name) # Update the file result = self.git_repo.update_file( @@ -708,7 +709,7 @@ def _update_file(self, file_path, new_content, commit_message, branch=None): message=commit_message, content=new_content, sha=file.sha, - branch=branch + branch=branch_name ) log_message(LoggingScope.TASK_OPS, 'INFO', @@ -727,7 +728,7 @@ def _handle_add_undetermined(self): # create task file in target directory (TARGET_DIR/TaskDescription) # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) # create pointer file from task file path to target directory (remote_file_path -> TARGET_DIR) - branch = self.git_repo.default_branch + branch_name = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR @@ -756,7 +757,7 @@ def _handle_add_undetermined(self): commit = self._create_multi_file_commit( files_to_commit, f"new task for {repo_name} PR {pr_number} seq {sequence_number}", - branch=branch + branch_name=branch_name ) log_message(LoggingScope.TASK_OPS, 'INFO', "commit created: %s", commit) except Exception as err: @@ -795,10 +796,10 @@ def _handle_add_new_task(self): # determine next state (NEXT_STATE), update TaskState file content next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + default_branch_name = self.git_repo.default_branch target_dir = self._read_target_dir_from_file(self.description.task_object.remote_file_path, - self.git_repo.default_branch) + default_branch_name) task_state_file_path = f"{target_dir}/TaskState" - default_branch = self.git_repo.default_branch repo_name = 
self.description.get_repo_name() pr_number = self.description.get_pr_number() seq_num = self._get_fixed_sequence_number() @@ -806,7 +807,7 @@ def _handle_add_new_task(self): self._update_file(task_state_file_path, f"{next_state.name}\n", commit_message, - branch=default_branch) + branch_name=default_branch_name) # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number # is still open or yet to be created); if it is not valid, perform corrective actions @@ -847,11 +848,15 @@ def _handle_add_payload_staged(self): branch_name = self._determine_branch_name_from_sequence_number() branch = self._get_branch_from_name(branch_name) + default_branch_name = self.git_repo.default_branch + default_branch = self._get_branch_from_name(default_branch_name) + default_sha = default_branch.commit.sha if not branch: # branch for sequence number does not exist # TODO: could have been merged already --> check if sequence directory exists # ASSUME: it has not existed before --> create it - branch = self.git_repo.create_git_ref(f"refs/heads/{branch_name}", self.git_repo.default_branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s does not exist, creating it", branch_name) + branch = self.git_repo.create_git_ref(f"refs/heads/{branch_name}", default_sha) log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s created: %s", branch_name, branch) else: log_message(LoggingScope.TASK_OPS, 'INFO', "found existing branch for %s: %s", branch_name, branch) From 2b0a19109cce1579a503d3c4c1c5a41c6b4d3546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 17:59:34 +0200 Subject: [PATCH 132/218] simplify determination of state and obtain it from feature branch if it exists --- scripts/automated_ingestion/eessi_task.py | 50 +++++++++++------------ 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0080a48e..7efa928b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -512,36 +512,34 @@ def determine_state(self) -> TaskState: Determine the state of the task based on the state of the staging repository. 
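For illustration, the branch naming scheme this state lookup relies on can be sketched as follows (assuming the target directory follows the `org/repo/pr/seq/task_file` layout described in the surrounding comments; the example values are made up):

```python
# Sketch of the branch name derivation; the path layout is an assumption
# mirroring how the surrounding code splits target_dir on '/'.
def feature_branch_name(target_dir: str) -> str:
    org, repo, pr, seq, _ = target_dir.split('/')
    return f"{org}-{repo}-PR-{pr}-SEQ-{seq}"


# hypothetical example:
# feature_branch_name("EESSI/software-layer/42/3/example.task")
# -> "EESSI-software-layer-PR-42-SEQ-3"
```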
""" # check if path representing the task file exists in the default branch - path_in_default_branch = self.description.task_object.remote_file_path - default_branch_name = self.git_repo.default_branch - if self._path_exists_in_branch(path_in_default_branch, branch_name=default_branch_name): + # (name of task pointer file is the same in both the default branch and the "feature" branch) + task_pointer_file = self.description.task_object.remote_file_path + branch_to_use = self.git_repo.default_branch + + if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", - path_in_default_branch) - # get state from task file in default branch - # - get target_dir from path_in_default_branch - target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch_name) - # read the TaskState file in target dir - task_state_file_path = f"{target_dir}/TaskState" - task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch_name) - # if branch for sequence number exists, get state from task file in corresponding branch - # - branch name is of the form REPO-PR-SEQ - # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ - # - obtain repo, pr, seq from target dir + task_pointer_file) + + # determine if there is a "feature" branch for the sequence number + # - read target dir from task pointer file in default branch + # - construct feature branch name from target dir + target_dir = self._read_target_dir_from_file(task_pointer_file, branch_to_use) org, repo, pr, seq, _ = target_dir.split('/') - staging_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" - if self._get_branch_from_name(staging_branch_name): - # read the TaskState file in staging branch - task_state_staging_branch = self._read_task_state_from_file(task_state_file_path, staging_branch_name) - log_message(LoggingScope.TASK_OPS, 'INFO', "task state in staging branch %s: %s", - staging_branch_name, task_state_staging_branch) - return task_state_staging_branch - else: - log_message(LoggingScope.TASK_OPS, 'INFO', "task state in default branch: %s", - task_state_default_branch) - return task_state_default_branch + feature_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + if self._get_branch_from_name(feature_branch_name): + branch_to_use = feature_branch_name + + # get state from task file in branch to use (default or feature) + # - read the TaskState file in target dir + task_state_file_path = f"{target_dir}/TaskState" + task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) + + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in %s branch: %s", + branch_to_use, task_state) + return task_state else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", - path_in_default_branch) + task_pointer_file) return TaskState.UNDETERMINED @log_function_entry_exit() From 4d1db2e64bbab5f3ffe04c19c84ae18cdc567288 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 18:12:36 +0200 Subject: [PATCH 133/218] small improvements to handler for new_task --- scripts/automated_ingestion/eessi_task.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7efa928b..c33e74f9 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -771,6 +771,9 @@ def 
_handle_add_undetermined(self): def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" print("Handling ADD action in NEW_TASK state") + # determine next state + next_state = self._next_state(TaskState.NEW_TASK) + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) # get name of payload from metadata payload_name = self.description.metadata['payload']['filename'] @@ -791,9 +794,7 @@ def _handle_add_new_task(self): self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) - # determine next state (NEXT_STATE), update TaskState file content - next_state = self._next_state() - log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + # update TaskState file content default_branch_name = self.git_repo.default_branch target_dir = self._read_target_dir_from_file(self.description.task_object.remote_file_path, default_branch_name) From 7eed7decbd56067df38f8605a6ce14e06cf1bda4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 19:33:22 +0200 Subject: [PATCH 134/218] only use specific branch for determining state, and default to main branch --- scripts/automated_ingestion/eessi_task.py | 29 ++++++++--------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index c33e74f9..c88fe526 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -507,39 +507,30 @@ def _read_task_state_from_file(self, path: str, branch_name: str = None) -> Task State: @log_function_entry_exit() - def determine_state(self) -> TaskState: + def determine_state(self, branch: str = None) -> TaskState: """ Determine the state of the task based on the state of the staging repository. 
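Reduced to its essentials, the lookup chain is roughly the following sketch (assumptions: a PyGithub `Repository` object `repo`, and a task pointer file that stores just the target directory path, which the real `_read_target_dir_from_file` abstracts):

```python
# Sketch of the state lookup: pointer file -> target dir -> TaskState file.
# Assumes a PyGithub Repository object repo; returns the state name as text.
def read_task_state(repo, pointer_file_path: str, branch_name: str = None) -> str:
    branch_name = repo.default_branch if branch_name is None else branch_name
    pointer = repo.get_contents(pointer_file_path, ref=branch_name)
    target_dir = pointer.decoded_content.decode('utf-8').strip()
    state_file = repo.get_contents(f"{target_dir}/TaskState", ref=branch_name)
    return state_file.decoded_content.decode('utf-8').strip()
```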
""" - # check if path representing the task file exists in the default branch - # (name of task pointer file is the same in both the default branch and the "feature" branch) + # check if path representing the task file exists in the default branch or the "feature" branch task_pointer_file = self.description.task_object.remote_file_path - branch_to_use = self.git_repo.default_branch + branch_to_use = self.git_repo.default_branch if branch is None else branch if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", - task_pointer_file) + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", + task_pointer_file, branch_to_use) - # determine if there is a "feature" branch for the sequence number - # - read target dir from task pointer file in default branch - # - construct feature branch name from target dir - target_dir = self._read_target_dir_from_file(task_pointer_file, branch_to_use) - org, repo, pr, seq, _ = target_dir.split('/') - feature_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" - if self._get_branch_from_name(feature_branch_name): - branch_to_use = feature_branch_name - - # get state from task file in branch to use (default or feature) + # get state from task file in branch to use # - read the TaskState file in target dir + target_dir = self._read_target_dir_from_file(task_pointer_file, branch_to_use) task_state_file_path = f"{target_dir}/TaskState" task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) - log_message(LoggingScope.TASK_OPS, 'INFO', "task state in %s branch: %s", + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch %s: %s", branch_to_use, task_state) return task_state else: - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", - task_pointer_file) + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in branch %s", + task_pointer_file, branch_to_use) return TaskState.UNDETERMINED @log_function_entry_exit() From 58c7bdc112978dc05ec299705f6d438868292920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 20:31:19 +0200 Subject: [PATCH 135/218] use different method to determine feature branch name --- scripts/automated_ingestion/eessi_task.py | 47 +++++++++++++++-------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index c88fe526..0dcdb680 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -831,32 +831,47 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) return None + @log_function_entry_exit() + def _determine_feature_branch_name(self) -> str: + """Determine the feature branch name from the target directory name""" + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # target_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + org, repo, pr, seq, _ = target_dir.split('/') + return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" print("Handling ADD action in PAYLOAD_STAGED state") - 
branch_name = self._determine_branch_name_from_sequence_number() - branch = self._get_branch_from_name(branch_name) - default_branch_name = self.git_repo.default_branch - default_branch = self._get_branch_from_name(default_branch_name) - default_sha = default_branch.commit.sha - if not branch: - # branch for sequence number does not exist - # TODO: could have been merged already --> check if sequence directory exists + feature_branch_name = self._determine_feature_branch_name() + feature_branch = self._get_branch_from_name(feature_branch_name) + if not feature_branch: + # feature branch does not exist + # TODO: could have been merged already --> check if PR corresponding to the feature branch exists # ASSUME: it has not existed before --> create it - log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s does not exist, creating it", branch_name) - branch = self.git_repo.create_git_ref(f"refs/heads/{branch_name}", default_sha) - log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s created: %s", branch_name, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', + "branch %s does not exist, creating it", feature_branch_name) + + default_branch_name = self.git_repo.default_branch + default_branch = self._get_branch_from_name(default_branch_name) + default_sha = default_branch.commit.sha + feature_branch = self.git_repo.create_git_ref(f"refs/heads/{feature_branch_name}", default_sha) + log_message(LoggingScope.TASK_OPS, 'INFO', + "branch %s created: %s", feature_branch_name, feature_branch) else: - log_message(LoggingScope.TASK_OPS, 'INFO', "found existing branch for %s: %s", branch_name, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', + "found existing branch for %s: %s", feature_branch_name, feature_branch) - pr = self._find_pr_for_branch(branch_name) - if not pr: - log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", branch_name) + pull_request = self._find_pr_for_branch(feature_branch_name) + if not pull_request: + log_message(LoggingScope.TASK_OPS, 'INFO', + "no PR found for branch %s", feature_branch_name) # TODO: create PR else: - log_message(LoggingScope.TASK_OPS, 'INFO', "found existing PR for branch %s: %s", branch_name, pr) + log_message(LoggingScope.TASK_OPS, 'INFO', + "found existing PR for branch %s: %s", feature_branch_name, pull_request) # TODO: check if PR is open or closed # TODO: if closed, create issue (PR already closed) From f4bf9177012dc43c4e5eb1406a627f3a3c3a5aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 22:12:45 +0200 Subject: [PATCH 136/218] create first version of PR plus some related improvements --- .../automated_ingestion.py | 2 + scripts/automated_ingestion/eessi_task.py | 93 ++++++++++++++----- 2 files changed, 73 insertions(+), 22 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index e0ca710b..0ebce03e 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -257,6 +257,8 @@ def main(): EESSITaskDescription( EESSIDataAndSignatureObject(config, task_path, s3_bucket) ), + config, + cvmfs_repo, gh_staging_repo ) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0dcdb680..165507ed 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -55,12 +55,14 @@ class EESSITask: description: EESSITaskDescription payload: EESSITaskPayload action: 
EESSITaskAction - state: TaskState git_repo: Github + config: Dict @log_function_entry_exit() - def __init__(self, description: EESSITaskDescription, git_repo: Github): + def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: str, git_repo: Github): self.description = description + self.config = config + self.cvmfs_repo = cvmfs_repo self.git_repo = git_repo self.action = self._determine_task_action() @@ -685,7 +687,7 @@ def _create_multi_file_commit(self, files_data, commit_message, branch_name: str return new_commit @log_function_entry_exit() - def _update_file(self, file_path, new_content, commit_message, branch_name: str = None): + def _update_file(self, file_path, new_content, commit_message, branch_name: str = None) -> Optional[Dict]: try: branch_name = self.git_repo.default_branch if branch_name is None else branch_name @@ -758,6 +760,22 @@ def _handle_add_undetermined(self): # is still open or yet to be created); if it is not valid, perform corrective actions return TaskState.NEW_TASK + @log_function_entry_exit() + def _update_task_state_file(self, next_state: TaskState, branch_name: str = None) -> Optional[Dict]: + """Update the TaskState file content in default or given branch""" + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, branch_name) + task_state_file_path = f"{target_dir}/TaskState" + _, repo, pr, seq, _ = target_dir.split('/') + commit_message = f"changing task state for repo {repo} PR {pr} seq {seq} to {next_state}" + result = self._update_file(task_state_file_path, + f"{next_state.name}\n", + commit_message, + branch_name=branch_name) + return result + @log_function_entry_exit() def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" @@ -786,18 +804,7 @@ def _handle_add_new_task(self): log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) # update TaskState file content - default_branch_name = self.git_repo.default_branch - target_dir = self._read_target_dir_from_file(self.description.task_object.remote_file_path, - default_branch_name) - task_state_file_path = f"{target_dir}/TaskState" - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - seq_num = self._get_fixed_sequence_number() - commit_message = f"changing task state for repo {repo_name} PR {pr_number} seq {seq_num} to {next_state}" - self._update_file(task_state_file_path, - f"{next_state.name}\n", - commit_message, - branch_name=default_branch_name) + self._update_task_state_file(next_state) # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number # is still open or yet to be created); if it is not valid, perform corrective actions @@ -831,6 +838,15 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) return None + @log_function_entry_exit() + def _determine_sequence_number(self) -> int: + """Determine the sequence number from the target directory name""" + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # target_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + _, _, _, seq, _ = target_dir.split('/') + return 
int(seq) + @log_function_entry_exit() def _determine_feature_branch_name(self) -> str: """Determine the feature branch name from the target directory name""" @@ -844,7 +860,12 @@ def _determine_feature_branch_name(self) -> str: def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" print("Handling ADD action in PAYLOAD_STAGED state") + next_state = self._next_state(TaskState.PAYLOAD_STAGED) + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + default_branch_name = self.git_repo.default_branch + default_branch = self._get_branch_from_name(default_branch_name) + default_sha = default_branch.commit.sha feature_branch_name = self._determine_feature_branch_name() feature_branch = self._get_branch_from_name(feature_branch_name) if not feature_branch: @@ -854,9 +875,6 @@ def _handle_add_payload_staged(self): log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s does not exist, creating it", feature_branch_name) - default_branch_name = self.git_repo.default_branch - default_branch = self._get_branch_from_name(default_branch_name) - default_sha = default_branch.commit.sha feature_branch = self.git_repo.create_git_ref(f"refs/heads/{feature_branch_name}", default_sha) log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s created: %s", feature_branch_name, feature_branch) @@ -868,21 +886,52 @@ def _handle_add_payload_staged(self): if not pull_request: log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", feature_branch_name) - # TODO: create PR + # update TaskState file content in feature branch + self._update_task_state_file(next_state, branch_name=feature_branch_name) + # create PR + pr_title_format = self.config['github']['grouped_pr_title'] + pr_body_format = self.config['github']['grouped_pr_body'] + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number() + pr_title = pr_title_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + repo=repo_name, + seq_num=seq_num, + ) + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents="TO BE DONE", + analysis="TO BE DONE", + action="TO BE DONE", + ) + pr = self.git_repo.create_pull( + title=pr_title, + body=pr_body, + head=feature_branch_name, + base=default_branch_name + ) + log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) + return TaskState.PULL_REQUEST else: log_message(LoggingScope.TASK_OPS, 'INFO', "found existing PR for branch %s: %s", feature_branch_name, pull_request) # TODO: check if PR is open or closed # TODO: if closed, create issue (PR already closed) - - return TaskState.PAYLOAD_STAGED + return TaskState.PULL_REQUEST @log_function_entry_exit() def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state - return True + return TaskState.PULL_REQUEST @log_function_entry_exit() def _handle_add_approved(self): From 9fda61b4bec6b50cfcde40fccb096e44babf9d16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 22:34:21 +0200 Subject: [PATCH 137/218] filter some PRs, just during development --- scripts/automated_ingestion/eessi_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py 
b/scripts/automated_ingestion/eessi_task.py index 165507ed..256f0369 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,9 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - prs = list(self.git_repo.get_pulls(state='all', head=head_ref)) + filter_prs = [16] # TODO: remove this once the PR is merged + prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) + if pr.number not in filter_prs] return prs[0] if prs else None except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) From 4686e0c5f04cce89cf3a2a099de9969e7065a13e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:16:58 +0200 Subject: [PATCH 138/218] revise state updates to reflect current and future states --- scripts/automated_ingestion/eessi_task.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 256f0369..16cef73d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -863,7 +863,8 @@ def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" print("Handling ADD action in PAYLOAD_STAGED state") next_state = self._next_state(TaskState.PAYLOAD_STAGED) - log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + approved_state = TaskState.APPROVED + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s, approved_state: %s", next_state, approved_state) default_branch_name = self.git_repo.default_branch default_branch = self._get_branch_from_name(default_branch_name) @@ -888,8 +889,15 @@ def _handle_add_payload_staged(self): if not pull_request: log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", feature_branch_name) - # update TaskState file content in feature branch - self._update_task_state_file(next_state, branch_name=feature_branch_name) + # update TaskState file content + # - next state in default branch (interpreted as current state) + # - approved state in feature branch (interpreted as future state, ie, after the PR is merged) + self._update_task_state_file(next_state, branch_name=default_branch_name) + self._update_task_state_file(approved_state, branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', + "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", + next_state, default_branch_name, approved_state, feature_branch_name) + # create PR pr_title_format = self.config['github']['grouped_pr_title'] pr_body_format = self.config['github']['grouped_pr_body'] From d4d08cfbe115d484daf1bd4166d7247211e89766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:19:31 +0200 Subject: [PATCH 139/218] filter one more PR --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 16cef73d..98dc989f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16] # 
TODO: remove this once the PR is merged + filter_prs = [16, 17] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None From 2c73ee9a90557b74cd929019ea77dc1937bdaae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:44:42 +0200 Subject: [PATCH 140/218] alternative method to update taskstate file --- scripts/automated_ingestion/eessi_task.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 98dc989f..64f8b396 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -893,7 +893,10 @@ def _handle_add_payload_staged(self): # - next state in default branch (interpreted as current state) # - approved state in feature branch (interpreted as future state, ie, after the PR is merged) self._update_task_state_file(next_state, branch_name=default_branch_name) - self._update_task_state_file(approved_state, branch_name=feature_branch_name) + # try to first update the task state file in the feature branch to + # next state (attempt to avoid merge conflicts) + self._update_task_state_file(next_state, branch_name=feature_branch_name) + # self._update_task_state_file(approved_state, branch_name=feature_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", next_state, default_branch_name, approved_state, feature_branch_name) From d5cd773c2fac8c9c5c958a567509748e8f946249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:45:29 +0200 Subject: [PATCH 141/218] also filter PR 18 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 64f8b396..f40ea04b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None From 902bae2276274ac2d161c4b7fe4b3c6c712bfe7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:51:44 +0200 Subject: [PATCH 142/218] alternative method to update taskstate file --- scripts/automated_ingestion/eessi_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index f40ea04b..0c33f47a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17, 18] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18, 19] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in 
filter_prs] return prs[0] if prs else None @@ -896,7 +896,7 @@ def _handle_add_payload_staged(self): # try to first update the task state file in the feature branch to # next state (attempt to avoid merge conflicts) self._update_task_state_file(next_state, branch_name=feature_branch_name) - # self._update_task_state_file(approved_state, branch_name=feature_branch_name) + self._update_task_state_file(approved_state, branch_name=feature_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", next_state, default_branch_name, approved_state, feature_branch_name) From 35616699c077a3fbcba24a4721b8bdfa5aa8cc09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 00:11:19 +0200 Subject: [PATCH 143/218] another attempt to avoid merge conflict --- scripts/automated_ingestion/eessi_task.py | 40 ++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0c33f47a..0d6e2c64 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17, 18, 19] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18, 19, 20] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None @@ -858,6 +858,39 @@ def _determine_feature_branch_name(self) -> str: org, repo, pr, seq, _ = target_dir.split('/') return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + @log_function_entry_exit() + def _sync_task_state_file(self, source_branch: str, target_branch: str): + """Update task state file from source to target branch""" + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) + task_state_file_path = f"{target_dir}/TaskState" + + try: + # Get content from source branch + source_content = self.git_repo.get_contents(task_state_file_path, ref=source_branch) + + # Get current file in target branch + target_file = self.git_repo.get_contents(task_state_file_path, ref=target_branch) + + # Update if content is different + if source_content.sha != target_file.sha: + result = self.git_repo.update_file( + path=task_state_file_path, + message=f"Sync {task_state_file_path} from {source_branch} to {target_branch}", + content=source_content.decoded_content, + sha=target_file.sha, + branch=target_branch + ) + log_message(LoggingScope.TASK_OPS, 'INFO', "Updated %s", task_state_file_path) + return result + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "No changes needed for %s", task_state_file_path) + return None + + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error syncing task state file: %s", err) + return None + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" @@ -893,9 +926,8 @@ def _handle_add_payload_staged(self): # - next state in default branch (interpreted as current state) # - approved state in feature branch (interpreted as future state, ie, after the PR is merged) self._update_task_state_file(next_state, branch_name=default_branch_name) - # try to first 
update the task state file in the feature branch to - # next state (attempt to avoid merge conflicts) - self._update_task_state_file(next_state, branch_name=feature_branch_name) + # sync task state file from default to feature branch (attempt to avoid merge conflicts) + self._sync_task_state_file(default_branch_name, feature_branch_name) self._update_task_state_file(approved_state, branch_name=feature_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", next_state, default_branch_name, approved_state, feature_branch_name) From 61fb94459b9f9eddd381acc6e09c2649adabfdec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 00:30:35 +0200 Subject: [PATCH 144/218] yet another attempt at avoiding merge conflict --- scripts/automated_ingestion/eessi_task.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0d6e2c64..6ddfb5b2 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17, 18, 19, 20] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18, 19, 20, 21] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None @@ -926,8 +926,13 @@ def _handle_add_payload_staged(self): # - next state in default branch (interpreted as current state) # - approved state in feature branch (interpreted as future state, ie, after the PR is merged) self._update_task_state_file(next_state, branch_name=default_branch_name) - # sync task state file from default to feature branch (attempt to avoid merge conflicts) - self._sync_task_state_file(default_branch_name, feature_branch_name) + # merge default branch into feature branch (attempt to avoid merge conflicts) + self.git_repo.merge( + head=default_branch_name, + base=feature_branch_name, + commit_message=f"Merge {default_branch_name} into {feature_branch_name}" + ) + # update task state file in feature branch self._update_task_state_file(approved_state, branch_name=feature_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", From 2cae1504407c7dd5f4e01fcdabd2c6867d4fe8d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 08:57:09 +0200 Subject: [PATCH 145/218] improve readability of code --- scripts/automated_ingestion/eessi_task.py | 132 ++++++++++++++-------- 1 file changed, 86 insertions(+), 46 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6ddfb5b2..da361bbf 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17, 18, 19, 20, 21] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18, 19, 20, 21, 22] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None @@ -891,6 +891,85 @@ 
def _sync_task_state_file(self, source_branch: str, target_branch: str): log_message(LoggingScope.TASK_OPS, 'ERROR', "Error syncing task state file: %s", err) return None + @log_function_entry_exit() + def _update_task_states(self, next_state: TaskState, default_branch_name: str, + approved_state: TaskState, feature_branch_name: str): + """ + Update task states in default and feature branches + + States have to be updated in a specific order and in particular the default branch has to be + merged into the feature branch before the feature branch can be updated to avoid a merge conflict. + + Args: + next_state: next state to be applied to the default branch + default_branch_name: name of the default branch + approved_state: state to be applied to the feature branch + feature_branch_name: name of the feature branch + """ + # TODO: add failure handling (capture failures and return them somehow) + + # update TaskState file content + # - next_state in default branch (interpreted as current state) + # - approved_state in feature branch (interpreted as future state, ie, after + # the PR corresponding to the feature branch will be merged) + + # first, update the task state file in the default branch + self._update_task_state_file(next_state, branch_name=default_branch_name) + + # second, merge default branch into feature branch (to avoid a merge conflict) + arch = self.description.task_object.arch + commit_message = f"merge {default_branch_name} into {feature_branch_name} for {arch}" + self.git_repo.merge( + head=default_branch_name, + base=feature_branch_name, + commit_message=commit_message + ) + + # last, update task state file in feature branch + self._update_task_state_file(approved_state, branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', + "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", + next_state, default_branch_name, approved_state, feature_branch_name) + + @log_function_entry_exit() + def _create_pull_request(self, feature_branch_name: str, default_branch_name: str): + """ + Create a PR from the feature branch to the default branch + + Args: + feature_branch_name: name of the feature branch + default_branch_name: name of the default branch + """ + pr_title_format = self.config['github']['grouped_pr_title'] + pr_body_format = self.config['github']['grouped_pr_body'] + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number() + pr_title = pr_title_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + repo=repo_name, + seq_num=seq_num, + ) + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents="TO BE DONE", + analysis="TO BE DONE", + action="TO BE DONE", + ) + pr = self.git_repo.create_pull( + title=pr_title, + body=pr_body, + head=feature_branch_name, + base=default_branch_name + ) + log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" @@ -922,52 +1001,13 @@ def _handle_add_payload_staged(self): if not pull_request: log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", feature_branch_name) - # update TaskState file content - # - next state in default branch (interpreted as current state) - # - approved state in feature branch 
(interpreted as future state, ie, after the PR is merged) - self._update_task_state_file(next_state, branch_name=default_branch_name) - # merge default branch into feature branch (attempt to avoid merge conflicts) - self.git_repo.merge( - head=default_branch_name, - base=feature_branch_name, - commit_message=f"Merge {default_branch_name} into {feature_branch_name}" - ) - # update task state file in feature branch - self._update_task_state_file(approved_state, branch_name=feature_branch_name) - log_message(LoggingScope.TASK_OPS, 'INFO', - "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", - next_state, default_branch_name, approved_state, feature_branch_name) - # create PR - pr_title_format = self.config['github']['grouped_pr_title'] - pr_body_format = self.config['github']['grouped_pr_body'] - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" - seq_num = self._determine_sequence_number() - pr_title = pr_title_format.format( - cvmfs_repo=self.cvmfs_repo, - pr=pr_number, - repo=repo_name, - seq_num=seq_num, - ) - pr_body = pr_body_format.format( - cvmfs_repo=self.cvmfs_repo, - pr=pr_number, - pr_url=pr_url, - repo=repo_name, - seq_num=seq_num, - contents="TO BE DONE", - analysis="TO BE DONE", - action="TO BE DONE", - ) - pr = self.git_repo.create_pull( - title=pr_title, - body=pr_body, - head=feature_branch_name, - base=default_branch_name - ) - log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) + # TODO: add failure handling (capture result and act on it) + self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._create_pull_request(feature_branch_name, default_branch_name) + return TaskState.PULL_REQUEST else: log_message(LoggingScope.TASK_OPS, 'INFO', From 9e6d4fe80f393c22d62479120e6ac8256b389be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 09:14:20 +0200 Subject: [PATCH 146/218] fix obtaining arch value --- scripts/automated_ingestion/eessi_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index da361bbf..a3ab4b9b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -917,7 +917,9 @@ def _update_task_states(self, next_state: TaskState, default_branch_name: str, self._update_task_state_file(next_state, branch_name=default_branch_name) # second, merge default branch into feature branch (to avoid a merge conflict) - arch = self.description.task_object.arch + # TODO: store arch info (CPU+ACCEL) in task/metadata file and then access that rather + # than using a part of the file name + arch = self.description.get_metadata_file_components()[3] commit_message = f"merge {default_branch_name} into {feature_branch_name} for {arch}" self.git_repo.merge( head=default_branch_name, base=feature_branch_name, commit_message=commit_message ) From 029d5b6d3ba67e67fb9328a119c6db35ea91b614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 09:29:58 +0200 Subject: [PATCH 147/218] improve commit message when changing task state --- scripts/automated_ingestion/eessi_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a3ab4b9b..0aaee310 100644 --- 
a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -768,8 +768,8 @@ def _update_task_state_file(self, next_state: TaskState, branch_name: str = None task_pointer_file = self.description.task_object.remote_file_path target_dir = self._read_target_dir_from_file(task_pointer_file, branch_name) task_state_file_path = f"{target_dir}/TaskState" - _, repo, pr, seq, _ = target_dir.split('/') - commit_message = f"changing task state for repo {repo} PR {pr} seq {seq} to {next_state}" + arch = self.description.get_metadata_file_components()[3] + commit_message = f"change task state to {next_state} in {branch_name} for {arch}" result = self._update_file(task_state_file_path, f"{next_state.name}\n", commit_message, From b996f6bee8943461972493c1877b5f46b05a38b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 09:40:56 +0200 Subject: [PATCH 148/218] first step towards adding another task to a deployment PR --- scripts/automated_ingestion/eessi_task.py | 25 +++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0aaee310..497f53e0 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -972,6 +972,14 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st ) log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) + @log_function_entry_exit() + def _update_pull_request(self, pull_request: PullRequest, feature_branch_name: str): + """Update the pull request""" + # TODO: update sections (contents analysis, action) + # for now, function just logs a message + log_message(LoggingScope.TASK_OPS, 'INFO', + "updating pull request %s for branch %s", pull_request, feature_branch_name) + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" @@ -1015,8 +1023,21 @@ def _handle_add_payload_staged(self): log_message(LoggingScope.TASK_OPS, 'INFO', "found existing PR for branch %s: %s", feature_branch_name, pull_request) # TODO: check if PR is open or closed - # TODO: if closed, create issue (PR already closed) - return TaskState.PULL_REQUEST + if pull_request.state == 'closed': + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is closed, creating issue", pull_request) + # TODO: create issue + return TaskState.PAYLOAD_STAGED + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is open, updating task states", pull_request) + # TODO: add failure handling (capture result and act on it) + self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._update_pull_request(pull_request, feature_branch_name) + + return TaskState.PULL_REQUEST @log_function_entry_exit() def _handle_add_pull_request(self): From a4c2d02dee7381d8ea4c39a5f49f3fa2fbba15a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 10:14:11 +0200 Subject: [PATCH 149/218] don't stop processing after first task --- scripts/automated_ingestion/automated_ingestion.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 0ebce03e..46875b33 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ 
b/scripts/automated_ingestion/automated_ingestion.py @@ -254,12 +254,8 @@ def main(): # Create EESSITask for the task file try: task = EESSITask( - EESSITaskDescription( - EESSIDataAndSignatureObject(config, task_path, s3_bucket) - ), - config, - cvmfs_repo, - gh_staging_repo + EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), + config, cvmfs_repo, gh_staging_repo ) except Exception as err: @@ -281,7 +277,6 @@ def main(): log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s': previous state = '%s', current state = '%s'", task_path, previous_state.name, current_state.name) - exit(0) # run loop body only once # # TODO: update the information shown below (what makes sense to show?) # # Log information about the task From 121130a49b2c51ac940e0ff08d2e8616896a1f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 10:44:50 +0200 Subject: [PATCH 150/218] first step to create contents overview --- scripts/automated_ingestion/eessi_task.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 497f53e0..0952c4df 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -933,6 +933,20 @@ def _update_task_states(self, next_state: TaskState, default_branch_name: str, "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", next_state, default_branch_name, approved_state, feature_branch_name) + @log_function_entry_exit() + def _create_contents_overview(self) -> str: + """Create a contents overview for the pull request""" + # TODO: implement + feature_branch_name = self._determine_feature_branch_name() + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) + directories = self._list_directory_contents(target_dir, feature_branch_name) + for directory in directories: + print(directory) + # tarball_contents = self.description.task_object.get_contents_overview() + + return "TO BE DONE" + @log_function_entry_exit() def _create_pull_request(self, feature_branch_name: str, default_branch_name: str): """ @@ -954,13 +968,14 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, ) + # contents_overview = self._create_contents_overview() pr_body = pr_body_format.format( cvmfs_repo=self.cvmfs_repo, pr=pr_number, pr_url=pr_url, repo=repo_name, seq_num=seq_num, - contents="TO BE DONE", + contents=contents_overview, analysis="TO BE DONE", action="TO BE DONE", ) @@ -1044,6 +1059,8 @@ def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state + contents_overview = self._create_contents_overview() + log_message(LoggingScope.TASK_OPS, 'INFO', "contents_overview: %s", contents_overview) return TaskState.PULL_REQUEST @log_function_entry_exit() From 64c4fe807ef63df3492500141b621d3ac57e464d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 10:47:15 +0200 Subject: [PATCH 151/218] a little more log output --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0952c4df..51931217 100644 --- 
a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -941,6 +941,7 @@ def _create_contents_overview(self) -> str: task_pointer_file = self.description.task_object.remote_file_path target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) directories = self._list_directory_contents(target_dir, feature_branch_name) + print(f"target_dir: {target_dir}") for directory in directories: print(directory) # tarball_contents = self.description.task_object.get_contents_overview() From 1061e1e9b11917e6d46ad9a5182d3fc9a54b47c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 10:49:48 +0200 Subject: [PATCH 152/218] one level up from target_dir --- scripts/automated_ingestion/eessi_task.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 51931217..e1fb7ec9 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -940,10 +940,12 @@ def _create_contents_overview(self) -> str: feature_branch_name = self._determine_feature_branch_name() task_pointer_file = self.description.task_object.remote_file_path target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) - directories = self._list_directory_contents(target_dir, feature_branch_name) + pr_dir = os.path.dirname(target_dir) + directories = self._list_directory_contents(pr_dir, feature_branch_name) print(f"target_dir: {target_dir}") + print(f"pr_dir: {pr_dir}") for directory in directories: - print(directory) + print(f"directory: {directory}") # tarball_contents = self.description.task_object.get_contents_overview() return "TO BE DONE" From ae0ad4b97d1bc3e093cd0cfdd0c0195cf181526d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 20:29:38 +0200 Subject: [PATCH 153/218] show basic task summary --- scripts/automated_ingestion/eessi_task.py | 47 +++++++++++++++++-- .../automated_ingestion/eessi_task_payload.py | 7 +++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index e1fb7ec9..a068ee44 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -934,7 +934,45 @@ def _update_task_states(self, next_state: TaskState, default_branch_name: str, next_state, default_branch_name, approved_state, feature_branch_name) @log_function_entry_exit() - def _create_contents_overview(self) -> str: + def _create_task_summary(self) -> str: + """Analyse contents of current task and create a file for it in the REPO-PR-SEQ directory.""" + + # determine task summary file path in feature branch on GitHub + feature_branch_name = self._determine_feature_branch_name() + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_number = self._get_fixed_sequence_number() # corresponds to an open PR + task_file_name = self.description.get_task_file_name() + target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + task_summary_file_path = f"{target_dir}/TaskSummary.html" + + # check if task summary file already exists in repo on GitHub + task_summary_file = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + if task_summary_file: + log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", 
task_summary_file_path) + return task_summary_file + + # create task summary + payload_name = self.description.metadata['payload']['filename'] + payload_summary = self.payload.analyse_contents() + metadata_contents = self.description.get_contents() + task_summary = f"
<details>\n<summary>{payload_name}</summary>\n<ul>\n" + task_summary += "<li>Metadata\n" + task_summary += f"<pre>{metadata_contents}</pre>\n</li>\n" + task_summary += f"<li>Payload\n{payload_summary}\n</li>\n" + task_summary += "</ul>\n" + task_summary += "</details>
\n" + + # create HTML file with task summary in REPO-PR-SEQ directory + # TODO: add failure handling (capture result and act on it) + # self._safe_create_file(task_summary_file_path, f"create task summary for {task_file_name}", + # task_summary, branch_name=feature_branch_name) + + # return task summary + return task_summary + + @log_function_entry_exit() + def _create_pr_contents_overview(self) -> str: """Create a contents overview for the pull request""" # TODO: implement feature_branch_name = self._determine_feature_branch_name() @@ -971,7 +1009,8 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, ) - # contents_overview = self._create_contents_overview() + self._create_task_summary() + # contents_overview = self._create_pr_contents_overview() pr_body = pr_body_format.format( cvmfs_repo=self.cvmfs_repo, pr=pr_number, @@ -1062,8 +1101,8 @@ def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state - contents_overview = self._create_contents_overview() - log_message(LoggingScope.TASK_OPS, 'INFO', "contents_overview: %s", contents_overview) + task_summary = self._create_task_summary() + log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) return TaskState.PULL_REQUEST @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index bba630fe..548ac086 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -35,6 +35,13 @@ def __init__(self, payload_object: EESSIDataAndSignatureObject): # Verify signature self.signature_verified = self.payload_object.verify_signature() + @log_function_entry_exit() + def analyse_contents(self) -> str: + """Analyse the contents of the payload and return a summary in a ready-to-use HTML format.""" + # TODO: implement + return "TO BE DONE" + + @log_function_entry_exit() def __str__(self) -> str: """Return a string representation of the EESSITaskPayload object.""" return f"EESSITaskPayload({self.payload_object.local_file_path}, verified={self.signature_verified})" From 7b4d2675eb5bc1a8d9622cee4d0fbfc34db1f939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 20:39:34 +0200 Subject: [PATCH 154/218] use existing method to check for existance of task summary file --- scripts/automated_ingestion/eessi_task.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a068ee44..0c4e66ec 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -947,10 +947,10 @@ def _create_task_summary(self) -> str: task_summary_file_path = f"{target_dir}/TaskSummary.html" # check if task summary file already exists in repo on GitHub - task_summary_file = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) - if task_summary_file: + if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) - return task_summary_file + # TODO: read contents of task summary file + return "DUMMY TASK SUMMARY" # create task summary payload_name = self.description.metadata['payload']['filename'] From 
a754921a2fa436d189b7e3750c1d1d23ca808882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 20:52:35 +0200 Subject: [PATCH 155/218] init payload object in constructor for EESSITask given corresponding state --- scripts/automated_ingestion/eessi_task.py | 27 +++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0c4e66ec..45394e71 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -80,7 +80,10 @@ def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: TaskState.DONE: [] # Terminal state } - # self.state = self._find_state() + state = self.determine_state() + if state >= TaskState.PAYLOAD_STAGED: + log_message(LoggingScope.TASK_OPS, 'INFO', "initializing payload object in constructor for EESSITask") + self._init_payload_object() @log_function_entry_exit() def _determine_task_action(self) -> EESSITaskAction: @@ -777,12 +780,11 @@ def _update_task_state_file(self, next_state: TaskState, branch_name: str = None return result @log_function_entry_exit() - def _handle_add_new_task(self): - """Handler for ADD action in NEW_TASK state""" - print("Handling ADD action in NEW_TASK state") - # determine next state - next_state = self._next_state(TaskState.NEW_TASK) - log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + def _init_payload_object(self): + """Initialize the payload object""" + if self.payload is not None: + log_message(LoggingScope.TASK_OPS, 'INFO', "payload object already initialized") + return # get name of of payload from metadata payload_name = self.description.metadata['payload']['filename'] @@ -803,6 +805,17 @@ def _handle_add_new_task(self): self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) + @log_function_entry_exit() + def _handle_add_new_task(self): + """Handler for ADD action in NEW_TASK state""" + print("Handling ADD action in NEW_TASK state") + # determine next state + next_state = self._next_state(TaskState.NEW_TASK) + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + + # initialize payload object + self._init_payload_object() + # update TaskState file content self._update_task_state_file(next_state) From 8a37ae17b888baddbdb7daa6fa43c0d8ed54eee9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 20:58:49 +0200 Subject: [PATCH 156/218] add comparison of TaskState values --- scripts/automated_ingestion/eessi_task.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 45394e71..07138998 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,5 +1,6 @@ from enum import Enum, auto from typing import Dict, List, Tuple, Optional +from functools import total_ordering import os import traceback @@ -22,6 +23,7 @@ class SequenceStatus(Enum): FINISHED = auto() +@total_ordering class TaskState(Enum): UNDETERMINED = auto() # The task state was not determined yet NEW_TASK = auto() # The task has been created but not yet processed @@ -47,6 +49,11 @@ def from_string(cls, name, default=None, case_sensitive=False): except KeyError: return default + def __lt__(self, other): + if self.__class__ is other.__class__: + return self.value < other.value + return 
NotImplemented + def __str__(self): return self.name.upper() From 088fee93a879b383797a4d25ead08d399feb56fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 21:01:43 +0200 Subject: [PATCH 157/218] init payload to None initially --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 07138998..e5d415cb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -87,6 +87,7 @@ def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: TaskState.DONE: [] # Terminal state } + self.payload = None state = self.determine_state() if state >= TaskState.PAYLOAD_STAGED: log_message(LoggingScope.TASK_OPS, 'INFO', "initializing payload object in constructor for EESSITask") From 047d271d03468e1776969734d4aa0e25bc9980c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 04:48:46 +0200 Subject: [PATCH 158/218] create payload analysis --- .../automated_ingestion.cfg.example | 5 ++ scripts/automated_ingestion/eessi_task.py | 12 ++-- .../automated_ingestion/eessi_task_payload.py | 60 ++++++++++++++++++- 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.cfg.example b/scripts/automated_ingestion/automated_ingestion.cfg.example index bdf40fa3..98ab1e79 100644 --- a/scripts/automated_ingestion/automated_ingestion.cfg.example +++ b/scripts/automated_ingestion/automated_ingestion.cfg.example @@ -114,6 +114,11 @@ grouped_pr_body = A group of tarballs has been staged for {pr_url}. {metadata} +# Template for individual tarball PRs +task_summary_payload_template = + {tar_overview} + + [slack] ingestion_notification = yes ingestion_message = Tarball `{tarball}` has been ingested into the CVMFS repository `{cvmfs_repo}`. diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index e5d415cb..51dbf2e8 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -970,8 +970,8 @@ def _create_task_summary(self) -> str: # check if task summary file already exists in repo on GitHub if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) - # TODO: read contents of task summary file - return "DUMMY TASK SUMMARY" + task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + return task_summary.decoded_content # create task summary payload_name = self.description.metadata['payload']['filename'] @@ -980,7 +980,11 @@ def _create_task_summary(self) -> str: task_summary = f"
<details>\n<summary>{payload_name}</summary>\n<ul>\n" task_summary += "<li>Metadata\n" task_summary += f"<pre>{metadata_contents}</pre>\n</li>\n" - task_summary += f"<li>Payload\n{payload_summary}\n</li>\n" + task_summary += "<li>Overview of payload contents\n" + task_summary += self.config['github']['task_summary_payload_template'].format( + payload_overview=payload_summary, + ) + task_summary += "</li>\n" task_summary += "</ul>\n" task_summary += "</details>
\n" @@ -1038,7 +1042,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st pr_url=pr_url, repo=repo_name, seq_num=seq_num, - contents=contents_overview, + contents="TO BE DONE", analysis="TO BE DONE", action="TO BE DONE", ) diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 548ac086..9a643157 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -1,4 +1,7 @@ from dataclasses import dataclass +import tarfile +from pathlib import PurePosixPath +import os from eessi_data_object import EESSIDataAndSignatureObject from utils import log_function_entry_exit @@ -38,8 +41,61 @@ def __init__(self, payload_object: EESSIDataAndSignatureObject): @log_function_entry_exit() def analyse_contents(self) -> str: """Analyse the contents of the payload and return a summary in a ready-to-use HTML format.""" - # TODO: implement - return "TO BE DONE" + tar = tarfile.open(self.payload_object.local_file_path, 'r') + members = tar.getmembers() + tar_num_members = len(members) + paths = sorted([m.path for m in members]) + + if tar_num_members < 100: + tar_members_desc = "Full listing of the contents of the tarball:" + members_list = paths + + else: + tar_members_desc = "Summarized overview of the contents of the tarball:" + # determine prefix after filtering out '/init' subdirectory, + # to get actual prefix for specific CPU target (like '2023.06/software/linux/aarch64/neoverse_v1') + init_subdir = os.path.join('*', 'init') + non_init_paths = sorted( + [path for path in paths if not any(parent.match(init_subdir) for parent in PurePosixPath(path).parents)] + ) + if non_init_paths: + prefix = os.path.commonprefix(non_init_paths) + else: + prefix = os.path.commonprefix(paths) + + # TODO: this only works for software tarballs, how to handle compat layer tarballs? + swdirs = [ # all directory names with the pattern: /software// + member.path + for member in members + if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, 'software', '*', '*')) + ] + modfiles = [ # all filenames with the pattern: /modules///*.lua + member.path + for member in members + if member.isfile() and + PurePosixPath(member.path).match(os.path.join(prefix, 'modules', '*', '*', '*.lua')) + ] + other = [ # anything that is not in /software nor /modules + member.path + for member in members + if (not PurePosixPath(prefix).joinpath('software') in PurePosixPath(member.path).parents + and not PurePosixPath(prefix).joinpath('modules') in PurePosixPath(member.path).parents) + # if not fnmatch.fnmatch(m.path, os.path.join(prefix, 'software', '*')) + # and not fnmatch.fnmatch(m.path, os.path.join(prefix, 'modules', '*')) + ] + members_list = sorted(swdirs + modfiles + other) + + # Construct the overview. + tar_members = '\n'.join(members_list) + overview = f"Total number of items in the tarball: {tar_num_members}" + overview += f"\nURL to the tarball: {self.url}" + overview += f"\n{tar_members_desc}\n" + overview += f"```\n{tar_members}\n```" + + # Make sure that the overview does not exceed Github's maximum length (65536 characters). 
+ if len(overview) > 60000: + overview = overview[:60000] + "\n\nWARNING: output exceeded the maximum length and was truncated!\n```" + return overview @log_function_entry_exit() def __str__(self) -> str: From eb3c1b163001955ae2436ab72ef114edee440097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 04:58:06 +0200 Subject: [PATCH 159/218] add function to obtain URL for remote file --- scripts/automated_ingestion/eessi_data_object.py | 5 +++++ scripts/automated_ingestion/eessi_task_payload.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 6e8189fe..c7adc05b 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -324,6 +324,11 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", self.remote_file_path, str(err)) raise + @log_function_entry_exit() + def get_url(self) -> str: + """Get the URL of the data file.""" + return f"https://{self.remote_client.bucket}.s3.amazonaws.com/{self.remote_file_path}" + def __str__(self) -> str: """Return a string representation of the EESSI data and signature object.""" return f"EESSIDataAndSignatureObject({self.remote_file_path})" diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 9a643157..3729c846 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -88,7 +88,7 @@ def analyse_contents(self) -> str: # Construct the overview. tar_members = '\n'.join(members_list) overview = f"Total number of items in the tarball: {tar_num_members}" - overview += f"\nURL to the tarball: {self.url}" + overview += f"\nURL to the tarball: {self.payload_object.get_url()}" overview += f"\n{tar_members_desc}\n" overview += f"```\n{tar_members}\n```" From fe4164659ac5a4a85e9b0a289ad951ba13c5013d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:03:47 +0200 Subject: [PATCH 160/218] fix var name in template --- scripts/automated_ingestion/automated_ingestion.cfg.example | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.cfg.example b/scripts/automated_ingestion/automated_ingestion.cfg.example index 98ab1e79..18009d88 100644 --- a/scripts/automated_ingestion/automated_ingestion.cfg.example +++ b/scripts/automated_ingestion/automated_ingestion.cfg.example @@ -114,9 +114,9 @@ grouped_pr_body = A group of tarballs has been staged for {pr_url}. 
{metadata} -# Template for individual tarball PRs +# Template for payload overview task_summary_payload_template = - {tar_overview} + {payload_overview} [slack] ingestion_notification = yes ingestion_message = Tarball `{tarball}` has been ingested into the CVMFS repository `{cvmfs_repo}`. From e3e10e31c9185adcae18f2d600f4f02f13e9d379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:09:32 +0200 Subject: [PATCH 161/218] create task summary file in staging PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/automated_ingestion/eessi_task.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 51dbf2e8..92d79d4d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -990,8 +990,10 @@ def _create_task_summary(self) -> str: # create HTML file with task summary in REPO-PR-SEQ directory # TODO: add failure handling (capture result and act on it) - # self._safe_create_file(task_summary_file_path, f"create task summary for {task_file_name}", - # task_summary, branch_name=feature_branch_name) + commit_message = f"create summary for {task_file_name} in {feature_branch_name}" + self._safe_create_file(task_summary_file_path, commit_message, task_summary, + branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file created: %s", task_summary_file_path) From aa2461214422759367b7df633d153d94054cf237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:25:02 +0200 Subject: [PATCH 162/218] first step to create PR contents overview --- scripts/automated_ingestion/eessi_task.py | 25 +++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 92d79d4d..81f0b253 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1009,11 +1009,22 @@ def _create_pr_contents_overview(self) -> str: directories = self._list_directory_contents(pr_dir, feature_branch_name) print(f"target_dir: {target_dir}") print(f"pr_dir: {pr_dir}") - for directory in directories: - print(f"directory: {directory}") - # tarball_contents = self.description.task_object.get_contents_overview() + contents_overview = "" + if directories: + contents_overview += "
    \n" + for directory in directories: + task_summary_file_path = f"{directory}/TaskSummary.html" + if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): + task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + contents_overview += f"
  • {task_summary.decoded_content}
  • \n" + else: + contents_overview += f"
  • Task summary file not found: {task_summary_file_path}
  • \n" + contents_overview += "
\n" + else: + contents_overview += "No tasks found in this PR\n" - return "TO BE DONE" + print(f"contents_overview: {contents_overview}") + return contents_overview @log_function_entry_exit() def _create_pull_request(self, feature_branch_name: str, default_branch_name: str): @@ -1037,14 +1048,14 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st seq_num=seq_num, ) self._create_task_summary() - # contents_overview = self._create_pr_contents_overview() + contents_overview = self._create_pr_contents_overview() pr_body = pr_body_format.format( cvmfs_repo=self.cvmfs_repo, pr=pr_number, pr_url=pr_url, repo=repo_name, seq_num=seq_num, - contents="TO BE DONE", + contents=contents_overview, analysis="TO BE DONE", action="TO BE DONE", ) @@ -1130,6 +1141,8 @@ def _handle_add_pull_request(self): # Implementation for adding in PULL_REQUEST state task_summary = self._create_task_summary() log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) + contents_overview = self._create_pr_contents_overview() + log_message(LoggingScope.TASK_OPS, 'INFO', "PR contents overview: %s", contents_overview) return TaskState.PULL_REQUEST @log_function_entry_exit() From 6a4f62c0c88c295d1ca9fc4f70e4d802af67fd5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:29:34 +0200 Subject: [PATCH 163/218] use name of directory --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 81f0b253..df45dfe7 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1013,7 +1013,7 @@ def _create_pr_contents_overview(self) -> str: if directories: contents_overview += "
    \n" for directory in directories: - task_summary_file_path = f"{directory}/TaskSummary.html" + task_summary_file_path = f"{directory.name}/TaskSummary.html" if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) contents_overview += f"
  • {task_summary.decoded_content}
  • \n" From 1d7b70717feb09e96b141791c3c29c51af13ae2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:32:29 +0200 Subject: [PATCH 164/218] add PR dir component --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index df45dfe7..ba994ac2 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1013,7 +1013,7 @@ def _create_pr_contents_overview(self) -> str: if directories: contents_overview += "
      \n" for directory in directories: - task_summary_file_path = f"{directory.name}/TaskSummary.html" + task_summary_file_path = f"{pr_dir}/{directory.name}/TaskSummary.html" if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) contents_overview += f"
    • {task_summary.decoded_content}
    • \n" From 5e4f10770370182b68a9f94a50d98a8a4734d79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:39:44 +0200 Subject: [PATCH 165/218] note TODO --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ba994ac2..8b83498a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1073,7 +1073,7 @@ def _update_pull_request(self, pull_request: PullRequest, feature_branch_name: s # TODO: update sections (contents analysis, action) # for now, function just logs a message log_message(LoggingScope.TASK_OPS, 'INFO', - "updating pull request %s for branch %s", pull_request, feature_branch_name) + "TODO: updating pull request %s for branch %s", pull_request, feature_branch_name) @log_function_entry_exit() def _handle_add_payload_staged(self): From 71f703c6b8f1c5a33bd59ab7ad31aee06064dc42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:44:36 +0200 Subject: [PATCH 166/218] bump sequence number to 1 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8b83498a..b3f61b06 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 0 + return 1 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 02c01d080612974cf4d1e202bb108c7503f80fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 19:54:17 +0200 Subject: [PATCH 167/218] tweak formatting of bundling PR content --- scripts/automated_ingestion/eessi_task.py | 15 ++++++++------- scripts/automated_ingestion/eessi_task_payload.py | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b3f61b06..6781f616 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -971,21 +971,22 @@ def _create_task_summary(self) -> str: if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) - return task_summary.decoded_content + # return task_summary.decoded_content + return task_summary # create task summary payload_name = self.description.metadata['payload']['filename'] payload_summary = self.payload.analyse_contents() metadata_contents = self.description.get_contents() - task_summary = f"
<details>\n<summary>{payload_name}</summary>\n<ul>\n" - task_summary += "<li>Metadata\n" - task_summary += f"<pre>{metadata_contents}</pre>\n</li>\n" - task_summary += "<li>Overview of payload contents\n" + task_summary = f"<details>\n<summary>{payload_name}</summary>\n\n" + task_summary += "<details>\n<summary>Metadata</summary>\n\n" + task_summary += f"```\n{metadata_contents}\n```\n</details>\n" + task_summary += "<details>\n<summary>Overview of payload contents</summary>\n\n" task_summary += self.config['github']['task_summary_payload_template'].format( payload_overview=payload_summary, ) - task_summary += "</li>\n" - task_summary += "</ul>\n" + task_summary += "</details>
      \n" + task_summary += "\n" task_summary += "\n" # create HTML file with task summary in REPO-PR-SEQ directory diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 3729c846..65bb2b9e 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -89,8 +89,8 @@ def analyse_contents(self) -> str: tar_members = '\n'.join(members_list) overview = f"Total number of items in the tarball: {tar_num_members}" overview += f"\nURL to the tarball: {self.payload_object.get_url()}" - overview += f"\n{tar_members_desc}\n" - overview += f"```\n{tar_members}\n```" + overview += f"\n{tar_members_desc}\n\n" + overview += f"```\n{tar_members}\n```\n" # Make sure that the overview does not exceed Github's maximum length (65536 characters). if len(overview) > 60000: From e508148fb34312231ff71c951f48bcd6fce9da44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 20:02:13 +0200 Subject: [PATCH 168/218] bump sequence number to 2 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6781f616..b552be7f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 1 + return 2 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 5c5475496850a2c85ab9c55a01c30f6d59e5e1a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 20:12:31 +0200 Subject: [PATCH 169/218] alternative for creating PR body, bumping seq to 3 --- scripts/automated_ingestion/eessi_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b552be7f..b4f48602 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 2 + return 3 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: @@ -1057,7 +1057,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, contents=contents_overview, - analysis="TO BE DONE", + analysis=str(contents_overview), action="TO BE DONE", ) pr = self.git_repo.create_pull( From 53670df5947e4f4c9b2c48275b2ec75511152ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 20:24:13 +0200 Subject: [PATCH 170/218] decode file contents from GitHub and bump sequence number --- scripts/automated_ingestion/eessi_task.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b4f48602..7cb4b988 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 3 + return 4 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: @@ -1008,19 +1008,18 @@ def _create_pr_contents_overview(self) -> str: target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) pr_dir = os.path.dirname(target_dir) directories = self._list_directory_contents(pr_dir, feature_branch_name) - print(f"target_dir: {target_dir}") - print(f"pr_dir: {pr_dir}") contents_overview = "" if directories: - contents_overview += "
        \n" + contents_overview += "\n" for directory in directories: task_summary_file_path = f"{pr_dir}/{directory.name}/TaskSummary.html" if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): - task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) - contents_overview += f"
      • {task_summary.decoded_content}
      • \n" + file_contents = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + task_summary = base64.b64decode(file_contents).decode('utf-8') + contents_overview += f"{task_summary}\n" else: - contents_overview += f"
      • Task summary file not found: {task_summary_file_path}
      • \n" - contents_overview += "
      \n" + contents_overview += f"Task summary file not found: {task_summary_file_path}\n" + contents_overview += "\n" else: contents_overview += "No tasks found in this PR\n" @@ -1057,7 +1056,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, contents=contents_overview, - analysis=str(contents_overview), + analysis=contents_overview, action="TO BE DONE", ) pr = self.git_repo.create_pull( From f0fc09ff84998d0109a8fbc538c939f49dbc3927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 20:30:25 +0200 Subject: [PATCH 171/218] need to access .content --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7cb4b988..8226ab2c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1015,7 +1015,7 @@ def _create_pr_contents_overview(self) -> str: task_summary_file_path = f"{pr_dir}/{directory.name}/TaskSummary.html" if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): file_contents = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) - task_summary = base64.b64decode(file_contents).decode('utf-8') + task_summary = base64.b64decode(file_contents.content).decode('utf-8') contents_overview += f"{task_summary}\n" else: contents_overview += f"Task summary file not found: {task_summary_file_path}\n" From 0660fa1b1b5721d6365b61a9f4b64a37bb05fa55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 21:34:45 +0200 Subject: [PATCH 172/218] add function to return bucket URL --- scripts/automated_ingestion/s3_bucket.py | 31 ++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scripts/automated_ingestion/s3_bucket.py b/scripts/automated_ingestion/s3_bucket.py index ff62813f..79fed289 100644 --- a/scripts/automated_ingestion/s3_bucket.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -154,3 +154,34 @@ def download(self, remote_path: str, local_path: str) -> None: # Store the ETag self._write_etag(local_path, etag) + + @log_function_entry_exit() + def get_bucket_url(self) -> str: + """ + Get the HTTPS URL for a bucket from an initialized boto3 client. + Works with both AWS S3 and MinIO/S3-compatible services. + """ + try: + # Check if this is a custom endpoint (MinIO) or AWS S3 + endpoint_url = self.client.meta.endpoint_url + + if endpoint_url: + # Custom endpoint (MinIO, DigitalOcean Spaces, etc.) 
+ # Most S3-compatible services use path-style URLs + bucket_url = f"{endpoint_url}/{self.bucket}" + + else: + # AWS S3 (no custom endpoint specified) + region = self.client.meta.region_name or 'us-east-1' + + # AWS S3 virtual-hosted-style URLs + if region == 'us-east-1': + bucket_url = f"https://{self.bucket}.s3.amazonaws.com" + else: + bucket_url = f"https://{self.bucket}.s3.{region}.amazonaws.com" + + return bucket_url + + except Exception as err: + log_message(LoggingScope.ERROR, 'ERROR', "Error getting bucket URL: %s", str(err)) + return None From 26525891a814e4e753850c8f0008c1dca1778512 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 21:37:32 +0200 Subject: [PATCH 173/218] use method that returns bucket URL --- scripts/automated_ingestion/eessi_task_payload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 65bb2b9e..809ce152 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -88,7 +88,7 @@ def analyse_contents(self) -> str: # Construct the overview. tar_members = '\n'.join(members_list) overview = f"Total number of items in the tarball: {tar_num_members}" - overview += f"\nURL to the tarball: {self.payload_object.get_url()}" + overview += f"\nURL to the tarball: {self.payload_object.remote_client.get_bucket_url()}" overview += f"\n{tar_members_desc}\n\n" overview += f"```\n{tar_members}\n```\n" From 7b7bb63715ad6f3498a3e96a097c36bba0c74349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 21:41:09 +0200 Subject: [PATCH 174/218] bump sequence number to 5 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8226ab2c..5fb13e3a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 4 + return 5 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 6eb7002c0bbe76f3457be261bc3f5bb5707712a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 22:02:22 +0200 Subject: [PATCH 175/218] add remote_file_path to bucket_url and update pull request --- scripts/automated_ingestion/eessi_task.py | 45 ++++++++++++++----- .../automated_ingestion/eessi_task_payload.py | 4 +- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5fb13e3a..a76523eb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1056,7 +1056,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, contents=contents_overview, - analysis=contents_overview, + analysis="TO BE DONE", action="TO BE DONE", ) pr = self.git_repo.create_pull( @@ -1068,12 +1068,35 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) @log_function_entry_exit() - def _update_pull_request(self, pull_request: PullRequest, feature_branch_name: str): - """Update the pull request""" + def _update_pull_request(self, pull_request: PullRequest): + """ + Update the pull request + + Args: + pull_request: instance of the pull request + """ # TODO: update sections (contents analysis, action) - # for now, function just logs a message - log_message(LoggingScope.TASK_OPS, 'INFO', - "TODO: updating pull request %s for branch %s", pull_request, feature_branch_name) + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number() + + self._create_task_summary() + contents_overview = self._create_pr_contents_overview() + pr_body_format = self.config['github']['grouped_pr_body'] + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents=contents_overview, + analysis="TO BE DONE", + action="TO BE DONE", + ) + pull_request.edit(body=pr_body) + + log_message(LoggingScope.TASK_OPS, 'INFO', "PR updated: %s", pull_request) @log_function_entry_exit() def _handle_add_payload_staged(self): @@ -1130,7 +1153,7 @@ def _handle_add_payload_staged(self): self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) # TODO: add failure handling (capture result and act on it) - self._update_pull_request(pull_request, feature_branch_name) + self._update_pull_request(pull_request) return TaskState.PULL_REQUEST @@ -1139,10 +1162,10 @@ def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state - task_summary = self._create_task_summary() - log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) - contents_overview = self._create_pr_contents_overview() - log_message(LoggingScope.TASK_OPS, 'INFO', "PR contents overview: %s", contents_overview) + # task_summary = self._create_task_summary() + # log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) + # contents_overview = self._create_pr_contents_overview() + # log_message(LoggingScope.TASK_OPS, 'INFO', "PR 
contents overview: %s", contents_overview) return TaskState.PULL_REQUEST @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 809ce152..c8f82df2 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -88,7 +88,9 @@ def analyse_contents(self) -> str: # Construct the overview. tar_members = '\n'.join(members_list) overview = f"Total number of items in the tarball: {tar_num_members}" - overview += f"\nURL to the tarball: {self.payload_object.remote_client.get_bucket_url()}" + bucket_url = self.payload_object.remote_client.get_bucket_url() + remote_file_path = self.payload_object.remote_file_path + overview += f"\nURL to the tarball: {bucket_url}/{remote_file_path}" overview += f"\n{tar_members_desc}\n\n" overview += f"```\n{tar_members}\n```\n" From bd090600db78c2feeb10e7dc7938b9394cd64992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 22:29:13 +0200 Subject: [PATCH 176/218] bump sequence number to 6 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a76523eb..88b54bdc 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 5 + return 6 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From c2b8513061e61852875504ea05a69990a7c9cd3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Tue, 17 Jun 2025 22:14:46 +0200 Subject: [PATCH 177/218] implement first version of handler for PULL_REQUEST state --- scripts/automated_ingestion/eessi_task.py | 88 +++++++++++++++++++++-- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 88b54bdc..65387ab1 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -730,7 +730,6 @@ def _handle_add_undetermined(self): # create task file in target directory (TARGET_DIR/TaskDescription) # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) # create pointer file from task file path to target directory (remote_file_path -> TARGET_DIR) - branch_name = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR @@ -755,6 +754,7 @@ def _handle_add_undetermined(self): } } + branch_name = self.git_repo.default_branch try: commit = self._create_multi_file_commit( files_to_commit, @@ -1150,6 +1150,7 @@ def _handle_add_payload_staged(self): log_message(LoggingScope.TASK_OPS, 'INFO', "PR %s is open, updating task states", pull_request) # TODO: add failure handling (capture result and act on it) + # THINK about what a failure would mean and what to do about it. 
self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) # TODO: add failure handling (capture result and act on it) @@ -1162,6 +1163,72 @@ def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state + # we got here because the state of the task is PULL_REQUEST in the default branch + # determine branch and PR and state of PR + # PR is open --> just return TaskState.PULL_REQUEST + # PR is closed & merged --> deployment is approved + # PR is closed & not merged --> deployment is rejected + sequence_number = self._determine_sequence_number() + feature_branch_name = self._determine_feature_branch_name(sequence_number) + # TODO: check if feature branch exists, for now ASSUME it does + pull_request = self._find_pr_for_branch(feature_branch_name) + if pull_request: + log_message(LoggingScope.TASK_OPS, 'INFO', + "found PR for branch %s: %s", feature_branch_name, pull_request) + if pull_request.state == 'closed': + if pull_request.merged: + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is closed and merged, returning APPROVED state", pull_request) + # TODO: How could we ended up here? state in default branch is PULL_REQUEST but + # PR is merged, hence it should have been in the APPROVED state + # ==> for now, just return TaskState.PULL_REQUEST + # + # there is the possibility that the PR was updated just before the + # PR was merged + # WHY is it a problem? because a task may have been accepted that wouldn't + # have been accepted or worse shouldn't been accepted + # WHAT to do? ACCEPT/IGNORE THE ISSUE FOR NOw + # HOWEVER, the contents of the PR directory may be inconsistent with + # respect to the TaskState file and missing TaskSummary.html file + # WE could create an issue and only return TaskState.APPROVED if the + # issue is closed + # WE could also defer all handling of this to the handler for the + # APPROVED state + # NOPE, we have to do some handling here, at least for the tasks where their + # state file did + # --> check if we could have ended up here? If so, create an issue. + # Do we need a state ISSUE_OPENED to avoid processing the task again? + return TaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is closed and not merged, returning REJECTED state", pull_request) + # TODO: there is the possibility that the PR was updated just before the + # PR was closed + # WHY is it a problem? because a task may have been rejected that wouldn't + # have been rejected or worse shouldn't been rejected + # WHAT to do? 
ACCEPT/IGNORE THE ISSUE FOR NOw + # HOWEVER, the contents of the PR directory may be inconsistent with + # respect to the TaskState file and missing TaskSummary.html file + # WE could create an issue and only return TaskState.REJECTED if the + # issue is closed + # WE could also defer all handling of this to the handler for the + # REJECTED state + # FOR NOW, we assume that the task was rejected on purpose + # we need to change the state of the task in the default branch to REJECTED + self._update_task_state_file(TaskState.REJECTED) + return TaskState.REJECTED + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is open, returning PULL_REQUEST state", pull_request) + return TaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + "no PR found for branch %s", feature_branch_name) + # the method was called because the state of the task is PULL_REQUEST in the default branch + # however, it's weird that the PR was not found for the feature branch + # TODO: may create or update an issue for the task or deployment + return TaskState.PULL_REQUEST + # task_summary = self._create_task_summary() # log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) # contents_overview = self._create_pr_contents_overview() @@ -1171,16 +1238,27 @@ def _handle_add_pull_request(self): @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" - print("Handling ADD action in APPROVED state") + print("Handling ADD action in APPROVED state: %s", self.description.get_task_file_name()) # Implementation for adding in APPROVED state - return True + # TODO: essentially, run the ingest function + # TODO: change state in default branch to INGESTED + return TaskState.INGESTED @log_function_entry_exit() def _handle_add_ingested(self): """Handler for ADD action in INGESTED state""" - print("Handling ADD action in INGESTED state") + print("Handling ADD action in INGESTED state: %s", self.description.get_task_file_name()) # Implementation for adding in INGESTED state - return True + # TODO: change state in default branch to DONE + return TaskState.DONE + + @log_function_entry_exit() + def _handle_add_rejected(self): + """Handler for ADD action in REJECTED state""" + print("Handling ADD action in REJECTED state: %s", self.description.get_task_file_name()) + # Implementation for adding in REJECTED state + # TODO: change state in default branch to DONE + return TaskState.DONE @log_function_entry_exit() def transition_to(self, new_state: TaskState): From 04e3e7069b5f3f9717ce0edabf7e7a307b7ef2b5 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 14:10:50 +0200 Subject: [PATCH 178/218] use different function to determine feature branch --- scripts/automated_ingestion/eessi_task.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 65387ab1..e124a9ed 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1168,8 +1168,7 @@ def _handle_add_pull_request(self): # PR is open --> just return TaskState.PULL_REQUEST # PR is closed & merged --> deployment is approved # PR is closed & not merged --> deployment is rejected - sequence_number = self._determine_sequence_number() - feature_branch_name = self._determine_feature_branch_name(sequence_number) + feature_branch_name = self._determine_feature_branch_name() # TODO: check if feature branch exists, for now ASSUME it 
does pull_request = self._find_pr_for_branch(feature_branch_name) if pull_request: @@ -1229,10 +1228,6 @@ def _handle_add_pull_request(self): # TODO: may create or update an issue for the task or deployment return TaskState.PULL_REQUEST - # task_summary = self._create_task_summary() - # log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) - # contents_overview = self._create_pr_contents_overview() - # log_message(LoggingScope.TASK_OPS, 'INFO', "PR contents overview: %s", contents_overview) return TaskState.PULL_REQUEST @log_function_entry_exit() From 3a45f93bfb92da32b8183e3ab1e5bca303e7a6e7 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 14:24:26 +0200 Subject: [PATCH 179/218] fixing print statements in handlers --- scripts/automated_ingestion/eessi_task.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index e124a9ed..1807a5df 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -725,7 +725,7 @@ def _update_file(self, file_path, new_content, commit_message, branch_name: str @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" - print("Handling ADD action in UNDETERMINED state") + print("Handling ADD action in UNDETERMINED state: %s" % self.description.get_task_file_name()) # create target directory (REPO/PR/SEQ/TASK_FILE_NAME/) # create task file in target directory (TARGET_DIR/TaskDescription) # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) @@ -816,7 +816,7 @@ def _init_payload_object(self): @log_function_entry_exit() def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" - print("Handling ADD action in NEW_TASK state") + print("Handling ADD action in NEW_TASK state: %s" % self.description.get_task_file_name()) # determine next state next_state = self._next_state(TaskState.NEW_TASK) log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) @@ -1101,7 +1101,7 @@ def _update_pull_request(self, pull_request: PullRequest): @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" - print("Handling ADD action in PAYLOAD_STAGED state") + print("Handling ADD action in PAYLOAD_STAGED state: %s" % self.description.get_task_file_name()) next_state = self._next_state(TaskState.PAYLOAD_STAGED) approved_state = TaskState.APPROVED log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s, approved_state: %s", next_state, approved_state) @@ -1161,7 +1161,7 @@ def _handle_add_payload_staged(self): @log_function_entry_exit() def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" - print("Handling ADD action in PULL_REQUEST state") + print("Handling ADD action in PULL_REQUEST state: %s" % self.description.get_task_file_name()) # Implementation for adding in PULL_REQUEST state # we got here because the state of the task is PULL_REQUEST in the default branch # determine branch and PR and state of PR @@ -1233,7 +1233,7 @@ def _handle_add_pull_request(self): @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" - print("Handling ADD action in APPROVED state: %s", self.description.get_task_file_name()) + print("Handling ADD action in APPROVED state: %s" % self.description.get_task_file_name()) # Implementation for adding in 
APPROVED state # TODO: essentially, run the ingest function # TODO: change state in default branch to INGESTED @@ -1242,7 +1242,7 @@ def _handle_add_approved(self): @log_function_entry_exit() def _handle_add_ingested(self): """Handler for ADD action in INGESTED state""" - print("Handling ADD action in INGESTED state: %s", self.description.get_task_file_name()) + print("Handling ADD action in INGESTED state: %s" % self.description.get_task_file_name()) # Implementation for adding in INGESTED state # TODO: change state in default branch to DONE return TaskState.DONE @@ -1250,7 +1250,7 @@ def _handle_add_ingested(self): @log_function_entry_exit() def _handle_add_rejected(self): """Handler for ADD action in REJECTED state""" - print("Handling ADD action in REJECTED state: %s", self.description.get_task_file_name()) + print("Handling ADD action in REJECTED state: %s" % self.description.get_task_file_name()) # Implementation for adding in REJECTED state # TODO: change state in default branch to DONE return TaskState.DONE From 569de25bf780fe1b3db65284d9667810f7e1a216 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 14:34:12 +0200 Subject: [PATCH 180/218] add more logging to main loop --- scripts/automated_ingestion/automated_ingestion.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 46875b33..974f8497 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -273,9 +273,12 @@ def main(): current_state != TaskState.DONE and previous_state != current_state): previous_state = current_state + log_message(LoggingScope.GROUP_OPS, 'INFO', + "Task '%s': BEFORE handle(): previous state = '%s', current state = '%s'", + task_path, previous_state.name, current_state.name) current_state = task.handle() log_message(LoggingScope.GROUP_OPS, 'INFO', - "Task '%s': previous state = '%s', current state = '%s'", + "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", task_path, previous_state.name, current_state.name) # # TODO: update the information shown below (what makes sense to show?) From 8ad5ceeb7cb0a234357edc6ae71363a6144f07d2 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 15:08:22 +0200 Subject: [PATCH 181/218] implement handler for approved state --- scripts/automated_ingestion/eessi_task.py | 55 +++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 1807a5df..77dcd2eb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1230,12 +1230,67 @@ def _handle_add_pull_request(self): return TaskState.PULL_REQUEST + @log_function_entry_exit() + def _perform_task_action(self): + """Perform the task action""" + # TODO: support other actions than ADD + if self.action == EESSITaskAction.ADD: + self._perform_task_add() + else: + raise ValueError(f"Task action '{self.action}' not supported (yet)") + + @log_function_entry_exit() + def _perform_task_add(self): + """Perform the ADD task action""" + # TODO: verify checksum here or before? 
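+ # a minimal sketch of what the checksum verification mentioned above could look like, assuming the metadata provides the expected sha256 digest (expected_checksum) and the path of the staged payload (local_tarball_path); both names are placeholders, hashlib is standard library: + # import hashlib + # with open(local_tarball_path, 'rb') as payload_file: + #     actual_checksum = hashlib.sha256(payload_file.read()).hexdigest() + # if actual_checksum != expected_checksum: + #     log_message(LoggingScope.STATE_OPS, 'ERROR', 'checksum mismatch for %s', local_tarball_path) + #     return False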
+ script = self.config['paths']['ingestion_script'] + sudo = ['sudo'] if self.config['cvmfs'].getboolean('ingest_as_root', True) else [] + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Running the ingestion script for %s...\n with script: %s\n with sudo: %s', + self.description.get_task_file_name(), + script, 'no' if sudo == [] else 'yes') + # ingest_cmd = subprocess.run( + # sudo + [script, self.cvmfs_repo, self.local_path], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + # if ingest_cmd.returncode == 0: + if False: + next_state = self._next_state(self.state) + self._move_metadata_file(self.state, next_state) + if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): + # send_slack_message( + # self.config['secrets']['slack_webhook'], + # self.config['slack']['ingestion_message'].format( + # tarball=os.path.basename(self.payload.local_path), + # cvmfs_repo=self.cvmfs_repo) + # ) + pass + else: + issue_title = f'Failed to add {os.path.basename(self.payload.local_path)}' + # issue_body = self.config['github']['failed_ingestion_issue_body'].format( + # command=' '.join(ingest_cmd.args), + # tarball=os.path.basename(self.payload.local_path), + # return_code=ingest_cmd.returncode, + # stdout=ingest_cmd.stdout.decode('UTF-8'), + # stderr=ingest_cmd.stderr.decode('UTF-8'), + # ) + if self.issue_exists(issue_title, state='open'): + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Failed to add %s, but an open issue already exists, skipping...', + os.path.basename(self.payload.local_path)) + else: + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Failed to add %s, but an open issue does not exist, creating one...', + os.path.basename(self.payload.local_path)) + # TODO: self.git_repo.create_issue(title=issue_title, body=issue_body) + @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" print("Handling ADD action in APPROVED state: %s" % self.description.get_task_file_name()) # Implementation for adding in APPROVED state # TODO: essentially, run the ingest function + self._perform_task_action() # TODO: change state in default branch to INGESTED return TaskState.INGESTED From 623b8c306db1efaec13cd102dc0d0a7c041985ec Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 15:13:43 +0200 Subject: [PATCH 182/218] fix access to payload file name --- scripts/automated_ingestion/eessi_task.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 77dcd2eb..777456ba 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1250,7 +1250,7 @@ def _perform_task_add(self): self.description.get_task_file_name(), script, 'no' if sudo == [] else 'yes') # ingest_cmd = subprocess.run( - # sudo + [script, self.cvmfs_repo, self.local_path], + # sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], # stdout=subprocess.PIPE, # stderr=subprocess.PIPE) # if ingest_cmd.returncode == 0: @@ -1261,15 +1261,15 @@ def _perform_task_add(self): # send_slack_message( # self.config['secrets']['slack_webhook'], # self.config['slack']['ingestion_message'].format( - # tarball=os.path.basename(self.payload.local_path), + # tarball=os.path.basename(self.payload.payload_object.local_file_path), # cvmfs_repo=self.cvmfs_repo) # ) pass else: - issue_title = f'Failed to add {os.path.basename(self.payload.local_path)}' + issue_title = 
f'Failed to add {os.path.basename(self.payload.payload_object.local_file_path)}' # issue_body = self.config['github']['failed_ingestion_issue_body'].format( # command=' '.join(ingest_cmd.args), - # tarball=os.path.basename(self.payload.local_path), + # tarball=os.path.basename(self.payload.payload_object.local_file_path), # return_code=ingest_cmd.returncode, # stdout=ingest_cmd.stdout.decode('UTF-8'), # stderr=ingest_cmd.stderr.decode('UTF-8'), @@ -1277,11 +1277,11 @@ def _perform_task_add(self): if self.issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', - os.path.basename(self.payload.local_path)) + os.path.basename(self.payload.payload_object.local_file_path)) else: log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue does not exist, creating one...', - os.path.basename(self.payload.local_path)) + os.path.basename(self.payload.payload_object.local_file_path)) # TODO: self.git_repo.create_issue(title=issue_title, body=issue_body) @log_function_entry_exit() From 1cd5ebd56a77ce2a346bb18034ac5424f6e6c315 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 21:29:08 +0200 Subject: [PATCH 183/218] add method _issue_exists --- scripts/automated_ingestion/eessi_task.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 777456ba..5b57517c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1239,6 +1239,18 @@ def _perform_task_action(self): else: raise ValueError(f"Task action '{self.action}' not supported (yet)") + @log_function_entry_exit() + def _issue_exists(self, title: str, state: str = 'open') -> bool: + """ + Check if an issue with the given title and state already exists. 
+ """ + issues = self.git_repo.get_issues(state=state) + for issue in issues: + if issue.title == title and issue.state == state: + return True + else: + return False + @log_function_entry_exit() def _perform_task_add(self): """Perform the ADD task action""" @@ -1274,7 +1286,7 @@ def _perform_task_add(self): # stdout=ingest_cmd.stdout.decode('UTF-8'), # stderr=ingest_cmd.stderr.decode('UTF-8'), # ) - if self.issue_exists(issue_title, state='open'): + if self._issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', os.path.basename(self.payload.payload_object.local_file_path)) From c30abfcbe419a68b2c764e4825095b81524ea145 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 19 Jun 2025 00:27:33 +0200 Subject: [PATCH 184/218] updates to ingestion incl state handling, error handling --- scripts/automated_ingestion/eessi_task.py | 83 +++++++++++++---------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5b57517c..40d4d151 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -2,15 +2,16 @@ from typing import Dict, List, Tuple, Optional from functools import total_ordering +import base64 import os +import subprocess import traceback -import base64 from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload -from utils import log_message, LoggingScope, log_function_entry_exit +from utils import send_slack_message, log_message, LoggingScope, log_function_entry_exit from github import Github, GithubException, InputGitTreeElement, UnknownObjectException from github.PullRequest import PullRequest @@ -82,9 +83,9 @@ def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: TaskState.PAYLOAD_STAGED: [TaskState.PULL_REQUEST], TaskState.PULL_REQUEST: [TaskState.APPROVED, TaskState.REJECTED], TaskState.APPROVED: [TaskState.INGESTED], - TaskState.REJECTED: [TaskState.DONE], - TaskState.INGESTED: [TaskState.DONE], - TaskState.DONE: [] # Terminal state + TaskState.REJECTED: [], # terminal state + TaskState.INGESTED: [], # terminal state + TaskState.DONE: [] # virtual terminal state, not used to write on GitHub } self.payload = None @@ -1231,11 +1232,11 @@ def _handle_add_pull_request(self): return TaskState.PULL_REQUEST @log_function_entry_exit() - def _perform_task_action(self): + def _perform_task_action(self) -> bool: """Perform the task action""" # TODO: support other actions than ADD if self.action == EESSITaskAction.ADD: - self._perform_task_add() + return self._perform_task_add() else: raise ValueError(f"Task action '{self.action}' not supported (yet)") @@ -1252,7 +1253,7 @@ def _issue_exists(self, title: str, state: str = 'open') -> bool: return False @log_function_entry_exit() - def _perform_task_add(self): + def _perform_task_add(self) -> bool: """Perform the ADD task action""" # TODO: verify checksum here or before? 
script = self.config['paths']['ingestion_script'] @@ -1261,31 +1262,31 @@ def _perform_task_add(self): 'Running the ingestion script for %s...\n with script: %s\n with sudo: %s', self.description.get_task_file_name(), script, 'no' if sudo == [] else 'yes') - # ingest_cmd = subprocess.run( - # sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - # if ingest_cmd.returncode == 0: - if False: + ingest_cmd = subprocess.run( + sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + # TODO: if ingest_cmd.returncode == 0: + if True: next_state = self._next_state(self.state) - self._move_metadata_file(self.state, next_state) + self._update_task_state_file(next_state) if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): - # send_slack_message( - # self.config['secrets']['slack_webhook'], - # self.config['slack']['ingestion_message'].format( - # tarball=os.path.basename(self.payload.payload_object.local_file_path), - # cvmfs_repo=self.cvmfs_repo) - # ) - pass + send_slack_message( + self.config['secrets']['slack_webhook'], + self.config['slack']['ingestion_message'].format( + tarball=os.path.basename(self.payload.payload_object.local_file_path), + cvmfs_repo=self.cvmfs_repo) + ) + return True else: issue_title = f'Failed to add {os.path.basename(self.payload.payload_object.local_file_path)}' - # issue_body = self.config['github']['failed_ingestion_issue_body'].format( - # command=' '.join(ingest_cmd.args), - # tarball=os.path.basename(self.payload.payload_object.local_file_path), - # return_code=ingest_cmd.returncode, - # stdout=ingest_cmd.stdout.decode('UTF-8'), - # stderr=ingest_cmd.stderr.decode('UTF-8'), - # ) + issue_body = self.config['github']['failed_ingestion_issue_body'].format( + command=' '.join(ingest_cmd.args), + tarball=os.path.basename(self.payload.payload_object.local_file_path), + return_code=ingest_cmd.returncode, + stdout=ingest_cmd.stdout.decode('UTF-8'), + stderr=ingest_cmd.stderr.decode('UTF-8'), + ) if self._issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', @@ -1294,24 +1295,33 @@ def _perform_task_add(self): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue does not exist, creating one...', os.path.basename(self.payload.payload_object.local_file_path)) - # TODO: self.git_repo.create_issue(title=issue_title, body=issue_body) + self.git_repo.create_issue(title=issue_title, body=issue_body) + return False @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" print("Handling ADD action in APPROVED state: %s" % self.description.get_task_file_name()) # Implementation for adding in APPROVED state - # TODO: essentially, run the ingest function - self._perform_task_action() - # TODO: change state in default branch to INGESTED - return TaskState.INGESTED + # If successful, _perform_task_action() will change the state + # to INGESTED on GitHub + try: + if self._perform_task_action(): + return TaskState.INGESTED + else: + return TaskState.APPROVED + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', + "Error performing task action: %s", err) + return TaskState.APPROVED @log_function_entry_exit() def _handle_add_ingested(self): """Handler for ADD action in INGESTED state""" 
print("Handling ADD action in INGESTED state: %s" % self.description.get_task_file_name()) # Implementation for adding in INGESTED state - # TODO: change state in default branch to DONE + # DONT change state on GitHub, because the result + # (INGESTED/REJECTED) would be overwritten return TaskState.DONE @log_function_entry_exit() @@ -1319,7 +1329,8 @@ def _handle_add_rejected(self): """Handler for ADD action in REJECTED state""" print("Handling ADD action in REJECTED state: %s" % self.description.get_task_file_name()) # Implementation for adding in REJECTED state - # TODO: change state in default branch to DONE + # DONT change state on GitHub, because the result + # (INGESTED/REJECTED) would be overwritten return TaskState.DONE @log_function_entry_exit() From d9d2fc836b4da63afbd8e1afdb5ee59e4fb67d06 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 19 Jun 2025 00:53:39 +0200 Subject: [PATCH 185/218] fix using state and remove unused function transition_to --- scripts/automated_ingestion/eessi_task.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 40d4d151..99e581c4 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1268,7 +1268,7 @@ def _perform_task_add(self) -> bool: stderr=subprocess.PIPE) # TODO: if ingest_cmd.returncode == 0: if True: - next_state = self._next_state(self.state) + next_state = self._next_state(TaskState.APPROVED) self._update_task_state_file(next_state) if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): send_slack_message( @@ -1333,16 +1333,6 @@ def _handle_add_rejected(self): # (INGESTED/REJECTED) would be overwritten return TaskState.DONE - @log_function_entry_exit() - def transition_to(self, new_state: TaskState): - """ - Transition the task to a new state if valid. 
- """ - if new_state in self.valid_transitions[self.state]: - self.state = new_state - return True - return False - @log_function_entry_exit() def __str__(self): return f"EESSITask(description={self.description}, action={self.action}, state={self.determine_state()})" From 96050b601689664d7e321f66ea3b0d05cc303879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 11:39:27 +0200 Subject: [PATCH 186/218] enable code to run ingestion script --- scripts/automated_ingestion/eessi_task.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 99e581c4..23de8db7 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1266,8 +1266,7 @@ def _perform_task_add(self) -> bool: sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - # TODO: if ingest_cmd.returncode == 0: - if True: + if ingest_cmd.returncode == 0: next_state = self._next_state(TaskState.APPROVED) self._update_task_state_file(next_state) if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): From 435b96fa4cac4bffe679878697db0f89a30c8307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:07:16 +0200 Subject: [PATCH 187/218] bump sequence number to 7 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 23de8db7..4607bbbe 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 6 + return 7 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 37149ee5575480cde763803a5c772f9f70c4f573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:18:06 +0200 Subject: [PATCH 188/218] log result and output of ingest script --- scripts/automated_ingestion/eessi_task.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 4607bbbe..2ea19e58 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1266,6 +1266,12 @@ def _perform_task_add(self) -> bool: sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Ingestion script returned code %s', ingest_cmd.returncode) + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Ingestion script stdout: %s', ingest_cmd.stdout.decode('UTF-8')) + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Ingestion script stderr: %s', ingest_cmd.stderr.decode('UTF-8')) if ingest_cmd.returncode == 0: next_state = self._next_state(TaskState.APPROVED) self._update_task_state_file(next_state) From c7ed07db3297cc606b9838098c2a6791535a5de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:19:09 +0200 Subject: [PATCH 189/218] bump sequence number to 8 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 2ea19e58..e13ebe3d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 7 + return 8 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From d4528fef653485cbef13546c5eea18fe581542f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:24:47 +0200 Subject: [PATCH 190/218] bump sequence number to 9 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index e13ebe3d..ce43ba5d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 8 + return 9 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 73a77ee1f2a13bef6dd968f171f65875f9c547a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:36:41 +0200 Subject: [PATCH 191/218] add logging for issue creation --- scripts/automated_ingestion/eessi_task.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ce43ba5d..d97987ef 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1284,6 +1284,10 @@ def _perform_task_add(self) -> bool: ) return True else: + log_message(LoggingScope.STATE_OPS, 'ERROR', + 'Failed to add %s, return code %s', + os.path.basename(self.payload.payload_object.local_file_path), + ingest_cmd.returncode) issue_title = f'Failed to add {os.path.basename(self.payload.payload_object.local_file_path)}' issue_body = self.config['github']['failed_ingestion_issue_body'].format( command=' '.join(ingest_cmd.args), @@ -1292,6 +1296,9 @@ def _perform_task_add(self) -> bool: stdout=ingest_cmd.stdout.decode('UTF-8'), stderr=ingest_cmd.stderr.decode('UTF-8'), ) + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Creating issue for failed ingestion: title: %s, body: %s', + issue_title, issue_body) if self._issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', From adcb918d02a743bcd955a5a7298bad7c8a19f103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:37:06 +0200 Subject: [PATCH 192/218] bump sequence number to 10 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d97987ef..cc771044 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 9 + return 10 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 98c04095568d1a49e78a2dd2f7978c29a0479710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 17:17:27 +0200 Subject: [PATCH 193/218] improve logging when processing ingestion failure --- scripts/automated_ingestion/eessi_task.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index cc771044..5ced5ad1 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1284,21 +1284,29 @@ def _perform_task_add(self) -> bool: ) return True else: + tarball = os.path.basename(self.payload.payload_object.local_file_path) log_message(LoggingScope.STATE_OPS, 'ERROR', 'Failed to add %s, return code %s', - os.path.basename(self.payload.payload_object.local_file_path), + tarball, ingest_cmd.returncode) - issue_title = f'Failed to add {os.path.basename(self.payload.payload_object.local_file_path)}' + + issue_title = f'Failed to add {tarball}' + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Creating issue for failed ingestion: title: %s', + issue_title) + + command = ' '.join(ingest_cmd.args) issue_body = self.config['github']['failed_ingestion_issue_body'].format( - command=' '.join(ingest_cmd.args), - tarball=os.path.basename(self.payload.payload_object.local_file_path), + command=command, + tarball=tarball, return_code=ingest_cmd.returncode, stdout=ingest_cmd.stdout.decode('UTF-8'), - stderr=ingest_cmd.stderr.decode('UTF-8'), + stderr=ingest_cmd.stderr.decode('UTF-8') ) log_message(LoggingScope.STATE_OPS, 'INFO', - 'Creating issue for failed ingestion: title: %s, body: %s', - issue_title, issue_body) + 'Creating issue for failed ingestion: body: %s', + issue_body) + if self._issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', From da9cc94d3679ea2e9c59f24dec195a56d454a42f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 18:06:00 +0200 Subject: [PATCH 194/218] add traceback when catching exception --- scripts/automated_ingestion/eessi_task.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5ced5ad1..9eeb3446 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1292,11 +1292,12 @@ def _perform_task_add(self) -> bool: issue_title = f'Failed to add {tarball}' log_message(LoggingScope.STATE_OPS, 'INFO', - 'Creating issue for failed ingestion: title: %s', + "Creating issue for failed ingestion: title: '%s'", issue_title) command = ' '.join(ingest_cmd.args) - issue_body = self.config['github']['failed_ingestion_issue_body'].format( + failed_ingestion_issue_body = self.config['github']['failed_ingestion_issue_body'] + issue_body = failed_ingestion_issue_body.format( command=command, tarball=tarball, return_code=ingest_cmd.returncode, @@ -1304,7 +1305,7 @@ def _perform_task_add(self) -> bool: stderr=ingest_cmd.stderr.decode('UTF-8') ) log_message(LoggingScope.STATE_OPS, 'INFO', - 'Creating issue for failed ingestion: body: %s', + "Creating issue for failed ingestion: body: '%s'", issue_body) if self._issue_exists(issue_title, state='open'): @@ -1332,7 +1333,7 @@ def 
_handle_add_approved(self): return TaskState.APPROVED except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', - "Error performing task action: %s", err) + "Error performing task action: '%s'\nTraceback:\n%s", err, traceback.format_exc()) return TaskState.APPROVED @log_function_entry_exit() From 54062a37c6cde40d59b20b18d310e75b83b6572e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 18:15:21 +0200 Subject: [PATCH 195/218] convert Path to str --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 9eeb3446..11b7f055 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1263,7 +1263,7 @@ def _perform_task_add(self) -> bool: self.description.get_task_file_name(), script, 'no' if sudo == [] else 'yes') ingest_cmd = subprocess.run( - sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], + sudo + [script, self.cvmfs_repo, str(self.payload.payload_object.local_file_path)], stdout=subprocess.PIPE, stderr=subprocess.PIPE) log_message(LoggingScope.STATE_OPS, 'INFO', From bb6021edb53fd1cf42c957ade72b3cc456bda7d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 20:14:26 +0200 Subject: [PATCH 196/218] bump sequence number to 11 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 11b7f055..7c21f1fe 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 10 + return 11 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From de00cee1032ae903c6fe16d20f6ea1b319126765 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:32:32 +0200 Subject: [PATCH 197/218] 1st step to make sequence numbers non-hardcoded --- scripts/automated_ingestion/eessi_task.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7c21f1fe..84c77d4b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -835,6 +835,7 @@ def _handle_add_new_task(self): @log_function_entry_exit() def _determine_branch_name_from_sequence_number(self, sequence_number: int = None) -> str: """Determine the branch name from the sequence number""" + # TODO: make sequence_number mandatory and thereby remove need for _get_fixed_sequence_number sequence_number = self._get_fixed_sequence_number() if sequence_number is None else sequence_number repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() @@ -863,7 +864,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: return None @log_function_entry_exit() - def _determine_sequence_number(self) -> int: + def _determine_sequence_number_from_pull_request_directory(self) -> int: """Determine the sequence number from the target directory name""" task_pointer_file = self.description.task_object.remote_file_path target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) @@ -963,6 +964,8 @@ def _create_task_summary(self) -> str: feature_branch_name = self._determine_feature_branch_name() repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() + # TODO: determine sequence number from task pointer file and thereby remove need + # for _get_fixed_sequence_number sequence_number = self._get_fixed_sequence_number() # corresponds to an open PR task_file_name = self.description.get_task_file_name() target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" @@ -1041,7 +1044,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" - seq_num = self._determine_sequence_number() + seq_num = self._determine_sequence_number_from_pull_request_directory() pr_title = pr_title_format.format( cvmfs_repo=self.cvmfs_repo, pr=pr_number, @@ -1080,7 +1083,7 @@ def _update_pull_request(self, pull_request: PullRequest): repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" - seq_num = self._determine_sequence_number() + seq_num = self._determine_sequence_number_from_pull_request_directory() self._create_task_summary() contents_overview = self._create_pr_contents_overview() From b8240dd25311ef18885623858682ea44de78cc35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:44:17 +0200 Subject: [PATCH 198/218] rename target_dir to pull_request_dir --- scripts/automated_ingestion/eessi_task.py | 60 +++++++++++------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 
84c77d4b..a9cfb263 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -472,9 +472,9 @@ def _read_dict_from_string(self, content: str) -> dict: return config_dict @log_function_entry_exit() - def _read_target_dir_from_file(self, path: str, branch_name: str = None) -> str: + def _read_pull_request_dir_from_file(self, path: str, branch_name: str = None) -> str: """ - Read the target directory from the file in the given branch. + Read the pull request directory from the file in the given branch. """ branch_name = self.git_repo.default_branch if branch_name is None else branch_name content = self.git_repo.get_contents(path, ref=branch_name) @@ -485,7 +485,7 @@ def _read_target_dir_from_file(self, path: str, branch_name: str = None) -> str: # Parse into dictionary config_dict = self._read_dict_from_string(content_str) - return config_dict.get('target_dir', None) + return config_dict.get('pull_request_dir', None) @log_function_entry_exit() def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: @@ -534,9 +534,9 @@ def determine_state(self, branch: str = None) -> TaskState: task_pointer_file, branch_to_use) # get state from task file in branch to use - # - read the TaskState file in target dir - target_dir = self._read_target_dir_from_file(task_pointer_file, branch_to_use) - task_state_file_path = f"{target_dir}/TaskState" + # - read the TaskState file in pull request directory + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, branch_to_use) + task_state_file_path = f"{pull_request_dir}/TaskState" task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch %s: %s", @@ -727,17 +727,17 @@ def _update_file(self, file_path, new_content, commit_message, branch_name: str def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" print("Handling ADD action in UNDETERMINED state: %s" % self.description.get_task_file_name()) - # create target directory (REPO/PR/SEQ/TASK_FILE_NAME/) - # create task file in target directory (TARGET_DIR/TaskDescription) - # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) - # create pointer file from task file path to target directory (remote_file_path -> TARGET_DIR) + # create pull request directory (REPO/PR/SEQ/TASK_FILE_NAME/) + # create task file in pull request directory (PULL_REQUEST_DIR/TaskDescription) + # create task status file in pull request directory (PULL_REQUEST_DIR/TaskState.NEW_TASK) + # create pointer file from task file path to pull request directory (remote_file_path -> PULL_REQUEST_DIR) repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() - target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" - task_description_file_path = f"{target_dir}/TaskDescription" - task_state_file_path = f"{target_dir}/TaskState" + pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + task_description_file_path = f"{pull_request_dir}/TaskDescription" + task_state_file_path = f"{pull_request_dir}/TaskState" remote_file_path = self.description.task_object.remote_file_path files_to_commit = { @@ -750,7 +750,7 @@ def _handle_add_undetermined(self): "mode": "100644" }, remote_file_path: { - "content": 
f"remote_file_path = {remote_file_path}\ntarget_dir = {target_dir}", + "content": f"remote_file_path = {remote_file_path}\npull_request_dir = {pull_request_dir}", "mode": "100644" } } @@ -778,8 +778,8 @@ def _update_task_state_file(self, next_state: TaskState, branch_name: str = None branch_name = self.git_repo.default_branch if branch_name is None else branch_name task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, branch_name) - task_state_file_path = f"{target_dir}/TaskState" + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, branch_name) + task_state_file_path = f"{pull_request_dir}/TaskState" arch = self.description.get_metadata_file_components()[3] commit_message = f"change task state to {next_state} in {branch_name} for {arch}" result = self._update_file(task_state_file_path, @@ -865,28 +865,28 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: @log_function_entry_exit() def _determine_sequence_number_from_pull_request_directory(self) -> int: - """Determine the sequence number from the target directory name""" + """Determine the sequence number from the pull request directory name""" task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) - # target_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) - _, _, _, seq, _ = target_dir.split('/') + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # pull_request_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + _, _, _, seq, _ = pull_request_dir.split('/') return int(seq) @log_function_entry_exit() def _determine_feature_branch_name(self) -> str: - """Determine the feature branch name from the target directory name""" + """Determine the feature branch name from the pull request directory name""" task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) - # target_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) - org, repo, pr, seq, _ = target_dir.split('/') + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # pull_request_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + org, repo, pr, seq, _ = pull_request_dir.split('/') return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" @log_function_entry_exit() def _sync_task_state_file(self, source_branch: str, target_branch: str): """Update task state file from source to target branch""" task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) - task_state_file_path = f"{target_dir}/TaskState" + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + task_state_file_path = f"{pull_request_dir}/TaskState" try: # Get content from source branch @@ -968,8 +968,8 @@ def _create_task_summary(self) -> str: # for _get_fixed_sequence_number sequence_number = self._get_fixed_sequence_number() # corresponds to an open PR task_file_name = self.description.get_task_file_name() - target_dir = 
f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" - task_summary_file_path = f"{target_dir}/TaskSummary.html" + pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + task_summary_file_path = f"{pull_request_dir}/TaskSummary.html" # check if task summary file already exists in repo on GitHub if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): @@ -1009,8 +1009,8 @@ def _create_pr_contents_overview(self) -> str: # TODO: implement feature_branch_name = self._determine_feature_branch_name() task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) - pr_dir = os.path.dirname(target_dir) + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, feature_branch_name) + pr_dir = os.path.dirname(pull_request_dir) directories = self._list_directory_contents(pr_dir, feature_branch_name) contents_overview = "" if directories: From 345bbb7a3b2903c8b631ee2367396d9c59e8ad92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:45:18 +0200 Subject: [PATCH 199/218] remove function to create symlink --- scripts/automated_ingestion/eessi_task.py | 57 ----------------------- 1 file changed, 57 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a9cfb263..f6a4d399 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -571,63 +571,6 @@ def handle(self): return state_before_handle # Implement handlers for ADD action - @log_function_entry_exit() - def _create_symlink(self, source_path: str, target_path: str, branch_name: str = None): - """Create a symlink in the given branch.""" - try: - branch_name = self.git_repo.default_branch if branch_name is None else branch_name - ref = self.git_repo.get_git_ref(f"heads/{branch_name}") - commit = self.git_repo.get_git_commit(ref.object.sha) - base_tree = self.git_repo.get_git_tree(commit.tree.sha) - - # Create blob for symlink target - blob = self.git_repo.create_git_blob(target_path, "utf-8") - log_message(LoggingScope.TASK_OPS, 'INFO', "blob created: %s", blob) - - # Create tree element - tree_element = InputGitTreeElement( - path=source_path, - mode="120000", - type="blob", - sha=blob.sha - ) - log_message(LoggingScope.TASK_OPS, 'INFO', "tree element created: %s", tree_element) - - # Create new tree - try: - new_tree = self.git_repo.create_git_tree([tree_element], base_tree) - log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) - except GithubException as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating new tree: %s", err) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Status Code: %s", err.status) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Error Message: %s", err.data) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Headers: %s", err.headers) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Raw Response: %s", err.response) - return False - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "\n=== General Exception ===") - log_message(LoggingScope.TASK_OPS, 'ERROR', " Type: %s", type(err).__name__) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Message: %s", str(err)) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Traceback:") - log_message(LoggingScope.TASK_OPS, 'ERROR', " %s", traceback.format_exc()) - return False - - # Create new commit - commit_message = f"Add symlink 
{source_path} -> {target_path}" - new_commit = self.git_repo.create_git_commit(commit_message, new_tree, [commit]) - log_message(LoggingScope.TASK_OPS, 'INFO', "new commit created: %s", new_commit) - - # Update reference - ref.edit(new_commit.sha) - - log_message(LoggingScope.TASK_OPS, 'INFO', "Symlink created: %s -> %s", - source_path, target_path) - return True - - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) - return False - @log_function_entry_exit() def _safe_create_file(self, path: str, message: str, content: str, branch_name: str = None): """Create a file in the given branch.""" From e8004155b168df215d3e5582619f465735c1838d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:47:50 +0200 Subject: [PATCH 200/218] remove function to obtain branch name from sequence number --- scripts/automated_ingestion/eessi_task.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index f6a4d399..0bda49ff 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -775,15 +775,6 @@ def _handle_add_new_task(self): # is still open or yet to be created); if it is not valid, perform corrective actions return next_state - @log_function_entry_exit() - def _determine_branch_name_from_sequence_number(self, sequence_number: int = None) -> str: - """Determine the branch name from the sequence number""" - # TODO: make sequence_number mandatory and thereby remove need for _get_fixed_sequence_number - sequence_number = self._get_fixed_sequence_number() if sequence_number is None else sequence_number - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - return f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - @log_function_entry_exit() def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ From d717527a2ac7e076b69cf28c919ef99ea27531d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:54:46 +0200 Subject: [PATCH 201/218] remove one use of _get_fixed_sequence_number --- scripts/automated_ingestion/eessi_task.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0bda49ff..0c9057eb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -895,17 +895,15 @@ def _create_task_summary(self) -> str: """Analyse contents of current task and create a file for it in the REPO-PR-SEQ directory.""" # determine task summary file path in feature branch on GitHub - feature_branch_name = self._determine_feature_branch_name() repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - # TODO: determine sequence number from task pointer file and thereby remove need - # for _get_fixed_sequence_number - sequence_number = self._get_fixed_sequence_number() # corresponds to an open PR + sequence_number = self._determine_sequence_number_from_pull_request_directory() task_file_name = self.description.get_task_file_name() pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_summary_file_path = f"{pull_request_dir}/TaskSummary.html" # check if task summary file already exists in repo on GitHub + feature_branch_name = self._determine_feature_branch_name() if 
self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) From 868b23be65aa2ec976146c8df683bc87df6aa483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 13:08:32 +0200 Subject: [PATCH 202/218] improve ways to obtain pull request directory --- scripts/automated_ingestion/eessi_task.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0c9057eb..be09eb5a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -472,12 +472,18 @@ def _read_dict_from_string(self, content: str) -> dict: return config_dict @log_function_entry_exit() - def _read_pull_request_dir_from_file(self, path: str, branch_name: str = None) -> str: + def _read_pull_request_dir_from_file(self, task_pointer_file: str = None, branch_name: str = None) -> str: """ Read the pull request directory from the file in the given branch. """ - branch_name = self.git_repo.default_branch if branch_name is None else branch_name - content = self.git_repo.get_contents(path, ref=branch_name) + # set default values for task pointer file and branch name + if task_pointer_file is None: + task_pointer_file = self.description.task_object.remote_file_path + if branch_name is None: + branch_name = self.git_repo.default_branch + + # read the pull request directory from the file in the given branch + content = self.git_repo.get_contents(task_pointer_file, ref=branch_name) # Decode the content from base64 content_str = content.decoded_content.decode('utf-8') @@ -487,6 +493,11 @@ def _read_pull_request_dir_from_file(self, path: str, branch_name: str = None) - return config_dict.get('pull_request_dir', None) + @log_function_entry_exit() + def _determine_pull_request_dir(self, task_pointer_file: str = None, branch_name: str = None) -> str: + """Determine the pull request directory via the task pointer file""" + return self._read_pull_request_dir_from_file(task_pointer_file=task_pointer_file, branch_name=branch_name) + @log_function_entry_exit() def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: """ @@ -535,7 +546,7 @@ def determine_state(self, branch: str = None) -> TaskState: # get state from task file in branch to use # - read the TaskState file in pull request directory - pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, branch_to_use) + pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use) task_state_file_path = f"{pull_request_dir}/TaskState" task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) @@ -678,7 +689,7 @@ def _handle_add_undetermined(self): pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() - pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + pull_request_dir = self._determine_pull_request_dir() task_description_file_path = f"{pull_request_dir}/TaskDescription" task_state_file_path = f"{pull_request_dir}/TaskState" remote_file_path = self.description.task_object.remote_file_path From dd0e8bbf4ca0f098e1e96ea4eb66682d4bebb127 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 13:13:37 +0200 Subject: [PATCH 203/218] clarify how to determine pull_request_dir --- scripts/automated_ingestion/eessi_task.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index be09eb5a..a05e57ea 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -681,6 +681,8 @@ def _update_file(self, file_path, new_content, commit_message, branch_name: str def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" print("Handling ADD action in UNDETERMINED state: %s" % self.description.get_task_file_name()) + # task is in state UNDETERMINED if there is no pull request directory for the task yet + # # create pull request directory (REPO/PR/SEQ/TASK_FILE_NAME/) # create task file in pull request directory (PULL_REQUEST_DIR/TaskDescription) # create task status file in pull request directory (PULL_REQUEST_DIR/TaskState.NEW_TASK) @@ -689,7 +691,9 @@ def _handle_add_undetermined(self): pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() - pull_request_dir = self._determine_pull_request_dir() + # we cannot use self._determine_pull_request_dir() here because it requires a task pointer file + # and we don't have one yet + pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{pull_request_dir}/TaskDescription" task_state_file_path = f"{pull_request_dir}/TaskState" remote_file_path = self.description.task_object.remote_file_path From 20a7ae87ce41507df1f19f165bec0025a52cb3d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 13:16:34 +0200 Subject: [PATCH 204/218] remove need for determining sequence number in _create_task_summary --- scripts/automated_ingestion/eessi_task.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a05e57ea..d7cf6ecc 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -910,15 +910,11 @@ def _create_task_summary(self) -> str: """Analyse contents of current task and create a file for it in the REPO-PR-SEQ directory.""" # determine task summary file path in feature branch on GitHub - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - sequence_number = self._determine_sequence_number_from_pull_request_directory() - task_file_name = self.description.get_task_file_name() - pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + feature_branch_name = self._determine_feature_branch_name() + pull_request_dir = self._determine_pull_request_dir(branch_name=feature_branch_name) task_summary_file_path = f"{pull_request_dir}/TaskSummary.html" # check if task summary file already exists in repo on GitHub - feature_branch_name = self._determine_feature_branch_name() if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) @@ -942,6 +938,7 @@ def 
_create_task_summary(self) -> str: # create HTML file with task summary in REPO-PR-SEQ directory # TODO: add failure handling (capture result and act on it) + task_file_name = self.description.get_task_file_name() commit_message = f"create summary for {task_file_name} in {feature_branch_name}" self._safe_create_file(task_summary_file_path, commit_message, task_summary, branch_name=feature_branch_name) From 814cd50cef8dd4e4442a693180406b317d841356 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 15:42:40 +0200 Subject: [PATCH 205/218] add function to determine sequence number and use the function --- scripts/automated_ingestion/eessi_task.py | 64 ++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d7cf6ecc..7368cecb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -677,6 +677,60 @@ def _update_file(self, file_path, new_content, commit_message, branch_name: str log_message(LoggingScope.TASK_OPS, 'ERROR', "Error updating file: %s", err) return None + @log_function_entry_exit() + def _sorted_list_of_sequence_numbers(self) -> List[int]: + """Create a sorted list of sequence numbers from the pull requests directory""" + # a pull request's directory is of the form REPO/PR/SEQ + # hence, we can get all sequence numbers from the pull requests directory REPO/PR + sequence_numbers = [] + repo_pr_dir = f"{self.description.get_repo_name()}/{self.description.get_pr_number()}" + + # iterate over all directories under repo_pr_dir + try: + directories = self._list_directory_contents(repo_pr_dir) + for dir in directories: + # check if the directory is a number + if dir.name.isdigit(): + sequence_numbers.append(int(dir.name)) + else: + # directory is not a number, so we skip it + continue + except FileNotFoundError: + # repo_pr_dir does not exist, so we return an empty list + log_message(LoggingScope.TASK_OPS, 'ERROR', "Pull requests directory '%s' does not exist", repo_pr_dir) + except GithubException as err: + if err.status != 404: # 404 is caught by FileNotFoundError + # some other error than the directory not existing + log_message(LoggingScope.TASK_OPS, 'ERROR', + "Some other error than the directory not existing: %s", err) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Unexpected error: %s", err) + + return sorted(sequence_numbers) + + @log_function_entry_exit() + def _determine_sequence_number(self) -> int: + """Determine the sequence number for the task""" + + sequence_numbers = self._sorted_list_of_sequence_numbers() + if len(sequence_numbers) == 0: + return 0 + + # get the highest sequence number + highest_sequence_number = sequence_numbers[-1] + + pull_request = self._find_pr_for_sequence_number(highest_sequence_number) + if pull_request is None: + # the directory for the sequence number exists but no PR yet + return highest_sequence_number + else: + if pull_request.is_merged(): + # the PR is merged, so we use the next sequence number + return highest_sequence_number + 1 + else: + # the PR is not merged, so we can use the current sequence number + return highest_sequence_number + @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" @@ -689,7 +743,7 @@ def _handle_add_undetermined(self): # create pointer file from task file path to pull request directory (remote_file_path -> 
PULL_REQUEST_DIR) repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR + sequence_number = self._determine_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() # we cannot use self._determine_pull_request_dir() here because it requires a task pointer file # and we don't have one yet @@ -812,6 +866,14 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) return None + @log_function_entry_exit() + def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullRequest]: + """Find the PR for the given sequence number""" + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + feature_branch_name = f"{repo_name}-PR-{pr_number}-SEQ-{sequence_number}" + return self._find_pr_for_branch(feature_branch_name) + @log_function_entry_exit() def _determine_sequence_number_from_pull_request_directory(self) -> int: """Determine the sequence number from the pull request directory name""" From cff761ebabc8a5fe6984a3fa044f5374f1bf9cfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 16:07:18 +0200 Subject: [PATCH 206/218] improve logging when determining task state --- scripts/automated_ingestion/eessi_task.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7368cecb..35b2ee02 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -541,20 +541,22 @@ def determine_state(self, branch: str = None) -> TaskState: branch_to_use = self.git_repo.default_branch if branch is None else branch if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", + log_message(LoggingScope.TASK_OPS, 'INFO', "path '%s' exists in branch '%s'", task_pointer_file, branch_to_use) # get state from task file in branch to use # - read the TaskState file in pull request directory pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use) + log_message(LoggingScope.TASK_OPS, 'INFO', "pull request directory: '%s'", pull_request_dir) task_state_file_path = f"{pull_request_dir}/TaskState" + log_message(LoggingScope.TASK_OPS, 'INFO', "task state file path: '%s'", task_state_file_path) task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) - log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch %s: %s", + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch '%s': %s", branch_to_use, task_state) return task_state else: - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in branch %s", + log_message(LoggingScope.TASK_OPS, 'INFO', "path '%s' does not exist in branch '%s'", task_pointer_file, branch_to_use) return TaskState.UNDETERMINED From d85e017c654116244deb2eeb093a7b4ed0a233aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 16:12:48 +0200 Subject: [PATCH 207/218] add logging and consider 'target_dir' attr name --- scripts/automated_ingestion/eessi_task.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
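The selection rule implemented by `_determine_sequence_number()` above can be summarized in a standalone sketch (simplified types, illustrative function name; the real method looks the PR up via GitHub):

```python
# Sketch of the rule: pick the highest existing sequence number; only move on to
# the next one if the PR for that sequence number is already merged.
from typing import List, Optional

def next_sequence_number(existing: List[int], pr_is_merged: Optional[bool]) -> int:
    """existing: sorted sequence numbers; pr_is_merged: None if no PR exists yet."""
    if not existing:
        return 0
    highest = existing[-1]
    if pr_is_merged:          # PR exists and is merged: start a new sequence
        return highest + 1
    return highest            # no PR yet, or PR still open: reuse the sequence

assert next_sequence_number([], None) == 0
assert next_sequence_number([0, 1], None) == 1    # directory exists, no PR yet
assert next_sequence_number([0, 1], False) == 1   # PR still open: keep using it
assert next_sequence_number([0, 1], True) == 2    # PR merged: next sequence
```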
From cff761ebabc8a5fe6984a3fa044f5374f1bf9cfa Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:07:18 +0200
Subject: [PATCH 206/218] improve logging when determining task state

---
 scripts/automated_ingestion/eessi_task.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 7368cecb..35b2ee02 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -541,20 +541,22 @@ def determine_state(self, branch: str = None) -> TaskState:
         branch_to_use = self.git_repo.default_branch if branch is None else branch

         if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use):
-            log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s",
+            log_message(LoggingScope.TASK_OPS, 'INFO', "path '%s' exists in branch '%s'",
                         task_pointer_file, branch_to_use)

             # get state from task file in branch to use
             #  - read the TaskState file in pull request directory
             pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use)
+            log_message(LoggingScope.TASK_OPS, 'INFO', "pull request directory: '%s'", pull_request_dir)
             task_state_file_path = f"{pull_request_dir}/TaskState"
+            log_message(LoggingScope.TASK_OPS, 'INFO', "task state file path: '%s'", task_state_file_path)
             task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use)
-            log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch %s: %s",
+            log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch '%s': %s",
                         branch_to_use, task_state)
             return task_state
         else:
-            log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in branch %s",
+            log_message(LoggingScope.TASK_OPS, 'INFO', "path '%s' does not exist in branch '%s'",
                         task_pointer_file, branch_to_use)
             return TaskState.UNDETERMINED

From d85e017c654116244deb2eeb093a7b4ed0a233aa Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:12:48 +0200
Subject: [PATCH 207/218] add logging and consider 'target_dir' attr name

---
 scripts/automated_ingestion/eessi_task.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 35b2ee02..673c51d6 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -481,6 +481,8 @@ def _read_pull_request_dir_from_file(self, task_pointer_file: str = None, branch
             task_pointer_file = self.description.task_object.remote_file_path
         if branch_name is None:
             branch_name = self.git_repo.default_branch
+        log_message(LoggingScope.TASK_OPS, 'INFO', "reading pull request directory from file '%s' in branch '%s'",
+                    task_pointer_file, branch_name)

         # read the pull request directory from the file in the given branch
         content = self.git_repo.get_contents(task_pointer_file, ref=branch_name)
@@ -491,7 +493,8 @@ def _read_pull_request_dir_from_file(self, task_pointer_file: str = None, branch
         # Parse into dictionary
         config_dict = self._read_dict_from_string(content_str)

-        return config_dict.get('pull_request_dir', None)
+        target_dir = config_dict.get('target_dir', None)
+        return config_dict.get('pull_request_dir', target_dir)

     @log_function_entry_exit()
     def _determine_pull_request_dir(self, task_pointer_file: str = None, branch_name: str = None) -> str:

From cf081158a85e65fc59ff3a23fad6feed2a4fe883 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:34:13 +0200
Subject: [PATCH 208/218] add logging for determining sequence number and pull request

---
 scripts/automated_ingestion/eessi_task.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 673c51d6..7a4b789d 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -718,14 +718,22 @@ def _determine_sequence_number(self) -> int:
         """Determine the sequence number for the task"""

         sequence_numbers = self._sorted_list_of_sequence_numbers()
+        log_message(LoggingScope.TASK_OPS, 'INFO', "number of sequence numbers: %d", len(sequence_numbers))
         if len(sequence_numbers) == 0:
             return 0

+        log_message(LoggingScope.TASK_OPS, 'INFO', "sequence numbers: [%s]", ", ".join(map(str, sequence_numbers)))
+
         # get the highest sequence number
         highest_sequence_number = sequence_numbers[-1]
+        log_message(LoggingScope.TASK_OPS, 'INFO', "highest sequence number: %d", highest_sequence_number)

         pull_request = self._find_pr_for_sequence_number(highest_sequence_number)
+        log_message(LoggingScope.TASK_OPS, 'INFO', "pull request: %s", pull_request)
+
         if pull_request is None:
+            log_message(LoggingScope.TASK_OPS, 'INFO', "Did not find pull request for sequence number %d",
+                        highest_sequence_number)
             # the directory for the sequence number exists but no PR yet
             return highest_sequence_number
         else:
@@ -863,9 +871,13 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
         """
         try:
             head_ref = f"{self.git_repo.owner.login}:{branch_name}"
+            log_message(LoggingScope.TASK_OPS, 'INFO', "searching for PRs with head_ref: '%s'", head_ref)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
             prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref))
                    if pr.number not in filter_prs]
+            log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
+            if len(prs):
+                log_message(LoggingScope.TASK_OPS, 'INFO', "1st PR found: %d", prs[0].number)
             return prs[0] if prs else None
         except Exception as err:
             log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err)

From f94f69d37f34d91626860cb1ef72a55cabc322ee Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:47:46 +0200
Subject: [PATCH 209/218] print head refs for all PRs

---
 scripts/automated_ingestion/eessi_task.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 7a4b789d..b3f695d6 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -873,6 +873,11 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
             head_ref = f"{self.git_repo.owner.login}:{branch_name}"
             log_message(LoggingScope.TASK_OPS, 'INFO', "searching for PRs with head_ref: '%s'", head_ref)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
+
+            all_prs = list(self.git_repo.get_pulls(state='all'))
+            for pr in all_prs:
+                log_message(LoggingScope.TASK_OPS, 'INFO', "PR #{pr.number}: {pr.head.ref}")
+
             prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref))
                    if pr.number not in filter_prs]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))

From 06faa4b4e8ae90c2e8b151835881cfb31f8a9313 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:50:20 +0200
Subject: [PATCH 210/218] fix PR head ref logging

---
 scripts/automated_ingestion/eessi_task.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index b3f695d6..5a18c30c 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -876,7 +876,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:

             all_prs = list(self.git_repo.get_pulls(state='all'))
             for pr in all_prs:
-                log_message(LoggingScope.TASK_OPS, 'INFO', "PR #{pr.number}: {pr.head.ref}")
+                log_message(LoggingScope.TASK_OPS, 'INFO', "PR #%d: %s", pr.number, pr.head.ref)

             prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref))
                    if pr.number not in filter_prs]

From 9979adffc1223c327184ee572f50067f5757a005 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:54:24 +0200
Subject: [PATCH 211/218] do not use login when searching for PRs

---
 scripts/automated_ingestion/eessi_task.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 5a18c30c..20512a4f 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -870,7 +870,9 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
             PullRequest object if found, None otherwise
         """
         try:
-            head_ref = f"{self.git_repo.owner.login}:{branch_name}"
+            # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
+            # apparently, the head_ref does not contain the login
+            head_ref = f"{branch_name}"
             log_message(LoggingScope.TASK_OPS, 'INFO', "searching for PRs with head_ref: '%s'", head_ref)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
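The bug fixed in patch 210 is a common logging pitfall: the message added in patch 209 was a plain string, so the `{...}` placeholders were logged verbatim instead of being interpolated. A standalone sketch using the stdlib `logging` module for illustration (the patches use the project's own `log_message` helper, which follows the same %-style convention):

```python
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

number, ref = 17, "software-layer-PR-42-SEQ-0"   # made-up example values

log.info("PR #{number}: {ref}")       # bug: plain string, braces logged literally
log.info(f"PR #{number}: {ref}")      # works, but formats eagerly even when filtered out
log.info("PR #%d: %s", number, ref)   # the fix: lazy %-style interpolation (patch 210)
```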
From af021aef9ffc4a3877394665caf5671d593a9529 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 17:27:36 +0200
Subject: [PATCH 212/218] improve function to determine PR

---
 scripts/automated_ingestion/eessi_task.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 20512a4f..dfdadf6c 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -737,6 +737,9 @@ def _determine_sequence_number(self) -> int:
             # the directory for the sequence number exists but no PR yet
             return highest_sequence_number
         else:
+            log_message(LoggingScope.TASK_OPS, 'INFO', "pull request found: %s", pull_request)
+            log_message(LoggingScope.TASK_OPS, 'INFO', "pull request state/merged: %s/%s",
+                        pull_request.state, str(pull_request.is_merged()))
             if pull_request.is_merged():
                 # the PR is merged, so we use the next sequence number
                 return highest_sequence_number + 1
@@ -872,15 +875,22 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
         try:
             # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
             # apparently, the head_ref does not contain the login
-            head_ref = f"{branch_name}"
-            log_message(LoggingScope.TASK_OPS, 'INFO', "searching for PRs with head_ref: '%s'", head_ref)
+            last_dash = branch_name.rfind('-')
+            if last_dash != -1:
+                head_ref_wout_seq_num = branch_name[:last_dash + 1]  # +1 to include the separator
+            else:
+                head_ref_wout_seq_num = branch_name
+
+            log_message(LoggingScope.TASK_OPS, 'INFO',
+                        "searching for PRs starting with head_ref: '%s'", head_ref_wout_seq_num)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
-            all_prs = list(self.git_repo.get_pulls(state='all'))
+            all_prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
+                       if pr.head.ref.startswith(head_ref_wout_seq_num)]
             for pr in all_prs:
                 log_message(LoggingScope.TASK_OPS, 'INFO', "PR #%d: %s", pr.number, pr.head.ref)

-            prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref))
+            prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=branch_name))
                    if pr.number not in filter_prs]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
             if len(prs):
@@ -896,7 +906,10 @@ def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullReq
         repo_name = self.description.get_repo_name()
         pr_number = self.description.get_pr_number()
         feature_branch_name = f"{repo_name}-PR-{pr_number}-SEQ-{sequence_number}"
-        return self._find_pr_for_branch(feature_branch_name)
+        pull_request = self._find_pr_for_branch(feature_branch_name)
+        log_message(LoggingScope.TASK_OPS, 'INFO', "pull request for branch '%s': %s",
+                    feature_branch_name, pull_request)
+        return pull_request
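The debugging aid added in patch 212 derives a branch-name prefix by cutting at the last dash, so that all sequence branches belonging to one source PR can be listed together. In isolation (illustrative function name):

```python
# Sketch of the prefix computation: strip the trailing sequence number but keep
# the separator, so str.startswith() matches all SEQ-* branches of the same PR.
def head_ref_prefix(branch_name: str) -> str:
    """'repo-PR-42-SEQ-3' -> 'repo-PR-42-SEQ-' (keep the separator)."""
    last_dash = branch_name.rfind('-')
    return branch_name[:last_dash + 1] if last_dash != -1 else branch_name

assert head_ref_prefix("software-layer-PR-42-SEQ-3") == "software-layer-PR-42-SEQ-"
assert head_ref_prefix("nodash") == "nodash"   # no separator: use the full name
```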
From 251b60eae5f36da0c0d6d35e904c3643a378b033 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 17:35:58 +0200
Subject: [PATCH 213/218] fix branch name

---
 scripts/automated_ingestion/eessi_task.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index dfdadf6c..86ccae01 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -905,7 +905,7 @@ def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullReq
         """Find the PR for the given sequence number"""
         repo_name = self.description.get_repo_name()
         pr_number = self.description.get_pr_number()
-        feature_branch_name = f"{repo_name}-PR-{pr_number}-SEQ-{sequence_number}"
+        feature_branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}"
         pull_request = self._find_pr_for_branch(feature_branch_name)
         log_message(LoggingScope.TASK_OPS, 'INFO', "pull request for branch '%s': %s",
                     feature_branch_name, pull_request)
         return pull_request

From 2275007fbacd09ffbc62fe81d878011122610d9d Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 18:04:52 +0200
Subject: [PATCH 214/218] restructure logging when determining PR

---
 scripts/automated_ingestion/eessi_task.py | 34 +++++++++++++----------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 86ccae01..50c6ea2a 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -875,21 +875,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
         try:
             # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
             # apparently, the head_ref does not contain the login
-            last_dash = branch_name.rfind('-')
-            if last_dash != -1:
-                head_ref_wout_seq_num = branch_name[:last_dash + 1]  # +1 to include the separator
-            else:
-                head_ref_wout_seq_num = branch_name
-
-            log_message(LoggingScope.TASK_OPS, 'INFO',
-                        "searching for PRs starting with head_ref: '%s'", head_ref_wout_seq_num)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
-
-            all_prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
-                       if pr.head.ref.startswith(head_ref_wout_seq_num)]
-            for pr in all_prs:
-                log_message(LoggingScope.TASK_OPS, 'INFO', "PR #%d: %s", pr.number, pr.head.ref)
-
             prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=branch_name))
                    if pr.number not in filter_prs]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
@@ -906,6 +892,26 @@ def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullReq
         repo_name = self.description.get_repo_name()
         pr_number = self.description.get_pr_number()
         feature_branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}"
+
+        # list all PRs with head_ref starting with the feature branch name without the sequence number
+        last_dash = feature_branch_name.rfind('-')
+        if last_dash != -1:
+            head_ref_wout_seq_num = feature_branch_name[:last_dash + 1]  # +1 to include the separator
+        else:
+            head_ref_wout_seq_num = feature_branch_name
+
+        log_message(LoggingScope.TASK_OPS, 'INFO',
+                    "searching for PRs whose head_ref starts with: '%s'", head_ref_wout_seq_num)
+
+        all_prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
+                   if pr.head.ref.startswith(head_ref_wout_seq_num)]
+        log_message(LoggingScope.TASK_OPS, 'INFO', "  number of PRs found: %d", len(all_prs))
+        for pr in all_prs:
+            log_message(LoggingScope.TASK_OPS, 'INFO', "  PR #%d: %s", pr.number, pr.head.ref)
+
+        # now, find the PR for the feature branch name (if any)
+        log_message(LoggingScope.TASK_OPS, 'INFO',
+                    "searching PR for feature branch name: '%s'", feature_branch_name)
         pull_request = self._find_pr_for_branch(feature_branch_name)
         log_message(LoggingScope.TASK_OPS, 'INFO', "pull request for branch '%s': %s",
                     feature_branch_name, pull_request)
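Patch 213 fixes the fact that `repo_name` has the `owner/repo` form, so the slash ended up in a branch name that never matched the feature branches actually created. A minimal sketch of the corrected naming scheme (function name and example values are illustrative):

```python
# Sketch of the feature branch naming after patch 213: flatten 'owner/repo'
# into dashes before composing the branch name.
def feature_branch_name(repo_name: str, pr_number: int, sequence_number: int) -> str:
    return f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}"

assert feature_branch_name("EESSI/software-layer", 42, 0) == \
    "EESSI-software-layer-PR-42-SEQ-0"
```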
 scripts/automated_ingestion/eessi_task.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 50c6ea2a..784cbb73 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -876,11 +876,11 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
             # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
             # apparently, the head_ref does not contain the login
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
-            prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=branch_name))
-                   if pr.number not in filter_prs]
+            prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
+                   if pr.number not in filter_prs and pr.head.ref == branch_name]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
             if len(prs):
-                log_message(LoggingScope.TASK_OPS, 'INFO', "1st PR found: %d", prs[0].number)
+                log_message(LoggingScope.TASK_OPS, 'INFO', "1st PR found: %d, %s", prs[0].number, prs[0].head.ref)
             return prs[0] if prs else None
         except Exception as err:
             log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err)

From 29d17fc0b8e06ea33f2f7cfe700a2536557c62b0 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 18:57:38 +0200
Subject: [PATCH 216/218] little code cleanup

---
 scripts/automated_ingestion/eessi_task.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 784cbb73..be0ce67c 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -873,11 +873,8 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
             PullRequest object if found, None otherwise
         """
         try:
-            # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
-            # apparently, the head_ref does not contain the login
-            filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
             prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
-                   if pr.number not in filter_prs and pr.head.ref == branch_name]
+                   if pr.head.ref == branch_name]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
             if len(prs):
                 log_message(LoggingScope.TASK_OPS, 'INFO', "1st PR found: %d, %s", prs[0].number, prs[0].head.ref)
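After patches 215 and 216, the PR lookup no longer relies on the server-side `head=` filter at all: it lists all PRs and compares `pr.head.ref` client-side. A stripped-down sketch of that approach, assuming a PyGithub `Repository` object and omitting the logging and error handling of the real method:

```python
from typing import Optional

def find_pr_for_branch(repo, branch_name: str) -> Optional[object]:
    """Return the first PR whose head branch equals branch_name, else None.

    repo is assumed to be a github.Repository.Repository (PyGithub).
    """
    for pr in repo.get_pulls(state='all'):
        if pr.head.ref == branch_name:
            return pr
    return None
```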
From be9443c659615e93db46fff1850cd7adb35121b8 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 20:43:37 +0200
Subject: [PATCH 217/218] reformat PR body and make format configurable

---
 scripts/automated_ingestion/eessi_task.py     | 22 ++++++++------
 .../automated_ingestion/eessi_task_payload.py | 20 ++++++++---------
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index be0ce67c..5e54597e 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -1027,16 +1027,12 @@ def _create_task_summary(self) -> str:
         payload_name = self.description.metadata['payload']['filename']
         payload_summary = self.payload.analyse_contents()
         metadata_contents = self.description.get_contents()
-        task_summary = f"<details>\n<summary>{payload_name}</summary>\n\n"
-        task_summary += "<details>\n<summary>Metadata</summary>\n\n"
-        task_summary += f"```\n{metadata_contents}\n```\n</details>\n"
-        task_summary += "<details>\n<summary>Overview of payload contents</summary>\n\n"
-        task_summary += self.config['github']['task_summary_payload_template'].format(
-            payload_overview=payload_summary,
+
+        task_summary = self.config['github']['task_summary_payload_template'].format(
+            payload_name=payload_name,
+            metadata_contents=metadata_contents,
+            payload_overview=payload_summary
         )
-        task_summary += "</details>\n"
-        task_summary += "\n"
-        task_summary += "</details>\n"

         # create HTML file with task summary in REPO-PR-SEQ directory
         # TODO: add failure handling (capture result and act on it)
@@ -1106,8 +1102,8 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st
             repo=repo_name,
             seq_num=seq_num,
             contents=contents_overview,
-            analysis="TO BE DONE",
-            action="TO BE DONE",
+            analysis="<details>TO BE DONE</details>",
+            action="<details>TO BE DONE</details>",
         )
         pr = self.git_repo.create_pull(
             title=pr_title,
@@ -1141,8 +1137,8 @@ def _update_pull_request(self, pull_request: PullRequest):
             repo=repo_name,
             seq_num=seq_num,
             contents=contents_overview,
-            analysis="TO BE DONE",
-            action="TO BE DONE",
+            analysis="<details>TO BE DONE</details>",
+            action="<details>TO BE DONE</details>",
         )
         pull_request.edit(body=pr_body)

diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py
index c8f82df2..39ab3724 100644
--- a/scripts/automated_ingestion/eessi_task_payload.py
+++ b/scripts/automated_ingestion/eessi_task_payload.py
@@ -85,16 +85,16 @@ def analyse_contents(self) -> str:
         ]
         members_list = sorted(swdirs + modfiles + other)

-        # Construct the overview.
-        tar_members = '\n'.join(members_list)
-        overview = f"Total number of items in the tarball: {tar_num_members}"
-        bucket_url = self.payload_object.remote_client.get_bucket_url()
-        remote_file_path = self.payload_object.remote_file_path
-        overview += f"\nURL to the tarball: {bucket_url}/{remote_file_path}"
-        overview += f"\n{tar_members_desc}\n\n"
-        overview += f"```\n{tar_members}\n```\n"
-
-        # Make sure that the overview does not exceed Github's maximum length (65536 characters).
+        # Construct the overview
+        overview = self.config['github']['task_summary_payload_overview_template'].format(
+            tar_num_members=tar_num_members,
+            bucket_url=self.payload_object.remote_client.get_bucket_url(),
+            remote_file_path=self.payload_object.remote_file_path,
+            tar_members_desc=tar_members_desc,
+            tar_members='\n'.join(members_list)
+        )
+
+        # Make sure that the overview does not exceed Github's maximum length (65536 characters)
         if len(overview) > 60000:
            overview = overview[:60000] + "\n\nWARNING: output exceeded the maximum length and was truncated!\n```"
         return overview
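Patch 217 (together with patch 218 below) moves the hard-coded HTML skeleton out of the code and into a config template, so the code only supplies values. A self-contained sketch of that `str.format`-based approach; the template string below is illustrative, not the exact `task_summary_payload_template` shipped in the config, and the values are made up:

```python
# Illustrative template; the real one lives in automated_ingestion.cfg.
template = (
    "<details>\n<summary>{payload_name}</summary>\n\n"
    "<details>\n<summary>Metadata</summary>\n\n{metadata_contents}\n</details>\n\n"
    "<details>\n<summary>Overview of payload contents</summary>\n\n"
    "{payload_overview}\n</details>\n</details>\n"
)

task_summary = template.format(
    payload_name="example-1234.tar.gz",
    metadata_contents='{"link2pr": {"repo": "EESSI/software-layer", "pr": 42}}',
    payload_overview="Total number of items in the tarball: 123",
)
print(task_summary)
```

Keeping the markup in the config means the PR body layout can be changed without touching the ingestion code, which is the design choice these two patches implement.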
      ", ) pull_request.edit(body=pr_body) diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index c8f82df2..39ab3724 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -85,16 +85,16 @@ def analyse_contents(self) -> str: ] members_list = sorted(swdirs + modfiles + other) - # Construct the overview. - tar_members = '\n'.join(members_list) - overview = f"Total number of items in the tarball: {tar_num_members}" - bucket_url = self.payload_object.remote_client.get_bucket_url() - remote_file_path = self.payload_object.remote_file_path - overview += f"\nURL to the tarball: {bucket_url}/{remote_file_path}" - overview += f"\n{tar_members_desc}\n\n" - overview += f"```\n{tar_members}\n```\n" - - # Make sure that the overview does not exceed Github's maximum length (65536 characters). + # Construct the overview + overview = self.config['github']['task_summary_payload_overview_template'].format( + tar_num_members=tar_num_members, + bucket_url=self.payload_object.remote_client.get_bucket_url(), + remote_file_path=self.payload_object.remote_file_path, + tar_members_desc=tar_members_desc, + tar_members='\n'.join(members_list) + ) + + # Make sure that the overview does not exceed Github's maximum length (65536 characters) if len(overview) > 60000: overview = overview[:60000] + "\n\nWARNING: output exceeded the maximum length and was truncated!\n```" return overview From 3713639e48c339c548158513bc6a1e7ec2cf09da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 20:52:47 +0200 Subject: [PATCH 218/218] make config dictionary available for analyse_contents --- scripts/automated_ingestion/eessi_task.py | 2 +- scripts/automated_ingestion/eessi_task_payload.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5e54597e..bd863946 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1025,7 +1025,7 @@ def _create_task_summary(self) -> str: # create task summary payload_name = self.description.metadata['payload']['filename'] - payload_summary = self.payload.analyse_contents() + payload_summary = self.payload.analyse_contents(self.config) metadata_contents = self.description.get_contents() task_summary = self.config['github']['task_summary_payload_template'].format( diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 39ab3724..cb39cc81 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -2,6 +2,7 @@ import tarfile from pathlib import PurePosixPath import os +from typing import Dict from eessi_data_object import EESSIDataAndSignatureObject from utils import log_function_entry_exit @@ -39,7 +40,7 @@ def __init__(self, payload_object: EESSIDataAndSignatureObject): self.signature_verified = self.payload_object.verify_signature() @log_function_entry_exit() - def analyse_contents(self) -> str: + def analyse_contents(self, config: Dict) -> str: """Analyse the contents of the payload and return a summary in a ready-to-use HTML format.""" tar = tarfile.open(self.payload_object.local_file_path, 'r') members = tar.getmembers() @@ -86,7 +87,7 @@ def analyse_contents(self) -> str: members_list = sorted(swdirs + modfiles + other) # Construct the overview - 
overview = self.config['github']['task_summary_payload_overview_template'].format( + overview = config['github']['task_summary_payload_overview_template'].format( tar_num_members=tar_num_members, bucket_url=self.payload_object.remote_client.get_bucket_url(), remote_file_path=self.payload_object.remote_file_path,