From c6fb70789da46ba78e1863707a98d9c7ee9a5773 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 09:56:22 +0200 Subject: [PATCH 001/218] initial version of grouping tarballs in a single staging PR --- .../automated_ingestion.cfg.example | 43 +++++++++++++++ .../automated_ingestion.py | 55 ++++++++++++++++--- scripts/automated_ingestion/eessitarball.py | 39 +++++++++++++ 3 files changed, 130 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.cfg.example b/scripts/automated_ingestion/automated_ingestion.cfg.example index 68df3e4e..bdf40fa3 100644 --- a/scripts/automated_ingestion/automated_ingestion.cfg.example +++ b/scripts/automated_ingestion/automated_ingestion.cfg.example @@ -63,7 +63,33 @@ pr_body = A new tarball has been staged for {pr_url}. ``` + +
+    <details>
+    <summary>Overview of tarball contents</summary>
+
+    {tar_overview}
+
+    </details>
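For context on the grouped method configured below: tarballs are grouped by the `link2pr` section of their metadata files. A minimal sketch of how the grouping key is derived, with hypothetical field values:

```python
import json

# Hypothetical contents of a tarball's .meta.txt file; only the link2pr
# section is relevant for grouping (repo/pr values are illustrative).
metadata = json.loads('{"link2pr": {"repo": "EESSI/software-layer", "pr": "512"}}')

# find_tarball_groups() keys each group on the (repo, pr) tuple.
group_key = (metadata['link2pr']['repo'], metadata['link2pr']['pr'])
print(group_key)  # ('EESSI/software-layer', '512')
```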
+ +# Method for creating staging PRs: +# - 'individual': create one PR per tarball (old method) +# - 'grouped': group tarballs by link2pr and create one PR per group (new method) +staging_pr_method = individual +# Template for individual tarball PRs +individual_pr_body = A new tarball has been staged for {pr_url}. + Please review the contents of this tarball carefully. + Merging this PR will lead to automatic ingestion of the tarball to the repository {cvmfs_repo}. + +
+    <details>
+    <summary>Metadata of tarball</summary>
+
+    ```
+    {metadata}
+    ```
+
+    </details>
+
Overview of tarball contents @@ -71,6 +97,23 @@ pr_body = A new tarball has been staged for {pr_url}.
+# Template for grouped tarball PRs +grouped_pr_body = A group of tarballs has been staged for {pr_url}. + Please review the contents of these tarballs carefully. + Merging this PR will lead to automatic ingestion of the approved tarballs to the repository {cvmfs_repo}. + Unchecked tarballs will be marked as rejected. + + {tarballs} + +
+    <details>
+    <summary>Overview of tarball contents</summary>
+
+    {tar_overview}
+
+    </details>
+ + {metadata} + [slack] ingestion_notification = yes ingestion_message = Tarball `{tarball}` has been ingested into the CVMFS repository `{cvmfs_repo}`. diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 92dac552..41d928c7 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -51,6 +51,37 @@ def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt ] return tarballs +def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): + """Return a dictionary of tarball groups, keyed by (repo, pr_number).""" + tarballs = find_tarballs(s3, bucket, extension, metadata_extension) + groups = {} + + for tarball in tarballs: + # Download metadata to get link2pr info + metadata_file = tarball + metadata_extension + local_metadata = os.path.join(config['paths']['download_dir'], os.path.basename(metadata_file)) + + try: + s3.download_file(bucket, metadata_file, local_metadata) + with open(local_metadata, 'r') as meta: + metadata = json.load(meta) + repo = metadata['link2pr']['repo'] + pr = metadata['link2pr']['pr'] + group_key = (repo, pr) + + if group_key not in groups: + groups[group_key] = [] + groups[group_key].append(tarball) + except Exception as err: + logging.error(f"Failed to process metadata for {tarball}: {err}") + continue + finally: + # Clean up downloaded metadata file + if os.path.exists(local_metadata): + os.remove(local_metadata) + + return groups + def parse_config(path): """Parse the configuration file.""" @@ -102,14 +133,24 @@ def main(): buckets = json.loads(config['aws']['staging_buckets']) for bucket, cvmfs_repo in buckets.items(): - tarballs = find_tarballs(s3, bucket) - if args.list_only: - for num, tarball in enumerate(tarballs): - print(f'[{bucket}] {num}: {tarball}') + if config['github'].get('staging_pr_method', 'individual') == 'grouped': + # use new grouped PR method + tarball_groups = find_tarball_groups(s3, bucket, config) + for (repo, pr_id), tarballs in tarball_groups.items(): + if tarballs: + # Create a group handler for these tarballs + group_handler = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) + group_handler.process_group(tarballs) else: - for tarball in tarballs: - tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) - tar.run_handler() + # use old individual PR method + tarballs = find_tarballs(s3, bucket) + if args.list_only: + for num, tarball in enumerate(tarballs): + print(f'[{bucket}] {num}: {tarball}') + else: + for tarball in tarballs: + tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) + tar.run_handler() if __name__ == '__main__': diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 40ac6fa1..127ea2db 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -434,3 +434,42 @@ def issue_exists(self, title, state='open'): return True else: return False + + def get_link2pr_info(self): + """Get the link2pr information from the metadata file.""" + with open(self.local_metadata_path, 'r') as meta: + metadata = json.load(meta) + return metadata['link2pr']['repo'], metadata['link2pr']['pr'] + +class EessiTarballGroup: + """Class to handle a group of tarballs that share the same link2pr information.""" + + def __init__(self, first_tarball, config, git_staging_repo, s3, 
bucket, cvmfs_repo): + """Initialize with the first tarball in the group.""" + self.first_tar = EessiTarball(first_tarball, config, git_staging_repo, s3, bucket, cvmfs_repo) + self.config = config + self.git_repo = git_staging_repo + self.s3 = s3 + self.bucket = bucket + self.cvmfs_repo = cvmfs_repo + + def process_group(self, tarballs): + """Process a group of tarballs together.""" + # Verify all tarballs have the same link2pr info + if not self.verify_group_consistency(tarballs): + logging.error("Tarballs in group have inconsistent link2pr information") + return + + # Process the group + self.first_tar.make_approval_request(tarballs) + + def verify_group_consistency(self, tarballs): + """Verify all tarballs in the group have the same link2pr information.""" + first_repo, first_pr = self.first_tar.get_link2pr_info() + + for tarball in tarballs[1:]: # Skip first tarball as we already have its info + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + repo, pr = temp_tar.get_link2pr_info() + if repo != first_repo or pr != first_pr: + return False + return True From aa32e71c7b0eaa6dac089846ac3b034309e5ff9b Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 10:05:39 +0200 Subject: [PATCH 002/218] add code linting check --- .flake8 | 14 +++++++++++ .github/workflows/check-flake8.yml | 37 ++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 .flake8 create mode 100644 .github/workflows/check-flake8.yml diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..b6b309e3 --- /dev/null +++ b/.flake8 @@ -0,0 +1,14 @@ +# This file is part of the EESSI filesystem layer, +# see https://github.com/EESSI/filesystem-layer +# +# author: Thomas Roeblitz (@trz42) +# +# license: GPLv2 +# + +[flake8] +max-line-length = 120 + +# ignore "Black would make changes" produced by flake8-black +# see also https://github.com/houndci/hound/issues/1769 +extend-ignore = BLK100 diff --git a/.github/workflows/check-flake8.yml b/.github/workflows/check-flake8.yml new file mode 100644 index 00000000..2a3a425b --- /dev/null +++ b/.github/workflows/check-flake8.yml @@ -0,0 +1,37 @@ +# This file is part of the EESSI filesystem layer, +# see https://github.com/EESSI/filesystem-layer +# +# author: Thomas Roeblitz (@trz42) +# +# license: GPLv2 +# + +name: Run tests +on: [push, pull_request] +# Declare default permissions as read only. 
+permissions: read-all +jobs: + test: + runs-on: ubuntu-22.04 + strategy: + matrix: + python: [3.7, 3.8, 3.9, '3.10', '3.11', '3.12'] + fail-fast: false + steps: + - name: checkout + uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + + - name: set up Python + uses: actions/setup-python@13ae5bb136fac2878aff31522b9efb785519f984 # v4.3.0 + with: + python-version: ${{matrix.python}} + + - name: Install required Python packages + pytest + flake8 + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + python -m pip install --upgrade flake8 + + - name: Run flake8 to verify PEP8-compliance of Python code + run: | + flake8 From 88ba094206bd359b89f07f2e3221de2c1f1f09f1 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 10:39:53 +0200 Subject: [PATCH 003/218] revise make_approval_request and fix flake8 issues --- scripts/automated_ingestion/eessitarball.py | 179 ++++++++++++-------- 1 file changed, 110 insertions(+), 69 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 127ea2db..4bfccbf9 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -2,7 +2,6 @@ from pathlib import PurePosixPath -import boto3 import github import json import logging @@ -64,25 +63,23 @@ def download(self, force=False): # and may be optional or required. try: self.s3.download_file(self.bucket, sig_object, local_sig_file) - except: + except Exception as err: + log_msg = 'Failed to download signature file %s for %s from %s to %s.' if self.config['signatures'].getboolean('signatures_required', True): - logging.error( - f'Failed to download signature file {sig_object} for {object} from {self.bucket} to {local_sig_file}.' - ) + log_msg += '\nException: %s' + logging.error(log_msg, sig_object, object, self.bucket, local_sig_file, err) skip = True break else: - logging.warning( - f'Failed to download signature file {sig_object} for {object} from {self.bucket} to {local_sig_file}. ' + - 'Ignoring this, because signatures are not required with the current configuration.' - ) + log_msg += ' Ignoring this, because signatures are not required with the current configuration.' + log_msg += '\nException: %s' + logging.warning(log_msg, sig_object, object, self.bucket, local_sig_file, err) # Now we download the file itself. try: self.s3.download_file(self.bucket, object, local_file) - except: - logging.error( - f'Failed to download {object} from {self.bucket} to {local_file}.' - ) + except Exception as err: + log_msg = 'Failed to download %s from %s to %s.\nException: %s' + logging.error(log_msg, object, self.bucket, local_file, err) skip = True break # If any required download failed, make sure to skip this tarball completely. @@ -100,13 +97,14 @@ def find_state(self): except github.UnknownObjectException: # no metadata file found in this state's directory, so keep searching... continue - except github.GithubException as e: - if e.status == 404: + except github.GithubException as err: + if err.status == 404: # no metadata file found in this state's directory, so keep searching... continue else: # if there was some other (e.g. connection) issue, abort the search for this tarball - logging.warning(f'Unable to determine the state of {self.object}, the GitHub API returned status {e.status}!') + log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' 
+ logging.warning(log_msg, self.object, err.status) return "unknown" else: # if no state was found, we assume this is a new tarball that was ingested to the bucket @@ -128,7 +126,9 @@ def get_contents_overview(self): # determine prefix after filtering out '/init' subdirectory, # to get actual prefix for specific CPU target (like '2023.06/software/linux/aarch64/neoverse_v1') init_subdir = os.path.join('*', 'init') - non_init_paths = sorted([p for p in paths if not any(x.match(init_subdir) for x in PurePosixPath(p).parents)]) + non_init_paths = sorted( + [p for p in paths if not any(x.match(init_subdir) for x in PurePosixPath(p).parents)] + ) if non_init_paths: prefix = os.path.commonprefix(non_init_paths) else: @@ -148,8 +148,8 @@ def get_contents_overview(self): other = [ # anything that is not in /software nor /modules m.path for m in members - if not PurePosixPath(prefix).joinpath('software') in PurePosixPath(m.path).parents - and not PurePosixPath(prefix).joinpath('modules') in PurePosixPath(m.path).parents + if (not PurePosixPath(prefix).joinpath('software') in PurePosixPath(m.path).parents + and not PurePosixPath(prefix).joinpath('modules') in PurePosixPath(m.path).parents) # if not fnmatch.fnmatch(m.path, os.path.join(prefix, 'software', '*')) # and not fnmatch.fnmatch(m.path, os.path.join(prefix, 'modules', '*')) ] @@ -204,16 +204,20 @@ def verify_signatures(self): verify_script = self.config['signatures']['signature_verification_script'] allowed_signers_file = self.config['signatures']['allowed_signers_file'] if not os.path.exists(verify_script): - logging.error(f'Unable to verify signatures, the specified signature verification script does not exist!') + logging.error('Unable to verify signatures, the specified signature verification script does not exist!') return False if not os.path.exists(allowed_signers_file): - logging.error(f'Unable to verify signatures, the specified allowed signers file does not exist!') + logging.error('Unable to verify signatures, the specified allowed signers file does not exist!') return False - for (file, sig_file) in [(self.local_path, self.local_sig_path), (self.local_metadata_path, self.local_metadata_sig_path)]: + for (file, sig_file) in [ + (self.local_path, self.local_sig_path), + (self.local_metadata_path, self.local_metadata_sig_path) + ]: verify_cmd = subprocess.run( - [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, '--file', file, '--signature-file', sig_file], + [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, + '--file', file, '--signature-file', sig_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) if verify_cmd.returncode == 0: @@ -237,7 +241,7 @@ def verify_checksum(self): def ingest(self): """Process a tarball that is ready to be ingested by running the ingestion script.""" - #TODO: check if there is an open issue for this tarball, and if there is, skip it. + # TODO: check if there is an open issue for this tarball, and if there is, skip it. 
logging.info(f'Tarball {self.object} is ready to be ingested.') self.download() logging.info('Verifying its signature...') @@ -273,7 +277,9 @@ def ingest(self): if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): send_slack_message( self.config['secrets']['slack_webhook'], - self.config['slack']['ingestion_message'].format(tarball=os.path.basename(self.object), cvmfs_repo=self.cvmfs_repo) + self.config['slack']['ingestion_message'].format( + tarball=os.path.basename(self.object), + cvmfs_repo=self.cvmfs_repo) ) else: issue_title = f'Failed to ingest {self.object}' @@ -314,7 +320,7 @@ def mark_new_tarball_as_staged(self): logging.info(f'Adding tarball\'s metadata to the "{next_state}" folder of the git repository.') file_path_staged = next_state + '/' + self.metadata_file - new_file = self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch='main') + self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch='main') self.state = next_state self.run_handler() @@ -328,35 +334,39 @@ def print_unknown(self): """Process a tarball which has an unknown state.""" logging.info("The state of this tarball could not be determined, so we're skipping it.") - def make_approval_request(self): + def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - file_path_staged = self.state + '/' + self.metadata_file - file_path_to_ingest = next_state + '/' + self.metadata_file - + # file_path_staged = self.state + '/' + self.metadata_file filename = os.path.basename(self.object) - tarball_metadata = self.git_repo.get_contents(file_path_staged) - git_branch = filename + '_' + next_state - self.download() + # Get link2pr info from metadata + with open(self.local_metadata_path, 'r') as meta: + metadata = meta.read() + meta_dict = json.loads(metadata) + repo, pr_id = meta_dict['link2pr']['repo'], meta_dict['link2pr']['pr'] + pr_url = f"https://github.com/{repo}/pull/{pr_id}" + + # Create branch name based on whether we're handling a group + if tarballs_in_group is None: + # Individual tarball + git_branch = filename + '_' + next_state + else: + # Group of tarballs + sequence = self.find_next_sequence_number(repo, pr_id) + git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + + # Check for existing branch and PR main_branch = self.git_repo.get_branch('main') if git_branch in [branch.name for branch in self.git_repo.get_branches()]: - # Existing branch found for this tarball, so we've run this step before. - # Try to find out if there's already a PR as well... - logging.info("Branch already exists for " + self.object) - # Filtering with only head= returns all prs if there's no match, so double-check - find_pr = [pr for pr in self.git_repo.get_pulls(head=git_branch, state='all') if pr.head.ref == git_branch] - logging.debug('Found PRs: ' + str(find_pr)) + find_pr = [pr for pr in self.git_repo.get_pulls(head=git_branch, state='all') + if pr.head.ref == git_branch] if find_pr: - # So, we have a branch and a PR for this tarball (if there are more, pick the first one)... pr = find_pr.pop(0) - logging.info(f'PR {pr.number} found for {self.object}') if pr.state == 'open': - # The PR is still open, so it hasn't been reviewed yet: ignore this tarball. logging.info('PR is still open, skipping this tarball...') return elif pr.state == 'closed' and not pr.merged: - # The PR was closed but not merged, i.e. 
it was rejected for ingestion. logging.info('PR was rejected') self.reject() return @@ -364,48 +374,78 @@ def make_approval_request(self): logging.warn(f'Warning, tarball {self.object} is in a weird state:') logging.warn(f'Branch: {git_branch}\nPR: {pr}\nPR state: {pr.state}\nPR merged: {pr.merged}') else: - # There is a branch, but no PR for this tarball. - # This is weird, so let's remove the branch and reprocess the tarball. logging.info(f'Tarball {self.object} has a branch, but no PR.') - logging.info(f'Removing existing branch...') + logging.info('Removing existing branch...') ref = self.git_repo.get_git_ref(f'heads/{git_branch}') ref.delete() - logging.info(f'Making pull request to get ingestion approval for {self.object}.') - # Create a new branch + + # Create new branch self.git_repo.create_git_ref(ref='refs/heads/' + git_branch, sha=main_branch.commit.sha) - # Move the file to the directory of the next stage in this branch - self.move_metadata_file(self.state, next_state, branch=git_branch) - # Get metadata file contents - metadata = '' - with open(self.local_metadata_path, 'r') as meta: - metadata = meta.read() - meta_dict = json.loads(metadata) - repo, pr_id = meta_dict['link2pr']['repo'], meta_dict['link2pr']['pr'] - pr_url = f"https://github.com/{repo}/pull/{pr_id}" - # Try to get the tarball contents and open a PR to get approval for the ingestion + + # Move metadata file(s) to staged directory + if tarballs_in_group is None: + self.move_metadata_file(self.state, next_state, branch=git_branch) + else: + for tarball in tarballs_in_group: + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar.move_metadata_file('new', 'staged', branch=git_branch) + + # Create PR with appropriate template try: - tarball_contents = self.get_contents_overview() - pr_body = self.config['github']['pr_body'].format( - cvmfs_repo=self.cvmfs_repo, - pr_url=pr_url, - tar_overview=self.get_contents_overview(), - metadata=metadata, - ) - pr_title = '[%s] Ingest %s' % (self.cvmfs_repo, filename) + if tarballs_in_group is None: + # Individual tarball + tarball_contents = self.get_contents_overview() + pr_body = self.config['github']['individual_pr_body'].format( + cvmfs_repo=self.cvmfs_repo, + pr_url=pr_url, + tar_overview=tarball_contents, + metadata=metadata, + ) + pr_title = f'[{self.cvmfs_repo}] Ingest {filename}' + else: + # Group of tarballs + tar_overviews = [] + for tarball in tarballs_in_group: + try: + temp_tar = EessiTarball( + tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar.download() + overview = temp_tar.get_contents_overview() + tar_details_tpl = "
<details>\n<summary>Contents of %s</summary>\n\n%s\n</details>
\n" + tar_overviews.append(tar_details_tpl % (tarball, overview)) + except Exception as err: + logging.error(f"Failed to get contents overview for {tarball}: {err}") + tar_details_tpl = "
<details>\n<summary>Contents of %s</summary>\n\n"
                        tar_details_tpl += "Failed to get contents overview: %s\n</details>
\n" + tar_overviews.append(tar_details_tpl % (tarball, err)) + + pr_body = self.config['github']['grouped_pr_body'].format( + cvmfs_repo=self.cvmfs_repo, + pr_url=pr_url, + tarballs=self.format_tarball_list(tarballs_in_group), + metadata=self.format_metadata_list(tarballs_in_group), + tar_overview="\n".join(tar_overviews) + ) + pr_title = f'[{self.cvmfs_repo}] Staging PR #{sequence} for {repo}#{pr_id}' + + # Add signature verification status if applicable if self.sig_verified: - pr_body += "\n\n:heavy_check_mark: :closed_lock_with_key: The signature of this tarball has been successfully verified." + pr_body += "\n\n:heavy_check_mark: :closed_lock_with_key: " + pr_body += "The signature of this tarball has been successfully verified." pr_title += ' :closed_lock_with_key:' + self.git_repo.create_pull(title=pr_title, body=pr_body, head=git_branch, base='main') + except Exception as err: issue_title = f'Failed to get contents of {self.object}' issue_body = self.config['github']['failed_tarball_overview_issue_body'].format( tarball=self.object, error=err ) - if len([i for i in self.git_repo.get_issues(state='open') if i.title == issue_title]) == 0: + if not self.issue_exists(issue_title, state='open'): self.git_repo.create_issue(title=issue_title, body=issue_body) else: - logging.info(f'Failed to create tarball overview, but an issue already exists.') + logging.info('Failed to create tarball overview, but an issue already exists.') def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" @@ -441,6 +481,7 @@ def get_link2pr_info(self): metadata = json.load(meta) return metadata['link2pr']['repo'], metadata['link2pr']['pr'] + class EessiTarballGroup: """Class to handle a group of tarballs that share the same link2pr information.""" From dd9b71a05e52276d0f54dd780f0ebf283c60dd6b Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 10:45:32 +0200 Subject: [PATCH 004/218] requirements.txt is not needed --- .github/workflows/check-flake8.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/check-flake8.yml b/.github/workflows/check-flake8.yml index 2a3a425b..f0ebe250 100644 --- a/.github/workflows/check-flake8.yml +++ b/.github/workflows/check-flake8.yml @@ -29,7 +29,6 @@ jobs: - name: Install required Python packages + pytest + flake8 run: | python -m pip install --upgrade pip - python -m pip install -r requirements.txt python -m pip install --upgrade flake8 - name: Run flake8 to verify PEP8-compliance of Python code From 2cfde96141e919b24770702b765079e92c894e71 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 10:53:23 +0200 Subject: [PATCH 005/218] fix flake8 issues in automated_ingestion.py --- .../automated_ingestion.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 41d928c7..5983abe3 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -1,12 +1,11 @@ #!/usr/bin/env python3 -from eessitarball import EessiTarball -from pid.decorator import pidfile +from eessitarball import EessiTarball, EessiTarballGroup +from pid.decorator import pidfile # noqa: F401 from pid import PidFileError import argparse import boto3 -import botocore import configparser import github import json @@ -38,7 +37,10 @@ def error(msg, code=1): def 
find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt'): - """Return a list of all tarballs in an S3 bucket that have a metadata file with the given extension (and same filename).""" + """ + Return a list of all tarballs in an S3 bucket that have a metadata file with + the given extension (and same filename). + """ # TODO: list_objects_v2 only returns up to 1000 objects s3_objects = s3.list_objects_v2(Bucket=bucket).get('Contents', []) files = [obj['Key'] for obj in s3_objects] @@ -46,11 +48,11 @@ def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt tarballs = [ file for file in files - if file.endswith(extension) - and file + metadata_extension in files + if file.endswith(extension) and file + metadata_extension in files ] return tarballs + def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): """Return a dictionary of tarball groups, keyed by (repo, pr_number).""" tarballs = find_tarballs(s3, bucket, extension, metadata_extension) @@ -88,15 +90,15 @@ def parse_config(path): config = configparser.ConfigParser() try: config.read(path) - except: - error(f'Unable to read configuration file {path}!') + except Exception as err: + error(f'Unable to read configuration file {path}!\nException: {err}') # Check if all required configuration parameters/sections can be found. for section in REQUIRED_CONFIG.keys(): - if not section in config: + if section not in config: error(f'Missing section "{section}" in configuration file {path}.') for item in REQUIRED_CONFIG[section]: - if not item in config[section]: + if item not in config[section]: error(f'Missing configuration item "{item}" in section "{section}" of configuration file {path}.') return config From 0cb2622b8adac2283258181c016e6ead10098ef6 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Tue, 22 Apr 2025 11:02:35 +0200 Subject: [PATCH 006/218] fix flake8 issues in check-stratum-servers.py --- scripts/check-stratum-servers.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/scripts/check-stratum-servers.py b/scripts/check-stratum-servers.py index de4270d9..4e35b09e 100755 --- a/scripts/check-stratum-servers.py +++ b/scripts/check-stratum-servers.py @@ -9,7 +9,8 @@ import yaml # Default location for EESSI's Ansible group vars file containing the CVMFS settings. -DEFAULT_ANSIBLE_GROUP_VARS_LOCATION = 'https://raw.githubusercontent.com/EESSI/filesystem-layer/main/inventory/group_vars/all.yml' +DEFAULT_ANSIBLE_GROUP_VARS_LOCATION = \ + 'https://raw.githubusercontent.com/EESSI/filesystem-layer/main/inventory/group_vars/all.yml' # Default fully qualified CVMFS repository name DEFAULT_CVMFS_FQRN = 'software.eessi.io' # Maximum amount of time (in minutes) that a Stratum 1 is allowed to not having performed a snapshot. 
@@ -32,8 +33,8 @@ def find_stratum_urls(vars_file, fqrn): """Find all Stratum 0/1 URLs in a given Ansible YAML vars file that contains the EESSI CVMFS configuration.""" try: group_vars = urllib.request.urlopen(vars_file) - except: - error(f'Cannot read the file that contains the Stratum 1 URLs from {vars_file}!') + except Exception as err: + error(f'Cannot read the file that contains the Stratum 1 URLs from {vars_file}!\nException: {err}') try: group_vars_yaml = yaml.safe_load(group_vars) s1_urls = group_vars_yaml['eessi_cvmfs_server_urls'][0]['urls'] @@ -44,8 +45,8 @@ def find_stratum_urls(vars_file, fqrn): break else: error(f'Could not find Stratum 0 URL in {vars_file}!') - except: - error(f'Cannot parse the yaml file from {vars_file}!') + except Exception as err: + error(f'Cannot parse the yaml file from {vars_file}!\nException: {err}') return s0_url, s1_urls @@ -64,7 +65,7 @@ def check_revisions(stratum_urls, fqrn): revisions[stratum] = int(rev_matches[0]) else: errors.append(f'Could not find revision number for stratum {stratum}!') - except urllib.error.HTTPError as e: + except urllib.error.HTTPError: errors.append(f'Could not connect to {stratum}!') # Check if all revisions are the same. @@ -95,10 +96,11 @@ def check_snapshots(s1_urls, fqrn, max_snapshot_delay=DEFAULT_MAX_SNAPSHOT_DELAY # Stratum 1 servers are supposed to make a snapshot every few minutes, # so let's check if it is not too far behind. if now - last_snapshot_time > datetime.timedelta(minutes=max_snapshot_delay): + time_diff = (now - last_snapshot_time).seconds / 60 errors.append( - f'Stratum 1 {s1} has made its last snapshot {(now - last_snapshot_time).seconds / 60:.0f} minutes ago!') - except urllib.error.HTTPError as e: - errors.append(f'Could not connect to {s1_json}!') + f'Stratum 1 {s1} has made its last snapshot {time_diff:.0f} minutes ago!') + except urllib.error.HTTPError: + errors.append(f'Could not connect to {s1_snapshot_file}!') if last_snapshots: # Get the Stratum 1 with the most recent snapshot... 
From 7214d27526ae7a6d21ea7af27c2a26680426d284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 25 Apr 2025 21:37:39 +0200 Subject: [PATCH 007/218] incremental updates through testing --- .../automated_ingestion.py | 18 +++-- scripts/automated_ingestion/eessitarball.py | 67 +++++++++++++++++++ 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 5983abe3..ca059431 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -131,6 +131,8 @@ def main(): 's3', aws_access_key_id=config['secrets']['aws_access_key_id'], aws_secret_access_key=config['secrets']['aws_secret_access_key'], + endpoint_url=config['aws']['endpoint_url'], + verify=config['aws']['verify_cert_path'], ) buckets = json.loads(config['aws']['staging_buckets']) @@ -138,11 +140,17 @@ def main(): if config['github'].get('staging_pr_method', 'individual') == 'grouped': # use new grouped PR method tarball_groups = find_tarball_groups(s3, bucket, config) - for (repo, pr_id), tarballs in tarball_groups.items(): - if tarballs: - # Create a group handler for these tarballs - group_handler = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - group_handler.process_group(tarballs) + if args.list_only: + print(f"#tarball_groups: {len(tarball_groups)}") + for (repo, pr_id), tarballs in tarball_groups.items(): + print(f" {repo}#{pr_id}: #tarballs {len(tarballs)}") + else: + for (repo, pr_id), tarballs in tarball_groups.items(): + if tarballs: + # Create a group handler for these tarballs + group_handler = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) + print(f"group_handler created\n{group_handler.to_string()}") + group_handler.process_group(tarballs) else: # use old individual PR method tarballs = find_tarballs(s3, bucket) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 4bfccbf9..88bfc1df 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -181,6 +181,17 @@ def run_handler(self): handler = self.states[self.state]['handler'] handler() + def to_string(self): + """Serialize tarball info so it can be printed.""" + str = f"tarball: {self.object}" + str += f"\n metadt: {self.metadata_file}" + str += f"\n config: {self.config}" + str += f"\n s3....: {self.s3}" + str += f"\n bucket: {self.bucket}" + str += f"\n cvmfs.: {self.cvmfs_repo}" + str += f"\n GHrepo: {self.git_repo}" + return str + def verify_signatures(self): """Verify the signatures of the downloaded tarball and metadata file using the corresponding signature files.""" @@ -334,6 +345,35 @@ def print_unknown(self): """Process a tarball which has an unknown state.""" logging.info("The state of this tarball could not be determined, so we're skipping it.") + def find_next_sequence_number(self, repo, pr_id): + """Find the next available sequence number for staging PRs of a source PR.""" + # Search for existing branches for this source PR + base_branch = f'staging-{repo.replace("/", "-")}-{pr_id}' + existing_branches = [ + ref.ref for ref in self.git_repo.get_git_refs() + if ref.ref.startswith(f'refs/heads/{base_branch}') + ] + + if not existing_branches: + return 1 + + # Extract sequence numbers from existing branches + sequence_numbers = [] + for branch in existing_branches: + try: + # Extract the 
sequence number from branch name + # Format: staging-repo-pr_id-sequence + sequence = int(branch.split('-')[-1]) + sequence_numbers.append(sequence) + except (ValueError, IndexError): + continue + + if not sequence_numbers: + return 1 + + # Return next available sequence number + return max(sequence_numbers) + 1 + def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) @@ -494,8 +534,24 @@ def __init__(self, first_tarball, config, git_staging_repo, s3, bucket, cvmfs_re self.bucket = bucket self.cvmfs_repo = cvmfs_repo + def download_tarballs_and_more(self, tarballs): + """Download all files associated with this group of tarballs.""" + for tarball in tarballs: + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + print(f"downloading files for '{temp_tar.object}'") + temp_tar.download(force=True) + if not temp_tar.local_path or not temp_tar.local_metadata_path: + logging.warn(f"Skipping this tarball: {temp_tar.object}") + return False + return True + def process_group(self, tarballs): """Process a group of tarballs together.""" + # download tarballs, metadata files and their signatures + if not self.download_tarballs_and_more(tarballs): + logging.error("Downloading tarballs, metadata files and/or their signatures failed") + return + # Verify all tarballs have the same link2pr info if not self.verify_group_consistency(tarballs): logging.error("Tarballs in group have inconsistent link2pr information") @@ -504,12 +560,23 @@ def process_group(self, tarballs): # Process the group self.first_tar.make_approval_request(tarballs) + def to_string(self): + """Serialize tarball group info so it can be printed.""" + str = f"first tarball: {self.first_tar.to_string()}" + str += f"\n config: {self.config}" + str += f"\n GHrepo: {self.git_repo}" + str += f"\n s3....: {self.s3}" + str += f"\n bucket: {self.bucket}" + str += f"\n cvmfs.: {self.cvmfs_repo}" + return str + def verify_group_consistency(self, tarballs): """Verify all tarballs in the group have the same link2pr information.""" first_repo, first_pr = self.first_tar.get_link2pr_info() for tarball in tarballs[1:]: # Skip first tarball as we already have its info temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + print(f"temp tar: {temp_tar.to_string()}") repo, pr = temp_tar.get_link2pr_info() if repo != first_repo or pr != first_pr: return False From 946dd65e3222b2164f991cf108836c4305ac7874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 25 Apr 2025 23:52:44 +0200 Subject: [PATCH 008/218] add missing functions --- .../automated_ingestion.py | 12 ++++ scripts/automated_ingestion/eessitarball.py | 62 +++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index ca059431..285f2c0c 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -100,6 +100,18 @@ def parse_config(path): for item in REQUIRED_CONFIG[section]: if item not in config[section]: error(f'Missing configuration item "{item}" in section "{section}" of configuration file {path}.') + + # Validate staging_pr_method + staging_method = config['github'].get('staging_pr_method', 'individual') + if staging_method not in ['individual', 'grouped']: + error(f'Invalid 
staging_pr_method: "{staging_method}" in configuration file {path}. Must be either "individual" or "grouped".') + + # Validate PR body templates + if staging_method == 'individual' and 'individual_pr_body' not in config['github']: + error(f'Missing "individual_pr_body" in configuration file {path}.') + if staging_method == 'grouped' and 'grouped_pr_body' not in config['github']: + error(f'Missing "grouped_pr_body" in configuration file {path}.') + return config diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 88bfc1df..50f6c16f 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -487,6 +487,22 @@ def make_approval_request(self, tarballs_in_group=None): else: logging.info('Failed to create tarball overview, but an issue already exists.') + def format_tarball_list(self, tarballs): + """Format a list of tarballs with checkboxes for approval.""" + formatted = "### Tarballs to be ingested\n\n" + for tarball in tarballs: + formatted += f"- [ ] {tarball}\n" + return formatted + + def format_metadata_list(self, tarballs): + """Format metadata for all tarballs in collapsible sections.""" + formatted = "### Metadata\n\n" + for tarball in tarballs: + with open(self.get_metadata_path(tarball), 'r') as meta: + metadata = meta.read() + formatted += f"
<details>\n<summary>Metadata for {tarball}</summary>\n\n```\n{metadata}\n```\n</details>
\n\n" + return formatted + def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file @@ -499,6 +515,52 @@ def move_metadata_file(self, old_state, new_state, branch='main'): self.git_repo.create_file(file_path_new, 'move to ' + new_state, tarball_metadata.decoded_content, branch=branch) + def process_pr_merge(self, pr_number): + """Process a merged PR by handling the checkboxes and moving tarballs to appropriate states.""" + pr = self.git_repo.get_pull(pr_number) + + # Get the branch name + branch_name = pr.head.ref + + # Get the list of tarballs from the PR body + tarballs = self.extract_tarballs_from_pr_body(pr.body) + + # Get the checked status for each tarball + checked_tarballs = self.extract_checked_tarballs(pr.body) + + # Process each tarball + for tarball in tarballs: + if tarball in checked_tarballs: + # Move to approved state + self.move_metadata_file('staged', 'approved', branch=branch_name) + else: + # Move to rejected state + self.move_metadata_file('staged', 'rejected', branch=branch_name) + + # Delete the branch after processing + ref = self.git_repo.get_git_ref(f'heads/{branch_name}') + ref.delete() + + def extract_checked_tarballs(self, pr_body): + """Extract list of checked tarballs from PR body.""" + checked_tarballs = [] + for line in pr_body.split('\n'): + if line.strip().startswith('- [x] '): + tarball = line.strip()[6:] # Remove '- [x] ' prefix + checked_tarballs.append(tarball) + return checked_tarballs + + def extract_tarballs_from_pr_body(self, pr_body): + """Extract list of all tarballs from PR body.""" + tarballs = [] + for line in pr_body.split('\n'): + if line.strip().startswith('- ['): + tarball = line.strip()[6:] # Remove '- [ ] ' or '- [x] ' prefix + tarballs.append(tarball) + return tarballs + + def reject(self): + """Reject a tarball for ingestion.""" def reject(self): """Reject a tarball for ingestion.""" # Let's move the the tarball to the directory for rejected tarballs. 
From 51894ec71ec0dac855eedd14cb4deabe6b0a2aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 00:31:21 +0200 Subject: [PATCH 009/218] mark tarballs in group as new initially --- scripts/automated_ingestion/automated_ingestion.py | 8 ++++---- scripts/automated_ingestion/eessitarball.py | 9 ++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 285f2c0c..7668dfb0 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -159,10 +159,10 @@ def main(): else: for (repo, pr_id), tarballs in tarball_groups.items(): if tarballs: - # Create a group handler for these tarballs - group_handler = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - print(f"group_handler created\n{group_handler.to_string()}") - group_handler.process_group(tarballs) + # Create a group for these tarballs + group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) + print(f"group created\n{group.to_string()}") + group.process_group(tarballs) else: # use old individual PR method tarballs = find_tarballs(s3, bucket) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 50f6c16f..9e523376 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -559,8 +559,6 @@ def extract_tarballs_from_pr_body(self, pr_body): tarballs.append(tarball) return tarballs - def reject(self): - """Reject a tarball for ingestion.""" def reject(self): """Reject a tarball for ingestion.""" # Let's move the the tarball to the directory for rejected tarballs. 
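For reference while reading the following hunks: the series settles on group staging branches named `staging-<repo with '/' replaced by '-'>-<pr>-<sequence>`. A runnable sketch of the sequence-number selection implemented by `find_next_sequence_number` (branch names hypothetical):

```python
def next_sequence_number(existing_branches, repo, pr_id):
    """Return the next free sequence number for staging branches of one source PR."""
    base = f'staging-{repo.replace("/", "-")}-{pr_id}'
    numbers = []
    for branch in existing_branches:
        if branch.startswith(base):
            try:
                # branch format: staging-<repo>-<pr>-<sequence>
                numbers.append(int(branch.split('-')[-1]))
            except ValueError:
                continue
    return max(numbers) + 1 if numbers else 1

branches = ['staging-EESSI-software-layer-512-1', 'staging-EESSI-software-layer-512-2']
print(next_sequence_number(branches, 'EESSI/software-layer', '512'))  # 3
```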
@@ -619,7 +617,12 @@ def process_group(self, tarballs): logging.error("Tarballs in group have inconsistent link2pr information") return - # Process the group + # First mark all tarballs as staged by creating their metadata files in the GitHub repository + for tarball in tarballs: + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar.mark_new_tarball_as_staged() + + # Then process the group for approval self.first_tar.make_approval_request(tarballs) def to_string(self): From ea51a5e3eb3380fbc4672fc46a7769df01f97295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 00:48:00 +0200 Subject: [PATCH 010/218] always use group branch only --- scripts/automated_ingestion/eessitarball.py | 22 ++++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 9e523376..056666e9 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -310,7 +310,7 @@ def print_ingested(self): """Process a tarball that has already been ingested.""" logging.info(f'{self.object} has already been ingested, skipping...') - def mark_new_tarball_as_staged(self): + def mark_new_tarball_as_staged(self, branch=None): """Process a new tarball that was added to the staging bucket.""" next_state = self.next_state(self.state) logging.info(f'Found new tarball {self.object}, downloading it...') @@ -331,10 +331,14 @@ def mark_new_tarball_as_staged(self): logging.info(f'Adding tarball\'s metadata to the "{next_state}" folder of the git repository.') file_path_staged = next_state + '/' + self.metadata_file - self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch='main') + + # If no branch is provided, use the main branch + target_branch = branch if branch else 'main' + self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch=target_branch) self.state = next_state - self.run_handler() + if not branch: # Only run handler if we're not part of a group + self.run_handler() def print_rejected(self): """Process a (rejected) tarball for which the corresponding PR has been closed witout merging.""" @@ -377,7 +381,6 @@ def find_next_sequence_number(self, repo, pr_id): def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - # file_path_staged = self.state + '/' + self.metadata_file filename = os.path.basename(self.object) # Get link2pr info from metadata @@ -387,14 +390,9 @@ def make_approval_request(self, tarballs_in_group=None): repo, pr_id = meta_dict['link2pr']['repo'], meta_dict['link2pr']['pr'] pr_url = f"https://github.com/{repo}/pull/{pr_id}" - # Create branch name based on whether we're handling a group - if tarballs_in_group is None: - # Individual tarball - git_branch = filename + '_' + next_state - else: - # Group of tarballs - sequence = self.find_next_sequence_number(repo, pr_id) - git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + # Always use the consistent branch naming scheme + sequence = self.find_next_sequence_number(repo, pr_id) + git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' # Check for existing branch and PR main_branch = self.git_repo.get_branch('main') From 7db3dbee78a08c4539abd0255d55debaff56f574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= 
Date: Sat, 26 Apr 2025 01:09:52 +0200 Subject: [PATCH 011/218] add a bit debug output --- scripts/automated_ingestion/eessitarball.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 056666e9..833a66f7 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -92,6 +92,7 @@ def find_state(self): for state in list(self.states.keys()): # iterate through the state dirs and try to find the tarball's metadata file try: + print(f"Checking {state} for {self.metadata_file}") self.git_repo.get_contents(state + '/' + self.metadata_file) return state except github.UnknownObjectException: @@ -330,6 +331,7 @@ def mark_new_tarball_as_staged(self, branch=None): contents = meta.read() logging.info(f'Adding tarball\'s metadata to the "{next_state}" folder of the git repository.') + print(f'Adding tarball\'s metadata ({self.metadata_file}) to the "{next_state}" folder of the git repository.') file_path_staged = next_state + '/' + self.metadata_file # If no branch is provided, use the main branch From 9ef35355df9b18b9aecfde7a21c8c7bcb4833eb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 21:30:30 +0200 Subject: [PATCH 012/218] improve logging --- .../automated_ingestion.py | 83 ++++++++++++++++-- scripts/automated_ingestion/eessitarball.py | 84 +++++++++++-------- 2 files changed, 124 insertions(+), 43 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 7668dfb0..67d90a45 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -13,6 +13,7 @@ import os import pid import sys +from pathlib import Path REQUIRED_CONFIG = { 'secrets': ['aws_secret_access_key', 'aws_access_key_id', 'github_pat'], @@ -118,12 +119,81 @@ def parse_config(path): def parse_args(): """Parse the command-line arguments.""" parser = argparse.ArgumentParser() + + # Logging options + logging_group = parser.add_argument_group('Logging options') + logging_group.add_argument('--log-file', + help='Path to log file (overrides config file setting)') + logging_group.add_argument('--console-level', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logging level for console output (overrides config file setting)') + logging_group.add_argument('--file-level', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logging level for file output (overrides config file setting)') + logging_group.add_argument('--quiet', + action='store_true', + help='Suppress console output (overrides all other console settings)') + + # Existing arguments parser.add_argument('-c', '--config', type=str, help='path to configuration file', - default='automated_ingestion.cfg', dest='config') + default='automated_ingestion.cfg', dest='config') parser.add_argument('-d', '--debug', help='enable debug mode', action='store_true', dest='debug') parser.add_argument('-l', '--list', help='only list available tarballs', action='store_true', dest='list_only') - args = parser.parse_args() - return args + + return parser.parse_args() + + +def setup_logging(config, args): + """ + Configure logging based on configuration file and command line arguments. + Command line arguments take precedence over config file settings. 
+ + Args: + config: Configuration dictionary + args: Parsed command line arguments + """ + # Get settings from config file + log_file = config['logging'].get('filename') + log_format = config['logging'].get('format', '%(levelname)s:%(message)s') + config_console_level = LOG_LEVELS.get(config['logging'].get('level', 'INFO').upper(), logging.INFO) + config_file_level = LOG_LEVELS.get(config['logging'].get('file_level', 'DEBUG').upper(), logging.DEBUG) + + # Override with command line arguments if provided + log_file = args.log_file if args.log_file else log_file + console_level = getattr(logging, args.console_level) if args.console_level else config_console_level + file_level = getattr(logging, args.file_level) if args.file_level else config_file_level + + # Debug mode overrides console level + if args.debug: + console_level = logging.DEBUG + + # Create logger + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) # Set root logger to lowest level + + # Create formatters + console_formatter = logging.Formatter(log_format) + file_formatter = logging.Formatter('%(asctime)s - ' + log_format) + + # Console handler (only if not quiet) + if not args.quiet: + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(console_level) + console_handler.setFormatter(console_formatter) + logger.addHandler(console_handler) + + # File handler (if log file is specified) + if log_file: + # Ensure log directory exists + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(file_level) + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + + return logger @pid.decorator.pidfile('automated_ingestion.pid') @@ -131,11 +201,8 @@ def main(): """Main function.""" args = parse_args() config = parse_config(args.config) - log_file = config['logging'].get('filename', None) - log_format = config['logging'].get('format', '%(levelname)s:%(message)s') - log_level = LOG_LEVELS.get(config['logging'].get('level', 'INFO').upper(), logging.WARN) - log_level = logging.DEBUG if args.debug else log_level - logging.basicConfig(filename=log_file, format=log_format, level=log_level) + setup_logging(config, args) + # TODO: check configuration: secrets, paths, permissions on dirs, etc gh_pat = config['secrets']['github_pat'] gh_staging_repo = github.Github(gh_pat).get_repo(config['github']['staging_repo']) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 833a66f7..f7749a94 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -89,11 +89,11 @@ def download(self, force=False): def find_state(self): """Find the state of this tarball by searching through the state directories in the git repository.""" + logging.debug(f"Find state for {self.object}") for state in list(self.states.keys()): - # iterate through the state dirs and try to find the tarball's metadata file try: - print(f"Checking {state} for {self.metadata_file}") self.git_repo.get_contents(state + '/' + self.metadata_file) + logging.info(f"Found metadata file {self.metadata_file} in state: {state}") return state except github.UnknownObjectException: # no metadata file found in this state's directory, so keep searching... @@ -107,9 +107,8 @@ def find_state(self): log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' 
logging.warning(log_msg, self.object, err.status) return "unknown" - else: - # if no state was found, we assume this is a new tarball that was ingested to the bucket - return "new" + logging.info(f"Tarball {self.metadata_file} is new") + return "new" def get_contents_overview(self): """Return an overview of what is included in the tarball.""" @@ -319,23 +318,22 @@ def mark_new_tarball_as_staged(self, branch=None): # Use force as it may be a new attempt for an existing tarball that failed before. self.download(force=True) if not self.local_path or not self.local_metadata_path: - logging.warn('Skipping this tarball...') + logging.warning(f"Skipping tarball {self.object} - download failed") return # Verify the signatures of the tarball and metadata file. if not self.verify_signatures(): - logging.warn('Signature verification of the tarball or its metadata failed, skipping this tarball...') + logging.warning(f"Skipping tarball {self.object} - signature verification failed") + return + + # If no branch is provided, use the main branch + target_branch = branch if branch else 'main' + logging.info(f"Adding metadata to '{next_state}' folder in {target_branch} branch") + file_path_staged = next_state + '/' + self.metadata_file contents = '' with open(self.local_metadata_path, 'r') as meta: contents = meta.read() - - logging.info(f'Adding tarball\'s metadata to the "{next_state}" folder of the git repository.') - print(f'Adding tarball\'s metadata ({self.metadata_file}) to the "{next_state}" folder of the git repository.') - file_path_staged = next_state + '/' + self.metadata_file - - # If no branch is provided, use the main branch - target_branch = branch if branch else 'main' self.git_repo.create_file(file_path_staged, 'new tarball', contents, branch=target_branch) self.state = next_state @@ -383,22 +381,21 @@ def find_next_sequence_number(self, repo, pr_id): def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - filename = os.path.basename(self.object) - # Get link2pr info from metadata + # obtain link2pr information (repo and pr_id) from metadata file with open(self.local_metadata_path, 'r') as meta: metadata = meta.read() meta_dict = json.loads(metadata) repo, pr_id = meta_dict['link2pr']['repo'], meta_dict['link2pr']['pr'] - pr_url = f"https://github.com/{repo}/pull/{pr_id}" - # Always use the consistent branch naming scheme + # find next sequence number for staging PRs of this source PR sequence = self.find_next_sequence_number(repo, pr_id) git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' - # Check for existing branch and PR + # Check if git_branch exists and what the status of the corressponding PR is main_branch = self.git_repo.get_branch('main') if git_branch in [branch.name for branch in self.git_repo.get_branches()]: + logging.info(f"Branch {git_branch} already exists, checking the status of the corresponding PR...") find_pr = [pr for pr in self.git_repo.get_pulls(head=git_branch, state='all') if pr.head.ref == git_branch] if find_pr: @@ -413,6 +410,8 @@ def make_approval_request(self, tarballs_in_group=None): else: logging.warn(f'Warning, tarball {self.object} is in a weird state:') logging.warn(f'Branch: {git_branch}\nPR: {pr}\nPR state: {pr.state}\nPR merged: {pr.merged}') + # TODO: should we delete the branch or open an issue? 
+ return else: logging.info(f'Tarball {self.object} has a branch, but no PR.') logging.info('Removing existing branch...') @@ -424,16 +423,19 @@ def make_approval_request(self, tarballs_in_group=None): # Move metadata file(s) to staged directory if tarballs_in_group is None: + logging.info(f"Moving metadata for individual tarball to staged") self.move_metadata_file(self.state, next_state, branch=git_branch) else: + logging.info(f"Moving metadata for {len(tarballs_in_group)} tarballs to staged") for tarball in tarballs_in_group: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) temp_tar.move_metadata_file('new', 'staged', branch=git_branch) # Create PR with appropriate template try: + pr_url=f"https://github.com/{repo}/pull/{pr_id}", if tarballs_in_group is None: - # Individual tarball + logging.info(f"Creating PR for individual tarball: {self.object}") tarball_contents = self.get_contents_overview() pr_body = self.config['github']['individual_pr_body'].format( cvmfs_repo=self.cvmfs_repo, @@ -441,7 +443,7 @@ def make_approval_request(self, tarballs_in_group=None): tar_overview=tarball_contents, metadata=metadata, ) - pr_title = f'[{self.cvmfs_repo}] Ingest {filename}' + pr_title = f'[{self.cvmfs_repo}] Ingest {os.path.basename(self.object)}' else: # Group of tarballs tar_overviews = [] @@ -475,17 +477,18 @@ def make_approval_request(self, tarballs_in_group=None): pr_title += ' :closed_lock_with_key:' self.git_repo.create_pull(title=pr_title, body=pr_body, head=git_branch, base='main') + logging.info(f"Created PR: {pr_title}") except Exception as err: - issue_title = f'Failed to get contents of {self.object}' - issue_body = self.config['github']['failed_tarball_overview_issue_body'].format( - tarball=self.object, - error=err - ) - if not self.issue_exists(issue_title, state='open'): - self.git_repo.create_issue(title=issue_title, body=issue_body) - else: - logging.info('Failed to create tarball overview, but an issue already exists.') + logging.error(f"Failed to create PR: {err}") + if not self.issue_exists(f'Failed to get contents of {self.object}', state='open'): + self.git_repo.create_issue( + title=f'Failed to get contents of {self.object}', + body=self.config['github']['failed_tarball_overview_issue_body'].format( + tarball=self.object, + error=err + ) + ) def format_tarball_list(self, tarballs): """Format a list of tarballs with checkboxes for approval.""" @@ -607,22 +610,33 @@ def download_tarballs_and_more(self, tarballs): def process_group(self, tarballs): """Process a group of tarballs together.""" - # download tarballs, metadata files and their signatures + logging.info(f"Processing group of {len(tarballs)} tarballs") + if not self.download_tarballs_and_more(tarballs): logging.error("Downloading tarballs, metadata files and/or their signatures failed") return # Verify all tarballs have the same link2pr info if not self.verify_group_consistency(tarballs): - logging.error("Tarballs in group have inconsistent link2pr information") + logging.error("Tarballs have inconsistent link2pr information") return - # First mark all tarballs as staged by creating their metadata files in the GitHub repository + # Get branch name from first tarball + with open(self.first_tar.local_metadata_path, 'r') as meta: + metadata = json.load(meta) + repo, pr_id = metadata['link2pr']['repo'], metadata['link2pr']['pr'] + sequence = self.first_tar.find_next_sequence_number(repo, pr_id) + git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + + 
logging.info(f"Creating group branch: {git_branch}") + + # Mark all tarballs as staged in the group branch for tarball in tarballs: + logging.info(f"Processing tarball in group: {tarball}") temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - temp_tar.mark_new_tarball_as_staged() + temp_tar.mark_new_tarball_as_staged(branch=git_branch) - # Then process the group for approval + # Process the group for approval self.first_tar.make_approval_request(tarballs) def to_string(self): From d860d1e97de5a22aa9302c7b7a0716a32dbaebd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 22:04:12 +0200 Subject: [PATCH 013/218] more improvements to logging --- .../automated_ingestion.py | 8 ++--- scripts/automated_ingestion/eessitarball.py | 34 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 67d90a45..622f0386 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -220,22 +220,22 @@ def main(): # use new grouped PR method tarball_groups = find_tarball_groups(s3, bucket, config) if args.list_only: - print(f"#tarball_groups: {len(tarball_groups)}") + logging.info(f"#tarball_groups: {len(tarball_groups)}") for (repo, pr_id), tarballs in tarball_groups.items(): - print(f" {repo}#{pr_id}: #tarballs {len(tarballs)}") + logging.info(f" {repo}#{pr_id}: #tarballs {len(tarballs)}") else: for (repo, pr_id), tarballs in tarball_groups.items(): if tarballs: # Create a group for these tarballs group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - print(f"group created\n{group.to_string()}") + logging.info(f"group created\n{group.to_string()}") group.process_group(tarballs) else: # use old individual PR method tarballs = find_tarballs(s3, bucket) if args.list_only: for num, tarball in enumerate(tarballs): - print(f'[{bucket}] {num}: {tarball}') + logging.info(f'[{bucket}] {num}: {tarball}') else: for tarball in tarballs: tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index f7749a94..1284be5b 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -181,15 +181,16 @@ def run_handler(self): handler = self.states[self.state]['handler'] handler() - def to_string(self): + def to_string(self, oneline=False): """Serialize tarball info so it can be printed.""" str = f"tarball: {self.object}" - str += f"\n metadt: {self.metadata_file}" - str += f"\n config: {self.config}" - str += f"\n s3....: {self.s3}" - str += f"\n bucket: {self.bucket}" - str += f"\n cvmfs.: {self.cvmfs_repo}" - str += f"\n GHrepo: {self.git_repo}" + sep = "\n" if not oneline else "," + str += f"{sep} metadt: {self.metadata_file}" + str += f"{sep} config: {self.config}" + str += f"{sep} s3....: {self.s3}" + str += f"{sep} bucket: {self.bucket}" + str += f"{sep} cvmfs.: {self.cvmfs_repo}" + str += f"{sep} GHrepo: {self.git_repo}" return str def verify_signatures(self): @@ -601,7 +602,7 @@ def download_tarballs_and_more(self, tarballs): """Download all files associated with this group of tarballs.""" for tarball in tarballs: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - print(f"downloading 
files for '{temp_tar.object}'") + logging.info(f"downloading files for '{temp_tar.object}'") temp_tar.download(force=True) if not temp_tar.local_path or not temp_tar.local_metadata_path: logging.warn(f"Skipping this tarball: {temp_tar.object}") @@ -639,14 +640,15 @@ def process_group(self, tarballs): # Process the group for approval self.first_tar.make_approval_request(tarballs) - def to_string(self): + def to_string(self, oneline=False): """Serialize tarball group info so it can be printed.""" - str = f"first tarball: {self.first_tar.to_string()}" - str += f"\n config: {self.config}" - str += f"\n GHrepo: {self.git_repo}" - str += f"\n s3....: {self.s3}" - str += f"\n bucket: {self.bucket}" - str += f"\n cvmfs.: {self.cvmfs_repo}" + str = f"first tarball: {self.first_tar.to_string(oneline)}" + sep = "\n" if not oneline else "," + str += f"{sep} config: {self.config}" + str += f"{sep} GHrepo: {self.git_repo}" + str += f"{sep} s3....: {self.s3}" + str += f"{sep} bucket: {self.bucket}" + str += f"{sep} cvmfs.: {self.cvmfs_repo}" return str def verify_group_consistency(self, tarballs): @@ -655,7 +657,7 @@ def verify_group_consistency(self, tarballs): for tarball in tarballs[1:]: # Skip first tarball as we already have its info temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - print(f"temp tar: {temp_tar.to_string()}") + logging.debug(f"temp tar: {temp_tar.to_string()}") repo, pr = temp_tar.get_link2pr_info() if repo != first_repo or pr != first_pr: return False From 18cf44d064c65b1b7134e61a7ee45efb038a9ea1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 22:19:13 +0200 Subject: [PATCH 014/218] tweak logging --- scripts/automated_ingestion/automated_ingestion.py | 4 ++-- scripts/automated_ingestion/eessitarball.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 622f0386..9eb717cb 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -154,7 +154,7 @@ def setup_logging(config, args): """ # Get settings from config file log_file = config['logging'].get('filename') - log_format = config['logging'].get('format', '%(levelname)s:%(message)s') + log_format = config['logging'].get('format', '%(levelname)s: %(message)s') config_console_level = LOG_LEVELS.get(config['logging'].get('level', 'INFO').upper(), logging.INFO) config_file_level = LOG_LEVELS.get(config['logging'].get('file_level', 'DEBUG').upper(), logging.DEBUG) @@ -228,7 +228,7 @@ def main(): if tarballs: # Create a group for these tarballs group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - logging.info(f"group created\n{group.to_string()}") + logging.info(f"group created\n{group.to_string(oneline=True)}") group.process_group(tarballs) else: # use old individual PR method diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 1284be5b..b45a5e69 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -186,8 +186,6 @@ def to_string(self, oneline=False): str = f"tarball: {self.object}" sep = "\n" if not oneline else "," str += f"{sep} metadt: {self.metadata_file}" - str += f"{sep} config: {self.config}" - str += f"{sep} s3....: {self.s3}" str += f"{sep} bucket: {self.bucket}" str += f"{sep} cvmfs.: 
{self.cvmfs_repo}" str += f"{sep} GHrepo: {self.git_repo}" From 4a6fecc0caee0b59893bd8bc289e0301e30444c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 26 Apr 2025 22:44:37 +0200 Subject: [PATCH 015/218] more logging output for downloads and signature verification --- scripts/automated_ingestion/eessitarball.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index b45a5e69..af22f4fa 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -56,12 +56,14 @@ def download(self, force=False): (self.object, self.local_path, self.object_sig, self.local_sig_path), (self.metadata_file, self.local_metadata_path, self.metadata_sig_file, self.local_metadata_sig_path), ] + logging.info(f"Downloading {files}") skip = False for (object, local_file, sig_object, local_sig_file) in files: if force or not os.path.exists(local_file): # First we try to download signature file, which may or may not be available # and may be optional or required. try: + logging.info(f"Downloading signature file {sig_object} to {local_sig_file}") self.s3.download_file(self.bucket, sig_object, local_sig_file) except Exception as err: log_msg = 'Failed to download signature file %s for %s from %s to %s.' @@ -76,6 +78,7 @@ def download(self, force=False): logging.warning(log_msg, sig_object, object, self.bucket, local_sig_file, err) # Now we download the file itself. try: + logging.info(f"Downloading file {object} to {local_file}") self.s3.download_file(self.bucket, object, local_file) except Exception as err: log_msg = 'Failed to download %s from %s to %s.\nException: %s' @@ -200,13 +203,16 @@ def verify_signatures(self): if not os.path.exists(sig_file): logging.warning(sig_missing_msg % sig_file) sig_missing = True + logging.info(f"Signature file {sig_file} is missing.") if sig_missing: # If signature files are missing, we return a failure, # unless the configuration specifies that signatures are not required. if self.config['signatures'].getboolean('signatures_required', True): + logging.error(f"Signature file {sig_file} is missing.") return False else: + logging.info(f"Signature file {sig_file} is missing, but signatures are not required.") return True # If signatures are provided, we should always verify them, regardless of the signatures_required. @@ -234,6 +240,8 @@ def verify_signatures(self): logging.debug(f'Signature for {file} successfully verified.') else: logging.error(f'Failed to verify signature for {file}.') + logging.error(f" stdout: {verify_cmd.stdout.decode('UTF-8')}") + logging.error(f" stderr: {verify_cmd.stderr.decode('UTF-8')}") return False self.sig_verified = True From e1b1ee67c1d3ef4637b094df96d72b7fc13071a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 07:46:03 +0200 Subject: [PATCH 016/218] add capability to run verify script in a container --- scripts/automated_ingestion/eessitarball.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index af22f4fa..795d10e5 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -217,6 +217,7 @@ def verify_signatures(self): # If signatures are provided, we should always verify them, regardless of the signatures_required. 
# In order to do so, we need the verification script and an allowed signers file. + verify_runenv = self.config['signatures']['signature_verification_runenv'].split() verify_script = self.config['signatures']['signature_verification_script'] allowed_signers_file = self.config['signatures']['allowed_signers_file'] if not os.path.exists(verify_script): @@ -231,9 +232,12 @@ def verify_signatures(self): (self.local_path, self.local_sig_path), (self.local_metadata_path, self.local_metadata_sig_path) ]: + command = verify_runenv + [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, + '--file', file, '--signature-file', sig_file] + logging.info(f"Running command: {' '.join(command)}") + verify_cmd = subprocess.run( - [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, - '--file', file, '--signature-file', sig_file], + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if verify_cmd.returncode == 0: From 56a9d682232bc8f7886f69618f89cd64dc389026 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 08:14:00 +0200 Subject: [PATCH 017/218] improve branch naming and only add files to staged dir in main branch --- scripts/automated_ingestion/eessitarball.py | 23 ++++++++++++--------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 795d10e5..61d24a7c 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -363,7 +363,7 @@ def print_unknown(self): def find_next_sequence_number(self, repo, pr_id): """Find the next available sequence number for staging PRs of a source PR.""" # Search for existing branches for this source PR - base_branch = f'staging-{repo.replace("/", "-")}-{pr_id}' + base_branch = f'staging-{repo.replace("/", "-")}-pr-{pr_id}-seq-' existing_branches = [ ref.ref for ref in self.git_repo.get_git_refs() if ref.ref.startswith(f'refs/heads/{base_branch}') @@ -377,7 +377,7 @@ def find_next_sequence_number(self, repo, pr_id): for branch in existing_branches: try: # Extract the sequence number from branch name - # Format: staging-repo-pr_id-sequence + # Format: staging--pr--seq- sequence = int(branch.split('-')[-1]) sequence_numbers.append(sequence) except (ValueError, IndexError): @@ -401,7 +401,7 @@ def make_approval_request(self, tarballs_in_group=None): # find next sequence number for staging PRs of this source PR sequence = self.find_next_sequence_number(repo, pr_id) - git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + git_branch = f'staging-{repo.replace("/", "-")}-pr-{pr_id}-seq-{sequence}' # Check if git_branch exists and what the status of the corressponding PR is main_branch = self.git_repo.get_branch('main') @@ -633,19 +633,22 @@ def process_group(self, tarballs): return # Get branch name from first tarball - with open(self.first_tar.local_metadata_path, 'r') as meta: - metadata = json.load(meta) - repo, pr_id = metadata['link2pr']['repo'], metadata['link2pr']['pr'] - sequence = self.first_tar.find_next_sequence_number(repo, pr_id) - git_branch = f'staging-{repo.replace("/", "-")}-{pr_id}-{sequence}' + # with open(self.first_tar.local_metadata_path, 'r') as meta: + # metadata = json.load(meta) + # repo, pr_id = metadata['link2pr']['repo'], metadata['link2pr']['pr'] + # sequence = self.first_tar.find_next_sequence_number(repo, pr_id) + # git_branch = f'staging-{repo.replace("/", "-")}-pr-{pr_id}-seq-{sequence}' - 
logging.info(f"Creating group branch: {git_branch}") + # logging.info(f"Creating group branch: {git_branch}") # Mark all tarballs as staged in the group branch for tarball in tarballs: logging.info(f"Processing tarball in group: {tarball}") temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - temp_tar.mark_new_tarball_as_staged(branch=git_branch) + # temp_tar.mark_new_tarball_as_staged(branch=git_branch) + temp_tar.mark_new_tarball_as_staged('main') + + exit() # Process the group for approval self.first_tar.make_approval_request(tarballs) From 935025f3b586f923e35a7275ed2ab0fc8f166923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 08:22:03 +0200 Subject: [PATCH 018/218] don't stop after staging files --- scripts/automated_ingestion/eessitarball.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 61d24a7c..9b585067 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -648,8 +648,6 @@ def process_group(self, tarballs): # temp_tar.mark_new_tarball_as_staged(branch=git_branch) temp_tar.mark_new_tarball_as_staged('main') - exit() - # Process the group for approval self.first_tar.make_approval_request(tarballs) From 682ddbddee1fbb7b2c9d2ba6fc37634a3b7b551a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 08:31:20 +0200 Subject: [PATCH 019/218] add more log info when moving metadata files --- scripts/automated_ingestion/eessitarball.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 9b585067..1b722400 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -433,6 +433,7 @@ def make_approval_request(self, tarballs_in_group=None): self.git_repo.create_git_ref(ref='refs/heads/' + git_branch, sha=main_branch.commit.sha) # Move metadata file(s) to staged directory + logging.info(f"Moving metadata for {self.object} from {self.state} to {next_state} in branch {git_branch}") if tarballs_in_group is None: logging.info(f"Moving metadata for individual tarball to staged") self.move_metadata_file(self.state, next_state, branch=git_branch) @@ -521,7 +522,7 @@ def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file file_path_new = new_state + '/' + self.metadata_file - logging.debug(f'Moving metadata file {self.metadata_file} from {file_path_old} to {file_path_new}.') + logging.info(f'Moving metadata file {self.metadata_file} from {file_path_old} to {file_path_new} in branch {branch}') tarball_metadata = self.git_repo.get_contents(file_path_old) # Remove the metadata file from the old state's directory... 
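         # (the GitHub contents API applies this delete as a commit on the given branch)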
self.git_repo.delete_file(file_path_old, 'remove from ' + old_state, sha=tarball_metadata.sha, branch=branch) From 412cc464656ac7e241e0985109f8fd8cd9f8c2ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 08:56:10 +0200 Subject: [PATCH 020/218] need to handle first tarball differently --- scripts/automated_ingestion/eessitarball.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 1b722400..ea41f0b7 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -432,7 +432,7 @@ def make_approval_request(self, tarballs_in_group=None): # Create new branch self.git_repo.create_git_ref(ref='refs/heads/' + git_branch, sha=main_branch.commit.sha) - # Move metadata file(s) to staged directory + # Move metadata file(s) to approved directory logging.info(f"Moving metadata for {self.object} from {self.state} to {next_state} in branch {git_branch}") if tarballs_in_group is None: logging.info(f"Moving metadata for individual tarball to staged") @@ -633,23 +633,15 @@ def process_group(self, tarballs): logging.error("Tarballs have inconsistent link2pr information") return - # Get branch name from first tarball - # with open(self.first_tar.local_metadata_path, 'r') as meta: - # metadata = json.load(meta) - # repo, pr_id = metadata['link2pr']['repo'], metadata['link2pr']['pr'] - # sequence = self.first_tar.find_next_sequence_number(repo, pr_id) - # git_branch = f'staging-{repo.replace("/", "-")}-pr-{pr_id}-seq-{sequence}' - - # logging.info(f"Creating group branch: {git_branch}") - - # Mark all tarballs as staged in the group branch - for tarball in tarballs: + # Mark all tarballs as staged in the group branch, however need to handle first tarball differently + logging.info(f"Processing first tarball in group: {self.first_tar.object}") + self.first_tar.mark_new_tarball_as_staged('main') # this sets the state of the first tarball to 'staged' + for tarball in tarballs[1:]: logging.info(f"Processing tarball in group: {tarball}") temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - # temp_tar.mark_new_tarball_as_staged(branch=git_branch) temp_tar.mark_new_tarball_as_staged('main') - # Process the group for approval + # Process the group for approval, only works correctly if first tarball is already in state 'staged' self.first_tar.make_approval_request(tarballs) def to_string(self, oneline=False): From d0f0d260241d137d8b2e6d8e96bc3a5b0cab2d30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 09:09:05 +0200 Subject: [PATCH 021/218] move from staged to approval plus a little more logging --- scripts/automated_ingestion/eessitarball.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index ea41f0b7..7ed4ff0d 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -392,7 +392,7 @@ def find_next_sequence_number(self, repo, pr_id): def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - + logging.info(f"Making approval request for tarball {self.object} in state {self.state} to {next_state}") # obtain link2pr information (repo and 
pr_id) from metadata file with open(self.local_metadata_path, 'r') as meta: metadata = meta.read() @@ -441,7 +441,7 @@ def make_approval_request(self, tarballs_in_group=None): logging.info(f"Moving metadata for {len(tarballs_in_group)} tarballs to staged") for tarball in tarballs_in_group: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - temp_tar.move_metadata_file('new', 'staged', branch=git_branch) + temp_tar.move_metadata_file(self.state, next_state, branch=git_branch) # Create PR with appropriate template try: From c2c456d219b2325e7ad4173ce2d7489d193fca71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 09:25:36 +0200 Subject: [PATCH 022/218] fix missing function --- scripts/automated_ingestion/eessitarball.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 7ed4ff0d..e50840a9 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -513,7 +513,7 @@ def format_metadata_list(self, tarballs): """Format metadata for all tarballs in collapsible sections.""" formatted = "### Metadata\n\n" for tarball in tarballs: - with open(self.get_metadata_path(tarball), 'r') as meta: + with open(self.metadata_file, 'r') as meta: metadata = meta.read() formatted += f"
<details>\n<summary>Metadata for {tarball}</summary>\n\n```\n{metadata}\n```\n</details>
\n\n" return formatted From 5cad5eda71dfd8f76805acc531848200ecbc8f33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 27 Apr 2025 09:38:42 +0200 Subject: [PATCH 023/218] improve get_metadata_path --- scripts/automated_ingestion/eessitarball.py | 25 ++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index e50840a9..8ad92fe5 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -513,11 +513,34 @@ def format_metadata_list(self, tarballs): """Format metadata for all tarballs in collapsible sections.""" formatted = "### Metadata\n\n" for tarball in tarballs: - with open(self.metadata_file, 'r') as meta: + with open(self.get_metadata_path(tarball), 'r') as meta: metadata = meta.read() formatted += f"
<details>\n<summary>Metadata for {tarball}</summary>\n\n```\n{metadata}\n```\n</details>
\n\n" return formatted + def get_metadata_path(self, tarball=None): + """ + Return the local path of the metadata file. + + Args: + tarball (str, optional): Name of the tarball to get metadata path for. + If None, use the current tarball's metadata file. + """ + if tarball is None: + # For single tarball, use the instance's metadata file + if not self.local_metadata_path: + self.local_metadata_path = os.path.join( + self.config['paths']['download_dir'], + os.path.basename(self.metadata_file) + ) + return self.local_metadata_path + else: + # For group of tarballs, construct path from tarball name + return os.path.join( + self.config['paths']['download_dir'], + os.path.basename(tarball) + self.config['paths']['metadata_file_extension'] + ) + def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file From d045aa54e90b2ea638c2768ad6ca580f2e4580d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 18:42:17 +0200 Subject: [PATCH 024/218] enhance logging capabilities --- .../automated_ingestion.py | 16 +++ scripts/automated_ingestion/eessitarball.py | 10 +- scripts/automated_ingestion/utils.py | 120 ++++++++++++++++++ 3 files changed, 145 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 9eb717cb..78310ea6 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -3,6 +3,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from pid.decorator import pidfile # noqa: F401 from pid import PidFileError +from utils import log_function_entry_exit import argparse import boto3 @@ -54,6 +55,7 @@ def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt return tarballs +@log_function_entry_exit() def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): """Return a dictionary of tarball groups, keyed by (repo, pr_number).""" tarballs = find_tarballs(s3, bucket, extension, metadata_extension) @@ -86,6 +88,7 @@ def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extens return groups +@log_function_entry_exit() def parse_config(path): """Parse the configuration file.""" config = configparser.ConfigParser() @@ -116,6 +119,7 @@ def parse_config(path): return config +@log_function_entry_exit() def parse_args(): """Parse the command-line arguments.""" parser = argparse.ArgumentParser() @@ -133,6 +137,11 @@ def parse_args(): logging_group.add_argument('--quiet', action='store_true', help='Suppress console output (overrides all other console settings)') + logging_group.add_argument('--log-scopes', + help='Comma-separated list of logging scopes using +/- syntax. ' + 'Examples: "+FUNC_ENTRY_EXIT" (enable only function entry/exit), ' + '"+ALL,-FUNC_ENTRY_EXIT" (enable all except function entry/exit), ' + '"+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" (enable function entry/exit but disable example)') # Existing arguments parser.add_argument('-c', '--config', type=str, help='path to configuration file', @@ -143,6 +152,7 @@ def parse_args(): return parser.parse_args() +@log_function_entry_exit() def setup_logging(config, args): """ Configure logging based on configuration file and command line arguments. 
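For example, a run that enables every scope except function tracing could look like this (the config file path is a placeholder):

```
./automated_ingestion.py -c automated_ingestion.cfg --log-scopes '+ALL,-FUNC_ENTRY_EXIT'
```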
@@ -167,6 +177,11 @@ def setup_logging(config, args): if args.debug: console_level = logging.DEBUG + # Set up logging scopes + if args.log_scopes: + from utils import set_logging_scopes + set_logging_scopes(args.log_scopes) + # Create logger logger = logging.getLogger() logger.setLevel(logging.DEBUG) # Set root logger to lowest level @@ -197,6 +212,7 @@ def setup_logging(config, args): @pid.decorator.pidfile('automated_ingestion.pid') +@log_function_entry_exit() def main(): """Main function.""" args = parse_args() diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 8ad92fe5..4885f665 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -1,4 +1,4 @@ -from utils import send_slack_message, sha256sum +from utils import send_slack_message, sha256sum, log_function_entry_exit from pathlib import PurePosixPath @@ -18,6 +18,7 @@ class EessiTarball: for which it interfaces with the S3 bucket, GitHub, and CVMFS. """ + @log_function_entry_exit() def __init__(self, object_name, config, git_staging_repo, s3, bucket, cvmfs_repo): """Initialize the tarball object.""" self.config = config @@ -48,6 +49,7 @@ def __init__(self, object_name, config, git_staging_repo, s3, bucket, cvmfs_repo # Find the initial state of this tarball. self.state = self.find_state() + @log_function_entry_exit() def download(self, force=False): """ Download this tarball and its corresponding metadata file, if this hasn't been already done. @@ -90,6 +92,7 @@ def download(self, force=False): self.local_path = None self.local_metadata_path = None + @log_function_entry_exit() def find_state(self): """Find the state of this tarball by searching through the state directories in the git repository.""" logging.debug(f"Find state for {self.object}") @@ -194,6 +197,7 @@ def to_string(self, oneline=False): str += f"{sep} GHrepo: {self.git_repo}" return str + @log_function_entry_exit() def verify_signatures(self): """Verify the signatures of the downloaded tarball and metadata file using the corresponding signature files.""" @@ -251,6 +255,7 @@ def verify_signatures(self): self.sig_verified = True return True + @log_function_entry_exit() def verify_checksum(self): """Verify the checksum of the downloaded tarball with the one in its metadata file.""" local_sha256 = sha256sum(self.local_path) @@ -261,6 +266,7 @@ def verify_checksum(self): logging.debug(f'Checksum stored in metadata file: {meta_sha256}') return local_sha256 == meta_sha256 + @log_function_entry_exit() def ingest(self): """Process a tarball that is ready to be ingested by running the ingestion script.""" # TODO: check if there is an open issue for this tarball, and if there is, skip it. 
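With the FUNC_ENTRY_EXIT scope enabled, each decorated method logs an entry line and a timed exit line, roughly like this (illustrative output, assuming the default log format):

```
DEBUG: Entering download
DEBUG: Exiting download (took 0.812s)
```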
@@ -321,6 +327,7 @@ def print_ingested(self): """Process a tarball that has already been ingested.""" logging.info(f'{self.object} has already been ingested, skipping...') + @log_function_entry_exit() def mark_new_tarball_as_staged(self, branch=None): """Process a new tarball that was added to the staging bucket.""" next_state = self.next_state(self.state) @@ -389,6 +396,7 @@ def find_next_sequence_number(self, repo, pr_id): # Return next available sequence number return max(sequence_numbers) + 1 + @log_function_entry_exit() def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 66843dd9..bed75469 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -1,7 +1,87 @@ import hashlib import json import requests +import logging +import functools +import time +from enum import IntFlag, auto +class LoggingScope(IntFlag): + """Enumeration of different logging scopes.""" + NONE = 0 + FUNC_ENTRY_EXIT = auto() # Function entry/exit logging + # Add more scopes here as needed + # EXAMPLE_SCOPE = auto() + # ANOTHER_SCOPE = auto() + ALL = FUNC_ENTRY_EXIT # Update this when adding new scopes + +# Global setting for logging scopes +ENABLED_LOGGING_SCOPES = LoggingScope.NONE + +def set_logging_scopes(scopes): + """ + Set the enabled logging scopes. + + Args: + scopes: Can be: + - A LoggingScope value + - A string with comma-separated values using +/- syntax: + - "+SCOPE" to enable a scope + - "-SCOPE" to disable a scope + - "ALL" or "+ALL" to enable all scopes + - "-ALL" to disable all scopes + Examples: + "+FUNC_ENTRY_EXIT" # Enable only function entry/exit + "+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" # Enable function entry/exit but disable example + "+ALL,-FUNC_ENTRY_EXIT" # Enable all scopes except function entry/exit + """ + global ENABLED_LOGGING_SCOPES + + if isinstance(scopes, LoggingScope): + ENABLED_LOGGING_SCOPES = scopes + return + + if isinstance(scopes, str): + # Start with no scopes enabled + ENABLED_LOGGING_SCOPES = LoggingScope.NONE + + # Split into individual scope specifications + scope_specs = [s.strip() for s in scopes.split(",")] + + for spec in scope_specs: + if not spec: + continue + + # Check for ALL special case + if spec.upper() in ["ALL", "+ALL"]: + ENABLED_LOGGING_SCOPES = LoggingScope.ALL + continue + elif spec.upper() == "-ALL": + ENABLED_LOGGING_SCOPES = LoggingScope.NONE + continue + + # Parse scope name and operation + operation = spec[0] + scope_name = spec[1:].strip().upper() + + try: + scope_enum = LoggingScope[scope_name] + if operation == '+': + ENABLED_LOGGING_SCOPES |= scope_enum + elif operation == '-': + ENABLED_LOGGING_SCOPES &= ~scope_enum + else: + logging.warning(f"Invalid operation '{operation}' in scope specification: {spec}") + except KeyError: + logging.warning(f"Unknown logging scope: {scope_name}") + + elif isinstance(scopes, list): + # Convert list to comma-separated string and process + set_logging_scopes(",".join(scopes)) + +def is_logging_scope_enabled(scope): + """Check if a specific logging scope is enabled.""" + return bool(ENABLED_LOGGING_SCOPES & scope) def send_slack_message(webhook, msg): """Send a Slack message.""" @@ -25,3 +105,43 @@ def sha256sum(path): for byte_block in iter(lambda: f.read(8192), b''): sha256_hash.update(byte_block) return sha256_hash.hexdigest() + + +def 
log_function_entry_exit(logger=None): + """ + Decorator that logs function entry and exit with timing information. + Only logs if function entry/exit logging is enabled. + + Args: + logger: Optional logger instance. If not provided, uses the root logger. + """ + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not is_logging_scope_enabled(LoggingScope.FUNC_ENTRY_EXIT): + return func(*args, **kwargs) + + # Use provided logger or get root logger + log = logger or logging.getLogger() + + # Log function entry + log.debug(f"Entering {func.__name__}") + start_time = time.time() + + try: + # Execute the function + result = func(*args, **kwargs) + + # Log successful exit + duration = time.time() - start_time + log.debug(f"Exiting {func.__name__} (took {duration:.3f}s)") + return result + + except Exception as e: + # Log error exit + duration = time.time() - start_time + log.error(f"Error in {func.__name__} after {duration:.3f}s: {str(e)}") + raise + + return wrapper + return decorator From f9fd34ec025b5ef9bbb868ecf574b19cc0e300a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 19:31:32 +0200 Subject: [PATCH 025/218] add more logging scopes, convert logging calls and improve code readability --- .../automated_ingestion.py | 9 +- scripts/automated_ingestion/eessitarball.py | 179 +++++++++++------- scripts/automated_ingestion/utils.py | 52 ++++- 3 files changed, 164 insertions(+), 76 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 78310ea6..71b8b61d 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -3,7 +3,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from pid.decorator import pidfile # noqa: F401 from pid import PidFileError -from utils import log_function_entry_exit +from utils import log_function_entry_exit, log_message, LoggingScope, set_logging_scopes import argparse import boto3 @@ -108,7 +108,10 @@ def parse_config(path): # Validate staging_pr_method staging_method = config['github'].get('staging_pr_method', 'individual') if staging_method not in ['individual', 'grouped']: - error(f'Invalid staging_pr_method: "{staging_method}" in configuration file {path}. Must be either "individual" or "grouped".') + error( + f'Invalid staging_pr_method: "{staging_method}" in configuration file {path}. ' + 'Must be either "individual" or "grouped".' 
+ ) # Validate PR body templates if staging_method == 'individual' and 'individual_pr_body' not in config['github']: @@ -179,8 +182,8 @@ def setup_logging(config, args): # Set up logging scopes if args.log_scopes: - from utils import set_logging_scopes set_logging_scopes(args.log_scopes) + log_message(LoggingScope.DEBUG, 'DEBUG', "Enabled logging scopes: %s", args.log_scopes) # Create logger logger = logging.getLogger() diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index 4885f665..ab888964 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -1,4 +1,4 @@ -from utils import send_slack_message, sha256sum, log_function_entry_exit +from utils import send_slack_message, sha256sum, log_function_entry_exit, log_message, LoggingScope from pathlib import PurePosixPath @@ -58,33 +58,41 @@ def download(self, force=False): (self.object, self.local_path, self.object_sig, self.local_sig_path), (self.metadata_file, self.local_metadata_path, self.metadata_sig_file, self.local_metadata_sig_path), ] - logging.info(f"Downloading {files}") + log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading %s", files) skip = False for (object, local_file, sig_object, local_sig_file) in files: if force or not os.path.exists(local_file): # First we try to download signature file, which may or may not be available # and may be optional or required. try: - logging.info(f"Downloading signature file {sig_object} to {local_sig_file}") + log_msg = "Downloading signature file %s to %s" + log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, sig_object, local_sig_file) self.s3.download_file(self.bucket, sig_object, local_sig_file) except Exception as err: log_msg = 'Failed to download signature file %s for %s from %s to %s.' if self.config['signatures'].getboolean('signatures_required', True): log_msg += '\nException: %s' - logging.error(log_msg, sig_object, object, self.bucket, local_sig_file, err) + log_message( + LoggingScope.ERROR, 'ERROR', log_msg, + sig_object, object, self.bucket, local_sig_file, err + ) skip = True break else: - log_msg += ' Ignoring this, because signatures are not required with the current configuration.' + log_msg += ' Ignoring this, because signatures are not required' + log_msg += ' with the current configuration.' log_msg += '\nException: %s' - logging.warning(log_msg, sig_object, object, self.bucket, local_sig_file, err) + log_message( + LoggingScope.DOWNLOAD, 'WARNING', log_msg, + sig_object, object, self.bucket, local_sig_file, err + ) # Now we download the file itself. try: - logging.info(f"Downloading file {object} to {local_file}") + log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading file %s to %s", object, local_file) self.s3.download_file(self.bucket, object, local_file) except Exception as err: log_msg = 'Failed to download %s from %s to %s.\nException: %s' - logging.error(log_msg, object, self.bucket, local_file, err) + log_message(LoggingScope.ERROR, 'ERROR', log_msg, object, self.bucket, local_file, err) skip = True break # If any required download failed, make sure to skip this tarball completely. 
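Each `log_message` call above is gated on its logging scope; a minimal sketch of what the helper (added to utils.py later in this patch) boils down to:

```
import logging

from utils import LoggingScope, is_logging_scope_enabled


def log_message_sketch(scope, level, msg, *args):
    """Emit a log record only if the record's scope was enabled via --log-scopes."""
    if is_logging_scope_enabled(scope):
        getattr(logging.getLogger(), level.lower())(msg, *args)
```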
@@ -95,11 +103,12 @@ def download(self, force=False): @log_function_entry_exit() def find_state(self): """Find the state of this tarball by searching through the state directories in the git repository.""" - logging.debug(f"Find state for {self.object}") + log_message(LoggingScope.DEBUG, 'DEBUG', "Find state for %s", self.object) for state in list(self.states.keys()): try: self.git_repo.get_contents(state + '/' + self.metadata_file) - logging.info(f"Found metadata file {self.metadata_file} in state: {state}") + log_msg = "Found metadata file %s in state: %s" + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.metadata_file, state) return state except github.UnknownObjectException: # no metadata file found in this state's directory, so keep searching... @@ -111,9 +120,9 @@ def find_state(self): else: # if there was some other (e.g. connection) issue, abort the search for this tarball log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' - logging.warning(log_msg, self.object, err.status) + log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) return "unknown" - logging.info(f"Tarball {self.metadata_file} is new") + log_message(LoggingScope.STATE_CHANGE, 'INFO', "Tarball %s is new", self.metadata_file) return "new" def get_contents_overview(self): @@ -199,24 +208,27 @@ def to_string(self, oneline=False): @log_function_entry_exit() def verify_signatures(self): - """Verify the signatures of the downloaded tarball and metadata file using the corresponding signature files.""" - + """ + Verify the signatures of the downloaded tarball and metadata file + using the corresponding signature files. + """ sig_missing_msg = 'Signature file %s is missing.' sig_missing = False for sig_file in [self.local_sig_path, self.local_metadata_sig_path]: if not os.path.exists(sig_file): - logging.warning(sig_missing_msg % sig_file) + log_message(LoggingScope.VERIFICATION, 'WARNING', sig_missing_msg, sig_file) sig_missing = True - logging.info(f"Signature file {sig_file} is missing.") + log_message(LoggingScope.VERIFICATION, 'INFO', "Signature file %s is missing.", sig_file) if sig_missing: # If signature files are missing, we return a failure, # unless the configuration specifies that signatures are not required. if self.config['signatures'].getboolean('signatures_required', True): - logging.error(f"Signature file {sig_file} is missing.") + log_message(LoggingScope.ERROR, 'ERROR', "Signature file %s is missing.", sig_file) return False else: - logging.info(f"Signature file {sig_file} is missing, but signatures are not required.") + log_msg = "Signature file %s is missing, but signatures are not required." + log_message(LoggingScope.VERIFICATION, 'INFO', log_msg, sig_file) return True # If signatures are provided, we should always verify them, regardless of the signatures_required. @@ -225,11 +237,13 @@ def verify_signatures(self): verify_script = self.config['signatures']['signature_verification_script'] allowed_signers_file = self.config['signatures']['allowed_signers_file'] if not os.path.exists(verify_script): - logging.error('Unable to verify signatures, the specified signature verification script does not exist!') + log_msg = 'Unable to verify signatures, the specified signature verification script does not exist!' 
+ log_message(LoggingScope.ERROR, 'ERROR', log_msg) return False if not os.path.exists(allowed_signers_file): - logging.error('Unable to verify signatures, the specified allowed signers file does not exist!') + log_msg = 'Unable to verify signatures, the specified allowed signers file does not exist!' + log_message(LoggingScope.ERROR, 'ERROR', log_msg) return False for (file, sig_file) in [ @@ -238,18 +252,18 @@ def verify_signatures(self): ]: command = verify_runenv + [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, '--file', file, '--signature-file', sig_file] - logging.info(f"Running command: {' '.join(command)}") + log_message(LoggingScope.VERIFICATION, 'INFO', "Running command: %s", ' '.join(command)) verify_cmd = subprocess.run( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if verify_cmd.returncode == 0: - logging.debug(f'Signature for {file} successfully verified.') + log_message(LoggingScope.VERIFICATION, 'DEBUG', 'Signature for %s successfully verified.', file) else: - logging.error(f'Failed to verify signature for {file}.') - logging.error(f" stdout: {verify_cmd.stdout.decode('UTF-8')}") - logging.error(f" stderr: {verify_cmd.stderr.decode('UTF-8')}") + log_message(LoggingScope.ERROR, 'ERROR', 'Failed to verify signature for %s.', file) + log_message(LoggingScope.ERROR, 'ERROR', " stdout: %s", verify_cmd.stdout.decode('UTF-8')) + log_message(LoggingScope.ERROR, 'ERROR', " stderr: %s", verify_cmd.stderr.decode('UTF-8')) return False self.sig_verified = True @@ -262,39 +276,41 @@ def verify_checksum(self): meta_sha256 = None with open(self.local_metadata_path, 'r') as meta: meta_sha256 = json.load(meta)['payload']['sha256sum'] - logging.debug(f'Checksum of downloaded tarball: {local_sha256}') - logging.debug(f'Checksum stored in metadata file: {meta_sha256}') + log_message(LoggingScope.VERIFICATION, 'DEBUG', 'Checksum of downloaded tarball: %s', local_sha256) + log_message(LoggingScope.VERIFICATION, 'DEBUG', 'Checksum stored in metadata file: %s', meta_sha256) return local_sha256 == meta_sha256 @log_function_entry_exit() def ingest(self): """Process a tarball that is ready to be ingested by running the ingestion script.""" # TODO: check if there is an open issue for this tarball, and if there is, skip it. - logging.info(f'Tarball {self.object} is ready to be ingested.') + log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Tarball %s is ready to be ingested.', self.object) self.download() - logging.info('Verifying its signature...') + log_message(LoggingScope.VERIFICATION, 'INFO', 'Verifying its signature...') if not self.verify_signatures(): issue_msg = f'Failed to verify signatures for `{self.object}`' - logging.error(issue_msg) + log_message(LoggingScope.ERROR, 'ERROR', issue_msg) if not self.issue_exists(issue_msg, state='open'): self.git_repo.create_issue(title=issue_msg, body=issue_msg) return else: - logging.debug(f'Signatures of {self.object} and its metadata file successfully verified.') + log_msg = 'Signatures of %s and its metadata file successfully verified.' 
+ log_message(LoggingScope.VERIFICATION, 'DEBUG', log_msg, self.object) - logging.info('Verifying its checksum...') + log_message(LoggingScope.VERIFICATION, 'INFO', 'Verifying its checksum...') if not self.verify_checksum(): issue_msg = f'Failed to verify checksum for `{self.object}`' - logging.error(issue_msg) + log_message(LoggingScope.ERROR, 'ERROR', issue_msg) if not self.issue_exists(issue_msg, state='open'): self.git_repo.create_issue(title=issue_msg, body=issue_msg) return else: - logging.debug(f'Checksum of {self.object} matches the one in its metadata file.') + log_msg = 'Checksum of %s matches the one in its metadata file.' + log_message(LoggingScope.VERIFICATION, 'DEBUG', log_msg, self.object) script = self.config['paths']['ingestion_script'] sudo = ['sudo'] if self.config['cvmfs'].getboolean('ingest_as_root', True) else [] - logging.info(f'Running the ingestion script for {self.object}...') + log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Running the ingestion script for %s...', self.object) ingest_cmd = subprocess.run( sudo + [script, self.cvmfs_repo, self.local_path], stdout=subprocess.PIPE, @@ -319,34 +335,39 @@ def ingest(self): stderr=ingest_cmd.stderr.decode('UTF-8'), ) if self.issue_exists(issue_title, state='open'): - logging.info(f'Failed to ingest {self.object}, but an open issue already exists, skipping...') + log_msg = 'Failed to ingest %s, but an open issue already exists, skipping...' + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.object) else: self.git_repo.create_issue(title=issue_title, body=issue_body) def print_ingested(self): """Process a tarball that has already been ingested.""" - logging.info(f'{self.object} has already been ingested, skipping...') + log_message(LoggingScope.STATE_CHANGE, 'INFO', '%s has already been ingested, skipping...', self.object) @log_function_entry_exit() def mark_new_tarball_as_staged(self, branch=None): """Process a new tarball that was added to the staging bucket.""" next_state = self.next_state(self.state) - logging.info(f'Found new tarball {self.object}, downloading it...') + log_msg = 'Found new tarball %s, downloading it...' + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.object) # Download the tarball and its metadata file. # Use force as it may be a new attempt for an existing tarball that failed before. self.download(force=True) if not self.local_path or not self.local_metadata_path: - logging.warning(f"Skipping tarball {self.object} - download failed") + log_msg = "Skipping tarball %s - download failed" + log_message(LoggingScope.STATE_CHANGE, 'WARNING', log_msg, self.object) return # Verify the signatures of the tarball and metadata file. 
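         # (verification failures themselves are logged under the VERIFICATION and ERROR scopes)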
if not self.verify_signatures(): - logging.warning(f"Skipping tarball {self.object} - signature verification failed") + log_msg = "Skipping tarball %s - signature verification failed" + log_message(LoggingScope.STATE_CHANGE, 'WARNING', log_msg, self.object) return # If no branch is provided, use the main branch target_branch = branch if branch else 'main' - logging.info(f"Adding metadata to '{next_state}' folder in {target_branch} branch") + log_msg = "Adding metadata to '%s' folder in %s branch" + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, next_state, target_branch) file_path_staged = next_state + '/' + self.metadata_file contents = '' @@ -360,12 +381,14 @@ def mark_new_tarball_as_staged(self, branch=None): def print_rejected(self): """Process a (rejected) tarball for which the corresponding PR has been closed witout merging.""" - logging.info("This tarball was rejected, so we're skipping it.") + log_message(LoggingScope.STATE_CHANGE, 'INFO', "This tarball was rejected, so we're skipping it.") # Do we want to delete rejected tarballs at some point? def print_unknown(self): """Process a tarball which has an unknown state.""" - logging.info("The state of this tarball could not be determined, so we're skipping it.") + log_msg = "The state of this tarball could not be determined," + log_msg += " so we're skipping it." + log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg) def find_next_sequence_number(self, repo, pr_id): """Find the next available sequence number for staging PRs of a source PR.""" @@ -400,7 +423,8 @@ def find_next_sequence_number(self, repo, pr_id): def make_approval_request(self, tarballs_in_group=None): """Process a staged tarball by opening a pull request for ingestion approval.""" next_state = self.next_state(self.state) - logging.info(f"Making approval request for tarball {self.object} in state {self.state} to {next_state}") + log_msg = "Making approval request for tarball %s in state %s to %s" + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, self.object, self.state, next_state) # obtain link2pr information (repo and pr_id) from metadata file with open(self.local_metadata_path, 'r') as meta: metadata = meta.read() @@ -414,26 +438,31 @@ def make_approval_request(self, tarballs_in_group=None): # Check if git_branch exists and what the status of the corressponding PR is main_branch = self.git_repo.get_branch('main') if git_branch in [branch.name for branch in self.git_repo.get_branches()]: - logging.info(f"Branch {git_branch} already exists, checking the status of the corresponding PR...") + log_msg = "Branch %s already exists, checking the status of the corresponding PR..." 
+ log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, git_branch) find_pr = [pr for pr in self.git_repo.get_pulls(head=git_branch, state='all') if pr.head.ref == git_branch] if find_pr: pr = find_pr.pop(0) if pr.state == 'open': - logging.info('PR is still open, skipping this tarball...') + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'PR is still open, skipping this tarball...') return elif pr.state == 'closed' and not pr.merged: - logging.info('PR was rejected') + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'PR was rejected') self.reject() return else: - logging.warn(f'Warning, tarball {self.object} is in a weird state:') - logging.warn(f'Branch: {git_branch}\nPR: {pr}\nPR state: {pr.state}\nPR merged: {pr.merged}') + log_msg = 'Warning, tarball %s is in a weird state:' + log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, self.object) + log_msg = 'Branch: %s\nPR: %s\nPR state: %s\nPR merged: %s' + log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, + git_branch, pr, pr.state, pr.merged) # TODO: should we delete the branch or open an issue? return else: - logging.info(f'Tarball {self.object} has a branch, but no PR.') - logging.info('Removing existing branch...') + log_msg = 'Tarball %s has a branch, but no PR.' + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, self.object) + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'Removing existing branch...') ref = self.git_repo.get_git_ref(f'heads/{git_branch}') ref.delete() @@ -441,12 +470,15 @@ def make_approval_request(self, tarballs_in_group=None): self.git_repo.create_git_ref(ref='refs/heads/' + git_branch, sha=main_branch.commit.sha) # Move metadata file(s) to approved directory - logging.info(f"Moving metadata for {self.object} from {self.state} to {next_state} in branch {git_branch}") + log_msg = "Moving metadata for %s from %s to %s in branch %s" + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, + self.object, self.state, next_state, git_branch) if tarballs_in_group is None: - logging.info(f"Moving metadata for individual tarball to staged") + log_message(LoggingScope.GITHUB_OPS, 'INFO', "Moving metadata for individual tarball to staged") self.move_metadata_file(self.state, next_state, branch=git_branch) else: - logging.info(f"Moving metadata for {len(tarballs_in_group)} tarballs to staged") + log_msg = "Moving metadata for %d tarballs to staged" + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, len(tarballs_in_group)) for tarball in tarballs_in_group: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) temp_tar.move_metadata_file(self.state, next_state, branch=git_branch) @@ -455,7 +487,8 @@ def make_approval_request(self, tarballs_in_group=None): try: pr_url=f"https://github.com/{repo}/pull/{pr_id}", if tarballs_in_group is None: - logging.info(f"Creating PR for individual tarball: {self.object}") + log_msg = "Creating PR for individual tarball: %s" + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, self.object) tarball_contents = self.get_contents_overview() pr_body = self.config['github']['individual_pr_body'].format( cvmfs_repo=self.cvmfs_repo, @@ -476,7 +509,8 @@ def make_approval_request(self, tarballs_in_group=None): tar_details_tpl = "
<details>\n<summary>Contents of %s</summary>\n\n%s\n</details>\n"
                     tar_overviews.append(tar_details_tpl % (tarball, overview))
                 except Exception as err:
-                    logging.error(f"Failed to get contents overview for {tarball}: {err}")
+                    log_msg = "Failed to get contents overview for %s: %s"
+                    log_message(LoggingScope.ERROR, 'ERROR', log_msg, tarball, err)
                     tar_details_tpl = "<details>\n<summary>Contents of %s</summary>\n\n"
                     tar_details_tpl += "Failed to get contents overview: %s\n</details>
\n" tar_overviews.append(tar_details_tpl % (tarball, err)) @@ -497,10 +531,10 @@ def make_approval_request(self, tarballs_in_group=None): pr_title += ' :closed_lock_with_key:' self.git_repo.create_pull(title=pr_title, body=pr_body, head=git_branch, base='main') - logging.info(f"Created PR: {pr_title}") + log_message(LoggingScope.GITHUB_OPS, 'INFO', "Created PR: %s", pr_title) except Exception as err: - logging.error(f"Failed to create PR: {err}") + log_message(LoggingScope.ERROR, 'ERROR', "Failed to create PR: %s", err) if not self.issue_exists(f'Failed to get contents of {self.object}', state='open'): self.git_repo.create_issue( title=f'Failed to get contents of {self.object}', @@ -523,7 +557,10 @@ def format_metadata_list(self, tarballs): for tarball in tarballs: with open(self.get_metadata_path(tarball), 'r') as meta: metadata = meta.read() - formatted += f"
<details>\n<summary>Metadata for {tarball}</summary>\n\n```\n{metadata}\n```\n</details>\n\n"
+        formatted += (
+            f"<details>\n<summary>Metadata for {tarball}</summary>\n\n"
+            f"```\n{metadata}\n```\n</details>
\n\n" + ) return formatted def get_metadata_path(self, tarball=None): @@ -553,7 +590,8 @@ def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file file_path_new = new_state + '/' + self.metadata_file - logging.info(f'Moving metadata file {self.metadata_file} from {file_path_old} to {file_path_new} in branch {branch}') + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'Moving metadata file %s from %s to %s in branch %s', + self.metadata_file, file_path_old, file_path_new, branch) tarball_metadata = self.git_repo.get_contents(file_path_old) # Remove the metadata file from the old state's directory... self.git_repo.delete_file(file_path_old, 'remove from ' + old_state, sha=tarball_metadata.sha, branch=branch) @@ -608,7 +646,7 @@ def extract_tarballs_from_pr_body(self, pr_body): def reject(self): """Reject a tarball for ingestion.""" # Let's move the the tarball to the directory for rejected tarballs. - logging.info(f'Marking tarball {self.object} as rejected...') + log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Marking tarball %s as rejected...', self.object) next_state = 'rejected' self.move_metadata_file(self.state, next_state) @@ -644,31 +682,34 @@ def download_tarballs_and_more(self, tarballs): """Download all files associated with this group of tarballs.""" for tarball in tarballs: temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - logging.info(f"downloading files for '{temp_tar.object}'") + log_message(LoggingScope.GROUP_OPS, 'INFO', "downloading files for '%s'", temp_tar.object) temp_tar.download(force=True) if not temp_tar.local_path or not temp_tar.local_metadata_path: - logging.warn(f"Skipping this tarball: {temp_tar.object}") + log_message(LoggingScope.GROUP_OPS, 'WARNING', "Skipping this tarball: %s", temp_tar.object) return False return True def process_group(self, tarballs): """Process a group of tarballs together.""" - logging.info(f"Processing group of {len(tarballs)} tarballs") + log_message(LoggingScope.GROUP_OPS, 'INFO', "Processing group of %d tarballs", len(tarballs)) if not self.download_tarballs_and_more(tarballs): - logging.error("Downloading tarballs, metadata files and/or their signatures failed") + log_msg = "Downloading tarballs, metadata files and/or their signatures failed" + log_message(LoggingScope.ERROR, 'ERROR', log_msg) return # Verify all tarballs have the same link2pr info if not self.verify_group_consistency(tarballs): - logging.error("Tarballs have inconsistent link2pr information") + log_message(LoggingScope.ERROR, 'ERROR', "Tarballs have inconsistent link2pr information") return # Mark all tarballs as staged in the group branch, however need to handle first tarball differently - logging.info(f"Processing first tarball in group: {self.first_tar.object}") + log_msg = "Processing first tarball in group: %s" + log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, self.first_tar.object) self.first_tar.mark_new_tarball_as_staged('main') # this sets the state of the first tarball to 'staged' for tarball in tarballs[1:]: - logging.info(f"Processing tarball in group: {tarball}") + log_msg = "Processing tarball in group: %s" + log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, tarball) temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) temp_tar.mark_new_tarball_as_staged('main') @@ -692,7 +733,7 @@ def 
verify_group_consistency(self, tarballs): for tarball in tarballs[1:]: # Skip first tarball as we already have its info temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) - logging.debug(f"temp tar: {temp_tar.to_string()}") + log_message(LoggingScope.DEBUG, 'DEBUG', "temp tar: %s", temp_tar.to_string()) repo, pr = temp_tar.get_link2pr_info() if repo != first_repo or pr != first_pr: return False diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index bed75469..2c3aeb3c 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -10,10 +10,15 @@ class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" NONE = 0 FUNC_ENTRY_EXIT = auto() # Function entry/exit logging - # Add more scopes here as needed - # EXAMPLE_SCOPE = auto() - # ANOTHER_SCOPE = auto() - ALL = FUNC_ENTRY_EXIT # Update this when adding new scopes + DOWNLOAD = auto() # Logging related to file downloads + VERIFICATION = auto() # Logging related to signature and checksum verification + STATE_CHANGE = auto() # Logging related to tarball state changes + GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) + GROUP_OPS = auto() # Logging related to tarball group operations + ERROR = auto() # Error logging (separate from other scopes for easier filtering) + DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) + ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_CHANGE | + GITHUB_OPS | GROUP_OPS | ERROR | DEBUG) # Global setting for logging scopes ENABLED_LOGGING_SCOPES = LoggingScope.NONE @@ -145,3 +150,42 @@ def wrapper(*args, **kwargs): return wrapper return decorator + +def log_with_scope(scope, logger=None): + """ + Decorator that checks if a specific logging scope is enabled before logging. + + Args: + scope: LoggingScope value indicating which scope this logging belongs to + logger: Optional logger instance. If not provided, uses the root logger. + """ + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not is_logging_scope_enabled(scope): + return func(*args, **kwargs) + return func(*args, **kwargs) + return wrapper + return decorator + +def log_message(scope, level, msg, *args, logger=None, **kwargs): + """ + Log a message if the specified scope is enabled. + + Args: + scope: LoggingScope value indicating which scope this logging belongs to + level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + msg: Message to log + logger: Optional logger instance. If not provided, uses the root logger. 
+ *args, **kwargs: Additional arguments to pass to the logging function + """ + if not is_logging_scope_enabled(scope): + return + + log = logger or logging.getLogger() + log_func = getattr(log, level.lower()) + log_func(msg, *args, **kwargs) + +# Example usage: +# log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading file: %s", filename) +# log_message(LoggingScope.ERROR, 'ERROR', "Failed to download: %s", error_msg) From aab9f6a2539fc8c14b6884c7f443ac19de81bad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 19:42:26 +0200 Subject: [PATCH 026/218] show function entry/exit at info level --- scripts/automated_ingestion/utils.py | 32 +++++++++++----------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 2c3aeb3c..214673c0 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -115,10 +115,10 @@ def sha256sum(path): def log_function_entry_exit(logger=None): """ Decorator that logs function entry and exit with timing information. - Only logs if function entry/exit logging is enabled. + Only logs if the FUNC_ENTRY_EXIT scope is enabled. Args: - logger: Optional logger instance. If not provided, uses the root logger. + logger: Optional logger instance. If not provided, uses the module's logger. """ def decorator(func): @functools.wraps(func) @@ -126,28 +126,22 @@ def wrapper(*args, **kwargs): if not is_logging_scope_enabled(LoggingScope.FUNC_ENTRY_EXIT): return func(*args, **kwargs) - # Use provided logger or get root logger - log = logger or logging.getLogger() + if logger is None: + log = logging.getLogger(func.__module__) + else: + log = logger - # Log function entry - log.debug(f"Entering {func.__name__}") start_time = time.time() - + log.info(f"Entering {func.__name__}") try: - # Execute the function result = func(*args, **kwargs) - - # Log successful exit - duration = time.time() - start_time - log.debug(f"Exiting {func.__name__} (took {duration:.3f}s)") + end_time = time.time() + log.info(f"Exiting {func.__name__} (took {end_time - start_time:.2f}s)") return result - - except Exception as e: - # Log error exit - duration = time.time() - start_time - log.error(f"Error in {func.__name__} after {duration:.3f}s: {str(e)}") - raise - + except Exception as err: + end_time = time.time() + log.info(f"Exiting {func.__name__} with exception (took {end_time - start_time:.2f}s)") + raise err return wrapper return decorator From 678f0d755d780e4b24cd92122acc2eb4e579d337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 19:55:09 +0200 Subject: [PATCH 027/218] tweak func leave msg and add context info --- scripts/automated_ingestion/utils.py | 33 +++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 214673c0..aa841563 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -4,6 +4,7 @@ import logging import functools import time +import os from enum import IntFlag, auto class LoggingScope(IntFlag): @@ -131,16 +132,42 @@ def wrapper(*args, **kwargs): else: log = logger + # Get context information if available + context = "" + if len(args) > 0 and hasattr(args[0], 'object'): + # For EessiTarball methods, show the tarball name and state + tarball = args[0] + filename = os.path.basename(tarball.object) + + # Format filename to show 
important parts + if len(filename) > 30: + parts = filename.split('-') + if len(parts) >= 6: # Ensure we have all required parts + # Get version, component, last part of architecture, and epoch + version = parts[1] + component = parts[2] + arch_last = parts[-3].split('-')[-1] # Last part of architecture + epoch = parts[-2] + filename = f"{version}-{component}-{arch_last}-{epoch}.tar.gz" + else: + # Fallback to simple truncation if format doesn't match + filename = f"{filename[:15]}...{filename[-12:]}" + + context = f" [{filename}" + if hasattr(tarball, 'state'): + context += f" in {tarball.state}" + context += "]" + start_time = time.time() - log.info(f"Entering {func.__name__}") + log.info(f"Entering {func.__name__}{context}") try: result = func(*args, **kwargs) end_time = time.time() - log.info(f"Exiting {func.__name__} (took {end_time - start_time:.2f}s)") + log.info(f"Leaving {func.__name__}{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: end_time = time.time() - log.info(f"Exiting {func.__name__} with exception (took {end_time - start_time:.2f}s)") + log.info(f"Leaving {func.__name__}{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper return decorator From 7ecb9be140e87310d98f70bde13c1a0e3e254342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 20:05:22 +0200 Subject: [PATCH 028/218] fix shown file components and illustrate call stack depth --- scripts/automated_ingestion/utils.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index aa841563..0786e22f 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -5,6 +5,7 @@ import functools import time import os +import inspect from enum import IntFlag, auto class LoggingScope(IntFlag): @@ -24,6 +25,9 @@ class LoggingScope(IntFlag): # Global setting for logging scopes ENABLED_LOGGING_SCOPES = LoggingScope.NONE +# Global variable to track call stack depth +_call_stack_depth = 0 + def set_logging_scopes(scopes): """ Set the enabled logging scopes. 
@@ -124,6 +128,8 @@ def log_function_entry_exit(logger=None): def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): + global _call_stack_depth + if not is_logging_scope_enabled(LoggingScope.FUNC_ENTRY_EXIT): return func(*args, **kwargs) @@ -146,9 +152,9 @@ def wrapper(*args, **kwargs): # Get version, component, last part of architecture, and epoch version = parts[1] component = parts[2] - arch_last = parts[-3].split('-')[-1] # Last part of architecture - epoch = parts[-2] - filename = f"{version}-{component}-{arch_last}-{epoch}.tar.gz" + arch_last = parts[-2].split('-')[-1] # Last part of architecture + epoch = parts[-1] # includes file extension + filename = f"{version}-{component}-{arch_last}-{epoch}" else: # Fallback to simple truncation if format doesn't match filename = f"{filename[:15]}...{filename[-12:]}" @@ -158,16 +164,22 @@ def wrapper(*args, **kwargs): context += f" in {tarball.state}" context += "]" + # Create indentation based on call stack depth + indent = " " * _call_stack_depth + start_time = time.time() - log.info(f"Entering {func.__name__}{context}") + log.info(f"{indent}Entering {func.__name__}{context}") + _call_stack_depth += 1 try: result = func(*args, **kwargs) + _call_stack_depth -= 1 end_time = time.time() - log.info(f"Leaving {func.__name__}{context} (took {end_time - start_time:.2f}s)") + log.info(f"{indent}Leaving {func.__name__}{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: + _call_stack_depth -= 1 end_time = time.time() - log.info(f"Leaving {func.__name__}{context} with exception (took {end_time - start_time:.2f}s)") + log.info(f"{indent}Leaving {func.__name__}{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper return decorator From c65d1f7ae8c9d1c7345b38860284d5824ee651cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 1 May 2025 20:09:38 +0200 Subject: [PATCH 029/218] convert logging calls in automated_ingestion.py --- scripts/automated_ingestion/automated_ingestion.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 71b8b61d..11113fd7 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -34,7 +34,7 @@ def error(msg, code=1): """Print an error and exit.""" - logging.error(msg) + log_message(LoggingScope.ERROR, 'ERROR', msg) sys.exit(code) @@ -78,7 +78,7 @@ def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extens groups[group_key] = [] groups[group_key].append(tarball) except Exception as err: - logging.error(f"Failed to process metadata for {tarball}: {err}") + log_message(LoggingScope.ERROR, 'ERROR', "Failed to process metadata for %s: %s", tarball, err) continue finally: # Clean up downloaded metadata file @@ -239,22 +239,22 @@ def main(): # use new grouped PR method tarball_groups = find_tarball_groups(s3, bucket, config) if args.list_only: - logging.info(f"#tarball_groups: {len(tarball_groups)}") + log_message(LoggingScope.GROUP_OPS, 'INFO', "#tarball_groups: %d", len(tarball_groups)) for (repo, pr_id), tarballs in tarball_groups.items(): - logging.info(f" {repo}#{pr_id}: #tarballs {len(tarballs)}") + log_message(LoggingScope.GROUP_OPS, 'INFO', " %s#%s: #tarballs %d", repo, pr_id, len(tarballs)) else: for (repo, pr_id), tarballs in tarball_groups.items(): if tarballs: # Create a group for these 
tarballs group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - logging.info(f"group created\n{group.to_string(oneline=True)}") + log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True)) group.process_group(tarballs) else: # use old individual PR method tarballs = find_tarballs(s3, bucket) if args.list_only: for num, tarball in enumerate(tarballs): - logging.info(f'[{bucket}] {num}: {tarball}') + log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, tarball) else: for tarball in tarballs: tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) From 0206968c0114ee8239a69db6c07a229207efe6ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 08:39:45 +0200 Subject: [PATCH 030/218] introducing task-based deployments --- .../automated_ingestion.py | 102 ++++++++++++++---- 1 file changed, 81 insertions(+), 21 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 11113fd7..f7d0db89 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -15,6 +15,7 @@ import pid import sys from pathlib import Path +from typing import List, Dict REQUIRED_CONFIG = { 'secrets': ['aws_secret_access_key', 'aws_access_key_id', 'github_pat'], @@ -150,7 +151,8 @@ def parse_args(): parser.add_argument('-c', '--config', type=str, help='path to configuration file', default='automated_ingestion.cfg', dest='config') parser.add_argument('-d', '--debug', help='enable debug mode', action='store_true', dest='debug') - parser.add_argument('-l', '--list', help='only list available tarballs', action='store_true', dest='list_only') + parser.add_argument('-l', '--list', help='only list available tarballs or tasks', action='store_true', dest='list_only') + parser.add_argument('--task-based', help='use task-based ingestion instead of tarball-based', action='store_true') return parser.parse_args() @@ -235,30 +237,88 @@ def main(): buckets = json.loads(config['aws']['staging_buckets']) for bucket, cvmfs_repo in buckets.items(): - if config['github'].get('staging_pr_method', 'individual') == 'grouped': - # use new grouped PR method - tarball_groups = find_tarball_groups(s3, bucket, config) + if args.task_based: + # Task-based listing + tasks = find_deployment_tasks(s3, bucket) if args.list_only: - log_message(LoggingScope.GROUP_OPS, 'INFO', "#tarball_groups: %d", len(tarball_groups)) - for (repo, pr_id), tarballs in tarball_groups.items(): - log_message(LoggingScope.GROUP_OPS, 'INFO', " %s#%s: #tarballs %d", repo, pr_id, len(tarballs)) + log_message(LoggingScope.GROUP_OPS, 'INFO', "#tasks: %d", len(tasks)) + for num, task in enumerate(tasks): + log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, task) else: - for (repo, pr_id), tarballs in tarball_groups.items(): - if tarballs: - # Create a group for these tarballs - group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) - log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True)) - group.process_group(tarballs) + # TODO: Implement task processing + pass else: - # use old individual PR method - tarballs = find_tarballs(s3, bucket) - if args.list_only: - for num, tarball in enumerate(tarballs): - log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, tarball) + # Original tarball-based 
processing + if config['github'].get('staging_pr_method', 'individual') == 'grouped': + # use new grouped PR method + tarball_groups = find_tarball_groups(s3, bucket, config) + if args.list_only: + log_message(LoggingScope.GROUP_OPS, 'INFO', "#tarball_groups: %d", len(tarball_groups)) + for (repo, pr_id), tarballs in tarball_groups.items(): + log_message(LoggingScope.GROUP_OPS, 'INFO', " %s#%s: #tarballs %d", repo, pr_id, len(tarballs)) + else: + for (repo, pr_id), tarballs in tarball_groups.items(): + if tarballs: + # Create a group for these tarballs + group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3, bucket, cvmfs_repo) + log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True)) + group.process_group(tarballs) else: - for tarball in tarballs: - tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) - tar.run_handler() + # use old individual PR method + tarballs = find_tarballs(s3, bucket) + if args.list_only: + for num, tarball in enumerate(tarballs): + log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, tarball) + else: + for tarball in tarballs: + tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) + tar.run_handler() + + +@log_function_entry_exit() +def find_deployment_tasks(s3, bucket: str, extension='.task') -> List[str]: + """ + Return a list of all task files in an S3 bucket with the given extension, + but only if a corresponding payload file exists (same name without extension). + + Args: + s3: boto3 S3 client + bucket: Name of the S3 bucket to scan + extension: File extension to look for (default: '.task') + + Returns: + List of task filenames found in the bucket that have a corresponding payload + """ + files = [] + continuation_token = None + + while True: + # List objects with pagination + if continuation_token: + response = s3.list_objects_v2( + Bucket=bucket, + ContinuationToken=continuation_token + ) + else: + response = s3.list_objects_v2(Bucket=bucket) + + # Add files from this page + files.extend([obj['Key'] for obj in response.get('Contents', [])]) + + # Check if there are more pages + if response.get('IsTruncated'): + continuation_token = response.get('NextContinuationToken') + else: + break + + # Create a set of all files for faster lookup + file_set = set(files) + + # Return only task files that have a corresponding payload + return [ + file for file in files + if file.endswith(extension) and file[:-len(extension)] in file_set + ] if __name__ == '__main__': From 108bec3ef09794c23b747930899972ddc74dee6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 09:14:46 +0200 Subject: [PATCH 031/218] support providing extensions, e.g., . 
.meta.txt --- .../automated_ingestion.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index f7d0db89..0c6884fb 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -152,7 +152,9 @@ def parse_args(): default='automated_ingestion.cfg', dest='config') parser.add_argument('-d', '--debug', help='enable debug mode', action='store_true', dest='debug') parser.add_argument('-l', '--list', help='only list available tarballs or tasks', action='store_true', dest='list_only') - parser.add_argument('--task-based', help='use task-based ingestion instead of tarball-based', action='store_true') + parser.add_argument('--task-based', help='use task-based ingestion instead of tarball-based. ' + 'Optionally specify comma-separated list of extensions (default: .task)', + nargs='?', const='.task', default=False) return parser.parse_args() @@ -239,7 +241,8 @@ def main(): for bucket, cvmfs_repo in buckets.items(): if args.task_based: # Task-based listing - tasks = find_deployment_tasks(s3, bucket) + extensions = args.task_based.split(',') + tasks = find_deployment_tasks(s3, bucket, extensions) if args.list_only: log_message(LoggingScope.GROUP_OPS, 'INFO', "#tasks: %d", len(tasks)) for num, task in enumerate(tasks): @@ -276,19 +279,22 @@ def main(): @log_function_entry_exit() -def find_deployment_tasks(s3, bucket: str, extension='.task') -> List[str]: +def find_deployment_tasks(s3, bucket: str, extensions: List[str] = None) -> List[str]: """ - Return a list of all task files in an S3 bucket with the given extension, + Return a list of all task files in an S3 bucket with the given extensions, but only if a corresponding payload file exists (same name without extension). 
Args: s3: boto3 S3 client bucket: Name of the S3 bucket to scan - extension: File extension to look for (default: '.task') + extensions: List of file extensions to look for (default: ['.task']) Returns: List of task filenames found in the bucket that have a corresponding payload """ + if extensions is None: + extensions = ['.task'] + files = [] continuation_token = None @@ -315,10 +321,14 @@ def find_deployment_tasks(s3, bucket: str, extension='.task') -> List[str]: file_set = set(files) # Return only task files that have a corresponding payload - return [ - file for file in files - if file.endswith(extension) and file[:-len(extension)] in file_set - ] + result = [] + for file in files: + for ext in extensions: + if file.endswith(ext) and file[:-len(ext)] in file_set: + result.append(file) + break # Found a matching extension, no need to check others + + return result if __name__ == '__main__': From 4fce92ac22a3ef1b0f60be407ab43dac9743e5e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 16:35:38 +0200 Subject: [PATCH 032/218] model remote/local files, download them --- .../automated_ingestion.py | 27 ++- .../automated_ingestion/eessi_data_object.py | 190 ++++++++++++++++++ scripts/automated_ingestion/s3_client.py | 132 ++++++++++++ 3 files changed, 347 insertions(+), 2 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_data_object.py create mode 100644 scripts/automated_ingestion/s3_client.py diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 0c6884fb..ff628957 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from eessitarball import EessiTarball, EessiTarballGroup +from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode from pid.decorator import pidfile # noqa: F401 from pid import PidFileError from utils import log_function_entry_exit, log_message, LoggingScope, set_logging_scopes @@ -248,8 +249,30 @@ def main(): for num, task in enumerate(tasks): log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, task) else: - # TODO: Implement task processing - pass + # Process each task file + for task_path in tasks: + try: + # Create EESSIDataAndSignatureObject for the task file + task_obj = EESSIDataAndSignatureObject(config, task_path, s3) + + # Download the task file and its signature + task_obj.download(mode=DownloadMode.CHECK_REMOTE) + + # Log the ETags of the downloaded task file + file_etag, sig_etag = task_obj.get_etags() + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) + log_message(LoggingScope.GROUP_OPS, 'INFO', + "Task signature %s has ETag: %s", + task_obj.remote_sig_path, sig_etag) + + # TODO: Process the task file contents + # This would involve reading the task file, parsing its contents, + # and performing the required actions based on the task type + log_message(LoggingScope.GROUP_OPS, 'INFO', "Processing task file: %s", task_path) + + except Exception as err: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to process task %s: %s", task_path, str(err)) + continue else: # Original tarball-based processing if config['github'].get('staging_pr_method', 'individual') == 'grouped': diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py new file mode 100644 index 00000000..45d36308 --- /dev/null +++ 
b/scripts/automated_ingestion/eessi_data_object.py @@ -0,0 +1,190 @@ +import os +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Optional, Protocol, runtime_checkable + +import boto3 +import configparser + +from .utils import log_function_entry_exit, log_message, LoggingScope + +class DownloadMode(Enum): + """Enum defining different modes for downloading files.""" + FORCE = 'force' # Always download and overwrite + CHECK_REMOTE = 'check-remote' # Download if remote files have changed + CHECK_LOCAL = 'check-local' # Download if files don't exist locally (default) + + +@runtime_checkable +class RemoteStorageClient(Protocol): + """Protocol defining the interface for remote storage clients.""" + + def get_metadata(self, remote_path: str) -> dict: + """Get metadata about a remote object. + + Args: + remote_path: Path to the object in remote storage + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + ... + + def download(self, remote_path: str, local_path: str) -> None: + """Download a remote file to a local location. + + Args: + remote_path: Path to the object in remote storage + local_path: Local path where to save the file + """ + ... + + +@dataclass +class EESSIDataAndSignatureObject: + """Class representing an EESSI data file and its signature in remote storage and locally.""" + + # Configuration + config: configparser.ConfigParser + + # Remote paths + remote_file_path: str # Path to data file in remote storage + remote_sig_path: str # Path to signature file in remote storage + + # Local paths + local_file_path: Path # Path to local data file + local_sig_path: Path # Path to local signature file + + # Remote storage client + remote_client: RemoteStorageClient + + @log_function_entry_exit() + def __init__(self, config: configparser.ConfigParser, remote_file_path: str, remote_client: RemoteStorageClient): + """ + Initialize an EESSI data and signature object handler. 
+ + Args: + config: Configuration object containing remote storage and local directory information + remote_file_path: Path to data file in remote storage + remote_client: Remote storage client implementing the RemoteStorageClient protocol + """ + self.config = config + self.remote_file_path = remote_file_path + sig_ext = config['signatures']['signature_file_extension'] + self.remote_sig_path = remote_file_path + sig_ext + + # Set up local paths + local_dir = Path(config['paths']['download_dir']) + # Use the full remote path structure, removing any leading slashes + remote_path = remote_file_path.lstrip('/') + self.local_file_path = local_dir.joinpath(remote_path) + self.local_sig_path = local_dir.joinpath(remote_path + sig_ext) + self.remote_client = remote_client + + log_message(LoggingScope.DEBUG, 'DEBUG', "Initialized EESSIDataAndSignatureObject for %s", remote_file_path) + log_message(LoggingScope.DEBUG, 'DEBUG', "Local file path: %s", self.local_file_path) + log_message(LoggingScope.DEBUG, 'DEBUG', "Local signature path: %s", self.local_sig_path) + + def _get_etag_file_path(self, local_path: Path) -> Path: + """Get the path to the .etag file for a given local file.""" + return local_path.with_suffix('.etag') + + def _get_local_etag(self, local_path: Path) -> Optional[str]: + """Get the ETag of a local file from its .etag file.""" + etag_path = self._get_etag_file_path(local_path) + if etag_path.exists(): + try: + with open(etag_path, 'r') as f: + return f.read().strip() + except Exception as err: + log_message(LoggingScope.DEBUG, 'WARNING', "Failed to read ETag file %s: %s", etag_path, str(err)) + return None + return None + + def get_etags(self) -> tuple[Optional[str], Optional[str]]: + """ + Get the ETags of both the data file and its signature. + + Returns: + Tuple containing (data_file_etag, signature_file_etag) + """ + return ( + self._get_local_etag(self.local_file_path), + self._get_local_etag(self.local_sig_path) + ) + + @log_function_entry_exit() + def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: + """ + Download data file and signature based on the specified mode. 
+ + Args: + mode: Download mode to use + + Returns: + True if files were downloaded, False otherwise + """ + if mode == DownloadMode.FORCE: + should_download = True + log_message(LoggingScope.DOWNLOAD, 'INFO', "Forcing download of %s", self.remote_file_path) + elif mode == DownloadMode.CHECK_REMOTE: + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] + local_file_etag = self._get_local_etag(self.local_file_path) + local_sig_etag = self._get_local_etag(self.local_sig_path) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + log_msg = "Remote files have changed, downloading %s" + log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + else: + log_msg = "Remote files unchanged, skipping download of %s" + log_message(LoggingScope.DOWNLOAD, 'DEBUG', log_msg, self.remote_file_path) + else: # CHECK_LOCAL + should_download = ( + not self.local_file_path.exists() or + not self.local_sig_path.exists() + ) + if should_download: + log_msg = "Local files missing, downloading %s" + log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + else: + log_msg = "Local files exist, skipping download of %s" + log_message(LoggingScope.DOWNLOAD, 'DEBUG', log_msg, self.remote_file_path) + + if not should_download: + return False + + # Ensure local directory exists + self.local_file_path.parent.mkdir(parents=True, exist_ok=True) + + # Download files + try: + self.remote_client.download(self.remote_file_path, str(self.local_file_path)) + self.remote_client.download(self.remote_sig_path, str(self.local_sig_path)) + + # Log the ETags of downloaded files + file_etag = self._get_local_etag(self.local_file_path) + sig_etag = self._get_local_etag(self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", self.remote_file_path, file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", self.remote_sig_path, sig_etag) + + log_msg = "Successfully downloaded %s and its signature" + log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + return True + except Exception as err: + # Clean up partially downloaded files + if self.local_file_path.exists(): + self.local_file_path.unlink() + if self.local_sig_path.exists(): + self.local_sig_path.unlink() + log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", self.remote_file_path, str(err)) + raise + + def __str__(self) -> str: + """Return a string representation of the EESSI data and signature object.""" + return f"EESSIDataAndSignatureObject({self.remote_file_path})" diff --git a/scripts/automated_ingestion/s3_client.py b/scripts/automated_ingestion/s3_client.py new file mode 100644 index 00000000..e61a5ed7 --- /dev/null +++ b/scripts/automated_ingestion/s3_client.py @@ -0,0 +1,132 @@ +import boto3 +from typing import Dict, Optional +import os +from pathlib import Path + +from .utils import log_function_entry_exit, log_message, LoggingScope +from .eessi_data_object import RemoteStorageClient + +class EESSIS3Client(RemoteStorageClient): + """EESSI-specific S3 client implementation of the RemoteStorageClient protocol.""" + + @log_function_entry_exit() + def __init__(self, config, bucket_name: str): + """ + Initialize the EESSI S3 client. 
+ + Args: + config: Configuration object containing: + - aws.access_key_id: AWS access key ID (optional, can use AWS_ACCESS_KEY_ID env var) + - aws.secret_access_key: AWS secret access key (optional, can use AWS_SECRET_ACCESS_KEY env var) + - aws.endpoint_url: Custom endpoint URL for S3-compatible backends (optional) + - aws.verify: SSL verification setting (optional) + - True: Verify SSL certificates (default) + - False: Skip SSL certificate verification + - str: Path to CA bundle file + bucket_name: Name of the S3 bucket to use + """ + self.bucket = bucket_name + + # Get AWS credentials from environment or config + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('aws', 'access_key_id') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('aws', 'secret_access_key') + + # Configure boto3 client + client_config = {} + + # Add endpoint URL if specified in config + if config.has_option('aws', 'endpoint_url'): + client_config['endpoint_url'] = config['aws']['endpoint_url'] + log_message(LoggingScope.DEBUG, 'DEBUG', "Using custom endpoint URL: %s", client_config['endpoint_url']) + + # Add SSL verification if specified in config + if config.has_option('aws', 'verify'): + verify = config['aws']['verify'] + if verify.lower() == 'false': + client_config['verify'] = False + log_message(LoggingScope.DEBUG, 'WARNING', "SSL verification disabled") + elif verify.lower() == 'true': + client_config['verify'] = True + else: + client_config['verify'] = verify # Assume it's a path to CA bundle + log_message(LoggingScope.DEBUG, 'DEBUG', "Using custom CA bundle: %s", verify) + + self.client = boto3.client( + 's3', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + **client_config + ) + log_message(LoggingScope.DEBUG, 'INFO', "Initialized S3 client for bucket: %s", self.bucket) + + @log_function_entry_exit() + def get_metadata(self, remote_path: str) -> Dict: + """ + Get metadata about an S3 object. 
+ + Args: + remote_path: Path to the object in S3 + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + try: + log_message(LoggingScope.DEBUG, 'DEBUG', "Getting metadata for S3 object: %s", remote_path) + response = self.client.head_object(Bucket=self.bucket, Key=remote_path) + log_message(LoggingScope.DEBUG, 'DEBUG', "Retrieved metadata for %s: %s", remote_path, response) + return response + except ClientError as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to get metadata for %s: %s", remote_path, str(e)) + raise + + def _get_etag_file_path(self, local_path: str) -> Path: + """Get the path to the .etag file for a given local file.""" + return Path(local_path).with_suffix('.etag') + + def _read_etag(self, local_path: str) -> Optional[str]: + """Read the ETag from the .etag file if it exists.""" + etag_path = self._get_etag_file_path(local_path) + if etag_path.exists(): + try: + with open(etag_path, 'r') as f: + return f.read().strip() + except Exception as e: + log_message(LoggingScope.DEBUG, 'WARNING', "Failed to read ETag file %s: %s", etag_path, str(e)) + return None + return None + + def _write_etag(self, local_path: str, etag: str) -> None: + """Write the ETag to the .etag file.""" + etag_path = self._get_etag_file_path(local_path) + try: + with open(etag_path, 'w') as f: + f.write(etag) + log_message(LoggingScope.DEBUG, 'DEBUG', "Wrote ETag to %s", etag_path) + except Exception as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to write ETag file %s: %s", etag_path, str(e)) + # If we can't write the etag file, it's not critical + # The file will just be downloaded again next time + + @log_function_entry_exit() + def download(self, remote_path: str, local_path: str) -> None: + """ + Download an S3 object to a local location and store its ETag. 
+ + Args: + remote_path: Path to the object in S3 + local_path: Local path where to save the file + """ + try: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading %s to %s", remote_path, local_path) + self.client.download_file(Bucket=self.bucket, Key=remote_path, Filename=local_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s to %s", remote_path, local_path) + except ClientError as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", remote_path, str(e)) + raise + + # Get metadata first to obtain the ETag + metadata = self.get_metadata(remote_path) + etag = metadata['ETag'] + + # Store the ETag + self._write_etag(local_path, etag) From bb6351afe6f26f2521dee3b2fa009830df0e97b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 16:57:29 +0200 Subject: [PATCH 033/218] fix imports --- scripts/automated_ingestion/eessi_data_object.py | 2 +- scripts/automated_ingestion/s3_client.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 45d36308..40c87ffa 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -7,7 +7,7 @@ import boto3 import configparser -from .utils import log_function_entry_exit, log_message, LoggingScope +from utils import log_function_entry_exit, log_message, LoggingScope class DownloadMode(Enum): """Enum defining different modes for downloading files.""" diff --git a/scripts/automated_ingestion/s3_client.py b/scripts/automated_ingestion/s3_client.py index e61a5ed7..c1ea2a71 100644 --- a/scripts/automated_ingestion/s3_client.py +++ b/scripts/automated_ingestion/s3_client.py @@ -1,10 +1,11 @@ -import boto3 -from typing import Dict, Optional import os from pathlib import Path +from typing import Dict, Optional + +import boto3 -from .utils import log_function_entry_exit, log_message, LoggingScope -from .eessi_data_object import RemoteStorageClient +from utils import log_function_entry_exit, log_message, LoggingScope +from eessi_data_object import RemoteStorageClient class EESSIS3Client(RemoteStorageClient): """EESSI-specific S3 client implementation of the RemoteStorageClient protocol.""" From 8e64cc986e2353d28e3f267c4012a29ba140322b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:04:55 +0200 Subject: [PATCH 034/218] add more details to function entry/leave logging --- scripts/automated_ingestion/utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 0786e22f..a61fed7f 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -167,19 +167,28 @@ def wrapper(*args, **kwargs): # Create indentation based on call stack depth indent = " " * _call_stack_depth + # Get file name and line number + frame = inspect.currentframe() + while frame.f_back: # Walk up the call stack to find the caller + frame = frame.f_back + file_name = os.path.basename(frame.f_code.co_filename) + line_no = frame.f_lineno + start_time = time.time() - log.info(f"{indent}Entering {func.__name__}{context}") + log.info(f"{indent}Entering {func.__name__} at {file_name}:{line_no}{context}") _call_stack_depth += 1 try: result = func(*args, **kwargs) _call_stack_depth -= 1 end_time = time.time() - log.info(f"{indent}Leaving 
{func.__name__}{context} (took {end_time - start_time:.2f}s)") + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{line_no}" + f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: _call_stack_depth -= 1 end_time = time.time() - log.info(f"{indent}Leaving {func.__name__}{context} with exception (took {end_time - start_time:.2f}s)") + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{line_no}" + f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper return decorator From c7a0254259e2f8b7f3215beb5783f29f792de0a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:08:22 +0200 Subject: [PATCH 035/218] fix details to function entry/leave logging --- scripts/automated_ingestion/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index a61fed7f..cf5681b6 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -167,12 +167,12 @@ def wrapper(*args, **kwargs): # Create indentation based on call stack depth indent = " " * _call_stack_depth - # Get file name and line number + # Get file name and line number of the decorated function frame = inspect.currentframe() - while frame.f_back: # Walk up the call stack to find the caller - frame = frame.f_back - file_name = os.path.basename(frame.f_code.co_filename) - line_no = frame.f_lineno + # Get the frame of the decorated function (one level up from the wrapper) + func_frame = frame.f_back + file_name = os.path.basename(func_frame.f_code.co_filename) + line_no = func_frame.f_lineno start_time = time.time() log.info(f"{indent}Entering {func.__name__} at {file_name}:{line_no}{context}") From a0c8d4d788b2ecfb5e24c124740f45ef3ed7f365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:11:06 +0200 Subject: [PATCH 036/218] show actual function for entry/leave logging --- scripts/automated_ingestion/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index cf5681b6..38bcc68d 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -167,12 +167,9 @@ def wrapper(*args, **kwargs): # Create indentation based on call stack depth indent = " " * _call_stack_depth - # Get file name and line number of the decorated function - frame = inspect.currentframe() - # Get the frame of the decorated function (one level up from the wrapper) - func_frame = frame.f_back - file_name = os.path.basename(func_frame.f_code.co_filename) - line_no = func_frame.f_lineno + # Get file name and line number where the function is defined + file_name = os.path.basename(inspect.getsourcefile(func)) + line_no = inspect.getsourcelines(func)[1] start_time = time.time() log.info(f"{indent}Entering {func.__name__} at {file_name}:{line_no}{context}") From e958b49dafae7d472ffeba5ab3f3d7d4405f10eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:38:40 +0200 Subject: [PATCH 037/218] print actual lines of entry or leave --- scripts/automated_ingestion/utils.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 38bcc68d..913f883e 100644 --- a/scripts/automated_ingestion/utils.py +++ 
b/scripts/automated_ingestion/utils.py @@ -169,22 +169,31 @@ def wrapper(*args, **kwargs): # Get file name and line number where the function is defined file_name = os.path.basename(inspect.getsourcefile(func)) - line_no = inspect.getsourcelines(func)[1] + source_lines, start_line = inspect.getsourcelines(func) + # Find the line with the actual function definition + def_line = next(i for i, line in enumerate(source_lines) if line.strip().startswith('def ')) + def_line_no = start_line + def_line start_time = time.time() - log.info(f"{indent}Entering {func.__name__} at {file_name}:{line_no}{context}") + log.info(f"{indent}Entering {func.__name__} at {file_name}:{def_line_no}{context}") _call_stack_depth += 1 try: result = func(*args, **kwargs) _call_stack_depth -= 1 end_time = time.time() - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{line_no}" + # Get the actual line where the function returned + frame = inspect.currentframe() + return_line_no = frame.f_back.f_lineno + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{return_line_no}" f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: _call_stack_depth -= 1 end_time = time.time() - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{line_no}" + # Get the actual line where the exception occurred + frame = inspect.currentframe() + exception_line_no = frame.f_back.f_lineno + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exception_line_no}" f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper From 59c722b14d0b9de239cb42539c4a7940b783ba19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 17:48:21 +0200 Subject: [PATCH 038/218] determine lineno when leaving a function --- scripts/automated_ingestion/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 913f883e..9a6007c3 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -183,7 +183,10 @@ def wrapper(*args, **kwargs): end_time = time.time() # Get the actual line where the function returned frame = inspect.currentframe() - return_line_no = frame.f_back.f_lineno + # Walk up the stack to find the frame of the decorated function + while frame.f_back and frame.f_back.f_code.co_name != func.__name__: + frame = frame.f_back + return_line_no = frame.f_lineno log.info(f"{indent}Leaving {func.__name__} at {file_name}:{return_line_no}" f"{context} (took {end_time - start_time:.2f}s)") return result @@ -192,7 +195,10 @@ def wrapper(*args, **kwargs): end_time = time.time() # Get the actual line where the exception occurred frame = inspect.currentframe() - exception_line_no = frame.f_back.f_lineno + # Walk up the stack to find the frame of the decorated function + while frame.f_back and frame.f_back.f_code.co_name != func.__name__: + frame = frame.f_back + exception_line_no = frame.f_lineno log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exception_line_no}" f"{context} with exception (took {end_time - start_time:.2f}s)") raise err From 39f49340fecde2fe4283ec2b6c54a65f19758cc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 18:06:40 +0200 Subject: [PATCH 039/218] may only show approx line when leaving a function --- scripts/automated_ingestion/utils.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git 
a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 9a6007c3..277c6777 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -173,6 +173,9 @@ def wrapper(*args, **kwargs): # Find the line with the actual function definition def_line = next(i for i, line in enumerate(source_lines) if line.strip().startswith('def ')) def_line_no = start_line + def_line + # Find the last non-empty line of the function + last_line = next(i for i, line in enumerate(reversed(source_lines)) if line.strip()) + last_line_no = start_line + len(source_lines) - 1 - last_line start_time = time.time() log.info(f"{indent}Entering {func.__name__} at {file_name}:{def_line_no}{context}") @@ -181,25 +184,19 @@ def wrapper(*args, **kwargs): result = func(*args, **kwargs) _call_stack_depth -= 1 end_time = time.time() - # Get the actual line where the function returned - frame = inspect.currentframe() - # Walk up the stack to find the frame of the decorated function - while frame.f_back and frame.f_back.f_code.co_name != func.__name__: - frame = frame.f_back - return_line_no = frame.f_lineno - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{return_line_no}" + # For normal returns, show the last line of the function + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{last_line_no}" f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: _call_stack_depth -= 1 end_time = time.time() - # Get the actual line where the exception occurred - frame = inspect.currentframe() - # Walk up the stack to find the frame of the decorated function - while frame.f_back and frame.f_back.f_code.co_name != func.__name__: - frame = frame.f_back - exception_line_no = frame.f_lineno - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exception_line_no}" + # For exceptions, try to get the line number from the exception + try: + exc_line_no = err.__traceback__.tb_lineno + except AttributeError: + exc_line_no = last_line_no + log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exc_line_no}" f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper From 12ea5ca62eea81b86ccee31adfad2bb90fd06f8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 21:39:35 +0200 Subject: [PATCH 040/218] always use own S3 bucket/client, code refactoring and improvements --- .../automated_ingestion.py | 43 ++++++++----------- .../automated_ingestion/eessi_data_object.py | 35 +-------------- scripts/automated_ingestion/eessitarball.py | 42 +++++++++--------- scripts/automated_ingestion/remote_storage.py | 34 +++++++++++++++ .../{s3_client.py => s3_bucket.py} | 30 +++++++++++-- 5 files changed, 101 insertions(+), 83 deletions(-) create mode 100644 scripts/automated_ingestion/remote_storage.py rename scripts/automated_ingestion/{s3_client.py => s3_bucket.py} (86%) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index ff628957..24799a54 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -2,6 +2,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 from pid import PidFileError from utils import log_function_entry_exit, log_message, LoggingScope, set_logging_scopes @@ -40,13 
+41,13 @@ def error(msg, code=1): sys.exit(code) -def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt'): +def find_tarballs(s3_bucket, extension='.tar.gz', metadata_extension='.meta.txt'): """ Return a list of all tarballs in an S3 bucket that have a metadata file with the given extension (and same filename). """ # TODO: list_objects_v2 only returns up to 1000 objects - s3_objects = s3.list_objects_v2(Bucket=bucket).get('Contents', []) + s3_objects = s3_bucket.list_objects_v2().get('Contents', []) files = [obj['Key'] for obj in s3_objects] tarballs = [ @@ -58,9 +59,9 @@ def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt @log_function_entry_exit() -def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): +def find_tarball_groups(s3_bucket, config, extension='.tar.gz', metadata_extension='.meta.txt'): """Return a dictionary of tarball groups, keyed by (repo, pr_number).""" - tarballs = find_tarballs(s3, bucket, extension, metadata_extension) + tarballs = find_tarballs(s3_bucket, extension, metadata_extension) groups = {} for tarball in tarballs: @@ -69,7 +70,7 @@ def find_tarball_groups(s3, bucket, config, extension='.tar.gz', metadata_extens local_metadata = os.path.join(config['paths']['download_dir'], os.path.basename(metadata_file)) try: - s3.download_file(bucket, metadata_file, local_metadata) + s3_bucket.download_file(metadata_file, local_metadata) with open(local_metadata, 'r') as meta: metadata = json.load(meta) repo = metadata['link2pr']['repo'] @@ -230,20 +231,16 @@ def main(): # TODO: check configuration: secrets, paths, permissions on dirs, etc gh_pat = config['secrets']['github_pat'] gh_staging_repo = github.Github(gh_pat).get_repo(config['github']['staging_repo']) - s3 = boto3.client( - 's3', - aws_access_key_id=config['secrets']['aws_access_key_id'], - aws_secret_access_key=config['secrets']['aws_secret_access_key'], - endpoint_url=config['aws']['endpoint_url'], - verify=config['aws']['verify_cert_path'], - ) buckets = json.loads(config['aws']['staging_buckets']) for bucket, cvmfs_repo in buckets.items(): + # Create our custom S3 bucket for this bucket + s3_bucket = EESSIS3Bucket(config, bucket) + if args.task_based: # Task-based listing extensions = args.task_based.split(',') - tasks = find_deployment_tasks(s3, bucket, extensions) + tasks = find_deployment_tasks(s3_bucket, extensions) if args.list_only: log_message(LoggingScope.GROUP_OPS, 'INFO', "#tasks: %d", len(tasks)) for num, task in enumerate(tasks): @@ -253,7 +250,7 @@ def main(): for task_path in tasks: try: # Create EESSIDataAndSignatureObject for the task file - task_obj = EESSIDataAndSignatureObject(config, task_path, s3) + task_obj = EESSIDataAndSignatureObject(config, task_path, s3_bucket) # Download the task file and its signature task_obj.download(mode=DownloadMode.CHECK_REMOTE) @@ -277,7 +274,7 @@ def main(): # Original tarball-based processing if config['github'].get('staging_pr_method', 'individual') == 'grouped': # use new grouped PR method - tarball_groups = find_tarball_groups(s3, bucket, config) + tarball_groups = find_tarball_groups(s3_bucket, config) if args.list_only: log_message(LoggingScope.GROUP_OPS, 'INFO', "#tarball_groups: %d", len(tarball_groups)) for (repo, pr_id), tarballs in tarball_groups.items(): @@ -286,30 +283,29 @@ def main(): for (repo, pr_id), tarballs in tarball_groups.items(): if tarballs: # Create a group for these tarballs - group = EessiTarballGroup(tarballs[0], config, 
gh_staging_repo, s3, bucket, cvmfs_repo) + group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3_bucket, cvmfs_repo) log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True)) group.process_group(tarballs) else: # use old individual PR method - tarballs = find_tarballs(s3, bucket) + tarballs = find_tarballs(s3_bucket) if args.list_only: for num, tarball in enumerate(tarballs): log_message(LoggingScope.GROUP_OPS, 'INFO', "[%s] %d: %s", bucket, num, tarball) else: for tarball in tarballs: - tar = EessiTarball(tarball, config, gh_staging_repo, s3, bucket, cvmfs_repo) + tar = EessiTarball(tarball, config, gh_staging_repo, s3_bucket, cvmfs_repo) tar.run_handler() @log_function_entry_exit() -def find_deployment_tasks(s3, bucket: str, extensions: List[str] = None) -> List[str]: +def find_deployment_tasks(s3_bucket, extensions: List[str] = None) -> List[str]: """ Return a list of all task files in an S3 bucket with the given extensions, but only if a corresponding payload file exists (same name without extension). Args: - s3: boto3 S3 client - bucket: Name of the S3 bucket to scan + s3_bucket: EESSIS3Bucket instance extensions: List of file extensions to look for (default: ['.task']) Returns: @@ -324,12 +320,11 @@ def find_deployment_tasks(s3, bucket: str, extensions: List[str] = None) -> List while True: # List objects with pagination if continuation_token: - response = s3.list_objects_v2( - Bucket=bucket, + response = s3_bucket.list_objects_v2( ContinuationToken=continuation_token ) else: - response = s3.list_objects_v2(Bucket=bucket) + response = s3_bucket.list_objects_v2() # Add files from this page files.extend([obj['Key'] for obj in response.get('Contents', [])]) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 40c87ffa..e12e40c5 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -1,44 +1,13 @@ import os from dataclasses import dataclass -from enum import Enum from pathlib import Path -from typing import Optional, Protocol, runtime_checkable +from typing import Optional import boto3 import configparser from utils import log_function_entry_exit, log_message, LoggingScope - -class DownloadMode(Enum): - """Enum defining different modes for downloading files.""" - FORCE = 'force' # Always download and overwrite - CHECK_REMOTE = 'check-remote' # Download if remote files have changed - CHECK_LOCAL = 'check-local' # Download if files don't exist locally (default) - - -@runtime_checkable -class RemoteStorageClient(Protocol): - """Protocol defining the interface for remote storage clients.""" - - def get_metadata(self, remote_path: str) -> dict: - """Get metadata about a remote object. - - Args: - remote_path: Path to the object in remote storage - - Returns: - Dictionary containing object metadata, including 'ETag' key - """ - ... - - def download(self, remote_path: str, local_path: str) -> None: - """Download a remote file to a local location. - - Args: - remote_path: Path to the object in remote storage - local_path: Local path where to save the file - """ - ... 
+from remote_storage import RemoteStorageClient, DownloadMode @dataclass diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index ab888964..eca6b67b 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -1,4 +1,5 @@ from utils import send_slack_message, sha256sum, log_function_entry_exit, log_message, LoggingScope +from s3_bucket import EESSIS3Bucket from pathlib import PurePosixPath @@ -19,7 +20,7 @@ class EessiTarball: """ @log_function_entry_exit() - def __init__(self, object_name, config, git_staging_repo, s3, bucket, cvmfs_repo): + def __init__(self, object_name, config, git_staging_repo, s3_bucket, cvmfs_repo): """Initialize the tarball object.""" self.config = config self.git_repo = git_staging_repo @@ -27,15 +28,14 @@ def __init__(self, object_name, config, git_staging_repo, s3, bucket, cvmfs_repo self.metadata_sig_file = self.metadata_file + config['signatures']['signature_file_extension'] self.object = object_name self.object_sig = object_name + config['signatures']['signature_file_extension'] - self.s3 = s3 - self.bucket = bucket + self.s3_bucket = s3_bucket self.cvmfs_repo = cvmfs_repo self.local_path = os.path.join(config['paths']['download_dir'], os.path.basename(object_name)) self.local_sig_path = self.local_path + config['signatures']['signature_file_extension'] self.local_metadata_path = self.local_path + config['paths']['metadata_file_extension'] self.local_metadata_sig_path = self.local_metadata_path + config['signatures']['signature_file_extension'] self.sig_verified = False - self.url = f'https://{bucket}.s3.amazonaws.com/{object_name}' + self.url = f'https://{s3_bucket.bucket}.s3.amazonaws.com/{object_name}' self.states = { 'new': {'handler': self.mark_new_tarball_as_staged, 'next_state': 'staged'}, @@ -67,14 +67,14 @@ def download(self, force=False): try: log_msg = "Downloading signature file %s to %s" log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, sig_object, local_sig_file) - self.s3.download_file(self.bucket, sig_object, local_sig_file) + self.s3_bucket.download_file(self.s3_bucket.bucket, sig_object, local_sig_file) except Exception as err: log_msg = 'Failed to download signature file %s for %s from %s to %s.' if self.config['signatures'].getboolean('signatures_required', True): log_msg += '\nException: %s' log_message( LoggingScope.ERROR, 'ERROR', log_msg, - sig_object, object, self.bucket, local_sig_file, err + sig_object, object, self.s3_bucket.bucket, local_sig_file, err ) skip = True break @@ -84,15 +84,15 @@ def download(self, force=False): log_msg += '\nException: %s' log_message( LoggingScope.DOWNLOAD, 'WARNING', log_msg, - sig_object, object, self.bucket, local_sig_file, err + sig_object, object, self.s3_bucket.bucket, local_sig_file, err ) # Now we download the file itself. try: log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading file %s to %s", object, local_file) - self.s3.download_file(self.bucket, object, local_file) + self.s3_bucket.download_file(self.s3_bucket.bucket, object, local_file) except Exception as err: log_msg = 'Failed to download %s from %s to %s.\nException: %s' - log_message(LoggingScope.ERROR, 'ERROR', log_msg, object, self.bucket, local_file, err) + log_message(LoggingScope.ERROR, 'ERROR', log_msg, object, self.s3_bucket.bucket, local_file, err) skip = True break # If any required download failed, make sure to skip this tarball completely. 
@@ -201,7 +201,7 @@ def to_string(self, oneline=False): str = f"tarball: {self.object}" sep = "\n" if not oneline else "," str += f"{sep} metadt: {self.metadata_file}" - str += f"{sep} bucket: {self.bucket}" + str += f"{sep} bucket: {self.s3_bucket.bucket}" str += f"{sep} cvmfs.: {self.cvmfs_repo}" str += f"{sep} GHrepo: {self.git_repo}" return str @@ -480,7 +480,7 @@ def make_approval_request(self, tarballs_in_group=None): log_msg = "Moving metadata for %d tarballs to staged" log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, len(tarballs_in_group)) for tarball in tarballs_in_group: - temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) temp_tar.move_metadata_file(self.state, next_state, branch=git_branch) # Create PR with appropriate template @@ -502,8 +502,7 @@ def make_approval_request(self, tarballs_in_group=None): tar_overviews = [] for tarball in tarballs_in_group: try: - temp_tar = EessiTarball( - tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) temp_tar.download() overview = temp_tar.get_contents_overview() tar_details_tpl = "
\n<summary>Contents of %s</summary>\n\n%s\n</details>
\n" @@ -669,19 +668,18 @@ def get_link2pr_info(self): class EessiTarballGroup: """Class to handle a group of tarballs that share the same link2pr information.""" - def __init__(self, first_tarball, config, git_staging_repo, s3, bucket, cvmfs_repo): + def __init__(self, first_tarball, config, git_staging_repo, s3_bucket, cvmfs_repo): """Initialize with the first tarball in the group.""" - self.first_tar = EessiTarball(first_tarball, config, git_staging_repo, s3, bucket, cvmfs_repo) + self.first_tar = EessiTarball(first_tarball, config, git_staging_repo, s3_bucket, cvmfs_repo) self.config = config self.git_repo = git_staging_repo - self.s3 = s3 - self.bucket = bucket + self.s3_bucket = s3_bucket self.cvmfs_repo = cvmfs_repo def download_tarballs_and_more(self, tarballs): """Download all files associated with this group of tarballs.""" for tarball in tarballs: - temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) log_message(LoggingScope.GROUP_OPS, 'INFO', "downloading files for '%s'", temp_tar.object) temp_tar.download(force=True) if not temp_tar.local_path or not temp_tar.local_metadata_path: @@ -710,7 +708,7 @@ def process_group(self, tarballs): for tarball in tarballs[1:]: log_msg = "Processing tarball in group: %s" log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, tarball) - temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) temp_tar.mark_new_tarball_as_staged('main') # Process the group for approval, only works correctly if first tarball is already in state 'staged' @@ -722,8 +720,8 @@ def to_string(self, oneline=False): sep = "\n" if not oneline else "," str += f"{sep} config: {self.config}" str += f"{sep} GHrepo: {self.git_repo}" - str += f"{sep} s3....: {self.s3}" - str += f"{sep} bucket: {self.bucket}" + str += f"{sep} s3....: {self.s3_bucket}" + str += f"{sep} bucket: {self.s3_bucket.bucket}" str += f"{sep} cvmfs.: {self.cvmfs_repo}" return str @@ -732,7 +730,7 @@ def verify_group_consistency(self, tarballs): first_repo, first_pr = self.first_tar.get_link2pr_info() for tarball in tarballs[1:]: # Skip first tarball as we already have its info - temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3, self.bucket, self.cvmfs_repo) + temp_tar = EessiTarball(tarball, self.config, self.git_repo, self.s3_bucket, self.cvmfs_repo) log_message(LoggingScope.DEBUG, 'DEBUG', "temp tar: %s", temp_tar.to_string()) repo, pr = temp_tar.get_link2pr_info() if repo != first_repo or pr != first_pr: diff --git a/scripts/automated_ingestion/remote_storage.py b/scripts/automated_ingestion/remote_storage.py new file mode 100644 index 00000000..ac005af8 --- /dev/null +++ b/scripts/automated_ingestion/remote_storage.py @@ -0,0 +1,34 @@ +from enum import Enum +from typing import Protocol, runtime_checkable + + +class DownloadMode(Enum): + """Enum defining different modes for downloading files.""" + FORCE = 'force' # Always download and overwrite + CHECK_REMOTE = 'check-remote' # Download if remote files have changed + CHECK_LOCAL = 'check-local' # Download if files don't exist locally (default) + + +@runtime_checkable +class RemoteStorageClient(Protocol): + """Protocol defining the interface for remote storage clients.""" + + def get_metadata(self, remote_path: str) -> dict: + """Get metadata about a remote 
object. + + Args: + remote_path: Path to the object in remote storage + + Returns: + Dictionary containing object metadata, including 'ETag' key + """ + ... + + def download(self, remote_path: str, local_path: str) -> None: + """Download a remote file to a local location. + + Args: + remote_path: Path to the object in remote storage + local_path: Local path where to save the file + """ + ... \ No newline at end of file diff --git a/scripts/automated_ingestion/s3_client.py b/scripts/automated_ingestion/s3_bucket.py similarity index 86% rename from scripts/automated_ingestion/s3_client.py rename to scripts/automated_ingestion/s3_bucket.py index c1ea2a71..52e9b0d2 100644 --- a/scripts/automated_ingestion/s3_client.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -5,15 +5,15 @@ import boto3 from utils import log_function_entry_exit, log_message, LoggingScope -from eessi_data_object import RemoteStorageClient +from remote_storage import RemoteStorageClient -class EESSIS3Client(RemoteStorageClient): - """EESSI-specific S3 client implementation of the RemoteStorageClient protocol.""" +class EESSIS3Bucket(RemoteStorageClient): + """EESSI-specific S3 bucket implementation of the RemoteStorageClient protocol.""" @log_function_entry_exit() def __init__(self, config, bucket_name: str): """ - Initialize the EESSI S3 client. + Initialize the EESSI S3 bucket. Args: config: Configuration object containing: @@ -60,6 +60,28 @@ def __init__(self, config, bucket_name: str): ) log_message(LoggingScope.DEBUG, 'INFO', "Initialized S3 client for bucket: %s", self.bucket) + def list_objects_v2(self, **kwargs): + """ + List objects in the bucket using the underlying boto3 client. + + Args: + **kwargs: Additional arguments to pass to boto3.client.list_objects_v2 + + Returns: + Response from boto3.client.list_objects_v2 + """ + return self.client.list_objects_v2(Bucket=self.bucket, **kwargs) + + def download_file(self, key: str, filename: str) -> None: + """ + Download a file from S3 to a local file. 
+ + Args: + key: The S3 key of the file to download + filename: The local path where the file should be saved + """ + self.client.download_file(self.bucket, key, filename) + @log_function_entry_exit() def get_metadata(self, remote_path: str) -> Dict: """ From fe6b29c879186e0b95e69ece275b24e38a037d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 21:45:28 +0200 Subject: [PATCH 041/218] fix aws config key names --- scripts/automated_ingestion/s3_bucket.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/s3_bucket.py b/scripts/automated_ingestion/s3_bucket.py index 52e9b0d2..0e91a925 100644 --- a/scripts/automated_ingestion/s3_bucket.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -29,8 +29,8 @@ def __init__(self, config, bucket_name: str): self.bucket = bucket_name # Get AWS credentials from environment or config - aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('aws', 'access_key_id') - aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('aws', 'secret_access_key') + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('aws', 'aws_access_key_id') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('aws', 'aws_secret_access_key') # Configure boto3 client client_config = {} From 01f73676becc0d0a3eeaee6947431e6658a2ce86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 21:47:48 +0200 Subject: [PATCH 042/218] fix section name for secrets --- scripts/automated_ingestion/s3_bucket.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/s3_bucket.py b/scripts/automated_ingestion/s3_bucket.py index 0e91a925..79b8a055 100644 --- a/scripts/automated_ingestion/s3_bucket.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -29,8 +29,8 @@ def __init__(self, config, bucket_name: str): self.bucket = bucket_name # Get AWS credentials from environment or config - aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('aws', 'aws_access_key_id') - aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('aws', 'aws_secret_access_key') + aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID') or config.get('secrets', 'aws_access_key_id') + aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY') or config.get('secrets', 'aws_secret_access_key') # Configure boto3 client client_config = {} From 9e84165986326da62065c351825a034597dabc02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:16:57 +0200 Subject: [PATCH 043/218] optimize download and improve choosing log scope and level --- .../automated_ingestion/eessi_data_object.py | 60 ++++++++++++++----- scripts/automated_ingestion/utils.py | 11 +++- 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index e12e40c5..902417aa 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -98,32 +98,62 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: should_download = True log_message(LoggingScope.DOWNLOAD, 'INFO', "Forcing download of %s", self.remote_file_path) elif mode == DownloadMode.CHECK_REMOTE: - remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] - remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] 
+ # First check if we have local ETags local_file_etag = self._get_local_etag(self.local_file_path) local_sig_etag = self._get_local_etag(self.local_sig_path) - should_download = ( - remote_file_etag != local_file_etag or - remote_sig_etag != local_sig_etag - ) - if should_download: - log_msg = "Remote files have changed, downloading %s" - log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + if local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") + if local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") + + # If we don't have local ETags, we need to download + if not local_file_etag or not local_sig_etag: + should_download = True + log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", + self.remote_file_path) else: - log_msg = "Remote files unchanged, skipping download of %s" - log_message(LoggingScope.DOWNLOAD, 'DEBUG', log_msg, self.remote_file_path) + # Get remote ETags and compare + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + if remote_file_etag != local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", + local_file_etag, remote_file_etag) + if remote_sig_etag != local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", + local_sig_etag, remote_sig_etag) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", + self.remote_file_path) else: # CHECK_LOCAL should_download = ( not self.local_file_path.exists() or not self.local_sig_path.exists() ) if should_download: - log_msg = "Local files missing, downloading %s" - log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) + if not self.local_file_path.exists(): + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local file missing: %s", self.local_file_path) + if not self.local_sig_path.exists(): + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local signature missing: %s", self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, downloading %s", + self.remote_file_path) else: - log_msg = "Local files exist, skipping download of %s" - log_message(LoggingScope.DOWNLOAD, 'DEBUG', log_msg, self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files exist, skipping download of %s", + self.remote_file_path) if not should_download: return False diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 277c6777..e774eafe 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -221,7 +221,9 @@ def wrapper(*args, **kwargs): def log_message(scope, level, msg, *args, logger=None, **kwargs): """ - Log a message if the specified scope 
is enabled. + Log a message if either: + 1. The specified scope is enabled, OR + 2. The current log level is equal to or higher than the specified level Args: scope: LoggingScope value indicating which scope this logging belongs to @@ -230,10 +232,13 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): logger: Optional logger instance. If not provided, uses the root logger. *args, **kwargs: Additional arguments to pass to the logging function """ - if not is_logging_scope_enabled(scope): + log = logger or logging.getLogger() + log_level = getattr(logging, level.upper()) + + # Check if either condition is met + if not (is_logging_scope_enabled(scope) or log_level >= log.getEffectiveLevel()): return - log = logger or logging.getLogger() log_func = getattr(log, level.lower()) log_func(msg, *args, **kwargs) From 550de8881eac901c1fb4e6b2c7586a4cecc5265e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:36:55 +0200 Subject: [PATCH 044/218] fix logging logic and remove obsolete decorator --- scripts/automated_ingestion/utils.py | 39 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index e774eafe..e4070458 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -7,6 +7,7 @@ import os import inspect from enum import IntFlag, auto +import sys class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" @@ -202,23 +203,6 @@ def wrapper(*args, **kwargs): return wrapper return decorator -def log_with_scope(scope, logger=None): - """ - Decorator that checks if a specific logging scope is enabled before logging. - - Args: - scope: LoggingScope value indicating which scope this logging belongs to - logger: Optional logger instance. If not provided, uses the root logger. 
- """ - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - if not is_logging_scope_enabled(scope): - return func(*args, **kwargs) - return func(*args, **kwargs) - return wrapper - return decorator - def log_message(scope, level, msg, *args, logger=None, **kwargs): """ Log a message if either: @@ -239,8 +223,25 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): if not (is_logging_scope_enabled(scope) or log_level >= log.getEffectiveLevel()): return - log_func = getattr(log, level.lower()) - log_func(msg, *args, **kwargs) + # Create indentation based on call stack depth + indent = " " * _call_stack_depth + indented_msg = f"{indent}{msg}" + + # If scope is enabled, bypass the logger's level check + if is_logging_scope_enabled(scope): + # Create a temporary handler that accepts all levels + temp_handler = logging.StreamHandler(sys.stdout) + temp_handler.setLevel(logging.DEBUG) + log.addHandler(temp_handler) + try: + log_func = getattr(log, level.lower()) + log_func(indented_msg, *args, **kwargs) + finally: + log.removeHandler(temp_handler) + else: + # Use normal logging with level check + log_func = getattr(log, level.lower()) + log_func(indented_msg, *args, **kwargs) # Example usage: # log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading file: %s", filename) From dd5fcd4757c4a8a3d1321ff8f8fcadcd0a0cdc70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:40:46 +0200 Subject: [PATCH 045/218] make sure to use full log format incl level --- scripts/automated_ingestion/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index e4070458..4f784f95 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -232,6 +232,11 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) temp_handler.setLevel(logging.DEBUG) + # Use the same format as the root logger's handlers + if log.handlers: + temp_handler.setFormatter(log.handlers[0].formatter) + else: + temp_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) log.addHandler(temp_handler) try: log_func = getattr(log, level.lower()) From bc184d84a81cf06f46294532c20c1c4e2cbe128e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:48:13 +0200 Subject: [PATCH 046/218] use fixed length levelname --- scripts/automated_ingestion/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 4f784f95..d69d1530 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -232,11 +232,15 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) temp_handler.setLevel(logging.DEBUG) - # Use the same format as the root logger's handlers + # Use the same format as the root logger's handlers but with fixed-width level names if log.handlers: - temp_handler.setFormatter(log.handlers[0].formatter) + # Get the original format string + orig_format = log.handlers[0].formatter._fmt + # Replace %(levelname)s with %(levelname)-8s to make it fixed width + new_format = orig_format.replace('%(levelname)s', '%(levelname)-8s') + 
temp_handler.setFormatter(logging.Formatter(new_format)) else: - temp_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) + temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) log.addHandler(temp_handler) try: log_func = getattr(log, level.lower()) From 6ed3bdc3c8499c23814d87a7e9fdf1015f3dab90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 22:50:24 +0200 Subject: [PATCH 047/218] use fixed length levelname everywhere --- scripts/automated_ingestion/automated_ingestion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 24799a54..25bc733a 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -196,8 +196,8 @@ def setup_logging(config, args): logger.setLevel(logging.DEBUG) # Set root logger to lowest level # Create formatters - console_formatter = logging.Formatter(log_format) - file_formatter = logging.Formatter('%(asctime)s - ' + log_format) + console_formatter = logging.Formatter('%(levelname)-8s: %(message)s') + file_formatter = logging.Formatter('%(asctime)s - %(levelname)-8s: %(message)s') # Console handler (only if not quiet) if not args.quiet: From 28558b5a8ca99065db9f8e4282bf04a8ab8da76d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 2 May 2025 23:50:34 +0200 Subject: [PATCH 048/218] improve handling of download errors and cleanup of partially downloaded files --- .../automated_ingestion/eessi_data_object.py | 144 ++++++++++++------ 1 file changed, 99 insertions(+), 45 deletions(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 902417aa..8b60099c 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -99,46 +99,51 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: log_message(LoggingScope.DOWNLOAD, 'INFO', "Forcing download of %s", self.remote_file_path) elif mode == DownloadMode.CHECK_REMOTE: # First check if we have local ETags - local_file_etag = self._get_local_etag(self.local_file_path) - local_sig_etag = self._get_local_etag(self.local_sig_path) + try: + local_file_etag = self._get_local_etag(self.local_file_path) + local_sig_etag = self._get_local_etag(self.local_sig_path) - if local_file_etag: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) - else: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") - if local_sig_etag: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) - else: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") + if local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") + if local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") - # If we don't have local ETags, we need to download - if not local_file_etag or not local_sig_etag: - should_download = True - log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", - self.remote_file_path) - else: - # Get remote ETags and compare - 
remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] - remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) - - should_download = ( - remote_file_etag != local_file_etag or - remote_sig_etag != local_sig_etag - ) - if should_download: - if remote_file_etag != local_file_etag: - log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", - local_file_etag, remote_file_etag) - if remote_sig_etag != local_sig_etag: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", - local_sig_etag, remote_sig_etag) - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", + # If we don't have local ETags, we need to download + if not local_file_etag or not local_sig_etag: + should_download = True + log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", self.remote_file_path) else: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", - self.remote_file_path) + # Get remote ETags and compare + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + if remote_file_etag != local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", + local_file_etag, remote_file_etag) + if remote_sig_etag != local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", + local_sig_etag, remote_sig_etag) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", + self.remote_file_path) + except Exception as etag_err: + # If we get any error with ETags, we'll just download the files + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", str(etag_err)) + should_download = True else: # CHECK_LOCAL should_download = ( not self.local_file_path.exists() or @@ -163,24 +168,73 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: # Download files try: + # Download the main file first self.remote_client.download(self.remote_file_path, str(self.local_file_path)) - self.remote_client.download(self.remote_sig_path, str(self.local_sig_path)) - # Log the ETags of downloaded files - file_etag = self._get_local_etag(self.local_file_path) - sig_etag = self._get_local_etag(self.local_sig_path) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", self.remote_file_path, file_etag) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", self.remote_sig_path, sig_etag) + # Get and log the ETag of the downloaded file + try: + file_etag = self._get_local_etag(self.local_file_path) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", + self.remote_file_path, file_etag) + except Exception as etag_err: 
+ log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s", + self.remote_file_path, str(etag_err)) + + # Try to download the signature file + try: + self.remote_client.download(self.remote_sig_path, str(self.local_sig_path)) + try: + sig_etag = self._get_local_etag(self.local_sig_path) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", + self.remote_sig_path, sig_etag) + except Exception as etag_err: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s", + self.remote_sig_path, str(etag_err)) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s and its signature", + self.remote_file_path) + except Exception as sig_err: + # Check if signatures are required + if self.config['signatures'].getboolean('signatures_required', True): + # If signatures are required, clean up everything since we can't proceed + if self.local_file_path.exists(): + self.local_file_path.unlink() + # Clean up etag files regardless of whether their data files exist + file_etag_path = self._get_etag_file_path(self.local_file_path) + if file_etag_path.exists(): + file_etag_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.ERROR, 'ERROR', "Failed to download required signature for %s: %s", + self.remote_file_path, str(sig_err)) + raise + else: + # If signatures are optional, just clean up any partial signature files + if self.local_sig_path.exists(): + self.local_sig_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() + log_message(LoggingScope.DOWNLOAD, 'WARNING', "Failed to download optional signature for %s: %s", + self.remote_file_path, str(sig_err)) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s (signature optional)", + self.remote_file_path) - log_msg = "Successfully downloaded %s and its signature" - log_message(LoggingScope.DOWNLOAD, 'INFO', log_msg, self.remote_file_path) return True except Exception as err: - # Clean up partially downloaded files + # This catch block is only for errors in the main file download + # Clean up partially downloaded files and their etags if self.local_file_path.exists(): self.local_file_path.unlink() if self.local_sig_path.exists(): self.local_sig_path.unlink() + # Clean up etag files regardless of whether their data files exist + file_etag_path = self._get_etag_file_path(self.local_file_path) + if file_etag_path.exists(): + file_etag_path.unlink() + sig_etag_path = self._get_etag_file_path(self.local_sig_path) + if sig_etag_path.exists(): + sig_etag_path.unlink() log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", self.remote_file_path, str(err)) raise From 7cf615ef3307b54915f3cd18ae3b82387e703b39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 00:47:20 +0200 Subject: [PATCH 049/218] add task description and verification of signatures --- .../automated_ingestion.py | 18 +++-- .../automated_ingestion/eessi_data_object.py | 69 ++++++++++++++++++ .../eessi_task_description.py | 71 +++++++++++++++++++ 3 files changed, 151 insertions(+), 7 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_task_description.py diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 25bc733a..85afdffd 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ 
b/scripts/automated_ingestion/automated_ingestion.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from eessitarball import EessiTarball, EessiTarballGroup -from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode, EESSITaskDescription from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 from pid import PidFileError @@ -249,18 +249,22 @@ def main(): # Process each task file for task_path in tasks: try: - # Create EESSIDataAndSignatureObject for the task file - task_obj = EESSIDataAndSignatureObject(config, task_path, s3_bucket) + # Create EESSITaskDescription for the task file + task_description = EESSITaskDescription( + EESSIDataAndSignatureObject(config, task_path, s3_bucket) + ) - # Download the task file and its signature - task_obj.download(mode=DownloadMode.CHECK_REMOTE) + # Log information about the task + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_description.task_object.local_file_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_description.task_object.local_sig_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", task_description.signature_verified) # Log the ETags of the downloaded task file - file_etag, sig_etag = task_obj.get_etags() + file_etag, sig_etag = task_description.task_object.get_etags() log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", - task_obj.remote_sig_path, sig_etag) + task_description.task_object.remote_sig_path, sig_etag) # TODO: Process the task file contents # This would involve reading the task file, parsing its contents, diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 8b60099c..97867402 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -1,4 +1,5 @@ import os +import subprocess from dataclasses import dataclass from pathlib import Path from typing import Optional @@ -83,6 +84,74 @@ def get_etags(self) -> tuple[Optional[str], Optional[str]]: self._get_local_etag(self.local_sig_path) ) + @log_function_entry_exit() + def verify_signature(self) -> bool: + """ + Verify the signature of the data file using the corresponding signature file. 
+ + Returns: + bool: True if the signature is valid or if signatures are not required, False otherwise + """ + # Check if signature file exists + if not self.local_sig_path.exists(): + log_message(LoggingScope.VERIFICATION, 'WARNING', "Signature file %s is missing", + self.local_sig_path) + + # If signatures are required, return failure + if self.config['signatures'].getboolean('signatures_required', True): + log_message(LoggingScope.ERROR, 'ERROR', "Signature file %s is missing and signatures are required", + self.local_sig_path) + return False + else: + log_message(LoggingScope.VERIFICATION, 'INFO', + "Signature file %s is missing, but signatures are not required", + self.local_sig_path) + return True + + # If signatures are provided, we should always verify them, regardless of the signatures_required setting + verify_runenv = self.config['signatures']['signature_verification_runenv'].split() + verify_script = self.config['signatures']['signature_verification_script'] + allowed_signers_file = self.config['signatures']['allowed_signers_file'] + + # Check if verification tools exist + if not Path(verify_script).exists(): + log_message(LoggingScope.ERROR, 'ERROR', + "Unable to verify signature: verification script %s does not exist", verify_script) + return False + + if not Path(allowed_signers_file).exists(): + log_message(LoggingScope.ERROR, 'ERROR', + "Unable to verify signature: allowed signers file %s does not exist", allowed_signers_file) + return False + + # Run the verification command with named parameters + cmd = verify_runenv + [ + verify_script, + '--verify', + '--allowed-signers-file', allowed_signers_file, + '--file', str(self.local_file_path), + '--signature-file', str(self.local_sig_path) + ] + log_message(LoggingScope.VERIFICATION, 'INFO', "Running command: %s", ' '.join(cmd)) + + try: + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + log_message(LoggingScope.VERIFICATION, 'INFO', + "Successfully verified signature for %s", self.local_file_path) + return True + else: + log_message(LoggingScope.ERROR, 'ERROR', + "Signature verification failed for %s", self.local_file_path) + log_message(LoggingScope.ERROR, 'ERROR', " stdout: %s", result.stdout) + log_message(LoggingScope.ERROR, 'ERROR', " stderr: %s", result.stderr) + return False + except Exception as e: + log_message(LoggingScope.ERROR, 'ERROR', + "Error during signature verification for %s: %s", + self.local_file_path, str(e)) + return False + @log_function_entry_exit() def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: """ diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py new file mode 100644 index 00000000..a958fa4d --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -0,0 +1,71 @@ +import json +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Optional + +from eessi_data_object import EESSIDataAndSignatureObject +from utils import log_function_entry_exit, log_message, LoggingScope + + +@dataclass +class EESSITaskDescription: + """Class representing an EESSI task to be performed, including its metadata and associated data files.""" + + # The EESSI data and signature object associated with this task + task_object: EESSIDataAndSignatureObject + + # Whether the signature was successfully verified + signature_verified: bool = False + + # Metadata from the task description file + metadata: Dict[str, Any] 
= None + + @log_function_entry_exit() + def __init__(self, task_object: EESSIDataAndSignatureObject): + """ + Initialize an EESSITaskDescription object. + + Args: + task_object: The EESSI data and signature object associated with this task + """ + self.task_object = task_object + self.metadata = {} + + # Verify signature and set initial state + self.signature_verified = self.task_object.verify_signature() + + # Try to read metadata (will only succeed if signature is verified) + try: + self._read_metadata() + except RuntimeError: + # Expected if signature is not verified yet + pass + + @log_function_entry_exit() + def _read_metadata(self) -> None: + """ + Internal method to read and parse the metadata from the task description file. + Only reads metadata if the signature has been verified. + """ + if not self.signature_verified: + log_message(LoggingScope.ERROR, 'ERROR', "Cannot read metadata: signature not verified for %s", + self.task_object.local_file_path) + raise RuntimeError("Cannot read metadata: signature not verified") + + try: + with open(self.task_object.local_file_path, 'r') as f: + self.metadata = json.load(f) + log_message(LoggingScope.DEBUG, 'DEBUG', "Successfully read metadata from %s", self.task_object.local_file_path) + except json.JSONDecodeError as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to parse JSON in task description file %s: %s", + self.task_object.local_file_path, str(e)) + raise + except Exception as e: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to read task description file %s: %s", + self.task_object.local_file_path, str(e)) + raise + + def __str__(self) -> str: + """Return a string representation of the EESSITaskDescription object.""" + return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" \ No newline at end of file From aa548dcd106da8a2585f423c4c9af0b3afebca96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 00:49:47 +0200 Subject: [PATCH 050/218] fix import of EESSITaskDescription --- scripts/automated_ingestion/automated_ingestion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 85afdffd..4a4d63ac 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 from eessitarball import EessiTarball, EessiTarballGroup -from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode, EESSITaskDescription +from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from eessi_task_description import EESSITaskDescription from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 from pid import PidFileError From 3e8f8bdd656b7a34a8c22047a52403efd6b191bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 00:57:55 +0200 Subject: [PATCH 051/218] add more log output for verification --- scripts/automated_ingestion/automated_ingestion.py | 2 +- scripts/automated_ingestion/eessi_data_object.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 4a4d63ac..f68a7373 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -270,7 +270,7 @@ def main(): # TODO: Process the 
task file contents # This would involve reading the task file, parsing its contents, # and performing the required actions based on the task type - log_message(LoggingScope.GROUP_OPS, 'INFO', "Processing task file: %s", task_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path) except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to process task %s: %s", task_path, str(err)) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 97867402..aca00adf 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -139,6 +139,8 @@ def verify_signature(self) -> bool: if result.returncode == 0: log_message(LoggingScope.VERIFICATION, 'INFO', "Successfully verified signature for %s", self.local_file_path) + log_message(LoggingScope.VERIFICATION, 'DEBUG', " stdout: %s", result.stdout) + log_message(LoggingScope.VERIFICATION, 'DEBUG', " stderr: %s", result.stderr) return True else: log_message(LoggingScope.ERROR, 'ERROR', From e51200d40f9df34f6159bb09318669a6a3512948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 01:05:58 +0200 Subject: [PATCH 052/218] add scopes to log messages and avoid message duplicates --- scripts/automated_ingestion/utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index d69d1530..18581d5c 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -179,14 +179,14 @@ def wrapper(*args, **kwargs): last_line_no = start_line + len(source_lines) - 1 - last_line start_time = time.time() - log.info(f"{indent}Entering {func.__name__} at {file_name}:{def_line_no}{context}") + log.info(f"{indent}[FUNC_ENTRY_EXIT] Entering {func.__name__} at {file_name}:{def_line_no}{context}") _call_stack_depth += 1 try: result = func(*args, **kwargs) _call_stack_depth -= 1 end_time = time.time() # For normal returns, show the last line of the function - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{last_line_no}" + log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{last_line_no}" f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: @@ -197,7 +197,7 @@ def wrapper(*args, **kwargs): exc_line_no = err.__traceback__.tb_lineno except AttributeError: exc_line_no = last_line_no - log.info(f"{indent}Leaving {func.__name__} at {file_name}:{exc_line_no}" + log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{exc_line_no}" f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper @@ -225,7 +225,9 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # Create indentation based on call stack depth indent = " " * _call_stack_depth - indented_msg = f"{indent}{msg}" + # Add scope to the message + scoped_msg = f"[{scope.name}] {msg}" + indented_msg = f"{indent}{scoped_msg}" # If scope is enabled, bypass the logger's level check if is_logging_scope_enabled(scope): @@ -247,7 +249,7 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): log_func(indented_msg, *args, **kwargs) finally: log.removeHandler(temp_handler) - else: + elif log_level >= log.getEffectiveLevel(): # Use normal logging with level check log_func = getattr(log, level.lower()) log_func(indented_msg, *args, **kwargs) From 
bc9453747780a270b14cd8ffd2c359c8a9297ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 01:10:01 +0200 Subject: [PATCH 053/218] next attempt to avoid message duplicates --- scripts/automated_ingestion/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 18581d5c..0cc4ba59 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -229,7 +229,7 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): scoped_msg = f"[{scope.name}] {msg}" indented_msg = f"{indent}{scoped_msg}" - # If scope is enabled, bypass the logger's level check + # If scope is enabled, use the temporary handler if is_logging_scope_enabled(scope): # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) @@ -249,7 +249,8 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): log_func(indented_msg, *args, **kwargs) finally: log.removeHandler(temp_handler) - elif log_level >= log.getEffectiveLevel(): + # Only use normal logging if scope is not enabled AND level is high enough + elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): # Use normal logging with level check log_func = getattr(log, level.lower()) log_func(indented_msg, *args, **kwargs) From f9aa559da268cb92a5412c0e00f32a039a61463c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 3 May 2025 01:15:52 +0200 Subject: [PATCH 054/218] temporarily disable standard loggers --- .../automated_ingestion.py | 5 +++++ scripts/automated_ingestion/utils.py | 21 +++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index f68a7373..94cf9439 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -196,6 +196,9 @@ def setup_logging(config, args): logger = logging.getLogger() logger.setLevel(logging.DEBUG) # Set root logger to lowest level + # Store original handlers + logger._original_handlers = [] + # Create formatters console_formatter = logging.Formatter('%(levelname)-8s: %(message)s') file_formatter = logging.Formatter('%(asctime)s - %(levelname)-8s: %(message)s') @@ -206,6 +209,7 @@ def setup_logging(config, args): console_handler.setLevel(console_level) console_handler.setFormatter(console_formatter) logger.addHandler(console_handler) + logger._original_handlers.append(console_handler) # File handler (if log file is specified) if log_file: @@ -217,6 +221,7 @@ def setup_logging(config, args): file_handler.setLevel(file_level) file_handler.setFormatter(file_formatter) logger.addHandler(file_handler) + logger._original_handlers.append(file_handler) return logger diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 0cc4ba59..da0bf220 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -231,24 +231,27 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # If scope is enabled, use the temporary handler if is_logging_scope_enabled(scope): + # Remove all existing handlers + for handler in log.handlers[:]: + log.removeHandler(handler) + # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) temp_handler.setLevel(logging.DEBUG) - # Use 
the same format as the root logger's handlers but with fixed-width level names - if log.handlers: - # Get the original format string - orig_format = log.handlers[0].formatter._fmt - # Replace %(levelname)s with %(levelname)-8s to make it fixed width - new_format = orig_format.replace('%(levelname)s', '%(levelname)-8s') - temp_handler.setFormatter(logging.Formatter(new_format)) - else: - temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) + temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) log.addHandler(temp_handler) + try: log_func = getattr(log, level.lower()) log_func(indented_msg, *args, **kwargs) finally: log.removeHandler(temp_handler) + # Restore original handlers + for handler in log.handlers[:]: + log.removeHandler(handler) + if hasattr(log, '_original_handlers'): + for handler in log._original_handlers: + log.addHandler(handler) # Only use normal logging if scope is not enabled AND level is high enough elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): # Use normal logging with level check From 0f0dfca3e1d3726cbaa9bd8d6ac33038c186c246 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 07:52:53 +0200 Subject: [PATCH 055/218] change handling of temporary log handler for scopes --- .../automated_ingestion.py | 5 ----- scripts/automated_ingestion/utils.py | 21 ++++++++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 94cf9439..f68a7373 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -196,9 +196,6 @@ def setup_logging(config, args): logger = logging.getLogger() logger.setLevel(logging.DEBUG) # Set root logger to lowest level - # Store original handlers - logger._original_handlers = [] - # Create formatters console_formatter = logging.Formatter('%(levelname)-8s: %(message)s') file_formatter = logging.Formatter('%(asctime)s - %(levelname)-8s: %(message)s') @@ -209,7 +206,6 @@ def setup_logging(config, args): console_handler.setLevel(console_level) console_handler.setFormatter(console_formatter) logger.addHandler(console_handler) - logger._original_handlers.append(console_handler) # File handler (if log file is specified) if log_file: @@ -221,7 +217,6 @@ def setup_logging(config, args): file_handler.setLevel(file_level) file_handler.setFormatter(file_formatter) logger.addHandler(file_handler) - logger._original_handlers.append(file_handler) return logger diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index da0bf220..70fbd9de 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -231,27 +231,32 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): # If scope is enabled, use the temporary handler if is_logging_scope_enabled(scope): - # Remove all existing handlers - for handler in log.handlers[:]: - log.removeHandler(handler) + # Save original handlers + original_handlers = list(log.handlers) # Create a temporary handler that accepts all levels temp_handler = logging.StreamHandler(sys.stdout) temp_handler.setLevel(logging.DEBUG) temp_handler.setFormatter(logging.Formatter('%(levelname)-8s: %(message)s')) - log.addHandler(temp_handler) try: + # Remove existing handlers temporarily + for handler in original_handlers: + log.removeHandler(handler) + + # Add temporary 
handler + log.addHandler(temp_handler) + + # Log the message log_func = getattr(log, level.lower()) log_func(indented_msg, *args, **kwargs) finally: log.removeHandler(temp_handler) # Restore original handlers - for handler in log.handlers[:]: - log.removeHandler(handler) - if hasattr(log, '_original_handlers'): - for handler in log._original_handlers: + for handler in original_handlers: + if handler not in log.handlers: log.addHandler(handler) + # Only use normal logging if scope is not enabled AND level is high enough elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): # Use normal logging with level check From d87b078a2cab6d373e91f02ded9b12b61e2e21bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 10:38:09 +0200 Subject: [PATCH 056/218] change default download mode and optimise fetching of etags --- .../automated_ingestion/eessi_data_object.py | 100 ++++++++++-------- 1 file changed, 57 insertions(+), 43 deletions(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index aca00adf..482ca6f3 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -155,7 +155,7 @@ def verify_signature(self) -> bool: return False @log_function_entry_exit() - def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: + def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: """ Download data file and signature based on the specified mode. @@ -165,56 +165,70 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_LOCAL) -> bool: Returns: True if files were downloaded, False otherwise """ + # If mode is FORCE, we always download regardless of local or remote state if mode == DownloadMode.FORCE: should_download = True log_message(LoggingScope.DOWNLOAD, 'INFO', "Forcing download of %s", self.remote_file_path) + # For CHECK_REMOTE mode, check if we can optimize elif mode == DownloadMode.CHECK_REMOTE: - # First check if we have local ETags - try: - local_file_etag = self._get_local_etag(self.local_file_path) - local_sig_etag = self._get_local_etag(self.local_sig_path) + # Optimization: Check if local files exist first + local_files_exist = ( + self.local_file_path.exists() and + self.local_sig_path.exists() + ) - if local_file_etag: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) - else: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") - if local_sig_etag: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) - else: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") + # If files don't exist locally, we can skip ETag checks + if not local_files_exist: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, skipping ETag checks and downloading %s", + self.remote_file_path) + should_download = True + else: + # First check if we have local ETags + try: + local_file_etag = self._get_local_etag(self.local_file_path) + local_sig_etag = self._get_local_etag(self.local_sig_path) - # If we don't have local ETags, we need to download - if not local_file_etag or not local_sig_etag: - should_download = True - log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", - self.remote_file_path) - else: - # Get remote ETags and compare - remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] - remote_sig_etag = 
self.remote_client.get_metadata(self.remote_sig_path)['ETag'] - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) - - should_download = ( - remote_file_etag != local_file_etag or - remote_sig_etag != local_sig_etag - ) - if should_download: - if remote_file_etag != local_file_etag: - log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", - local_file_etag, remote_file_etag) - if remote_sig_etag != local_sig_etag: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", - local_sig_etag, remote_sig_etag) - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", - self.remote_file_path) + if local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local file ETag: %s", local_file_etag) + else: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local file ETag found") + if local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Local signature ETag: %s", local_sig_etag) else: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "No local signature ETag found") + + # If we don't have local ETags, we need to download + if not local_file_etag or not local_sig_etag: + should_download = True + log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", self.remote_file_path) - except Exception as etag_err: - # If we get any error with ETags, we'll just download the files - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", str(etag_err)) - should_download = True + else: + # Get remote ETags and compare + remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] + remote_sig_etag = self.remote_client.get_metadata(self.remote_sig_path)['ETag'] + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote file ETag: %s", remote_file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Remote signature ETag: %s", remote_sig_etag) + + should_download = ( + remote_file_etag != local_file_etag or + remote_sig_etag != local_sig_etag + ) + if should_download: + if remote_file_etag != local_file_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", + local_file_etag, remote_file_etag) + if remote_sig_etag != local_sig_etag: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", + local_sig_etag, remote_sig_etag) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", + self.remote_file_path) + else: + log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", + self.remote_file_path) + except Exception as etag_err: + # If we get any error with ETags, we'll just download the files + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", str(etag_err)) + should_download = True else: # CHECK_LOCAL should_download = ( not self.local_file_path.exists() or From 55aa5a1016c1953d8dabd2449822924845c469f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 10:49:22 +0200 Subject: [PATCH 057/218] download task object when necessary --- scripts/automated_ingestion/eessi_task_description.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py 
b/scripts/automated_ingestion/eessi_task_description.py index a958fa4d..c8f96ec9 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -31,7 +31,9 @@ def __init__(self, task_object: EESSIDataAndSignatureObject): """ self.task_object = task_object self.metadata = {} - + + self.task_object.download(mode=DownloadMode.CHECK_REMOTE) + # Verify signature and set initial state self.signature_verified = self.task_object.verify_signature() From 6838435d84f1cd824324f5e311b706f20de5917b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 10:51:47 +0200 Subject: [PATCH 058/218] import DownloadMode --- scripts/automated_ingestion/eessi_task_description.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index c8f96ec9..fcc5a68b 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -6,6 +6,7 @@ from eessi_data_object import EESSIDataAndSignatureObject from utils import log_function_entry_exit, log_message, LoggingScope +from remote_storage import DownloadMode @dataclass From 7afd28328d6b8b211265cfd57f3a4f0820a37377 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 15:56:26 +0200 Subject: [PATCH 059/218] initial EESSITask and support for handling states for different actions --- .../automated_ingestion.py | 29 +++-- scripts/automated_ingestion/eessi_task.py | 117 ++++++++++++++++++ .../automated_ingestion/eessi_task_action.py | 11 ++ .../eessi_task_description.py | 7 ++ 4 files changed, 154 insertions(+), 10 deletions(-) create mode 100644 scripts/automated_ingestion/eessi_task.py create mode 100644 scripts/automated_ingestion/eessi_task_action.py diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index f68a7373..01606b01 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -2,6 +2,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from eessi_task import EESSITask from eessi_task_description import EESSITaskDescription from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 @@ -250,27 +251,35 @@ def main(): # Process each task file for task_path in tasks: try: - # Create EESSITaskDescription for the task file - task_description = EESSITaskDescription( - EESSIDataAndSignatureObject(config, task_path, s3_bucket) - ) - + # Create EESSITask for the task file + try: + task = EESSITask( + EESSITaskDescription( + EESSIDataAndSignatureObject(config, task_path, s3_bucket) + ) + ) + except Exception as err: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", task_path, str(err)) + continue + + # TODO: update the information shown below (what makes sense to show?) 
# Log information about the task - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_description.task_object.local_file_path) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_description.task_object.local_sig_path) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", task_description.signature_verified) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task.task_description.task_object.local_file_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task.task_description.task_object.local_sig_path) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", task.task_description.signature_verified) # Log the ETags of the downloaded task file - file_etag, sig_etag = task_description.task_object.get_etags() + file_etag, sig_etag = task.task_description.task_object.get_etags() log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", - task_description.task_object.remote_sig_path, sig_etag) + task.task_description.task_object.remote_sig_path, sig_etag) # TODO: Process the task file contents # This would involve reading the task file, parsing its contents, # and performing the required actions based on the task type log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path) + task.handle() except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to process task %s: %s", task_path, str(err)) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py new file mode 100644 index 00000000..875f1b4f --- /dev/null +++ b/scripts/automated_ingestion/eessi_task.py @@ -0,0 +1,117 @@ +from enum import Enum, auto + +from eessi_task_action import EESSITaskAction +from eessi_task_description import EESSITaskDescription + +class TaskState(Enum): + NEW = auto() # The task has been created but not yet processed + STAGED = auto() # The task has been staged to the Stratum-0 + PR_OPENED = auto() # The task has been opened as a PR in some staging repository + APPROVED = auto() # The task has been approved + REJECTED = auto() # The task has been rejected + INGESTED = auto() # The task has been ingested into the target CernVM-FS repository + + def __str__(self): + return self.name.lower() + +class EESSITask: + task_description: EESSITaskDescription + action: EESSITaskAction + state: TaskState + + def __init__(self, task_description: EESSITaskDescription): + self.task_description = task_description + self.action = self._determine_action() + self.state = TaskState.NEW + + # Define valid state transitions for all actions + self.valid_transitions = { + TaskState.NEW: [TaskState.STAGED], + TaskState.STAGED: [TaskState.PR_OPENED], + TaskState.PR_OPENED: [TaskState.APPROVED, TaskState.REJECTED], + TaskState.APPROVED: [TaskState.INGESTED], + TaskState.REJECTED: [], # Terminal state + TaskState.INGESTED: [] # Terminal state + } + + def _determine_action(self) -> EESSITaskAction: + """ + Determine the action type based on task description metadata. 
+ """ + if 'task' in self.task_description.metadata and 'action' in self.task_description.metadata['task']: + action_str = self.task_description.metadata['action'].lower() + if action_str == "nop": + return EESSITaskAction.NOP + elif action_str == "delete": + return EESSITaskAction.DELETE + elif action_str == "add": + return EESSITaskAction.ADD + elif action_str == "update": + return EESSITaskAction.UPDATE + return EESSITaskAction.UNKNOWN + + def handle(self): + """ + Dynamically find and execute the appropriate handler based on action and state. + """ + state_before_handle = self.state + + # Construct handler method name + handler_name = f"_handle_{self.action}_{self.state}" + + # Check if the handler exists + handler = getattr(self, handler_name, None) + + if handler and callable(handler): + # Execute the handler if it exists + handler() + # if state has changed, run handle() again; otherwise, do nothing + if self.state != state_before_handle: + print(f"handler {handler_name} changed state from {state_before_handle} to {self.state} ; running handle() again") + self.handle() + else: + # Default behavior for missing handlers + print(f"No handler for action {self.action} and state {self.state} implemented; nothing to be done") + + # Implement handlers for ADD action + def _handle_add_new(self): + """Handler for ADD action in NEW state""" + print("Handling ADD action in NEW state") + # Implementation for adding in NEW state + return True + + def _handle_add_staged(self): + """Handler for ADD action in STAGED state""" + print("Handling ADD action in STAGED state") + # Implementation for adding in STAGED state + return True + + def _handle_add_pr_opened(self): + """Handler for ADD action in PR_OPENED state""" + print("Handling ADD action in PR_OPENED state") + # Implementation for adding in PR_OPENED state + return True + + def _handle_add_approved(self): + """Handler for ADD action in APPROVED state""" + print("Handling ADD action in APPROVED state") + # Implementation for adding in APPROVED state + return True + + def _handle_add_ingested(self): + """Handler for ADD action in INGESTED state""" + print("Handling ADD action in INGESTED state") + # Implementation for adding in INGESTED state + return True + + def transition_to(self, new_state: TaskState): + """ + Transition the task to a new state if valid. 
+ """ + if new_state in self.valid_transitions[self.state]: + self.state = new_state + return True + return False + + def __str__(self): + return f"EESSITask(task_description={self.task_description})" \ No newline at end of file diff --git a/scripts/automated_ingestion/eessi_task_action.py b/scripts/automated_ingestion/eessi_task_action.py new file mode 100644 index 00000000..8f0ce599 --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_action.py @@ -0,0 +1,11 @@ +from enum import Enum, auto + +class EESSITaskAction(Enum): + NOP = auto() # perform no action + DELETE = auto() # perform a delete operation + ADD = auto() # perform an add operation + UPDATE = auto() # perform an update operation + UNKNOWN = auto() # unknown action + + def __str__(self): + return self.name.lower() diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index fcc5a68b..b615b3df 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -45,6 +45,13 @@ def __init__(self, task_object: EESSIDataAndSignatureObject): # Expected if signature is not verified yet pass + # TODO: Process the task file contents + # check if the task file contains a task field and add that to self + if 'task' in self.metadata: + self.task = self.metadata['task'] + else: + self.task = None + @log_function_entry_exit() def _read_metadata(self) -> None: """ From 49738c51bfd616e24df0db9713483c2ba0c0ea3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 4 May 2025 16:00:44 +0200 Subject: [PATCH 060/218] action is a field in task --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 875f1b4f..225e1c45 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -39,7 +39,7 @@ def _determine_action(self) -> EESSITaskAction: Determine the action type based on task description metadata. 
""" if 'task' in self.task_description.metadata and 'action' in self.task_description.metadata['task']: - action_str = self.task_description.metadata['action'].lower() + action_str = self.task_description.metadata['task']['action'].lower() if action_str == "nop": return EESSITaskAction.NOP elif action_str == "delete": From f265d7920565dd0d4e4318ecf571b73a6699d054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 20:17:45 +0200 Subject: [PATCH 061/218] determine metadata/task state from GH staging repo --- .../automated_ingestion.py | 5 +- scripts/automated_ingestion/eessi_task.py | 159 +++++++++++++++++- .../eessi_task_description.py | 32 ++++ 3 files changed, 186 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 01606b01..1706b4bb 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -256,12 +256,15 @@ def main(): task = EESSITask( EESSITaskDescription( EESSIDataAndSignatureObject(config, task_path, s3_bucket) - ) + ), + gh_staging_repo ) except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", task_path, str(err)) continue + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task: %s", task) + # TODO: update the information shown below (what makes sense to show?) # Log information about the task log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task.task_description.task_object.local_file_path) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 225e1c45..4bf122c0 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -11,18 +11,33 @@ class TaskState(Enum): REJECTED = auto() # The task has been rejected INGESTED = auto() # The task has been ingested into the target CernVM-FS repository + @classmethod + def from_string(cls, name, default=None, case_sensitive=False): + if case_sensitive: + return cls.__members__.get(name, default) + + try: + return next( + member for member_name, member in cls.__members__.items() + if member_name.lower() == name.lower() + ) + except StopIteration: + return default + def __str__(self): return self.name.lower() + class EESSITask: - task_description: EESSITaskDescription + description: EESSITaskDescription action: EESSITaskAction state: TaskState + git_repo: Github - def __init__(self, task_description: EESSITaskDescription): - self.task_description = task_description - self.action = self._determine_action() - self.state = TaskState.NEW + def __init__(self, description: EESSITaskDescription, git_repo: Github): + self.description = description + self.git_repo = git_repo + self.action = self._determine_task_action() # Define valid state transitions for all actions self.valid_transitions = { @@ -34,12 +49,14 @@ def __init__(self, task_description: EESSITaskDescription): TaskState.INGESTED: [] # Terminal state } - def _determine_action(self) -> EESSITaskAction: + self.state = self._find_state() + + def _determine_task_action(self) -> EESSITaskAction: """ Determine the action type based on task description metadata. 
""" - if 'task' in self.task_description.metadata and 'action' in self.task_description.metadata['task']: - action_str = self.task_description.metadata['task']['action'].lower() + if 'task' in self.description.metadata and 'action' in self.description.metadata['task']: + action_str = self.description.metadata['task']['action'].lower() if action_str == "nop": return EESSITaskAction.NOP elif action_str == "delete": @@ -50,6 +67,130 @@ def _determine_action(self) -> EESSITaskAction: return EESSITaskAction.UPDATE return EESSITaskAction.UNKNOWN + def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: + """ + Check if a file exists in a repository branch. + """ + if branch is None: + branch = self.git_repo.default_branch + try: + self.git_repo.get_contents(file_path, ref=branch) + log_msg = "Found file %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path, branch) + return True + except github.UnknownObjectException: + # file_path does not exist in branch + return False + except github.GithubException as err: + if err.status == 404: + # file_path does not exist in branch + return False + else: + # if there was some other (e.g. connection) issue, log message and return False + log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!' + log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) + return False + return False + + def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: + """ + Determines in which sequence numbers the metadata/task file is included and in which it is not. + + Returns: + A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is included in that sequence number. + + Idea: + - The deployment for a single source PR could be split into multiple staging PRs each is assigned a unique + sequence number. + - For a given source PR (identified by the repo name and the PR number), a staging PR using a branch named + `REPO/PR_NUM/SEQ_NUM` is created. + - In the staging repo we create a corresponding directory `REPO/PR_NUM/SEQ_NUM`. + - If a metadata/task file is handled by the staging PR with sequence number, it is included in that directory. + - We iterate over all directories under `REPO/PR_NUM`: + - If the metadata/task file is available in the directory, we add the sequence number to the list. + + Note: this is a placeholder for now, as we do not know yet if we need to use a sequence number. + """ + sequence_numbers = {} + repo = self.description.metadata['task']['repo'] + pr = self.description.metadata['task']['pr'] + repo_pr_dir = f"{repo}/{pr}" + # iterate over all directories under repo_pr_dir + for dir in self._list_directory_contents(repo_pr_dir): + # check if the directory is a number + if dir.name.isdigit(): + remote_file_path = self.description.task_object.remote_file_path + if self._file_exists_in_repo_branch(f"{repo_pr_dir}/{dir.name}/{remote_file_path}"): + sequence_numbers[int(dir.name)] = True + else: + sequence_numbers[int(dir.name)] = False + else: + # directory is not a number, so we skip it + continue + return sequence_numbers + + def _find_state(self) -> TaskState: + """ + Determine the state of the task based on the task description metadata. + + Returns: + The state of the task. 
+ """ + # obtain repo and pr from metadata + repo = self.description.metadata['task']['repo'] + pr = self.description.metadata['task']['pr'] + + # iterate over all sequence numbers in repo/pr dir + sequence_numbers = self._determine_sequence_numbers_including_task_file() + for sequence_number in [key for key, value in sequence_numbers.items() if value]: + # create path to metadata file from repo, PR, repo, sequence number, metadata file name, state name + # format of the metadata file name is: + # eessi-VERSION-COMPONENT-OS-ARCHITECTURE-TIMESTAMP.SUFFIX + # all uppercase words are placeholders + # all placeholders (except ARCHITECTURE) do not include any hyphens + # ARCHITECTURE can include one to two hyphens + # The SUFFIX is composed of two parts: TARBALLSUFFIX and METADATASUFFIX + # TARBALLSUFFIX is defined by the task object or in the configuration file + # METADATASUFFIX is defined by the task object or in the configuration file + # Later, we may switch to using task action files instead of metadata files. The format of the + # SUFFIX would then be defined by the task action or the configuration file. + version, component, os, architecture, timestamp, suffix = self.description.get_metadata_file_components() + metadata_file_name = f"eessi-{version}-{component}-{os}-{architecture}-{timestamp}.{suffix}" + metadata_file_state_path = f"{repo}/{pr}/{sequence_number}/{metadata_file_name}" + # get the state from the file in the metadata_file_state_path + state = self._get_state_from_metadata_file(metadata_file_state_path) + return state + # did not find metadata file in staging repo on GitHub + return TaskState.NEW + + def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskState: + """ + Get the state from the file in the metadata_file_state_path. + """ + # get contents of metadata_file_state_path + contents = self.git_repo.get_contents(metadata_file_state_path) + try: + state = TaskState.from_string(contents.name) + return state + except ValueError: + return TaskState.NEW + + def _list_directory_contents(self, directory_path, branch=None): + try: + # Get contents of the directory + contents = self.git_repo.get_contents(directory_path, ref=branch) + + # If contents is a list, it means we successfully got directory contents + if isinstance(contents, list): + return contents + else: + # If it's not a list, it means the path is not a directory + raise ValueError(f"{directory_path} is not a directory") + except github.GithubException as err: + if err.status == 404: + raise FileNotFoundError(f"Directory not found: {directory_path}") + raise err + def handle(self): """ Dynamically find and execute the appropriate handler based on action and state. @@ -114,4 +255,4 @@ def transition_to(self, new_state: TaskState): return False def __str__(self): - return f"EESSITask(task_description={self.task_description})" \ No newline at end of file + return f"EESSITask(description={self.description}, action={self.action}, state={self.state})" \ No newline at end of file diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index b615b3df..866121ef 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -76,6 +76,38 @@ def _read_metadata(self) -> None: self.task_object.local_file_path, str(e)) raise + def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: + """ + Get the components of the metadata file name. 
+ + An example of the metadata file name is: + eessi-2023.06-software-linux-x86_64-amd-zen2-1745557626.tar.gz.meta.txt + + The components are: + eessi: some prefix + VERSION: 2023.06 + COMPONENT: software + OS: linux + ARCHITECTURE: x86_64-amd-zen2 + TIMESTAMP: 1745557626 + SUFFIX: tar.gz.meta.txt + + The ARCHITECTURE component can include one to two hyphens. + The SUFFIX is the part after the first dot (no other components should include dots). + """ + # obtain file name from local file path using basename + file_name = Path(self.task_object.local_file_path).name + # split file_name into part before suffix and the suffix + # from file_name_without_suffix determine VERSION (2nd element), COMPONENT (3rd element), OS (4th element), + # ARCHITECTURE (5th to second last elements) and TIMESTAMP (last element) + components = file_name_without_suffix.split('-') + version = components[1] + component = components[2] + os = components[3] + architecture = '-'.join(components[4:-1]) + timestamp = components[-1] + return version, component, os, architecture, timestamp, suffix + def __str__(self) -> str: """Return a string representation of the EESSITaskDescription object.""" return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" \ No newline at end of file From 3959780d4634d48da471b6e60c8f06babb62db74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 20:20:36 +0200 Subject: [PATCH 062/218] import missing Tuple --- scripts/automated_ingestion/eessi_task_description.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 866121ef..618b7968 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -2,7 +2,7 @@ import subprocess from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple from eessi_data_object import EESSIDataAndSignatureObject from utils import log_function_entry_exit, log_message, LoggingScope From c5e45a1a8e699c940d6cb36f787eeb504fa0071b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:09:19 +0200 Subject: [PATCH 063/218] flake8 improvements --- .../automated_ingestion.py | 60 +++++----- .../automated_ingestion/eessi_data_object.py | 107 +++++++++--------- scripts/automated_ingestion/eessi_task.py | 24 ++-- .../automated_ingestion/eessi_task_action.py | 1 + .../eessi_task_description.py | 33 +++--- scripts/automated_ingestion/eessitarball.py | 28 +++-- scripts/automated_ingestion/remote_storage.py | 2 +- scripts/automated_ingestion/s3_bucket.py | 11 +- scripts/automated_ingestion/utils.py | 14 ++- 9 files changed, 149 insertions(+), 131 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 1706b4bb..12429e4a 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 from eessitarball import EessiTarball, EessiTarballGroup -from eessi_data_object import EESSIDataAndSignatureObject, DownloadMode +from eessi_data_object import EESSIDataAndSignatureObject from eessi_task import EESSITask from eessi_task_description import EESSITaskDescription from s3_bucket import EESSIS3Bucket @@ -10,7 +10,6 @@ from 
utils import log_function_entry_exit, log_message, LoggingScope, set_logging_scopes import argparse -import boto3 import configparser import github import json @@ -19,7 +18,7 @@ import pid import sys from pathlib import Path -from typing import List, Dict +from typing import List REQUIRED_CONFIG = { 'secrets': ['aws_secret_access_key', 'aws_access_key_id', 'github_pat'], @@ -135,30 +134,31 @@ def parse_args(): # Logging options logging_group = parser.add_argument_group('Logging options') logging_group.add_argument('--log-file', - help='Path to log file (overrides config file setting)') + help='Path to log file (overrides config file setting)') logging_group.add_argument('--console-level', - choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], - help='Logging level for console output (overrides config file setting)') + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logging level for console output (overrides config file setting)') logging_group.add_argument('--file-level', - choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], - help='Logging level for file output (overrides config file setting)') + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Logging level for file output (overrides config file setting)') logging_group.add_argument('--quiet', - action='store_true', - help='Suppress console output (overrides all other console settings)') + action='store_true', + help='Suppress console output (overrides all other console settings)') logging_group.add_argument('--log-scopes', - help='Comma-separated list of logging scopes using +/- syntax. ' - 'Examples: "+FUNC_ENTRY_EXIT" (enable only function entry/exit), ' - '"+ALL,-FUNC_ENTRY_EXIT" (enable all except function entry/exit), ' - '"+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" (enable function entry/exit but disable example)') + help='Comma-separated list of logging scopes using +/- syntax. ' + 'Examples: "+FUNC_ENTRY_EXIT" (enable only function entry/exit), ' + '"+ALL,-FUNC_ENTRY_EXIT" (enable all except function entry/exit), ' + '"+FUNC_ENTRY_EXIT,-EXAMPLE_SCOPE" (enable function entry/exit but disable example)') # Existing arguments parser.add_argument('-c', '--config', type=str, help='path to configuration file', - default='automated_ingestion.cfg', dest='config') + default='automated_ingestion.cfg', dest='config') parser.add_argument('-d', '--debug', help='enable debug mode', action='store_true', dest='debug') - parser.add_argument('-l', '--list', help='only list available tarballs or tasks', action='store_true', dest='list_only') + parser.add_argument('-l', '--list', help='only list available tarballs or tasks', action='store_true', + dest='list_only') parser.add_argument('--task-based', help='use task-based ingestion instead of tarball-based. 
'
-                        'Optionally specify comma-separated list of extensions (default: .task)',
-                        nargs='?', const='.task', default=False)
+                       'Optionally specify comma-separated list of extensions (default: .task)',
+                       nargs='?', const='.task', default=False)
 
     return parser.parse_args()
 
@@ -175,7 +175,6 @@ def setup_logging(config, args):
     """
     # Get settings from config file
     log_file = config['logging'].get('filename')
-    log_format = config['logging'].get('format', '%(levelname)s: %(message)s')
     config_console_level = LOG_LEVELS.get(config['logging'].get('level', 'INFO').upper(), logging.INFO)
     config_file_level = LOG_LEVELS.get(config['logging'].get('file_level', 'DEBUG').upper(), logging.DEBUG)
 
@@ -256,27 +255,29 @@ def main():
                 task = EESSITask(
                     EESSITaskDescription(
                         EESSIDataAndSignatureObject(config, task_path, s3_bucket)
-                    ),
+                    ),
                     gh_staging_repo
                 )
             except Exception as err:
-                log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", task_path, str(err))
+                log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s",
+                            task_path, str(err))
                 continue
 
             log_message(LoggingScope.GROUP_OPS, 'INFO', "Task: %s", task)
 
             # TODO: update the information shown below (what makes sense to show?)
             # Log information about the task
-            log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task.task_description.task_object.local_file_path)
-            log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task.task_description.task_object.local_sig_path)
-            log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", task.task_description.signature_verified)
+            task_object = task.description.task_object
+            log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_object.local_file_path)
+            log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_object.local_sig_path)
+            log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s",
+                        task.description.signature_verified)
 
             # Log the ETags of the downloaded task file
-            file_etag, sig_etag = task.task_description.task_object.get_etags()
+            file_etag, sig_etag = task_object.get_etags()
             log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag)
-            log_message(LoggingScope.GROUP_OPS, 'INFO',
-                        "Task signature %s has ETag: %s",
-                        task.task_description.task_object.remote_sig_path, sig_etag)
+            log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s",
+                        task_object.remote_sig_path, sig_etag)
 
             # TODO: Process the task file contents
             # This would involve reading the task file, parsing its contents,
             # and performing the required actions based on the task type
             log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path)
@@ -301,7 +302,8 @@ def main():
                     if tarballs:
                         # Create a group for these tarballs
                         group = EessiTarballGroup(tarballs[0], config, gh_staging_repo, s3_bucket, cvmfs_repo)
-                        log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s", group.to_string(oneline=True))
+                        log_message(LoggingScope.GROUP_OPS, 'INFO', "group created\n%s",
+                                    group.to_string(oneline=True))
                         group.process_group(tarballs)
                     else:
                         # use old individual PR method
diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py
index 482ca6f3..6e8189fe 100644
--- a/scripts/automated_ingestion/eessi_data_object.py
+++ b/scripts/automated_ingestion/eessi_data_object.py
@@ -1,10 +1,8 @@
-import os
 import subprocess
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
 
-import boto3
 import configparser
 
 from utils import log_function_entry_exit, log_message, LoggingScope
@@ -88,24 +86,24 @@ def 
get_etags(self) -> tuple[Optional[str], Optional[str]]: def verify_signature(self) -> bool: """ Verify the signature of the data file using the corresponding signature file. - + Returns: bool: True if the signature is valid or if signatures are not required, False otherwise """ # Check if signature file exists if not self.local_sig_path.exists(): - log_message(LoggingScope.VERIFICATION, 'WARNING', "Signature file %s is missing", - self.local_sig_path) - + log_message(LoggingScope.VERIFICATION, 'WARNING', "Signature file %s is missing", + self.local_sig_path) + # If signatures are required, return failure if self.config['signatures'].getboolean('signatures_required', True): - log_message(LoggingScope.ERROR, 'ERROR', "Signature file %s is missing and signatures are required", - self.local_sig_path) + log_message(LoggingScope.ERROR, 'ERROR', "Signature file %s is missing and signatures are required", + self.local_sig_path) return False else: - log_message(LoggingScope.VERIFICATION, 'INFO', - "Signature file %s is missing, but signatures are not required", - self.local_sig_path) + log_message(LoggingScope.VERIFICATION, 'INFO', + "Signature file %s is missing, but signatures are not required", + self.local_sig_path) return True # If signatures are provided, we should always verify them, regardless of the signatures_required setting @@ -115,13 +113,13 @@ def verify_signature(self) -> bool: # Check if verification tools exist if not Path(verify_script).exists(): - log_message(LoggingScope.ERROR, 'ERROR', - "Unable to verify signature: verification script %s does not exist", verify_script) + log_message(LoggingScope.ERROR, 'ERROR', + "Unable to verify signature: verification script %s does not exist", verify_script) return False if not Path(allowed_signers_file).exists(): - log_message(LoggingScope.ERROR, 'ERROR', - "Unable to verify signature: allowed signers file %s does not exist", allowed_signers_file) + log_message(LoggingScope.ERROR, 'ERROR', + "Unable to verify signature: allowed signers file %s does not exist", allowed_signers_file) return False # Run the verification command with named parameters @@ -133,25 +131,25 @@ def verify_signature(self) -> bool: '--signature-file', str(self.local_sig_path) ] log_message(LoggingScope.VERIFICATION, 'INFO', "Running command: %s", ' '.join(cmd)) - + try: result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode == 0: - log_message(LoggingScope.VERIFICATION, 'INFO', - "Successfully verified signature for %s", self.local_file_path) + log_message(LoggingScope.VERIFICATION, 'INFO', + "Successfully verified signature for %s", self.local_file_path) log_message(LoggingScope.VERIFICATION, 'DEBUG', " stdout: %s", result.stdout) log_message(LoggingScope.VERIFICATION, 'DEBUG', " stderr: %s", result.stderr) return True else: - log_message(LoggingScope.ERROR, 'ERROR', - "Signature verification failed for %s", self.local_file_path) + log_message(LoggingScope.ERROR, 'ERROR', + "Signature verification failed for %s", self.local_file_path) log_message(LoggingScope.ERROR, 'ERROR', " stdout: %s", result.stdout) log_message(LoggingScope.ERROR, 'ERROR', " stderr: %s", result.stderr) return False - except Exception as e: - log_message(LoggingScope.ERROR, 'ERROR', - "Error during signature verification for %s: %s", - self.local_file_path, str(e)) + except Exception as err: + log_message(LoggingScope.ERROR, 'ERROR', + "Error during signature verification for %s: %s", + self.local_file_path, str(err)) return False @log_function_entry_exit() @@ -179,8 
+177,9 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: # If files don't exist locally, we can skip ETag checks if not local_files_exist: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, skipping ETag checks and downloading %s", - self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', + "Local files missing, skipping ETag checks and downloading %s", + self.remote_file_path) should_download = True else: # First check if we have local ETags @@ -201,7 +200,7 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: if not local_file_etag or not local_sig_etag: should_download = True log_message(LoggingScope.DOWNLOAD, 'INFO', "Missing local ETags, downloading %s", - self.remote_file_path) + self.remote_file_path) else: # Get remote ETags and compare remote_file_etag = self.remote_client.get_metadata(self.remote_file_path)['ETag'] @@ -216,18 +215,20 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: if should_download: if remote_file_etag != local_file_etag: log_message(LoggingScope.DOWNLOAD, 'INFO', "File ETag changed from %s to %s", - local_file_etag, remote_file_etag) + local_file_etag, remote_file_etag) if remote_sig_etag != local_sig_etag: log_message(LoggingScope.DOWNLOAD, 'INFO', "Signature ETag changed from %s to %s", - local_sig_etag, remote_sig_etag) + local_sig_etag, remote_sig_etag) log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files have changed, downloading %s", - self.remote_file_path) + self.remote_file_path) else: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Remote files unchanged, skipping download of %s", - self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', + "Remote files unchanged, skipping download of %s", + self.remote_file_path) except Exception as etag_err: # If we get any error with ETags, we'll just download the files - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", str(etag_err)) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error handling ETags, will download files: %s", + str(etag_err)) should_download = True else: # CHECK_LOCAL should_download = ( @@ -239,11 +240,11 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: log_message(LoggingScope.DOWNLOAD, 'INFO', "Local file missing: %s", self.local_file_path) if not self.local_sig_path.exists(): log_message(LoggingScope.DOWNLOAD, 'INFO', "Local signature missing: %s", self.local_sig_path) - log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, downloading %s", - self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files missing, downloading %s", + self.remote_file_path) else: - log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files exist, skipping download of %s", - self.remote_file_path) + log_message(LoggingScope.DOWNLOAD, 'INFO', "Local files exist, skipping download of %s", + self.remote_file_path) if not should_download: return False @@ -259,24 +260,24 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: # Get and log the ETag of the downloaded file try: file_etag = self._get_local_etag(self.local_file_path) - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", - self.remote_file_path, file_etag) + log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s", + self.remote_file_path, file_etag) except Exception as etag_err: - log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s", - 
self.remote_file_path, str(etag_err))
+            log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s",
+                        self.remote_file_path, str(etag_err))
 
         # Try to download the signature file
         try:
             self.remote_client.download(self.remote_sig_path, str(self.local_sig_path))
             try:
                 sig_etag = self._get_local_etag(self.local_sig_path)
-                log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s",
-                    self.remote_sig_path, sig_etag)
+                log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Downloaded %s with ETag: %s",
+                            self.remote_sig_path, sig_etag)
             except Exception as etag_err:
-                log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s",
-                    self.remote_sig_path, str(etag_err))
-            log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s and its signature",
-                self.remote_file_path)
+                log_message(LoggingScope.DOWNLOAD, 'DEBUG', "Error getting ETag for %s: %s",
+                            self.remote_sig_path, str(etag_err))
+            log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s and its signature",
+                        self.remote_file_path)
         except Exception as sig_err:
             # Check if signatures are required
             if self.config['signatures'].getboolean('signatures_required', True):
@@ -290,8 +291,8 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool:
                     sig_etag_path = self._get_etag_file_path(self.local_sig_path)
                     if sig_etag_path.exists():
                         sig_etag_path.unlink()
-                log_message(LoggingScope.ERROR, 'ERROR', "Failed to download required signature for %s: %s",
-                    self.remote_file_path, str(sig_err))
+                log_message(LoggingScope.ERROR, 'ERROR', "Failed to download required signature for %s: %s",
+                            self.remote_file_path, str(sig_err))
                 raise
             else:
                 # If signatures are optional, just clean up any partial signature files
@@ -300,10 +301,10 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool:
                     sig_etag_path = self._get_etag_file_path(self.local_sig_path)
                     if sig_etag_path.exists():
                         sig_etag_path.unlink()
-                log_message(LoggingScope.DOWNLOAD, 'WARNING', "Failed to download optional signature for %s: %s",
-                    self.remote_file_path, str(sig_err))
-                log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s (signature optional)",
-                    self.remote_file_path)
+                log_message(LoggingScope.DOWNLOAD, 'WARNING', "Failed to download optional signature for %s: %s",
+                            self.remote_file_path, str(sig_err))
+                log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s (signature optional)",
+                            self.remote_file_path)
 
             return True
         except Exception as err:
diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 4bf122c0..6bbe498f 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -1,7 +1,10 @@
 from enum import Enum, auto
-
+from typing import Dict
 from eessi_task_action import EESSITaskAction
 from eessi_task_description import EESSITaskDescription
+from utils import log_message, LoggingScope
+# the Github class is only used in type annotations; the exception classes
+# (UnknownObjectException, GithubException) live at module level, so import both
+import github
+from github import Github
+
 
 class TaskState(Enum):
     NEW = auto()  # The task has been created but not yet processed
@@ -78,14 +81,14 @@ def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool:
             log_msg = "Found file %s in branch %s"
             log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path, branch)
             return True
         except github.UnknownObjectException:
             # file_path does not exist in branch
             return False
         except github.GithubException as err:
             if err.status == 404:
                 # file_path does not exist in 
branch
                 return False
-            else: 
+            else:
                 # if there was some other (e.g. connection) issue, log message and return False
                 log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!'
                 log_message(LoggingScope.ERROR, 'WARNING', log_msg, file_path, err.status)
@@ -97,13 +100,14 @@ def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]:
         Determines in which sequence numbers the metadata/task file is included and in which it is not.
 
         Returns:
-            A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is included in that sequence number.
+            A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is
+            included in that sequence number.
 
         Idea:
         - The deployment for a single source PR could be split into multiple staging PRs, each assigned a unique
           sequence number.
         - For a given source PR (identified by the repo name and the PR number), a staging PR using a branch named
-          `REPO/PR_NUM/SEQ_NUM` is created. 
+          `REPO/PR_NUM/SEQ_NUM` is created.
         - In the staging repo we create a corresponding directory `REPO/PR_NUM/SEQ_NUM`.
         - If a metadata/task file is handled by the staging PR with sequence number, it is included in that directory.
         - We iterate over all directories under `REPO/PR_NUM`:
@@ -208,7 +212,9 @@ def handle(self):
             handler()
             # if state has changed, run handle() again; otherwise, do nothing
             if self.state != state_before_handle:
-                print(f"handler {handler_name} changed state from {state_before_handle} to {self.state} ; running handle() again")
+                msg = f"handler {handler_name} changed state from {state_before_handle} to {self.state}"
+                msg += "; running handle() again"
+                print(msg)
                 self.handle()
         else:
             # Default behavior for missing handlers
@@ -255,4 +261,4 @@ def transition_to(self, new_state: TaskState):
         return False
 
     def __str__(self):
-        return f"EESSITask(description={self.description}, action={self.action}, state={self.state})" \ No newline at end of file
+        return f"EESSITask(description={self.description}, action={self.action}, state={self.state})"
diff --git a/scripts/automated_ingestion/eessi_task_action.py b/scripts/automated_ingestion/eessi_task_action.py
index 8f0ce599..6f141435 100644
--- a/scripts/automated_ingestion/eessi_task_action.py
+++ b/scripts/automated_ingestion/eessi_task_action.py
@@ -1,5 +1,6 @@
 from enum import Enum, auto
 
+
 class EESSITaskAction(Enum):
     NOP = auto()  # perform no action
     DELETE = auto()  # perform a delete operation
diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py
index 618b7968..271ff9a9 100644
--- a/scripts/automated_ingestion/eessi_task_description.py
+++ b/scripts/automated_ingestion/eessi_task_description.py
@@ -1,8 +1,7 @@
 import json
-import subprocess
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Tuple
 
 from eessi_data_object import EESSIDataAndSignatureObject
 from utils import log_function_entry_exit, log_message, LoggingScope
@@ -37,7 +36,7 @@ def __init__(self, task_object: 
EESSIDataAndSignatureObject):
         # Verify signature and set initial state
         self.signature_verified = self.task_object.verify_signature()
-    
+
         # Try to read metadata (will only succeed if signature is verified)
         try:
             self._read_metadata()
@@ -59,21 +58,22 @@ def _read_metadata(self) -> None:
         Only reads metadata if the signature has been verified.
         """
         if not self.signature_verified:
-            log_message(LoggingScope.ERROR, 'ERROR', "Cannot read metadata: signature not verified for %s",
-                self.task_object.local_file_path)
+            log_message(LoggingScope.ERROR, 'ERROR', "Cannot read metadata: signature not verified for %s",
+                        self.task_object.local_file_path)
             raise RuntimeError("Cannot read metadata: signature not verified")
 
         try:
-            with open(self.task_object.local_file_path, 'r') as f:
-                self.metadata = json.load(f)
-            log_message(LoggingScope.DEBUG, 'DEBUG', "Successfully read metadata from %s", self.task_object.local_file_path)
-        except json.JSONDecodeError as e:
-            log_message(LoggingScope.ERROR, 'ERROR', "Failed to parse JSON in task description file %s: %s",
-                self.task_object.local_file_path, str(e))
+            with open(self.task_object.local_file_path, 'r') as file:
+                self.metadata = json.load(file)
+            log_message(LoggingScope.DEBUG, 'DEBUG', "Successfully read metadata from %s",
+                        self.task_object.local_file_path)
+        except json.JSONDecodeError as err:
+            log_message(LoggingScope.ERROR, 'ERROR', "Failed to parse JSON in task description file %s: %s",
+                        self.task_object.local_file_path, str(err))
             raise
-        except Exception as e:
-            log_message(LoggingScope.ERROR, 'ERROR', "Failed to read task description file %s: %s",
-                self.task_object.local_file_path, str(e))
+        except Exception as err:
+            log_message(LoggingScope.ERROR, 'ERROR', "Failed to read task description file %s: %s",
+                        self.task_object.local_file_path, str(err))
             raise
 
     def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]:
@@ -98,6 +98,10 @@ def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]:
         # obtain file name from local file path using basename
         file_name = Path(self.task_object.local_file_path).name
         # split file_name into part before suffix and the suffix
+        # idea: split on last hyphen, then split on first dot
+        suffix = file_name.split('-')[-1].split('.', 1)[1]
+        # use removesuffix(): strip() takes a set of characters, not a suffix, and would mangle the name
+        file_name_without_suffix = file_name.removesuffix(f".{suffix}")
         # from file_name_without_suffix determine VERSION (2nd element), COMPONENT (3rd element), OS (4th element),
         # ARCHITECTURE (5th to second last elements) and TIMESTAMP (last element)
         components = file_name_without_suffix.split('-')
@@ -110,4 +114,4 @@ def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]:
 
     def __str__(self) -> str:
         """Return a string representation of the EESSITaskDescription object."""
-        return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" \ No newline at end of file
+        return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})"
diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py
index eca6b67b..cb4ae801 100644
--- a/scripts/automated_ingestion/eessitarball.py
+++ b/scripts/automated_ingestion/eessitarball.py
@@ -1,11 +1,9 @@
 from utils import send_slack_message, sha256sum, log_function_entry_exit, log_message, LoggingScope
-from s3_bucket import EESSIS3Bucket
 
 from pathlib import PurePosixPath
 
 import github
 import json
-import logging
 import os
 import subprocess
 import tarfile
@@ -251,11 +249,11 @@ def 
verify_signatures(self): (self.local_metadata_path, self.local_metadata_sig_path) ]: command = verify_runenv + [verify_script, '--verify', '--allowed-signers-file', allowed_signers_file, - '--file', file, '--signature-file', sig_file] + '--file', file, '--signature-file', sig_file] log_message(LoggingScope.VERIFICATION, 'INFO', "Running command: %s", ' '.join(command)) verify_cmd = subprocess.run( - command, + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if verify_cmd.returncode == 0: @@ -455,9 +453,9 @@ def make_approval_request(self, tarballs_in_group=None): log_msg = 'Warning, tarball %s is in a weird state:' log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, self.object) log_msg = 'Branch: %s\nPR: %s\nPR state: %s\nPR merged: %s' - log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, - git_branch, pr, pr.state, pr.merged) - # TODO: should we delete the branch or open an issue? + log_message(LoggingScope.GITHUB_OPS, 'WARNING', log_msg, + git_branch, pr, pr.state, pr.merged) + # TODO: should we delete the branch or open an issue? return else: log_msg = 'Tarball %s has a branch, but no PR.' @@ -471,8 +469,8 @@ def make_approval_request(self, tarballs_in_group=None): # Move metadata file(s) to approved directory log_msg = "Moving metadata for %s from %s to %s in branch %s" - log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, - self.object, self.state, next_state, git_branch) + log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, + self.object, self.state, next_state, git_branch) if tarballs_in_group is None: log_message(LoggingScope.GITHUB_OPS, 'INFO', "Moving metadata for individual tarball to staged") self.move_metadata_file(self.state, next_state, branch=git_branch) @@ -485,7 +483,7 @@ def make_approval_request(self, tarballs_in_group=None): # Create PR with appropriate template try: - pr_url=f"https://github.com/{repo}/pull/{pr_id}", + pr_url = f"https://github.com/{repo}/pull/{pr_id}" if tarballs_in_group is None: log_msg = "Creating PR for individual tarball: %s" log_message(LoggingScope.GITHUB_OPS, 'INFO', log_msg, self.object) @@ -589,8 +587,8 @@ def move_metadata_file(self, old_state, new_state, branch='main'): """Move the metadata file of a tarball from an old state's directory to a new state's directory.""" file_path_old = old_state + '/' + self.metadata_file file_path_new = new_state + '/' + self.metadata_file - log_message(LoggingScope.GITHUB_OPS, 'INFO', 'Moving metadata file %s from %s to %s in branch %s', - self.metadata_file, file_path_old, file_path_new, branch) + log_message(LoggingScope.GITHUB_OPS, 'INFO', 'Moving metadata file %s from %s to %s in branch %s', + self.metadata_file, file_path_old, file_path_new, branch) tarball_metadata = self.git_repo.get_contents(file_path_old) # Remove the metadata file from the old state's directory... 
self.git_repo.delete_file(file_path_old, 'remove from ' + old_state, sha=tarball_metadata.sha, branch=branch) @@ -629,7 +627,7 @@ def extract_checked_tarballs(self, pr_body): checked_tarballs = [] for line in pr_body.split('\n'): if line.strip().startswith('- [x] '): - tarball = line.strip()[6:] # Remove '- [x] ' prefix + tarball = line.strip()[6:] # Remove '- [x] ' prefix checked_tarballs.append(tarball) return checked_tarballs @@ -638,7 +636,7 @@ def extract_tarballs_from_pr_body(self, pr_body): tarballs = [] for line in pr_body.split('\n'): if line.strip().startswith('- ['): - tarball = line.strip()[6:] # Remove '- [ ] ' or '- [x] ' prefix + tarball = line.strip()[6:] # Remove '- [ ] ' or '- [x] ' prefix tarballs.append(tarball) return tarballs @@ -704,7 +702,7 @@ def process_group(self, tarballs): # Mark all tarballs as staged in the group branch, however need to handle first tarball differently log_msg = "Processing first tarball in group: %s" log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, self.first_tar.object) - self.first_tar.mark_new_tarball_as_staged('main') # this sets the state of the first tarball to 'staged' + self.first_tar.mark_new_tarball_as_staged('main') # this sets the state of the first tarball to 'staged' for tarball in tarballs[1:]: log_msg = "Processing tarball in group: %s" log_message(LoggingScope.GROUP_OPS, 'INFO', log_msg, tarball) diff --git a/scripts/automated_ingestion/remote_storage.py b/scripts/automated_ingestion/remote_storage.py index ac005af8..2a386a7d 100644 --- a/scripts/automated_ingestion/remote_storage.py +++ b/scripts/automated_ingestion/remote_storage.py @@ -31,4 +31,4 @@ def download(self, remote_path: str, local_path: str) -> None: remote_path: Path to the object in remote storage local_path: Local path where to save the file """ - ... \ No newline at end of file + ... 
diff --git a/scripts/automated_ingestion/s3_bucket.py b/scripts/automated_ingestion/s3_bucket.py index 79b8a055..ff62813f 100644 --- a/scripts/automated_ingestion/s3_bucket.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -3,10 +3,11 @@ from typing import Dict, Optional import boto3 - +from botocore.exceptions import ClientError from utils import log_function_entry_exit, log_message, LoggingScope from remote_storage import RemoteStorageClient + class EESSIS3Bucket(RemoteStorageClient): """EESSI-specific S3 bucket implementation of the RemoteStorageClient protocol.""" @@ -98,8 +99,8 @@ def get_metadata(self, remote_path: str) -> Dict: response = self.client.head_object(Bucket=self.bucket, Key=remote_path) log_message(LoggingScope.DEBUG, 'DEBUG', "Retrieved metadata for %s: %s", remote_path, response) return response - except ClientError as e: - log_message(LoggingScope.ERROR, 'ERROR', "Failed to get metadata for %s: %s", remote_path, str(e)) + except ClientError as err: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to get metadata for %s: %s", remote_path, str(err)) raise def _get_etag_file_path(self, local_path: str) -> Path: @@ -143,8 +144,8 @@ def download(self, remote_path: str, local_path: str) -> None: log_message(LoggingScope.DOWNLOAD, 'INFO', "Downloading %s to %s", remote_path, local_path) self.client.download_file(Bucket=self.bucket, Key=remote_path, Filename=local_path) log_message(LoggingScope.DOWNLOAD, 'INFO', "Successfully downloaded %s to %s", remote_path, local_path) - except ClientError as e: - log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", remote_path, str(e)) + except ClientError as err: + log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", remote_path, str(err)) raise # Get metadata first to obtain the ETag diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 70fbd9de..ab1e2b2f 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -9,6 +9,7 @@ from enum import IntFlag, auto import sys + class LoggingScope(IntFlag): """Enumeration of different logging scopes.""" NONE = 0 @@ -20,15 +21,18 @@ class LoggingScope(IntFlag): GROUP_OPS = auto() # Logging related to tarball group operations ERROR = auto() # Error logging (separate from other scopes for easier filtering) DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) - ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_CHANGE | + ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_CHANGE | GITHUB_OPS | GROUP_OPS | ERROR | DEBUG) + # Global setting for logging scopes ENABLED_LOGGING_SCOPES = LoggingScope.NONE + # Global variable to track call stack depth _call_stack_depth = 0 + def set_logging_scopes(scopes): """ Set the enabled logging scopes. 
@@ -90,10 +94,12 @@ def set_logging_scopes(scopes): # Convert list to comma-separated string and process set_logging_scopes(",".join(scopes)) + def is_logging_scope_enabled(scope): """Check if a specific logging scope is enabled.""" return bool(ENABLED_LOGGING_SCOPES & scope) + def send_slack_message(webhook, msg): """Send a Slack message.""" slack_data = {'text': msg} @@ -187,7 +193,7 @@ def wrapper(*args, **kwargs): end_time = time.time() # For normal returns, show the last line of the function log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{last_line_no}" - f"{context} (took {end_time - start_time:.2f}s)") + f"{context} (took {end_time - start_time:.2f}s)") return result except Exception as err: _call_stack_depth -= 1 @@ -198,11 +204,12 @@ def wrapper(*args, **kwargs): except AttributeError: exc_line_no = last_line_no log.info(f"{indent}[FUNC_ENTRY_EXIT] Leaving {func.__name__} at {file_name}:{exc_line_no}" - f"{context} with exception (took {end_time - start_time:.2f}s)") + f"{context} with exception (took {end_time - start_time:.2f}s)") raise err return wrapper return decorator + def log_message(scope, level, msg, *args, logger=None, **kwargs): """ Log a message if either: @@ -256,7 +263,6 @@ def log_message(scope, level, msg, *args, logger=None, **kwargs): for handler in original_handlers: if handler not in log.handlers: log.addHandler(handler) - # Only use normal logging if scope is not enabled AND level is high enough elif not is_logging_scope_enabled(scope) and log_level >= log.getEffectiveLevel(): # Use normal logging with level check From 67e74f3dcd663d5dcd4174187670b4560adc3e50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:14:32 +0200 Subject: [PATCH 064/218] add func entry/exit logging to EESSITask --- scripts/automated_ingestion/eessi_task.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6bbe498f..c3bc2acb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -2,7 +2,7 @@ from typing import Dict from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription -from utils import log_message, LoggingScope +from utils import log_message, LoggingScope, log_function_entry_exit from github import Github @@ -37,6 +37,7 @@ class EESSITask: state: TaskState git_repo: Github + @log_function_entry_exit() def __init__(self, description: EESSITaskDescription, git_repo: Github): self.description = description self.git_repo = git_repo @@ -54,6 +55,7 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): self.state = self._find_state() + @log_function_entry_exit() def _determine_task_action(self) -> EESSITaskAction: """ Determine the action type based on task description metadata. @@ -70,6 +72,7 @@ def _determine_task_action(self) -> EESSITaskAction: return EESSITaskAction.UPDATE return EESSITaskAction.UNKNOWN + @log_function_entry_exit() def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: """ Check if a file exists in a repository branch. @@ -95,6 +98,7 @@ def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: return False return False + @log_function_entry_exit() def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: """ Determines in which sequence numbers the metadata/task file is included and in which it is not. 
@@ -133,6 +137,7 @@ def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: continue return sequence_numbers + @log_function_entry_exit() def _find_state(self) -> TaskState: """ Determine the state of the task based on the task description metadata. @@ -167,6 +172,7 @@ def _find_state(self) -> TaskState: # did not find metadata file in staging repo on GitHub return TaskState.NEW + @log_function_entry_exit() def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskState: """ Get the state from the file in the metadata_file_state_path. @@ -179,6 +185,7 @@ def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskSt except ValueError: return TaskState.NEW + @log_function_entry_exit() def _list_directory_contents(self, directory_path, branch=None): try: # Get contents of the directory @@ -195,6 +202,7 @@ def _list_directory_contents(self, directory_path, branch=None): raise FileNotFoundError(f"Directory not found: {directory_path}") raise err + @log_function_entry_exit() def handle(self): """ Dynamically find and execute the appropriate handler based on action and state. @@ -221,36 +229,42 @@ def handle(self): print(f"No handler for action {self.action} and state {self.state} implemented; nothing to be done") # Implement handlers for ADD action + @log_function_entry_exit() def _handle_add_new(self): """Handler for ADD action in NEW state""" print("Handling ADD action in NEW state") # Implementation for adding in NEW state return True + @log_function_entry_exit() def _handle_add_staged(self): """Handler for ADD action in STAGED state""" print("Handling ADD action in STAGED state") # Implementation for adding in STAGED state return True + @log_function_entry_exit() def _handle_add_pr_opened(self): """Handler for ADD action in PR_OPENED state""" print("Handling ADD action in PR_OPENED state") # Implementation for adding in PR_OPENED state return True + @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" print("Handling ADD action in APPROVED state") # Implementation for adding in APPROVED state return True + @log_function_entry_exit() def _handle_add_ingested(self): """Handler for ADD action in INGESTED state""" print("Handling ADD action in INGESTED state") # Implementation for adding in INGESTED state return True + @log_function_entry_exit() def transition_to(self, new_state: TaskState): """ Transition the task to a new state if valid. 
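        For example, with the transitions defined in __init__, a task in state NEW may move to STAGED, while a request such as NEW -> APPROVED is invalid and transition_to() returns False.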
@@ -260,5 +274,6 @@ def transition_to(self, new_state: TaskState): return True return False + @log_function_entry_exit() def __str__(self): return f"EESSITask(description={self.description}, action={self.action}, state={self.state})" From c28469283748251ab9019b2379573642e6254fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:24:25 +0200 Subject: [PATCH 065/218] change logging scope name and log info in _find_state --- scripts/automated_ingestion/eessi_task.py | 6 +++++ scripts/automated_ingestion/eessitarball.py | 26 ++++++++++----------- scripts/automated_ingestion/utils.py | 4 ++-- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index c3bc2acb..63c56f5f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -148,9 +148,11 @@ def _find_state(self) -> TaskState: # obtain repo and pr from metadata repo = self.description.metadata['task']['repo'] pr = self.description.metadata['task']['pr'] + log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) # iterate over all sequence numbers in repo/pr dir sequence_numbers = self._determine_sequence_numbers_including_task_file() + log_message(LoggingScope.TASK_OPS, 'INFO', "sequence_numbers: %s", sequence_numbers) for sequence_number in [key for key, value in sequence_numbers.items() if value]: # create path to metadata file from repo, PR, repo, sequence number, metadata file name, state name # format of the metadata file name is: @@ -164,12 +166,16 @@ def _find_state(self) -> TaskState: # Later, we may switch to using task action files instead of metadata files. The format of the # SUFFIX would then be defined by the task action or the configuration file. version, component, os, architecture, timestamp, suffix = self.description.get_metadata_file_components() + log_msg = "version: %s, component: %s, os: %s, architecture: %s, timestamp: %s, suffix: %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, version, component, os, architecture, timestamp, suffix) metadata_file_name = f"eessi-{version}-{component}-{os}-{architecture}-{timestamp}.{suffix}" metadata_file_state_path = f"{repo}/{pr}/{sequence_number}/{metadata_file_name}" # get the state from the file in the metadata_file_state_path state = self._get_state_from_metadata_file(metadata_file_state_path) + log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) return state # did not find metadata file in staging repo on GitHub + log_message(LoggingScope.TASK_OPS, 'INFO', "did not find metadata file in staging repo on GitHub, state: NEW") return TaskState.NEW @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessitarball.py b/scripts/automated_ingestion/eessitarball.py index cb4ae801..cc3c4ae4 100644 --- a/scripts/automated_ingestion/eessitarball.py +++ b/scripts/automated_ingestion/eessitarball.py @@ -106,7 +106,7 @@ def find_state(self): try: self.git_repo.get_contents(state + '/' + self.metadata_file) log_msg = "Found metadata file %s in state: %s" - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.metadata_file, state) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg, self.metadata_file, state) return state except github.UnknownObjectException: # no metadata file found in this state's directory, so keep searching... @@ -120,7 +120,7 @@ def find_state(self): log_msg = 'Unable to determine the state of %s, the GitHub API returned status %s!'
log_message(LoggingScope.ERROR, 'WARNING', log_msg, self.object, err.status) return "unknown" - log_message(LoggingScope.STATE_CHANGE, 'INFO', "Tarball %s is new", self.metadata_file) + log_message(LoggingScope.STATE_OPS, 'INFO', "Tarball %s is new", self.metadata_file) return "new" def get_contents_overview(self): @@ -282,7 +282,7 @@ def verify_checksum(self): def ingest(self): """Process a tarball that is ready to be ingested by running the ingestion script.""" # TODO: check if there is an open issue for this tarball, and if there is, skip it. - log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Tarball %s is ready to be ingested.', self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', 'Tarball %s is ready to be ingested.', self.object) self.download() log_message(LoggingScope.VERIFICATION, 'INFO', 'Verifying its signature...') if not self.verify_signatures(): @@ -308,7 +308,7 @@ def ingest(self): script = self.config['paths']['ingestion_script'] sudo = ['sudo'] if self.config['cvmfs'].getboolean('ingest_as_root', True) else [] - log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Running the ingestion script for %s...', self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', 'Running the ingestion script for %s...', self.object) ingest_cmd = subprocess.run( sudo + [script, self.cvmfs_repo, self.local_path], stdout=subprocess.PIPE, @@ -334,38 +334,38 @@ def ingest(self): ) if self.issue_exists(issue_title, state='open'): log_msg = 'Failed to ingest %s, but an open issue already exists, skipping...' - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg, self.object) else: self.git_repo.create_issue(title=issue_title, body=issue_body) def print_ingested(self): """Process a tarball that has already been ingested.""" - log_message(LoggingScope.STATE_CHANGE, 'INFO', '%s has already been ingested, skipping...', self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', '%s has already been ingested, skipping...', self.object) @log_function_entry_exit() def mark_new_tarball_as_staged(self, branch=None): """Process a new tarball that was added to the staging bucket.""" next_state = self.next_state(self.state) log_msg = 'Found new tarball %s, downloading it...' - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg, self.object) # Download the tarball and its metadata file. # Use force as it may be a new attempt for an existing tarball that failed before. self.download(force=True) if not self.local_path or not self.local_metadata_path: log_msg = "Skipping tarball %s - download failed" - log_message(LoggingScope.STATE_CHANGE, 'WARNING', log_msg, self.object) + log_message(LoggingScope.STATE_OPS, 'WARNING', log_msg, self.object) return # Verify the signatures of the tarball and metadata file. 
if not self.verify_signatures(): log_msg = "Skipping tarball %s - signature verification failed" - log_message(LoggingScope.STATE_CHANGE, 'WARNING', log_msg, self.object) + log_message(LoggingScope.STATE_OPS, 'WARNING', log_msg, self.object) return # If no branch is provided, use the main branch target_branch = branch if branch else 'main' log_msg = "Adding metadata to '%s' folder in %s branch" - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg, next_state, target_branch) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg, next_state, target_branch) file_path_staged = next_state + '/' + self.metadata_file contents = '' @@ -379,14 +379,14 @@ def print_rejected(self): def print_rejected(self): """Process a (rejected) tarball for which the corresponding PR has been closed without merging.""" - log_message(LoggingScope.STATE_CHANGE, 'INFO', "This tarball was rejected, so we're skipping it.") + log_message(LoggingScope.STATE_OPS, 'INFO', "This tarball was rejected, so we're skipping it.") # Do we want to delete rejected tarballs at some point? def print_unknown(self): """Process a tarball which has an unknown state.""" log_msg = "The state of this tarball could not be determined," log_msg += " so we're skipping it." - log_message(LoggingScope.STATE_CHANGE, 'INFO', log_msg) + log_message(LoggingScope.STATE_OPS, 'INFO', log_msg) def find_next_sequence_number(self, repo, pr_id): """Find the next available sequence number for staging PRs of a source PR.""" @@ -643,7 +643,7 @@ def extract_tarballs_from_pr_body(self, pr_body): def reject(self): """Reject a tarball for ingestion.""" # Let's move the tarball to the directory for rejected tarballs. - log_message(LoggingScope.STATE_CHANGE, 'INFO', 'Marking tarball %s as rejected...', self.object) + log_message(LoggingScope.STATE_OPS, 'INFO', 'Marking tarball %s as rejected...', self.object) next_state = 'rejected' self.move_metadata_file(self.state, next_state) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index ab1e2b2f..c5a80f0c 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -16,12 +16,12 @@ class LoggingScope(IntFlag): FUNC_ENTRY_EXIT = auto() # Function entry/exit logging DOWNLOAD = auto() # Logging related to file downloads VERIFICATION = auto() # Logging related to signature and checksum verification - STATE_CHANGE = auto() # Logging related to tarball state changes + STATE_OPS = auto() # Logging related to tarball state changes GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.)
GROUP_OPS = auto() # Logging related to tarball group operations ERROR = auto() # Error logging (separate from other scopes for easier filtering) DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) - ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_CHANGE | + ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS | GITHUB_OPS | GROUP_OPS | ERROR | DEBUG) From fbfc1ab44c6c84fdfb0655dbc6af584c457200c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:27:22 +0200 Subject: [PATCH 066/218] add missing scope for task ops --- scripts/automated_ingestion/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index c5a80f0c..32471fcd 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -16,13 +16,14 @@ class LoggingScope(IntFlag): FUNC_ENTRY_EXIT = auto() # Function entry/exit logging DOWNLOAD = auto() # Logging related to file downloads VERIFICATION = auto() # Logging related to signature and checksum verification - STATE_OPS = auto() # Logging related to tarball state changes + STATE_OPS = auto() # Logging related to tarball state operations GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) GROUP_OPS = auto() # Logging related to tarball group operations + TASK_OPS = auto() # Logging related to task operations ERROR = auto() # Error logging (separate from other scopes for easier filtering) DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS | - GITHUB_OPS | GROUP_OPS | ERROR | DEBUG) + GITHUB_OPS | GROUP_OPS | TASK_OPS | ERROR | DEBUG) # Global setting for logging scopes From 28a4745565b1864f4fecda3b64b84376bb02a7c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:32:18 +0200 Subject: [PATCH 067/218] add a bit more logging --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 63c56f5f..b4c0ea00 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -146,6 +146,7 @@ def _find_state(self) -> TaskState: The state of the task. 
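        The state is determined from the task's metadata file in the staging repository on GitHub; if no such file is found there, the task is considered NEW.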
""" # obtain repo and pr from metadata + log_message(LoggingScope.TASK_OPS, 'INFO', "finding state of task %s", self.description.task_object) repo = self.description.metadata['task']['repo'] pr = self.description.metadata['task']['pr'] log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) From d2b275d863385033a4aff67d19f8b070e7032118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:38:40 +0200 Subject: [PATCH 068/218] obtain repo/pr from task or link2pr OR raise ValueError --- scripts/automated_ingestion/eessi_task.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b4c0ea00..b6d4f162 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -147,8 +147,16 @@ def _find_state(self) -> TaskState: """ # obtain repo and pr from metadata log_message(LoggingScope.TASK_OPS, 'INFO', "finding state of task %s", self.description.task_object) - repo = self.description.metadata['task']['repo'] - pr = self.description.metadata['task']['pr'] + task = self.description.metadata['task'] if 'task' in self.description.metadata else None + link2pr = self.description.metadata['link2pr'] if 'link2pr' in self.description.metadata else None + if task: + repo = task['repo'] + pr = task['pr'] + elif link2pr: + repo = link2pr['repo'] + pr = link2pr['pr'] + else: + raise ValueError("no repo or pr found in metadata") log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) # iterate over all sequence numbers in repo/pr dir From ad7cb11c4eda34aa93375dda497586d48bea3c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:40:51 +0200 Subject: [PATCH 069/218] add logging for obtaining repo/pr from metadata --- scripts/automated_ingestion/eessi_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b6d4f162..05a571d3 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -150,9 +150,11 @@ def _find_state(self) -> TaskState: task = self.description.metadata['task'] if 'task' in self.description.metadata else None link2pr = self.description.metadata['link2pr'] if 'link2pr' in self.description.metadata else None if task: + log_message(LoggingScope.TASK_OPS, 'INFO', "task found in metadata: %s", task) repo = task['repo'] pr = task['pr'] elif link2pr: + log_message(LoggingScope.TASK_OPS, 'INFO', "link2pr found in metadata: %s", link2pr) repo = link2pr['repo'] pr = link2pr['pr'] else: From 11a23a0aad23195335416868022e278046e7df62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:53:03 +0200 Subject: [PATCH 070/218] populate source from metadata (link2pr) and use it in _find_state --- scripts/automated_ingestion/eessi_task.py | 14 +++++++------- .../automated_ingestion/eessi_task_description.py | 12 ++++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 05a571d3..626f3fb4 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -147,16 +147,16 @@ def _find_state(self) -> TaskState: """ # obtain repo and pr from metadata log_message(LoggingScope.TASK_OPS, 'INFO', "finding state of task %s", 
self.description.task_object) - task = self.description.metadata['task'] if 'task' in self.description.metadata else None - link2pr = self.description.metadata['link2pr'] if 'link2pr' in self.description.metadata else None - if task: + task = self.description.task + source = self.description.source + if 'repo' in task and 'pr' in task: log_message(LoggingScope.TASK_OPS, 'INFO', "task found in metadata: %s", task) repo = task['repo'] pr = task['pr'] - elif link2pr: - log_message(LoggingScope.TASK_OPS, 'INFO', "link2pr found in metadata: %s", link2pr) - repo = link2pr['repo'] - pr = link2pr['pr'] + elif 'repo' in source and 'pr' in source: + log_message(LoggingScope.TASK_OPS, 'INFO', "link2pr found in metadata: %s", source) + repo = source['repo'] + pr = source['pr'] else: raise ValueError("no repo or pr found in metadata") log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 271ff9a9..43da6139 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -21,6 +21,12 @@ class EESSITaskDescription: # Metadata from the task description file metadata: Dict[str, Any] = None + # task element + task: Dict[str, Any] = None + + # source element + source: Dict[str, Any] = None + @log_function_entry_exit() def __init__(self, task_object: EESSIDataAndSignatureObject): """ @@ -51,6 +57,12 @@ def __init__(self, task_object: EESSIDataAndSignatureObject): else: self.task = None + # check if the task file contains a link2pr field and add that to source element + if 'link2pr' in self.metadata: + self.source = self.metadata['link2pr'] + else: + self.source = None + @log_function_entry_exit() def _read_metadata(self) -> None: """ From 52d7bd388c64973ae6cc6ab2f1f32a8ecff4c672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 21:57:16 +0200 Subject: [PATCH 071/218] hand over repo and pr number as argument --- scripts/automated_ingestion/eessi_task.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 626f3fb4..f07308d5 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -99,10 +99,14 @@ def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: return False @log_function_entry_exit() - def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: + def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> Dict[int, bool]: """ Determines in which sequence numbers the metadata/task file is included and in which it is not. + Args: + repo: the repository name + pr: the pull request number + Returns: A dictionary with the sequence numbers as keys and a boolean value indicating if the metadata/task file is included in that sequence number. @@ -120,8 +124,6 @@ def _determine_sequence_numbers_including_task_file(self) -> Dict[int, bool]: Note: this is a placeholder for now, as we do not know yet if we need to use a sequence number. 
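        Example (illustrative): if the staging repo contains directories REPO/PR/0 and REPO/PR/1 and only directory 1 contains this task's file, the result is {0: False, 1: True}.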
""" sequence_numbers = {} - repo = self.description.metadata['task']['repo'] - pr = self.description.metadata['task']['pr'] repo_pr_dir = f"{repo}/{pr}" # iterate over all directories under repo_pr_dir for dir in self._list_directory_contents(repo_pr_dir): @@ -154,7 +156,7 @@ def _find_state(self) -> TaskState: repo = task['repo'] pr = task['pr'] elif 'repo' in source and 'pr' in source: - log_message(LoggingScope.TASK_OPS, 'INFO', "link2pr found in metadata: %s", source) + log_message(LoggingScope.TASK_OPS, 'INFO', "source found in metadata: %s", source) repo = source['repo'] pr = source['pr'] else: @@ -162,7 +164,7 @@ def _find_state(self) -> TaskState: log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) # iterate over all sequence numbers in repo/pr dir - sequence_numbers = self._determine_sequence_numbers_including_task_file() + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) log_message(LoggingScope.TASK_OPS, 'INFO', "sequence_numbers: %s", sequence_numbers) for sequence_number in [key for key, value in sequence_numbers.items() if value]: # create path to metadata file from repo, PR, repo, sequence number, metadata file name, state name From d0be2eb3fe5f684fd32c5f210d838224885c3b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:01:12 +0200 Subject: [PATCH 072/218] fix Github exception issues --- scripts/automated_ingestion/eessi_task.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index f07308d5..8173926e 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -3,7 +3,7 @@ from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription from utils import log_message, LoggingScope, log_function_entry_exit -from github import Github +from github import Github, GithubException, UnknownObjectException class TaskState(Enum): @@ -84,10 +84,10 @@ def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: log_msg = "Found file %s in branch %s" log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path, branch) return True - except Github.UnknownObjectException: + except UnknownObjectException: # file_path does not exist in branch return False - except Github.GithubException as err: + except GithubException as err: if err.status == 404: # file_path does not exist in branch return False @@ -216,7 +216,7 @@ def _list_directory_contents(self, directory_path, branch=None): else: # If it's not a list, it means the path is not a directory raise ValueError(f"{directory_path} is not a directory") - except Github.GithubException as err: + except GithubException as err: if err.status == 404: raise FileNotFoundError(f"Directory not found: {directory_path}") raise err From 37ea68409f1cd5c1ef83ac76a52419be30216165 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:06:36 +0200 Subject: [PATCH 073/218] add logging when listing dir contents --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8173926e..8eb9df08 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -208,6 +208,7 @@ def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskSt def 
_list_directory_contents(self, directory_path, branch=None): try: # Get contents of the directory + log_message(LoggingScope.TASK_OPS, 'INFO', "listing contents of %s in branch %s", directory_path, branch) contents = self.git_repo.get_contents(directory_path, ref=branch) From f170a257676d4b52c742f9348df9b5be60e3dcc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:08:27 +0200 Subject: [PATCH 074/218] set branch to default if needed --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8eb9df08..4a4d5322 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -208,6 +208,7 @@ def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskSt def _list_directory_contents(self, directory_path, branch=None): try: # Get contents of the directory + branch = self.git_repo.default_branch if branch is None else branch log_message(LoggingScope.TASK_OPS, 'INFO', "listing contents of %s in branch %s", directory_path, branch) contents = self.git_repo.get_contents(directory_path, ref=branch) From a795710c371236c8dbfaae28bd5786b64cdaf8eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:15:45 +0200 Subject: [PATCH 075/218] handle file not found exception --- scripts/automated_ingestion/eessi_task.py | 29 +++++++++++++++-------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 4a4d5322..7d004707 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -126,17 +126,26 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> sequence_numbers = {} repo_pr_dir = f"{repo}/{pr}" # iterate over all directories under repo_pr_dir - for dir in self._list_directory_contents(repo_pr_dir): - # check if the directory is a number - if dir.name.isdigit(): - remote_file_path = self.description.task_object.remote_file_path - if self._file_exists_in_repo_branch(f"{repo_pr_dir}/{dir.name}/{remote_file_path}"): - sequence_numbers[int(dir.name)] = True + try: + directories = self._list_directory_contents(repo_pr_dir) + for dir in directories: + # check if the directory is a number + if dir.name.isdigit(): + remote_file_path = self.description.task_object.remote_file_path + if self._file_exists_in_repo_branch(f"{repo_pr_dir}/{dir.name}/{remote_file_path}"): + sequence_numbers[int(dir.name)] = True + else: + sequence_numbers[int(dir.name)] = False else: - sequence_numbers[int(dir.name)] = False - else: - # directory is not a number, so we skip it - continue + # directory is not a number, so we skip it + continue + except FileNotFoundError: + # repo_pr_dir does not exist, so we return an empty dictionary + return {} + except GithubException as err: + if err.status != 404: # 404 is caught by FileNotFoundError + # some other error than the directory not existing + return {} return sequence_numbers @log_function_entry_exit() From c63208f1b653eeef36dd6ab1b82c1c6697dcd501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 11 May 2025 22:20:26 +0200 Subject: [PATCH 076/218] fix element naming --- scripts/automated_ingestion/automated_ingestion.py | 8 ++++---- 1 file changed, 4
insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 12429e4a..7ece86cc 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -267,17 +267,17 @@ def main(): # TODO: update the information shown below (what makes sense to show?) # Log information about the task - task_object = task.task_description.task_object + task_object = task.description.task_object log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_object.local_file_path) log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_object.local_sig_path) log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", - task.task_description.signature_verified) + task.description.signature_verified) # Log the ETags of the downloaded task file - file_etag, sig_etag = task_object.get_etags() + file_etag, sig_etag = task.description.task_object.get_etags() log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", - task_object.remote_sig_path, sig_etag) + task.description.task_object.remote_sig_path, sig_etag) # TODO: Process the task file contents # This would involve reading the task file, parsing its contents, From 2c0673f9631d47b7aa9f6287a544e93a1099dcda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 19:45:33 +0200 Subject: [PATCH 077/218] first step towards obtaining payload when handling new task --- scripts/automated_ingestion/eessi_task.py | 18 +++++++++ .../automated_ingestion/eessi_task_payload.py | 40 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 scripts/automated_ingestion/eessi_task_payload.py diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7d004707..802cdc4d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,9 +1,12 @@ from enum import Enum, auto from typing import Dict +from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription +from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit from github import Github, GithubException, UnknownObjectException +import os class TaskState(Enum): @@ -33,6 +36,7 @@ def __str__(self): class EESSITask: description: EESSITaskDescription + payload: EESSITaskPayload action: EESSITaskAction state: TaskState git_repo: Github @@ -264,6 +268,20 @@ def _handle_add_new(self): """Handler for ADD action in NEW state""" print("Handling ADD action in NEW state") # Implementation for adding in NEW state + # get name of payload from metadata + payload_name = self.description.metadata['payload']['filename'] + # get config and remote_client from self.description.task_object + config = self.description.task_object.config + remote_client = self.description.task_object.remote_client + # determine remote_file_path by replacing basename of remote_file_path in self.description.task_object + # with payload_name + description_remote_file_path = self.description.task_object.remote_file_path + payload_remote_file_path = os.path.join(os.path.dirname(description_remote_file_path), payload_name) + # initialize payload object + payload_object = 
EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) + self.payload = EESSITaskPayload(payload_object) + log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) + return True @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py new file mode 100644 index 00000000..bba630fe --- /dev/null +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -0,0 +1,40 @@ +from dataclasses import dataclass + +from eessi_data_object import EESSIDataAndSignatureObject +from utils import log_function_entry_exit +from remote_storage import DownloadMode + + +@dataclass +class EESSITaskPayload: + """Class representing an EESSI task payload (tarball/artifact) and its signature.""" + + # The EESSI data and signature object associated with this payload + payload_object: EESSIDataAndSignatureObject + + # Whether the signature was successfully verified + signature_verified: bool = False + + # possibly at a later point in time, we will add inferred metadata here + # such as the prefix in a tarball, the main elements, or which software + # package it includes + + @log_function_entry_exit() + def __init__(self, payload_object: EESSIDataAndSignatureObject): + """ + Initialize an EESSITaskPayload object. + + Args: + payload_object: The EESSI data and signature object associated with this payload + """ + self.payload_object = payload_object + + # Download the payload and its signature + self.payload_object.download(mode=DownloadMode.CHECK_REMOTE) + + # Verify signature + self.signature_verified = self.payload_object.verify_signature() + + def __str__(self) -> str: + """Return a string representation of the EESSITaskPayload object.""" + return f"EESSITaskPayload({self.payload_object.local_file_path}, verified={self.signature_verified})" From 7bcf5eb724808dac1bcbc7f64b1786e1f9f65d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 19:46:46 +0200 Subject: [PATCH 078/218] add a bit of logging when creating payload instance --- scripts/automated_ingestion/eessi_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 802cdc4d..8271c39f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -270,6 +270,7 @@ def _handle_add_new(self): # Implementation for adding in NEW state # get name of payload from metadata payload_name = self.description.metadata['payload']['filename'] + log_message(LoggingScope.TASK_OPS, 'INFO', "payload_name: %s", payload_name) # get config and remote_client from self.description.task_object config = self.description.task_object.config remote_client = self.description.task_object.remote_client @@ -277,6 +278,7 @@ def _handle_add_new(self): # with payload_name description_remote_file_path = self.description.task_object.remote_file_path payload_remote_file_path = os.path.join(os.path.dirname(description_remote_file_path), payload_name) + log_message(LoggingScope.TASK_OPS, 'INFO', "payload_remote_file_path: %s", payload_remote_file_path) # initialize payload object payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) self.payload = EESSITaskPayload(payload_object) From 795da23839c29e1eae4d3186b127b8dd98bbeeed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 19:54:38 +0200 Subject: [PATCH 079/218] code formatting
improvements --- scripts/automated_ingestion/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/utils.py b/scripts/automated_ingestion/utils.py index 32471fcd..4b867764 100644 --- a/scripts/automated_ingestion/utils.py +++ b/scripts/automated_ingestion/utils.py @@ -16,12 +16,12 @@ class LoggingScope(IntFlag): FUNC_ENTRY_EXIT = auto() # Function entry/exit logging DOWNLOAD = auto() # Logging related to file downloads VERIFICATION = auto() # Logging related to signature and checksum verification - STATE_OPS = auto() # Logging related to tarball state operations + STATE_OPS = auto() # Logging related to tarball state operations GITHUB_OPS = auto() # Logging related to GitHub operations (PRs, issues, etc.) GROUP_OPS = auto() # Logging related to tarball group operations - TASK_OPS = auto() # Logging related to task operations - ERROR = auto() # Error logging (separate from other scopes for easier filtering) - DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) + TASK_OPS = auto() # Logging related to task operations + ERROR = auto() # Error logging (separate from other scopes for easier filtering) + DEBUG = auto() # Debug-level logging (separate from other scopes for easier filtering) ALL = (FUNC_ENTRY_EXIT | DOWNLOAD | VERIFICATION | STATE_OPS | GITHUB_OPS | GROUP_OPS | TASK_OPS | ERROR | DEBUG) From 4f148240b16c02d45fe4f11b665c17e1c2a86b6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 20:13:40 +0200 Subject: [PATCH 080/218] build up path to store task file in staging repo --- scripts/automated_ingestion/eessi_task.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8271c39f..3e6aa48a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -48,6 +48,7 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): self.action = self._determine_task_action() # Define valid state transitions for all actions + # NOTE, TaskState.APPROVED must be the first element or _next_state() will not work self.valid_transitions = { TaskState.NEW: [TaskState.STAGED], TaskState.STAGED: [TaskState.PR_OPENED], @@ -236,6 +237,16 @@ def _list_directory_contents(self, directory_path, branch=None): raise FileNotFoundError(f"Directory not found: {directory_path}") raise err + @log_function_entry_exit() + def _next_state(self) -> TaskState: + """ + Determine the next state based on the current state using the valid_transitions dictionary. + + NOTE, it assumes that function is only called for non-terminal states and that the next state is the first + element of the list returned by the valid_transitions dictionary. 
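+        For example, NEW -> STAGED and STAGED -> PR_OPENED; where a state has several valid successors, the first one listed is returned (hence the NOTE in __init__ that TaskState.APPROVED must come first in its list).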
+ """ + return self.valid_transitions[self.state][0] + @log_function_entry_exit() def handle(self): """ @@ -283,7 +294,13 @@ def _handle_add_new(self): payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) - + # determine next state (NEXT_STATE), put metadata/task file into GH staging repo in main branch under directory + # REPO/PR_NUM/SEQ_NUM/payload_name.NEXT_STATE + next_state = self._next_state() + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + repo_pr_dir = f"{self.description.task_object.repo}/{self.description.task_object.pr}" + staging_repo_path = f"{repo_pr_dir}/{payload_name}.{next_state}" + log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) return True @log_function_entry_exit() From 14762ef1c7aeb96814d1853e7f84ece15216e24a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 21:48:32 +0200 Subject: [PATCH 081/218] add functions to determine repo name and pr number --- .../eessi_task_description.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 43da6139..4e5d638e 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -88,6 +88,7 @@ def _read_metadata(self) -> None: self.task_object.local_file_path, str(err)) raise + @log_function_entry_exit() def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: """ Get the components of the metadata file name. @@ -123,6 +124,27 @@ def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: timestamp = components[-1] return version, component, os, architecture, timestamp, suffix + @log_function_entry_exit() + def get_pr_number(self) -> str: + """ + Get the PR number from the task description / metadata file. + """ + if self.source and 'pr' in self.source: + return self.source['pr'] + else: + return '0' + + @log_function_entry_exit() + def get_repo_name(self) -> str: + """ + Get the repository name from the task description / metadata file. 
+ """ + if self.source and 'repo' in self.source: + return self.source['repo'] + else: + return 'None' + + @log_function_entry_exit() def __str__(self) -> str: """Return a string representation of the EESSITaskDescription object.""" return f"EESSITaskDescription({self.task_object.local_file_path}, verified={self.signature_verified})" From ffae1cfc75f9d0ff0b5ec66202ea13148ba07a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 21:50:56 +0200 Subject: [PATCH 082/218] use functions to determine repo name and pr number --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 3e6aa48a..b45c176e 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -298,7 +298,7 @@ def _handle_add_new(self): # REPO/PR_NUM/SEQ_NUM/payload_name.NEXT_STATE next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) - repo_pr_dir = f"{self.description.task_object.repo}/{self.description.task_object.pr}" + repo_pr_dir = f"{self.description.get_repo_name()}/{self.description.get_pr_number()}" staging_repo_path = f"{repo_pr_dir}/{payload_name}.{next_state}" log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) return True From 066ad16674eba6cac4ab0c60af040ec6fb7b82f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 16 May 2025 23:55:52 +0200 Subject: [PATCH 083/218] add metadata / task file to GH staging repo --- scripts/automated_ingestion/eessi_task.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b45c176e..ae48a14c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -298,9 +298,15 @@ def _handle_add_new(self): # REPO/PR_NUM/SEQ_NUM/payload_name.NEXT_STATE next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) - repo_pr_dir = f"{self.description.get_repo_name()}/{self.description.get_pr_number()}" + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + repo_pr_dir = f"{repo_name}/{pr_number}" staging_repo_path = f"{repo_pr_dir}/{payload_name}.{next_state}" log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) + # contents of task description / metadata file + contents = self.description.get_contents() + self.git_repo.create_file(staging_repo_path, f"new task for {repo_name} PR {pr_number} add build for arch" , + contents) return True @log_function_entry_exit() From d671c827f8c9a8fa07f8adf9763f3606aa493b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 00:22:13 +0200 Subject: [PATCH 084/218] add function to return raw contents of metadata / task file --- scripts/automated_ingestion/eessi_task_description.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 4e5d638e..686f1b90 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -76,7 +76,8 @@ def _read_metadata(self) -> None: try: with open(self.task_object.local_file_path, 'r') as 
file: - self.metadata = json.load(file) + self.raw_contents = file.read() + self.metadata = json.loads(self.raw_contents) log_message(LoggingScope.DEBUG, 'DEBUG', "Successfully read metadata from %s", self.task_object.local_file_path) except json.JSONDecodeError as err: @@ -88,6 +89,13 @@ def _read_metadata(self) -> None: self.task_object.local_file_path, str(err)) raise + @log_function_entry_exit() + def get_contents(self) -> str: + """ + Get the contents of the task description / metadata file. + """ + return self.raw_contents + @log_function_entry_exit() def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: """ From 14d62e5ec896ca364bd5bac6edb4b6b0a6f61d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 09:39:04 +0200 Subject: [PATCH 085/218] include sequence number in path and various improvements --- scripts/automated_ingestion/eessi_task.py | 135 +++++++++++------- .../eessi_task_description.py | 45 ++++-- 2 files changed, 117 insertions(+), 63 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ae48a14c..352b7dae 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,5 +1,5 @@ from enum import Enum, auto -from typing import Dict +from typing import Dict, List from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription @@ -78,23 +78,44 @@ def _determine_task_action(self) -> EESSITaskAction: return EESSITaskAction.UNKNOWN @log_function_entry_exit() - def _file_exists_in_repo_branch(self, file_path, branch=None) -> bool: + def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch=None) -> bool: """ Check if a file exists in a repository branch. 
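+        More precisely, it checks whether any file in the directory part of file_path_prefix has a path that starts with the given prefix.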
+ + Args: + file_path_prefix: the prefix of the file path + branch: the branch to check + + Returns: + True if a file with the prefix exists in the branch, False otherwise """ if branch is None: branch = self.git_repo.default_branch try: - self.git_repo.get_contents(file_path, ref=branch) - log_msg = "Found file %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path, branch) - return True + # get all files in directory part of file_path_prefix + directory_part = os.path.dirname(file_path_prefix) + files = self.git_repo.get_contents(directory_part, ref=branch) + log_msg = "Found files %s in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch) + # check if any of the files has file_path_prefix as prefix + for file in files: + if file.path.startswith(file_path_prefix): + log_msg = "Found file %s in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch) + return True + log_msg = "No file with prefix %s found in directory %s in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch) + return False except UnknownObjectException: # file_path does not exist in branch + log_msg = "Directory %s or file with prefix %s does not exist in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch) return False except GithubException as err: if err.status == 404: # file_path does not exist in branch + log_msg = "Directory %s or file with prefix %s does not exist in branch %s" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch) return False else: # if there was some other (e.g. connection) issue, log message and return False @@ -136,8 +157,10 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> for dir in directories: # check if the directory is a number if dir.name.isdigit(): + # determin if a state file with prefix exists in the sequence number directory remote_file_path = self.description.task_object.remote_file_path - if self._file_exists_in_repo_branch(f"{repo_pr_dir}/{dir.name}/{remote_file_path}"): + state_file_name_prefix = f"{repo_pr_dir}/{dir.name}/{remote_file_path}" + if self._state_file_with_prefix_exists_in_repo_branch(state_file_name_prefix): sequence_numbers[int(dir.name)] = True else: sequence_numbers[int(dir.name)] = False @@ -153,6 +176,15 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> return {} return sequence_numbers + @log_function_entry_exit() + def _find_highest_number(self, str_list: List[str]) -> int: + """ + Find the highest number in a list of strings. 
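+        Example: _find_highest_number(["0", "2", "10"]) returns 10.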
+ """ + # Convert all strings to integers + int_list = [int(num) for num in str_list] + return max(int_list) + @log_function_entry_exit() def _find_state(self) -> TaskState: """ @@ -163,60 +195,46 @@ def _find_state(self) -> TaskState: """ # obtain repo and pr from metadata log_message(LoggingScope.TASK_OPS, 'INFO', "finding state of task %s", self.description.task_object) - task = self.description.task - source = self.description.source - if 'repo' in task and 'pr' in task: - log_message(LoggingScope.TASK_OPS, 'INFO', "task found in metadata: %s", task) - repo = task['repo'] - pr = task['pr'] - elif 'repo' in source and 'pr' in source: - log_message(LoggingScope.TASK_OPS, 'INFO', "source found in metadata: %s", source) - repo = source['repo'] - pr = source['pr'] - else: - raise ValueError("no repo or pr found in metadata") + repo = self.description.get_repo_name() + pr = self.description.get_pr_number() log_message(LoggingScope.TASK_OPS, 'INFO', "repo: %s, pr: %s", repo, pr) - # iterate over all sequence numbers in repo/pr dir + # obtain all sequence numbers in repo/pr dir which include a state file for this task sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) - log_message(LoggingScope.TASK_OPS, 'INFO', "sequence_numbers: %s", sequence_numbers) - for sequence_number in [key for key, value in sequence_numbers.items() if value]: - # create path to metadata file from repo, PR, repo, sequence number, metadata file name, state name - # format of the metadata file name is: - # eessi-VERSION-COMPONENT-OS-ARCHITECTURE-TIMESTAMP.SUFFIX - # all uppercase words are placeholders - # all placeholders (except ARCHITECTURE) do not include any hyphens - # ARCHITECTURE can include one to two hyphens - # The SUFFIX is composed of two parts: TARBALLSUFFIX and METADATASUFFIX - # TARBALLSUFFIX is defined by the task object or in the configuration file - # METADATASUFFIX is defined by the task object or in the configuration file - # Later, we may switch to using task action files instead of metadata files. The format of the - # SUFFIX would then be defined by the task action or the configuration file. - version, component, os, architecture, timestamp, suffix = self.description.get_metadata_file_components() - log_msg = "version: %s, component: %s, os: %s, architecture: %s, timestamp: %s, suffix: %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, version, component, os, architecture, timestamp, suffix) - metadata_file_name = f"eessi-{version}-{component}-{os}-{architecture}-{timestamp}.{suffix}" - metadata_file_state_path = f"{repo}/{pr}/{sequence_number}/{metadata_file_name}" - # get the state from the file in the metadata_file_state_path - state = self._get_state_from_metadata_file(metadata_file_state_path) - log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) - return state - # did not find metadata file in staging repo on GitHub - log_message(LoggingScope.TASK_OPS, 'INFO', "did not find metadata file in staging repo on GitHub, state: NEW") - return TaskState.NEW + if len(sequence_numbers) == 0: + # no sequence numbers found, so we return NEW + log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence numbers found, state: NEW") + return TaskState.NEW + # because a new sequence number is only created after the previous staging PR has been approved or rejected, + # we need to check if the processing of the highest sequence number is finished. 
+ highest_sequence_number = self._find_highest_number(sequence_numbers.keys()) + # we obtain the state from the file in the highest sequence number directory + # TODO: verify if the state matches other information, e.g. the state of the staging PR + # for now, we assume that the state is correct + task_file_name = self.description.get_task_file_name() + metadata_file_state_path_prefix = f"{repo}/{pr}/{highest_sequence_number}/{task_file_name}." + state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix) + log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) + return state @log_function_entry_exit() - def _get_state_from_metadata_file(self, metadata_file_state_path: str) -> TaskState: + def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str) -> TaskState: """ - Get the state from the file in the metadata_file_state_path. + Get the state from the file in the metadata_file_state_path_prefix. """ - # get contents of metadata_file_state_path - contents = self.git_repo.get_contents(metadata_file_state_path) - try: - state = TaskState.from_string(contents.name) + # first get all files in directory part of metadata_file_state_path_prefix + directory_part = os.path.dirname(metadata_file_state_path_prefix) + files = self._list_directory_contents(directory_part) + # check if any of the files has metadata_file_state_path_prefix as prefix + for file in files: + if file.path.startswith(metadata_file_state_path_prefix): + # get state from file name taking only the suffix + state = TaskState.from_string(file.name.split('.')[-1]) return state - except ValueError: - return TaskState.NEW + # did not find any file with metadata_file_state_path_prefix as prefix + log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s", + metadata_file_state_path_prefix) + return TaskState.NEW @log_function_entry_exit() def _list_directory_contents(self, directory_path, branch=None): @@ -301,12 +319,19 @@ def _handle_add_new(self): repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() repo_pr_dir = f"{repo_name}/{pr_number}" - staging_repo_path = f"{repo_pr_dir}/{payload_name}.{next_state}" + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) + if len(sequence_numbers) == 0: + sequence_number = 0 + else: + sequence_number = self._find_highest_number(sequence_numbers.keys()) + staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{payload_name}.{next_state}" log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) # contents of task description / metadata file contents = self.description.get_contents() - self.git_repo.create_file(staging_repo_path, f"new task for {repo_name} PR {pr_number} add build for arch" , + self.git_repo.create_file(staging_repo_path, + f"new task for {repo_name} PR {pr_number} seq {sequence_number}: add build for arch", contents) + self.state = next_state return True @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 686f1b90..c8e627ab 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -132,25 +132,54 @@ def get_metadata_file_components(self) -> Tuple[str, str, str, str, str, str]: timestamp = components[-1] return version, component, os, architecture, timestamp, suffix + @log_function_entry_exit() + def get_metadata_value(self, 
key: str) -> str: + """ + Get the value of a key from the task description / metadata file. + """ + # check that key is defined and has a length > 0 + if not key or len(key) == 0: + raise ValueError("get_metadata_value: key is not defined or has a length of 0") + + value = None + task = self.description.task + source = self.description.source + if task and 'repo' in task and key in task['repo']: + value = task['repo'][key] + log_message(LoggingScope.TASK_OPS, 'INFO', + f"Value '{value}' for key {key} found in information from task metadata: {task}") + elif source and 'repo' in source and key in source['repo']: + value = source['repo'][key] + log_message(LoggingScope.TASK_OPS, 'INFO', + f"Value '{value}' for key {key} found in information from source metadata: {source}") + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + f"Value '{value}' for key {key} neither found in task metadata nor source metadata") + raise ValueError(f"Value '{value}' for key {key} neither found in task metadata nor source metadata") + return value + @log_function_entry_exit() def get_pr_number(self) -> str: """ Get the PR number from the task description / metadata file. """ - if self.source and 'pr' in self.source: - return self.source['pr'] - else: - return '0' + return self.get_metadata_value('pr') @log_function_entry_exit() def get_repo_name(self) -> str: """ Get the repository name from the task description / metadata file. """ - if self.source and 'repo' in self.source: - return self.source['repo'] - else: - return 'None' + return self.get_metadata_value('repo') + + @log_function_entry_exit() + def get_task_file_name(self) -> str: + """ + Get the file name from the task description / metadata file. + """ + # get file name from remote file path using basename + file_name = Path(self.task_object.remote_file_path).name + return file_name @log_function_entry_exit() def __str__(self) -> str: From 2357abc425f7a0ed9cb73b2b6b336d54ecc404f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 09:51:56 +0200 Subject: [PATCH 086/218] fix issue with non-existing class element --- scripts/automated_ingestion/eessi_task_description.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index c8e627ab..f847c29e 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -142,8 +142,8 @@ def get_metadata_value(self, key: str) -> str: raise ValueError("get_metadata_value: key is not defined or has a length of 0") value = None - task = self.description.task - source = self.description.source + task = self.task + source = self.source if task and 'repo' in task and key in task['repo']: value = task['repo'][key] log_message(LoggingScope.TASK_OPS, 'INFO', From 370b6a6759fcb31d077dffd9361277edb79828f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 10:07:25 +0200 Subject: [PATCH 087/218] add a bit more logging when obtaining value from metadata --- scripts/automated_ingestion/eessi_task_description.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index f847c29e..3bababe5 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -144,17 +144,19 @@ def 
get_metadata_value(self, key: str) -> str: value = None task = self.task source = self.source + log_message(LoggingScope.TASK_OPS, 'INFO', + f"checking if either task ({task}) or source ({source}) contains information for key '{key}'") if task and 'repo' in task and key in task['repo']: value = task['repo'][key] log_message(LoggingScope.TASK_OPS, 'INFO', - f"Value '{value}' for key {key} found in information from task metadata: {task}") + f"Value '{value}' for key '{key}' found in information from task metadata: {task}") elif source and 'repo' in source and key in source['repo']: value = source['repo'][key] log_message(LoggingScope.TASK_OPS, 'INFO', - f"Value '{value}' for key {key} found in information from source metadata: {source}") + f"Value '{value}' for key '{key}' found in information from source metadata: {source}") else: log_message(LoggingScope.TASK_OPS, 'INFO', - f"Value '{value}' for key {key} neither found in task metadata nor source metadata") + f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") raise ValueError(f"Value '{value}' for key {key} neither found in task metadata nor source metadata") return value From 6dcb675e64239c86e67233ac2ba25577d68c3765 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 10:14:04 +0200 Subject: [PATCH 088/218] show data types of task and source elements --- scripts/automated_ingestion/eessi_task_description.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 3bababe5..91691223 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -144,8 +144,9 @@ def get_metadata_value(self, key: str) -> str: value = None task = self.task source = self.source - log_message(LoggingScope.TASK_OPS, 'INFO', - f"checking if either task ({task}) or source ({source}) contains information for key '{key}'") + log_msg = f"checking if either task ({task}, type {type(task)}) or" + log_msg += f" source ({source}, type {type(source)}) contains information for key '{key}'" + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg) if task and 'repo' in task and key in task['repo']: value = task['repo'][key] log_message(LoggingScope.TASK_OPS, 'INFO', @@ -157,7 +158,7 @@ def get_metadata_value(self, key: str) -> str: else: log_message(LoggingScope.TASK_OPS, 'INFO', f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") - raise ValueError(f"Value '{value}' for key {key} neither found in task metadata nor source metadata") + raise ValueError(f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") return value @log_function_entry_exit() From 057a9535ccb134ad9efc556f28eeb0f4321686a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 10:18:45 +0200 Subject: [PATCH 089/218] fix logic to obtain value for key from task or source --- .../eessi_task_description.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task_description.py b/scripts/automated_ingestion/eessi_task_description.py index 91691223..5ff4c196 100644 --- a/scripts/automated_ingestion/eessi_task_description.py +++ b/scripts/automated_ingestion/eessi_task_description.py @@ -144,21 +144,19 @@ def get_metadata_value(self, key: str) -> str: value = None task = self.task source = 
self.source - log_msg = f"checking if either task ({task}, type {type(task)}) or" - log_msg += f" source ({source}, type {type(source)}) contains information for key '{key}'" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg) - if task and 'repo' in task and key in task['repo']: - value = task['repo'][key] + # check if key is in task or source + if task and key in task: + value = task[key] log_message(LoggingScope.TASK_OPS, 'INFO', f"Value '{value}' for key '{key}' found in information from task metadata: {task}") - elif source and 'repo' in source and key in source['repo']: - value = source['repo'][key] + elif source and key in source: + value = source[key] log_message(LoggingScope.TASK_OPS, 'INFO', f"Value '{value}' for key '{key}' found in information from source metadata: {source}") else: log_message(LoggingScope.TASK_OPS, 'INFO', - f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") - raise ValueError(f"Value '{value}' for key '{key}' neither found in task metadata nor source metadata") + f"Value for key '{key}' neither found in task metadata nor source metadata") + raise ValueError(f"Value for key '{key}' neither found in task metadata nor source metadata") return value @log_function_entry_exit() From 167cff29dc9c056c4dcd51a1666fdb1b66669531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 11:12:30 +0200 Subject: [PATCH 090/218] use basename of remote_file_path --- scripts/automated_ingestion/eessi_task.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 352b7dae..8830e34a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -157,9 +157,10 @@ def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> for dir in directories: # check if the directory is a number if dir.name.isdigit(): - # determin if a state file with prefix exists in the sequence number directory - remote_file_path = self.description.task_object.remote_file_path - state_file_name_prefix = f"{repo_pr_dir}/{dir.name}/{remote_file_path}" + # determine if a state file with prefix exists in the sequence number directory + # we need to use the basename of the remote file path + remote_file_path_basename = os.path.basename(self.description.task_object.remote_file_path) + state_file_name_prefix = f"{repo_pr_dir}/{dir.name}/{remote_file_path_basename}" if self._state_file_with_prefix_exists_in_repo_branch(state_file_name_prefix): sequence_numbers[int(dir.name)] = True else: From d700826bd16e28aef98873178aa9092748bcec36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 17 May 2025 11:26:28 +0200 Subject: [PATCH 091/218] use task file name for storing state file in GH staging repo --- scripts/automated_ingestion/eessi_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8830e34a..703f87b0 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -325,7 +325,9 @@ def _handle_add_new(self): sequence_number = 0 else: sequence_number = self._find_highest_number(sequence_numbers.keys()) - staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{payload_name}.{next_state}" + # we use the basename of the remote file path for the task description file + task_file_name = 
self.description.get_task_file_name()
+        staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{task_file_name}.{next_state}"
         log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path)
         # contents of task description / metadata file
         contents = self.description.get_contents()

From a9dad6888e8aae54950b59daa3de7701e5c1c962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20R=C3=B6blitz?=
Date: Sat, 17 May 2025 12:02:49 +0200
Subject: [PATCH 092/218] fix indentation

---
 scripts/automated_ingestion/eessi_task.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 703f87b0..7806b508 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -231,7 +231,8 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s
             if file.path.startswith(metadata_file_state_path_prefix):
                 # get state from file name taking only the suffix
                 state = TaskState.from_string(file.name.split('.')[-1])
-                return state
+                log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state)
+                return state
         # did not find any file with metadata_file_state_path_prefix as prefix
         log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s",
                     metadata_file_state_path_prefix)

From bc6423b68523b81931b0db5b4a8b8253d86e2f1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20R=C3=B6blitz?=
Date: Sun, 18 May 2025 00:43:48 +0200
Subject: [PATCH 093/218] various improvements for determining the state of a task

---
 scripts/automated_ingestion/eessi_task.py | 94 +++++++++++++++++------
 1 file changed, 70 insertions(+), 24 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 7806b508..26760dea 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -128,6 +128,9 @@ def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, b
     def _determine_sequence_numbers_including_task_file(self, repo: str, pr: str) -> Dict[int, bool]:
         """
         Determines in which sequence numbers the metadata/task file is included and in which it is not.
+        NOTE, we only need to check the default branch of the repository, because for a new task a file
+        is added to the default branch and for the subsequent processing of the task we use a different branch.
+        Thus, until the PR is closed, the task file stays in the default branch.

         Args:
             repo: the repository name
@@ -206,37 +209,60 @@ def _find_state(self) -> TaskState:
             # no sequence numbers found, so we return NEW
             log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence numbers found, state: NEW")
             return TaskState.NEW
-        # because a new sequence number is only created after the previous staging PR has been approved or rejected,
-        # we need to check if the processing of the highest sequence number is finished.
-        highest_sequence_number = self._find_highest_number(sequence_numbers.keys())
-        # we obtain the state from the file in the highest sequence number directory
-        # TODO: verify if the state matches other information, e.g. 
the state of the staging PR - # for now, we assume that the state is correct + # we got at least one sequence number + # if one value for a sequence number is True, we can determine the state from the file in the directory + sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] + if len(sequence_including_task) == 0: + # no sequence number includes the task file, so we return NEW + log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence number includes the task file, state: NEW") + return TaskState.NEW + # we got at least one sequence number which includes the task file + # we can determine the state from the filename in the directory + # NOTE, we use the first element in sequence_including_task (there should be only one) + # we ignore other elements in sequence_including_task + sequence_number = sequence_including_task[0] task_file_name = self.description.get_task_file_name() - metadata_file_state_path_prefix = f"{repo}/{pr}/{highest_sequence_number}/{task_file_name}." - state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix) + metadata_file_state_path_prefix = f"{repo}/{pr}/{sequence_number}/{task_file_name}." + state = self._get_state_for_metadata_file_prefix(metadata_file_state_path_prefix, sequence_number) log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) return state @log_function_entry_exit() - def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str) -> TaskState: + def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: str, + sequence_number: int) -> TaskState: """ Get the state from the file in the metadata_file_state_path_prefix. """ - # first get all files in directory part of metadata_file_state_path_prefix + # depending on the state of the deployment (NEW, STAGED, PR_OPENED, APPROVED, REJECTED, INGESTED) + # we need to check the task file in the default branch or in the branch corresponding to the sequence number directory_part = os.path.dirname(metadata_file_state_path_prefix) - files = self._list_directory_contents(directory_part) - # check if any of the files has metadata_file_state_path_prefix as prefix - for file in files: - if file.path.startswith(metadata_file_state_path_prefix): - # get state from file name taking only the suffix - state = TaskState.from_string(file.name.split('.')[-1]) - log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) - return state - # did not find any file with metadata_file_state_path_prefix as prefix - log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s", - metadata_file_state_path_prefix) - return TaskState.NEW + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + default_branch_name = self.git_repo.default_branch + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + all_branch_names = [branch.name for branch in self.git_repo.get_branches()] + states = [] + for branch in [default_branch_name, branch_name]: + if branch in all_branch_names: + # first get all files in directory part of metadata_file_state_path_prefix + files = self._list_directory_contents(directory_part, branch) + # check if any of the files has metadata_file_state_path_prefix as prefix + for file in files: + if file.path.startswith(metadata_file_state_path_prefix): + # get state from file name taking only the suffix + state = TaskState.from_string(file.name.split('.')[-1]) + log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) 
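+                        # illustrative note (hypothetical names): a state file such as
+                        #   'EESSI/software-layer/42/0/task-1234.meta.txt.STAGED'
+                        # encodes the task state purely in the suffix after the last '.',
+                        # which from_string() maps back to TaskState.STAGED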
+ states.append(state) + if len(states) == 0: + # did not find any file with metadata_file_state_path_prefix as prefix + log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s", + metadata_file_state_path_prefix) + return TaskState.NEW + # sort the states and return the last one + states.sort() + state = states[-1] + log_message(LoggingScope.TASK_OPS, 'INFO', "state: %s", state) + return state @log_function_entry_exit() def _list_directory_contents(self, directory_path, branch=None): @@ -298,7 +324,7 @@ def handle(self): def _handle_add_new(self): """Handler for ADD action in NEW state""" print("Handling ADD action in NEW state") - # Implementation for adding in NEW state + # Implementation for adding in NEW state: a task is only NEW if it was not processed yet # get name of of payload from metadata payload_name = self.description.metadata['payload']['filename'] log_message(LoggingScope.TASK_OPS, 'INFO', "payload_name: %s", payload_name) @@ -315,7 +341,7 @@ def _handle_add_new(self): self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) # determine next state (NEXT_STATE), put metadata/task file into GH staging repo in main branch under directory - # REPO/PR_NUM/SEQ_NUM/payload_name.NEXT_STATE + # REPO/PR_NUM/SEQ_NUM/task_file_name.NEXT_STATE next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) repo_name = self.description.get_repo_name() @@ -325,7 +351,18 @@ def _handle_add_new(self): if len(sequence_numbers) == 0: sequence_number = 0 else: + # we need to figure out the status of the last deployment (with the highest sequence number) + # if a PR exists and it is closed, we add the task to the *next* higher sequence number + # otherwise we add the task to the highest sequence number sequence_number = self._find_highest_number(sequence_numbers.keys()) + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + if branch_name in [branch.name for branch in self.git_repo.get_branches()]: + # branch exists, check if PR exists + find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state='all')] + if find_pr: + pr = find_pr.pop(0) + if pr.state == 'closed': + sequence_number += 1 # we use the basename of the remote file path for the task description file task_file_name = self.description.get_task_file_name() staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{task_file_name}.{next_state}" @@ -343,6 +380,15 @@ def _handle_add_staged(self): """Handler for ADD action in STAGED state""" print("Handling ADD action in STAGED state") # Implementation for adding in STAGED state + # construct supposed branch name + # check if branch exists + # - yes: check if corresponding PR exists + # - yes: check status of PR + # - open: rename file and add it to branch, set state, update PR contents, return + # - closed && !merged: rename file to rejected, set state + # - else: weird state, log message, return + # - no: delete branch + # create new branch, add task file to branch, set state, create PR, update PR contents, return return True @log_function_entry_exit() From 092f32b7e0a86b25acc6ea9d33f15c365eca7e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Thu, 22 May 2025 21:42:44 +0200 Subject: [PATCH 094/218] add functions to handle sequences of deployments --- scripts/automated_ingestion/eessi_task.py | 157 +++++++++++++++++++++- 1 file changed, 156 insertions(+), 1 deletion(-) diff --git 
a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 26760dea..8a65c84b 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -9,6 +9,12 @@
 import os
 
 
+class SequenceStatus(Enum):
+    DOES_NOT_EXIST = auto()
+    IN_PROGRESS = auto()
+    FINISHED = auto()
+
+
 class TaskState(Enum):
     NEW = auto()    # The task has been created but not yet processed
     STAGED = auto()  # The task has been staged to the Stratum-0
@@ -189,6 +195,112 @@ def _find_highest_number(self, str_list: List[str]) -> int:
         int_list = [int(num) for num in str_list]
         return max(int_list)
 
+    @log_function_entry_exit()
+    def _get_sequence_number_for_task_file(self) -> int:
+        """
+        Get the sequence number this task is assigned to at the moment.
+        NOTE, should only be called if the task is actually assigned to a sequence number.
+        """
+        repo_name = self.description.get_repo_name()
+        pr_number = self.description.get_pr_number()
+        sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number)
+        if len(sequence_numbers) == 0:
+            raise ValueError("Found no sequence numbers at all")
+        else:
+            # get all entries with value True, there should be only one, so we return the first one
+            sequence_numbers_true = [key for key, value in sequence_numbers.items() if value is True]
+            if len(sequence_numbers_true) == 0:
+                raise ValueError(f"Found no sequence numbers that include the task file for task {self.description}")
+            else:
+                return sequence_numbers_true[0]
+
+    @log_function_entry_exit()
+    def _get_current_sequence_number(self, sequence_numbers: Dict[int, bool] = None) -> int:
+        """
+        Get the current sequence number based on the sequence numbers.
+        If sequence_numbers is not provided, we determine the sequence numbers from the task description.
+        """
+        if sequence_numbers is None:
+            repo_name = self.description.get_repo_name()
+            pr_number = self.description.get_pr_number()
+            sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number)
+        if len(sequence_numbers) == 0:
+            return 0
+        return self._find_highest_number(sequence_numbers.keys())
+
+    @log_function_entry_exit()
+    def _determine_sequence_status(self, sequence_number: int = None) -> SequenceStatus:
+        """
+        Determine the status of the sequence number. It could be: DOES_NOT_EXIST, IN_PROGRESS, FINISHED
+        If sequence_number is not provided, we use the highest existing sequence number. 
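+        Example (illustrative): if sequence directories 0 and 1 exist and the staging PR for
+        sequence 1 is still open, sequence 1 is IN_PROGRESS while sequence 0, no longer being
+        the highest number, is FINISHED.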
+ """ + if sequence_number is None: + sequence_number = self._get_current_sequence_number() + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) + if len(sequence_numbers) == 0: + return SequenceStatus.DOES_NOT_EXIST + elif sequence_number not in sequence_numbers.keys(): + return SequenceStatus.DOES_NOT_EXIST + elif sequence_number < self._find_highest_number(sequence_numbers.keys()): + return SequenceStatus.FINISHED + else: + # check status of PR if it exists + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + if branch_name in [branch.name for branch in self.git_repo.get_branches()]: + find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state='all')] + if find_pr: + pr = find_pr.pop(0) + if pr.state == 'closed': + return SequenceStatus.FINISHED + return SequenceStatus.IN_PROGRESS + + @log_function_entry_exit() + def _find_staging_pr(self) -> Tuple[PullRequest, str, int]: + """ + Find the staging PR for the task. + TODO: arg sequence number --> make function simpler + """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + try: + sequence_number = self._get_sequence_number_for_task_file() + except ValueError: + # no sequence number found, so we return None + log_message(LoggingScope.ERROR, 'ERROR', "no sequence number found for task %s", self.description) + return None, None, None + except Exception as err: + # some other error + log_message(LoggingScope.ERROR, 'ERROR', "error finding staging PR for task %s: %s", + self.description, err) + return None, None, None + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + if branch_name in [branch.name for branch in self.git_repo.get_branches()]: + find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state='all')] + if find_pr: + pr = find_pr.pop(0) + return pr, branch_name, sequence_number + else: + return None, branch_name, sequence_number + else: + return None, None, None + + @log_function_entry_exit() + def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: + """ + Create a staging PR for the task. + NOTE, SHALL only be called if no staging PR for the task exists yet. 
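+        Example (illustrative): for source repository 'EESSI/software-layer', PR 42 and
+        sequence number 0, the PR is opened from head branch 'EESSI-software-layer-PR-42-SEQ-0'
+        onto the default branch of the staging repository.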
+ """ + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + head=branch_name, base=self.git_repo.default_branch) + return pr, branch_name + @log_function_entry_exit() def _find_state(self) -> TaskState: """ @@ -380,7 +492,50 @@ def _handle_add_staged(self): """Handler for ADD action in STAGED state""" print("Handling ADD action in STAGED state") # Implementation for adding in STAGED state - # construct supposed branch name + # - create or find PR + # - update PR contents + # determine PR + # - no PR -> create one + # - PR && closed -> create one (may require to move task file to different sequence number) + # - PR && open -> update PR contents, task file status, etc + # TODO: determine sequence number, then use it to find staging pr + # find staging PR + staging_pr, staging_branch = self._find_staging_pr(sequence_number) + # create PR if necessary + if staging_pr is None and sequence_number is None: + # no PR found, create one + staging_pr, staging_branch = self._create_staging_pr(sequence_number) + elif staging_pr is None and sequence_number is not None: + # no PR found, create one + staging_pr, staging_branch = self._create_staging_pr(sequence_number) + elif staging_pr.state == 'closed': + # PR closed, create new one + staging_pr, staging_branch = self._create_staging_pr(sequence_number + 1) + if staging_pr is None: + # something went wrong, we cannot continue + log_message(LoggingScope.ERROR, 'ERROR', "no staging PR found for task %s", self.description) + return False + # update PR contents + self._update_pr_contents(staging_pr) + # update task file status + self._update_task_file_status(staging_branch) + + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + # current sequence + sequence_number = self._get_current_sequence_number() + sequence_status = self._determine_sequence_status(sequence_number) + if sequence_status == SequenceStatus.FINISHED: + sequence_number += 1 + # re-determine sequence status + sequence_status = self._determine_sequence_status(sequence_number) + if sequence_status == SequenceStatus.DOES_NOT_EXIST: + # something is odd, the task file should already be in the default branch + log_message(LoggingScope.ERROR, 'ERROR', "sequence number %s does not exist", sequence_number) + return False + elif sequence_status == SequenceStatus.FINISHED: + # we need to figure out the status of the last deployment (with the highest sequence number) + branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" # check if branch exists # - yes: check if corresponding PR exists # - yes: check status of PR From f44d80967b881593e921601e96d546e14a68fdd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 12:12:36 +0200 Subject: [PATCH 095/218] revise states of a task --- scripts/automated_ingestion/eessi_task.py | 67 ++++++++++++----------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8a65c84b..0d9ff203 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -16,12 +16,14 @@ class SequenceStatus(Enum): class TaskState(Enum): - NEW = 
auto() # The task has been created but not yet processed - STAGED = auto() # The task has been staged to the Stratum-0 - PR_OPENED = auto() # The task has been opened as a PR in some staging repository - APPROVED = auto() # The task has been approved - REJECTED = auto() # The task has been rejected - INGESTED = auto() # The task has been ingested into the target CernVM-FS repository + UNDETERMINED = auto() # The task state was not determined yet + NEW_TASK = auto() # The task has been created but not yet processed + PAYLOAD_STAGED = auto() # The task's payload has been staged to the Stratum-0 + PULL_REQUEST = auto() # A PR for the task has been created or updated in some staging repository + APPROVED = auto() # The PR for the task has been approved + REJECTED = auto() # The PR for the task has been rejected + INGESTED = auto() # The task's payload has been applied to the target CernVM-FS repository + DONE = auto() # The task has been completed @classmethod def from_string(cls, name, default=None, case_sensitive=False): @@ -56,12 +58,15 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): # Define valid state transitions for all actions # NOTE, TaskState.APPROVED must be the first element or _next_state() will not work self.valid_transitions = { - TaskState.NEW: [TaskState.STAGED], - TaskState.STAGED: [TaskState.PR_OPENED], - TaskState.PR_OPENED: [TaskState.APPROVED, TaskState.REJECTED], + TaskState.UNDETERMINED: [TaskState.NEW_TASK, TaskState.PAYLOAD_STAGED, TaskState.PULL_REQUEST, + TaskState.APPROVED, TaskState.REJECTED, TaskState.INGESTED, TaskState.DONE], + TaskState.NEW_TASK: [TaskState.PAYLOAD_STAGED], + TaskState.PAYLOAD_STAGED: [TaskState.PULL_REQUEST], + TaskState.PULL_REQUEST: [TaskState.APPROVED, TaskState.REJECTED], TaskState.APPROVED: [TaskState.INGESTED], - TaskState.REJECTED: [], # Terminal state - TaskState.INGESTED: [] # Terminal state + TaskState.REJECTED: [TaskState.DONE], + TaskState.INGESTED: [TaskState.DONE], + TaskState.DONE: [] # Terminal state } self.state = self._find_state() @@ -318,16 +323,16 @@ def _find_state(self) -> TaskState: # obtain all sequence numbers in repo/pr dir which include a state file for this task sequence_numbers = self._determine_sequence_numbers_including_task_file(repo, pr) if len(sequence_numbers) == 0: - # no sequence numbers found, so we return NEW - log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence numbers found, state: NEW") - return TaskState.NEW + # no sequence numbers found, so we return NEW_TASK + log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence numbers found, state: NEW_TASK") + return TaskState.NEW_TASK # we got at least one sequence number # if one value for a sequence number is True, we can determine the state from the file in the directory sequence_including_task = [key for key, value in sequence_numbers.items() if value is True] if len(sequence_including_task) == 0: - # no sequence number includes the task file, so we return NEW - log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence number includes the task file, state: NEW") - return TaskState.NEW + # no sequence number includes the task file, so we return NEW_TASK + log_message(LoggingScope.TASK_OPS, 'INFO', "no sequence number includes the task file, state: NEW_TASK") + return TaskState.NEW_TASK # we got at least one sequence number which includes the task file # we can determine the state from the filename in the directory # NOTE, we use the first element in sequence_including_task (there should be only one) @@ -345,7 +350,7 @@ def 
_get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s """ Get the state from the file in the metadata_file_state_path_prefix. """ - # depending on the state of the deployment (NEW, STAGED, PR_OPENED, APPROVED, REJECTED, INGESTED) + # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, INGESTED) # we need to check the task file in the default branch or in the branch corresponding to the sequence number directory_part = os.path.dirname(metadata_file_state_path_prefix) repo_name = self.description.get_repo_name() @@ -369,7 +374,7 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s # did not find any file with metadata_file_state_path_prefix as prefix log_message(LoggingScope.TASK_OPS, 'INFO', "did not find any file with prefix %s", metadata_file_state_path_prefix) - return TaskState.NEW + return TaskState.NEW_TASK # sort the states and return the last one states.sort() state = states[-1] @@ -433,10 +438,10 @@ def handle(self): # Implement handlers for ADD action @log_function_entry_exit() - def _handle_add_new(self): - """Handler for ADD action in NEW state""" - print("Handling ADD action in NEW state") - # Implementation for adding in NEW state: a task is only NEW if it was not processed yet + def _handle_add_new_task(self): + """Handler for ADD action in NEW_TASK state""" + print("Handling ADD action in NEW_TASK state") + # Implementation for adding in NEW_TASK state: a task is only NEW_TASK if it was not processed yet # get name of of payload from metadata payload_name = self.description.metadata['payload']['filename'] log_message(LoggingScope.TASK_OPS, 'INFO', "payload_name: %s", payload_name) @@ -488,10 +493,10 @@ def _handle_add_new(self): return True @log_function_entry_exit() - def _handle_add_staged(self): - """Handler for ADD action in STAGED state""" - print("Handling ADD action in STAGED state") - # Implementation for adding in STAGED state + def _handle_add_payload_staged(self): + """Handler for ADD action in PAYLOAD_STAGED state""" + print("Handling ADD action in PAYLOAD_STAGED state") + # Implementation for adding in PAYLOAD_STAGED state # - create or find PR # - update PR contents # determine PR @@ -547,10 +552,10 @@ def _handle_add_staged(self): return True @log_function_entry_exit() - def _handle_add_pr_opened(self): - """Handler for ADD action in PR_OPENED state""" - print("Handling ADD action in PR_OPENED state") - # Implementation for adding in PR_OPENED state + def _handle_add_pull_request(self): + """Handler for ADD action in PULL_REQUEST state""" + print("Handling ADD action in PULL_REQUEST state") + # Implementation for adding in PULL_REQUEST state return True @log_function_entry_exit() From 3f9279f7b242c04db96520cf5ef0c4894d445759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 12:49:41 +0200 Subject: [PATCH 096/218] start revising determining state --- .../automated_ingestion.py | 3 ++ scripts/automated_ingestion/eessi_task.py | 39 ++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 7ece86cc..fc0d6f72 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -258,6 +258,9 @@ def main(): ), gh_staging_repo ) + current_state = task.determine_state() + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", 
task_path, current_state) + except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", task_path, str(err)) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0d9ff203..ef96f10d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -69,7 +69,7 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): TaskState.DONE: [] # Terminal state } - self.state = self._find_state() + # self.state = self._find_state() @log_function_entry_exit() def _determine_task_action(self) -> EESSITaskAction: @@ -410,6 +410,43 @@ def _next_state(self) -> TaskState: """ return self.valid_transitions[self.state][0] + @log_function_entry_exit() + def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: + """ + Check if a path exists in a branch. + """ + try: + branch = self.git_repo.default_branch if branch is None else branch + contents = self._list_directory_contents(path, branch) + if isinstance(contents, list): + return True + else: + return False + return True + except FileNotFoundError: + return False + + @log_function_entry_exit() + def determine_state(self) -> TaskState: + """ + Determine the state of the task based on the state of the staging repository. + """ + # High-level logic: + # 1. Check if path representing the task file exists in the default branch + path_in_default_branch = self.description.task_object.remote_file_path + if self._path_exists_in_branch(path_in_default_branch, branch=self.git_repo.default_branch): + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", + path_in_default_branch) + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", + path_in_default_branch) + # check if path exists in any other branch + for branch in self.git_repo.get_branches(): + if self._path_exists_in_branch(path_in_default_branch, branch): + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", + exit(0) + return TaskState.UNDETERMINED + @log_function_entry_exit() def handle(self): """ From d0739f82ad87ac513b9fb81e6d3c33927485b749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:00:12 +0200 Subject: [PATCH 097/218] fix syntax --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ef96f10d..1031b860 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -445,7 +445,7 @@ def determine_state(self) -> TaskState: if self._path_exists_in_branch(path_in_default_branch, branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", exit(0) - return TaskState.UNDETERMINED + # return TaskState.UNDETERMINED @log_function_entry_exit() def handle(self): From a068b7296c9e7c6d8f7cf935778dea13b11826a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:07:06 +0200 Subject: [PATCH 098/218] fix various flake8 issues --- .../automated_ingestion.py | 3 ++- scripts/automated_ingestion/eessi_task.py | 22 +++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index fc0d6f72..29fe68ec 100755 --- 
a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -259,7 +259,8 @@ def main(): gh_staging_repo ) current_state = task.determine_state() - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", task_path, current_state) + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", + task_path, current_state.name) except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 1031b860..8e07959c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,11 +1,11 @@ from enum import Enum, auto -from typing import Dict, List +from typing import Dict, List, Tuple from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit -from github import Github, GithubException, UnknownObjectException +from github import Github, GithubException, UnknownObjectException, PullRequest import os @@ -59,7 +59,7 @@ def __init__(self, description: EESSITaskDescription, git_repo: Github): # NOTE, TaskState.APPROVED must be the first element or _next_state() will not work self.valid_transitions = { TaskState.UNDETERMINED: [TaskState.NEW_TASK, TaskState.PAYLOAD_STAGED, TaskState.PULL_REQUEST, - TaskState.APPROVED, TaskState.REJECTED, TaskState.INGESTED, TaskState.DONE], + TaskState.APPROVED, TaskState.REJECTED, TaskState.INGESTED, TaskState.DONE], TaskState.NEW_TASK: [TaskState.PAYLOAD_STAGED], TaskState.PAYLOAD_STAGED: [TaskState.PULL_REQUEST], TaskState.PULL_REQUEST: [TaskState.APPROVED, TaskState.REJECTED], @@ -296,14 +296,14 @@ def _find_staging_pr(self) -> Tuple[PullRequest, str, int]: def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: """ Create a staging PR for the task. - NOTE, SHALL only be called if no staging PR for the task exists yet. + NOTE, SHALL only be called if no staging PR for the task exists yet. """ repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - head=branch_name, base=self.git_repo.default_branch) + body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", + head=branch_name, base=self.git_repo.default_branch) return pr, branch_name @log_function_entry_exit() @@ -350,7 +350,8 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s """ Get the state from the file in the metadata_file_state_path_prefix. 
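         Example (illustrative): given the prefix 'EESSI/software-layer/42/0/task-1234.meta.txt.',
         a file named 'task-1234.meta.txt.APPROVED' in that directory maps to TaskState.APPROVED.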
""" - # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, INGESTED) + # depending on the state of the deployment (NEW_TASK, PAYLOAD_STAGED, PULL_REQUEST, APPROVED, REJECTED, + # INGESTED, DONE) # we need to check the task file in the default branch or in the branch corresponding to the sequence number directory_part = os.path.dirname(metadata_file_state_path_prefix) repo_name = self.description.get_repo_name() @@ -444,8 +445,9 @@ def determine_state(self) -> TaskState: for branch in self.git_repo.get_branches(): if self._path_exists_in_branch(path_in_default_branch, branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", + path_in_default_branch, branch) exit(0) - # return TaskState.UNDETERMINED + return TaskState.UNDETERMINED @log_function_entry_exit() def handle(self): @@ -542,6 +544,7 @@ def _handle_add_payload_staged(self): # - PR && open -> update PR contents, task file status, etc # TODO: determine sequence number, then use it to find staging pr # find staging PR + sequence_number = self._get_sequence_number_for_task_file() staging_pr, staging_branch = self._find_staging_pr(sequence_number) # create PR if necessary if staging_pr is None and sequence_number is None: @@ -564,7 +567,7 @@ def _handle_add_payload_staged(self): repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - # current sequence + # current sequence sequence_number = self._get_current_sequence_number() sequence_status = self._determine_sequence_status(sequence_number) if sequence_status == SequenceStatus.FINISHED: @@ -578,6 +581,7 @@ def _handle_add_payload_staged(self): elif sequence_status == SequenceStatus.FINISHED: # we need to figure out the status of the last deployment (with the highest sequence number) branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s exists", branch_name) # check if branch exists # - yes: check if corresponding PR exists # - yes: check status of PR From c4a60d29de07c471127a3a05bf59e1c8b3c4ef0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:12:35 +0200 Subject: [PATCH 099/218] fix typing issue --- scripts/automated_ingestion/eessi_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8e07959c..ab15bed3 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,5 +1,5 @@ from enum import Enum, auto -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Optional from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription @@ -263,7 +263,7 @@ def _determine_sequence_status(self, sequence_number: int = None) -> int: return SequenceStatus.IN_PROGRESS @log_function_entry_exit() - def _find_staging_pr(self) -> Tuple[PullRequest, str, int]: + def _find_staging_pr(self) -> Tuple[Optional[PullRequest], Optional[str], Optional[int]]: """ Find the staging PR for the task. 
TODO: arg sequence number --> make function simpler From 0121bed2b6cb4701d8a31075b6e24303b6d75624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:14:19 +0200 Subject: [PATCH 100/218] fix typing issue, take 2 --- scripts/automated_ingestion/eessi_task.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ab15bed3..90b3aa39 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -5,7 +5,8 @@ from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit -from github import Github, GithubException, UnknownObjectException, PullRequest +from github import Github, GithubException, UnknownObjectException +from github.PullRequest import PullRequest import os From a1b81e03576e2ffd816b3034d54f9ceefd59ef9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:53:30 +0200 Subject: [PATCH 101/218] print task to be processed --- scripts/automated_ingestion/automated_ingestion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 29fe68ec..d72554d5 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -249,6 +249,7 @@ def main(): else: # Process each task file for task_path in tasks: + log_message(LoggingScope.GROUP_OPS, 'INFO', "Processing task: %s", task_path) try: # Create EESSITask for the task file try: From 0801ee358e740f45ac8d64d82d9f460f9caad8e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 13:57:58 +0200 Subject: [PATCH 102/218] only check default branch --- scripts/automated_ingestion/eessi_task.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 90b3aa39..dab6fe3f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -442,11 +442,6 @@ def determine_state(self) -> TaskState: else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", path_in_default_branch) - # check if path exists in any other branch - for branch in self.git_repo.get_branches(): - if self._path_exists_in_branch(path_in_default_branch, branch): - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", - path_in_default_branch, branch) exit(0) return TaskState.UNDETERMINED From e41670a18b2f9eef9cab870250ba9bdbe3e4e2fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:09:23 +0200 Subject: [PATCH 103/218] add main processing loop --- .../automated_ingestion/automated_ingestion.py | 18 ++++++++++++++---- scripts/automated_ingestion/eessi_task.py | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index d72554d5..312582e7 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -2,7 +2,7 @@ from eessitarball import EessiTarball, EessiTarballGroup from eessi_data_object import EESSIDataAndSignatureObject -from eessi_task import EESSITask +from eessi_task 
import EESSITask, TaskState from eessi_task_description import EESSITaskDescription from s3_bucket import EESSIS3Bucket from pid.decorator import pidfile # noqa: F401 @@ -259,9 +259,6 @@ def main(): ), gh_staging_repo ) - current_state = task.determine_state() - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", - task_path, current_state.name) except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to create EESSITask for task %s: %s", @@ -270,6 +267,19 @@ def main(): log_message(LoggingScope.GROUP_OPS, 'INFO', "Task: %s", task) + previous_state = None + current_state = task.determine_state() + log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s' is in state '%s'", + task_path, current_state.name) + while (current_state is not None and + current_state != TaskState.DONE and + previous_state != current_state): + previous_state = current_state + current_state = task.handle() + log_message(LoggingScope.GROUP_OPS, 'INFO', + "Task '%s': previous state = '%s', current state = '%s'", + task_path, previous_state.name, current_state.name) + # TODO: update the information shown below (what makes sense to show?) # Log information about the task task_object = task.description.task_object diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index dab6fe3f..6ab6fa12 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -442,8 +442,8 @@ def determine_state(self) -> TaskState: else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", path_in_default_branch) + return TaskState.UNDETERMINED exit(0) - return TaskState.UNDETERMINED @log_function_entry_exit() def handle(self): From 9b0fb9a97d3c1e5654419e7d1c75fd659b1a35e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:12:15 +0200 Subject: [PATCH 104/218] fix missing state attribute --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6ab6fa12..16f6c7ae 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -621,4 +621,4 @@ def transition_to(self, new_state: TaskState): @log_function_entry_exit() def __str__(self): - return f"EESSITask(description={self.description}, action={self.action}, state={self.state})" + return f"EESSITask(description={self.description}, action={self.action}, state={self.determine_state()})" From 7fb88d30668c4513aa5e7f5ac0fcf5be2919f60d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:21:41 +0200 Subject: [PATCH 105/218] remove recursion from handle and return current state --- scripts/automated_ingestion/eessi_task.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 16f6c7ae..3d20b26a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -450,26 +450,23 @@ def handle(self): """ Dynamically find and execute the appropriate handler based on action and state. 
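         Example (illustrative, assuming the action and state enums render as their lowercase
         member names): an ADD action on a task in state PAYLOAD_STAGED is dispatched to
         _handle_add_payload_staged().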
""" - state_before_handle = self.state + state_before_handle = self.determine_state() # Construct handler method name - handler_name = f"_handle_{self.action}_{self.state}" + handler_name = f"_handle_{self.action}_{state_before_handle}" # Check if the handler exists handler = getattr(self, handler_name, None) if handler and callable(handler): # Execute the handler if it exists - handler() - # if state has changed, run handle() again; otherwise, do nothing - if self.state != state_before_handle: - msg = f"handler {handler_name} changed state from {state_before_handle} to {self.state}" - msg += " running handle() again" - print(msg) - self.handle() + return handler() else: # Default behavior for missing handlers - print(f"No handler for action {self.action} and state {self.state} implemented; nothing to be done") + log_message(LoggingScope.TASK_OPS, 'ERROR', + "No handler for action %s and state %s implemented; nothing to be done", + self.action, state_before_handle) + return state_before_handle # Implement handlers for ADD action @log_function_entry_exit() From 9b3bd002a23e664ef7e32b3d97f6d91de60413e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:26:55 +0200 Subject: [PATCH 106/218] commented out some logging and obsolete processing --- .../automated_ingestion.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 312582e7..146075b6 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -280,25 +280,25 @@ def main(): "Task '%s': previous state = '%s', current state = '%s'", task_path, previous_state.name, current_state.name) - # TODO: update the information shown below (what makes sense to show?) - # Log information about the task - task_object = task.description.task_object - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_object.local_file_path) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_object.local_sig_path) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", - task.description.signature_verified) - - # Log the ETags of the downloaded task file - file_etag, sig_etag = task.description.task_object.get_etags() - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) - log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", - task.description.task_object.remote_sig_path, sig_etag) - - # TODO: Process the task file contents - # This would involve reading the task file, parsing its contents, - # and performing the required actions based on the task type - log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path) - task.handle() + # # TODO: update the information shown below (what makes sense to show?) 
+ # # Log information about the task + # task_object = task.description.task_object + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file: %s", task_object.local_file_path) + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature file: %s", task_object.local_sig_path) + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Signature verified: %s", + # task.description.signature_verified) + + # # Log the ETags of the downloaded task file + # file_etag, sig_etag = task.description.task_object.get_etags() + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Task file %s has ETag: %s", task_path, file_etag) + # log_message(LoggingScope.GROUP_OPS, 'INFO', "Task signature %s has ETag: %s", + # task.description.task_object.remote_sig_path, sig_etag) + + # # TODO: Process the task file contents + # # This would involve reading the task file, parsing its contents, + # # and performing the required actions based on the task type + # log_message(LoggingScope.GROUP_OPS, 'INFO', "TODO: Processing task file: %s", task_path) + # task.handle() except Exception as err: log_message(LoggingScope.ERROR, 'ERROR', "Failed to process task %s: %s", task_path, str(err)) From 7768a91611718af6983ca6bdb9d77542d7757b7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:54:15 +0200 Subject: [PATCH 107/218] first version of handler for undetermined task state --- .../automated_ingestion.py | 1 + scripts/automated_ingestion/eessi_task.py | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 146075b6..e0ca710b 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -279,6 +279,7 @@ def main(): log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s': previous state = '%s', current state = '%s'", task_path, previous_state.name, current_state.name) + exit(0) # run loop body only once # # TODO: update the information shown below (what makes sense to show?) # # Log information about the task diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 3d20b26a..113ad69b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -235,6 +235,13 @@ def _get_current_sequence_number(self, sequence_numbers: Dict[int, bool] = None) return 0 return self._find_highest_number(sequence_numbers.keys()) + @log_function_entry_exit() + def _get_fixed_sequence_number(self) -> int: + """ + Get a fixed sequence number. 
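+        For now this deliberately returns 0 for every task, i.e. all tasks of a source PR are
+        collected under sequence number 0, which corresponds to an open or yet to be created
+        staging PR.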
+ """ + return 0 + @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: """ @@ -469,6 +476,28 @@ def handle(self): return state_before_handle # Implement handlers for ADD action + @log_function_entry_exit() + def _handle_add_undetermined(self): + """Handler for ADD action in UNDETERMINED state""" + print("Handling ADD action in UNDETERMINED state") + # create symlink target directory (REPO/PR/SEQ/TASK_FILE_NAME/) + # create task file in target directory (TARGET_DIR/TaskDescription) + # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) + # create symlink from task file path to target directory (remote_file_path -> TARGET_DIR) + branch = self.git_repo.default_branch + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_number = self._get_fixed_sequence_number() + task_file_name = self.description.get_task_file_name() + target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}/" + self.git_repo.create_file(target_dir, "TaskDescription", + self.description.get_contents(), branch=branch) + self.git_repo.create_file(target_dir, f"TaskState.{TaskState.NEW_TASK.name}", + "", branch=branch) + self.git_repo.create_symlink(self.description.task_object.remote_file_path, + target_dir, branch=branch) + return TaskState.NEW_TASK + @log_function_entry_exit() def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" From e3ffa6cfa5ecd5e59ec84d416b26f513c04083e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 14:58:58 +0200 Subject: [PATCH 108/218] remove trailing / from target_dir name --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 113ad69b..2e60431c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -489,7 +489,7 @@ def _handle_add_undetermined(self): pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() task_file_name = self.description.get_task_file_name() - target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}/" + target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" self.git_repo.create_file(target_dir, "TaskDescription", self.description.get_contents(), branch=branch) self.git_repo.create_file(target_dir, f"TaskState.{TaskState.NEW_TASK.name}", From 0fa6afbe987668fbb4fb350cea0313718e4e10c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 15:12:00 +0200 Subject: [PATCH 109/218] fix target directory structure for NEW_TASK and skip creating symlink --- scripts/automated_ingestion/eessi_task.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 2e60431c..a92f8914 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -490,12 +490,16 @@ def _handle_add_undetermined(self): sequence_number = self._get_fixed_sequence_number() task_file_name = self.description.get_task_file_name() target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" - self.git_repo.create_file(target_dir, "TaskDescription", + task_description_file_path = f"{target_dir}/TaskDescription" + task_state_file_path = 
f"{target_dir}/TaskState.{TaskState.NEW_TASK.name}" + self.git_repo.create_file(task_description_file_path, + f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", self.description.get_contents(), branch=branch) - self.git_repo.create_file(target_dir, f"TaskState.{TaskState.NEW_TASK.name}", + self.git_repo.create_file(task_state_file_path, + f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", "", branch=branch) - self.git_repo.create_symlink(self.description.task_object.remote_file_path, - target_dir, branch=branch) + # self.git_repo.create_symlink(self.description.task_object.remote_file_path, + # target_dir, branch=branch) return TaskState.NEW_TASK @log_function_entry_exit() From 570babbc69433405a271e7f50cc88754405c9c0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 19:58:57 +0200 Subject: [PATCH 110/218] add creation of symlink --- scripts/automated_ingestion/eessi_task.py | 41 +++++++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a92f8914..d51c2c25 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -476,6 +476,40 @@ def handle(self): return state_before_handle # Implement handlers for ADD action + @log_function_entry_exit() + def _create_symlink(self, source_path: str, target_path: str, branch: str = None): + """Create a symlink in the given branch.""" + try: + branch = self.git_repo.default_branch if branch is None else branch + ref = self.git_repo.get_git_ref(f"heads/{branch}") + commit = self.git_repo.get_git_commit(ref.object.sha) + base_tree = self.git_repo.get_git_tree(commit.tree.sha) + + # Create blob for symlink target + blob = self.git_repo.create_git_blob(target_path, "utf-8") + + # Create tree element + tree_element = { + "path": source_path, + "mode": "120000", + "type": "blob", + "sha": blob.sha + } + + # Create new tree and commit + new_tree = self.git_repo.create_git_tree([tree_element], base_tree) + commit_message = f"Add symlink {source_path} -> {target_path}" + new_commit = self.git_repo.create_git_commit(commit_message, new_tree, [commit]) + + # Update reference + ref.edit(new_commit.sha) + + log_message(LoggingScope.TASK_OPS, 'INFO', "Symlink created: %s -> %s", + source_path, target_path) + + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) + @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" @@ -487,7 +521,7 @@ def _handle_add_undetermined(self): branch = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - sequence_number = self._get_fixed_sequence_number() + sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{target_dir}/TaskDescription" @@ -498,8 +532,9 @@ def _handle_add_undetermined(self): self.git_repo.create_file(task_state_file_path, f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", "", branch=branch) - # self.git_repo.create_symlink(self.description.task_object.remote_file_path, - # target_dir, branch=branch) + self._create_symlink(self.description.task_object.remote_file_path, 
target_dir, branch=branch) + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number is still open or + # yet to be created); if it is not valid, perform corrective actions return TaskState.NEW_TASK @log_function_entry_exit() From f1f813e1ff6b91f526099d76ae232214b42a0609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:09:00 +0200 Subject: [PATCH 111/218] add error handling and log messages --- scripts/automated_ingestion/eessi_task.py | 31 +++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d51c2c25..0ccc3e2c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -526,15 +526,30 @@ def _handle_add_undetermined(self): target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{target_dir}/TaskDescription" task_state_file_path = f"{target_dir}/TaskState.{TaskState.NEW_TASK.name}" - self.git_repo.create_file(task_description_file_path, - f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", - self.description.get_contents(), branch=branch) - self.git_repo.create_file(task_state_file_path, - f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", - "", branch=branch) + try: + self.git_repo.create_file(task_description_file_path, + f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", + self.description.get_contents(), branch=branch) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task description file: %s", err) + return TaskState.UNDETERMINED + log_message(LoggingScope.TASK_OPS, 'INFO', + "task description file created: %s", task_description_file_path) + + try: + self.git_repo.create_file(task_state_file_path, + f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", + "", branch=branch) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) + return TaskState.UNDETERMINED + log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) + self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch) - # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number is still open or - # yet to be created); if it is not valid, perform corrective actions + log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", + self.description.task_object.remote_file_path, target_dir) + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number is still + # open or yet to be created); if it is not valid, perform corrective actions return TaskState.NEW_TASK @log_function_entry_exit() From fc67e434a980f87531ea96430aaa3f8ec6a7487d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:25:57 +0200 Subject: [PATCH 112/218] add _safe_create_file and error handling --- scripts/automated_ingestion/eessi_task.py | 49 ++++++++++++++++------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0ccc3e2c..56dd9897 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -509,6 +509,22 @@ def _create_symlink(self, 
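For readers unfamiliar with the low-level Git Data API that `_create_symlink` relies on, here is a condensed, self-contained sketch of the same flow; the repository and paths are hypothetical. Note that the sketch already builds the tree element as an `InputGitTreeElement`, which a later patch in this series switches to, since PyGithub does not accept a plain dict there. Mode `120000` is what marks the entry as a symlink whose blob content is the link target:

```python
# A condensed sketch of the Git Data API flow used by _create_symlink,
# with hypothetical repository and paths.
from github import Github, InputGitTreeElement

repo = Github("<token>").get_repo("EESSI/staging")           # hypothetical staging repo
source_path = "metadata/task.tar.gz.task"                    # hypothetical symlink location
target_path = "EESSI/software-layer/42/0/task.tar.gz.task"   # hypothetical link target

ref = repo.get_git_ref(f"heads/{repo.default_branch}")
commit = repo.get_git_commit(ref.object.sha)
base_tree = repo.get_git_tree(commit.tree.sha)

blob = repo.create_git_blob(target_path, "utf-8")            # blob content is the link target
element = InputGitTreeElement(path=source_path, mode="120000", type="blob", sha=blob.sha)

new_tree = repo.create_git_tree([element], base_tree)
new_commit = repo.create_git_commit(f"Add symlink {source_path} -> {target_path}",
                                    new_tree, [commit])
ref.edit(new_commit.sha)                                     # fast-forward the branch
```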
source_path: str, target_path: str, branch: str = None except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) + raise err + + @log_function_entry_exit() + def _safe_create_file(self, path: str, message: str, content: str, branch: str = None): + """Create a file in the given branch.""" + try: + branch = self.git_repo.default_branch if branch is None else branch + existing_file = self.git_repo.get_contents(path, ref=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "File %s already exists", path) + return existing_file + except GithubException as err: + if err.status == 404: # File doesn't exist + # Safe to create + return self.git_repo.create_file(path, message, content, branch=branch) + else: + raise err # Some other error @log_function_entry_exit() def _handle_add_undetermined(self): @@ -527,29 +543,34 @@ def _handle_add_undetermined(self): task_description_file_path = f"{target_dir}/TaskDescription" task_state_file_path = f"{target_dir}/TaskState.{TaskState.NEW_TASK.name}" try: - self.git_repo.create_file(task_description_file_path, - f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", - self.description.get_contents(), branch=branch) + self._safe_create_file(task_description_file_path, + f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", + self.description.get_contents(), branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', + "task description file created: %s", task_description_file_path) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task description file: %s", err) return TaskState.UNDETERMINED - log_message(LoggingScope.TASK_OPS, 'INFO', - "task description file created: %s", task_description_file_path) try: - self.git_repo.create_file(task_state_file_path, - f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", - "", branch=branch) + self._safe_create_file(task_state_file_path, + f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", + "", branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) return TaskState.UNDETERMINED - log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) - self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch) - log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", - self.description.task_object.remote_file_path, target_dir) - # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number is still - # open or yet to be created); if it is not valid, perform corrective actions + try: + self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", + self.description.task_object.remote_file_path, target_dir) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) + return TaskState.UNDETERMINED + + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number + # is still open or yet to be created); if it is not valid, perform corrective actions return TaskState.NEW_TASK @log_function_entry_exit() From 467781c48f85c2abd1aa35b954bd5a8983cfe367 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:33:42 +0200 Subject: [PATCH 113/218] improve logging --- scripts/automated_ingestion/eessi_task.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 56dd9897..811b42e7 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -487,6 +487,7 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None # Create blob for symlink target blob = self.git_repo.create_git_blob(target_path, "utf-8") + log_message(LoggingScope.TASK_OPS, 'INFO', "blob created: %s", blob) # Create tree element tree_element = { @@ -498,18 +499,22 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None # Create new tree and commit new_tree = self.git_repo.create_git_tree([tree_element], base_tree) + log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) + commit_message = f"Add symlink {source_path} -> {target_path}" new_commit = self.git_repo.create_git_commit(commit_message, new_tree, [commit]) + log_message(LoggingScope.TASK_OPS, 'INFO', "new commit created: %s", new_commit) # Update reference ref.edit(new_commit.sha) log_message(LoggingScope.TASK_OPS, 'INFO', "Symlink created: %s -> %s", source_path, target_path) + return True except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) - raise err + return False @log_function_entry_exit() def _safe_create_file(self, path: str, message: str, content: str, branch: str = None): @@ -559,14 +564,14 @@ def _handle_add_undetermined(self): log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) + # TODO: rollback previous changes (task description file) return TaskState.UNDETERMINED - try: - self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch) + if self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch): log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", self.description.task_object.remote_file_path, target_dir) - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) + else: + # TODO: rollback previous changes (task description file, task state file) return TaskState.UNDETERMINED # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number From 09fb09215091e614b6724b593544ab3aee315695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:36:44 +0200 Subject: [PATCH 114/218] create new tree in try/except block --- scripts/automated_ingestion/eessi_task.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 811b42e7..227c9442 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -497,10 +497,15 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None "sha": blob.sha } - # Create new tree and commit - new_tree = self.git_repo.create_git_tree([tree_element], base_tree) - log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) + # Create new tree + try: 
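The `_safe_create_file` helper introduced above is a general idempotent-create idiom for PyGithub: treat an existing file as success, create on a 404, and re-raise anything else. A minimal standalone sketch, assuming a hypothetical staging repository and file:

```python
# Idempotent file creation with PyGithub; repository and arguments are hypothetical.
from github import Github, GithubException

repo = Github("<token>").get_repo("EESSI/staging")  # hypothetical

def safe_create_file(repo, path, message, content, branch):
    try:
        return repo.get_contents(path, ref=branch)  # already there: nothing to do
    except GithubException as err:
        if err.status == 404:                       # only create if it does not exist
            return repo.create_file(path, message, content, branch=branch)
        raise                                       # some other (e.g. connection) issue
```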
+ new_tree = self.git_repo.create_git_tree([tree_element], base_tree) + log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating new tree: %s", err) + return False + # Create new commit commit_message = f"Add symlink {source_path} -> {target_path}" new_commit = self.git_repo.create_git_commit(commit_message, new_tree, [commit]) log_message(LoggingScope.TASK_OPS, 'INFO', "new commit created: %s", new_commit) From 6c6d3dd39b970c103adb368b7e98d45aa3165b57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:41:37 +0200 Subject: [PATCH 115/218] debug issue creating git tree --- scripts/automated_ingestion/eessi_task.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 227c9442..5985b80f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -503,6 +503,10 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating new tree: %s", err) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Status Code: %s", err.status) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Error Message: %s", err.data) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Headers: %s", err.headers) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Raw Response: %s", err.response) return False # Create new commit From 031881673087db6d41f40d79964e404369d15f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 20:54:03 +0200 Subject: [PATCH 116/218] improve error handling and reporting --- scripts/automated_ingestion/eessi_task.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5985b80f..2619d2de 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -8,6 +8,7 @@ from github import Github, GithubException, UnknownObjectException from github.PullRequest import PullRequest import os +import traceback class SequenceStatus(Enum): @@ -501,13 +502,20 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None try: new_tree = self.git_repo.create_git_tree([tree_element], base_tree) log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) - except Exception as err: + except GithubException as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating new tree: %s", err) log_message(LoggingScope.TASK_OPS, 'ERROR', " Status Code: %s", err.status) log_message(LoggingScope.TASK_OPS, 'ERROR', " Error Message: %s", err.data) log_message(LoggingScope.TASK_OPS, 'ERROR', " Headers: %s", err.headers) log_message(LoggingScope.TASK_OPS, 'ERROR', " Raw Response: %s", err.response) return False + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "\n=== General Exception ===") + log_message(LoggingScope.TASK_OPS, 'ERROR', " Type: %s", type(err).__name__) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Message: %s", str(err)) + log_message(LoggingScope.TASK_OPS, 'ERROR', " Traceback:") + log_message(LoggingScope.TASK_OPS, 'ERROR', " %s", traceback.format_exc()) + return False # Create new commit commit_message = f"Add 
symlink {source_path} -> {target_path}" From b842eba46ebeced4d830bc6cd61fcf694ccde3d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 1 Jun 2025 21:00:35 +0200 Subject: [PATCH 117/218] use InputGitTreeElement instead of simple Dict --- scripts/automated_ingestion/eessi_task.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 2619d2de..db918efc 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -5,7 +5,7 @@ from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit -from github import Github, GithubException, UnknownObjectException +from github import Github, GithubException, InputGitTreeElement, UnknownObjectException from github.PullRequest import PullRequest import os import traceback @@ -491,12 +491,13 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None log_message(LoggingScope.TASK_OPS, 'INFO', "blob created: %s", blob) # Create tree element - tree_element = { - "path": source_path, - "mode": "120000", - "type": "blob", - "sha": blob.sha - } + tree_element = InputGitTreeElement( + path=source_path, + mode="120000", + type="blob", + sha=blob.sha + ) + log_message(LoggingScope.TASK_OPS, 'INFO', "tree element created: %s", tree_element) # Create new tree try: From 29fd410ea34be56eb2412815c3a1444044894006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 6 Jun 2025 22:31:55 +0200 Subject: [PATCH 118/218] use pointer file instead of symlink --- scripts/automated_ingestion/eessi_task.py | 25 +++++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index db918efc..abc9f476 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -447,11 +447,12 @@ def determine_state(self) -> TaskState: if self._path_exists_in_branch(path_in_default_branch, branch=self.git_repo.default_branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", path_in_default_branch) + # TODO: determine state + exit(0) else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", path_in_default_branch) return TaskState.UNDETERMINED - exit(0) @log_function_entry_exit() def handle(self): @@ -553,10 +554,10 @@ def _safe_create_file(self, path: str, message: str, content: str, branch: str = def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" print("Handling ADD action in UNDETERMINED state") - # create symlink target directory (REPO/PR/SEQ/TASK_FILE_NAME/) + # create target directory (REPO/PR/SEQ/TASK_FILE_NAME/) # create task file in target directory (TARGET_DIR/TaskDescription) # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) - # create symlink from task file path to target directory (remote_file_path -> TARGET_DIR) + # create pointer file from task file path to target directory (remote_file_path -> TARGET_DIR) branch = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() @@ -564,7 +565,7 @@ def _handle_add_undetermined(self): task_file_name = self.description.get_task_file_name() target_dir = 
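The extra diagnostics above make sense given how PyGithub reports the two failure modes. As far as I can tell, `create_git_tree()` validates its elements client-side, so a plain-dict tree element fails with an `AssertionError` rather than a `GithubException`, which would explain both the general-exception branch with a traceback and the switch to `InputGitTreeElement` in the next patch. A sketch of the distinction, reusing the hypothetical names from the symlink sketch earlier:

```python
# Distinguishing client-side failures from GitHub API errors when creating a tree.
# Assumes repo, element and base_tree as in the earlier symlink sketch.
import traceback
from github import GithubException

try:
    new_tree = repo.create_git_tree([element], base_tree)
except GithubException as err:
    # A real API error: structured fields worth logging.
    print(err.status, err.data, err.headers)
except Exception as err:
    # E.g. an AssertionError raised by PyGithub before any request is sent.
    print(type(err).__name__, str(err))
    print(traceback.format_exc())
```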
f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{target_dir}/TaskDescription" - task_state_file_path = f"{target_dir}/TaskState.{TaskState.NEW_TASK.name}" + task_state_file_path = f"{target_dir}/TaskState" try: self._safe_create_file(task_description_file_path, f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", @@ -578,17 +579,23 @@ def _handle_add_undetermined(self): try: self._safe_create_file(task_state_file_path, f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", - "", branch=branch) + f"{TaskState.NEW_TASK.name}", branch=branch) log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) # TODO: rollback previous changes (task description file) return TaskState.UNDETERMINED - if self._create_symlink(self.description.task_object.remote_file_path, target_dir, branch=branch): - log_message(LoggingScope.TASK_OPS, 'INFO', "symlink created: %s -> %s", - self.description.task_object.remote_file_path, target_dir) - else: + try: + remote_file_path = self.description.task_object.remote_file_path + self._safe_create_file(remote_file_path, + f"pointer from task file {remote_file_path} to target {target_dir}", + f"remote_file_path = {remote_file_path}\ntarget_dir = {target_dir}", + branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "pointer file created: %s -> %s", + remote_file_path, target_dir) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating pointer file: %s", err) # TODO: rollback previous changes (task description file, task state file) return TaskState.UNDETERMINED From e6680d67e0ec72dcb7701fca9292c3282ef3e779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 12:14:06 +0200 Subject: [PATCH 119/218] add a couple of files with a single commit --- scripts/automated_ingestion/eessi_task.py | 111 ++++++++++++++++------ 1 file changed, 82 insertions(+), 29 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index abc9f476..afa7ee87 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,14 +1,18 @@ from enum import Enum, auto from typing import Dict, List, Tuple, Optional + +import os +import traceback +import base64 + from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload from utils import log_message, LoggingScope, log_function_entry_exit + from github import Github, GithubException, InputGitTreeElement, UnknownObjectException from github.PullRequest import PullRequest -import os -import traceback class SequenceStatus(Enum): @@ -550,6 +554,61 @@ def _safe_create_file(self, path: str, message: str, content: str, branch: str = else: raise err # Some other error + @log_function_entry_exit() + def _create_multi_file_commit(self, files_data, commit_message, branch=None): + """ + Create a commit with multiple file changes + + files_data: dict with structure: + { + "path/to/file1.txt": { + "content": "file content", + "mode": "100644" # optional, defaults to 100644 + }, + "path/to/file2.py": { + "content": "print('hello')", + "mode": "100644" + } + } + """ + branch = self.git_repo.default_branch if branch is None else 
branch + ref = self.git_repo.get_git_ref(f"heads/{branch}") + current_commit = self.git_repo.get_git_commit(ref.object.sha) + base_tree = current_commit.tree + + # Create tree elements + tree_elements = [] + for file_path, file_info in files_data.items(): + content = file_info["content"] + if isinstance(content, str): + content = content.encode('utf-8') + + blob = self.git_repo.create_git_blob( + base64.b64encode(content).decode('utf-8'), + "base64" + ) + tree_elements.append(InputGitTreeElement( + path=file_path, + mode=file_info.get("mode", "100644"), + type="blob", + sha=blob.sha + )) + + # Create new tree + new_tree = self.git_repo.create_git_tree(tree_elements, base_tree) + + # Create commit + new_commit = self.git_repo.create_git_commit( + commit_message, + new_tree, + [current_commit] + ) + + # Update branch reference + ref.edit(new_commit.sha) + + return new_commit + @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" @@ -566,36 +625,30 @@ def _handle_add_undetermined(self): target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{target_dir}/TaskDescription" task_state_file_path = f"{target_dir}/TaskState" - try: - self._safe_create_file(task_description_file_path, - f"new task description for {repo_name} PR {pr_number} seq {sequence_number}", - self.description.get_contents(), branch=branch) - log_message(LoggingScope.TASK_OPS, 'INFO', - "task description file created: %s", task_description_file_path) - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task description file: %s", err) - return TaskState.UNDETERMINED - - try: - self._safe_create_file(task_state_file_path, - f"new task state for {repo_name} PR {pr_number} seq {sequence_number}", - f"{TaskState.NEW_TASK.name}", branch=branch) - log_message(LoggingScope.TASK_OPS, 'INFO', "task state file created: %s", task_state_file_path) - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating task state file: %s", err) - # TODO: rollback previous changes (task description file) - return TaskState.UNDETERMINED + remote_file_path = self.description.task_object.remote_file_path + + files_to_commit = { + task_description_file_path: { + "content": self.description.get_contents(), + "mode": "100644" + }, + task_state_file_path: { + "content": f"{TaskState.NEW_TASK.name}", + "mode": "100644" + }, + remote_file_path: { + "content": f"remote_file_path = {remote_file_path}\ntarget_dir = {target_dir}", + "mode": "100644" + } + } try: - remote_file_path = self.description.task_object.remote_file_path - self._safe_create_file(remote_file_path, - f"pointer from task file {remote_file_path} to target {target_dir}", - f"remote_file_path = {remote_file_path}\ntarget_dir = {target_dir}", - branch=branch) - log_message(LoggingScope.TASK_OPS, 'INFO', "pointer file created: %s -> %s", - remote_file_path, target_dir) + commit = self._create_multi_file_commit(files_to_commit, + f"new task for {repo_name} PR {pr_number} seq {sequence_number}", + branch=branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "commit created: %s", commit) except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating pointer file: %s", err) + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating commit: %s", err) # TODO: rollback previous changes (task description file, task state file) return TaskState.UNDETERMINED From b63699ba62b3a9267ce5055ff81f6fec7140a14c Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 12:25:44 +0200 Subject: [PATCH 120/218] add new line to state file --- scripts/automated_ingestion/eessi_task.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index afa7ee87..a1c3614a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -633,7 +633,7 @@ def _handle_add_undetermined(self): "mode": "100644" }, task_state_file_path: { - "content": f"{TaskState.NEW_TASK.name}", + "content": f"{TaskState.NEW_TASK.name}\n", "mode": "100644" }, remote_file_path: { @@ -643,9 +643,11 @@ def _handle_add_undetermined(self): } try: - commit = self._create_multi_file_commit(files_to_commit, - f"new task for {repo_name} PR {pr_number} seq {sequence_number}", - branch=branch) + commit = self._create_multi_file_commit( + files_to_commit, + f"new task for {repo_name} PR {pr_number} seq {sequence_number}", + branch=branch + ) log_message(LoggingScope.TASK_OPS, 'INFO', "commit created: %s", commit) except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating commit: %s", err) From 3a1d975ed51a42a8778350ab11367c5fd1cdb402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 13:29:45 +0200 Subject: [PATCH 121/218] determine task state from TaskState file --- scripts/automated_ingestion/eessi_task.py | 80 ++++++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a1c3614a..9af53cb3 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -440,6 +440,60 @@ def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: except FileNotFoundError: return False + @log_function_entry_exit() + def _read_dict_from_string(self, content: str) -> dict: + """ + Read the dictionary from the string. + """ + config_dict = {} + for line in content.strip().split('\n'): + if '=' in line and not line.strip().startswith('#'): # Skip comments + key, value = line.split('=', 1) # Split only on first '=' + config_dict[key.strip()] = value.strip() + return config_dict + + @log_function_entry_exit() + def _read_target_dir_from_file(self, path: str, branch: str = None) -> str: + """ + Read the target directory from the file in the given branch. + """ + branch = self.git_repo.default_branch if branch is None else branch + content = self.git_repo.get_contents(path, ref=branch) + + # Decode the content from base64 + content_str = content.decoded_content.decode('utf-8') + + # Parse into dictionary + config_dict = self._read_dict_from_string(content_str) + + return config_dict.get('target_dir', None) + + @log_function_entry_exit() + def _branch_exists(self, branch_name: str) -> bool: + """ + Check if a branch exists. + """ + try: + self.git_repo.get_branch(branch_name) + return True + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "error checking if branch %s exists: %s", + branch_name, err) + return False + + @log_function_entry_exit() + def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState: + """ + Read the task state from the file in the given branch. 
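The pointer-file format from the earlier "use pointer file instead of symlink" patch and the `key = value` parser added here are two halves of one round trip; a self-contained sketch with hypothetical values:

```python
# Round trip of the pointer-file format; paths are hypothetical.
pointer_content = (
    "remote_file_path = metadata/task.tar.gz.task\n"
    "target_dir = EESSI/software-layer/42/0/task.tar.gz.task\n"
)

config_dict = {}
for line in pointer_content.strip().split('\n'):
    if '=' in line and not line.strip().startswith('#'):  # skip comments
        key, value = line.split('=', 1)                   # split only on the first '='
        config_dict[key.strip()] = value.strip()

assert config_dict['target_dir'] == "EESSI/software-layer/42/0/task.tar.gz.task"
```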
+ """ + branch = self.git_repo.default_branch if branch is None else branch + content = self.git_repo.get_contents(path, ref=branch) + + # Decode the content from base64 + content_str = content.decoded_content.decode('utf-8') + + return TaskState.from_string(content_str) + @log_function_entry_exit() def determine_state(self) -> TaskState: """ @@ -448,11 +502,33 @@ def determine_state(self) -> TaskState: # High-level logic: # 1. Check if path representing the task file exists in the default branch path_in_default_branch = self.description.task_object.remote_file_path - if self._path_exists_in_branch(path_in_default_branch, branch=self.git_repo.default_branch): + default_branch = self.git_repo.default_branch + if self._path_exists_in_branch(path_in_default_branch, branch=default_branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", path_in_default_branch) # TODO: determine state - exit(0) + # - get state from task file in default branch + # - get target_dir from path_in_default_branch + target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch) + # read the TaskState file in target dir + task_state_file_path = f"{target_dir}/TaskState" + task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch) + # - if branch for sequence number exists, get state from task file in corresponding branch + # - branch name is of the form REPO-PR-SEQ + # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ + # - obtain repo, pr, seq from target dir + org, repo, pr, seq, _ = target_dir.split('/') + staging_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + if self._branch_exists(staging_branch_name): + # read the TaskState file in staging branch + task_state_staging_branch = self._read_task_state_from_file(task_state_file_path, staging_branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in staging branch %s: %s", + staging_branch_name, task_state_staging_branch) + return task_state_staging_branch + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in default branch: %s", + task_state_default_branch) + return task_state_default_branch else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", path_in_default_branch) From 6f2c92f8ca1b8248fb4a4790cd39106215810960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 14:38:51 +0200 Subject: [PATCH 122/218] fix check for path --- scripts/automated_ingestion/eessi_task.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 9af53cb3..a33dd4ab 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -429,16 +429,15 @@ def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: """ Check if a path exists in a branch. 
""" + branch = self.git_repo.default_branch if branch is None else branch try: - branch = self.git_repo.default_branch if branch is None else branch - contents = self._list_directory_contents(path, branch) - if isinstance(contents, list): - return True - else: - return False + self.git_repo.get_contents(path, ref=branch) return True - except FileNotFoundError: - return False + except GithubException as err: + if err.status == 404: + return False + else: + raise err @log_function_entry_exit() def _read_dict_from_string(self, content: str) -> dict: From 57b9da61dc407a5e1396ae2ef379d772bfb30601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 14:56:13 +0200 Subject: [PATCH 123/218] add log output when determining state --- scripts/automated_ingestion/eessi_task.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a33dd4ab..d18f4eca 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -490,8 +490,12 @@ def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState # Decode the content from base64 content_str = content.decoded_content.decode('utf-8') + log_message(LoggingScope.TASK_OPS, 'INFO', "content in TaskState file: %s", content_str) - return TaskState.from_string(content_str) + task_state = TaskState.from_string(content_str) + log_message(LoggingScope.TASK_OPS, 'INFO', "task state: %s", task_state) + + return task_state @log_function_entry_exit() def determine_state(self) -> TaskState: From c53729fec1c6a5d27ffee9f30109353fea0fb720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 15:18:12 +0200 Subject: [PATCH 124/218] fix from_string --- scripts/automated_ingestion/eessi_task.py | 27 +++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d18f4eca..d2e1a7c7 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -33,17 +33,30 @@ class TaskState(Enum): @classmethod def from_string(cls, name, default=None, case_sensitive=False): + log_message(LoggingScope.TASK_OPS, 'INFO', "from_string: %s", name) if case_sensitive: - return cls.__members__.get(name, default) + to_return = cls.__members__.get(name, default) + log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will return: %s", to_return) + return to_return try: - return next( - member for member_name, member in cls.__members__.items() - if member_name.lower() == name.lower() - ) - except StopIteration: + to_return = cls[name.upper()] + log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will return: %s", to_return) + return to_return + except KeyError: return default +# try: +# log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will iterate over: %s", cls.__members__) +# to_return = next( +# member for member_name, member in cls.__members__.items() +# if member_name.lower() == name.lower() +# ) +# log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will return: %s", to_return) +# return to_return +# except StopIteration: +# return default + def __str__(self): return self.name.lower() @@ -489,7 +502,7 @@ def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState content = self.git_repo.get_contents(path, ref=branch) # Decode the content from base64 - content_str = 
content.decoded_content.decode('utf-8') + content_str = content.decoded_content.decode('utf-8').strip() log_message(LoggingScope.TASK_OPS, 'INFO', "content in TaskState file: %s", content_str) task_state = TaskState.from_string(content_str) From af6e1d880122e40c2da6c053d57c0571d8ebe905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 15:25:15 +0200 Subject: [PATCH 125/218] return upper case state name --- scripts/automated_ingestion/eessi_task.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d2e1a7c7..1b17dee2 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -46,19 +46,8 @@ def from_string(cls, name, default=None, case_sensitive=False): except KeyError: return default -# try: -# log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will iterate over: %s", cls.__members__) -# to_return = next( -# member for member_name, member in cls.__members__.items() -# if member_name.lower() == name.lower() -# ) -# log_message(LoggingScope.TASK_OPS, 'INFO', "from_string will return: %s", to_return) -# return to_return -# except StopIteration: -# return default - def __str__(self): - return self.name.lower() + return self.name.upper() class EESSITask: From c76b68b440cd6835ddec4d15c8abcffa2abf3840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 15:34:35 +0200 Subject: [PATCH 126/218] use lower state name to create handler name --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 1b17dee2..08c59948 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -547,7 +547,7 @@ def handle(self): state_before_handle = self.determine_state() # Construct handler method name - handler_name = f"_handle_{self.action}_{state_before_handle}" + handler_name = f"_handle_{self.action}_{state_before_handle.lower()}" # Check if the handler exists handler = getattr(self, handler_name, None) From ed75818b5a7629b55db5539fe8ecd1c42bc5fe94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 15:36:57 +0200 Subject: [PATCH 127/218] convert task state to str first --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 08c59948..7cf9689f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -547,7 +547,7 @@ def handle(self): state_before_handle = self.determine_state() # Construct handler method name - handler_name = f"_handle_{self.action}_{state_before_handle.lower()}" + handler_name = f"_handle_{self.action}_{str(state_before_handle).lower()}" # Check if the handler exists handler = getattr(self, handler_name, None) From bc61294744c86996ba5305185da81ca3ed5b9e0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 16:28:51 +0200 Subject: [PATCH 128/218] complete handler for state NEW_TASK --- scripts/automated_ingestion/eessi_task.py | 90 ++++++++++++----------- 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 
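Patches 124 to 127 together fix one round trip: the state name is stored upper-case in the TaskState file, parsed case-insensitively, and lowered again when the dispatcher builds the handler name. A self-contained sketch of that round trip (the `add` action string is a hypothetical stand-in for the task's action):

```python
# Round trip: file content -> TaskState -> handler method name.
from enum import Enum, auto

class TaskState(Enum):
    UNDETERMINED = auto()
    NEW_TASK = auto()

    @classmethod
    def from_string(cls, name, default=None):
        try:
            return cls[name.upper()]  # case-insensitive lookup
        except KeyError:
            return default

    def __str__(self):
        return self.name.upper()

state = TaskState.from_string("NEW_TASK\n".strip())  # as read from a TaskState file
handler_name = f"_handle_add_{str(state).lower()}"   # 'add' is a hypothetical action
assert handler_name == "_handle_add_new_task"
```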
7cf9689f..6d2f799c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -417,14 +417,15 @@ def _list_directory_contents(self, directory_path, branch=None): raise err @log_function_entry_exit() - def _next_state(self) -> TaskState: + def _next_state(self, state: TaskState = None) -> TaskState: """ Determine the next state based on the current state using the valid_transitions dictionary. NOTE, it assumes that function is only called for non-terminal states and that the next state is the first element of the list returned by the valid_transitions dictionary. """ - return self.valid_transitions[self.state][0] + the_state = state if state is not None else self.determine_state() + return self.valid_transitions[the_state][0] @log_function_entry_exit() def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: @@ -504,24 +505,22 @@ def determine_state(self) -> TaskState: """ Determine the state of the task based on the state of the staging repository. """ - # High-level logic: - # 1. Check if path representing the task file exists in the default branch + # check if path representing the task file exists in the default branch path_in_default_branch = self.description.task_object.remote_file_path default_branch = self.git_repo.default_branch if self._path_exists_in_branch(path_in_default_branch, branch=default_branch): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", path_in_default_branch) - # TODO: determine state - # - get state from task file in default branch - # - get target_dir from path_in_default_branch + # get state from task file in default branch + # - get target_dir from path_in_default_branch target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch) # read the TaskState file in target dir task_state_file_path = f"{target_dir}/TaskState" task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch) - # - if branch for sequence number exists, get state from task file in corresponding branch - # - branch name is of the form REPO-PR-SEQ - # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ - # - obtain repo, pr, seq from target dir + # if branch for sequence number exists, get state from task file in corresponding branch + # - branch name is of the form REPO-PR-SEQ + # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ + # - obtain repo, pr, seq from target dir org, repo, pr, seq, _ = target_dir.split('/') staging_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" if self._branch_exists(staging_branch_name): @@ -690,6 +689,31 @@ def _create_multi_file_commit(self, files_data, commit_message, branch=None): return new_commit + @log_function_entry_exit() + def _update_file(self, file_path, new_content, commit_message, branch=None): + try: + branch = self.git_repo.default_branch if branch is None else branch + + # Get the current file + file = self.git_repo.get_contents(file_path, ref=branch) + + # Update the file + result = self.git_repo.update_file( + path=file_path, + message=commit_message, + content=new_content, + sha=file.sha, + branch=branch + ) + + log_message(LoggingScope.TASK_OPS, 'INFO', + "File updated successfully. 
Commit SHA: %s", result['commit'].sha) + return result + + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error updating file: %s", err) + return None + @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" @@ -759,40 +783,24 @@ def _handle_add_new_task(self): payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) - # determine next state (NEXT_STATE), put metadata/task file into GH staging repo in main branch under directory - # REPO/PR_NUM/SEQ_NUM/task_file_name.NEXT_STATE + + # determine next state (NEXT_STATE), update TaskState file content next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + target_dir = self._read_target_dir_from_file(self.description.task_object.remote_file_path, + self.git_repo.default_branch) + task_state_file_path = f"{target_dir}/TaskState" + default_branch = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - repo_pr_dir = f"{repo_name}/{pr_number}" - sequence_numbers = self._determine_sequence_numbers_including_task_file(repo_name, pr_number) - if len(sequence_numbers) == 0: - sequence_number = 0 - else: - # we need to figure out the status of the last deployment (with the highest sequence number) - # if a PR exists and it is closed, we add the task to the *next* higher sequence number - # otherwise we add the task to the highest sequence number - sequence_number = self._find_highest_number(sequence_numbers.keys()) - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - if branch_name in [branch.name for branch in self.git_repo.get_branches()]: - # branch exists, check if PR exists - find_pr = [pr for pr in self.git_repo.get_pulls(head=branch_name, state='all')] - if find_pr: - pr = find_pr.pop(0) - if pr.state == 'closed': - sequence_number += 1 - # we use the basename of the remote file path for the task description file - task_file_name = self.description.get_task_file_name() - staging_repo_path = f"{repo_pr_dir}/{sequence_number}/{task_file_name}.{next_state}" - log_message(LoggingScope.TASK_OPS, 'INFO', "staging_repo_path: %s", staging_repo_path) - # contents of task description / metadata file - contents = self.description.get_contents() - self.git_repo.create_file(staging_repo_path, - f"new task for {repo_name} PR {pr_number} seq {sequence_number}: add build for arch", - contents) - self.state = next_state - return True + seq_num = self._get_fixed_sequence_number() + commit_message = f"changing task state for repo {repo_name} PR {pr_number} seq {seq_num} to {next_state}" + self._update_file(task_state_file_path, + f"{next_state.name}\n", + commit_message, + branch=default_branch) + + return next_state @log_function_entry_exit() def _handle_add_payload_staged(self): From 2ac47b69ca13f8a8706b4a052ebf8b24d6c2f759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 16:34:31 +0200 Subject: [PATCH 129/218] add TODO about checking validity of sequence number and corresponding branch --- scripts/automated_ingestion/eessi_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6d2f799c..92ee380e 100644 --- 
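The `_update_file` helper follows the standard PyGithub update idiom: fetch the current file to obtain its blob SHA, then update it in a single commit. A minimal sketch with a hypothetical repository and path, assuming `PAYLOAD_STAGED` is the successor state being written:

```python
# Updating the TaskState file in place; repository and path are hypothetical.
from github import Github

repo = Github("<token>").get_repo("EESSI/staging")
path = "EESSI/software-layer/42/0/task.tar.gz.task/TaskState"

current = repo.get_contents(path, ref=repo.default_branch)  # needed for its blob SHA
repo.update_file(path=path,
                 message="change task state to PAYLOAD_STAGED",
                 content="PAYLOAD_STAGED\n",
                 sha=current.sha,
                 branch=repo.default_branch)
```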
a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -800,6 +800,8 @@ def _handle_add_new_task(self): commit_message, branch=default_branch) + # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number + # is still open or yet to be created); if it is not valid, perform corrective actions return next_state @log_function_entry_exit() From 62bd006cae9e78d12bc0573d183dcf771d3aad80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 21:37:20 +0200 Subject: [PATCH 130/218] first part for handling task after payload got staged --- scripts/automated_ingestion/eessi_task.py | 126 +++++++++++----------- 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 92ee380e..868263eb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -13,6 +13,7 @@ from github import Github, GithubException, InputGitTreeElement, UnknownObjectException from github.PullRequest import PullRequest +from github.Branch import Branch class SequenceStatus(Enum): @@ -471,17 +472,21 @@ def _read_target_dir_from_file(self, path: str, branch: str = None) -> str: return config_dict.get('target_dir', None) @log_function_entry_exit() - def _branch_exists(self, branch_name: str) -> bool: + def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: """ - Check if a branch exists. + Get a branch object from its name. """ + if not branch_name: + return self.git_repo.default_branch + try: - self.git_repo.get_branch(branch_name) - return True + branch = self.git_repo.get_branch(branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s exists: %s", branch_name, branch) + return branch except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "error checking if branch %s exists: %s", branch_name, err) - return False + return None @log_function_entry_exit() def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState: @@ -523,7 +528,7 @@ def determine_state(self) -> TaskState: # - obtain repo, pr, seq from target dir org, repo, pr, seq, _ = target_dir.split('/') staging_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" - if self._branch_exists(staging_branch_name): + if self._get_branch_from_name(staging_branch_name): # read the TaskState file in staging branch task_state_staging_branch = self._read_task_state_from_file(task_state_file_path, staging_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "task state in staging branch %s: %s", @@ -767,18 +772,21 @@ def _handle_add_undetermined(self): def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" print("Handling ADD action in NEW_TASK state") - # Implementation for adding in NEW_TASK state: a task is only NEW_TASK if it was not processed yet + # get name of of payload from metadata payload_name = self.description.metadata['payload']['filename'] log_message(LoggingScope.TASK_OPS, 'INFO', "payload_name: %s", payload_name) + # get config and remote_client from self.description.task_object config = self.description.task_object.config remote_client = self.description.task_object.remote_client + # determine remote_file_path by replacing basename of remote_file_path in self.description.task_object # with payload_name description_remote_file_path = self.description.task_object.remote_file_path payload_remote_file_path = 
os.path.join(os.path.dirname(description_remote_file_path), payload_name) log_message(LoggingScope.TASK_OPS, 'INFO', "payload_remote_file_path: %s", payload_remote_file_path) + # initialize payload object payload_object = EESSIDataAndSignatureObject(config, payload_remote_file_path, remote_client) self.payload = EESSITaskPayload(payload_object) @@ -804,66 +812,60 @@ def _handle_add_new_task(self): # is still open or yet to be created); if it is not valid, perform corrective actions return next_state + @log_function_entry_exit() + def _determine_branch_name_from_sequence_number(self, sequence_number: int = None) -> str: + """Determine the branch name from the sequence number""" + sequence_number = self._get_fixed_sequence_number() if sequence_number is None else sequence_number + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + return f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + + @log_function_entry_exit() + def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: + """ + Find the single PR for the given branch in any state. + + Args: + repo: GitHub repository + branch_name: Name of the branch + + Returns: + PullRequest object if found, None otherwise + """ + try: + head_ref = f"{self.git_repo.owner.login}:{branch_name}" + prs = list(self.git_repo.get_pulls(state='all', head=head_ref)) + return prs[0] if prs else None + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) + return None + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" print("Handling ADD action in PAYLOAD_STAGED state") - # Implementation for adding in PAYLOAD_STAGED state - # - create or find PR - # - update PR contents - # determine PR - # - no PR -> create one - # - PR && closed -> create one (may require to move task file to different sequence number) - # - PR && open -> update PR contents, task file status, etc - # TODO: determine sequence number, then use it to find staging pr - # find staging PR - sequence_number = self._get_sequence_number_for_task_file() - staging_pr, staging_branch = self._find_staging_pr(sequence_number) - # create PR if necessary - if staging_pr is None and sequence_number is None: - # no PR found, create one - staging_pr, staging_branch = self._create_staging_pr(sequence_number) - elif staging_pr is None and sequence_number is not None: - # no PR found, create one - staging_pr, staging_branch = self._create_staging_pr(sequence_number) - elif staging_pr.state == 'closed': - # PR closed, create new one - staging_pr, staging_branch = self._create_staging_pr(sequence_number + 1) - if staging_pr is None: - # something went wrong, we cannot continue - log_message(LoggingScope.ERROR, 'ERROR', "no staging PR found for task %s", self.description) - return False - # update PR contents - self._update_pr_contents(staging_pr) - # update task file status - self._update_task_file_status(staging_branch) - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - # current sequence - sequence_number = self._get_current_sequence_number() - sequence_status = self._determine_sequence_status(sequence_number) - if sequence_status == SequenceStatus.FINISHED: - sequence_number += 1 - # re-determine sequence status - sequence_status = self._determine_sequence_status(sequence_number) - if sequence_status == SequenceStatus.DOES_NOT_EXIST: - # something is 
odd, the task file should already be in the default branch - log_message(LoggingScope.ERROR, 'ERROR', "sequence number %s does not exist", sequence_number) - return False - elif sequence_status == SequenceStatus.FINISHED: - # we need to figure out the status of the last deployment (with the highest sequence number) - branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s exists", branch_name) - # check if branch exists - # - yes: check if corresponding PR exists - # - yes: check status of PR - # - open: rename file and add it to branch, set state, update PR contents, return - # - closed && !merged: rename file to rejected, set state - # - else: weird state, log message, return - # - no: delete branch - # create new branch, add task file to branch, set state, create PR, update PR contents, return - return True + branch_name = self._determine_branch_name_from_sequence_number() + branch = self._get_branch_from_name(branch_name) + if not branch: + # branch for sequence number does not exist + # TODO: could have been merged already --> check if sequence directory exists + # ASSUME: it has not existed before --> create it + branch = self.git_repo.create_git_ref(f"refs/heads/{branch_name}", self.git_repo.default_branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s created: %s", branch_name, branch) + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "found existing branch for %s: %s", branch_name, branch) + + pr = self._find_pr_for_branch(branch_name) + if not pr: + log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", branch_name) + # TODO: create PR + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "found existing PR for branch %s: %s", branch_name, pr) + # TODO: check if PR is open or closed + # TODO: if closed, create issue (PR already closed) + + return TaskState.PAYLOAD_STAGED @log_function_entry_exit() def _handle_add_pull_request(self): From 16f0a304300a3c880aca2283996c8f9fa2a00d3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 7 Jun 2025 22:09:59 +0200 Subject: [PATCH 131/218] use sha for creating branch + make variable less ambiguous --- scripts/automated_ingestion/eessi_task.py | 105 +++++++++++----------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 868263eb..0080a48e 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -98,44 +98,44 @@ def _determine_task_action(self) -> EESSITaskAction: return EESSITaskAction.UNKNOWN @log_function_entry_exit() - def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch=None) -> bool: + def _state_file_with_prefix_exists_in_repo_branch(self, file_path_prefix: str, branch_name: str = None) -> bool: """ Check if a file exists in a repository branch. 
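One detail worth flagging in `_handle_add_payload_staged` above: `create_git_ref()` is called with the default branch name as its second argument, but the Git Data API expects a commit SHA there; the next patch's title, "use sha for creating branch", points at exactly that. A sketch of the corrected call, with hypothetical names:

```python
# Creating a staging branch from the default branch head; names are hypothetical.
from github import Github

repo = Github("<token>").get_repo("EESSI/staging")
default = repo.get_branch(repo.default_branch)
repo.create_git_ref(ref="refs/heads/EESSI-software-layer-PR-42-SEQ-0",
                    sha=default.commit.sha)  # SHA of the default branch head, not its name
```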
Args: file_path_prefix: the prefix of the file path - branch: the branch to check + branch_name: the branch to check Returns: True if a file with the prefix exists in the branch, False otherwise """ - if branch is None: - branch = self.git_repo.default_branch + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + # branch = self._get_branch_from_name(branch_name) try: # get all files in directory part of file_path_prefix directory_part = os.path.dirname(file_path_prefix) - files = self.git_repo.get_contents(directory_part, ref=branch) + files = self.git_repo.get_contents(directory_part, ref=branch_name) log_msg = "Found files %s in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, files, directory_part, branch_name) # check if any of the files has file_path_prefix as prefix for file in files: if file.path.startswith(file_path_prefix): log_msg = "Found file %s in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file.path, directory_part, branch_name) return True log_msg = "No file with prefix %s found in directory %s in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, file_path_prefix, directory_part, branch_name) return False except UnknownObjectException: # file_path does not exist in branch log_msg = "Directory %s or file with prefix %s does not exist in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) return False except GithubException as err: if err.status == 404: # file_path does not exist in branch log_msg = "Directory %s or file with prefix %s does not exist in branch %s" - log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', log_msg, directory_part, file_path_prefix, branch_name) return False else: # if there was some other (e.g. 
connection) issue, log message and return False @@ -317,9 +317,10 @@ def _create_staging_pr(self, sequence_number: int) -> Tuple[PullRequest, str]: repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" + default_branch_name = self.git_repo.default_branch pr = self.git_repo.create_pull(title=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", body=f"Add task for {repo_name} PR {pr_number} seq {sequence_number}", - head=branch_name, base=self.git_repo.default_branch) + head=branch_name, base=default_branch_name) return pr, branch_name @log_function_entry_exit() @@ -399,12 +400,13 @@ def _get_state_for_metadata_file_prefix(self, metadata_file_state_path_prefix: s return state @log_function_entry_exit() - def _list_directory_contents(self, directory_path, branch=None): + def _list_directory_contents(self, directory_path, branch_name: str = None): try: # Get contents of the directory - branch = self.git_repo.default_branch if branch is None else branch - log_message(LoggingScope.TASK_OPS, 'INFO', "listing contents of %s in branch %s", directory_path, branch) - contents = self.git_repo.get_contents(directory_path, ref=branch) + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + log_message(LoggingScope.TASK_OPS, 'INFO', + "listing contents of %s in branch %s", directory_path, branch_name) + contents = self.git_repo.get_contents(directory_path, ref=branch_name) # If contents is a list, it means we successfully got directory contents if isinstance(contents, list): @@ -429,13 +431,13 @@ def _next_state(self, state: TaskState = None) -> TaskState: return self.valid_transitions[the_state][0] @log_function_entry_exit() - def _path_exists_in_branch(self, path: str, branch: str = None) -> bool: + def _path_exists_in_branch(self, path: str, branch_name: str = None) -> bool: """ Check if a path exists in a branch. """ - branch = self.git_repo.default_branch if branch is None else branch + branch_name = self.git_repo.default_branch if branch_name is None else branch_name try: - self.git_repo.get_contents(path, ref=branch) + self.git_repo.get_contents(path, ref=branch_name) return True except GithubException as err: if err.status == 404: @@ -456,12 +458,12 @@ def _read_dict_from_string(self, content: str) -> dict: return config_dict @log_function_entry_exit() - def _read_target_dir_from_file(self, path: str, branch: str = None) -> str: + def _read_target_dir_from_file(self, path: str, branch_name: str = None) -> str: """ Read the target directory from the file in the given branch. """ - branch = self.git_repo.default_branch if branch is None else branch - content = self.git_repo.get_contents(path, ref=branch) + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + content = self.git_repo.get_contents(path, ref=branch_name) # Decode the content from base64 content_str = content.decoded_content.decode('utf-8') @@ -476,8 +478,7 @@ def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: """ Get a branch object from its name. 
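The lookup-or-None pattern used here, sketched independently of the class (again assuming a PyGithub `Repository` object `repo`):

```python
# Sketch: resolve a branch name to a Branch object, mapping 404 to None.
from typing import Optional

from github import GithubException
from github.Branch import Branch


def get_branch_or_none(repo, branch_name: str) -> Optional[Branch]:
    try:
        return repo.get_branch(branch_name)
    except GithubException as err:
        if err.status == 404:  # branch does not exist
            return None
        raise  # propagate other errors (e.g. connection issues)
```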
""" - if not branch_name: - return self.git_repo.default_branch + branch_name = self.git_repo.default_branch if branch_name is None else branch_name try: branch = self.git_repo.get_branch(branch_name) @@ -489,12 +490,12 @@ def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: return None @log_function_entry_exit() - def _read_task_state_from_file(self, path: str, branch: str = None) -> TaskState: + def _read_task_state_from_file(self, path: str, branch_name: str = None) -> TaskState: """ Read the task state from the file in the given branch. """ - branch = self.git_repo.default_branch if branch is None else branch - content = self.git_repo.get_contents(path, ref=branch) + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + content = self.git_repo.get_contents(path, ref=branch_name) # Decode the content from base64 content_str = content.decoded_content.decode('utf-8').strip() @@ -512,16 +513,16 @@ def determine_state(self) -> TaskState: """ # check if path representing the task file exists in the default branch path_in_default_branch = self.description.task_object.remote_file_path - default_branch = self.git_repo.default_branch - if self._path_exists_in_branch(path_in_default_branch, branch=default_branch): + default_branch_name = self.git_repo.default_branch + if self._path_exists_in_branch(path_in_default_branch, branch_name=default_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", path_in_default_branch) # get state from task file in default branch # - get target_dir from path_in_default_branch - target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch) + target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch_name) # read the TaskState file in target dir task_state_file_path = f"{target_dir}/TaskState" - task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch) + task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch_name) # if branch for sequence number exists, get state from task file in corresponding branch # - branch name is of the form REPO-PR-SEQ # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ @@ -568,11 +569,11 @@ def handle(self): # Implement handlers for ADD action @log_function_entry_exit() - def _create_symlink(self, source_path: str, target_path: str, branch: str = None): + def _create_symlink(self, source_path: str, target_path: str, branch_name: str = None): """Create a symlink in the given branch.""" try: - branch = self.git_repo.default_branch if branch is None else branch - ref = self.git_repo.get_git_ref(f"heads/{branch}") + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + ref = self.git_repo.get_git_ref(f"heads/{branch_name}") commit = self.git_repo.get_git_commit(ref.object.sha) base_tree = self.git_repo.get_git_tree(commit.tree.sha) @@ -625,22 +626,22 @@ def _create_symlink(self, source_path: str, target_path: str, branch: str = None return False @log_function_entry_exit() - def _safe_create_file(self, path: str, message: str, content: str, branch: str = None): + def _safe_create_file(self, path: str, message: str, content: str, branch_name: str = None): """Create a file in the given branch.""" try: - branch = self.git_repo.default_branch if branch is None else branch - existing_file = self.git_repo.get_contents(path, ref=branch) + branch_name = self.git_repo.default_branch if branch_name 
is None else branch_name + existing_file = self.git_repo.get_contents(path, ref=branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "File %s already exists", path) return existing_file except GithubException as err: if err.status == 404: # File doesn't exist # Safe to create - return self.git_repo.create_file(path, message, content, branch=branch) + return self.git_repo.create_file(path, message, content, branch=branch_name) else: raise err # Some other error @log_function_entry_exit() - def _create_multi_file_commit(self, files_data, commit_message, branch=None): + def _create_multi_file_commit(self, files_data, commit_message, branch_name: str = None): """ Create a commit with multiple file changes @@ -656,8 +657,8 @@ def _create_multi_file_commit(self, files_data, commit_message, branch=None): } } """ - branch = self.git_repo.default_branch if branch is None else branch - ref = self.git_repo.get_git_ref(f"heads/{branch}") + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + ref = self.git_repo.get_git_ref(f"heads/{branch_name}") current_commit = self.git_repo.get_git_commit(ref.object.sha) base_tree = current_commit.tree @@ -695,12 +696,12 @@ def _create_multi_file_commit(self, files_data, commit_message, branch=None): return new_commit @log_function_entry_exit() - def _update_file(self, file_path, new_content, commit_message, branch=None): + def _update_file(self, file_path, new_content, commit_message, branch_name: str = None): try: - branch = self.git_repo.default_branch if branch is None else branch + branch_name = self.git_repo.default_branch if branch_name is None else branch_name # Get the current file - file = self.git_repo.get_contents(file_path, ref=branch) + file = self.git_repo.get_contents(file_path, ref=branch_name) # Update the file result = self.git_repo.update_file( @@ -708,7 +709,7 @@ def _update_file(self, file_path, new_content, commit_message, branch=None): message=commit_message, content=new_content, sha=file.sha, - branch=branch + branch=branch_name ) log_message(LoggingScope.TASK_OPS, 'INFO', @@ -727,7 +728,7 @@ def _handle_add_undetermined(self): # create task file in target directory (TARGET_DIR/TaskDescription) # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) # create pointer file from task file path to target directory (remote_file_path -> TARGET_DIR) - branch = self.git_repo.default_branch + branch_name = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR @@ -756,7 +757,7 @@ def _handle_add_undetermined(self): commit = self._create_multi_file_commit( files_to_commit, f"new task for {repo_name} PR {pr_number} seq {sequence_number}", - branch=branch + branch_name=branch_name ) log_message(LoggingScope.TASK_OPS, 'INFO', "commit created: %s", commit) except Exception as err: @@ -795,10 +796,10 @@ def _handle_add_new_task(self): # determine next state (NEXT_STATE), update TaskState file content next_state = self._next_state() log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + default_branch_name = self.git_repo.default_branch target_dir = self._read_target_dir_from_file(self.description.task_object.remote_file_path, - self.git_repo.default_branch) + default_branch_name) task_state_file_path = f"{target_dir}/TaskState" - default_branch = self.git_repo.default_branch repo_name = 
self.description.get_repo_name() pr_number = self.description.get_pr_number() seq_num = self._get_fixed_sequence_number() @@ -806,7 +807,7 @@ def _handle_add_new_task(self): self._update_file(task_state_file_path, f"{next_state.name}\n", commit_message, - branch=default_branch) + branch_name=default_branch_name) # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number # is still open or yet to be created); if it is not valid, perform corrective actions @@ -847,11 +848,15 @@ def _handle_add_payload_staged(self): branch_name = self._determine_branch_name_from_sequence_number() branch = self._get_branch_from_name(branch_name) + default_branch_name = self.git_repo.default_branch + default_branch = self._get_branch_from_name(default_branch_name) + default_sha = default_branch.commit.sha if not branch: # branch for sequence number does not exist # TODO: could have been merged already --> check if sequence directory exists # ASSUME: it has not existed before --> create it - branch = self.git_repo.create_git_ref(f"refs/heads/{branch_name}", self.git_repo.default_branch) + log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s does not exist, creating it", branch_name) + branch = self.git_repo.create_git_ref(f"refs/heads/{branch_name}", default_sha) log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s created: %s", branch_name, branch) else: log_message(LoggingScope.TASK_OPS, 'INFO', "found existing branch for %s: %s", branch_name, branch) From 2b0a19109cce1579a503d3c4c1c5a41c6b4d3546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 17:59:34 +0200 Subject: [PATCH 132/218] simplify determination of state and obtain it from feature branch if it exists --- scripts/automated_ingestion/eessi_task.py | 50 +++++++++++------------ 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0080a48e..7efa928b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -512,36 +512,34 @@ def determine_state(self) -> TaskState: Determine the state of the task based on the state of the staging repository. 
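For illustration, the branch naming scheme this state lookup relies on can be sketched as follows (assuming the target directory follows the `org/repo/pr/seq/task_file` layout described in the surrounding comments; the example values are made up):

```python
# Sketch of the branch name derivation; the path layout is an assumption
# mirroring how the surrounding code splits target_dir on '/'.
def feature_branch_name(target_dir: str) -> str:
    org, repo, pr, seq, _ = target_dir.split('/')
    return f"{org}-{repo}-PR-{pr}-SEQ-{seq}"


# hypothetical example:
# feature_branch_name("EESSI/software-layer/42/3/example.task")
# -> "EESSI-software-layer-PR-42-SEQ-3"
```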
""" # check if path representing the task file exists in the default branch - path_in_default_branch = self.description.task_object.remote_file_path - default_branch_name = self.git_repo.default_branch - if self._path_exists_in_branch(path_in_default_branch, branch_name=default_branch_name): + # (name of task pointer file is the same in both the default branch and the "feature" branch) + task_pointer_file = self.description.task_object.remote_file_path + branch_to_use = self.git_repo.default_branch + + if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", - path_in_default_branch) - # get state from task file in default branch - # - get target_dir from path_in_default_branch - target_dir = self._read_target_dir_from_file(path_in_default_branch, default_branch_name) - # read the TaskState file in target dir - task_state_file_path = f"{target_dir}/TaskState" - task_state_default_branch = self._read_task_state_from_file(task_state_file_path, default_branch_name) - # if branch for sequence number exists, get state from task file in corresponding branch - # - branch name is of the form REPO-PR-SEQ - # - target dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ - # - obtain repo, pr, seq from target dir + task_pointer_file) + + # determine if there is a "feature" branch for the sequence number + # - read target dir from task pointer file in default branch + # - construct feature branch name from target dir + target_dir = self._read_target_dir_from_file(task_pointer_file, branch_to_use) org, repo, pr, seq, _ = target_dir.split('/') - staging_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" - if self._get_branch_from_name(staging_branch_name): - # read the TaskState file in staging branch - task_state_staging_branch = self._read_task_state_from_file(task_state_file_path, staging_branch_name) - log_message(LoggingScope.TASK_OPS, 'INFO', "task state in staging branch %s: %s", - staging_branch_name, task_state_staging_branch) - return task_state_staging_branch - else: - log_message(LoggingScope.TASK_OPS, 'INFO', "task state in default branch: %s", - task_state_default_branch) - return task_state_default_branch + feature_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + if self._get_branch_from_name(feature_branch_name): + branch_to_use = feature_branch_name + + # get state from task file in branch to use (default or feature) + # - read the TaskState file in target dir + task_state_file_path = f"{target_dir}/TaskState" + task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) + + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in %s branch: %s", + branch_to_use, task_state) + return task_state else: log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", - path_in_default_branch) + task_pointer_file) return TaskState.UNDETERMINED @log_function_entry_exit() From 4d1db2e64bbab5f3ffe04c19c84ae18cdc567288 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 18:12:36 +0200 Subject: [PATCH 133/218] small improvements to handler for new_task --- scripts/automated_ingestion/eessi_task.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7efa928b..c33e74f9 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -771,6 +771,9 @@ def 
_handle_add_undetermined(self): def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" print("Handling ADD action in NEW_TASK state") + # determine next state + next_state = self._next_state(TaskState.NEW_TASK) + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) # get name of payload from metadata payload_name = self.description.metadata['payload']['filename'] @@ -791,9 +794,7 @@ def _handle_add_new_task(self): self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) - # determine next state (NEXT_STATE), update TaskState file content - next_state = self._next_state() - log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + # update TaskState file content default_branch_name = self.git_repo.default_branch target_dir = self._read_target_dir_from_file(self.description.task_object.remote_file_path, default_branch_name) From 7eed7decbd56067df38f8605a6ce14e06cf1bda4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 19:33:22 +0200 Subject: [PATCH 134/218] only use specific branch for determining state, and default to main branch --- scripts/automated_ingestion/eessi_task.py | 29 ++++++++--------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index c33e74f9..c88fe526 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -507,39 +507,30 @@ def _read_task_state_from_file(self, path: str, branch_name: str = None) -> Task State: @log_function_entry_exit() - def determine_state(self) -> TaskState: + def determine_state(self, branch: str = None) -> TaskState: """ Determine the state of the task based on the state of the staging repository. 
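Reduced to its essentials, the lookup chain is roughly the following sketch (assumptions: a PyGithub `Repository` object `repo`, and a task pointer file that stores just the target directory path, which the real `_read_target_dir_from_file` abstracts):

```python
# Sketch of the state lookup: pointer file -> target dir -> TaskState file.
# Assumes a PyGithub Repository object repo; returns the state name as text.
def read_task_state(repo, pointer_file_path: str, branch_name: str = None) -> str:
    branch_name = repo.default_branch if branch_name is None else branch_name
    pointer = repo.get_contents(pointer_file_path, ref=branch_name)
    target_dir = pointer.decoded_content.decode('utf-8').strip()
    state_file = repo.get_contents(f"{target_dir}/TaskState", ref=branch_name)
    return state_file.decoded_content.decode('utf-8').strip()
```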
""" - # check if path representing the task file exists in the default branch - # (name of task pointer file is the same in both the default branch and the "feature" branch) + # check if path representing the task file exists in the default branch or the "feature" branch task_pointer_file = self.description.task_object.remote_file_path - branch_to_use = self.git_repo.default_branch + branch_to_use = self.git_repo.default_branch if branch is None else branch if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in default branch", - task_pointer_file) + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", + task_pointer_file, branch_to_use) - # determine if there is a "feature" branch for the sequence number - # - read target dir from task pointer file in default branch - # - construct feature branch name from target dir - target_dir = self._read_target_dir_from_file(task_pointer_file, branch_to_use) - org, repo, pr, seq, _ = target_dir.split('/') - feature_branch_name = f"{org}-{repo}-PR-{pr}-SEQ-{seq}" - if self._get_branch_from_name(feature_branch_name): - branch_to_use = feature_branch_name - - # get state from task file in branch to use (default or feature) + # get state from task file in branch to use # - read the TaskState file in target dir + target_dir = self._read_target_dir_from_file(task_pointer_file, branch_to_use) task_state_file_path = f"{target_dir}/TaskState" task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) - log_message(LoggingScope.TASK_OPS, 'INFO', "task state in %s branch: %s", + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch %s: %s", branch_to_use, task_state) return task_state else: - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in default branch", - task_pointer_file) + log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in branch %s", + task_pointer_file, branch_to_use) return TaskState.UNDETERMINED @log_function_entry_exit() From 58c7bdc112978dc05ec299705f6d438868292920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 20:31:19 +0200 Subject: [PATCH 135/218] use different method to determine feature branch name --- scripts/automated_ingestion/eessi_task.py | 47 +++++++++++++++-------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index c88fe526..0dcdb680 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -831,32 +831,47 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) return None + @log_function_entry_exit() + def _determine_feature_branch_name(self) -> str: + """Determine the feature branch name from the target directory name""" + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # target_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + org, repo, pr, seq, _ = target_dir.split('/') + return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" print("Handling ADD action in PAYLOAD_STAGED state") - 
branch_name = self._determine_branch_name_from_sequence_number() - branch = self._get_branch_from_name(branch_name) - default_branch_name = self.git_repo.default_branch - default_branch = self._get_branch_from_name(default_branch_name) - default_sha = default_branch.commit.sha - if not branch: - # branch for sequence number does not exist - # TODO: could have been merged already --> check if sequence directory exists + feature_branch_name = self._determine_feature_branch_name() + feature_branch = self._get_branch_from_name(feature_branch_name) + if not feature_branch: + # feature branch does not exist + # TODO: could have been merged already --> check if PR corresponding to the feature branch exists # ASSUME: it has not existed before --> create it - log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s does not exist, creating it", branch_name) - branch = self.git_repo.create_git_ref(f"refs/heads/{branch_name}", default_sha) - log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s created: %s", branch_name, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', + "branch %s does not exist, creating it", feature_branch_name) + + default_branch_name = self.git_repo.default_branch + default_branch = self._get_branch_from_name(default_branch_name) + default_sha = default_branch.commit.sha + feature_branch = self.git_repo.create_git_ref(f"refs/heads/{feature_branch_name}", default_sha) + log_message(LoggingScope.TASK_OPS, 'INFO', + "branch %s created: %s", feature_branch_name, feature_branch) else: - log_message(LoggingScope.TASK_OPS, 'INFO', "found existing branch for %s: %s", branch_name, branch) + log_message(LoggingScope.TASK_OPS, 'INFO', + "found existing branch for %s: %s", feature_branch_name, feature_branch) - pr = self._find_pr_for_branch(branch_name) - if not pr: - log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", branch_name) + pull_request = self._find_pr_for_branch(feature_branch_name) + if not pull_request: + log_message(LoggingScope.TASK_OPS, 'INFO', + "no PR found for branch %s", feature_branch_name) # TODO: create PR else: - log_message(LoggingScope.TASK_OPS, 'INFO', "found existing PR for branch %s: %s", branch_name, pr) + log_message(LoggingScope.TASK_OPS, 'INFO', + "found existing PR for branch %s: %s", feature_branch_name, pull_request) # TODO: check if PR is open or closed # TODO: if closed, create issue (PR already closed) From f4bf9177012dc43c4e5eb1406a627f3a3c3a5aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 22:12:45 +0200 Subject: [PATCH 136/218] create first version of PR plus some related improvements --- .../automated_ingestion.py | 2 + scripts/automated_ingestion/eessi_task.py | 93 ++++++++++++++----- 2 files changed, 73 insertions(+), 22 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index e0ca710b..0ebce03e 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -257,6 +257,8 @@ def main(): EESSITaskDescription( EESSIDataAndSignatureObject(config, task_path, s3_bucket) ), + config, + cvmfs_repo, gh_staging_repo ) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0dcdb680..165507ed 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -55,12 +55,14 @@ class EESSITask: description: EESSITaskDescription payload: EESSITaskPayload action: 
EESSITaskAction - state: TaskState git_repo: Github + config: Dict @log_function_entry_exit() - def __init__(self, description: EESSITaskDescription, git_repo: Github): + def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: str, git_repo: Github): self.description = description + self.config = config + self.cvmfs_repo = cvmfs_repo self.git_repo = git_repo self.action = self._determine_task_action() @@ -685,7 +687,7 @@ def _create_multi_file_commit(self, files_data, commit_message, branch_name: str return new_commit @log_function_entry_exit() - def _update_file(self, file_path, new_content, commit_message, branch_name: str = None): + def _update_file(self, file_path, new_content, commit_message, branch_name: str = None) -> Optional[Dict]: try: branch_name = self.git_repo.default_branch if branch_name is None else branch_name @@ -758,6 +760,22 @@ def _handle_add_undetermined(self): # is still open or yet to be created); if it is not valid, perform corrective actions return TaskState.NEW_TASK + @log_function_entry_exit() + def _update_task_state_file(self, next_state: TaskState, branch_name: str = None) -> Optional[Dict]: + """Update the TaskState file content in default or given branch""" + branch_name = self.git_repo.default_branch if branch_name is None else branch_name + + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, branch_name) + task_state_file_path = f"{target_dir}/TaskState" + _, repo, pr, seq, _ = target_dir.split('/') + commit_message = f"changing task state for repo {repo} PR {pr} seq {seq} to {next_state}" + result = self._update_file(task_state_file_path, + f"{next_state.name}\n", + commit_message, + branch_name=branch_name) + return result + @log_function_entry_exit() def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" @@ -786,18 +804,7 @@ def _handle_add_new_task(self): log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) # update TaskState file content - default_branch_name = self.git_repo.default_branch - target_dir = self._read_target_dir_from_file(self.description.task_object.remote_file_path, - default_branch_name) - task_state_file_path = f"{target_dir}/TaskState" - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - seq_num = self._get_fixed_sequence_number() - commit_message = f"changing task state for repo {repo_name} PR {pr_number} seq {seq_num} to {next_state}" - self._update_file(task_state_file_path, - f"{next_state.name}\n", - commit_message, - branch_name=default_branch_name) + self._update_task_state_file(next_state) # TODO: verify that the sequence number is still valid (PR corresponding to the sequence number # is still open or yet to be created); if it is not valid, perform corrective actions @@ -831,6 +838,15 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) return None + @log_function_entry_exit() + def _determine_sequence_number(self) -> int: + """Determine the sequence number from the target directory name""" + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # target_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + _, _, _, seq, _ = target_dir.split('/') + return 
int(seq) + @log_function_entry_exit() def _determine_feature_branch_name(self) -> str: """Determine the feature branch name from the target directory name""" @@ -844,7 +860,12 @@ def _determine_feature_branch_name(self) -> str: def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" print("Handling ADD action in PAYLOAD_STAGED state") + next_state = self._next_state(TaskState.PAYLOAD_STAGED) + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + default_branch_name = self.git_repo.default_branch + default_branch = self._get_branch_from_name(default_branch_name) + default_sha = default_branch.commit.sha feature_branch_name = self._determine_feature_branch_name() feature_branch = self._get_branch_from_name(feature_branch_name) if not feature_branch: @@ -854,9 +875,6 @@ def _handle_add_payload_staged(self): log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s does not exist, creating it", feature_branch_name) - default_branch_name = self.git_repo.default_branch - default_branch = self._get_branch_from_name(default_branch_name) - default_sha = default_branch.commit.sha feature_branch = self.git_repo.create_git_ref(f"refs/heads/{feature_branch_name}", default_sha) log_message(LoggingScope.TASK_OPS, 'INFO', "branch %s created: %s", feature_branch_name, feature_branch) @@ -868,21 +886,52 @@ def _handle_add_payload_staged(self): if not pull_request: log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", feature_branch_name) - # TODO: create PR + # update TaskState file content in feature branch + self._update_task_state_file(next_state, branch_name=feature_branch_name) + # create PR + pr_title_format = self.config['github']['grouped_pr_title'] + pr_body_format = self.config['github']['grouped_pr_body'] + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number() + pr_title = pr_title_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + repo=repo_name, + seq_num=seq_num, + ) + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents="TO BE DONE", + analysis="TO BE DONE", + action="TO BE DONE", + ) + pr = self.git_repo.create_pull( + title=pr_title, + body=pr_body, + head=feature_branch_name, + base=default_branch_name + ) + log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) + return TaskState.PULL_REQUEST else: log_message(LoggingScope.TASK_OPS, 'INFO', "found existing PR for branch %s: %s", feature_branch_name, pull_request) # TODO: check if PR is open or closed # TODO: if closed, create issue (PR already closed) - - return TaskState.PAYLOAD_STAGED + return TaskState.PULL_REQUEST @log_function_entry_exit() def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state - return True + return TaskState.PULL_REQUEST @log_function_entry_exit() def _handle_add_approved(self): From 9fda61b4bec6b50cfcde40fccb096e44babf9d16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 22:34:21 +0200 Subject: [PATCH 137/218] filter some PRs, just during development --- scripts/automated_ingestion/eessi_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py 
b/scripts/automated_ingestion/eessi_task.py index 165507ed..256f0369 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,9 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - prs = list(self.git_repo.get_pulls(state='all', head=head_ref)) + filter_prs = [16] # TODO: remove this once the PR is merged + prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) + if pr.number not in filter_prs] return prs[0] if prs else None except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) From 4686e0c5f04cce89cf3a2a099de9969e7065a13e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:16:58 +0200 Subject: [PATCH 138/218] revise state updates to reflect current and future states --- scripts/automated_ingestion/eessi_task.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 256f0369..16cef73d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -863,7 +863,8 @@ def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" print("Handling ADD action in PAYLOAD_STAGED state") next_state = self._next_state(TaskState.PAYLOAD_STAGED) - log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + approved_state = TaskState.APPROVED + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s, approved_state: %s", next_state, approved_state) default_branch_name = self.git_repo.default_branch default_branch = self._get_branch_from_name(default_branch_name) @@ -888,8 +889,15 @@ def _handle_add_payload_staged(self): if not pull_request: log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", feature_branch_name) - # update TaskState file content in feature branch - self._update_task_state_file(next_state, branch_name=feature_branch_name) + # update TaskState file content + # - next state in default branch (interpreted as current state) + # - approved state in feature branch (interpreted as future state, ie, after the PR is merged) + self._update_task_state_file(next_state, branch_name=default_branch_name) + self._update_task_state_file(approved_state, branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', + "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", + next_state, default_branch_name, approved_state, feature_branch_name) + # create PR pr_title_format = self.config['github']['grouped_pr_title'] pr_body_format = self.config['github']['grouped_pr_body'] From d4d08cfbe115d484daf1bd4166d7247211e89766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:19:31 +0200 Subject: [PATCH 139/218] filter one more PR --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 16cef73d..98dc989f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16] # 
TODO: remove this once the PR is merged + filter_prs = [16, 17] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None From 2c73ee9a90557b74cd929019ea77dc1937bdaae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:44:42 +0200 Subject: [PATCH 140/218] alternative method to update taskstate file --- scripts/automated_ingestion/eessi_task.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 98dc989f..64f8b396 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -893,7 +893,10 @@ def _handle_add_payload_staged(self): # - next state in default branch (interpreted as current state) # - approved state in feature branch (interpreted as future state, ie, after the PR is merged) self._update_task_state_file(next_state, branch_name=default_branch_name) - self._update_task_state_file(approved_state, branch_name=feature_branch_name) + # try to first update the task state file in the feature branch to + # next state (attempt to avoid merge conflicts) + self._update_task_state_file(next_state, branch_name=feature_branch_name) + # self._update_task_state_file(approved_state, branch_name=feature_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", next_state, default_branch_name, approved_state, feature_branch_name) From d5cd773c2fac8c9c5c958a567509748e8f946249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:45:29 +0200 Subject: [PATCH 141/218] also filter PR 18 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 64f8b396..f40ea04b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None From 902bae2276274ac2d161c4b7fe4b3c6c712bfe7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 8 Jun 2025 23:51:44 +0200 Subject: [PATCH 142/218] alternative method to update taskstate file --- scripts/automated_ingestion/eessi_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index f40ea04b..0c33f47a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17, 18] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18, 19] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in 
filter_prs] return prs[0] if prs else None @@ -896,7 +896,7 @@ def _handle_add_payload_staged(self): # try to first update the task state file in the feature branch to # next state (attempt to avoid merge conflicts) self._update_task_state_file(next_state, branch_name=feature_branch_name) - # self._update_task_state_file(approved_state, branch_name=feature_branch_name) + self._update_task_state_file(approved_state, branch_name=feature_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", next_state, default_branch_name, approved_state, feature_branch_name) From 35616699c077a3fbcba24a4721b8bdfa5aa8cc09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 00:11:19 +0200 Subject: [PATCH 143/218] another attempt to avoid merge conflict --- scripts/automated_ingestion/eessi_task.py | 40 ++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0c33f47a..0d6e2c64 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17, 18, 19] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18, 19, 20] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None @@ -858,6 +858,39 @@ def _determine_feature_branch_name(self) -> str: org, repo, pr, seq, _ = target_dir.split('/') return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" + @log_function_entry_exit() + def _sync_task_state_file(self, source_branch: str, target_branch: str): + """Update task state file from source to target branch""" + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) + task_state_file_path = f"{target_dir}/TaskState" + + try: + # Get content from source branch + source_content = self.git_repo.get_contents(task_state_file_path, ref=source_branch) + + # Get current file in target branch + target_file = self.git_repo.get_contents(task_state_file_path, ref=target_branch) + + # Update if content is different + if source_content.sha != target_file.sha: + result = self.git_repo.update_file( + path=task_state_file_path, + message=f"Sync {task_state_file_path} from {source_branch} to {target_branch}", + content=source_content.decoded_content, + sha=target_file.sha, + branch=target_branch + ) + log_message(LoggingScope.TASK_OPS, 'INFO', "Updated %s", task_state_file_path) + return result + else: + log_message(LoggingScope.TASK_OPS, 'INFO', "No changes needed for %s", task_state_file_path) + return None + + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Error syncing task state file: %s", err) + return None + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" @@ -893,9 +926,8 @@ def _handle_add_payload_staged(self): # - next state in default branch (interpreted as current state) # - approved state in feature branch (interpreted as future state, ie, after the PR is merged) self._update_task_state_file(next_state, branch_name=default_branch_name) - # try to first 
update the task state file in the feature branch to - # next state (attempt to avoid merge conflicts) - self._update_task_state_file(next_state, branch_name=feature_branch_name) + # sync task state file from default to feature branch (attempt to avoid merge conflicts) + self._sync_task_state_file(default_branch_name, feature_branch_name) self._update_task_state_file(approved_state, branch_name=feature_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", next_state, default_branch_name, approved_state, feature_branch_name) From 61fb94459b9f9eddd381acc6e09c2649adabfdec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 00:30:35 +0200 Subject: [PATCH 144/218] yet another attempt at avoiding merge conflict --- scripts/automated_ingestion/eessi_task.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0d6e2c64..6ddfb5b2 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17, 18, 19, 20] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18, 19, 20, 21] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None @@ -926,8 +926,13 @@ def _handle_add_payload_staged(self): # - next state in default branch (interpreted as current state) # - approved state in feature branch (interpreted as future state, ie, after the PR is merged) self._update_task_state_file(next_state, branch_name=default_branch_name) - # sync task state file from default to feature branch (attempt to avoid merge conflicts) - self._sync_task_state_file(default_branch_name, feature_branch_name) + # merge default branch into feature branch (attempt to avoid merge conflicts) + self.git_repo.merge( + head=default_branch_name, + base=feature_branch_name, + commit_message=f"Merge {default_branch_name} into {feature_branch_name}" + ) + # update task state file in feature branch self._update_task_state_file(approved_state, branch_name=feature_branch_name) log_message(LoggingScope.TASK_OPS, 'INFO', "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", From 2cae1504407c7dd5f4e01fcdabd2c6867d4fe8d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 08:57:09 +0200 Subject: [PATCH 145/218] improve readability of code --- scripts/automated_ingestion/eessi_task.py | 132 ++++++++++++++-------- 1 file changed, 86 insertions(+), 46 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6ddfb5b2..da361bbf 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -832,7 +832,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ try: head_ref = f"{self.git_repo.owner.login}:{branch_name}" - filter_prs = [16, 17, 18, 19, 20, 21] # TODO: remove this once the PR is merged + filter_prs = [16, 17, 18, 19, 20, 21, 22] # TODO: remove this once the PR is merged prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref)) if pr.number not in filter_prs] return prs[0] if prs else None @@ -891,6 +891,85 @@ 
def _sync_task_state_file(self, source_branch: str, target_branch: str): log_message(LoggingScope.TASK_OPS, 'ERROR', "Error syncing task state file: %s", err) return None + @log_function_entry_exit() + def _update_task_states(self, next_state: TaskState, default_branch_name: str, + approved_state: TaskState, feature_branch_name: str): + """ + Update task states in default and feature branches + + States have to be updated in a specific order and in particular the default branch has to be + merged into the feature branch before the feature branch can be updated to avoid a merge conflict. + + Args: + next_state: next state to be applied to the default branch + default_branch_name: name of the default branch + approved_state: state to be applied to the feature branch + feature_branch_name: name of the feature branch + """ + # TODO: add failure handling (capture failures and return them somehow) + + # update TaskState file content + # - next_state in default branch (interpreted as current state) + # - approved_state in feature branch (interpreted as future state, ie, after + # the PR corresponding to the feature branch will be merged) + + # first, update the task state file in the default branch + self._update_task_state_file(next_state, branch_name=default_branch_name) + + # second, merge default branch into feature branch (to avoid a merge conflict) + arch = self.description.task_object.arch + commit_message = f"merge {default_branch_name} into {feature_branch_name} for {arch}" + self.git_repo.merge( + head=default_branch_name, + base=feature_branch_name, + commit_message=commit_message + ) + + # last, update task state file in feature branch + self._update_task_state_file(approved_state, branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', + "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", + next_state, default_branch_name, approved_state, feature_branch_name) + + @log_function_entry_exit() + def _create_pull_request(self, feature_branch_name: str, default_branch_name: str): + """ + Create a PR from the feature branch to the default branch + + Args: + feature_branch_name: name of the feature branch + default_branch_name: name of the default branch + """ + pr_title_format = self.config['github']['grouped_pr_title'] + pr_body_format = self.config['github']['grouped_pr_body'] + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number() + pr_title = pr_title_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + repo=repo_name, + seq_num=seq_num, + ) + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents="TO BE DONE", + analysis="TO BE DONE", + action="TO BE DONE", + ) + pr = self.git_repo.create_pull( + title=pr_title, + body=pr_body, + head=feature_branch_name, + base=default_branch_name + ) + log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" @@ -922,52 +1001,13 @@ def _handle_add_payload_staged(self): if not pull_request: log_message(LoggingScope.TASK_OPS, 'INFO', "no PR found for branch %s", feature_branch_name) - # update TaskState file content - # - next state in default branch (interpreted as current state) - # - approved state in feature branch 
(interpreted as future state, ie, after the PR is merged) - self._update_task_state_file(next_state, branch_name=default_branch_name) - # merge default branch into feature branch (attempt to avoid merge conflicts) - self.git_repo.merge( - head=default_branch_name, - base=feature_branch_name, - commit_message=f"Merge {default_branch_name} into {feature_branch_name}" - ) - # update task state file in feature branch - self._update_task_state_file(approved_state, branch_name=feature_branch_name) - log_message(LoggingScope.TASK_OPS, 'INFO', - "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", - next_state, default_branch_name, approved_state, feature_branch_name) - # create PR - pr_title_format = self.config['github']['grouped_pr_title'] - pr_body_format = self.config['github']['grouped_pr_body'] - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" - seq_num = self._determine_sequence_number() - pr_title = pr_title_format.format( - cvmfs_repo=self.cvmfs_repo, - pr=pr_number, - repo=repo_name, - seq_num=seq_num, - ) - pr_body = pr_body_format.format( - cvmfs_repo=self.cvmfs_repo, - pr=pr_number, - pr_url=pr_url, - repo=repo_name, - seq_num=seq_num, - contents="TO BE DONE", - analysis="TO BE DONE", - action="TO BE DONE", - ) - pr = self.git_repo.create_pull( - title=pr_title, - body=pr_body, - head=feature_branch_name, - base=default_branch_name - ) - log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) + # TODO: add failure handling (capture result and act on it) + self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._create_pull_request(feature_branch_name, default_branch_name) + return TaskState.PULL_REQUEST else: log_message(LoggingScope.TASK_OPS, 'INFO', From 9e6d4fe80f393c22d62479120e6ac8256b389be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 09:14:20 +0200 Subject: [PATCH 146/218] fix obtaining arch value --- scripts/automated_ingestion/eessi_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index da361bbf..a3ab4b9b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -917,7 +917,9 @@ def _update_task_states(self, next_state: TaskState, default_branch_name: str, self._update_task_state_file(next_state, branch_name=default_branch_name) # second, merge default branch into feature branch (to avoid a merge conflict) - arch = self.description.task_object.arch + # TODO: store arch info (CPU+ACCEL) in task/metadata file and then access that rather + # than using a part of the file name + arch = self.description.get_metadata_file_components()[3] commit_message = f"merge {default_branch_name} into {feature_branch_name} for {arch}" self.git_repo.merge( head=default_branch_name, base=feature_branch_name, commit_message=commit_message ) From 029d5b6d3ba67e67fb9328a119c6db35ea91b614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 09:29:58 +0200 Subject: [PATCH 147/218] improve commit message when changing task state --- scripts/automated_ingestion/eessi_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a3ab4b9b..0aaee310 100644 --- 
a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -768,8 +768,8 @@ def _update_task_state_file(self, next_state: TaskState, branch_name: str = None task_pointer_file = self.description.task_object.remote_file_path target_dir = self._read_target_dir_from_file(task_pointer_file, branch_name) task_state_file_path = f"{target_dir}/TaskState" - _, repo, pr, seq, _ = target_dir.split('/') - commit_message = f"changing task state for repo {repo} PR {pr} seq {seq} to {next_state}" + arch = self.description.get_metadata_file_components()[3] + commit_message = f"change task state to {next_state} in {branch_name} for {arch}" result = self._update_file(task_state_file_path, f"{next_state.name}\n", commit_message, From b996f6bee8943461972493c1877b5f46b05a38b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 09:40:56 +0200 Subject: [PATCH 148/218] first step towards adding another task to a deployment PR --- scripts/automated_ingestion/eessi_task.py | 25 +++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0aaee310..497f53e0 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -972,6 +972,14 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st ) log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) + @log_function_entry_exit() + def _update_pull_request(self, pull_request: PullRequest, feature_branch_name: str): + """Update the pull request""" + # TODO: update sections (contents analysis, action) + # for now, function just logs a message + log_message(LoggingScope.TASK_OPS, 'INFO', + "updating pull request %s for branch %s", pull_request, feature_branch_name) + @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" @@ -1015,8 +1023,21 @@ def _handle_add_payload_staged(self): log_message(LoggingScope.TASK_OPS, 'INFO', "found existing PR for branch %s: %s", feature_branch_name, pull_request) # TODO: check if PR is open or closed - # TODO: if closed, create issue (PR already closed) - return TaskState.PULL_REQUEST + if pull_request.state == 'closed': + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is closed, creating issue", pull_request) + # TODO: create issue + return TaskState.PAYLOAD_STAGED + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is open, updating task states", pull_request) + # TODO: add failure handling (capture result and act on it) + self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) + + # TODO: add failure handling (capture result and act on it) + self._update_pull_request(pull_request, feature_branch_name) + + return TaskState.PULL_REQUEST @log_function_entry_exit() def _handle_add_pull_request(self): From a4c2d02dee7381d8ea4c39a5f49f3fa2fbba15a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 10:14:11 +0200 Subject: [PATCH 149/218] don't stop processing after first task --- scripts/automated_ingestion/automated_ingestion.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 0ebce03e..46875b33 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ 
b/scripts/automated_ingestion/automated_ingestion.py @@ -254,12 +254,8 @@ def main(): # Create EESSITask for the task file try: task = EESSITask( - EESSITaskDescription( - EESSIDataAndSignatureObject(config, task_path, s3_bucket) - ), - config, - cvmfs_repo, - gh_staging_repo + EESSITaskDescription(EESSIDataAndSignatureObject(config, task_path, s3_bucket)), + config, cvmfs_repo, gh_staging_repo ) except Exception as err: @@ -281,7 +277,6 @@ def main(): log_message(LoggingScope.GROUP_OPS, 'INFO', "Task '%s': previous state = '%s', current state = '%s'", task_path, previous_state.name, current_state.name) - exit(0) # run loop body only once # # TODO: update the information shown below (what makes sense to show?) # # Log information about the task From 121130a49b2c51ac940e0ff08d2e8616896a1f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 10:44:50 +0200 Subject: [PATCH 150/218] first step to create contents overview --- scripts/automated_ingestion/eessi_task.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 497f53e0..0952c4df 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -933,6 +933,20 @@ def _update_task_states(self, next_state: TaskState, default_branch_name: str, "TaskState file updated to %s in default branch (%s) and to %s in feature branch (%s)", next_state, default_branch_name, approved_state, feature_branch_name) + @log_function_entry_exit() + def _create_contents_overview(self) -> str: + """Create a contents overview for the pull request""" + # TODO: implement + feature_branch_name = self._determine_feature_branch_name() + task_pointer_file = self.description.task_object.remote_file_path + target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) + directories = self._list_directory_contents(target_dir, feature_branch_name) + for directory in directories: + print(directory) + # tarball_contents = self.description.task_object.get_contents_overview() + + return "TO BE DONE" + @log_function_entry_exit() def _create_pull_request(self, feature_branch_name: str, default_branch_name: str): """ @@ -954,13 +968,14 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, ) + # contents_overview = self._create_contents_overview() pr_body = pr_body_format.format( cvmfs_repo=self.cvmfs_repo, pr=pr_number, pr_url=pr_url, repo=repo_name, seq_num=seq_num, - contents="TO BE DONE", + contents=contents_overview, analysis="TO BE DONE", action="TO BE DONE", ) @@ -1044,6 +1059,8 @@ def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state + contents_overview = self._create_contents_overview() + log_message(LoggingScope.TASK_OPS, 'INFO', "contents_overview: %s", contents_overview) return TaskState.PULL_REQUEST @log_function_entry_exit() From 64c4fe807ef63df3492500141b621d3ac57e464d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 10:47:15 +0200 Subject: [PATCH 151/218] a little more log output --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0952c4df..51931217 100644 --- 
a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -941,6 +941,7 @@ def _create_contents_overview(self) -> str: task_pointer_file = self.description.task_object.remote_file_path target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) directories = self._list_directory_contents(target_dir, feature_branch_name) + print(f"target_dir: {target_dir}") for directory in directories: print(directory) # tarball_contents = self.description.task_object.get_contents_overview() From 1061e1e9b11917e6d46ad9a5182d3fc9a54b47c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 9 Jun 2025 10:49:48 +0200 Subject: [PATCH 152/218] one level up from target_dir --- scripts/automated_ingestion/eessi_task.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 51931217..e1fb7ec9 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -940,10 +940,12 @@ def _create_contents_overview(self) -> str: feature_branch_name = self._determine_feature_branch_name() task_pointer_file = self.description.task_object.remote_file_path target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) - directories = self._list_directory_contents(target_dir, feature_branch_name) + pr_dir = os.path.dirname(target_dir) + directories = self._list_directory_contents(pr_dir, feature_branch_name) print(f"target_dir: {target_dir}") + print(f"pr_dir: {pr_dir}") for directory in directories: - print(directory) + print(f"directory: {directory}") # tarball_contents = self.description.task_object.get_contents_overview() return "TO BE DONE" From ae0ad4b97d1bc3e093cd0cfdd0c0195cf181526d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 20:29:38 +0200 Subject: [PATCH 153/218] show basic task summary --- scripts/automated_ingestion/eessi_task.py | 47 +++++++++++++++++-- .../automated_ingestion/eessi_task_payload.py | 7 +++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index e1fb7ec9..a068ee44 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -934,7 +934,45 @@ def _update_task_states(self, next_state: TaskState, default_branch_name: str, next_state, default_branch_name, approved_state, feature_branch_name) @log_function_entry_exit() - def _create_contents_overview(self) -> str: + def _create_task_summary(self) -> str: + """Analyse contents of current task and create a file for it in the REPO-PR-SEQ directory.""" + + # determine task summary file path in feature branch on GitHub + feature_branch_name = self._determine_feature_branch_name() + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + sequence_number = self._get_fixed_sequence_number() # corresponds to an open PR + task_file_name = self.description.get_task_file_name() + target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + task_summary_file_path = f"{target_dir}/TaskSummary.html" + + # check if task summary file already exists in repo on GitHub + task_summary_file = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + if task_summary_file: + log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", 
task_summary_file_path) + return task_summary_file + + # create task summary + payload_name = self.description.metadata['payload']['filename'] + payload_summary = self.payload.analyse_contents() + metadata_contents = self.description.get_contents() + task_summary = f"
<details>\n<summary>{payload_name}</summary>\n<ul>\n" + task_summary += "<li>Metadata\n" + task_summary += f"<pre>{metadata_contents}</pre>\n</li>\n" + task_summary += f"<li>Payload\n{payload_summary}\n</li>\n" + task_summary += "</ul>\n" + task_summary += "</details>
\n" + + # create HTML file with task summary in REPO-PR-SEQ directory + # TODO: add failure handling (capture result and act on it) + # self._safe_create_file(task_summary_file_path, f"create task summary for {task_file_name}", + # task_summary, branch_name=feature_branch_name) + + # return task summary + return task_summary + + @log_function_entry_exit() + def _create_pr_contents_overview(self) -> str: """Create a contents overview for the pull request""" # TODO: implement feature_branch_name = self._determine_feature_branch_name() @@ -971,7 +1009,8 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, ) - # contents_overview = self._create_contents_overview() + self._create_task_summary() + # contents_overview = self._create_pr_contents_overview() pr_body = pr_body_format.format( cvmfs_repo=self.cvmfs_repo, pr=pr_number, @@ -1062,8 +1101,8 @@ def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state - contents_overview = self._create_contents_overview() - log_message(LoggingScope.TASK_OPS, 'INFO', "contents_overview: %s", contents_overview) + task_summary = self._create_task_summary() + log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) return TaskState.PULL_REQUEST @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index bba630fe..548ac086 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -35,6 +35,13 @@ def __init__(self, payload_object: EESSIDataAndSignatureObject): # Verify signature self.signature_verified = self.payload_object.verify_signature() + @log_function_entry_exit() + def analyse_contents(self) -> str: + """Analyse the contents of the payload and return a summary in a ready-to-use HTML format.""" + # TODO: implement + return "TO BE DONE" + + @log_function_entry_exit() def __str__(self) -> str: """Return a string representation of the EESSITaskPayload object.""" return f"EESSITaskPayload({self.payload_object.local_file_path}, verified={self.signature_verified})" From 7b4d2675eb5bc1a8d9622cee4d0fbfc34db1f939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 20:39:34 +0200 Subject: [PATCH 154/218] use existing method to check for existance of task summary file --- scripts/automated_ingestion/eessi_task.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a068ee44..0c4e66ec 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -947,10 +947,10 @@ def _create_task_summary(self) -> str: task_summary_file_path = f"{target_dir}/TaskSummary.html" # check if task summary file already exists in repo on GitHub - task_summary_file = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) - if task_summary_file: + if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) - return task_summary_file + # TODO: read contents of task summary file + return "DUMMY TASK SUMMARY" # create task summary payload_name = self.description.metadata['payload']['filename'] From 
a754921a2fa436d189b7e3750c1d1d23ca808882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 20:52:35 +0200 Subject: [PATCH 155/218] init payload object in constructor for EESSITask given corresponding state --- scripts/automated_ingestion/eessi_task.py | 27 +++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0c4e66ec..45394e71 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -80,7 +80,10 @@ def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: TaskState.DONE: [] # Terminal state } - # self.state = self._find_state() + state = self.determine_state() + if state >= TaskState.PAYLOAD_STAGED: + log_message(LoggingScope.TASK_OPS, 'INFO', "initializing payload object in constructor for EESSITask") + self._init_payload_object() @log_function_entry_exit() def _determine_task_action(self) -> EESSITaskAction: @@ -777,12 +780,11 @@ def _update_task_state_file(self, next_state: TaskState, branch_name: str = None return result @log_function_entry_exit() - def _handle_add_new_task(self): - """Handler for ADD action in NEW_TASK state""" - print("Handling ADD action in NEW_TASK state") - # determine next state - next_state = self._next_state(TaskState.NEW_TASK) - log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + def _init_payload_object(self): + """Initialize the payload object""" + if self.payload is not None: + log_message(LoggingScope.TASK_OPS, 'INFO', "payload object already initialized") + return # get name of of payload from metadata payload_name = self.description.metadata['payload']['filename'] @@ -803,6 +805,17 @@ def _handle_add_new_task(self): self.payload = EESSITaskPayload(payload_object) log_message(LoggingScope.TASK_OPS, 'INFO', "payload: %s", self.payload) + @log_function_entry_exit() + def _handle_add_new_task(self): + """Handler for ADD action in NEW_TASK state""" + print("Handling ADD action in NEW_TASK state") + # determine next state + next_state = self._next_state(TaskState.NEW_TASK) + log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) + + # initialize payload object + self._init_payload_object() + # update TaskState file content self._update_task_state_file(next_state) From 8a37ae17b888baddbdb7daa6fa43c0d8ed54eee9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 20:58:49 +0200 Subject: [PATCH 156/218] add comparison of TaskState values --- scripts/automated_ingestion/eessi_task.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 45394e71..07138998 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1,5 +1,6 @@ from enum import Enum, auto from typing import Dict, List, Tuple, Optional +from functools import total_ordering import os import traceback @@ -22,6 +23,7 @@ class SequenceStatus(Enum): FINISHED = auto() +@total_ordering class TaskState(Enum): UNDETERMINED = auto() # The task state was not determined yet NEW_TASK = auto() # The task has been created but not yet processed @@ -47,6 +49,11 @@ def from_string(cls, name, default=None, case_sensitive=False): except KeyError: return default + def __lt__(self, other): + if self.__class__ is other.__class__: + return self.value < other.value + return 
NotImplemented + def __str__(self): return self.name.upper() From 088fee93a879b383797a4d25ead08d399feb56fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 15 Jun 2025 21:01:43 +0200 Subject: [PATCH 157/218] init payload to None initially --- scripts/automated_ingestion/eessi_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 07138998..e5d415cb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -87,6 +87,7 @@ def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: TaskState.DONE: [] # Terminal state } + self.payload = None state = self.determine_state() if state >= TaskState.PAYLOAD_STAGED: log_message(LoggingScope.TASK_OPS, 'INFO', "initializing payload object in constructor for EESSITask") From 047d271d03468e1776969734d4aa0e25bc9980c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 04:48:46 +0200 Subject: [PATCH 158/218] create payload analysis --- .../automated_ingestion.cfg.example | 5 ++ scripts/automated_ingestion/eessi_task.py | 12 ++-- .../automated_ingestion/eessi_task_payload.py | 60 ++++++++++++++++++- 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.cfg.example b/scripts/automated_ingestion/automated_ingestion.cfg.example index bdf40fa3..98ab1e79 100644 --- a/scripts/automated_ingestion/automated_ingestion.cfg.example +++ b/scripts/automated_ingestion/automated_ingestion.cfg.example @@ -114,6 +114,11 @@ grouped_pr_body = A group of tarballs has been staged for {pr_url}. {metadata} +# Template for individual tarball PRs +task_summary_payload_template = + {tar_overview} + + [slack] ingestion_notification = yes ingestion_message = Tarball `{tarball}` has been ingested into the CVMFS repository `{cvmfs_repo}`. diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index e5d415cb..51dbf2e8 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -970,8 +970,8 @@ def _create_task_summary(self) -> str: # check if task summary file already exists in repo on GitHub if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) - # TODO: read contents of task summary file - return "DUMMY TASK SUMMARY" + task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + return task_summary.decoded_content # create task summary payload_name = self.description.metadata['payload']['filename'] @@ -980,7 +980,11 @@ def _create_task_summary(self) -> str: task_summary = f"
<details>\n<summary>{payload_name}</summary>\n<ul>\n" task_summary += "<li>Metadata\n" task_summary += f"<pre>{metadata_contents}</pre>\n</li>\n" - task_summary += f"<li>Payload\n{payload_summary}\n</li>\n" + task_summary += "<li>Overview of payload contents\n" + task_summary += self.config['github']['task_summary_payload_template'].format( + payload_overview=payload_summary, + ) + task_summary += "</li>\n" task_summary += "</ul>\n" task_summary += "</details>
\n" @@ -1038,7 +1042,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st pr_url=pr_url, repo=repo_name, seq_num=seq_num, - contents=contents_overview, + contents="TO BE DONE", analysis="TO BE DONE", action="TO BE DONE", ) diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 548ac086..9a643157 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -1,4 +1,7 @@ from dataclasses import dataclass +import tarfile +from pathlib import PurePosixPath +import os from eessi_data_object import EESSIDataAndSignatureObject from utils import log_function_entry_exit @@ -38,8 +41,61 @@ def __init__(self, payload_object: EESSIDataAndSignatureObject): @log_function_entry_exit() def analyse_contents(self) -> str: """Analyse the contents of the payload and return a summary in a ready-to-use HTML format.""" - # TODO: implement - return "TO BE DONE" + tar = tarfile.open(self.payload_object.local_file_path, 'r') + members = tar.getmembers() + tar_num_members = len(members) + paths = sorted([m.path for m in members]) + + if tar_num_members < 100: + tar_members_desc = "Full listing of the contents of the tarball:" + members_list = paths + + else: + tar_members_desc = "Summarized overview of the contents of the tarball:" + # determine prefix after filtering out '/init' subdirectory, + # to get actual prefix for specific CPU target (like '2023.06/software/linux/aarch64/neoverse_v1') + init_subdir = os.path.join('*', 'init') + non_init_paths = sorted( + [path for path in paths if not any(parent.match(init_subdir) for parent in PurePosixPath(path).parents)] + ) + if non_init_paths: + prefix = os.path.commonprefix(non_init_paths) + else: + prefix = os.path.commonprefix(paths) + + # TODO: this only works for software tarballs, how to handle compat layer tarballs? + swdirs = [ # all directory names with the pattern: /software// + member.path + for member in members + if member.isdir() and PurePosixPath(member.path).match(os.path.join(prefix, 'software', '*', '*')) + ] + modfiles = [ # all filenames with the pattern: /modules///*.lua + member.path + for member in members + if member.isfile() and + PurePosixPath(member.path).match(os.path.join(prefix, 'modules', '*', '*', '*.lua')) + ] + other = [ # anything that is not in /software nor /modules + member.path + for member in members + if (not PurePosixPath(prefix).joinpath('software') in PurePosixPath(member.path).parents + and not PurePosixPath(prefix).joinpath('modules') in PurePosixPath(member.path).parents) + # if not fnmatch.fnmatch(m.path, os.path.join(prefix, 'software', '*')) + # and not fnmatch.fnmatch(m.path, os.path.join(prefix, 'modules', '*')) + ] + members_list = sorted(swdirs + modfiles + other) + + # Construct the overview. + tar_members = '\n'.join(members_list) + overview = f"Total number of items in the tarball: {tar_num_members}" + overview += f"\nURL to the tarball: {self.url}" + overview += f"\n{tar_members_desc}\n" + overview += f"```\n{tar_members}\n```" + + # Make sure that the overview does not exceed Github's maximum length (65536 characters). 
+ if len(overview) > 60000: + overview = overview[:60000] + "\n\nWARNING: output exceeded the maximum length and was truncated!\n```" + return overview @log_function_entry_exit() def __str__(self) -> str: From eb3c1b163001955ae2436ab72ef114edee440097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 04:58:06 +0200 Subject: [PATCH 159/218] add function to obtain URL for remote file --- scripts/automated_ingestion/eessi_data_object.py | 5 +++++ scripts/automated_ingestion/eessi_task_payload.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_data_object.py b/scripts/automated_ingestion/eessi_data_object.py index 6e8189fe..c7adc05b 100644 --- a/scripts/automated_ingestion/eessi_data_object.py +++ b/scripts/automated_ingestion/eessi_data_object.py @@ -324,6 +324,11 @@ def download(self, mode: DownloadMode = DownloadMode.CHECK_REMOTE) -> bool: log_message(LoggingScope.ERROR, 'ERROR', "Failed to download %s: %s", self.remote_file_path, str(err)) raise + @log_function_entry_exit() + def get_url(self) -> str: + """Get the URL of the data file.""" + return f"https://{self.remote_client.bucket}.s3.amazonaws.com/{self.remote_file_path}" + def __str__(self) -> str: """Return a string representation of the EESSI data and signature object.""" return f"EESSIDataAndSignatureObject({self.remote_file_path})" diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 9a643157..3729c846 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -88,7 +88,7 @@ def analyse_contents(self) -> str: # Construct the overview. tar_members = '\n'.join(members_list) overview = f"Total number of items in the tarball: {tar_num_members}" - overview += f"\nURL to the tarball: {self.url}" + overview += f"\nURL to the tarball: {self.payload_object.get_url()}" overview += f"\n{tar_members_desc}\n" overview += f"```\n{tar_members}\n```" From fe4164659ac5a4a85e9b0a289ad951ba13c5013d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:03:47 +0200 Subject: [PATCH 160/218] fix var name in template --- scripts/automated_ingestion/automated_ingestion.cfg.example | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/automated_ingestion.cfg.example b/scripts/automated_ingestion/automated_ingestion.cfg.example index 98ab1e79..18009d88 100644 --- a/scripts/automated_ingestion/automated_ingestion.cfg.example +++ b/scripts/automated_ingestion/automated_ingestion.cfg.example @@ -114,9 +114,9 @@ grouped_pr_body = A group of tarballs has been staged for {pr_url}. 
{metadata} -# Template for individual tarball PRs +# Template for payload overview task_summary_payload_template = - {tar_overview} + {payload_overview} [slack] ingestion_notification = yes ingestion_message = Tarball `{tarball}` has been ingested into the CVMFS repository `{cvmfs_repo}`. From e3e10e31c9185adcae18f2d600f4f02f13e9d379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:09:32 +0200 Subject: [PATCH 161/218] create task summary file in staging PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/automated_ingestion/eessi_task.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 51dbf2e8..92d79d4d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -990,8 +990,10 @@ def _create_task_summary(self) -> str: # create HTML file with task summary in REPO-PR-SEQ directory # TODO: add failure handling (capture result and act on it) - # self._safe_create_file(task_summary_file_path, f"create task summary for {task_file_name}", - # task_summary, branch_name=feature_branch_name) + commit_message = f"create summary for {task_file_name} in {feature_branch_name}" + self._safe_create_file(task_summary_file_path, commit_message, task_summary, + branch_name=feature_branch_name) + log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file created: %s", task_summary_file_path) From aa2461214422759367b7df633d153d94054cf237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:25:02 +0200 Subject: [PATCH 162/218] first step to create PR contents overview --- scripts/automated_ingestion/eessi_task.py | 25 +++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 92d79d4d..81f0b253 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1009,11 +1009,22 @@ def _create_pr_contents_overview(self) -> str: directories = self._list_directory_contents(pr_dir, feature_branch_name) print(f"target_dir: {target_dir}") print(f"pr_dir: {pr_dir}") - for directory in directories: - print(f"directory: {directory}") - # tarball_contents = self.description.task_object.get_contents_overview() + contents_overview = "" + if directories: + contents_overview += "
    \n" + for directory in directories: + task_summary_file_path = f"{directory}/TaskSummary.html" + if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): + task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + contents_overview += f"
  • {task_summary.decoded_content}
  • \n" + else: + contents_overview += f"
  • Task summary file not found: {task_summary_file_path}
  • \n" + contents_overview += "
\n" + else: + contents_overview += "No tasks found in this PR\n" - return "TO BE DONE" + print(f"contents_overview: {contents_overview}") + return contents_overview @log_function_entry_exit() def _create_pull_request(self, feature_branch_name: str, default_branch_name: str): @@ -1037,14 +1048,14 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st seq_num=seq_num, ) self._create_task_summary() - # contents_overview = self._create_pr_contents_overview() + contents_overview = self._create_pr_contents_overview() pr_body = pr_body_format.format( cvmfs_repo=self.cvmfs_repo, pr=pr_number, pr_url=pr_url, repo=repo_name, seq_num=seq_num, - contents="TO BE DONE", + contents=contents_overview, analysis="TO BE DONE", action="TO BE DONE", ) @@ -1130,6 +1141,8 @@ def _handle_add_pull_request(self): # Implementation for adding in PULL_REQUEST state task_summary = self._create_task_summary() log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) + contents_overview = self._create_pr_contents_overview() + log_message(LoggingScope.TASK_OPS, 'INFO', "PR contents overview: %s", contents_overview) return TaskState.PULL_REQUEST @log_function_entry_exit() From 6a4f62c0c88c295d1ca9fc4f70e4d802af67fd5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:29:34 +0200 Subject: [PATCH 163/218] use name of directory --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 81f0b253..df45dfe7 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1013,7 +1013,7 @@ def _create_pr_contents_overview(self) -> str: if directories: contents_overview += "
    \n" for directory in directories: - task_summary_file_path = f"{directory}/TaskSummary.html" + task_summary_file_path = f"{directory.name}/TaskSummary.html" if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) contents_overview += f"
  • {task_summary.decoded_content}
  • \n" From 1d7b70717feb09e96b141791c3c29c51af13ae2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:32:29 +0200 Subject: [PATCH 164/218] add PR dir component --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index df45dfe7..ba994ac2 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1013,7 +1013,7 @@ def _create_pr_contents_overview(self) -> str: if directories: contents_overview += "
      \n" for directory in directories: - task_summary_file_path = f"{directory.name}/TaskSummary.html" + task_summary_file_path = f"{pr_dir}/{directory.name}/TaskSummary.html" if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) contents_overview += f"
    • {task_summary.decoded_content}
    • \n" From 5e4f10770370182b68a9f94a50d98a8a4734d79a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:39:44 +0200 Subject: [PATCH 165/218] note TODO --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ba994ac2..8b83498a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1073,7 +1073,7 @@ def _update_pull_request(self, pull_request: PullRequest, feature_branch_name: s # TODO: update sections (contents analysis, action) # for now, function just logs a message log_message(LoggingScope.TASK_OPS, 'INFO', - "updating pull request %s for branch %s", pull_request, feature_branch_name) + "TODO: updating pull request %s for branch %s", pull_request, feature_branch_name) @log_function_entry_exit() def _handle_add_payload_staged(self): From 71f703c6b8f1c5a33bd59ab7ad31aee06064dc42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 05:44:36 +0200 Subject: [PATCH 166/218] bump sequence number to 1 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8b83498a..b3f61b06 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 0 + return 1 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 02c01d080612974cf4d1e202bb108c7503f80fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 19:54:17 +0200 Subject: [PATCH 167/218] tweak formatting of bundling PR content --- scripts/automated_ingestion/eessi_task.py | 15 ++++++++------- scripts/automated_ingestion/eessi_task_payload.py | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b3f61b06..6781f616 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -971,21 +971,22 @@ def _create_task_summary(self) -> str: if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) - return task_summary.decoded_content + # return task_summary.decoded_content + return task_summary # create task summary payload_name = self.description.metadata['payload']['filename'] payload_summary = self.payload.analyse_contents() metadata_contents = self.description.get_contents() - task_summary = f"
<details>\n<summary>{payload_name}</summary>\n<ul>\n" - task_summary += "<li>Metadata\n" - task_summary += f"<pre>{metadata_contents}</pre>\n</li>\n" - task_summary += "<li>Overview of payload contents\n" + task_summary = f"<details>\n<summary>{payload_name}</summary>\n\n" + task_summary += "<details>\n<summary>Metadata</summary>\n\n" + task_summary += f"```\n{metadata_contents}\n```\n</details>\n" + task_summary += "<details>\n<summary>Overview of payload contents</summary>\n\n" task_summary += self.config['github']['task_summary_payload_template'].format( payload_overview=payload_summary, ) - task_summary += "</li>\n" - task_summary += "</ul>\n" + task_summary += "</details>
      \n" + task_summary += "\n" task_summary += "\n" # create HTML file with task summary in REPO-PR-SEQ directory diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 3729c846..65bb2b9e 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -89,8 +89,8 @@ def analyse_contents(self) -> str: tar_members = '\n'.join(members_list) overview = f"Total number of items in the tarball: {tar_num_members}" overview += f"\nURL to the tarball: {self.payload_object.get_url()}" - overview += f"\n{tar_members_desc}\n" - overview += f"```\n{tar_members}\n```" + overview += f"\n{tar_members_desc}\n\n" + overview += f"```\n{tar_members}\n```\n" # Make sure that the overview does not exceed Github's maximum length (65536 characters). if len(overview) > 60000: From e508148fb34312231ff71c951f48bcd6fce9da44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 20:02:13 +0200 Subject: [PATCH 168/218] bump sequence number to 2 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 6781f616..b552be7f 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 1 + return 2 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 5c5475496850a2c85ab9c55a01c30f6d59e5e1a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 20:12:31 +0200 Subject: [PATCH 169/218] alternative for creating PR body, bumping seq to 3 --- scripts/automated_ingestion/eessi_task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b552be7f..b4f48602 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 2 + return 3 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: @@ -1057,7 +1057,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, contents=contents_overview, - analysis="TO BE DONE", + analysis=str(contents_overview), action="TO BE DONE", ) pr = self.git_repo.create_pull( From 53670df5947e4f4c9b2c48275b2ec75511152ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 20:24:13 +0200 Subject: [PATCH 170/218] decode file contents from GitHub and bump sequence number --- scripts/automated_ingestion/eessi_task.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index b4f48602..7cb4b988 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 3 + return 4 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: @@ -1008,19 +1008,18 @@ def _create_pr_contents_overview(self) -> str: target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) pr_dir = os.path.dirname(target_dir) directories = self._list_directory_contents(pr_dir, feature_branch_name) - print(f"target_dir: {target_dir}") - print(f"pr_dir: {pr_dir}") contents_overview = "" if directories: - contents_overview += "
        \n" + contents_overview += "\n" for directory in directories: task_summary_file_path = f"{pr_dir}/{directory.name}/TaskSummary.html" if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): - task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) - contents_overview += f"
      • {task_summary.decoded_content}
      • \n" + file_contents = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) + task_summary = base64.b64decode(file_contents).decode('utf-8') + contents_overview += f"{task_summary}\n" else: - contents_overview += f"
      • Task summary file not found: {task_summary_file_path}
      • \n" - contents_overview += "
      \n" + contents_overview += f"Task summary file not found: {task_summary_file_path}\n" + contents_overview += "\n" else: contents_overview += "No tasks found in this PR\n" @@ -1057,7 +1056,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, contents=contents_overview, - analysis=str(contents_overview), + analysis=contents_overview, action="TO BE DONE", ) pr = self.git_repo.create_pull( From f0fc09ff84998d0109a8fbc538c939f49dbc3927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 20:30:25 +0200 Subject: [PATCH 171/218] need to access .content --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7cb4b988..8226ab2c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1015,7 +1015,7 @@ def _create_pr_contents_overview(self) -> str: task_summary_file_path = f"{pr_dir}/{directory.name}/TaskSummary.html" if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): file_contents = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) - task_summary = base64.b64decode(file_contents).decode('utf-8') + task_summary = base64.b64decode(file_contents.content).decode('utf-8') contents_overview += f"{task_summary}\n" else: contents_overview += f"Task summary file not found: {task_summary_file_path}\n" From 0660fa1b1b5721d6365b61a9f4b64a37bb05fa55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 21:34:45 +0200 Subject: [PATCH 172/218] add function to return bucket URL --- scripts/automated_ingestion/s3_bucket.py | 31 ++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scripts/automated_ingestion/s3_bucket.py b/scripts/automated_ingestion/s3_bucket.py index ff62813f..79fed289 100644 --- a/scripts/automated_ingestion/s3_bucket.py +++ b/scripts/automated_ingestion/s3_bucket.py @@ -154,3 +154,34 @@ def download(self, remote_path: str, local_path: str) -> None: # Store the ETag self._write_etag(local_path, etag) + + @log_function_entry_exit() + def get_bucket_url(self) -> str: + """ + Get the HTTPS URL for a bucket from an initialized boto3 client. + Works with both AWS S3 and MinIO/S3-compatible services. + """ + try: + # Check if this is a custom endpoint (MinIO) or AWS S3 + endpoint_url = self.client.meta.endpoint_url + + if endpoint_url: + # Custom endpoint (MinIO, DigitalOcean Spaces, etc.) 
+ # Most S3-compatible services use path-style URLs + bucket_url = f"{endpoint_url}/{self.bucket}" + + else: + # AWS S3 (no custom endpoint specified) + region = self.client.meta.region_name or 'us-east-1' + + # AWS S3 virtual-hosted-style URLs + if region == 'us-east-1': + bucket_url = f"https://{self.bucket}.s3.amazonaws.com" + else: + bucket_url = f"https://{self.bucket}.s3.{region}.amazonaws.com" + + return bucket_url + + except Exception as err: + log_message(LoggingScope.ERROR, 'ERROR', "Error getting bucket URL: %s", str(err)) + return None From 26525891a814e4e753850c8f0008c1dca1778512 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 21:37:32 +0200 Subject: [PATCH 173/218] use method that returns bucket URL --- scripts/automated_ingestion/eessi_task_payload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 65bb2b9e..809ce152 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -88,7 +88,7 @@ def analyse_contents(self) -> str: # Construct the overview. tar_members = '\n'.join(members_list) overview = f"Total number of items in the tarball: {tar_num_members}" - overview += f"\nURL to the tarball: {self.payload_object.get_url()}" + overview += f"\nURL to the tarball: {self.payload_object.remote_client.get_bucket_url()}" overview += f"\n{tar_members_desc}\n\n" overview += f"```\n{tar_members}\n```\n" From 7b7bb63715ad6f3498a3e96a097c36bba0c74349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 21:41:09 +0200 Subject: [PATCH 174/218] bump sequence number to 5 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 8226ab2c..5fb13e3a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 4 + return 5 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 6eb7002c0bbe76f3457be261bc3f5bb5707712a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 22:02:22 +0200 Subject: [PATCH 175/218] add remote_file_path to bucket_url and update pull request --- scripts/automated_ingestion/eessi_task.py | 45 ++++++++++++++----- .../automated_ingestion/eessi_task_payload.py | 4 +- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5fb13e3a..a76523eb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1056,7 +1056,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo=repo_name, seq_num=seq_num, contents=contents_overview, - analysis=contents_overview, + analysis="TO BE DONE", action="TO BE DONE", ) pr = self.git_repo.create_pull( @@ -1068,12 +1068,35 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st log_message(LoggingScope.TASK_OPS, 'INFO', "PR created: %s", pr) @log_function_entry_exit() - def _update_pull_request(self, pull_request: PullRequest, feature_branch_name: str): - """Update the pull request""" + def _update_pull_request(self, pull_request: PullRequest): + """ + Update the pull request + + Args: + pull_request: instance of the pull request + """ # TODO: update sections (contents analysis, action) - # for now, function just logs a message - log_message(LoggingScope.TASK_OPS, 'INFO', - "TODO: updating pull request %s for branch %s", pull_request, feature_branch_name) + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" + seq_num = self._determine_sequence_number() + + self._create_task_summary() + contents_overview = self._create_pr_contents_overview() + pr_body_format = self.config['github']['grouped_pr_body'] + pr_body = pr_body_format.format( + cvmfs_repo=self.cvmfs_repo, + pr=pr_number, + pr_url=pr_url, + repo=repo_name, + seq_num=seq_num, + contents=contents_overview, + analysis="TO BE DONE", + action="TO BE DONE", + ) + pull_request.edit(body=pr_body) + + log_message(LoggingScope.TASK_OPS, 'INFO', "PR updated: %s", pull_request) @log_function_entry_exit() def _handle_add_payload_staged(self): @@ -1130,7 +1153,7 @@ def _handle_add_payload_staged(self): self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) # TODO: add failure handling (capture result and act on it) - self._update_pull_request(pull_request, feature_branch_name) + self._update_pull_request(pull_request) return TaskState.PULL_REQUEST @@ -1139,10 +1162,10 @@ def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state - task_summary = self._create_task_summary() - log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) - contents_overview = self._create_pr_contents_overview() - log_message(LoggingScope.TASK_OPS, 'INFO', "PR contents overview: %s", contents_overview) + # task_summary = self._create_task_summary() + # log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) + # contents_overview = self._create_pr_contents_overview() + # log_message(LoggingScope.TASK_OPS, 'INFO', "PR 
contents overview: %s", contents_overview) return TaskState.PULL_REQUEST @log_function_entry_exit() diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 809ce152..c8f82df2 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -88,7 +88,9 @@ def analyse_contents(self) -> str: # Construct the overview. tar_members = '\n'.join(members_list) overview = f"Total number of items in the tarball: {tar_num_members}" - overview += f"\nURL to the tarball: {self.payload_object.remote_client.get_bucket_url()}" + bucket_url = self.payload_object.remote_client.get_bucket_url() + remote_file_path = self.payload_object.remote_file_path + overview += f"\nURL to the tarball: {bucket_url}/{remote_file_path}" overview += f"\n{tar_members_desc}\n\n" overview += f"```\n{tar_members}\n```\n" From bd090600db78c2feeb10e7dc7938b9394cd64992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Mon, 16 Jun 2025 22:29:13 +0200 Subject: [PATCH 176/218] bump sequence number to 6 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a76523eb..88b54bdc 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -261,7 +261,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 5 + return 6 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From c2b8513061e61852875504ea05a69990a7c9cd3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Tue, 17 Jun 2025 22:14:46 +0200 Subject: [PATCH 177/218] implement first version of handler for PULL_REQUEST state --- scripts/automated_ingestion/eessi_task.py | 88 +++++++++++++++++++++-- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 88b54bdc..65387ab1 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -730,7 +730,6 @@ def _handle_add_undetermined(self): # create task file in target directory (TARGET_DIR/TaskDescription) # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) # create pointer file from task file path to target directory (remote_file_path -> TARGET_DIR) - branch_name = self.git_repo.default_branch repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR @@ -755,6 +754,7 @@ def _handle_add_undetermined(self): } } + branch_name = self.git_repo.default_branch try: commit = self._create_multi_file_commit( files_to_commit, @@ -1150,6 +1150,7 @@ def _handle_add_payload_staged(self): log_message(LoggingScope.TASK_OPS, 'INFO', "PR %s is open, updating task states", pull_request) # TODO: add failure handling (capture result and act on it) + # THINK about what a failure would mean and what to do about it. 
self._update_task_states(next_state, default_branch_name, approved_state, feature_branch_name) # TODO: add failure handling (capture result and act on it) @@ -1162,6 +1163,72 @@ def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" print("Handling ADD action in PULL_REQUEST state") # Implementation for adding in PULL_REQUEST state + # we got here because the state of the task is PULL_REQUEST in the default branch + # determine branch and PR and state of PR + # PR is open --> just return TaskState.PULL_REQUEST + # PR is closed & merged --> deployment is approved + # PR is closed & not merged --> deployment is rejected + sequence_number = self._determine_sequence_number() + feature_branch_name = self._determine_feature_branch_name(sequence_number) + # TODO: check if feature branch exists, for now ASSUME it does + pull_request = self._find_pr_for_branch(feature_branch_name) + if pull_request: + log_message(LoggingScope.TASK_OPS, 'INFO', + "found PR for branch %s: %s", feature_branch_name, pull_request) + if pull_request.state == 'closed': + if pull_request.merged: + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is closed and merged, returning APPROVED state", pull_request) + # TODO: How could we ended up here? state in default branch is PULL_REQUEST but + # PR is merged, hence it should have been in the APPROVED state + # ==> for now, just return TaskState.PULL_REQUEST + # + # there is the possibility that the PR was updated just before the + # PR was merged + # WHY is it a problem? because a task may have been accepted that wouldn't + # have been accepted or worse shouldn't been accepted + # WHAT to do? ACCEPT/IGNORE THE ISSUE FOR NOw + # HOWEVER, the contents of the PR directory may be inconsistent with + # respect to the TaskState file and missing TaskSummary.html file + # WE could create an issue and only return TaskState.APPROVED if the + # issue is closed + # WE could also defer all handling of this to the handler for the + # APPROVED state + # NOPE, we have to do some handling here, at least for the tasks where their + # state file did + # --> check if we could have ended up here? If so, create an issue. + # Do we need a state ISSUE_OPENED to avoid processing the task again? + return TaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is closed and not merged, returning REJECTED state", pull_request) + # TODO: there is the possibility that the PR was updated just before the + # PR was closed + # WHY is it a problem? because a task may have been rejected that wouldn't + # have been rejected or worse shouldn't been rejected + # WHAT to do? 
ACCEPT/IGNORE THE ISSUE FOR NOw + # HOWEVER, the contents of the PR directory may be inconsistent with + # respect to the TaskState file and missing TaskSummary.html file + # WE could create an issue and only return TaskState.REJECTED if the + # issue is closed + # WE could also defer all handling of this to the handler for the + # REJECTED state + # FOR NOW, we assume that the task was rejected on purpose + # we need to change the state of the task in the default branch to REJECTED + self._update_task_state_file(TaskState.REJECTED) + return TaskState.REJECTED + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + "PR %s is open, returning PULL_REQUEST state", pull_request) + return TaskState.PULL_REQUEST + else: + log_message(LoggingScope.TASK_OPS, 'INFO', + "no PR found for branch %s", feature_branch_name) + # the method was called because the state of the task is PULL_REQUEST in the default branch + # however, it's weird that the PR was not found for the feature branch + # TODO: may create or update an issue for the task or deployment + return TaskState.PULL_REQUEST + # task_summary = self._create_task_summary() # log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) # contents_overview = self._create_pr_contents_overview() @@ -1171,16 +1238,27 @@ def _handle_add_pull_request(self): @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" - print("Handling ADD action in APPROVED state") + print("Handling ADD action in APPROVED state: %s", self.description.get_task_file_name()) # Implementation for adding in APPROVED state - return True + # TODO: essentially, run the ingest function + # TODO: change state in default branch to INGESTED + return TaskState.INGESTED @log_function_entry_exit() def _handle_add_ingested(self): """Handler for ADD action in INGESTED state""" - print("Handling ADD action in INGESTED state") + print("Handling ADD action in INGESTED state: %s", self.description.get_task_file_name()) # Implementation for adding in INGESTED state - return True + # TODO: change state in default branch to DONE + return TaskState.DONE + + @log_function_entry_exit() + def _handle_add_rejected(self): + """Handler for ADD action in REJECTED state""" + print("Handling ADD action in REJECTED state: %s", self.description.get_task_file_name()) + # Implementation for adding in REJECTED state + # TODO: change state in default branch to DONE + return TaskState.DONE @log_function_entry_exit() def transition_to(self, new_state: TaskState): From 04e3e7069b5f3f9717ce0edabf7e7a307b7ef2b5 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 14:10:50 +0200 Subject: [PATCH 178/218] use different function to determine feature branch --- scripts/automated_ingestion/eessi_task.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 65387ab1..e124a9ed 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1168,8 +1168,7 @@ def _handle_add_pull_request(self): # PR is open --> just return TaskState.PULL_REQUEST # PR is closed & merged --> deployment is approved # PR is closed & not merged --> deployment is rejected - sequence_number = self._determine_sequence_number() - feature_branch_name = self._determine_feature_branch_name(sequence_number) + feature_branch_name = self._determine_feature_branch_name() # TODO: check if feature branch exists, for now ASSUME it 
does pull_request = self._find_pr_for_branch(feature_branch_name) if pull_request: @@ -1229,10 +1228,6 @@ def _handle_add_pull_request(self): # TODO: may create or update an issue for the task or deployment return TaskState.PULL_REQUEST - # task_summary = self._create_task_summary() - # log_message(LoggingScope.TASK_OPS, 'INFO', "task summary: %s", task_summary) - # contents_overview = self._create_pr_contents_overview() - # log_message(LoggingScope.TASK_OPS, 'INFO', "PR contents overview: %s", contents_overview) return TaskState.PULL_REQUEST @log_function_entry_exit() From 3a45f93bfb92da32b8183e3ab1e5bca303e7a6e7 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 14:24:26 +0200 Subject: [PATCH 179/218] fixing print statements in handlers --- scripts/automated_ingestion/eessi_task.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index e124a9ed..1807a5df 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -725,7 +725,7 @@ def _update_file(self, file_path, new_content, commit_message, branch_name: str @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" - print("Handling ADD action in UNDETERMINED state") + print("Handling ADD action in UNDETERMINED state: %s" % self.description.get_task_file_name()) # create target directory (REPO/PR/SEQ/TASK_FILE_NAME/) # create task file in target directory (TARGET_DIR/TaskDescription) # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) @@ -816,7 +816,7 @@ def _init_payload_object(self): @log_function_entry_exit() def _handle_add_new_task(self): """Handler for ADD action in NEW_TASK state""" - print("Handling ADD action in NEW_TASK state") + print("Handling ADD action in NEW_TASK state: %s" % self.description.get_task_file_name()) # determine next state next_state = self._next_state(TaskState.NEW_TASK) log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s", next_state) @@ -1101,7 +1101,7 @@ def _update_pull_request(self, pull_request: PullRequest): @log_function_entry_exit() def _handle_add_payload_staged(self): """Handler for ADD action in PAYLOAD_STAGED state""" - print("Handling ADD action in PAYLOAD_STAGED state") + print("Handling ADD action in PAYLOAD_STAGED state: %s" % self.description.get_task_file_name()) next_state = self._next_state(TaskState.PAYLOAD_STAGED) approved_state = TaskState.APPROVED log_message(LoggingScope.TASK_OPS, 'INFO', "next_state: %s, approved_state: %s", next_state, approved_state) @@ -1161,7 +1161,7 @@ def _handle_add_payload_staged(self): @log_function_entry_exit() def _handle_add_pull_request(self): """Handler for ADD action in PULL_REQUEST state""" - print("Handling ADD action in PULL_REQUEST state") + print("Handling ADD action in PULL_REQUEST state: %s" % self.description.get_task_file_name()) # Implementation for adding in PULL_REQUEST state # we got here because the state of the task is PULL_REQUEST in the default branch # determine branch and PR and state of PR @@ -1233,7 +1233,7 @@ def _handle_add_pull_request(self): @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" - print("Handling ADD action in APPROVED state: %s", self.description.get_task_file_name()) + print("Handling ADD action in APPROVED state: %s" % self.description.get_task_file_name()) # Implementation for adding in 
APPROVED state # TODO: essentially, run the ingest function # TODO: change state in default branch to INGESTED @@ -1242,7 +1242,7 @@ def _handle_add_approved(self): @log_function_entry_exit() def _handle_add_ingested(self): """Handler for ADD action in INGESTED state""" - print("Handling ADD action in INGESTED state: %s", self.description.get_task_file_name()) + print("Handling ADD action in INGESTED state: %s" % self.description.get_task_file_name()) # Implementation for adding in INGESTED state # TODO: change state in default branch to DONE return TaskState.DONE @@ -1250,7 +1250,7 @@ def _handle_add_ingested(self): @log_function_entry_exit() def _handle_add_rejected(self): """Handler for ADD action in REJECTED state""" - print("Handling ADD action in REJECTED state: %s", self.description.get_task_file_name()) + print("Handling ADD action in REJECTED state: %s" % self.description.get_task_file_name()) # Implementation for adding in REJECTED state # TODO: change state in default branch to DONE return TaskState.DONE From 569de25bf780fe1b3db65284d9667810f7e1a216 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 14:34:12 +0200 Subject: [PATCH 180/218] add more logging to main loop --- scripts/automated_ingestion/automated_ingestion.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/automated_ingestion.py b/scripts/automated_ingestion/automated_ingestion.py index 46875b33..974f8497 100755 --- a/scripts/automated_ingestion/automated_ingestion.py +++ b/scripts/automated_ingestion/automated_ingestion.py @@ -273,9 +273,12 @@ def main(): current_state != TaskState.DONE and previous_state != current_state): previous_state = current_state + log_message(LoggingScope.GROUP_OPS, 'INFO', + "Task '%s': BEFORE handle(): previous state = '%s', current state = '%s'", + task_path, previous_state.name, current_state.name) current_state = task.handle() log_message(LoggingScope.GROUP_OPS, 'INFO', - "Task '%s': previous state = '%s', current state = '%s'", + "Task '%s': AFTER handle(): previous state = '%s', current state = '%s'", task_path, previous_state.name, current_state.name) # # TODO: update the information shown below (what makes sense to show?) From 8ad5ceeb7cb0a234357edc6ae71363a6144f07d2 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 15:08:22 +0200 Subject: [PATCH 181/218] implement handler for approved state --- scripts/automated_ingestion/eessi_task.py | 55 +++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 1807a5df..77dcd2eb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1230,12 +1230,67 @@ def _handle_add_pull_request(self): return TaskState.PULL_REQUEST + @log_function_entry_exit() + def _perform_task_action(self): + """Perform the task action""" + # TODO: support other actions than ADD + if self.action == EESSITaskAction.ADD: + self._perform_task_add() + else: + raise ValueError(f"Task action '{self.action}' not supported (yet)") + + @log_function_entry_exit() + def _perform_task_add(self): + """Perform the ADD task action""" + # TODO: verify checksum here or before? 
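+ # a minimal sketch of what the checksum verification mentioned above could look like, assuming the metadata provides the expected sha256 digest (expected_checksum) and the path of the staged payload (local_tarball_path); both names are placeholders, hashlib is standard library: + # import hashlib + # with open(local_tarball_path, 'rb') as payload_file: + #     actual_checksum = hashlib.sha256(payload_file.read()).hexdigest() + # if actual_checksum != expected_checksum: + #     log_message(LoggingScope.STATE_OPS, 'ERROR', 'checksum mismatch for %s', local_tarball_path) + #     return False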
+ script = self.config['paths']['ingestion_script'] + sudo = ['sudo'] if self.config['cvmfs'].getboolean('ingest_as_root', True) else [] + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Running the ingestion script for %s...\n with script: %s\n with sudo: %s', + self.description.get_task_file_name(), + script, 'no' if sudo == [] else 'yes') + # ingest_cmd = subprocess.run( + # sudo + [script, self.cvmfs_repo, self.local_path], + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + # if ingest_cmd.returncode == 0: + if False: + next_state = self._next_state(self.state) + self._move_metadata_file(self.state, next_state) + if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): + # send_slack_message( + # self.config['secrets']['slack_webhook'], + # self.config['slack']['ingestion_message'].format( + # tarball=os.path.basename(self.payload.local_path), + # cvmfs_repo=self.cvmfs_repo) + # ) + pass + else: + issue_title = f'Failed to add {os.path.basename(self.payload.local_path)}' + # issue_body = self.config['github']['failed_ingestion_issue_body'].format( + # command=' '.join(ingest_cmd.args), + # tarball=os.path.basename(self.payload.local_path), + # return_code=ingest_cmd.returncode, + # stdout=ingest_cmd.stdout.decode('UTF-8'), + # stderr=ingest_cmd.stderr.decode('UTF-8'), + # ) + if self.issue_exists(issue_title, state='open'): + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Failed to add %s, but an open issue already exists, skipping...', + os.path.basename(self.payload.local_path)) + else: + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Failed to add %s, but an open issue does not exist, creating one...', + os.path.basename(self.payload.local_path)) + # TODO: self.git_repo.create_issue(title=issue_title, body=issue_body) + @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" print("Handling ADD action in APPROVED state: %s" % self.description.get_task_file_name()) # Implementation for adding in APPROVED state # TODO: essentially, run the ingest function + self._perform_task_action() # TODO: change state in default branch to INGESTED return TaskState.INGESTED From 623b8c306db1efaec13cd102dc0d0a7c041985ec Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 15:13:43 +0200 Subject: [PATCH 182/218] fix access to payload file name --- scripts/automated_ingestion/eessi_task.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 77dcd2eb..777456ba 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1250,7 +1250,7 @@ def _perform_task_add(self): self.description.get_task_file_name(), script, 'no' if sudo == [] else 'yes') # ingest_cmd = subprocess.run( - # sudo + [script, self.cvmfs_repo, self.local_path], + # sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], # stdout=subprocess.PIPE, # stderr=subprocess.PIPE) # if ingest_cmd.returncode == 0: @@ -1261,15 +1261,15 @@ def _perform_task_add(self): # send_slack_message( # self.config['secrets']['slack_webhook'], # self.config['slack']['ingestion_message'].format( - # tarball=os.path.basename(self.payload.local_path), + # tarball=os.path.basename(self.payload.payload_object.local_file_path), # cvmfs_repo=self.cvmfs_repo) # ) pass else: - issue_title = f'Failed to add {os.path.basename(self.payload.local_path)}' + issue_title = 
f'Failed to add {os.path.basename(self.payload.payload_object.local_file_path)}' # issue_body = self.config['github']['failed_ingestion_issue_body'].format( # command=' '.join(ingest_cmd.args), - # tarball=os.path.basename(self.payload.local_path), + # tarball=os.path.basename(self.payload.payload_object.local_file_path), # return_code=ingest_cmd.returncode, # stdout=ingest_cmd.stdout.decode('UTF-8'), # stderr=ingest_cmd.stderr.decode('UTF-8'), @@ -1277,11 +1277,11 @@ def _perform_task_add(self): if self.issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', - os.path.basename(self.payload.local_path)) + os.path.basename(self.payload.payload_object.local_file_path)) else: log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue does not exist, creating one...', - os.path.basename(self.payload.local_path)) + os.path.basename(self.payload.payload_object.local_file_path)) # TODO: self.git_repo.create_issue(title=issue_title, body=issue_body) @log_function_entry_exit() From 1cd5ebd56a77ce2a346bb18034ac5424f6e6c315 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 18 Jun 2025 21:29:08 +0200 Subject: [PATCH 183/218] add method _issue_exists --- scripts/automated_ingestion/eessi_task.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 777456ba..5b57517c 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1239,6 +1239,18 @@ def _perform_task_action(self): else: raise ValueError(f"Task action '{self.action}' not supported (yet)") + @log_function_entry_exit() + def _issue_exists(self, title: str, state: str = 'open') -> bool: + """ + Check if an issue with the given title and state already exists. 
+ """ + issues = self.git_repo.get_issues(state=state) + for issue in issues: + if issue.title == title and issue.state == state: + return True + else: + return False + @log_function_entry_exit() def _perform_task_add(self): """Perform the ADD task action""" @@ -1274,7 +1286,7 @@ def _perform_task_add(self): # stdout=ingest_cmd.stdout.decode('UTF-8'), # stderr=ingest_cmd.stderr.decode('UTF-8'), # ) - if self.issue_exists(issue_title, state='open'): + if self._issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', os.path.basename(self.payload.payload_object.local_file_path)) From c30abfcbe419a68b2c764e4825095b81524ea145 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 19 Jun 2025 00:27:33 +0200 Subject: [PATCH 184/218] updates to ingestion incl state handling, error handling --- scripts/automated_ingestion/eessi_task.py | 83 +++++++++++++---------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5b57517c..40d4d151 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -2,15 +2,16 @@ from typing import Dict, List, Tuple, Optional from functools import total_ordering +import base64 import os +import subprocess import traceback -import base64 from eessi_data_object import EESSIDataAndSignatureObject from eessi_task_action import EESSITaskAction from eessi_task_description import EESSITaskDescription from eessi_task_payload import EESSITaskPayload -from utils import log_message, LoggingScope, log_function_entry_exit +from utils import send_slack_message, log_message, LoggingScope, log_function_entry_exit from github import Github, GithubException, InputGitTreeElement, UnknownObjectException from github.PullRequest import PullRequest @@ -82,9 +83,9 @@ def __init__(self, description: EESSITaskDescription, config: Dict, cvmfs_repo: TaskState.PAYLOAD_STAGED: [TaskState.PULL_REQUEST], TaskState.PULL_REQUEST: [TaskState.APPROVED, TaskState.REJECTED], TaskState.APPROVED: [TaskState.INGESTED], - TaskState.REJECTED: [TaskState.DONE], - TaskState.INGESTED: [TaskState.DONE], - TaskState.DONE: [] # Terminal state + TaskState.REJECTED: [], # terminal state + TaskState.INGESTED: [], # terminal state + TaskState.DONE: [] # virtual terminal state, not used to write on GitHub } self.payload = None @@ -1231,11 +1232,11 @@ def _handle_add_pull_request(self): return TaskState.PULL_REQUEST @log_function_entry_exit() - def _perform_task_action(self): + def _perform_task_action(self) -> bool: """Perform the task action""" # TODO: support other actions than ADD if self.action == EESSITaskAction.ADD: - self._perform_task_add() + return self._perform_task_add() else: raise ValueError(f"Task action '{self.action}' not supported (yet)") @@ -1252,7 +1253,7 @@ def _issue_exists(self, title: str, state: str = 'open') -> bool: return False @log_function_entry_exit() - def _perform_task_add(self): + def _perform_task_add(self) -> bool: """Perform the ADD task action""" # TODO: verify checksum here or before? 
script = self.config['paths']['ingestion_script'] @@ -1261,31 +1262,31 @@ def _perform_task_add(self): 'Running the ingestion script for %s...\n with script: %s\n with sudo: %s', self.description.get_task_file_name(), script, 'no' if sudo == [] else 'yes') - # ingest_cmd = subprocess.run( - # sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - # if ingest_cmd.returncode == 0: - if False: + ingest_cmd = subprocess.run( + sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + # TODO: if ingest_cmd.returncode == 0: + if True: next_state = self._next_state(self.state) - self._move_metadata_file(self.state, next_state) + self._update_task_state_file(next_state) if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): - # send_slack_message( - # self.config['secrets']['slack_webhook'], - # self.config['slack']['ingestion_message'].format( - # tarball=os.path.basename(self.payload.payload_object.local_file_path), - # cvmfs_repo=self.cvmfs_repo) - # ) - pass + send_slack_message( + self.config['secrets']['slack_webhook'], + self.config['slack']['ingestion_message'].format( + tarball=os.path.basename(self.payload.payload_object.local_file_path), + cvmfs_repo=self.cvmfs_repo) + ) + return True else: issue_title = f'Failed to add {os.path.basename(self.payload.payload_object.local_file_path)}' - # issue_body = self.config['github']['failed_ingestion_issue_body'].format( - # command=' '.join(ingest_cmd.args), - # tarball=os.path.basename(self.payload.payload_object.local_file_path), - # return_code=ingest_cmd.returncode, - # stdout=ingest_cmd.stdout.decode('UTF-8'), - # stderr=ingest_cmd.stderr.decode('UTF-8'), - # ) + issue_body = self.config['github']['failed_ingestion_issue_body'].format( + command=' '.join(ingest_cmd.args), + tarball=os.path.basename(self.payload.payload_object.local_file_path), + return_code=ingest_cmd.returncode, + stdout=ingest_cmd.stdout.decode('UTF-8'), + stderr=ingest_cmd.stderr.decode('UTF-8'), + ) if self._issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', @@ -1294,24 +1295,33 @@ def _perform_task_add(self): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue does not exist, creating one...', os.path.basename(self.payload.payload_object.local_file_path)) - # TODO: self.git_repo.create_issue(title=issue_title, body=issue_body) + self.git_repo.create_issue(title=issue_title, body=issue_body) + return False @log_function_entry_exit() def _handle_add_approved(self): """Handler for ADD action in APPROVED state""" print("Handling ADD action in APPROVED state: %s" % self.description.get_task_file_name()) # Implementation for adding in APPROVED state - # TODO: essentially, run the ingest function - self._perform_task_action() - # TODO: change state in default branch to INGESTED - return TaskState.INGESTED + # If successful, _perform_task_action() will change the state + # to INGESTED on GitHub + try: + if self._perform_task_action(): + return TaskState.INGESTED + else: + return TaskState.APPROVED + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', + "Error performing task action: %s", err) + return TaskState.APPROVED @log_function_entry_exit() def _handle_add_ingested(self): """Handler for ADD action in INGESTED state""" 
print("Handling ADD action in INGESTED state: %s" % self.description.get_task_file_name()) # Implementation for adding in INGESTED state - # TODO: change state in default branch to DONE + # DONT change state on GitHub, because the result + # (INGESTED/REJECTED) would be overwritten return TaskState.DONE @log_function_entry_exit() @@ -1319,7 +1329,8 @@ def _handle_add_rejected(self): """Handler for ADD action in REJECTED state""" print("Handling ADD action in REJECTED state: %s" % self.description.get_task_file_name()) # Implementation for adding in REJECTED state - # TODO: change state in default branch to DONE + # DONT change state on GitHub, because the result + # (INGESTED/REJECTED) would be overwritten return TaskState.DONE @log_function_entry_exit() From d9d2fc836b4da63afbd8e1afdb5ee59e4fb67d06 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 19 Jun 2025 00:53:39 +0200 Subject: [PATCH 185/218] fix using state and remove unused function transition_to --- scripts/automated_ingestion/eessi_task.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 40d4d151..99e581c4 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1268,7 +1268,7 @@ def _perform_task_add(self) -> bool: stderr=subprocess.PIPE) # TODO: if ingest_cmd.returncode == 0: if True: - next_state = self._next_state(self.state) + next_state = self._next_state(TaskState.APPROVED) self._update_task_state_file(next_state) if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): send_slack_message( @@ -1333,16 +1333,6 @@ def _handle_add_rejected(self): # (INGESTED/REJECTED) would be overwritten return TaskState.DONE - @log_function_entry_exit() - def transition_to(self, new_state: TaskState): - """ - Transition the task to a new state if valid. 
- """ - if new_state in self.valid_transitions[self.state]: - self.state = new_state - return True - return False - @log_function_entry_exit() def __str__(self): return f"EESSITask(description={self.description}, action={self.action}, state={self.determine_state()})" From 96050b601689664d7e321f66ea3b0d05cc303879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 11:39:27 +0200 Subject: [PATCH 186/218] enable code to run ingestion script --- scripts/automated_ingestion/eessi_task.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 99e581c4..23de8db7 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1266,8 +1266,7 @@ def _perform_task_add(self) -> bool: sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - # TODO: if ingest_cmd.returncode == 0: - if True: + if ingest_cmd.returncode == 0: next_state = self._next_state(TaskState.APPROVED) self._update_task_state_file(next_state) if self.config.has_section('slack') and self.config['slack'].getboolean('ingestion_notification', False): From 435b96fa4cac4bffe679878697db0f89a30c8307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:07:16 +0200 Subject: [PATCH 187/218] bump sequence number to 7 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 23de8db7..4607bbbe 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 6 + return 7 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 37149ee5575480cde763803a5c772f9f70c4f573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:18:06 +0200 Subject: [PATCH 188/218] log result and output of ingest script --- scripts/automated_ingestion/eessi_task.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 4607bbbe..2ea19e58 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1266,6 +1266,12 @@ def _perform_task_add(self) -> bool: sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Ingestion script returned code %s', ingest_cmd.returncode) + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Ingestion script stdout: %s', ingest_cmd.stdout.decode('UTF-8')) + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Ingestion script stderr: %s', ingest_cmd.stderr.decode('UTF-8')) if ingest_cmd.returncode == 0: next_state = self._next_state(TaskState.APPROVED) self._update_task_state_file(next_state) From c7ed07db3297cc606b9838098c2a6791535a5de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:19:09 +0200 Subject: [PATCH 189/218] bump sequence number to 8 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 2ea19e58..e13ebe3d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. """ - return 7 + return 8 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From d4528fef653485cbef13546c5eea18fe581542f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:24:47 +0200 Subject: [PATCH 190/218] bump sequence number to 9 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index e13ebe3d..ce43ba5d 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 8 + return 9 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 73a77ee1f2a13bef6dd968f171f65875f9c547a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:36:41 +0200 Subject: [PATCH 191/218] add logging for issue creation --- scripts/automated_ingestion/eessi_task.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index ce43ba5d..d97987ef 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1284,6 +1284,10 @@ def _perform_task_add(self) -> bool: ) return True else: + log_message(LoggingScope.STATE_OPS, 'ERROR', + 'Failed to add %s, return code %s', + os.path.basename(self.payload.payload_object.local_file_path), + ingest_cmd.returncode) issue_title = f'Failed to add {os.path.basename(self.payload.payload_object.local_file_path)}' issue_body = self.config['github']['failed_ingestion_issue_body'].format( command=' '.join(ingest_cmd.args), @@ -1292,6 +1296,9 @@ def _perform_task_add(self) -> bool: stdout=ingest_cmd.stdout.decode('UTF-8'), stderr=ingest_cmd.stderr.decode('UTF-8'), ) + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Creating issue for failed ingestion: title: %s, body: %s', + issue_title, issue_body) if self._issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', From adcb918d02a743bcd955a5a7298bad7c8a19f103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 12:37:06 +0200 Subject: [PATCH 192/218] bump sequence number to 10 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d97987ef..cc771044 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 9 + return 10 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From 98c04095568d1a49e78a2dd2f7978c29a0479710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 17:17:27 +0200 Subject: [PATCH 193/218] improve logging when processing ingestion failure --- scripts/automated_ingestion/eessi_task.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index cc771044..5ced5ad1 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1284,21 +1284,29 @@ def _perform_task_add(self) -> bool: ) return True else: + tarball = os.path.basename(self.payload.payload_object.local_file_path) log_message(LoggingScope.STATE_OPS, 'ERROR', 'Failed to add %s, return code %s', - os.path.basename(self.payload.payload_object.local_file_path), + tarball, ingest_cmd.returncode) - issue_title = f'Failed to add {os.path.basename(self.payload.payload_object.local_file_path)}' + + issue_title = f'Failed to add {tarball}' + log_message(LoggingScope.STATE_OPS, 'INFO', + 'Creating issue for failed ingestion: title: %s', + issue_title) + + command = ' '.join(ingest_cmd.args) issue_body = self.config['github']['failed_ingestion_issue_body'].format( - command=' '.join(ingest_cmd.args), - tarball=os.path.basename(self.payload.payload_object.local_file_path), + command=command, + tarball=tarball, return_code=ingest_cmd.returncode, stdout=ingest_cmd.stdout.decode('UTF-8'), - stderr=ingest_cmd.stderr.decode('UTF-8'), + stderr=ingest_cmd.stderr.decode('UTF-8') ) log_message(LoggingScope.STATE_OPS, 'INFO', - 'Creating issue for failed ingestion: title: %s, body: %s', - issue_title, issue_body) + 'Creating issue for failed ingestion: body: %s', + issue_body) + if self._issue_exists(issue_title, state='open'): log_message(LoggingScope.STATE_OPS, 'INFO', 'Failed to add %s, but an open issue already exists, skipping...', From da9cc94d3679ea2e9c59f24dec195a56d454a42f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 18:06:00 +0200 Subject: [PATCH 194/218] add traceback when catching exception --- scripts/automated_ingestion/eessi_task.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5ced5ad1..9eeb3446 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1292,11 +1292,12 @@ def _perform_task_add(self) -> bool: issue_title = f'Failed to add {tarball}' log_message(LoggingScope.STATE_OPS, 'INFO', - 'Creating issue for failed ingestion: title: %s', + "Creating issue for failed ingestion: title: '%s'", issue_title) command = ' '.join(ingest_cmd.args) - issue_body = self.config['github']['failed_ingestion_issue_body'].format( + failed_ingestion_issue_body = self.config['github']['failed_ingestion_issue_body'] + issue_body = failed_ingestion_issue_body.format( command=command, tarball=tarball, return_code=ingest_cmd.returncode, @@ -1304,7 +1305,7 @@ def _perform_task_add(self) -> bool: stderr=ingest_cmd.stderr.decode('UTF-8') ) log_message(LoggingScope.STATE_OPS, 'INFO', - 'Creating issue for failed ingestion: body: %s', + "Creating issue for failed ingestion: body: '%s'", issue_body) if self._issue_exists(issue_title, state='open'): @@ -1332,7 +1333,7 @@ def 
_handle_add_approved(self): return TaskState.APPROVED except Exception as err: log_message(LoggingScope.TASK_OPS, 'ERROR', - "Error performing task action: %s", err) + "Error performing task action: '%s'\nTraceback:\n%s", err, traceback.format_exc()) return TaskState.APPROVED @log_function_entry_exit() From 54062a37c6cde40d59b20b18d310e75b83b6572e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 18:15:21 +0200 Subject: [PATCH 195/218] convert Path to str --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 9eeb3446..11b7f055 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1263,7 +1263,7 @@ def _perform_task_add(self) -> bool: self.description.get_task_file_name(), script, 'no' if sudo == [] else 'yes') ingest_cmd = subprocess.run( - sudo + [script, self.cvmfs_repo, self.payload.payload_object.local_file_path], + sudo + [script, self.cvmfs_repo, str(self.payload.payload_object.local_file_path)], stdout=subprocess.PIPE, stderr=subprocess.PIPE) log_message(LoggingScope.STATE_OPS, 'INFO', From bb6021edb53fd1cf42c957ade72b3cc456bda7d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sat, 21 Jun 2025 20:14:26 +0200 Subject: [PATCH 196/218] bump sequence number to 11 --- scripts/automated_ingestion/eessi_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 11b7f055..7c21f1fe 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -262,7 +262,7 @@ def _get_fixed_sequence_number(self) -> int: """ Get a fixed sequence number. 
""" - return 10 + return 11 @log_function_entry_exit() def _determine_sequence_status(self, sequence_number: int = None) -> int: From de00cee1032ae903c6fe16d20f6ea1b319126765 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:32:32 +0200 Subject: [PATCH 197/218] 1st step to make sequence numbers non-hardcoded --- scripts/automated_ingestion/eessi_task.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7c21f1fe..84c77d4b 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -835,6 +835,7 @@ def _handle_add_new_task(self): @log_function_entry_exit() def _determine_branch_name_from_sequence_number(self, sequence_number: int = None) -> str: """Determine the branch name from the sequence number""" + # TODO: make sequence_number mandatory and thereby remove need for _get_fixed_sequence_number sequence_number = self._get_fixed_sequence_number() if sequence_number is None else sequence_number repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() @@ -863,7 +864,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: return None @log_function_entry_exit() - def _determine_sequence_number(self) -> int: + def _determine_sequence_number_from_pull_request_directory(self) -> int: """Determine the sequence number from the target directory name""" task_pointer_file = self.description.task_object.remote_file_path target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) @@ -963,6 +964,8 @@ def _create_task_summary(self) -> str: feature_branch_name = self._determine_feature_branch_name() repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() + # TODO: determine sequence number from task pointer file and thereby remove need + # for _get_fixed_sequence_number sequence_number = self._get_fixed_sequence_number() # corresponds to an open PR task_file_name = self.description.get_task_file_name() target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" @@ -1041,7 +1044,7 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" - seq_num = self._determine_sequence_number() + seq_num = self._determine_sequence_number_from_pull_request_directory() pr_title = pr_title_format.format( cvmfs_repo=self.cvmfs_repo, pr=pr_number, @@ -1080,7 +1083,7 @@ def _update_pull_request(self, pull_request: PullRequest): repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() pr_url = f"https://github.com/{repo_name}/pull/{pr_number}" - seq_num = self._determine_sequence_number() + seq_num = self._determine_sequence_number_from_pull_request_directory() self._create_task_summary() contents_overview = self._create_pr_contents_overview() From b8240dd25311ef18885623858682ea44de78cc35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:44:17 +0200 Subject: [PATCH 198/218] rename target_dir to pull_request_dir --- scripts/automated_ingestion/eessi_task.py | 60 +++++++++++------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 
84c77d4b..a9cfb263 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -472,9 +472,9 @@ def _read_dict_from_string(self, content: str) -> dict: return config_dict @log_function_entry_exit() - def _read_target_dir_from_file(self, path: str, branch_name: str = None) -> str: + def _read_pull_request_dir_from_file(self, path: str, branch_name: str = None) -> str: """ - Read the target directory from the file in the given branch. + Read the pull request directory from the file in the given branch. """ branch_name = self.git_repo.default_branch if branch_name is None else branch_name content = self.git_repo.get_contents(path, ref=branch_name) @@ -485,7 +485,7 @@ def _read_target_dir_from_file(self, path: str, branch_name: str = None) -> str: # Parse into dictionary config_dict = self._read_dict_from_string(content_str) - return config_dict.get('target_dir', None) + return config_dict.get('pull_request_dir', None) @log_function_entry_exit() def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: @@ -534,9 +534,9 @@ def determine_state(self, branch: str = None) -> TaskState: task_pointer_file, branch_to_use) # get state from task file in branch to use - # - read the TaskState file in target dir - target_dir = self._read_target_dir_from_file(task_pointer_file, branch_to_use) - task_state_file_path = f"{target_dir}/TaskState" + # - read the TaskState file in pull request directory + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, branch_to_use) + task_state_file_path = f"{pull_request_dir}/TaskState" task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch %s: %s", @@ -727,17 +727,17 @@ def _update_file(self, file_path, new_content, commit_message, branch_name: str def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" print("Handling ADD action in UNDETERMINED state: %s" % self.description.get_task_file_name()) - # create target directory (REPO/PR/SEQ/TASK_FILE_NAME/) - # create task file in target directory (TARGET_DIR/TaskDescription) - # create task status file in target directory (TARGET_DIR/TaskState.NEW_TASK) - # create pointer file from task file path to target directory (remote_file_path -> TARGET_DIR) + # create pull request directory (REPO/PR/SEQ/TASK_FILE_NAME/) + # create task file in pull request directory (PULL_REQUEST_DIR/TaskDescription) + # create task status file in pull request directory (PULL_REQUEST_DIR/TaskState.NEW_TASK) + # create pointer file from task file path to pull request directory (remote_file_path -> PULL_REQUEST_DIR) repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() - target_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" - task_description_file_path = f"{target_dir}/TaskDescription" - task_state_file_path = f"{target_dir}/TaskState" + pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + task_description_file_path = f"{pull_request_dir}/TaskDescription" + task_state_file_path = f"{pull_request_dir}/TaskState" remote_file_path = self.description.task_object.remote_file_path files_to_commit = { @@ -750,7 +750,7 @@ def _handle_add_undetermined(self): "mode": "100644" }, remote_file_path: { - "content": 
f"remote_file_path = {remote_file_path}\ntarget_dir = {target_dir}", + "content": f"remote_file_path = {remote_file_path}\npull_request_dir = {pull_request_dir}", "mode": "100644" } } @@ -778,8 +778,8 @@ def _update_task_state_file(self, next_state: TaskState, branch_name: str = None branch_name = self.git_repo.default_branch if branch_name is None else branch_name task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, branch_name) - task_state_file_path = f"{target_dir}/TaskState" + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, branch_name) + task_state_file_path = f"{pull_request_dir}/TaskState" arch = self.description.get_metadata_file_components()[3] commit_message = f"change task state to {next_state} in {branch_name} for {arch}" result = self._update_file(task_state_file_path, @@ -865,28 +865,28 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: @log_function_entry_exit() def _determine_sequence_number_from_pull_request_directory(self) -> int: - """Determine the sequence number from the target directory name""" + """Determine the sequence number from the pull request directory name""" task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) - # target_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) - _, _, _, seq, _ = target_dir.split('/') + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # pull_request_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + _, _, _, seq, _ = pull_request_dir.split('/') return int(seq) @log_function_entry_exit() def _determine_feature_branch_name(self) -> str: - """Determine the feature branch name from the target directory name""" + """Determine the feature branch name from the pull request directory name""" task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) - # target_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) - org, repo, pr, seq, _ = target_dir.split('/') + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + # pull_request_dir is of the form REPO/PR/SEQ/TASK_FILE_NAME/ (REPO contains a '/' separating the org and repo) + org, repo, pr, seq, _ = pull_request_dir.split('/') return f"{org}-{repo}-PR-{pr}-SEQ-{seq}" @log_function_entry_exit() def _sync_task_state_file(self, source_branch: str, target_branch: str): """Update task state file from source to target branch""" task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, self.git_repo.default_branch) - task_state_file_path = f"{target_dir}/TaskState" + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, self.git_repo.default_branch) + task_state_file_path = f"{pull_request_dir}/TaskState" try: # Get content from source branch @@ -968,8 +968,8 @@ def _create_task_summary(self) -> str: # for _get_fixed_sequence_number sequence_number = self._get_fixed_sequence_number() # corresponds to an open PR task_file_name = self.description.get_task_file_name() - target_dir = 
f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" - task_summary_file_path = f"{target_dir}/TaskSummary.html" + pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + task_summary_file_path = f"{pull_request_dir}/TaskSummary.html" # check if task summary file already exists in repo on GitHub if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): @@ -1009,8 +1009,8 @@ def _create_pr_contents_overview(self) -> str: # TODO: implement feature_branch_name = self._determine_feature_branch_name() task_pointer_file = self.description.task_object.remote_file_path - target_dir = self._read_target_dir_from_file(task_pointer_file, feature_branch_name) - pr_dir = os.path.dirname(target_dir) + pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, feature_branch_name) + pr_dir = os.path.dirname(pull_request_dir) directories = self._list_directory_contents(pr_dir, feature_branch_name) contents_overview = "" if directories: From 345bbb7a3b2903c8b631ee2367396d9c59e8ad92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:45:18 +0200 Subject: [PATCH 199/218] remove function to create symlink --- scripts/automated_ingestion/eessi_task.py | 57 ----------------------- 1 file changed, 57 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a9cfb263..f6a4d399 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -571,63 +571,6 @@ def handle(self): return state_before_handle # Implement handlers for ADD action - @log_function_entry_exit() - def _create_symlink(self, source_path: str, target_path: str, branch_name: str = None): - """Create a symlink in the given branch.""" - try: - branch_name = self.git_repo.default_branch if branch_name is None else branch_name - ref = self.git_repo.get_git_ref(f"heads/{branch_name}") - commit = self.git_repo.get_git_commit(ref.object.sha) - base_tree = self.git_repo.get_git_tree(commit.tree.sha) - - # Create blob for symlink target - blob = self.git_repo.create_git_blob(target_path, "utf-8") - log_message(LoggingScope.TASK_OPS, 'INFO', "blob created: %s", blob) - - # Create tree element - tree_element = InputGitTreeElement( - path=source_path, - mode="120000", - type="blob", - sha=blob.sha - ) - log_message(LoggingScope.TASK_OPS, 'INFO', "tree element created: %s", tree_element) - - # Create new tree - try: - new_tree = self.git_repo.create_git_tree([tree_element], base_tree) - log_message(LoggingScope.TASK_OPS, 'INFO', "new tree created: %s", new_tree) - except GithubException as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating new tree: %s", err) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Status Code: %s", err.status) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Error Message: %s", err.data) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Headers: %s", err.headers) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Raw Response: %s", err.response) - return False - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "\n=== General Exception ===") - log_message(LoggingScope.TASK_OPS, 'ERROR', " Type: %s", type(err).__name__) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Message: %s", str(err)) - log_message(LoggingScope.TASK_OPS, 'ERROR', " Traceback:") - log_message(LoggingScope.TASK_OPS, 'ERROR', " %s", traceback.format_exc()) - return False - - # Create new commit - commit_message = f"Add symlink 
{source_path} -> {target_path}" - new_commit = self.git_repo.create_git_commit(commit_message, new_tree, [commit]) - log_message(LoggingScope.TASK_OPS, 'INFO', "new commit created: %s", new_commit) - - # Update reference - ref.edit(new_commit.sha) - - log_message(LoggingScope.TASK_OPS, 'INFO', "Symlink created: %s -> %s", - source_path, target_path) - return True - - except Exception as err: - log_message(LoggingScope.TASK_OPS, 'ERROR', "Error creating symlink: %s", err) - return False - @log_function_entry_exit() def _safe_create_file(self, path: str, message: str, content: str, branch_name: str = None): """Create a file in the given branch.""" From e8004155b168df215d3e5582619f465735c1838d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:47:50 +0200 Subject: [PATCH 200/218] remove function to obtain branch name from sequence number --- scripts/automated_ingestion/eessi_task.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index f6a4d399..0bda49ff 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -775,15 +775,6 @@ def _handle_add_new_task(self): # is still open or yet to be created); if it is not valid, perform corrective actions return next_state - @log_function_entry_exit() - def _determine_branch_name_from_sequence_number(self, sequence_number: int = None) -> str: - """Determine the branch name from the sequence number""" - # TODO: make sequence_number mandatory and thereby remove need for _get_fixed_sequence_number - sequence_number = self._get_fixed_sequence_number() if sequence_number is None else sequence_number - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - return f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}" - @log_function_entry_exit() def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: """ From d717527a2ac7e076b69cf28c919ef99ea27531d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 12:54:46 +0200 Subject: [PATCH 201/218] remove one use of _get_fixed_sequence_number --- scripts/automated_ingestion/eessi_task.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0bda49ff..0c9057eb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -895,17 +895,15 @@ def _create_task_summary(self) -> str: """Analyse contents of current task and create a file for it in the REPO-PR-SEQ directory.""" # determine task summary file path in feature branch on GitHub - feature_branch_name = self._determine_feature_branch_name() repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - # TODO: determine sequence number from task pointer file and thereby remove need - # for _get_fixed_sequence_number - sequence_number = self._get_fixed_sequence_number() # corresponds to an open PR + sequence_number = self._determine_sequence_number_from_pull_request_directory() task_file_name = self.description.get_task_file_name() pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_summary_file_path = f"{pull_request_dir}/TaskSummary.html" # check if task summary file already exists in repo on GitHub + feature_branch_name = self._determine_feature_branch_name() if 
self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) From 868b23be65aa2ec976146c8df683bc87df6aa483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 13:08:32 +0200 Subject: [PATCH 202/218] improve ways to obtain pull request directory --- scripts/automated_ingestion/eessi_task.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 0c9057eb..be09eb5a 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -472,12 +472,18 @@ def _read_dict_from_string(self, content: str) -> dict: return config_dict @log_function_entry_exit() - def _read_pull_request_dir_from_file(self, path: str, branch_name: str = None) -> str: + def _read_pull_request_dir_from_file(self, task_pointer_file: str = None, branch_name: str = None) -> str: """ Read the pull request directory from the file in the given branch. """ - branch_name = self.git_repo.default_branch if branch_name is None else branch_name - content = self.git_repo.get_contents(path, ref=branch_name) + # set default values for task pointer file and branch name + if task_pointer_file is None: + task_pointer_file = self.description.task_object.remote_file_path + if branch_name is None: + branch_name = self.git_repo.default_branch + + # read the pull request directory from the file in the given branch + content = self.git_repo.get_contents(task_pointer_file, ref=branch_name) # Decode the content from base64 content_str = content.decoded_content.decode('utf-8') @@ -487,6 +493,11 @@ def _read_pull_request_dir_from_file(self, path: str, branch_name: str = None) - return config_dict.get('pull_request_dir', None) + @log_function_entry_exit() + def _determine_pull_request_dir(self, task_pointer_file: str = None, branch_name: str = None) -> str: + """Determine the pull request directory via the task pointer file""" + return self._read_pull_request_dir_from_file(task_pointer_file=task_pointer_file, branch_name=branch_name) + @log_function_entry_exit() def _get_branch_from_name(self, branch_name: str = None) -> Optional[Branch]: """ @@ -535,7 +546,7 @@ def determine_state(self, branch: str = None) -> TaskState: # get state from task file in branch to use # - read the TaskState file in pull request directory - pull_request_dir = self._read_pull_request_dir_from_file(task_pointer_file, branch_to_use) + pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use) task_state_file_path = f"{pull_request_dir}/TaskState" task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) @@ -678,7 +689,7 @@ def _handle_add_undetermined(self): pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() - pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + pull_request_dir = self._determine_pull_request_dir() task_description_file_path = f"{pull_request_dir}/TaskDescription" task_state_file_path = f"{pull_request_dir}/TaskState" remote_file_path = self.description.task_object.remote_file_path From dd0e8bbf4ca0f098e1e96ea4eb66682d4bebb127 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 13:13:37 +0200 Subject: [PATCH 203/218] clarify how to determine pull_request_dir --- scripts/automated_ingestion/eessi_task.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index be09eb5a..a05e57ea 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -681,6 +681,8 @@ def _update_file(self, file_path, new_content, commit_message, branch_name: str def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" print("Handling ADD action in UNDETERMINED state: %s" % self.description.get_task_file_name()) + # task is in state UNDETERMINED if there is no pull request directory for the task yet + # # create pull request directory (REPO/PR/SEQ/TASK_FILE_NAME/) # create task file in pull request directory (PULL_REQUEST_DIR/TaskDescription) # create task status file in pull request directory (PULL_REQUEST_DIR/TaskState.NEW_TASK) @@ -689,7 +691,9 @@ def _handle_add_undetermined(self): pr_number = self.description.get_pr_number() sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() - pull_request_dir = self._determine_pull_request_dir() + # we cannot use self._determine_pull_request_dir() here because it requires a task pointer file + # and we don't have one yet + pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" task_description_file_path = f"{pull_request_dir}/TaskDescription" task_state_file_path = f"{pull_request_dir}/TaskState" remote_file_path = self.description.task_object.remote_file_path From 20a7ae87ce41507df1f19f165bec0025a52cb3d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 13:16:34 +0200 Subject: [PATCH 204/218] remove need for determining sequence number in _create_task_summary --- scripts/automated_ingestion/eessi_task.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index a05e57ea..d7cf6ecc 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -910,15 +910,11 @@ def _create_task_summary(self) -> str: """Analyse contents of current task and create a file for it in the REPO-PR-SEQ directory.""" # determine task summary file path in feature branch on GitHub - repo_name = self.description.get_repo_name() - pr_number = self.description.get_pr_number() - sequence_number = self._determine_sequence_number_from_pull_request_directory() - task_file_name = self.description.get_task_file_name() - pull_request_dir = f"{repo_name}/{pr_number}/{sequence_number}/{task_file_name}" + feature_branch_name = self._determine_feature_branch_name() + pull_request_dir = self._determine_pull_request_dir(branch_name=feature_branch_name) task_summary_file_path = f"{pull_request_dir}/TaskSummary.html" # check if task summary file already exists in repo on GitHub - feature_branch_name = self._determine_feature_branch_name() if self._path_exists_in_branch(task_summary_file_path, feature_branch_name): log_message(LoggingScope.TASK_OPS, 'INFO', "task summary file already exists: %s", task_summary_file_path) task_summary = self.git_repo.get_contents(task_summary_file_path, ref=feature_branch_name) @@ -942,6 +938,7 @@ def 
_create_task_summary(self) -> str: # create HTML file with task summary in REPO-PR-SEQ directory # TODO: add failure handling (capture result and act on it) + task_file_name = self.description.get_task_file_name() commit_message = f"create summary for {task_file_name} in {feature_branch_name}" self._safe_create_file(task_summary_file_path, commit_message, task_summary, branch_name=feature_branch_name) From 814cd50cef8dd4e4442a693180406b317d841356 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 15:42:40 +0200 Subject: [PATCH 205/218] add function to determine sequence number and use the function --- scripts/automated_ingestion/eessi_task.py | 64 ++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index d7cf6ecc..7368cecb 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -677,6 +677,60 @@ def _update_file(self, file_path, new_content, commit_message, branch_name: str log_message(LoggingScope.TASK_OPS, 'ERROR', "Error updating file: %s", err) return None + @log_function_entry_exit() + def _sorted_list_of_sequence_numbers(self) -> List[int]: + """Create a sorted list of sequence numbers from the pull requests directory""" + # a pull request's directory is of the form REPO/PR/SEQ + # hence, we can get all sequence numbers from the pull requests directory REPO/PR + sequence_numbers = [] + repo_pr_dir = f"{self.description.get_repo_name()}/{self.description.get_pr_number()}" + + # iterate over all directories under repo_pr_dir + try: + directories = self._list_directory_contents(repo_pr_dir) + for dir in directories: + # check if the directory is a number + if dir.name.isdigit(): + sequence_numbers.append(int(dir.name)) + else: + # directory is not a number, so we skip it + continue + except FileNotFoundError: + # repo_pr_dir does not exist, so we return an empty list + log_message(LoggingScope.TASK_OPS, 'ERROR', "Pull requests directory '%s' does not exist", repo_pr_dir) + except GithubException as err: + if err.status != 404: # 404 is caught by FileNotFoundError + # some other error than the directory not existing + log_message(LoggingScope.TASK_OPS, 'ERROR', + "Some other error than the directory not existing: %s", err) + except Exception as err: + log_message(LoggingScope.TASK_OPS, 'ERROR', "Unexpected error: %s", err) + + return sorted(sequence_numbers) + + @log_function_entry_exit() + def _determine_sequence_number(self) -> int: + """Determine the sequence number for the task""" + + sequence_numbers = self._sorted_list_of_sequence_numbers() + if len(sequence_numbers) == 0: + return 0 + + # get the highest sequence number + highest_sequence_number = sequence_numbers[-1] + + pull_request = self._find_pr_for_sequence_number(highest_sequence_number) + if pull_request is None: + # the directory for the sequence number exists but no PR yet + return highest_sequence_number + else: + if pull_request.is_merged(): + # the PR is merged, so we use the next sequence number + return highest_sequence_number + 1 + else: + # the PR is not merged, so we can use the current sequence number + return highest_sequence_number + @log_function_entry_exit() def _handle_add_undetermined(self): """Handler for ADD action in UNDETERMINED state""" @@ -689,7 +743,7 @@ def _handle_add_undetermined(self): # create pointer file from task file path to pull request directory (remote_file_path -> 
PULL_REQUEST_DIR) repo_name = self.description.get_repo_name() pr_number = self.description.get_pr_number() - sequence_number = self._get_fixed_sequence_number() # corresponds to an open or yet to be created PR + sequence_number = self._determine_sequence_number() # corresponds to an open or yet to be created PR task_file_name = self.description.get_task_file_name() # we cannot use self._determine_pull_request_dir() here because it requires a task pointer file # and we don't have one yet @@ -812,6 +866,14 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]: log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err) return None + @log_function_entry_exit() + def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullRequest]: + """Find the PR for the given sequence number""" + repo_name = self.description.get_repo_name() + pr_number = self.description.get_pr_number() + feature_branch_name = f"{repo_name}-PR-{pr_number}-SEQ-{sequence_number}" + return self._find_pr_for_branch(feature_branch_name) + @log_function_entry_exit() def _determine_sequence_number_from_pull_request_directory(self) -> int: """Determine the sequence number from the pull request directory name""" From cff761ebabc8a5fe6984a3fa044f5374f1bf9cfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 16:07:18 +0200 Subject: [PATCH 206/218] improve logging when determining task state --- scripts/automated_ingestion/eessi_task.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 7368cecb..35b2ee02 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -541,20 +541,22 @@ def determine_state(self, branch: str = None) -> TaskState: branch_to_use = self.git_repo.default_branch if branch is None else branch if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use): - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s", + log_message(LoggingScope.TASK_OPS, 'INFO', "path '%s' exists in branch '%s'", task_pointer_file, branch_to_use) # get state from task file in branch to use # - read the TaskState file in pull request directory pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use) + log_message(LoggingScope.TASK_OPS, 'INFO', "pull request directory: '%s'", pull_request_dir) task_state_file_path = f"{pull_request_dir}/TaskState" + log_message(LoggingScope.TASK_OPS, 'INFO', "task state file path: '%s'", task_state_file_path) task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use) - log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch %s: %s", + log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch '%s': %s", branch_to_use, task_state) return task_state else: - log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in branch %s", + log_message(LoggingScope.TASK_OPS, 'INFO', "path '%s' does not exist in branch '%s'", task_pointer_file, branch_to_use) return TaskState.UNDETERMINED From d85e017c654116244deb2eeb093a7b4ed0a233aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 16:12:48 +0200 Subject: [PATCH 207/218] add logging and consider 'target_dir' attr name --- scripts/automated_ingestion/eessi_task.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
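The selection rule implemented by `_determine_sequence_number()` above can be summarized in a standalone sketch (simplified types, illustrative function name; the real method looks the PR up via GitHub):

```python
# Sketch of the rule: pick the highest existing sequence number; only move on to
# the next one if the PR for that sequence number is already merged.
from typing import List, Optional

def next_sequence_number(existing: List[int], pr_is_merged: Optional[bool]) -> int:
    """existing: sorted sequence numbers; pr_is_merged: None if no PR exists yet."""
    if not existing:
        return 0
    highest = existing[-1]
    if pr_is_merged:          # PR exists and is merged: start a new sequence
        return highest + 1
    return highest            # no PR yet, or PR still open: reuse the sequence

assert next_sequence_number([], None) == 0
assert next_sequence_number([0, 1], None) == 1    # directory exists, no PR yet
assert next_sequence_number([0, 1], False) == 1   # PR still open: keep using it
assert next_sequence_number([0, 1], True) == 2    # PR merged: next sequence
```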
From cff761ebabc8a5fe6984a3fa044f5374f1bf9cfa Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:07:18 +0200
Subject: [PATCH 206/218] improve logging when determining task state

---
 scripts/automated_ingestion/eessi_task.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 7368cecb..35b2ee02 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -541,20 +541,22 @@ def determine_state(self, branch: str = None) -> TaskState:
         branch_to_use = self.git_repo.default_branch if branch is None else branch

         if self._path_exists_in_branch(task_pointer_file, branch_name=branch_to_use):
-            log_message(LoggingScope.TASK_OPS, 'INFO', "path %s exists in branch %s",
+            log_message(LoggingScope.TASK_OPS, 'INFO', "path '%s' exists in branch '%s'",
                         task_pointer_file, branch_to_use)

             # get state from task file in branch to use
             #  - read the TaskState file in pull request directory
             pull_request_dir = self._determine_pull_request_dir(branch_name=branch_to_use)
+            log_message(LoggingScope.TASK_OPS, 'INFO', "pull request directory: '%s'", pull_request_dir)
             task_state_file_path = f"{pull_request_dir}/TaskState"
+            log_message(LoggingScope.TASK_OPS, 'INFO', "task state file path: '%s'", task_state_file_path)
             task_state = self._read_task_state_from_file(task_state_file_path, branch_to_use)
-            log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch %s: %s",
+            log_message(LoggingScope.TASK_OPS, 'INFO', "task state in branch '%s': %s",
                         branch_to_use, task_state)
             return task_state
         else:
-            log_message(LoggingScope.TASK_OPS, 'INFO', "path %s does not exist in branch %s",
+            log_message(LoggingScope.TASK_OPS, 'INFO', "path '%s' does not exist in branch '%s'",
                         task_pointer_file, branch_to_use)
             return TaskState.UNDETERMINED

From d85e017c654116244deb2eeb093a7b4ed0a233aa Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:12:48 +0200
Subject: [PATCH 207/218] add logging and consider 'target_dir' attr name

---
 scripts/automated_ingestion/eessi_task.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 35b2ee02..673c51d6 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -481,6 +481,8 @@ def _read_pull_request_dir_from_file(self, task_pointer_file: str = None, branch
             task_pointer_file = self.description.task_object.remote_file_path
         if branch_name is None:
             branch_name = self.git_repo.default_branch
+        log_message(LoggingScope.TASK_OPS, 'INFO', "reading pull request directory from file '%s' in branch '%s'",
+                    task_pointer_file, branch_name)

         # read the pull request directory from the file in the given branch
         content = self.git_repo.get_contents(task_pointer_file, ref=branch_name)
@@ -491,7 +493,8 @@ def _read_pull_request_dir_from_file(self, task_pointer_file: str = None, branch
         # Parse into dictionary
         config_dict = self._read_dict_from_string(content_str)

-        return config_dict.get('pull_request_dir', None)
+        target_dir = config_dict.get('target_dir', None)
+        return config_dict.get('pull_request_dir', target_dir)

     @log_function_entry_exit()
     def _determine_pull_request_dir(self, task_pointer_file: str = None, branch_name: str = None) -> str:

From cf081158a85e65fc59ff3a23fad6feed2a4fe883 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:34:13 +0200
Subject: [PATCH 208/218] add logging for determining sequence number and pull request

---
 scripts/automated_ingestion/eessi_task.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 673c51d6..7a4b789d 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -718,14 +718,22 @@ def _determine_sequence_number(self) -> int:
         """Determine the sequence number for the task"""

         sequence_numbers = self._sorted_list_of_sequence_numbers()
+        log_message(LoggingScope.TASK_OPS, 'INFO', "number of sequence numbers: %d", len(sequence_numbers))
         if len(sequence_numbers) == 0:
             return 0

+        log_message(LoggingScope.TASK_OPS, 'INFO', "sequence numbers: [%s]", ", ".join(map(str, sequence_numbers)))
+
         # get the highest sequence number
         highest_sequence_number = sequence_numbers[-1]
+        log_message(LoggingScope.TASK_OPS, 'INFO', "highest sequence number: %d", highest_sequence_number)

         pull_request = self._find_pr_for_sequence_number(highest_sequence_number)
+        log_message(LoggingScope.TASK_OPS, 'INFO', "pull request: %s", pull_request)
+
         if pull_request is None:
+            log_message(LoggingScope.TASK_OPS, 'INFO', "Did not find pull request for sequence number %d",
+                        highest_sequence_number)
             # the directory for the sequence number exists but no PR yet
             return highest_sequence_number
         else:
@@ -863,9 +871,13 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
         """
         try:
             head_ref = f"{self.git_repo.owner.login}:{branch_name}"
+            log_message(LoggingScope.TASK_OPS, 'INFO', "searching for PRs with head_ref: '%s'", head_ref)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
             prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref))
                    if pr.number not in filter_prs]
+            log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
+            if len(prs):
+                log_message(LoggingScope.TASK_OPS, 'INFO', "1st PR found: %d", prs[0].number)
             return prs[0] if prs else None
         except Exception as err:
             log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err)

From f94f69d37f34d91626860cb1ef72a55cabc322ee Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:47:46 +0200
Subject: [PATCH 209/218] print head refs for all PRs

---
 scripts/automated_ingestion/eessi_task.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 7a4b789d..b3f695d6 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -873,6 +873,11 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
             head_ref = f"{self.git_repo.owner.login}:{branch_name}"
             log_message(LoggingScope.TASK_OPS, 'INFO', "searching for PRs with head_ref: '%s'", head_ref)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
+
+            all_prs = list(self.git_repo.get_pulls(state='all'))
+            for pr in all_prs:
+                log_message(LoggingScope.TASK_OPS, 'INFO', "PR #{pr.number}: {pr.head.ref}")
+
             prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref))
                    if pr.number not in filter_prs]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))

From 06faa4b4e8ae90c2e8b151835881cfb31f8a9313 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:50:20 +0200
Subject: [PATCH 210/218] fix PR head ref logging

---
 scripts/automated_ingestion/eessi_task.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index b3f695d6..5a18c30c 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -876,7 +876,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:

             all_prs = list(self.git_repo.get_pulls(state='all'))
             for pr in all_prs:
-                log_message(LoggingScope.TASK_OPS, 'INFO', "PR #{pr.number}: {pr.head.ref}")
+                log_message(LoggingScope.TASK_OPS, 'INFO', "PR #%d: %s", pr.number, pr.head.ref)

             prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref))
                    if pr.number not in filter_prs]

From 9979adffc1223c327184ee572f50067f5757a005 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 16:54:24 +0200
Subject: [PATCH 211/218] do not use login when searching for PRs

---
 scripts/automated_ingestion/eessi_task.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 5a18c30c..20512a4f 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -870,7 +870,9 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
             PullRequest object if found, None otherwise
         """
         try:
-            head_ref = f"{self.git_repo.owner.login}:{branch_name}"
+            # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
+            # apparently, the head_ref does not contain the login
+            head_ref = f"{branch_name}"
             log_message(LoggingScope.TASK_OPS, 'INFO', "searching for PRs with head_ref: '%s'", head_ref)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
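The bug fixed in patch 210 is a common logging pitfall: the message added in patch 209 was a plain string, so the `{...}` placeholders were logged verbatim instead of being interpolated. A standalone sketch using the stdlib `logging` module for illustration (the patches use the project's own `log_message` helper, which follows the same %-style convention):

```python
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

number, ref = 17, "software-layer-PR-42-SEQ-0"   # made-up example values

log.info("PR #{number}: {ref}")       # bug: plain string, braces logged literally
log.info(f"PR #{number}: {ref}")      # works, but formats eagerly even when filtered out
log.info("PR #%d: %s", number, ref)   # the fix: lazy %-style interpolation (patch 210)
```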
From af021aef9ffc4a3877394665caf5671d593a9529 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 17:27:36 +0200
Subject: [PATCH 212/218] improve function to determine PR

---
 scripts/automated_ingestion/eessi_task.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 20512a4f..dfdadf6c 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -737,6 +737,9 @@ def _determine_sequence_number(self) -> int:
             # the directory for the sequence number exists but no PR yet
             return highest_sequence_number
         else:
+            log_message(LoggingScope.TASK_OPS, 'INFO', "pull request found: %s", pull_request)
+            log_message(LoggingScope.TASK_OPS, 'INFO', "pull request state/merged: %s/%s",
+                        pull_request.state, str(pull_request.is_merged()))
             if pull_request.is_merged():
                 # the PR is merged, so we use the next sequence number
                 return highest_sequence_number + 1
@@ -872,15 +875,22 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
         try:
             # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
             # apparently, the head_ref does not contain the login
-            head_ref = f"{branch_name}"
-            log_message(LoggingScope.TASK_OPS, 'INFO', "searching for PRs with head_ref: '%s'", head_ref)
+            last_dash = branch_name.rfind('-')
+            if last_dash != -1:
+                head_ref_wout_seq_num = branch_name[:last_dash + 1]  # +1 to include the separator
+            else:
+                head_ref_wout_seq_num = branch_name
+
+            log_message(LoggingScope.TASK_OPS, 'INFO',
+                        "searching for PRs starting with head_ref: '%s'", head_ref_wout_seq_num)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
-            all_prs = list(self.git_repo.get_pulls(state='all'))
+            all_prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
+                       if pr.head.ref.startswith(head_ref_wout_seq_num)]
             for pr in all_prs:
                 log_message(LoggingScope.TASK_OPS, 'INFO', "PR #%d: %s", pr.number, pr.head.ref)

-            prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=head_ref))
+            prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=branch_name))
                    if pr.number not in filter_prs]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
             if len(prs):
@@ -896,7 +906,10 @@ def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullReq
         repo_name = self.description.get_repo_name()
         pr_number = self.description.get_pr_number()
         feature_branch_name = f"{repo_name}-PR-{pr_number}-SEQ-{sequence_number}"
-        return self._find_pr_for_branch(feature_branch_name)
+        pull_request = self._find_pr_for_branch(feature_branch_name)
+        log_message(LoggingScope.TASK_OPS, 'INFO', "pull request for branch '%s': %s",
+                    feature_branch_name, pull_request)
+        return pull_request
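The debugging aid added in patch 212 derives a branch-name prefix by cutting at the last dash, so that all sequence branches belonging to one source PR can be listed together. In isolation (illustrative function name):

```python
# Sketch of the prefix computation: strip the trailing sequence number but keep
# the separator, so str.startswith() matches all SEQ-* branches of the same PR.
def head_ref_prefix(branch_name: str) -> str:
    """'repo-PR-42-SEQ-3' -> 'repo-PR-42-SEQ-' (keep the separator)."""
    last_dash = branch_name.rfind('-')
    return branch_name[:last_dash + 1] if last_dash != -1 else branch_name

assert head_ref_prefix("software-layer-PR-42-SEQ-3") == "software-layer-PR-42-SEQ-"
assert head_ref_prefix("nodash") == "nodash"   # no separator: use the full name
```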
From 251b60eae5f36da0c0d6d35e904c3643a378b033 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 17:35:58 +0200
Subject: [PATCH 213/218] fix branch name

---
 scripts/automated_ingestion/eessi_task.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index dfdadf6c..86ccae01 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -905,7 +905,7 @@ def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullReq
         """Find the PR for the given sequence number"""
         repo_name = self.description.get_repo_name()
         pr_number = self.description.get_pr_number()
-        feature_branch_name = f"{repo_name}-PR-{pr_number}-SEQ-{sequence_number}"
+        feature_branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}"
         pull_request = self._find_pr_for_branch(feature_branch_name)
         log_message(LoggingScope.TASK_OPS, 'INFO', "pull request for branch '%s': %s",
                     feature_branch_name, pull_request)
         return pull_request

From 2275007fbacd09ffbc62fe81d878011122610d9d Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 18:04:52 +0200
Subject: [PATCH 214/218] restructure logging when determining PR

---
 scripts/automated_ingestion/eessi_task.py | 34 +++++++++++++----------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 86ccae01..50c6ea2a 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -875,21 +875,7 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
         try:
             # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
             # apparently, the head_ref does not contain the login
-            last_dash = branch_name.rfind('-')
-            if last_dash != -1:
-                head_ref_wout_seq_num = branch_name[:last_dash + 1]  # +1 to include the separator
-            else:
-                head_ref_wout_seq_num = branch_name
-
-            log_message(LoggingScope.TASK_OPS, 'INFO',
-                        "searching for PRs starting with head_ref: '%s'", head_ref_wout_seq_num)
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
-
-            all_prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
-                       if pr.head.ref.startswith(head_ref_wout_seq_num)]
-            for pr in all_prs:
-                log_message(LoggingScope.TASK_OPS, 'INFO', "PR #%d: %s", pr.number, pr.head.ref)
-
             prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=branch_name))
                    if pr.number not in filter_prs]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
@@ -906,6 +892,26 @@ def _find_pr_for_sequence_number(self, sequence_number: int) -> Optional[PullReq
         repo_name = self.description.get_repo_name()
         pr_number = self.description.get_pr_number()
         feature_branch_name = f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}"
+
+        # list all PRs with head_ref starting with the feature branch name without the sequence number
+        last_dash = feature_branch_name.rfind('-')
+        if last_dash != -1:
+            head_ref_wout_seq_num = feature_branch_name[:last_dash + 1]  # +1 to include the separator
+        else:
+            head_ref_wout_seq_num = feature_branch_name
+
+        log_message(LoggingScope.TASK_OPS, 'INFO',
+                    "searching for PRs whose head_ref starts with: '%s'", head_ref_wout_seq_num)
+
+        all_prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
+                   if pr.head.ref.startswith(head_ref_wout_seq_num)]
+        log_message(LoggingScope.TASK_OPS, 'INFO', "  number of PRs found: %d", len(all_prs))
+        for pr in all_prs:
+            log_message(LoggingScope.TASK_OPS, 'INFO', "  PR #%d: %s", pr.number, pr.head.ref)
+
+        # now, find the PR for the feature branch name (if any)
+        log_message(LoggingScope.TASK_OPS, 'INFO',
+                    "searching PR for feature branch name: '%s'", feature_branch_name)
         pull_request = self._find_pr_for_branch(feature_branch_name)
         log_message(LoggingScope.TASK_OPS, 'INFO', "pull request for branch '%s': %s",
                     feature_branch_name, pull_request)
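Patch 213 fixes the fact that `repo_name` has the `owner/repo` form, so the slash ended up in a branch name that never matched the feature branches actually created. A minimal sketch of the corrected naming scheme (function name and example values are illustrative):

```python
# Sketch of the feature branch naming after patch 213: flatten 'owner/repo'
# into dashes before composing the branch name.
def feature_branch_name(repo_name: str, pr_number: int, sequence_number: int) -> str:
    return f"{repo_name.replace('/', '-')}-PR-{pr_number}-SEQ-{sequence_number}"

assert feature_branch_name("EESSI/software-layer", 42, 0) == \
    "EESSI-software-layer-PR-42-SEQ-0"
```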
 scripts/automated_ingestion/eessi_task.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 50c6ea2a..784cbb73 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -876,11 +876,11 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
             # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
             # apparently, the head_ref does not contain the login
             filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
-            prs = [pr for pr in list(self.git_repo.get_pulls(state='all', head=branch_name))
-                   if pr.number not in filter_prs]
+            prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
+                   if pr.number not in filter_prs and pr.head.ref == branch_name]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
             if len(prs):
-                log_message(LoggingScope.TASK_OPS, 'INFO', "1st PR found: %d", prs[0].number)
+                log_message(LoggingScope.TASK_OPS, 'INFO', "1st PR found: %d, %s", prs[0].number, prs[0].head.ref)
             return prs[0] if prs else None
         except Exception as err:
             log_message(LoggingScope.TASK_OPS, 'ERROR', "Error finding PR for branch %s: %s", branch_name, err)

From 29d17fc0b8e06ea33f2f7cfe700a2536557c62b0 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 18:57:38 +0200
Subject: [PATCH 216/218] little code cleanup

---
 scripts/automated_ingestion/eessi_task.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index 784cbb73..be0ce67c 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -873,11 +873,8 @@ def _find_pr_for_branch(self, branch_name: str) -> Optional[PullRequest]:
             PullRequest object if found, None otherwise
         """
         try:
-            # head_ref = f"{self.git_repo.owner.login}:{branch_name}"
-            # apparently, the head_ref does not contain the login
-            filter_prs = [16, 17, 18, 19, 20, 21, 22]  # TODO: remove this once the PR is merged
             prs = [pr for pr in list(self.git_repo.get_pulls(state='all'))
-                   if pr.number not in filter_prs and pr.head.ref == branch_name]
+                   if pr.head.ref == branch_name]
             log_message(LoggingScope.TASK_OPS, 'INFO', "number of PRs found: %d", len(prs))
             if len(prs):
                 log_message(LoggingScope.TASK_OPS, 'INFO', "1st PR found: %d, %s", prs[0].number, prs[0].head.ref)
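After patches 215 and 216, the PR lookup no longer relies on the server-side `head=` filter at all: it lists all PRs and compares `pr.head.ref` client-side. A stripped-down sketch of that approach, assuming a PyGithub `Repository` object and omitting the logging and error handling of the real method:

```python
from typing import Optional

def find_pr_for_branch(repo, branch_name: str) -> Optional[object]:
    """Return the first PR whose head branch equals branch_name, else None.

    repo is assumed to be a github.Repository.Repository (PyGithub).
    """
    for pr in repo.get_pulls(state='all'):
        if pr.head.ref == branch_name:
            return pr
    return None
```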
From be9443c659615e93db46fff1850cd7adb35121b8 Mon Sep 17 00:00:00 2001
From: Thomas Röblitz
Date: Sun, 22 Jun 2025 20:43:37 +0200
Subject: [PATCH 217/218] reformat PR body and make format configurable

---
 scripts/automated_ingestion/eessi_task.py     | 22 ++++++++------
 .../automated_ingestion/eessi_task_payload.py | 20 ++++++++---------
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py
index be0ce67c..5e54597e 100644
--- a/scripts/automated_ingestion/eessi_task.py
+++ b/scripts/automated_ingestion/eessi_task.py
@@ -1027,16 +1027,12 @@ def _create_task_summary(self) -> str:
         payload_name = self.description.metadata['payload']['filename']
         payload_summary = self.payload.analyse_contents()
         metadata_contents = self.description.get_contents()
-        task_summary = f"<details>\n<summary>{payload_name}</summary>\n\n"
-        task_summary += "<details>\n<summary>Metadata</summary>\n\n"
-        task_summary += f"```\n{metadata_contents}\n```\n</details>\n"
-        task_summary += "<details>\n<summary>Overview of payload contents</summary>\n\n"
-        task_summary += self.config['github']['task_summary_payload_template'].format(
-            payload_overview=payload_summary,
+
+        task_summary = self.config['github']['task_summary_payload_template'].format(
+            payload_name=payload_name,
+            metadata_contents=metadata_contents,
+            payload_overview=payload_summary
         )
-        task_summary += "</details>\n"
-        task_summary += "\n"
-        task_summary += "</details>\n"

         # create HTML file with task summary in REPO-PR-SEQ directory
         # TODO: add failure handling (capture result and act on it)
@@ -1106,8 +1102,8 @@ def _create_pull_request(self, feature_branch_name: str, default_branch_name: st
             repo=repo_name,
             seq_num=seq_num,
             contents=contents_overview,
-            analysis="TO BE DONE",
-            action="TO BE DONE",
+            analysis="<details>TO BE DONE</details>",
+            action="<details>TO BE DONE</details>",
         )
         pr = self.git_repo.create_pull(
             title=pr_title,
@@ -1141,8 +1137,8 @@ def _update_pull_request(self, pull_request: PullRequest):
             repo=repo_name,
             seq_num=seq_num,
             contents=contents_overview,
-            analysis="TO BE DONE",
-            action="TO BE DONE",
+            analysis="<details>TO BE DONE</details>",
+            action="<details>TO BE DONE</details>",
         )
         pull_request.edit(body=pr_body)

diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py
index c8f82df2..39ab3724 100644
--- a/scripts/automated_ingestion/eessi_task_payload.py
+++ b/scripts/automated_ingestion/eessi_task_payload.py
@@ -85,16 +85,16 @@ def analyse_contents(self) -> str:
         ]
         members_list = sorted(swdirs + modfiles + other)

-        # Construct the overview.
-        tar_members = '\n'.join(members_list)
-        overview = f"Total number of items in the tarball: {tar_num_members}"
-        bucket_url = self.payload_object.remote_client.get_bucket_url()
-        remote_file_path = self.payload_object.remote_file_path
-        overview += f"\nURL to the tarball: {bucket_url}/{remote_file_path}"
-        overview += f"\n{tar_members_desc}\n\n"
-        overview += f"```\n{tar_members}\n```\n"
-
-        # Make sure that the overview does not exceed Github's maximum length (65536 characters).
+        # Construct the overview
+        overview = self.config['github']['task_summary_payload_overview_template'].format(
+            tar_num_members=tar_num_members,
+            bucket_url=self.payload_object.remote_client.get_bucket_url(),
+            remote_file_path=self.payload_object.remote_file_path,
+            tar_members_desc=tar_members_desc,
+            tar_members='\n'.join(members_list)
+        )
+
+        # Make sure that the overview does not exceed Github's maximum length (65536 characters)
         if len(overview) > 60000:
            overview = overview[:60000] + "\n\nWARNING: output exceeded the maximum length and was truncated!\n```"
         return overview
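Patch 217 (together with patch 218 below) moves the hard-coded HTML skeleton out of the code and into a config template, so the code only supplies values. A self-contained sketch of that `str.format`-based approach; the template string below is illustrative, not the exact `task_summary_payload_template` shipped in the config, and the values are made up:

```python
# Illustrative template; the real one lives in automated_ingestion.cfg.
template = (
    "<details>\n<summary>{payload_name}</summary>\n\n"
    "<details>\n<summary>Metadata</summary>\n\n{metadata_contents}\n</details>\n\n"
    "<details>\n<summary>Overview of payload contents</summary>\n\n"
    "{payload_overview}\n</details>\n</details>\n"
)

task_summary = template.format(
    payload_name="example-1234.tar.gz",
    metadata_contents='{"link2pr": {"repo": "EESSI/software-layer", "pr": 42}}',
    payload_overview="Total number of items in the tarball: 123",
)
print(task_summary)
```

Keeping the markup in the config means the PR body layout can be changed without touching the ingestion code, which is the design choice these two patches implement.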
      ", ) pull_request.edit(body=pr_body) diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index c8f82df2..39ab3724 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -85,16 +85,16 @@ def analyse_contents(self) -> str: ] members_list = sorted(swdirs + modfiles + other) - # Construct the overview. - tar_members = '\n'.join(members_list) - overview = f"Total number of items in the tarball: {tar_num_members}" - bucket_url = self.payload_object.remote_client.get_bucket_url() - remote_file_path = self.payload_object.remote_file_path - overview += f"\nURL to the tarball: {bucket_url}/{remote_file_path}" - overview += f"\n{tar_members_desc}\n\n" - overview += f"```\n{tar_members}\n```\n" - - # Make sure that the overview does not exceed Github's maximum length (65536 characters). + # Construct the overview + overview = self.config['github']['task_summary_payload_overview_template'].format( + tar_num_members=tar_num_members, + bucket_url=self.payload_object.remote_client.get_bucket_url(), + remote_file_path=self.payload_object.remote_file_path, + tar_members_desc=tar_members_desc, + tar_members='\n'.join(members_list) + ) + + # Make sure that the overview does not exceed Github's maximum length (65536 characters) if len(overview) > 60000: overview = overview[:60000] + "\n\nWARNING: output exceeded the maximum length and was truncated!\n```" return overview From 3713639e48c339c548158513bc6a1e7ec2cf09da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Sun, 22 Jun 2025 20:52:47 +0200 Subject: [PATCH 218/218] make config dictionary available for analyse_contents --- scripts/automated_ingestion/eessi_task.py | 2 +- scripts/automated_ingestion/eessi_task_payload.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/automated_ingestion/eessi_task.py b/scripts/automated_ingestion/eessi_task.py index 5e54597e..bd863946 100644 --- a/scripts/automated_ingestion/eessi_task.py +++ b/scripts/automated_ingestion/eessi_task.py @@ -1025,7 +1025,7 @@ def _create_task_summary(self) -> str: # create task summary payload_name = self.description.metadata['payload']['filename'] - payload_summary = self.payload.analyse_contents() + payload_summary = self.payload.analyse_contents(self.config) metadata_contents = self.description.get_contents() task_summary = self.config['github']['task_summary_payload_template'].format( diff --git a/scripts/automated_ingestion/eessi_task_payload.py b/scripts/automated_ingestion/eessi_task_payload.py index 39ab3724..cb39cc81 100644 --- a/scripts/automated_ingestion/eessi_task_payload.py +++ b/scripts/automated_ingestion/eessi_task_payload.py @@ -2,6 +2,7 @@ import tarfile from pathlib import PurePosixPath import os +from typing import Dict from eessi_data_object import EESSIDataAndSignatureObject from utils import log_function_entry_exit @@ -39,7 +40,7 @@ def __init__(self, payload_object: EESSIDataAndSignatureObject): self.signature_verified = self.payload_object.verify_signature() @log_function_entry_exit() - def analyse_contents(self) -> str: + def analyse_contents(self, config: Dict) -> str: """Analyse the contents of the payload and return a summary in a ready-to-use HTML format.""" tar = tarfile.open(self.payload_object.local_file_path, 'r') members = tar.getmembers() @@ -86,7 +87,7 @@ def analyse_contents(self) -> str: members_list = sorted(swdirs + modfiles + other) # Construct the overview - 
overview = self.config['github']['task_summary_payload_overview_template'].format( + overview = config['github']['task_summary_payload_overview_template'].format( tar_num_members=tar_num_members, bucket_url=self.payload_object.remote_client.get_bucket_url(), remote_file_path=self.payload_object.remote_file_path,