From 8dd2e65cdf5039ebc43c43d9c7a26560912ed75a Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Mon, 10 Feb 2025 13:42:40 +1000
Subject: [PATCH 01/17] feat: check PyPI registry when deps.dev fails to find a
 source repository

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/json_tools.py                    |  4 +-
 src/macaron/repo_finder/repo_finder.py       |  7 +-
 src/macaron/repo_finder/repo_finder_enums.py | 14 +++-
 src/macaron/repo_finder/repo_finder_pypi.py  | 70 ++++++++++++++++++++
 4 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 src/macaron/repo_finder/repo_finder_pypi.py

diff --git a/src/macaron/json_tools.py b/src/macaron/json_tools.py
index 3cd7a7d37..a69b0eaa8 100644
--- a/src/macaron/json_tools.py
+++ b/src/macaron/json_tools.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """This module provides utility functions for JSON data."""
@@ -53,5 +53,5 @@ def json_extract(entry: dict | list, keys: Sequence[str | int], type_: type[T])
     if isinstance(entry, type_):
         return entry
 
-    logger.debug("Found value of incorrect type: %s instead of %s.", type(entry), type(type_))
+    logger.debug("Found value of incorrect type: %s instead of %s.", type(entry), type_)
     return None
diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py
index f98f2688e..c5a5cc2cf 100644
--- a/src/macaron/repo_finder/repo_finder.py
+++ b/src/macaron/repo_finder/repo_finder.py
@@ -43,7 +43,7 @@
 from macaron.config.defaults import defaults
 from macaron.config.global_config import global_config
 from macaron.errors import CloneError, RepoCheckOutError
-from macaron.repo_finder import to_domain_from_known_purl_types
+from macaron.repo_finder import repo_finder_pypi, to_domain_from_known_purl_types
 from macaron.repo_finder.commit_finder import find_commit, match_tags
 from macaron.repo_finder.repo_finder_base import BaseRepoFinder
 from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder
@@ -103,6 +103,11 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
     logger.debug("Analyzing %s with Repo Finder: %s", purl, type(repo_finder))
     found_repo, outcome = repo_finder.find_repo(purl)
 
+    if not found_repo and purl.type == "pypi":
+        found_repo, outcome = repo_finder_pypi.find_repo(purl)
+        if not found_repo:
+            logger.debug("Could not find repository from PyPI registry for PURL: %s", purl)
+
     if check_latest_version and not defaults.getboolean("repofinder", "try_latest_purl", fallback=True):
         check_latest_version = False
 
diff --git a/src/macaron/repo_finder/repo_finder_enums.py b/src/macaron/repo_finder/repo_finder_enums.py
index 4d088a5cc..7dff875c6 100644
--- a/src/macaron/repo_finder/repo_finder_enums.py
+++ b/src/macaron/repo_finder/repo_finder_enums.py
@@ -57,6 +57,15 @@ class RepoFinderInfo(Enum):
     #: Reported if deps.dev returns data that does not contain the desired SCM URL. E.g. The repository URL.
     DDEV_NO_URLS = "deps.dev no URLs"
 
+    #: Reported if there was an error with the request sent to the PyPI registry.
+    PYPI_HTTP_ERROR = "PyPI HTTP error"
+
+    #: Reported if there was an error parsing the JSON returned by the PyPI registry.
+    PYPI_JSON_ERROR = "PyPI JSON error"
+
+    #: Reported if there were no matching URLs in the JSON returned by the PyPI registry.
+    PYPI_NO_URLS = "PyPI no matching URLs"
+
     #: Reported if the provided PURL did not produce a result, but a more recent version could not be found.
     NO_NEWER_VERSION = "No newer version than provided which failed"
 
@@ -70,7 +79,10 @@ class RepoFinderInfo(Enum):
     FOUND_FROM_PARENT = "Found from parent"
 
     #: Reported when a repository is found from a more recent version than was provided by the user.
-    FOUND_FROM_LATEST = "Found form latest"
+    FOUND_FROM_LATEST = "Found from latest"
+
+    #: Reported when a repository could only be found by checking the PyPI registry JSON.
+    FOUND_FROM_PYPI = "Found from PyPI"
 
     #: Default value. Reported if the Repo Finder was not called. E.g. Because the repository URL was already present.
     NOT_USED = "Not used"
diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py
new file mode 100644
index 000000000..40c042415
--- /dev/null
+++ b/src/macaron/repo_finder/repo_finder_pypi.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module contains the logic for finding repositories of PyPI projects."""
+import logging
+import urllib.parse
+
+from packageurl import PackageURL
+
+from macaron.errors import InvalidHTTPResponseError
+from macaron.json_tools import json_extract
+from macaron.repo_finder.repo_finder_enums import RepoFinderInfo
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]:
+    """Retrieve the repository URL that matches the given PyPI PURL.
+
+    Parameters
+    ----------
+    purl : PackageURL
+        The parsed PURL to convert to the repository path.
+
+    Returns
+    -------
+    tuple[str, RepoFinderOutcome] :
+        The repository URL for the passed package, if found, and the outcome to report.
+    """
+    # TODO solve circular dependency
+    from macaron.slsa_analyzer.package_registry import PyPIRegistry  # pylint: disable=import-outside-toplevel
+
+    pypi_registry = PyPIRegistry()
+    pypi_registry.load_defaults()
+    json_endpoint = f"pypi/{purl.name}/json"
+    url = urllib.parse.urljoin(pypi_registry.registry_url, json_endpoint)
+    try:
+        json = pypi_registry.download_package_json(url)
+    except InvalidHTTPResponseError as error:
+        logger.debug(error)
+        # TODO improve accuracy of this outcome.
+        return "", RepoFinderInfo.PYPI_HTTP_ERROR
+
+    url_dict = json_extract(json, ["info", "project_urls"], dict)
+    if not url_dict:
+        return "", RepoFinderInfo.PYPI_JSON_ERROR
+
+    for url_key in url_dict:
+        url = url_dict[url_key]
+        parsed_url = urllib.parse.urlparse(url)
+        if not parsed_url.hostname:
+            continue
+        if not parsed_url.hostname.lower() == "github.com":
+            continue
+        split_path = parsed_url.path.split("/")
+        if not split_path or len(split_path) < 3:
+            continue
+        # Fix the URL so that it is the base GitHub URL. E.g. github.com/{owner}/{repo}
+        fixed_url = urllib.parse.ParseResult(
+            scheme=parsed_url.scheme,
+            netloc=parsed_url.netloc,
+            path=f"{split_path[1]}/{split_path[2]}",
+            params=parsed_url.params,
+            query=parsed_url.query,
+            fragment=parsed_url.fragment,
+        ).geturl()
+        logger.debug("Found repository URL from PyPI: %s", fixed_url)
+        return fixed_url, RepoFinderInfo.FOUND_FROM_PYPI
+
+    return "", RepoFinderInfo.PYPI_NO_URLS

From 07ea44d142592f8a83a981953de0d9cc1b21e605 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 11 Feb 2025 08:20:04 +1000
Subject: [PATCH 02/17] chore: avoid circular dependency

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/slsa_analyzer/analyzer.py         |  2 +-
 .../package_registry/jfrog_maven_registry.py  | 29 +----------------
 .../maven_central_registry.py                 | 27 +---------------
 .../package_registry/npm_registry.py          | 32 +------------------
 .../package_registry/package_registry.py      | 13 +++++---
 .../package_registry/pypi_registry.py         | 26 +--------------
 6 files changed, 13 insertions(+), 116 deletions(-)

diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index d17567110..b551d3f77 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -1018,7 +1018,7 @@ def _determine_package_registries(self, analyze_ctx: AnalyzeContext) -> None:
         )
         for package_registry in PACKAGE_REGISTRIES:
             for build_tool in build_tools:
-                if package_registry.is_detected(build_tool):
+                if package_registry.is_detected(build_tool.name):
                     analyze_ctx.dynamic_data["package_registries"].append(
                         PackageRegistryInfo(
                             build_tool=build_tool,
diff --git a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py
index f7a546911..ca0c92ac2 100644
--- a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py
@@ -17,9 +17,6 @@
 from macaron.config.defaults import defaults
 from macaron.errors import ConfigurationError
 from macaron.json_tools import JsonType
-from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
-from macaron.slsa_analyzer.build_tool.gradle import Gradle
-from macaron.slsa_analyzer.build_tool.maven import Maven
 from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry
 
 logger: logging.Logger = logging.getLogger(__name__)
@@ -126,6 +123,7 @@ def __init__(
         self.request_timeout = request_timeout or 10
         self.download_timeout = download_timeout or 120
         self.enabled = enabled or False
+        self.build_tool_names = {"maven", "gradle"}
         super().__init__("JFrog Maven Registry")
 
     def load_defaults(self) -> None:
@@ -173,31 +171,6 @@ def load_defaults(self) -> None:
 
         self.enabled = True
 
-    def is_detected(self, build_tool: BaseBuildTool) -> bool:
-        """Detect if artifacts of the repo under analysis can possibly be published to this package registry.
-
-        The detection here is based on the repo's detected build tool.
-        If the package registry is compatible with the given build tool, it can be a
-        possible place where the artifacts produced from the repo are published.
-
-        ``JFrogMavenRegistry`` is compatible with Maven and Gradle.
-
-        Parameters
-        ----------
-        build_tool : BaseBuildTool
-            A detected build tool of the repository under analysis.
-
-        Returns
-        -------
-        bool
-            ``True`` if the repo under analysis can be published to this package registry,
-            based on the given build tool.
-        """
-        if not self.enabled:
-            return False
-        compatible_build_tool_classes = [Maven, Gradle]
-        return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes)
-
     def fetch_artifact_ids(self, group_id: str) -> list[str]:
         """Get all artifact ids under a group id.
 
diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
index a73ef519c..fc5e3966d 100644
--- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
@@ -12,9 +12,6 @@
 
 from macaron.config.defaults import defaults
 from macaron.errors import ConfigurationError, InvalidHTTPResponseError
-from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
-from macaron.slsa_analyzer.build_tool.gradle import Gradle
-from macaron.slsa_analyzer.build_tool.maven import Maven
 from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry
 from macaron.util import send_get_http_raw
 
@@ -108,6 +105,7 @@ def __init__(
         self.registry_url_scheme = registry_url_scheme or ""
         self.registry_url = ""  # Created from the registry_url_scheme and registry_url_netloc.
         self.request_timeout = request_timeout or 10
+        self.build_tool_names = {"maven", "gradle"}
         super().__init__("Maven Central Registry")
 
     def load_defaults(self) -> None:
@@ -159,29 +157,6 @@ def load_defaults(self) -> None:
                 f"of the .ini configuration file is invalid: {error}",
             ) from error
 
-    def is_detected(self, build_tool: BaseBuildTool) -> bool:
-        """Detect if artifacts of the repo under analysis can possibly be published to this package registry.
-
-        The detection here is based on the repo's detected build tools.
-        If the package registry is compatible with the given build tools, it can be a
-        possible place where the artifacts produced from the repo are published.
-
-        ``MavenCentralRegistry`` is compatible with Maven and Gradle.
-
-        Parameters
-        ----------
-        build_tool : BaseBuildTool
-            A detected build tool of the repository under analysis.
-
-        Returns
-        -------
-        bool
-            ``True`` if the repo under analysis can be published to this package registry,
-            based on the given build tool.
-        """
-        compatible_build_tool_classes = [Maven, Gradle]
-        return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes)
-
     def find_publish_timestamp(self, purl: str) -> datetime:
         """Make a search request to Maven Central to find the publishing timestamp of an artifact.
 
diff --git a/src/macaron/slsa_analyzer/package_registry/npm_registry.py b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
index f200bb5e0..7cbeb7913 100644
--- a/src/macaron/slsa_analyzer/package_registry/npm_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
@@ -12,9 +12,6 @@
 
 from macaron.config.defaults import defaults
 from macaron.errors import ConfigurationError, InvalidHTTPResponseError
-from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
-from macaron.slsa_analyzer.build_tool.npm import NPM
-from macaron.slsa_analyzer.build_tool.yarn import Yarn
 from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry
 from macaron.util import send_get_http_raw
 
@@ -53,6 +50,7 @@ def __init__(
         self.attestation_endpoint = attestation_endpoint or ""
         self.request_timeout = request_timeout or 10
         self.enabled = enabled
+        self.build_tool_names = {"npm", "yarn"}
         super().__init__("npm Registry")
 
     def load_defaults(self) -> None:
@@ -95,34 +93,6 @@ def load_defaults(self) -> None:
                 f"of the .ini configuration file is invalid: {error}",
             ) from error
 
-    def is_detected(self, build_tool: BaseBuildTool) -> bool:
-        """Detect if artifacts under analysis can be published to this package registry.
-
-        The detection here is based on the repo's detected build tools.
-        If the package registry is compatible with the given build tools, it can be a
-        possible place where the artifacts are published.
-
-        ``NPMRegistry`` is compatible with npm and Yarn build tools.
-
-        Note: if the npm registry is disabled through the ini configuration, this method returns False.
-
-        Parameters
-        ----------
-        build_tool : BaseBuildTool
-            A detected build tool of the repository under analysis.
-
-        Returns
-        -------
-        bool
-            ``True`` if the repo under analysis can be published to this package registry,
-            based on the given build tool.
-        """
-        if not self.enabled:
-            logger.debug("Support for the npm registry is disabled.")
-            return False
-        compatible_build_tool_classes = [NPM, Yarn]
-        return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes)
-
     def download_attestation_payload(self, url: str, download_path: str) -> bool:
         """Download the npm attestation from npm registry.
 
diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py
index 146958252..fd943fb3d 100644
--- a/src/macaron/slsa_analyzer/package_registry/package_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py
@@ -9,7 +9,6 @@
 
 from macaron.errors import InvalidHTTPResponseError
 from macaron.json_tools import json_extract
-from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
 from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService
 
 logger: logging.Logger = logging.getLogger(__name__)
@@ -20,13 +19,14 @@ class PackageRegistry(ABC):
 
     def __init__(self, name: str) -> None:
         self.name = name
+        self.build_tool_names: set[str] = set()
+        self.enabled: bool = True
 
     @abstractmethod
     def load_defaults(self) -> None:
         """Load the .ini configuration for the current package registry."""
 
-    @abstractmethod
-    def is_detected(self, build_tool: BaseBuildTool) -> bool:
+    def is_detected(self, build_tool_name: str) -> bool:
         """Detect if artifacts of the repo under analysis can possibly be published to this package registry.
 
         The detection here is based on the repo's detected build tool.
@@ -35,8 +35,8 @@ def is_detected(self, build_tool: BaseBuildTool) -> bool:
 
         Parameters
         ----------
-        build_tool : BaseBuildTool
-            A detected build tool of the repository under analysis.
+        build_tool_name: str
+            The name of a detected build tool of the repository under analysis.
 
         Returns
         -------
@@ -44,6 +44,9 @@ def is_detected(self, build_tool: BaseBuildTool) -> bool:
             ``True`` if the repo under analysis can be published to this package registry,
             based on the given build tool.
         """
+        if not self.enabled:
+            return False
+        return build_tool_name in self.build_tool_names
 
     def find_publish_timestamp(self, purl: str) -> datetime:
         """Retrieve the publication timestamp for a package specified by its purl from the deps.dev repository by default.
diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
index e349663b0..69c35a55e 100644
--- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
@@ -21,8 +21,6 @@
 from macaron.errors import ConfigurationError, InvalidHTTPResponseError
 from macaron.json_tools import json_extract
 from macaron.malware_analyzer.datetime_parser import parse_datetime
-from macaron.slsa_analyzer.build_tool import Pip, Poetry
-from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
 from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry
 from macaron.util import send_get_http_raw
 
@@ -75,6 +73,7 @@ def __init__(
         self.request_timeout = request_timeout or 10
         self.enabled = enabled
         self.registry_url = ""
+        self.build_tool_names = {"pip", "poetry"}
         super().__init__("PyPI Registry")
 
     def load_defaults(self) -> None:
@@ -129,29 +128,6 @@ def load_defaults(self) -> None:
                 f"of the .ini configuration file is invalid: {error}",
             ) from error
 
-    def is_detected(self, build_tool: BaseBuildTool) -> bool:
-        """Detect if artifacts of the repo under analysis can possibly be published to this package registry.
-
-        The detection here is based on the repo's detected build tools.
-        If the package registry is compatible with the given build tools, it can be a
-        possible place where the artifacts produced from the repo are published.
-
-        ``PyPIRegistry`` is compatible with Pip and Poetry.
-
-        Parameters
-        ----------
-        build_tool: BaseBuildTool
-            A detected build tool of the repository under analysis.
-
-        Returns
-        -------
-        bool
-            ``True`` if the repo under analysis can be published to this package registry,
-            based on the given build tool.
-        """
-        compatible_build_tool_classes = [Pip, Poetry]
-        return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes)
-
     def download_package_json(self, url: str) -> dict:
         """Download the package JSON metadata from pypi registry.
 

From 52d3c59f6b771809899dffd45360a361dc90e010 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 11 Feb 2025 08:21:23 +1000
Subject: [PATCH 03/17] chore: also use the alternative repo finder for the
 latest purl version

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/repo_finder/repo_finder.py      | 24 +++++++++++++++++----
 src/macaron/repo_finder/repo_finder_pypi.py | 11 +++++-----
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py
index c5a5cc2cf..9f367ee14 100644
--- a/src/macaron/repo_finder/repo_finder.py
+++ b/src/macaron/repo_finder/repo_finder.py
@@ -103,10 +103,8 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
     logger.debug("Analyzing %s with Repo Finder: %s", purl, type(repo_finder))
     found_repo, outcome = repo_finder.find_repo(purl)
 
-    if not found_repo and purl.type == "pypi":
-        found_repo, outcome = repo_finder_pypi.find_repo(purl)
-        if not found_repo:
-            logger.debug("Could not find repository from PyPI registry for PURL: %s", purl)
+    if not found_repo:
+        found_repo, outcome = find_repo_alternative(purl, outcome)
 
     if check_latest_version and not defaults.getboolean("repofinder", "try_latest_purl", fallback=True):
         check_latest_version = False
@@ -122,6 +120,12 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
         return "", RepoFinderInfo.NO_NEWER_VERSION
 
     found_repo, outcome = DepsDevRepoFinder().find_repo(latest_version_purl)
+    if found_repo:
+        return found_repo, outcome
+
+    if not found_repo:
+        found_repo, outcome = find_repo_alternative(latest_version_purl, outcome)
+
     if not found_repo:
         logger.debug("Could not find repo from latest version of PURL: %s", latest_version_purl)
         return "", RepoFinderInfo.LATEST_VERSION_INVALID
@@ -129,6 +133,18 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
     return found_repo, outcome
 
 
+def find_repo_alternative(purl: PackageURL, outcome: RepoFinderInfo) -> tuple[str, RepoFinderInfo]:
+    """Use PURL type specific methods to find the repository when the standard methods have failed."""
+    found_repo = ""
+    if purl.type == "pypi":
+        found_repo, outcome = repo_finder_pypi.find_repo(purl)
+
+    if not found_repo:
+        logger.debug("Could not find repository using type specific (%s) methods for PURL: %s", purl.type, purl)
+
+    return found_repo, outcome
+
+
 def to_repo_path(purl: PackageURL, available_domains: list[str]) -> str | None:
     """Return the repository path from the PURL string.
 
diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py
index 40c042415..70722310f 100644
--- a/src/macaron/repo_finder/repo_finder_pypi.py
+++ b/src/macaron/repo_finder/repo_finder_pypi.py
@@ -10,6 +10,7 @@
 from macaron.errors import InvalidHTTPResponseError
 from macaron.json_tools import json_extract
 from macaron.repo_finder.repo_finder_enums import RepoFinderInfo
+from macaron.slsa_analyzer.package_registry import PyPIRegistry
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -27,9 +28,6 @@ def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]:
     tuple[str, RepoFinderOutcome] :
         The repository URL for the passed package, if found, and the outcome to report.
     """
-    # TODO solve circular dependency
-    from macaron.slsa_analyzer.package_registry import PyPIRegistry  # pylint: disable=import-outside-toplevel
-
     pypi_registry = PyPIRegistry()
     pypi_registry.load_defaults()
     json_endpoint = f"pypi/{purl.name}/json"
@@ -52,14 +50,15 @@ def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]:
             continue
         if not parsed_url.hostname.lower() == "github.com":
             continue
-        split_path = parsed_url.path.split("/")
-        if not split_path or len(split_path) < 3:
+        # The path starts with a "/".
+        split_path = parsed_url.path[1:].split("/")
+        if not split_path or len(split_path) < 2:
             continue
         # Fix the URL so that it is the base GitHub URL. E.g. github.com/{owner}/{repo}
         fixed_url = urllib.parse.ParseResult(
             scheme=parsed_url.scheme,
             netloc=parsed_url.netloc,
-            path=f"{split_path[1]}/{split_path[2]}",
+            path=f"{split_path[0]}/{split_path[1]}",
             params=parsed_url.params,
             query=parsed_url.query,
             fragment=parsed_url.fragment,

From cd724cf248777082d0ecac6673cabe5e1b34d832 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 11 Feb 2025 08:50:53 +1000
Subject: [PATCH 04/17] chore: add integration test

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../cases/repo_finder_pypi/policy.dl          | 10 ++++++++++
 .../cases/repo_finder_pypi/test.yaml          | 20 +++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 tests/integration/cases/repo_finder_pypi/policy.dl
 create mode 100644 tests/integration/cases/repo_finder_pypi/test.yaml

diff --git a/tests/integration/cases/repo_finder_pypi/policy.dl b/tests/integration/cases/repo_finder_pypi/policy.dl
new file mode 100644
index 000000000..38b2dd9f4
--- /dev/null
+++ b/tests/integration/cases/repo_finder_pypi/policy.dl
@@ -0,0 +1,10 @@
+/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. */
+/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */
+
+#include "prelude.dl"
+
+Policy("test_policy", component_id, "") :-
+    check_passed(component_id, "mcn_version_control_system_1").
+
+apply_policy_to("test_policy", component_id) :-
+    is_component(component_id, "pkg:pypi/torch@2.6.0").
diff --git a/tests/integration/cases/repo_finder_pypi/test.yaml b/tests/integration/cases/repo_finder_pypi/test.yaml
new file mode 100644
index 000000000..d3cf1c557
--- /dev/null
+++ b/tests/integration/cases/repo_finder_pypi/test.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+description: |
+  Analyzing a PyPI PURL whose repository is not correctly found by deps.dev and must be sought on the package registry directly.
+
+tags:
+- macaron-python-package
+
+steps:
+- name: Run macaron analyze
+  kind: analyze
+  options:
+    command_args:
+    - -purl
+    - pkg:pypi/torch@2.6.0
+- name: Run macaron verify-policy to verify passed/failed checks
+  kind: verify
+  options:
+    policy: policy.dl

From c7492fba6ce31ed6bfa6f7aeb145ec20e8000183 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 11 Feb 2025 08:57:37 +1000
Subject: [PATCH 05/17] chore: update tests

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../package_registry/test_jfrog_maven_registry.py        | 6 +++---
 .../package_registry/test_maven_central_registry.py      | 6 ++----
 .../slsa_analyzer/package_registry/test_npm_registry.py  | 9 +++------
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py b/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py
index ebb960366..ef7276dcf 100644
--- a/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py
+++ b/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """Tests for the ``JFrogMavenRegistry`` class."""
@@ -129,12 +129,12 @@ def test_is_detected(
     expected_result: bool,
 ) -> None:
     """Test the ``is_detected`` method."""
-    assert jfrog_maven.is_detected(build_tool) == expected_result
+    assert jfrog_maven.is_detected(build_tool.name) == expected_result
 
     # The method always returns False when the jfrog_maven instance is not enabled
     # (in the ini config).
     jfrog_maven.enabled = False
-    assert jfrog_maven.is_detected(build_tool) is False
+    assert jfrog_maven.is_detected(build_tool.name) is False
 
 
 @pytest.mark.parametrize(
diff --git a/tests/slsa_analyzer/package_registry/test_maven_central_registry.py b/tests/slsa_analyzer/package_registry/test_maven_central_registry.py
index 8a0287b36..62b9fdca0 100644
--- a/tests/slsa_analyzer/package_registry/test_maven_central_registry.py
+++ b/tests/slsa_analyzer/package_registry/test_maven_central_registry.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """Tests for the Maven Central registry."""
@@ -14,7 +14,6 @@
 
 from macaron.config.defaults import load_defaults
 from macaron.errors import ConfigurationError, InvalidHTTPResponseError
-from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
 from macaron.slsa_analyzer.package_registry.maven_central_registry import MavenCentralRegistry
 
 
@@ -124,12 +123,11 @@ def test_load_defaults_with_invalid_config(tmp_path: Path, user_config_input: st
 )
 def test_is_detected(
     maven_central: MavenCentralRegistry,
-    build_tools: dict[str, BaseBuildTool],
     build_tool_name: str,
     expected_result: bool,
 ) -> None:
     """Test the ``is_detected`` method."""
-    assert maven_central.is_detected(build_tools[build_tool_name]) == expected_result
+    assert maven_central.is_detected(build_tool_name) == expected_result
 
 
 @pytest.mark.parametrize(
diff --git a/tests/slsa_analyzer/package_registry/test_npm_registry.py b/tests/slsa_analyzer/package_registry/test_npm_registry.py
index a6cadb4ba..a180ea78b 100644
--- a/tests/slsa_analyzer/package_registry/test_npm_registry.py
+++ b/tests/slsa_analyzer/package_registry/test_npm_registry.py
@@ -13,7 +13,6 @@
 
 from macaron.config.defaults import load_defaults
 from macaron.errors import ConfigurationError, InvalidHTTPResponseError
-from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
 from macaron.slsa_analyzer.build_tool.npm import NPM
 from macaron.slsa_analyzer.package_registry.npm_registry import NPMAttestationAsset, NPMRegistry
 
@@ -45,7 +44,7 @@ def test_disable_npm_registry(npm_registry: NPMRegistry, tmp_path: Path, npm_too
     npm_registry.load_defaults()
 
     assert npm_registry.enabled is False
-    assert npm_registry.is_detected(build_tool=npm_tool) is False
+    assert npm_registry.is_detected(npm_tool.name) is False
 
 
 @pytest.mark.parametrize(
@@ -87,12 +86,10 @@ def test_npm_registry_invalid_config(npm_registry: NPMRegistry, tmp_path: Path,
         ("maven", False),
     ],
 )
-def test_is_detected(
-    npm_registry: NPMRegistry, build_tools: dict[str, BaseBuildTool], build_tool_name: str, expected: bool
-) -> None:
+def test_is_detected(npm_registry: NPMRegistry, build_tool_name: str, expected: bool) -> None:
     """Test that the registry is correctly detected for a build tool."""
     npm_registry.load_defaults()
-    assert npm_registry.is_detected(build_tool=build_tools[build_tool_name]) == expected
+    assert npm_registry.is_detected(build_tool_name) == expected
 
 
 @pytest.mark.parametrize(

From e38975a39edb3ed17e2ee4d29574016501141fa5 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 11 Feb 2025 09:04:38 +1000
Subject: [PATCH 06/17] chore: pass build tool names to super class

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../slsa_analyzer/package_registry/jfrog_maven_registry.py | 3 +--
 .../package_registry/maven_central_registry.py             | 3 +--
 src/macaron/slsa_analyzer/package_registry/npm_registry.py | 3 +--
 .../slsa_analyzer/package_registry/package_registry.py     | 7 +++++--
 .../slsa_analyzer/package_registry/pypi_registry.py        | 3 +--
 5 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py
index ca0c92ac2..02188de1d 100644
--- a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py
@@ -123,8 +123,7 @@ def __init__(
         self.request_timeout = request_timeout or 10
         self.download_timeout = download_timeout or 120
         self.enabled = enabled or False
-        self.build_tool_names = {"maven", "gradle"}
-        super().__init__("JFrog Maven Registry")
+        super().__init__("JFrog Maven Registry", {"maven", "gradle"})
 
     def load_defaults(self) -> None:
         """Load the .ini configuration for the current package registry.
diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
index fc5e3966d..131051b66 100644
--- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
@@ -105,8 +105,7 @@ def __init__(
         self.registry_url_scheme = registry_url_scheme or ""
         self.registry_url = ""  # Created from the registry_url_scheme and registry_url_netloc.
         self.request_timeout = request_timeout or 10
-        self.build_tool_names = {"maven", "gradle"}
-        super().__init__("Maven Central Registry")
+        super().__init__("Maven Central Registry", {"maven", "gradle"})
 
     def load_defaults(self) -> None:
         """Load the .ini configuration for the current package registry.
diff --git a/src/macaron/slsa_analyzer/package_registry/npm_registry.py b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
index 7cbeb7913..fe009cc34 100644
--- a/src/macaron/slsa_analyzer/package_registry/npm_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
@@ -50,8 +50,7 @@ def __init__(
         self.attestation_endpoint = attestation_endpoint or ""
         self.request_timeout = request_timeout or 10
         self.enabled = enabled
-        self.build_tool_names = {"npm", "yarn"}
-        super().__init__("npm Registry")
+        super().__init__("npm Registry", {"npm", "yarn"})
 
     def load_defaults(self) -> None:
         """Load the .ini configuration for the current package registry.
diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py
index fd943fb3d..7fbbf4258 100644
--- a/src/macaron/slsa_analyzer/package_registry/package_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py
@@ -17,9 +17,9 @@
 class PackageRegistry(ABC):
     """Base package registry class."""
 
-    def __init__(self, name: str) -> None:
+    def __init__(self, name: str, build_tool_names: set[str]) -> None:
         self.name = name
-        self.build_tool_names: set[str] = set()
+        self.build_tool_names = build_tool_names
         self.enabled: bool = True
 
     @abstractmethod
@@ -44,6 +44,9 @@ def is_detected(self, build_tool_name: str) -> bool:
             ``True`` if the repo under analysis can be published to this package registry,
             based on the given build tool.
         """
+        print()
+        print(f"{build_tool_name} in {self.build_tool_names} ?")
+        print()
         if not self.enabled:
             return False
         return build_tool_name in self.build_tool_names
diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
index 69c35a55e..f8316dc8f 100644
--- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
@@ -73,8 +73,7 @@ def __init__(
         self.request_timeout = request_timeout or 10
         self.enabled = enabled
         self.registry_url = ""
-        self.build_tool_names = {"pip", "poetry"}
-        super().__init__("PyPI Registry")
+        super().__init__("PyPI Registry", {"pip", "poetry"})
 
     def load_defaults(self) -> None:
         """Load the .ini configuration for the current package registry.

From 93982955c55a89cba7c24541ceec8631fed749d5 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 13 Feb 2025 14:52:39 +1000
Subject: [PATCH 07/17] chore: reuse PyPI JSON asset

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../metadata/closer_release_join_date.py      |  2 +-
 src/macaron/repo_finder/repo_finder.py        | 32 +++++++++--
 src/macaron/repo_finder/repo_finder_enums.py  |  3 +
 src/macaron/repo_finder/repo_finder_pypi.py   | 42 +++++++++-----
 src/macaron/slsa_analyzer/analyzer.py         | 57 ++++++++++++++-----
 .../checks/detect_malicious_metadata_check.py | 19 +++++--
 .../package_registry/package_registry.py      |  3 -
 .../package_registry/pypi_registry.py         | 14 +++--
 .../specs/package_registry_spec.py            |  5 +-
 9 files changed, 127 insertions(+), 50 deletions(-)

diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py
index 4ff41a619..bfa9a0704 100644
--- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py
+++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py
@@ -95,7 +95,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
             The result and related information collected during the analysis.
         """
         maintainers_join_date: list[datetime] | None = self._get_maintainers_join_date(
-            pypi_package_json.pypi_registry, pypi_package_json.component.name
+            pypi_package_json.pypi_registry, pypi_package_json.component_name
         )
         latest_release_date: datetime | None = self._get_latest_release_date(pypi_package_json)
         detail_info: dict[str, JsonType] = {
diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py
index 9f367ee14..081ff68cf 100644
--- a/src/macaron/repo_finder/repo_finder.py
+++ b/src/macaron/repo_finder/repo_finder.py
@@ -66,11 +66,14 @@
     list_remote_references,
     resolve_local_path,
 )
+from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
 
 logger: logging.Logger = logging.getLogger(__name__)
 
 
-def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, RepoFinderInfo]:
+def find_repo(
+    purl: PackageURL, check_latest_version: bool = True, all_package_registries: list[PackageRegistryInfo] | None = None
+) -> tuple[str, RepoFinderInfo]:
     """Retrieve the repository URL that matches the given PURL.
 
     Parameters
@@ -79,6 +82,8 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
         The parsed PURL to convert to the repository path.
     check_latest_version: bool
         A flag that determines whether the latest version of the PURL is also checked.
+    all_package_registries: list[PackageRegistryInfo] | None
+        The list of package registries, if any.
 
     Returns
     -------
@@ -104,7 +109,7 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
     found_repo, outcome = repo_finder.find_repo(purl)
 
     if not found_repo:
-        found_repo, outcome = find_repo_alternative(purl, outcome)
+        found_repo, outcome = find_repo_alternative(purl, outcome, all_package_registries)
 
     if check_latest_version and not defaults.getboolean("repofinder", "try_latest_purl", fallback=True):
         check_latest_version = False
@@ -133,11 +138,28 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str,
     return found_repo, outcome
 
 
-def find_repo_alternative(purl: PackageURL, outcome: RepoFinderInfo) -> tuple[str, RepoFinderInfo]:
-    """Use PURL type specific methods to find the repository when the standard methods have failed."""
+def find_repo_alternative(
+    purl: PackageURL, outcome: RepoFinderInfo, all_package_registries: list[PackageRegistryInfo] | None = None
+) -> tuple[str, RepoFinderInfo]:
+    """Use PURL type specific methods to find the repository when the standard methods have failed.
+
+    Parameters
+    ----------
+    purl : PackageURL
+        The parsed PURL to convert to the repository path.
+    outcome: RepoFinderInfo
+        A previous outcome to report if this method does nothing.
+    all_package_registries: list[PackageRegistryInfo] | None
+        The list of package registries, if any.
+
+    Returns
+    -------
+    tuple[str, RepoFinderInfo] :
+        The repository URL for the passed package, if found, and the outcome to report.
+    """
     found_repo = ""
     if purl.type == "pypi":
-        found_repo, outcome = repo_finder_pypi.find_repo(purl)
+        found_repo, outcome = repo_finder_pypi.find_repo(purl, all_package_registries)
 
     if not found_repo:
         logger.debug("Could not find repository using type specific (%s) methods for PURL: %s", purl.type, purl)
diff --git a/src/macaron/repo_finder/repo_finder_enums.py b/src/macaron/repo_finder/repo_finder_enums.py
index 7dff875c6..43e8d5e8b 100644
--- a/src/macaron/repo_finder/repo_finder_enums.py
+++ b/src/macaron/repo_finder/repo_finder_enums.py
@@ -66,6 +66,9 @@ class RepoFinderInfo(Enum):
     #: Reported if there was no matching URLs in the JSON returned by the PyPI registry.
     PYPI_NO_URLS = "PyPI no matching URLs"
 
+    #: Reported if the PyPI registry is disabled or not present in the list of package registries.
+    PYPI_NO_REGISTRY = "PyPI registry disabled or absent"
+
     #: Reported if the provided PURL did not produce a result, but a more recent version could not be found.
     NO_NEWER_VERSION = "No newer version than provided which failed"
 
diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py
index 70722310f..537e3297d 100644
--- a/src/macaron/repo_finder/repo_finder_pypi.py
+++ b/src/macaron/repo_finder/repo_finder_pypi.py
@@ -7,39 +7,55 @@
 
 from packageurl import PackageURL
 
-from macaron.errors import InvalidHTTPResponseError
-from macaron.json_tools import json_extract
 from macaron.repo_finder.repo_finder_enums import RepoFinderInfo
-from macaron.slsa_analyzer.package_registry import PyPIRegistry
+from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, PyPIRegistry
+from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
+from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
 
 logger: logging.Logger = logging.getLogger(__name__)
 
 
-def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]:
+def find_repo(
+    purl: PackageURL, all_package_registries: list[PackageRegistryInfo] | None = None
+) -> tuple[str, RepoFinderInfo]:
     """Retrieve the repository URL that matches the given PyPI PURL.
 
     Parameters
     ----------
     purl : PackageURL
         The parsed PURL to convert to the repository path.
+    all_package_registries: list[PackageRegistryInfo] | None
+        The list of package registries in which the downloaded asset can be saved, if any.
 
     Returns
     -------
     tuple[str, RepoFinderOutcome] :
         The repository URL for the passed package, if found, and the outcome to report.
     """
-    pypi_registry = PyPIRegistry()
+    pypi_registry = next((registry for registry in PACKAGE_REGISTRIES if isinstance(registry, PyPIRegistry)), None)
+    if not pypi_registry:
+        return "", RepoFinderInfo.PYPI_NO_REGISTRY
+
     pypi_registry.load_defaults()
-    json_endpoint = f"pypi/{purl.name}/json"
-    url = urllib.parse.urljoin(pypi_registry.registry_url, json_endpoint)
-    try:
-        json = pypi_registry.download_package_json(url)
-    except InvalidHTTPResponseError as error:
-        logger.debug(error)
-        # TODO improve accuracy of this outcome.
+    pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, pypi_registry, {})
+    if not pypi_asset.download(dest=""):
         return "", RepoFinderInfo.PYPI_HTTP_ERROR
 
-    url_dict = json_extract(json, ["info", "project_urls"], dict)
+    if all_package_registries:
+        # Find the package registry info object that contains the PyPI registry and has the pypi build tool.
+        registry_info = next(
+            (
+                info
+                for info in all_package_registries
+                if info.package_registry == pypi_registry and info.build_tool_name == "pypi"
+            ),
+            None,
+        )
+        if registry_info:
+            # Save the asset for later use.
+            registry_info.metadata.append(pypi_asset)
+
+    url_dict = pypi_asset.get_project_links()
     if not url_dict:
         return "", RepoFinderInfo.PYPI_JSON_ERROR
 
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index b551d3f77..e9bece9ca 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -353,6 +353,9 @@ def run_single(
                 status=SCMStatus.ANALYSIS_FAILED,
             )
 
+        # Pre-populate all package registries so assets can be stored for later.
+        all_package_registries = self._populate_package_registry_info()
+
         provenance_is_verified = False
         if not provenance_payload and parsed_purl:
             # Try to find the provenance file for the parsed PURL.
@@ -385,7 +388,12 @@ def run_single(
         available_domains = [git_service.hostname for git_service in GIT_SERVICES if git_service.hostname]
         try:
             analysis_target = Analyzer.to_analysis_target(
-                config, available_domains, parsed_purl, provenance_repo_url, provenance_commit_digest
+                config,
+                available_domains,
+                parsed_purl,
+                provenance_repo_url,
+                provenance_commit_digest,
+                all_package_registries,
             )
         except InvalidAnalysisTargetError as error:
             return Record(
@@ -474,7 +482,7 @@ def run_single(
         self._determine_build_tools(analyze_ctx, git_service)
         if parsed_purl is not None:
             self._verify_repository_link(parsed_purl, analyze_ctx)
-        self._determine_package_registries(analyze_ctx)
+        self._determine_package_registries(analyze_ctx, all_package_registries)
 
         provenance_l3_verified = False
         if not provenance_payload:
@@ -802,6 +810,7 @@ def to_analysis_target(
         parsed_purl: PackageURL | None,
         provenance_repo_url: str | None = None,
         provenance_commit_digest: str | None = None,
+        all_package_registries: list[PackageRegistryInfo] | None = None,
     ) -> AnalysisTarget:
         """Resolve the details of a software component from user input.
 
@@ -818,6 +827,8 @@ def to_analysis_target(
             The repository URL extracted from provenance, or None if not found or no provenance.
         provenance_commit_digest: str | None
             The commit extracted from provenance, or None if not found or no provenance.
+        all_package_registries: list[PackageRegistryInfo] | None
+            The list of all package registries.
 
         Returns
         -------
@@ -860,7 +871,9 @@ def to_analysis_target(
                     converted_repo_path = repo_finder.to_repo_path(parsed_purl, available_domains)
                     if converted_repo_path is None:
                         # Try to find repo from PURL
-                        repo, repo_finder_outcome = repo_finder.find_repo(parsed_purl)
+                        repo, repo_finder_outcome = repo_finder.find_repo(
+                            parsed_purl, all_package_registries=all_package_registries
+                        )
 
                 return Analyzer.AnalysisTarget(
                     parsed_purl=parsed_purl,
@@ -1011,20 +1024,38 @@ def _determine_ci_services(self, analyze_ctx: AnalyzeContext, git_service: BaseG
                     )
                 )
 
-    def _determine_package_registries(self, analyze_ctx: AnalyzeContext) -> None:
+    def _populate_package_registry_info(self) -> list[PackageRegistryInfo]:
+        """Build an info entry for every package registry and supported build tool pairing."""
+        package_registries = []
+        for package_registry in PACKAGE_REGISTRIES:
+            for build_tool in BUILD_TOOLS:
+                build_tool_name = build_tool.name
+                if build_tool_name not in package_registry.build_tool_names:
+                    continue
+                package_registries.append(
+                    PackageRegistryInfo(
+                        build_tool_name=build_tool_name,
+                        package_registry=package_registry,
+                    )
+                )
+        return package_registries
+
+    def _determine_package_registries(
+        self, analyze_ctx: AnalyzeContext, all_package_registries: list[PackageRegistryInfo]
+    ) -> None:
         """Determine the package registries used by the software component based on its build tools."""
         build_tools = (
             analyze_ctx.dynamic_data["build_spec"]["tools"] or analyze_ctx.dynamic_data["build_spec"]["purl_tools"]
         )
-        for package_registry in PACKAGE_REGISTRIES:
-            for build_tool in build_tools:
-                if package_registry.is_detected(build_tool.name):
-                    analyze_ctx.dynamic_data["package_registries"].append(
-                        PackageRegistryInfo(
-                            build_tool=build_tool,
-                            package_registry=package_registry,
-                        )
-                    )
+        build_tool_names = {build_tool.name for build_tool in build_tools}
+        relevant_package_registries = []
+        for package_registry in all_package_registries:
+            if package_registry.build_tool_name not in build_tool_names:
+                continue
+            relevant_package_registries.append(package_registry)
+
+        # Assign the updated list of registries.
+        analyze_ctx.dynamic_data["package_registries"] = relevant_package_registries
 
     def _verify_repository_link(self, parsed_purl: PackageURL, analyze_ctx: AnalyzeContext) -> None:
         """Verify whether the claimed repository links back to the artifact."""
diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
index 80439bb79..26ae8937e 100644
--- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
+++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
@@ -29,8 +29,6 @@
 from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer
 from macaron.slsa_analyzer.analyze_context import AnalyzeContext
-from macaron.slsa_analyzer.build_tool.pip import Pip
-from macaron.slsa_analyzer.build_tool.poetry import Poetry
 from macaron.slsa_analyzer.checks.base_check import BaseCheck
 from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType
 from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService
@@ -260,14 +258,23 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
             match package_registry_info_entry:
                 # Currently, only PyPI packages are supported.
                 case PackageRegistryInfo(
-                    build_tool=Pip() | Poetry(),
+                    build_tool_name="pip" | "poetry",
                     package_registry=PyPIRegistry() as pypi_registry,
                 ) as pypi_registry_info:
 
-                    # Create an AssetLocator object for the PyPI package JSON object.
-                    pypi_package_json = PyPIPackageJsonAsset(
-                        component=ctx.component, pypi_registry=pypi_registry, package_json={}
+                    # Retrieve the pre-existing AssetLocator object for the PyPI package JSON object, if it exists.
+                    pypi_package_json = next(
+                        (asset for asset in pypi_registry_info.metadata if isinstance(asset, PyPIPackageJsonAsset)),
+                        None,
                     )
+                    if not pypi_package_json:
+                        # Create an AssetLocator object for the PyPI package JSON object.
+                        pypi_package_json = PyPIPackageJsonAsset(
+                            component_name=ctx.component.name,
+                            component_version=ctx.component.version,
+                            pypi_registry=pypi_registry,
+                            package_json={},
+                        )
 
                     pypi_registry_info.metadata.append(pypi_package_json)
 
diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py
index 7fbbf4258..9e71fc595 100644
--- a/src/macaron/slsa_analyzer/package_registry/package_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py
@@ -44,9 +44,6 @@ def is_detected(self, build_tool_name: str) -> bool:
             ``True`` if the repo under analysis can be published to this package registry,
             based on the given build tool.
         """
-        print()
-        print(f"{build_tool_name} in {self.build_tool_names} ?")
-        print()
         if not self.enabled:
             return False
         return build_tool_name in self.build_tool_names
diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
index f8316dc8f..77d2c8b8a 100644
--- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
@@ -17,7 +17,6 @@
 from requests import RequestException
 
 from macaron.config.defaults import defaults
-from macaron.database.table_definitions import Component
 from macaron.errors import ConfigurationError, InvalidHTTPResponseError
 from macaron.json_tools import json_extract
 from macaron.malware_analyzer.datetime_parser import parse_datetime
@@ -341,8 +340,11 @@ def get_maintainer_join_date(self, username: str) -> datetime | None:
 class PyPIPackageJsonAsset:
     """The package JSON hosted on the PyPI registry."""
 
-    #: The target pypi software component.
-    component: Component
+    #: The target pypi software component name.
+    component_name: str
+
+    #: The target pypi software component version.
+    component_version: str | None
 
     #: The pypi registry.
     pypi_registry: PyPIRegistry
@@ -372,7 +374,7 @@ def url(self) -> str:
         -------
         str
         """
-        json_endpoint = f"pypi/{self.component.name}/json"
+        json_endpoint = f"pypi/{self.component_name}/json"
         return urllib.parse.urljoin(self.pypi_registry.registry_url, json_endpoint)
 
     def download(self, dest: str) -> bool:  # pylint: disable=unused-argument
@@ -434,8 +436,8 @@ def get_sourcecode_url(self) -> str | None:
             The URL of the source distribution.
         """
         urls: list | None = None
-        if self.component.version:
-            urls = json_extract(self.package_json, ["releases", self.component.version], list)
+        if self.component_version:
+            urls = json_extract(self.package_json, ["releases", self.component_version], list)
         else:
             # Get the latest version.
             urls = json_extract(self.package_json, ["urls"], list)
diff --git a/src/macaron/slsa_analyzer/specs/package_registry_spec.py b/src/macaron/slsa_analyzer/specs/package_registry_spec.py
index e28d9c6d8..ecd91d2b8 100644
--- a/src/macaron/slsa_analyzer/specs/package_registry_spec.py
+++ b/src/macaron/slsa_analyzer/specs/package_registry_spec.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 
@@ -7,7 +7,6 @@
 from dataclasses import dataclass, field
 
 from macaron.slsa_analyzer.asset import AssetLocator
-from macaron.slsa_analyzer.build_tool import BaseBuildTool
 from macaron.slsa_analyzer.package_registry import PackageRegistry
 from macaron.slsa_analyzer.provenance.provenance import DownloadedProvenanceData
 
@@ -17,7 +16,7 @@ class PackageRegistryInfo:
     """This class contains data for one package registry that is matched against a repository."""
 
     #: The build tool matched against the repository.
-    build_tool: BaseBuildTool
+    build_tool_name: str
     #: The package registry matched against the repository. This is dependent on the build tool detected.
     package_registry: PackageRegistry
     #: The provenances matched against the current repo.

From ae950efea01af3d08958d003889684c1a5330e9e Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 13 Feb 2025 15:24:22 +1000
Subject: [PATCH 08/17] chore: update tests

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../pypi_heuristics/metadata/wheel_absence.py              | 2 +-
 .../malware_analyzer/pypi/test_closer_release_join_date.py | 5 ++++-
 tests/malware_analyzer/pypi/test_wheel_absence.py          | 7 ++++---
 .../checks/test_detect_malicious_metadata_check.py         | 5 ++---
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py
index 2a8217353..3a3033e22 100644
--- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py
+++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py
@@ -61,7 +61,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
             logger.debug(error_msg)
             raise HeuristicAnalyzerValueError(error_msg)
 
-        version = pypi_package_json.component.version
+        version = pypi_package_json.component_version
         if version is None:  # check latest release version
             version = pypi_package_json.get_latest_version()
 
diff --git a/tests/malware_analyzer/pypi/test_closer_release_join_date.py b/tests/malware_analyzer/pypi/test_closer_release_join_date.py
index 4ed1a9b24..309574a21 100644
--- a/tests/malware_analyzer/pypi/test_closer_release_join_date.py
+++ b/tests/malware_analyzer/pypi/test_closer_release_join_date.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """Tests for closer release join date heuristic."""
@@ -17,6 +17,7 @@ def test_analyze_pass(pypi_package_json: MagicMock) -> None:
     pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = ["maintainer1", "maintainer2"]
     pypi_package_json.pypi_registry.get_maintainer_join_date.side_effect = [datetime(2018, 1, 1), datetime(2019, 1, 1)]
     pypi_package_json.get_latest_release_upload_time.return_value = "2022-06-20T12:00:00"
+    pypi_package_json.component_name = "mock1"
 
     # Call the method.
     result, detail_info = analyzer.analyze(pypi_package_json)
@@ -35,6 +36,7 @@ def test_analyze_process(pypi_package_json: MagicMock) -> None:
     pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = ["maintainer1"]
     pypi_package_json.pypi_registry.get_maintainer_join_date.side_effect = [datetime(2022, 6, 18)]
     pypi_package_json.get_latest_release_upload_time.return_value = "2022-06-20T12:00:00"
+    pypi_package_json.component_name = "mock1"
 
     # Call the method.
     result, detail_info = analyzer.analyze(pypi_package_json)
@@ -52,6 +54,7 @@ def test_analyze_skip(pypi_package_json: MagicMock) -> None:
     # Set up mock return values.
     pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = None
     pypi_package_json.get_latest_release_upload_time.return_value = "2022-06-20T12:00:00"
+    pypi_package_json.component_name = "mock1"
 
     # Call the method.
     result, detail_info = analyzer.analyze(pypi_package_json)
diff --git a/tests/malware_analyzer/pypi/test_wheel_absence.py b/tests/malware_analyzer/pypi/test_wheel_absence.py
index a2eebd554..3cfccfbe7 100644
--- a/tests/malware_analyzer/pypi/test_wheel_absence.py
+++ b/tests/malware_analyzer/pypi/test_wheel_absence.py
@@ -67,10 +67,11 @@ def test_analyze_tar_present(mock_send_head_http_raw: MagicMock, pypi_package_js
 
     pypi_package_json.get_releases.return_value = release
     pypi_package_json.get_latest_version.return_value = version
-    pypi_package_json.component.version = None
+    pypi_package_json.component_version = None
     pypi_package_json.package_json = {"info": {"name": "ttttttttest_nester"}}
     pypi_package_json.pypi_registry.inspector_url_scheme = "https"
     pypi_package_json.pypi_registry.inspector_url_netloc = "inspector.pypi.io"
+
     mock_send_head_http_raw.return_value = MagicMock()  # assume valid URL for testing purposes
 
     expected_detail_info = {
@@ -126,7 +127,7 @@ def test_analyze_whl_present(mock_send_head_http_raw: MagicMock, pypi_package_js
     }
 
     pypi_package_json.get_releases.return_value = release
-    pypi_package_json.component.version = version
+    pypi_package_json.component_version = version
     pypi_package_json.package_json = {"info": {"name": "ttttttttest_nester"}}
     pypi_package_json.pypi_registry.inspector_url_scheme = "https"
     pypi_package_json.pypi_registry.inspector_url_netloc = "inspector.pypi.io"
@@ -214,7 +215,7 @@ def test_analyze_both_present(mock_send_head_http_raw: MagicMock, pypi_package_j
     }
 
     pypi_package_json.get_releases.return_value = release
-    pypi_package_json.component.version = version
+    pypi_package_json.component_version = version
     pypi_package_json.package_json = {"info": {"name": "ttttttttest_nester"}}
     pypi_package_json.pypi_registry.inspector_url_scheme = "https"
     pypi_package_json.pypi_registry.inspector_url_netloc = "inspector.pypi.io"
diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py
index c6ecb044d..8f15c636a 100644
--- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py
+++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py
@@ -12,7 +12,6 @@
 from pytest_httpserver import HTTPServer
 
 from macaron.config.defaults import load_defaults
-from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
 from macaron.slsa_analyzer.checks.check_result import CheckResultType
 from macaron.slsa_analyzer.checks.detect_malicious_metadata_check import DetectMaliciousMetadataCheck
 from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIRegistry
@@ -35,7 +34,7 @@
     ],
 )
 def test_detect_malicious_metadata(
-    httpserver: HTTPServer, tmp_path: Path, pip_tool: BaseBuildTool, macaron_path: Path, purl: str, expected: str
+    httpserver: HTTPServer, tmp_path: Path, macaron_path: Path, purl: str, expected: str
 ) -> None:
     """Test that the check handles repositories correctly."""
     check = DetectMaliciousMetadataCheck()
@@ -43,7 +42,7 @@ def test_detect_malicious_metadata(
     # Set up the context object with PyPIRegistry instance.
     ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl)
     pypi_registry = PyPIRegistry()
-    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(pip_tool, pypi_registry)]
+    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", pypi_registry)]
 
     # Set up responses of PyPI endpoints using the httpserver plugin.
     with open(os.path.join(RESOURCE_PATH, "pypi_files", "zlibxjson.html"), encoding="utf8") as page:

From bc03bbb99e4aa2a3cc3591bf858b91a4be737b76 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 13 Feb 2025 16:04:15 +1000
Subject: [PATCH 09/17] chore: add purl type to build tool in registry info

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/slsa_analyzer/analyzer.py            |  1 +
 .../checks/detect_malicious_metadata_check.py    |  1 +
 .../checks/infer_artifact_pipeline_check.py      |  2 +-
 .../slsa_analyzer/specs/package_registry_spec.py |  4 +++-
 .../test_detect_malicious_metadata_check.py      |  2 +-
 .../checks/test_repo_verification_check.py       | 16 +++++++++++-----
 6 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index e9bece9ca..de881bab7 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -1035,6 +1035,7 @@ def _populate_package_registry_info(self) -> list[PackageRegistryInfo]:
                 package_registries.append(
                     PackageRegistryInfo(
                         build_tool_name=build_tool_name,
+                        build_tool_purl_type=build_tool.purl_type,
                         package_registry=package_registry,
                     )
                 )
diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
index 26ae8937e..86c567762 100644
--- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
+++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
@@ -259,6 +259,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
                 # Currently, only PyPI packages are supported.
                 case PackageRegistryInfo(
                     build_tool_name="pip" | "poetry",
+                    build_tool_purl_type="pypi",
                     package_registry=PyPIRegistry() as pypi_registry,
                 ) as pypi_registry_info:
 
diff --git a/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py b/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py
index 96f83cefc..c02fa8380 100644
--- a/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py
+++ b/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py
@@ -123,7 +123,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
         # Look for the artifact in the corresponding registry and find the publish timestamp.
         artifact_published_date = None
         for registry_info in ctx.dynamic_data["package_registries"]:
-            if registry_info.build_tool.purl_type == ctx.component.type:
+            if registry_info.build_tool_purl_type == ctx.component.type:
                 try:
                     artifact_published_date = registry_info.package_registry.find_publish_timestamp(ctx.component.purl)
                     break
diff --git a/src/macaron/slsa_analyzer/specs/package_registry_spec.py b/src/macaron/slsa_analyzer/specs/package_registry_spec.py
index ecd91d2b8..84b2a69e7 100644
--- a/src/macaron/slsa_analyzer/specs/package_registry_spec.py
+++ b/src/macaron/slsa_analyzer/specs/package_registry_spec.py
@@ -15,8 +15,10 @@
 class PackageRegistryInfo:
     """This class contains data for one package registry that is matched against a repository."""
 
-    #: The build tool matched against the repository.
+    #: The name of the build tool matched against the repository.
     build_tool_name: str
+    #: The purl type of the build tool matched against the repository.
+    build_tool_purl_type: str
     #: The package registry matched against the repository. This is dependent on the build tool detected.
     package_registry: PackageRegistry
     #: The provenances matched against the current repo.
diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py
index 8f15c636a..c4251ff66 100644
--- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py
+++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py
@@ -42,7 +42,7 @@ def test_detect_malicious_metadata(
     # Set up the context object with PyPIRegistry instance.
     ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl)
     pypi_registry = PyPIRegistry()
-    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", pypi_registry)]
+    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", "pypi", pypi_registry)]
 
     # Set up responses of PyPI endpoints using the httpserver plugin.
     with open(os.path.join(RESOURCE_PATH, "pypi_files", "zlibxjson.html"), encoding="utf8") as page:
diff --git a/tests/slsa_analyzer/checks/test_repo_verification_check.py b/tests/slsa_analyzer/checks/test_repo_verification_check.py
index f0f3dd923..dcc15af43 100644
--- a/tests/slsa_analyzer/checks/test_repo_verification_check.py
+++ b/tests/slsa_analyzer/checks/test_repo_verification_check.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """Module to test the repository verification check."""
@@ -23,7 +23,9 @@ def test_repo_verification_pass(maven_tool: BaseBuildTool, macaron_path: Path) -
 
     ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:maven/test/test")
     maven_registry = MavenCentralRegistry()
-    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(maven_tool, maven_registry)]
+    ctx.dynamic_data["package_registries"] = [
+        PackageRegistryInfo(maven_tool.name, maven_tool.purl_type, maven_registry)
+    ]
     ctx.dynamic_data["repo_verification"] = [
         RepositoryVerificationResult(
             status=RepositoryVerificationStatus.PASSED,
@@ -41,7 +43,9 @@ def test_repo_verification_fail(maven_tool: BaseBuildTool, macaron_path: Path) -
 
     ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:maven/test/test")
     maven_registry = MavenCentralRegistry()
-    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(maven_tool, maven_registry)]
+    ctx.dynamic_data["package_registries"] = [
+        PackageRegistryInfo(maven_tool.name, maven_tool.purl_type, maven_registry)
+    ]
     ctx.dynamic_data["repo_verification"] = [
         RepositoryVerificationResult(
             status=RepositoryVerificationStatus.FAILED,
@@ -59,7 +63,9 @@ def test_check_unknown_for_unknown_repo_verification(maven_tool: BaseBuildTool,
 
     ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:maven/test/test")
     maven_registry = MavenCentralRegistry()
-    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(maven_tool, maven_registry)]
+    ctx.dynamic_data["package_registries"] = [
+        PackageRegistryInfo(maven_tool.name, maven_tool.purl_type, maven_registry)
+    ]
     ctx.dynamic_data["repo_verification"] = [
         RepositoryVerificationResult(
             status=RepositoryVerificationStatus.UNKNOWN,
@@ -77,6 +83,6 @@ def test_check_unknown_for_unsupported_build_tools(pip_tool: BaseBuildTool, maca
 
     ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:pypi/test/test")
     pypi_registry = PyPIRegistry()
-    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(pip_tool, pypi_registry)]
+    ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(pip_tool.name, pip_tool.purl_type, pypi_registry)]
 
     assert check.run_check(ctx).result_type == CheckResultType.UNKNOWN

From 0c1c9f09484bad2763f011fbf1a4a4e9539750a6 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Mon, 10 Mar 2025 13:41:34 +1000
Subject: [PATCH 10/17] chore: add repository info for source code heuristic;
 minor fixes

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../pypi_heuristics/metadata/source_code_repo.py            | 2 +-
 src/macaron/slsa_analyzer/analyzer.py                       | 6 +++---
 .../slsa_analyzer/checks/detect_malicious_metadata_check.py | 2 ++
 src/macaron/slsa_analyzer/package_registry/pypi_registry.py | 3 +++
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py
index 8d8c9619d..708301807 100644
--- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py
+++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py
@@ -41,6 +41,6 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
             The result and related information collected during the analysis.
         """
         # If a sourcecode repo exists, then this will have already been validated
-        if not pypi_package_json.component.repository:
+        if not pypi_package_json.has_repository:
             return HeuristicResult.FAIL, {}
         return HeuristicResult.PASS, {}
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index de881bab7..8ae5ff3b2 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -472,7 +472,7 @@ def run_single(
         logger.info("With PURL: %s", component.purl)
         logger.info("=====================================")
 
-        analyze_ctx = self.get_analyze_ctx(component)
+        analyze_ctx = self.create_analyze_ctx(component)
         analyze_ctx.dynamic_data["expectation"] = self.expectations.get_expectation_for_target(
             analyze_ctx.component.purl.split("@")[0]
         )
@@ -917,8 +917,8 @@ def to_analysis_target(
                     "Cannot determine the analysis target: PURL and repository path are missing."
                 )
 
-    def get_analyze_ctx(self, component: Component) -> AnalyzeContext:
-        """Return the analyze context for a target component.
+    def create_analyze_ctx(self, component: Component) -> AnalyzeContext:
+        """Create and return an analysis context for the passed component.
 
         Parameters
         ----------
diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
index 86c567762..05444bec3 100644
--- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
+++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
@@ -222,6 +222,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
         # First check if this package is a known malware
         data = {"package": {"purl": ctx.component.purl}}
 
+        package_exists = False
         try:
             package_exists = bool(DepsDevService.get_package_info(ctx.component.purl))
         except APIAccessError as error:
@@ -273,6 +274,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
                         pypi_package_json = PyPIPackageJsonAsset(
                             component_name=ctx.component.name,
                             component_version=ctx.component.version,
+                            has_repository=ctx.component.repository is not None,
                             pypi_registry=pypi_registry,
                             package_json={},
                         )
diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
index 77d2c8b8a..20f75db08 100644
--- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
@@ -346,6 +346,9 @@ class PyPIPackageJsonAsset:
     #: The target pypi software component version.
     component_version: str | None
 
+    #: Whether the component of this asset has a related repository.
+    has_repository: bool
+
     #: The pypi registry.
     pypi_registry: PyPIRegistry
 

From f3a5419dcb94c1ec3047c9ba8ec3d4701036bd21 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Mon, 10 Mar 2025 13:46:39 +1000
Subject: [PATCH 11/17] chore: fix test

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../malware_analyzer/pypi/test_source_code_repo.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/tests/malware_analyzer/pypi/test_source_code_repo.py b/tests/malware_analyzer/pypi/test_source_code_repo.py
index 668c80865..3cc9db15d 100644
--- a/tests/malware_analyzer/pypi/test_source_code_repo.py
+++ b/tests/malware_analyzer/pypi/test_source_code_repo.py
@@ -14,19 +14,13 @@
 @pytest.mark.parametrize(
     ("repository", "expected_result"),
     [
-        pytest.param(None, HeuristicResult.FAIL, id="test_no_repo"),
-        pytest.param(
-            MagicMock(),
-            HeuristicResult.PASS,
-            id="test_valid_repo",
-        ),
+        pytest.param(False, HeuristicResult.FAIL, id="test_no_repo"),
+        pytest.param(True, HeuristicResult.PASS, id="test_valid_repo"),
     ],
 )
-def test_repo_existence(
-    pypi_package_json: MagicMock, repository: MagicMock | None, expected_result: HeuristicResult
-) -> None:
+def test_repo_existence(pypi_package_json: MagicMock, repository: bool, expected_result: HeuristicResult) -> None:
     """Test if the source code repo exists."""
-    pypi_package_json.component.repository = repository
+    pypi_package_json.has_repository = repository
     analyzer = SourceCodeRepoAnalyzer()
     result, _ = analyzer.analyze(pypi_package_json)
     assert result == expected_result

From e73235c0840db938c8bf74cd222f17fc6046c6c6 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Mon, 10 Mar 2025 14:20:13 +1000
Subject: [PATCH 12/17] chore: minor fix

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/repo_finder/repo_finder_pypi.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py
index 537e3297d..93af5395e 100644
--- a/src/macaron/repo_finder/repo_finder_pypi.py
+++ b/src/macaron/repo_finder/repo_finder_pypi.py
@@ -37,7 +37,7 @@ def find_repo(
         return "", RepoFinderInfo.PYPI_NO_REGISTRY
 
     pypi_registry.load_defaults()
-    pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, pypi_registry, {})
+    pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {})
     if not pypi_asset.download(dest=""):
         return "", RepoFinderInfo.PYPI_HTTP_ERROR
 
@@ -80,6 +80,7 @@ def find_repo(
             fragment=parsed_url.fragment,
         ).geturl()
         logger.debug("Found repository URL from PyPI: %s", fixed_url)
+        pypi_asset.has_repository = True
         return fixed_url, RepoFinderInfo.FOUND_FROM_PYPI
 
     return "", RepoFinderInfo.PYPI_NO_URLS

From 79566935411ba8c04613a72d49ebd1a7ab32dc6a Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Wed, 19 Mar 2025 11:04:34 +1000
Subject: [PATCH 13/17] chore: add integration test for find-source command

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder_pypi_find_source/test.yaml    | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 tests/integration/cases/repo_finder_pypi_find_source/test.yaml

diff --git a/tests/integration/cases/repo_finder_pypi_find_source/test.yaml b/tests/integration/cases/repo_finder_pypi_find_source/test.yaml
new file mode 100644
index 000000000..690658908
--- /dev/null
+++ b/tests/integration/cases/repo_finder_pypi_find_source/test.yaml
@@ -0,0 +1,22 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+description: |
+  Finding the source of a PyPI PURL that is not correctly found by deps.dev and must be sought on the package registry directly.
+
+tags:
+- macaron-python-package
+
+steps:
+- name: Run macaron find-source
+  kind: find-source
+  options:
+    command_args:
+    - -purl
+    - pkg:pypi/torch@2.6.0
+- name: Validate the produced report
+  kind: validate_schema
+  options:
+    kind: json_schema
+    schema: find_source_json_report
+    result: output/reports/pypi/torch/torch.source.json

From 599d261cfc083cdb6b2f56ff09c9fdab4a221a03 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 18 Mar 2025 16:51:31 +1000
Subject: [PATCH 14/17] feat: add GitHub attestation discovery

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/artifact/local_artifact.py        |  57 ++++++++-
 src/macaron/slsa_analyzer/analyzer.py         | 117 ++++++++++++++++--
 .../slsa_analyzer/git_service/api_client.py   |  21 +++-
 .../maven_central_registry.py                 |  72 +++++++++++
 .../slsa_analyzer/provenance/loader.py        |   8 +-
 .../cases/github_maven_attestation/policy.dl  |  10 ++
 .../cases/github_maven_attestation/test.yaml  |  22 ++++
 .../github_maven_attestation_local/policy.dl  |  10 ++
 .../github_maven_attestation_local/test.yaml  |  28 +++++
 9 files changed, 328 insertions(+), 17 deletions(-)
 create mode 100644 tests/integration/cases/github_maven_attestation/policy.dl
 create mode 100644 tests/integration/cases/github_maven_attestation/test.yaml
 create mode 100644 tests/integration/cases/github_maven_attestation_local/policy.dl
 create mode 100644 tests/integration/cases/github_maven_attestation_local/test.yaml

diff --git a/src/macaron/artifact/local_artifact.py b/src/macaron/artifact/local_artifact.py
index ed37c335a..582799824 100644
--- a/src/macaron/artifact/local_artifact.py
+++ b/src/macaron/artifact/local_artifact.py
@@ -1,16 +1,21 @@
-# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """This module declares types and utilities for handling local artifacts."""
 
 import fnmatch
 import glob
+import hashlib
+import logging
 import os
 
 from packageurl import PackageURL
 
 from macaron.artifact.maven import construct_maven_repository_path
 from macaron.errors import LocalArtifactFinderError
+from macaron.slsa_analyzer.package_registry import MavenCentralRegistry
+
+logger: logging.Logger = logging.getLogger(__name__)
 
 
 def construct_local_artifact_dirs_glob_pattern_maven_purl(maven_purl: PackageURL) -> list[str] | None:
@@ -247,3 +252,53 @@ def get_local_artifact_dirs(
         )
 
     raise LocalArtifactFinderError(f"Unsupported PURL type {purl_type}")
+
+
+def get_local_artifact_hash(purl: PackageURL, artifact_dirs: list[str], hash_algorithm_name: str) -> str | None:
+    """Compute the hash of the first local artifact file that matches the PURL.
+
+    Parameters
+    ----------
+    purl: PackageURL
+        The PURL of the artifact being sought.
+    artifact_dirs: list[str]
+        The directories that may contain the artifact; they are tried in order.
+    hash_algorithm_name: str
+        The name of the hashlib algorithm to use, e.g. "sha256".
+
+    Returns
+    -------
+    str | None
+        The hex digest of the artifact, or None if it cannot be found or hashed.
+    """
+    if not artifact_dirs:
+        logger.debug("No artifact directories provided.")
+        return None
+
+    if not purl.version:
+        logger.debug("PURL is missing version.")
+        return None
+
+    artifact_target = None
+    if purl.type == "maven":
+        artifact_target = MavenCentralRegistry.get_artifact_file_name(purl)
+
+    if not artifact_target:
+        logger.debug("PURL type not supported: %s", purl.type)
+        return None
+
+    for artifact_dir in artifact_dirs:
+        full_path = os.path.join(artifact_dir, artifact_target)
+        if not os.path.isfile(full_path):
+            continue
+
+        try:
+            with open(full_path, "rb") as file:
+                hash_result = hashlib.file_digest(file, hash_algorithm_name)
+        except (OSError, ValueError) as error:
+            logger.debug("Error while hashing file: %s", error)
+            continue
+
+        return hash_result.hexdigest()
+
+    return None
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index 8ae5ff3b2..a8fb88830 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -4,11 +4,14 @@
 """This module handles the cloning and analyzing a Git repo."""
 
 import glob
+import hashlib
+import json
 import logging
 import os
 import re
 import sys
 import tempfile
+import urllib.parse
 from collections.abc import Mapping
 from datetime import datetime, timezone
 from pathlib import Path
@@ -20,7 +23,10 @@
 from sqlalchemy.orm import Session
 
 from macaron import __version__
-from macaron.artifact.local_artifact import get_local_artifact_dirs
+from macaron.artifact.local_artifact import (
+    get_local_artifact_dirs,
+    get_local_artifact_hash,
+)
 from macaron.config.global_config import global_config
 from macaron.config.target_config import Configuration
 from macaron.database.database_manager import DatabaseManager, get_db_manager, get_db_session
@@ -41,6 +47,7 @@
     ProvenanceError,
     PURLNotFoundError,
 )
+from macaron.json_tools import json_extract
 from macaron.output_reporter.reporter import FileReporter
 from macaron.output_reporter.results import Record, Report, SCMStatus
 from macaron.provenance import provenance_verifier
@@ -66,12 +73,14 @@
 from macaron.slsa_analyzer.checks import *  # pylint: disable=wildcard-import,unused-wildcard-import # noqa: F401,F403
 from macaron.slsa_analyzer.ci_service import CI_SERVICES
 from macaron.slsa_analyzer.database_store import store_analyze_context_to_db
-from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService
+from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService, GitHub
 from macaron.slsa_analyzer.git_service.base_git_service import NoneGitService
 from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR
-from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES
+from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, MavenCentralRegistry
 from macaron.slsa_analyzer.provenance.expectations.expectation_registry import ExpectationRegistry
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV01Payload
+from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError
+from macaron.slsa_analyzer.provenance.loader import load_provenance_payload
 from macaron.slsa_analyzer.provenance.slsa import SLSAProvenanceData
 from macaron.slsa_analyzer.registry import registry
 from macaron.slsa_analyzer.specs.ci_spec import CIInfo
@@ -403,6 +412,17 @@ def run_single(
                 status=SCMStatus.ANALYSIS_FAILED,
             )
 
+        local_artifact_dirs = None
+        if parsed_purl and parsed_purl.type in self.local_artifact_repo_mapper:
+            local_artifact_repo_path = self.local_artifact_repo_mapper[parsed_purl.type]
+            try:
+                local_artifact_dirs = get_local_artifact_dirs(
+                    purl=parsed_purl,
+                    local_artifact_repo_path=local_artifact_repo_path,
+                )
+            except LocalArtifactFinderError as error:
+                logger.debug(error)
+
         # Prepare the repo.
         git_obj = None
         commit_finder_outcome = CommitFinderInfo.NOT_USED
@@ -480,6 +500,37 @@ def run_single(
         git_service = self._determine_git_service(analyze_ctx)
         self._determine_ci_services(analyze_ctx, git_service)
         self._determine_build_tools(analyze_ctx, git_service)
+
+        # Try to find an attestation from GitHub, if applicable.
+        if parsed_purl and not provenance_payload and analysis_target.repo_path and isinstance(git_service, GitHub):
+            # urlparse raises ValueError for malformed URLs (e.g. unbalanced IPv6 brackets), so catch that too.
+            url = None
+            try:
+                url = urllib.parse.urlparse(analysis_target.repo_path)
+            except (TypeError, ValueError) as error:
+                logger.debug("Failed to parse repository path as URL: %s", error)
+            if url and url.hostname == "github.com":
+                artifact_hash = self.get_artifact_hash(parsed_purl, local_artifact_dirs, hashlib.sha256())
+                if artifact_hash:
+                    git_attestation_dict = git_service.api_client.get_attestation(
+                        analyze_ctx.component.repository.full_name, artifact_hash
+                    )
+                    if git_attestation_dict:
+                        git_attestation_list = json_extract(git_attestation_dict, ["attestations"], list)
+                        if git_attestation_list:
+                            git_attestation = git_attestation_list[0]
+
+                            with tempfile.TemporaryDirectory() as temp_dir:
+                                attestation_file = os.path.join(temp_dir, "attestation")
+                                with open(attestation_file, "w", encoding="UTF-8") as file:
+                                    json.dump(git_attestation, file)
+
+                                try:
+                                    payload = load_provenance_payload(attestation_file)
+                                    provenance_payload = payload
+                                except LoadIntotoAttestationError as error:
+                                    logger.debug("Failed to load provenance payload: %s", error)
+
         if parsed_purl is not None:
             self._verify_repository_link(parsed_purl, analyze_ctx)
         self._determine_package_registries(analyze_ctx, all_package_registries)
@@ -541,16 +592,8 @@ def run_single(
 
         analyze_ctx.dynamic_data["validate_malware"] = validate_malware
 
-        if parsed_purl and parsed_purl.type in self.local_artifact_repo_mapper:
-            local_artifact_repo_path = self.local_artifact_repo_mapper[parsed_purl.type]
-            try:
-                local_artifact_dirs = get_local_artifact_dirs(
-                    purl=parsed_purl,
-                    local_artifact_repo_path=local_artifact_repo_path,
-                )
-                analyze_ctx.dynamic_data["local_artifact_paths"].extend(local_artifact_dirs)
-            except LocalArtifactFinderError as error:
-                logger.debug(error)
+        if local_artifact_dirs:
+            analyze_ctx.dynamic_data["local_artifact_paths"].extend(local_artifact_dirs)
 
         analyze_ctx.check_results = registry.scan(analyze_ctx)
 
@@ -939,6 +982,54 @@ def create_analyze_ctx(self, component: Component) -> AnalyzeContext:
 
         return analyze_ctx
 
+    def get_artifact_hash(
+        self, purl: PackageURL, cached_artifacts: list[str] | None, hash_algorithm: Any
+    ) -> str | None:
+        """Get the hash of the artifact found from the passed PURL using local or remote files.
+
+        Parameters
+        ----------
+        purl: PackageURL
+            The PURL of the artifact.
+        cached_artifacts: list[str] | None
+            The local directories that may contain the artifact, or None if there are none.
+        hash_algorithm: Any
+            A hash object, e.g. ``hashlib.sha256()``; its ``name`` selects the algorithm for local files.
+
+        Returns
+        -------
+        str | None
+            The hash of the artifact, or None if not found.
+        """
+        if cached_artifacts:
+            # Prefer a local copy of the artifact when one is available.
+            artifact_hash = get_local_artifact_hash(purl, cached_artifacts, hash_algorithm.name)
+
+            if artifact_hash:
+                return artifact_hash
+
+        # No local hash was found: fall back to the package registry for the PURL type.
+        if purl.type == "maven":
+            maven_registry = next(
+                (
+                    package_registry
+                    for package_registry in PACKAGE_REGISTRIES
+                    if isinstance(package_registry, MavenCentralRegistry)
+                ),
+                None,
+            )
+            if not maven_registry:
+                return None
+
+            return maven_registry.get_artifact_hash(purl, hash_algorithm)
+
+        if purl.type == "pypi":
+            # TODO: implement artifact hash retrieval for PyPI packages.
+            return None
+
+        logger.debug("Purl type '%s' not yet supported for GitHub attestation discovery.", purl.type)
+        return None
+
     def _determine_git_service(self, analyze_ctx: AnalyzeContext) -> BaseGitService:
         """Determine the Git service used by the software component."""
         remote_path = analyze_ctx.component.repository.remote_path if analyze_ctx.component.repository else None
diff --git a/src/macaron/slsa_analyzer/git_service/api_client.py b/src/macaron/slsa_analyzer/git_service/api_client.py
index 8e987e6ca..681a1f4e0 100644
--- a/src/macaron/slsa_analyzer/git_service/api_client.py
+++ b/src/macaron/slsa_analyzer/git_service/api_client.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """The module provides API clients for VCS services, such as GitHub."""
@@ -659,6 +659,25 @@ def download_asset(self, url: str, download_path: str) -> bool:
 
         return True
 
+    def get_attestation(self, full_name: str, artifact_hash: str) -> dict:
+        """Request the attestations associated with the passed artifact hash from the repository's API endpoint.
+
+        Parameters
+        ----------
+        full_name : str
+            The full name of the repo.
+        artifact_hash: str
+            The SHA256 hash of an artifact, without the ``sha256:`` prefix (it is added here).
+
+        Returns
+        -------
+        dict
+            The response data of the request, or an empty dict if the request returned nothing.
+        """
+        url = f"{GhAPIClient._REPO_END_POINT}/{full_name}/attestations/sha256:{artifact_hash}"
+        response_data = send_get_http(url, self.headers)
+        return response_data or {}
+
 
 def get_default_gh_client(access_token: str) -> GhAPIClient:
     """Return a GhAPIClient instance with default values.
diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
index 131051b66..593c15b88 100644
--- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
@@ -6,10 +6,13 @@
 import logging
 import urllib.parse
 from datetime import datetime, timezone
+from typing import Any
 
 import requests
 from packageurl import PackageURL
+from requests import RequestException
 
+from macaron.artifact.maven import construct_maven_repository_path
 from macaron.config.defaults import defaults
 from macaron.errors import ConfigurationError, InvalidHTTPResponseError
 from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry
@@ -236,3 +239,72 @@ def find_publish_timestamp(self, purl: str) -> datetime:
                 raise InvalidHTTPResponseError(f"The timestamp returned by {url} is invalid") from error
 
         raise InvalidHTTPResponseError(f"Invalid response from Maven central for {url}.")
+
+    @staticmethod
+    def get_artifact_file_name(purl: PackageURL) -> str | None:
+        """Return the artifact file name of the passed PURL based on the Maven registry standard.
+
+        Parameters
+        ----------
+        purl: PackageURL
+            The PURL of the artifact.
+
+        Returns
+        -------
+        str | None
+            The artifact file name in the form ``<name>-<version>.jar``, or None if the PURL has no version.
+        """
+        if not purl.version:
+            return None
+
+        return purl.name + "-" + purl.version + ".jar"
+
+    def get_artifact_hash(self, purl: PackageURL, hash_algorithm: Any) -> str | None:
+        """Return the hash of the artifact found by the passed purl relevant to the registry's URL.
+
+        Parameters
+        ----------
+        purl: PackageURL
+            The purl of the artifact.
+        hash_algorithm: Any
+            A hashlib-style hash object; it is fed the artifact bytes via ``update`` and read via ``hexdigest``.
+
+        Returns
+        -------
+        str | None
+            The hex digest of the artifact, or None if the PURL is incomplete or the download fails.
+        """
+        if not (purl.namespace and purl.version):
+            return None
+
+        artifact_path = construct_maven_repository_path(purl.namespace, purl.name, purl.version)
+        file_name = MavenCentralRegistry.get_artifact_file_name(purl)
+        if not file_name:
+            return None
+
+        artifact_url = self.registry_url + "/" + artifact_path + "/" + file_name
+        logger.debug("Search for artifact using URL: %s", artifact_url)
+
+        try:
+            response = requests.get(artifact_url, stream=True, timeout=40)
+            response.raise_for_status()
+        except RequestException as http_err:
+            logger.debug("HTTP error occurred: %s", http_err)
+            return None
+
+        if response.status_code != 200:
+            return None
+
+        # Download file and compute hash as chunks are received; iter_content defaults to 1-byte chunks.
+        try:
+            for chunk in response.iter_content(chunk_size=8192):
+                hash_algorithm.update(chunk)
+        except RequestException as error:
+            # Something went wrong with the request, abort.
+            logger.debug("Error while streaming target file: %s", error)
+            response.close()
+            return None
+
+        artifact_hash: str = hash_algorithm.hexdigest()
+        logger.debug("Computed hash of artifact: %s", artifact_hash)
+        return artifact_hash
diff --git a/src/macaron/slsa_analyzer/provenance/loader.py b/src/macaron/slsa_analyzer/provenance/loader.py
index 65dfee1bb..d75faa726 100644
--- a/src/macaron/slsa_analyzer/provenance/loader.py
+++ b/src/macaron/slsa_analyzer/provenance/loader.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """This module contains the loaders for SLSA provenances."""
@@ -12,7 +12,7 @@
 from urllib.parse import urlparse
 
 from macaron.config.defaults import defaults
-from macaron.json_tools import JsonType
+from macaron.json_tools import JsonType, json_extract
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, validate_intoto_payload
 from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError, ValidateInTotoPayloadError
 from macaron.util import send_get_http_raw
@@ -83,6 +83,10 @@ def _load_provenance_file_content(
         # Some provenances, such as Witness may not include the DSSE envelope `dsseEnvelope`
         # property but contain its value directly.
         provenance_payload = provenance.get("payload", None)
+    if not provenance_payload:
+        # GitHub Attestation.
+        # TODO Check if old method (above) actually works.
+        provenance_payload = json_extract(provenance, ["bundle", "dsseEnvelope", "payload"], str)
     if not provenance_payload:
         raise LoadIntotoAttestationError(
             'Cannot find the "payload" field in the decoded provenance.',
diff --git a/tests/integration/cases/github_maven_attestation/policy.dl b/tests/integration/cases/github_maven_attestation/policy.dl
new file mode 100644
index 000000000..9df46219b
--- /dev/null
+++ b/tests/integration/cases/github_maven_attestation/policy.dl
@@ -0,0 +1,10 @@
+/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */
+/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */
+
+#include "prelude.dl"
+
+Policy("test_policy", component_id, "") :-
+    check_passed(component_id, "mcn_provenance_available_1").
+
+apply_policy_to("test_policy", component_id) :-
+    is_component(component_id, "pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22").
diff --git a/tests/integration/cases/github_maven_attestation/test.yaml b/tests/integration/cases/github_maven_attestation/test.yaml
new file mode 100644
index 000000000..9913d930e
--- /dev/null
+++ b/tests/integration/cases/github_maven_attestation/test.yaml
@@ -0,0 +1,22 @@
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+description: |
+  Discovering attestation of a Maven artifact on GitHub
+
+tags:
+- macaron-python-package
+
+steps:
+- name: Run macaron analyze
+  kind: analyze
+  options:
+    command_args:
+    - -purl
+    - pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22
+    - -rp
+    - https://github.com/liftwizard/liftwizard
+- name: Run macaron verify-policy to verify passed/failed checks
+  kind: verify
+  options:
+    policy: policy.dl
diff --git a/tests/integration/cases/github_maven_attestation_local/policy.dl b/tests/integration/cases/github_maven_attestation_local/policy.dl
new file mode 100644
index 000000000..ff31abf90
--- /dev/null
+++ b/tests/integration/cases/github_maven_attestation_local/policy.dl
@@ -0,0 +1,10 @@
+/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */
+/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */
+
+#include "prelude.dl"
+
+Policy("test_policy", component_id, "") :-
+    check_failed(component_id, "mcn_provenance_available_1").
+
+apply_policy_to("test_policy", component_id) :-
+    is_component(component_id, "pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22").
diff --git a/tests/integration/cases/github_maven_attestation_local/test.yaml b/tests/integration/cases/github_maven_attestation_local/test.yaml
new file mode 100644
index 000000000..d66a089b2
--- /dev/null
+++ b/tests/integration/cases/github_maven_attestation_local/test.yaml
@@ -0,0 +1,28 @@
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+description: |
+  Discovering GitHub attestation of a local Maven artifact but failing because the artifact is wrong
+
+tags:
+- macaron-python-package
+
+steps:
+- name: Download artifact POM instead of the JAR
+  kind: shell
+  options:
+    cmd: curl --create-dirs -o ./output/.m2/repository/io/liftwizard/liftwizard-checkstyle/2.1.22/liftwizard-checkstyle-2.1.22.jar https://repo1.maven.org/maven2/io/liftwizard/liftwizard-checkstyle/2.1.22/liftwizard-checkstyle-2.1.22.pom
+- name: Run macaron analyze
+  kind: analyze
+  options:
+    command_args:
+    - -purl
+    - pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22
+    - -rp
+    - https://github.com/liftwizard/liftwizard
+    - --local-maven-repo
+    - ./output/.m2
+- name: Run macaron verify-policy to verify no provenance was found
+  kind: verify
+  options:
+    policy: policy.dl

From 69099a837428286518ef75231ff2e13506d8faa1 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Wed, 19 Mar 2025 20:11:07 +1000
Subject: [PATCH 15/17] chore: add support for PyPI PURLs

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/slsa_analyzer/analyzer.py         | 54 ++++++++++--
 .../checks/detect_malicious_metadata_check.py | 29 +++----
 .../package_registry/pypi_registry.py         | 87 ++++++++++++++++++-
 3 files changed, 146 insertions(+), 24 deletions(-)

diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index a8fb88830..21cbd3ad0 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -76,7 +76,8 @@
 from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService, GitHub
 from macaron.slsa_analyzer.git_service.base_git_service import NoneGitService
 from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR
-from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, MavenCentralRegistry
+from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, MavenCentralRegistry, PyPIRegistry
+from macaron.slsa_analyzer.package_registry.pypi_registry import find_or_create_pypi_asset
 from macaron.slsa_analyzer.provenance.expectations.expectation_registry import ExpectationRegistry
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV01Payload
 from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError
@@ -510,7 +511,9 @@ def run_single(
             except TypeError as error:
                 logger.debug("Failed to parse repository path as URL: %s", error)
             if url and url.hostname == "github.com":
-                artifact_hash = self.get_artifact_hash(parsed_purl, local_artifact_dirs, hashlib.sha256())
+                artifact_hash = self.get_artifact_hash(
+                    parsed_purl, local_artifact_dirs, hashlib.sha256(), all_package_registries
+                )
                 if artifact_hash:
                     git_attestation_dict = git_service.api_client.get_attestation(
                         analyze_ctx.component.repository.full_name, artifact_hash
@@ -983,7 +986,11 @@ def create_analyze_ctx(self, component: Component) -> AnalyzeContext:
         return analyze_ctx
 
     def get_artifact_hash(
-        self, purl: PackageURL, cached_artifacts: list[str] | None, hash_algorithm: Any
+        self,
+        purl: PackageURL,
+        cached_artifacts: list[str] | None,
+        hash_algorithm: Any,
+        all_package_registries: list[PackageRegistryInfo],
     ) -> str | None:
         """Get the hash of the artifact found from the passed PURL using local or remote files.
 
@@ -995,6 +1002,8 @@ def get_artifact_hash(
             The list of local files that match the PURL.
         hash_algorithm: Any
             The hash algorithm to use.
+        all_package_registries: list[PackageRegistryInfo]
+            The list of package registry information.
 
         Returns
         -------
@@ -1024,8 +1033,43 @@ def get_artifact_hash(
             return maven_registry.get_artifact_hash(purl, hash_algorithm)
 
         if purl.type == "pypi":
-            # TODO implement
-            return None
+            pypi_registry = next(
+                (
+                    package_registry
+                    for package_registry in PACKAGE_REGISTRIES
+                    if isinstance(package_registry, PyPIRegistry)
+                ),
+                None,
+            )
+            if not pypi_registry:
+                logger.debug("Missing registry for PyPI")
+                return None
+
+            registry_info = next(
+                (
+                    info
+                    for info in all_package_registries
+                    if info.package_registry == pypi_registry and info.build_tool_name in {"pip", "poetry"}
+                ),
+                None,
+            )
+            if not registry_info:
+                logger.debug("Missing registry information for PyPI")
+                return None
+
+            pypi_asset = find_or_create_pypi_asset(purl.name, purl.version, registry_info)
+            if not pypi_asset:
+                return None
+
+            pypi_asset.has_repository = True
+            if not pypi_asset.download(""):
+                return None
+
+            source_url = pypi_asset.get_sourcecode_url("bdist_wheel")
+            if not source_url:
+                return None
+
+            return pypi_registry.get_artifact_hash(source_url, hash_algorithm)
 
         logger.debug("Purl type '%s' not yet supported for GitHub attestation discovery.", purl.type)
         return None
diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
index 05444bec3..4e58b7cab 100644
--- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
+++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
@@ -32,7 +32,11 @@
 from macaron.slsa_analyzer.checks.base_check import BaseCheck
 from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType
 from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService
-from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry
+from macaron.slsa_analyzer.package_registry.pypi_registry import (
+    PyPIPackageJsonAsset,
+    PyPIRegistry,
+    find_or_create_pypi_asset,
+)
 from macaron.slsa_analyzer.registry import registry
 from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
 from macaron.util import send_post_http_raw
@@ -261,23 +265,16 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
                 case PackageRegistryInfo(
                     build_tool_name="pip" | "poetry",
                     build_tool_purl_type="pypi",
-                    package_registry=PyPIRegistry() as pypi_registry,
+                    package_registry=PyPIRegistry(),
                 ) as pypi_registry_info:
-
-                    # Retrieve the pre-existing AssetLocator object for the PyPI package JSON object, if it exists.
-                    pypi_package_json = next(
-                        (asset for asset in pypi_registry_info.metadata if isinstance(asset, PyPIPackageJsonAsset)),
-                        None,
+                    # Retrieve the pre-existing asset, or create a new one.
+                    pypi_package_json = find_or_create_pypi_asset(
+                        ctx.component.name, ctx.component.version, pypi_registry_info
                     )
-                    if not pypi_package_json:
-                        # Create an AssetLocator object for the PyPI package JSON object.
-                        pypi_package_json = PyPIPackageJsonAsset(
-                            component_name=ctx.component.name,
-                            component_version=ctx.component.version,
-                            has_repository=ctx.component.repository is not None,
-                            pypi_registry=pypi_registry,
-                            package_json={},
-                        )
+                    if pypi_package_json is None:
+                        return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN)
+
+                    pypi_package_json.has_repository = ctx.component.repository is not None
 
                     pypi_registry_info.metadata.append(pypi_package_json)
 
diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
index 20f75db08..35d6e9c41 100644
--- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
@@ -11,6 +11,7 @@
 import zipfile
 from dataclasses import dataclass
 from datetime import datetime
+from typing import Any
 
 import requests
 from bs4 import BeautifulSoup, Tag
@@ -21,6 +22,7 @@
 from macaron.json_tools import json_extract
 from macaron.malware_analyzer.datetime_parser import parse_datetime
 from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry
+from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
 from macaron.util import send_get_http_raw
 
 logger: logging.Logger = logging.getLogger(__name__)
@@ -231,6 +233,45 @@ def fetch_sourcecode(self, src_url: str) -> dict[str, str] | None:
             logger.debug("Successfully fetch the source code from PyPI")
             return py_files_content
 
+    def get_artifact_hash(self, artifact_url: str, hash_algorithm: Any) -> str | None:
+        """Return the hash of the artifact found at the passed URL.
+
+        Parameters
+        ----------
+        artifact_url: str
+            The URL of the artifact.
+        hash_algorithm: Any
+            A hashlib-style hash object; it is fed the artifact bytes via ``update`` and read via ``hexdigest``.
+
+        Returns
+        -------
+        str | None
+            The hex digest of the artifact, or None if the request or the download fails.
+        """
+        try:
+            response = requests.get(artifact_url, stream=True, timeout=40)
+            response.raise_for_status()
+        except RequestException as http_err:
+            logger.debug("HTTP error occurred: %s", http_err)
+            return None
+
+        if response.status_code != 200:
+            logger.debug("Invalid response: %s", response.status_code)
+            return None
+
+        try:
+            for chunk in response.iter_content(chunk_size=8192):
+                hash_algorithm.update(chunk)
+        except RequestException as error:
+            # Something went wrong with the request, abort.
+            logger.debug("Error while streaming source file: %s", error)
+            response.close()
+            return None
+
+        artifact_hash: str = hash_algorithm.hexdigest()
+        logger.debug("Computed artifact hash: %s", artifact_hash)
+        return artifact_hash
+
     def get_package_page(self, package_name: str) -> str | None:
         """Implement custom API to get package main page.
 
@@ -430,15 +471,19 @@ def get_latest_version(self) -> str | None:
         """
         return json_extract(self.package_json, ["info", "version"], str)
 
-    def get_sourcecode_url(self) -> str | None:
+    def get_sourcecode_url(self, package_type: str = "sdist") -> str | None:
         """Get the url of the source distribution.
 
+        Parameters
+        ----------
+        package_type: str
+            The package type to retrieve the URL of.
+
         Returns
         -------
         str | None
             The URL of the source distribution.
         """
-        urls: list | None = None
         if self.component_version:
             urls = json_extract(self.package_json, ["releases", self.component_version], list)
         else:
@@ -447,7 +492,7 @@ def get_sourcecode_url(self) -> str | None:
         if not urls:
             return None
         for distribution in urls:
-            if distribution.get("packagetype") != "sdist":
+            if distribution.get("packagetype") != package_type:
                 continue
             # We intentionally check if the url is None and use empty string if that's the case.
             source_url: str = distribution.get("url") or ""
@@ -497,3 +542,39 @@ def get_sourcecode(self) -> dict[str, str] | None:
             source_code: dict[str, str] | None = self.pypi_registry.fetch_sourcecode(url)
             return source_code
         return None
+
+
+def find_or_create_pypi_asset(
+    asset_name: str, asset_version: str | None, pypi_registry_info: PackageRegistryInfo
+) -> PyPIPackageJsonAsset | None:
+    """Find the asset in the provided package registry information, or create it.
+
+    Parameters
+    ----------
+    asset_name: str
+        The name of the asset; only used when a new asset has to be created.
+    asset_version: str | None
+        The version of the asset; only used when a new asset has to be created.
+    pypi_registry_info: PackageRegistryInfo
+        The package registry information whose metadata is searched and, on creation, appended to.
+
+    Returns
+    -------
+    PyPIPackageJsonAsset | None
+        The existing or newly created asset, or None if the registry is not a PyPIRegistry.
+    """
+    pypi_package_json = next(
+        (asset for asset in pypi_registry_info.metadata if isinstance(asset, PyPIPackageJsonAsset)),
+        None,
+    )
+    if pypi_package_json:
+        return pypi_package_json
+
+    package_registry = pypi_registry_info.package_registry
+    if not isinstance(package_registry, PyPIRegistry):
+        logger.debug("Failed to create PyPIPackageJson asset.")
+        return None
+
+    asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {})
+    pypi_registry_info.metadata.append(asset)
+    return asset

From 167b34a5a9e699d0ac636c8d3557c54ad02ebeaf Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Mon, 31 Mar 2025 10:31:46 +1000
Subject: [PATCH 16/17] chore: add support for sha256 hashes in maven

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../package_registry/maven_central_registry.py           | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
index 593c15b88..2ad0cbf1e 100644
--- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py
@@ -282,8 +282,15 @@ def get_artifact_hash(self, purl: PackageURL, hash_algorithm: Any) -> str | None
         if not file_name:
             return None
 
+        # Maven supports but does not require a sha256 hash of uploaded artifacts. Check that first.
         artifact_url = self.registry_url + "/" + artifact_path + "/" + file_name
-        logger.debug("Search for artifact using URL: %s", artifact_url)
+        sha256_url = artifact_url + ".sha256"
+        logger.debug("Search for artifact hash using URL: %s", [sha256_url, artifact_url])
+
+        response = send_get_http_raw(sha256_url, {})
+        if response and response.text:
+            logger.debug("Found hash of artifact: %s", response.text)
+            return response.text
 
         try:
             response = requests.get(artifact_url, stream=True, timeout=40)

From ea49398afcef30c6ce7b3f84969723c44fbe346b Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Mon, 31 Mar 2025 11:09:50 +1000
Subject: [PATCH 17/17] chore: add pypi sha256 support

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/slsa_analyzer/analyzer.py         |  4 ++++
 .../package_registry/pypi_registry.py         | 20 +++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index 21cbd3ad0..a4698866e 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -1065,6 +1065,10 @@ def get_artifact_hash(
             if not pypi_asset.download(""):
                 return None
 
+            artifact_hash = pypi_asset.get_sha256()
+            if artifact_hash:
+                return artifact_hash
+
             source_url = pypi_asset.get_sourcecode_url("bdist_wheel")
             if not source_url:
                 return None
diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
index 35d6e9c41..0852d554c 100644
--- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py
@@ -543,6 +543,26 @@ def get_sourcecode(self) -> dict[str, str] | None:
             return source_code
         return None
 
+    def get_sha256(self) -> str | None:
+        """Get the sha256 hash of the first listed distribution from the package JSON.
+
+        Returns
+        -------
+        str | None
+            The sha256 hash of the artifact, or None if not found.
+        """
+        if not self.package_json and not self.download(""):
+            return None
+
+        if not self.component_version:
+            artifact_hash = json_extract(self.package_json, ["urls", 0, "digests", "sha256"], str)
+        else:
+            artifact_hash = json_extract(
+                self.package_json, ["releases", self.component_version, 0, "digests", "sha256"], str
+            )
+        logger.debug("Found sha256 hash: %s", artifact_hash)
+        return artifact_hash
+
 
 def find_or_create_pypi_asset(
     asset_name: str, asset_version: str | None, pypi_registry_info: PackageRegistryInfo