From 8dd2e65cdf5039ebc43c43d9c7a26560912ed75a Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Mon, 10 Feb 2025 13:42:40 +1000 Subject: [PATCH 01/17] feat: check PyPI registry when deps.dev fails to find a source repository Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- src/macaron/json_tools.py | 4 +- src/macaron/repo_finder/repo_finder.py | 7 +- src/macaron/repo_finder/repo_finder_enums.py | 14 +++- src/macaron/repo_finder/repo_finder_pypi.py | 70 ++++++++++++++++++++ 4 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 src/macaron/repo_finder/repo_finder_pypi.py diff --git a/src/macaron/json_tools.py b/src/macaron/json_tools.py index 3cd7a7d37..a69b0eaa8 100644 --- a/src/macaron/json_tools.py +++ b/src/macaron/json_tools.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module provides utility functions for JSON data.""" @@ -53,5 +53,5 @@ def json_extract(entry: dict | list, keys: Sequence[str | int], type_: type[T]) if isinstance(entry, type_): return entry - logger.debug("Found value of incorrect type: %s instead of %s.", type(entry), type(type_)) + logger.debug("Found value of incorrect type: %s instead of %s.", type(entry), type_) return None diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py index f98f2688e..c5a5cc2cf 100644 --- a/src/macaron/repo_finder/repo_finder.py +++ b/src/macaron/repo_finder/repo_finder.py @@ -43,7 +43,7 @@ from macaron.config.defaults import defaults from macaron.config.global_config import global_config from macaron.errors import CloneError, RepoCheckOutError -from macaron.repo_finder import to_domain_from_known_purl_types +from macaron.repo_finder import repo_finder_pypi, to_domain_from_known_purl_types from macaron.repo_finder.commit_finder import find_commit, match_tags from macaron.repo_finder.repo_finder_base import BaseRepoFinder from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder @@ -103,6 +103,11 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, logger.debug("Analyzing %s with Repo Finder: %s", purl, type(repo_finder)) found_repo, outcome = repo_finder.find_repo(purl) + if not found_repo and purl.type == "pypi": + found_repo, outcome = repo_finder_pypi.find_repo(purl) + if not found_repo: + logger.debug("Could not find repository from PyPI registry for PURL: %s", purl) + if check_latest_version and not defaults.getboolean("repofinder", "try_latest_purl", fallback=True): check_latest_version = False diff --git a/src/macaron/repo_finder/repo_finder_enums.py b/src/macaron/repo_finder/repo_finder_enums.py index 4d088a5cc..7dff875c6 100644 --- a/src/macaron/repo_finder/repo_finder_enums.py +++ b/src/macaron/repo_finder/repo_finder_enums.py @@ -57,6 +57,15 @@ class RepoFinderInfo(Enum): #: 
Reported if deps.dev returns data that does not contain the desired SCM URL. E.g. The repository URL. DDEV_NO_URLS = "deps.dev no URLs" + #: Reported if there was an error with the request sent to the PyPI registry. + PYPI_HTTP_ERROR = "PyPI HTTP error" + + #: Reported if there was an error parsing the JSON returned by the PyPI registry. + PYPI_JSON_ERROR = "PyPI JSON error" + + #: Reported if there was no matching URLs in the JSON returned by the PyPI registry. + PYPI_NO_URLS = "PyPI no matching URLs" + #: Reported if the provided PURL did not produce a result, but a more recent version could not be found. NO_NEWER_VERSION = "No newer version than provided which failed" @@ -70,7 +79,10 @@ class RepoFinderInfo(Enum): FOUND_FROM_PARENT = "Found from parent" #: Reported when a repository is found from a more recent version than was provided by the user. - FOUND_FROM_LATEST = "Found form latest" + FOUND_FROM_LATEST = "Found from latest" + + #: Reported when a repository could only be found by checking the PyPI registry JSON. + FOUND_FROM_PYPI = "Found from PyPI" #: Default value. Reported if the Repo Finder was not called. E.g. Because the repository URL was already present. NOT_USED = "Not used" diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py new file mode 100644 index 000000000..40c042415 --- /dev/null +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -0,0 +1,70 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+ +"""This module contains the logic for finding repositories of PyPI projects.""" +import logging +import urllib.parse + +from packageurl import PackageURL + +from macaron.errors import InvalidHTTPResponseError +from macaron.json_tools import json_extract +from macaron.repo_finder.repo_finder_enums import RepoFinderInfo + +logger: logging.Logger = logging.getLogger(__name__) + + +def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]: + """Retrieve the repository URL that matches the given PyPI PURL. + + Parameters + ---------- + purl : PackageURL + The parsed PURL to convert to the repository path. + + Returns + ------- + tuple[str, RepoFinderOutcome] : + The repository URL for the passed package, if found, and the outcome to report. + """ + # TODO solve circular dependency + from macaron.slsa_analyzer.package_registry import PyPIRegistry # pylint: disable=import-outside-toplevel + + pypi_registry = PyPIRegistry() + pypi_registry.load_defaults() + json_endpoint = f"pypi/{purl.name}/json" + url = urllib.parse.urljoin(pypi_registry.registry_url, json_endpoint) + try: + json = pypi_registry.download_package_json(url) + except InvalidHTTPResponseError as error: + logger.debug(error) + # TODO improve accuracy of this outcome. + return "", RepoFinderInfo.PYPI_HTTP_ERROR + + url_dict = json_extract(json, ["info", "project_urls"], dict) + if not url_dict: + return "", RepoFinderInfo.PYPI_JSON_ERROR + + for url_key in url_dict: + url = url_dict[url_key] + parsed_url = urllib.parse.urlparse(url) + if not parsed_url.hostname: + continue + if not parsed_url.hostname.lower() == "github.com": + continue + split_path = parsed_url.path.split("/") + if not split_path or len(split_path) < 3: + continue + # Fix the URL so that it is the base GitHub URL. E.g. 
github.com/{owner}/{repo} + fixed_url = urllib.parse.ParseResult( + scheme=parsed_url.scheme, + netloc=parsed_url.netloc, + path=f"{split_path[1]}/{split_path[2]}", + params=parsed_url.params, + query=parsed_url.query, + fragment=parsed_url.fragment, + ).geturl() + logger.debug("Found repository URL from PyPI: %s", fixed_url) + return fixed_url, RepoFinderInfo.FOUND_FROM_PYPI + + return "", RepoFinderInfo.PYPI_NO_URLS From 07ea44d142592f8a83a981953de0d9cc1b21e605 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Tue, 11 Feb 2025 08:20:04 +1000 Subject: [PATCH 02/17] chore: avoid circular dependency Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- src/macaron/slsa_analyzer/analyzer.py | 2 +- .../package_registry/jfrog_maven_registry.py | 29 +---------------- .../maven_central_registry.py | 27 +--------------- .../package_registry/npm_registry.py | 32 +------------------ .../package_registry/package_registry.py | 13 +++++--- .../package_registry/pypi_registry.py | 26 +-------------- 6 files changed, 13 insertions(+), 116 deletions(-) diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index d17567110..b551d3f77 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -1018,7 +1018,7 @@ def _determine_package_registries(self, analyze_ctx: AnalyzeContext) -> None: ) for package_registry in PACKAGE_REGISTRIES: for build_tool in build_tools: - if package_registry.is_detected(build_tool): + if package_registry.is_detected(build_tool.name): analyze_ctx.dynamic_data["package_registries"].append( PackageRegistryInfo( build_tool=build_tool, diff --git a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py index f7a546911..ca0c92ac2 100644 --- a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py +++ 
b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py @@ -17,9 +17,6 @@ from macaron.config.defaults import defaults from macaron.errors import ConfigurationError from macaron.json_tools import JsonType -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool -from macaron.slsa_analyzer.build_tool.gradle import Gradle -from macaron.slsa_analyzer.build_tool.maven import Maven from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry logger: logging.Logger = logging.getLogger(__name__) @@ -126,6 +123,7 @@ def __init__( self.request_timeout = request_timeout or 10 self.download_timeout = download_timeout or 120 self.enabled = enabled or False + self.build_tool_names = {"maven", "gradle"} super().__init__("JFrog Maven Registry") def load_defaults(self) -> None: @@ -173,31 +171,6 @@ def load_defaults(self) -> None: self.enabled = True - def is_detected(self, build_tool: BaseBuildTool) -> bool: - """Detect if artifacts of the repo under analysis can possibly be published to this package registry. - - The detection here is based on the repo's detected build tool. - If the package registry is compatible with the given build tool, it can be a - possible place where the artifacts produced from the repo are published. - - ``JFrogMavenRegistry`` is compatible with Maven and Gradle. - - Parameters - ---------- - build_tool : BaseBuildTool - A detected build tool of the repository under analysis. - - Returns - ------- - bool - ``True`` if the repo under analysis can be published to this package registry, - based on the given build tool. - """ - if not self.enabled: - return False - compatible_build_tool_classes = [Maven, Gradle] - return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes) - def fetch_artifact_ids(self, group_id: str) -> list[str]: """Get all artifact ids under a group id. 
diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py index a73ef519c..fc5e3966d 100644 --- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py @@ -12,9 +12,6 @@ from macaron.config.defaults import defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool -from macaron.slsa_analyzer.build_tool.gradle import Gradle -from macaron.slsa_analyzer.build_tool.maven import Maven from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry from macaron.util import send_get_http_raw @@ -108,6 +105,7 @@ def __init__( self.registry_url_scheme = registry_url_scheme or "" self.registry_url = "" # Created from the registry_url_scheme and registry_url_netloc. self.request_timeout = request_timeout or 10 + self.build_tool_names = {"maven", "gradle"} super().__init__("Maven Central Registry") def load_defaults(self) -> None: @@ -159,29 +157,6 @@ def load_defaults(self) -> None: f"of the .ini configuration file is invalid: {error}", ) from error - def is_detected(self, build_tool: BaseBuildTool) -> bool: - """Detect if artifacts of the repo under analysis can possibly be published to this package registry. - - The detection here is based on the repo's detected build tools. - If the package registry is compatible with the given build tools, it can be a - possible place where the artifacts produced from the repo are published. - - ``MavenCentralRegistry`` is compatible with Maven and Gradle. - - Parameters - ---------- - build_tool : BaseBuildTool - A detected build tool of the repository under analysis. - - Returns - ------- - bool - ``True`` if the repo under analysis can be published to this package registry, - based on the given build tool. 
- """ - compatible_build_tool_classes = [Maven, Gradle] - return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes) - def find_publish_timestamp(self, purl: str) -> datetime: """Make a search request to Maven Central to find the publishing timestamp of an artifact. diff --git a/src/macaron/slsa_analyzer/package_registry/npm_registry.py b/src/macaron/slsa_analyzer/package_registry/npm_registry.py index f200bb5e0..7cbeb7913 100644 --- a/src/macaron/slsa_analyzer/package_registry/npm_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/npm_registry.py @@ -12,9 +12,6 @@ from macaron.config.defaults import defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool -from macaron.slsa_analyzer.build_tool.npm import NPM -from macaron.slsa_analyzer.build_tool.yarn import Yarn from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry from macaron.util import send_get_http_raw @@ -53,6 +50,7 @@ def __init__( self.attestation_endpoint = attestation_endpoint or "" self.request_timeout = request_timeout or 10 self.enabled = enabled + self.build_tool_names = {"npm", "yarn"} super().__init__("npm Registry") def load_defaults(self) -> None: @@ -95,34 +93,6 @@ def load_defaults(self) -> None: f"of the .ini configuration file is invalid: {error}", ) from error - def is_detected(self, build_tool: BaseBuildTool) -> bool: - """Detect if artifacts under analysis can be published to this package registry. - - The detection here is based on the repo's detected build tools. - If the package registry is compatible with the given build tools, it can be a - possible place where the artifacts are published. - - ``NPMRegistry`` is compatible with npm and Yarn build tools. - - Note: if the npm registry is disabled through the ini configuration, this method returns False. 
- - Parameters - ---------- - build_tool : BaseBuildTool - A detected build tool of the repository under analysis. - - Returns - ------- - bool - ``True`` if the repo under analysis can be published to this package registry, - based on the given build tool. - """ - if not self.enabled: - logger.debug("Support for the npm registry is disabled.") - return False - compatible_build_tool_classes = [NPM, Yarn] - return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes) - def download_attestation_payload(self, url: str, download_path: str) -> bool: """Download the npm attestation from npm registry. diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py index 146958252..fd943fb3d 100644 --- a/src/macaron/slsa_analyzer/package_registry/package_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py @@ -9,7 +9,6 @@ from macaron.errors import InvalidHTTPResponseError from macaron.json_tools import json_extract -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService logger: logging.Logger = logging.getLogger(__name__) @@ -20,13 +19,14 @@ class PackageRegistry(ABC): def __init__(self, name: str) -> None: self.name = name + self.build_tool_names: set[str] = set() + self.enabled: bool = True @abstractmethod def load_defaults(self) -> None: """Load the .ini configuration for the current package registry.""" - @abstractmethod - def is_detected(self, build_tool: BaseBuildTool) -> bool: + def is_detected(self, build_tool_name: str) -> bool: """Detect if artifacts of the repo under analysis can possibly be published to this package registry. The detection here is based on the repo's detected build tool. 
@@ -35,8 +35,8 @@ def is_detected(self, build_tool: BaseBuildTool) -> bool: Parameters ---------- - build_tool : BaseBuildTool - A detected build tool of the repository under analysis. + build_tool_name: str + The name of a detected build tool of the repository under analysis. Returns ------- @@ -44,6 +44,9 @@ def is_detected(self, build_tool: BaseBuildTool) -> bool: ``True`` if the repo under analysis can be published to this package registry, based on the given build tool. """ + if not self.enabled: + return False + return build_tool_name in self.build_tool_names def find_publish_timestamp(self, purl: str) -> datetime: """Retrieve the publication timestamp for a package specified by its purl from the deps.dev repository by default. diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index e349663b0..69c35a55e 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -21,8 +21,6 @@ from macaron.errors import ConfigurationError, InvalidHTTPResponseError from macaron.json_tools import json_extract from macaron.malware_analyzer.datetime_parser import parse_datetime -from macaron.slsa_analyzer.build_tool import Pip, Poetry -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry from macaron.util import send_get_http_raw @@ -75,6 +73,7 @@ def __init__( self.request_timeout = request_timeout or 10 self.enabled = enabled self.registry_url = "" + self.build_tool_names = {"pip", "poetry"} super().__init__("PyPI Registry") def load_defaults(self) -> None: @@ -129,29 +128,6 @@ def load_defaults(self) -> None: f"of the .ini configuration file is invalid: {error}", ) from error - def is_detected(self, build_tool: BaseBuildTool) -> bool: - """Detect if artifacts of the repo under analysis can possibly be 
published to this package registry. - - The detection here is based on the repo's detected build tools. - If the package registry is compatible with the given build tools, it can be a - possible place where the artifacts produced from the repo are published. - - ``PyPIRegistry`` is compatible with Pip and Poetry. - - Parameters - ---------- - build_tool: BaseBuildTool - A detected build tool of the repository under analysis. - - Returns - ------- - bool - ``True`` if the repo under analysis can be published to this package registry, - based on the given build tool. - """ - compatible_build_tool_classes = [Pip, Poetry] - return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes) - def download_package_json(self, url: str) -> dict: """Download the package JSON metadata from pypi registry. From 52d3c59f6b771809899dffd45360a361dc90e010 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Tue, 11 Feb 2025 08:21:23 +1000 Subject: [PATCH 03/17] chore: add alternative find repo for latest purl version also Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- src/macaron/repo_finder/repo_finder.py | 24 +++++++++++++++++---- src/macaron/repo_finder/repo_finder_pypi.py | 11 +++++----- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py index c5a5cc2cf..9f367ee14 100644 --- a/src/macaron/repo_finder/repo_finder.py +++ b/src/macaron/repo_finder/repo_finder.py @@ -103,10 +103,8 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, logger.debug("Analyzing %s with Repo Finder: %s", purl, type(repo_finder)) found_repo, outcome = repo_finder.find_repo(purl) - if not found_repo and purl.type == "pypi": - found_repo, outcome = repo_finder_pypi.find_repo(purl) - if not found_repo: - logger.debug("Could not find repository from PyPI registry for PURL: %s", purl) + if not 
found_repo: + found_repo, outcome = find_repo_alternative(purl, outcome) if check_latest_version and not defaults.getboolean("repofinder", "try_latest_purl", fallback=True): check_latest_version = False @@ -122,6 +120,12 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, return "", RepoFinderInfo.NO_NEWER_VERSION found_repo, outcome = DepsDevRepoFinder().find_repo(latest_version_purl) + if found_repo: + return found_repo, outcome + + if not found_repo: + found_repo, outcome = find_repo_alternative(latest_version_purl, outcome) + if not found_repo: logger.debug("Could not find repo from latest version of PURL: %s", latest_version_purl) return "", RepoFinderInfo.LATEST_VERSION_INVALID @@ -129,6 +133,18 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, return found_repo, outcome +def find_repo_alternative(purl: PackageURL, outcome: RepoFinderInfo) -> tuple[str, RepoFinderInfo]: + """Use PURL type specific methods to find the repository when the standard methods have failed.""" + found_repo = "" + if purl.type == "pypi": + found_repo, outcome = repo_finder_pypi.find_repo(purl) + + if not found_repo: + logger.debug("Could not find repository using type specific (%s) methods for PURL: %s", purl.type, purl) + + return found_repo, outcome + + def to_repo_path(purl: PackageURL, available_domains: list[str]) -> str | None: """Return the repository path from the PURL string. 
diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index 40c042415..70722310f 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -10,6 +10,7 @@ from macaron.errors import InvalidHTTPResponseError from macaron.json_tools import json_extract from macaron.repo_finder.repo_finder_enums import RepoFinderInfo +from macaron.slsa_analyzer.package_registry import PyPIRegistry logger: logging.Logger = logging.getLogger(__name__) @@ -27,9 +28,6 @@ def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]: tuple[str, RepoFinderOutcome] : The repository URL for the passed package, if found, and the outcome to report. """ - # TODO solve circular dependency - from macaron.slsa_analyzer.package_registry import PyPIRegistry # pylint: disable=import-outside-toplevel - pypi_registry = PyPIRegistry() pypi_registry.load_defaults() json_endpoint = f"pypi/{purl.name}/json" @@ -52,14 +50,15 @@ def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]: continue if not parsed_url.hostname.lower() == "github.com": continue - split_path = parsed_url.path.split("/") - if not split_path or len(split_path) < 3: + # The path starts with a "/". + split_path = parsed_url.path[1:].split("/") + if not split_path or len(split_path) < 2: continue # Fix the URL so that it is the base GitHub URL. E.g. 
github.com/{owner}/{repo} fixed_url = urllib.parse.ParseResult( scheme=parsed_url.scheme, netloc=parsed_url.netloc, - path=f"{split_path[1]}/{split_path[2]}", + path=f"{split_path[0]}/{split_path[1]}", params=parsed_url.params, query=parsed_url.query, fragment=parsed_url.fragment, From cd724cf248777082d0ecac6673cabe5e1b34d832 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Tue, 11 Feb 2025 08:50:53 +1000 Subject: [PATCH 04/17] chore: add integration test Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- .../cases/repo_finder_pypi/policy.dl | 10 ++++++++++ .../cases/repo_finder_pypi/test.yaml | 20 +++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 tests/integration/cases/repo_finder_pypi/policy.dl create mode 100644 tests/integration/cases/repo_finder_pypi/test.yaml diff --git a/tests/integration/cases/repo_finder_pypi/policy.dl b/tests/integration/cases/repo_finder_pypi/policy.dl new file mode 100644 index 000000000..38b2dd9f4 --- /dev/null +++ b/tests/integration/cases/repo_finder_pypi/policy.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_passed(component_id, "mcn_version_control_system_1"). + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:pypi/torch@2.6.0"). diff --git a/tests/integration/cases/repo_finder_pypi/test.yaml b/tests/integration/cases/repo_finder_pypi/test.yaml new file mode 100644 index 000000000..d3cf1c557 --- /dev/null +++ b/tests/integration/cases/repo_finder_pypi/test.yaml @@ -0,0 +1,20 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+ +description: | + Analyzing a PyPI PURL that is not correctly found by deps.dev and must be sought on the package registry directly. + +tags: +- macaron-python-package + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/torch@2.6.0 +- name: Run macaron verify-policy to verify passed/failed checks + kind: verify + options: + policy: policy.dl From c7492fba6ce31ed6bfa6f7aeb145ec20e8000183 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Tue, 11 Feb 2025 08:57:37 +1000 Subject: [PATCH 05/17] chore: update tests Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- .../package_registry/test_jfrog_maven_registry.py | 6 +++--- .../package_registry/test_maven_central_registry.py | 6 ++---- .../slsa_analyzer/package_registry/test_npm_registry.py | 9 +++------ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py b/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py index ebb960366..ef7276dcf 100644 --- a/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py +++ b/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Tests for the ``JFrogMavenRegistry`` class.""" @@ -129,12 +129,12 @@ def test_is_detected( expected_result: bool, ) -> None: """Test the ``is_detected`` method.""" - assert jfrog_maven.is_detected(build_tool) == expected_result + assert jfrog_maven.is_detected(build_tool.name) == expected_result # The method always returns False when the jfrog_maven instance is not enabled # (in the ini config). 
jfrog_maven.enabled = False - assert jfrog_maven.is_detected(build_tool) is False + assert jfrog_maven.is_detected(build_tool.name) is False @pytest.mark.parametrize( diff --git a/tests/slsa_analyzer/package_registry/test_maven_central_registry.py b/tests/slsa_analyzer/package_registry/test_maven_central_registry.py index 8a0287b36..62b9fdca0 100644 --- a/tests/slsa_analyzer/package_registry/test_maven_central_registry.py +++ b/tests/slsa_analyzer/package_registry/test_maven_central_registry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Tests for the Maven Central registry.""" @@ -14,7 +14,6 @@ from macaron.config.defaults import load_defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.package_registry.maven_central_registry import MavenCentralRegistry @@ -124,12 +123,11 @@ def test_load_defaults_with_invalid_config(tmp_path: Path, user_config_input: st ) def test_is_detected( maven_central: MavenCentralRegistry, - build_tools: dict[str, BaseBuildTool], build_tool_name: str, expected_result: bool, ) -> None: """Test the ``is_detected`` method.""" - assert maven_central.is_detected(build_tools[build_tool_name]) == expected_result + assert maven_central.is_detected(build_tool_name) == expected_result @pytest.mark.parametrize( diff --git a/tests/slsa_analyzer/package_registry/test_npm_registry.py b/tests/slsa_analyzer/package_registry/test_npm_registry.py index a6cadb4ba..a180ea78b 100644 --- a/tests/slsa_analyzer/package_registry/test_npm_registry.py +++ b/tests/slsa_analyzer/package_registry/test_npm_registry.py @@ -13,7 +13,6 @@ from macaron.config.defaults import load_defaults from 
macaron.errors import ConfigurationError, InvalidHTTPResponseError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.build_tool.npm import NPM from macaron.slsa_analyzer.package_registry.npm_registry import NPMAttestationAsset, NPMRegistry @@ -45,7 +44,7 @@ def test_disable_npm_registry(npm_registry: NPMRegistry, tmp_path: Path, npm_too npm_registry.load_defaults() assert npm_registry.enabled is False - assert npm_registry.is_detected(build_tool=npm_tool) is False + assert npm_registry.is_detected(npm_tool.name) is False @pytest.mark.parametrize( @@ -87,12 +86,10 @@ def test_npm_registry_invalid_config(npm_registry: NPMRegistry, tmp_path: Path, ("maven", False), ], ) -def test_is_detected( - npm_registry: NPMRegistry, build_tools: dict[str, BaseBuildTool], build_tool_name: str, expected: bool -) -> None: +def test_is_detected(npm_registry: NPMRegistry, build_tool_name: str, expected: bool) -> None: """Test that the registry is correctly detected for a build tool.""" npm_registry.load_defaults() - assert npm_registry.is_detected(build_tool=build_tools[build_tool_name]) == expected + assert npm_registry.is_detected(build_tool_name) == expected @pytest.mark.parametrize( From e38975a39edb3ed17e2ee4d29574016501141fa5 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Tue, 11 Feb 2025 09:04:38 +1000 Subject: [PATCH 06/17] chore: pass build tool names to super class Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- .../slsa_analyzer/package_registry/jfrog_maven_registry.py | 3 +-- .../package_registry/maven_central_registry.py | 3 +-- src/macaron/slsa_analyzer/package_registry/npm_registry.py | 3 +-- .../slsa_analyzer/package_registry/package_registry.py | 7 +++++-- .../slsa_analyzer/package_registry/pypi_registry.py | 3 +-- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py 
b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py index ca0c92ac2..02188de1d 100644 --- a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py @@ -123,8 +123,7 @@ def __init__( self.request_timeout = request_timeout or 10 self.download_timeout = download_timeout or 120 self.enabled = enabled or False - self.build_tool_names = {"maven", "gradle"} - super().__init__("JFrog Maven Registry") + super().__init__("JFrog Maven Registry", {"maven", "gradle"}) def load_defaults(self) -> None: """Load the .ini configuration for the current package registry. diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py index fc5e3966d..131051b66 100644 --- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py @@ -105,8 +105,7 @@ def __init__( self.registry_url_scheme = registry_url_scheme or "" self.registry_url = "" # Created from the registry_url_scheme and registry_url_netloc. self.request_timeout = request_timeout or 10 - self.build_tool_names = {"maven", "gradle"} - super().__init__("Maven Central Registry") + super().__init__("Maven Central Registry", {"maven", "gradle"}) def load_defaults(self) -> None: """Load the .ini configuration for the current package registry. 
diff --git a/src/macaron/slsa_analyzer/package_registry/npm_registry.py b/src/macaron/slsa_analyzer/package_registry/npm_registry.py index 7cbeb7913..fe009cc34 100644 --- a/src/macaron/slsa_analyzer/package_registry/npm_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/npm_registry.py @@ -50,8 +50,7 @@ def __init__( self.attestation_endpoint = attestation_endpoint or "" self.request_timeout = request_timeout or 10 self.enabled = enabled - self.build_tool_names = {"npm", "yarn"} - super().__init__("npm Registry") + super().__init__("npm Registry", {"npm", "yarn"}) def load_defaults(self) -> None: """Load the .ini configuration for the current package registry. diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py index fd943fb3d..7fbbf4258 100644 --- a/src/macaron/slsa_analyzer/package_registry/package_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py @@ -17,9 +17,9 @@ class PackageRegistry(ABC): """Base package registry class.""" - def __init__(self, name: str) -> None: + def __init__(self, name: str, build_tool_names: set[str]) -> None: self.name = name - self.build_tool_names: set[str] = set() + self.build_tool_names = build_tool_names self.enabled: bool = True @abstractmethod @@ -44,6 +44,9 @@ def is_detected(self, build_tool_name: str) -> bool: ``True`` if the repo under analysis can be published to this package registry, based on the given build tool. 
""" + print() + print(f"{build_tool_name} in {self.build_tool_names} ?") + print() if not self.enabled: return False return build_tool_name in self.build_tool_names diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 69c35a55e..f8316dc8f 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -73,8 +73,7 @@ def __init__( self.request_timeout = request_timeout or 10 self.enabled = enabled self.registry_url = "" - self.build_tool_names = {"pip", "poetry"} - super().__init__("PyPI Registry") + super().__init__("PyPI Registry", {"pip", "poetry"}) def load_defaults(self) -> None: """Load the .ini configuration for the current package registry. From 93982955c55a89cba7c24541ceec8631fed749d5 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Thu, 13 Feb 2025 14:52:39 +1000 Subject: [PATCH 07/17] chore: reuse PyPI JSON asset Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- .../metadata/closer_release_join_date.py | 2 +- src/macaron/repo_finder/repo_finder.py | 32 +++++++++-- src/macaron/repo_finder/repo_finder_enums.py | 3 + src/macaron/repo_finder/repo_finder_pypi.py | 42 +++++++++----- src/macaron/slsa_analyzer/analyzer.py | 57 ++++++++++++++----- .../checks/detect_malicious_metadata_check.py | 19 +++++-- .../package_registry/package_registry.py | 3 - .../package_registry/pypi_registry.py | 14 +++-- .../specs/package_registry_spec.py | 5 +- 9 files changed, 127 insertions(+), 50 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py index 4ff41a619..bfa9a0704 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py +++ 
b/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py @@ -95,7 +95,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes The result and related information collected during the analysis. """ maintainers_join_date: list[datetime] | None = self._get_maintainers_join_date( - pypi_package_json.pypi_registry, pypi_package_json.component.name + pypi_package_json.pypi_registry, pypi_package_json.component_name ) latest_release_date: datetime | None = self._get_latest_release_date(pypi_package_json) detail_info: dict[str, JsonType] = { diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py index 9f367ee14..081ff68cf 100644 --- a/src/macaron/repo_finder/repo_finder.py +++ b/src/macaron/repo_finder/repo_finder.py @@ -66,11 +66,14 @@ list_remote_references, resolve_local_path, ) +from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo logger: logging.Logger = logging.getLogger(__name__) -def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, RepoFinderInfo]: +def find_repo( + purl: PackageURL, check_latest_version: bool = True, all_package_registries: list[PackageRegistryInfo] | None = None +) -> tuple[str, RepoFinderInfo]: """Retrieve the repository URL that matches the given PURL. Parameters @@ -79,6 +82,8 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, The parsed PURL to convert to the repository path. check_latest_version: bool A flag that determines whether the latest version of the PURL is also checked. + all_package_registries: list[PackageRegistryInfo] | None + The list of package registries, if any. 
Returns ------- @@ -104,7 +109,7 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, found_repo, outcome = repo_finder.find_repo(purl) if not found_repo: - found_repo, outcome = find_repo_alternative(purl, outcome) + found_repo, outcome = find_repo_alternative(purl, outcome, all_package_registries) if check_latest_version and not defaults.getboolean("repofinder", "try_latest_purl", fallback=True): check_latest_version = False @@ -133,11 +138,28 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, return found_repo, outcome -def find_repo_alternative(purl: PackageURL, outcome: RepoFinderInfo) -> tuple[str, RepoFinderInfo]: - """Use PURL type specific methods to find the repository when the standard methods have failed.""" +def find_repo_alternative( + purl: PackageURL, outcome: RepoFinderInfo, all_package_registries: list[PackageRegistryInfo] | None = None +) -> tuple[str, RepoFinderInfo]: + """Use PURL type specific methods to find the repository when the standard methods have failed. + + Parameters + ---------- + purl : PackageURL + The parsed PURL to convert to the repository path. + outcome: RepoFinderInfo + A previous outcome to report if this method does nothing. + all_package_registries: list[PackageRegistryInfo] | None + The list of package registries, if any. + + Returns + ------- + tuple[str, RepoFinderOutcome] : + The repository URL for the passed package, if found, and the outcome to report. 
+ """ found_repo = "" if purl.type == "pypi": - found_repo, outcome = repo_finder_pypi.find_repo(purl) + found_repo, outcome = repo_finder_pypi.find_repo(purl, all_package_registries) if not found_repo: logger.debug("Could not find repository using type specific (%s) methods for PURL: %s", purl.type, purl) diff --git a/src/macaron/repo_finder/repo_finder_enums.py b/src/macaron/repo_finder/repo_finder_enums.py index 7dff875c6..43e8d5e8b 100644 --- a/src/macaron/repo_finder/repo_finder_enums.py +++ b/src/macaron/repo_finder/repo_finder_enums.py @@ -66,6 +66,9 @@ class RepoFinderInfo(Enum): #: Reported if there was no matching URLs in the JSON returned by the PyPI registry. PYPI_NO_URLS = "PyPI no matching URLs" + #: Reported if the PyPI registry is disabled or not present in the list of package registries. + PYPI_NO_REGISTRY = "PyPI registry disabled or absent" + #: Reported if the provided PURL did not produce a result, but a more recent version could not be found. NO_NEWER_VERSION = "No newer version than provided which failed" diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index 70722310f..537e3297d 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -7,39 +7,55 @@ from packageurl import PackageURL -from macaron.errors import InvalidHTTPResponseError -from macaron.json_tools import json_extract from macaron.repo_finder.repo_finder_enums import RepoFinderInfo -from macaron.slsa_analyzer.package_registry import PyPIRegistry +from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, PyPIRegistry +from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset +from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo logger: logging.Logger = logging.getLogger(__name__) -def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]: +def find_repo( + purl: PackageURL, all_package_registries: 
list[PackageRegistryInfo] | None = None +) -> tuple[str, RepoFinderInfo]: """Retrieve the repository URL that matches the given PyPI PURL. Parameters ---------- purl : PackageURL The parsed PURL to convert to the repository path. + all_package_registries: list[PackageRegistryInfo] | None + The context of the current analysis, if any. Returns ------- tuple[str, RepoFinderOutcome] : The repository URL for the passed package, if found, and the outcome to report. """ - pypi_registry = PyPIRegistry() + pypi_registry = next((registry for registry in PACKAGE_REGISTRIES if isinstance(registry, PyPIRegistry)), None) + if not pypi_registry: + return "", RepoFinderInfo.PYPI_NO_REGISTRY + pypi_registry.load_defaults() - json_endpoint = f"pypi/{purl.name}/json" - url = urllib.parse.urljoin(pypi_registry.registry_url, json_endpoint) - try: - json = pypi_registry.download_package_json(url) - except InvalidHTTPResponseError as error: - logger.debug(error) - # TODO improve accuracy of this outcome. + pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, pypi_registry, {}) + if not pypi_asset.download(dest=""): return "", RepoFinderInfo.PYPI_HTTP_ERROR - url_dict = json_extract(json, ["info", "project_urls"], dict) + if all_package_registries: + # Find the package registry info object that contains the PyPI registry and has the pypi build tool. + registry_info = next( + ( + info + for info in all_package_registries + if info.package_registry == pypi_registry and info.build_tool_name == "pypi" + ), + None, + ) + if registry_info: + # Save the asset for later use. 
+ registry_info.metadata.append(pypi_asset) + + url_dict = pypi_asset.get_project_links() if not url_dict: return "", RepoFinderInfo.PYPI_JSON_ERROR diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index b551d3f77..e9bece9ca 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -353,6 +353,9 @@ def run_single( status=SCMStatus.ANALYSIS_FAILED, ) + # Pre-populate all package registries so assets can be stored for later. + all_package_registries = self._populate_package_registry_info() + provenance_is_verified = False if not provenance_payload and parsed_purl: # Try to find the provenance file for the parsed PURL. @@ -385,7 +388,12 @@ def run_single( available_domains = [git_service.hostname for git_service in GIT_SERVICES if git_service.hostname] try: analysis_target = Analyzer.to_analysis_target( - config, available_domains, parsed_purl, provenance_repo_url, provenance_commit_digest + config, + available_domains, + parsed_purl, + provenance_repo_url, + provenance_commit_digest, + all_package_registries, ) except InvalidAnalysisTargetError as error: return Record( @@ -474,7 +482,7 @@ def run_single( self._determine_build_tools(analyze_ctx, git_service) if parsed_purl is not None: self._verify_repository_link(parsed_purl, analyze_ctx) - self._determine_package_registries(analyze_ctx) + self._determine_package_registries(analyze_ctx, all_package_registries) provenance_l3_verified = False if not provenance_payload: @@ -802,6 +810,7 @@ def to_analysis_target( parsed_purl: PackageURL | None, provenance_repo_url: str | None = None, provenance_commit_digest: str | None = None, + all_package_registries: list[PackageRegistryInfo] | None = None, ) -> AnalysisTarget: """Resolve the details of a software component from user input. @@ -818,6 +827,8 @@ def to_analysis_target( The repository URL extracted from provenance, or None if not found or no provenance. 
provenance_commit_digest: str | None The commit extracted from provenance, or None if not found or no provenance. + all_package_registries: list[PackageRegistryInfo] | None + The list of all package registries. Returns ------- @@ -860,7 +871,9 @@ def to_analysis_target( converted_repo_path = repo_finder.to_repo_path(parsed_purl, available_domains) if converted_repo_path is None: # Try to find repo from PURL - repo, repo_finder_outcome = repo_finder.find_repo(parsed_purl) + repo, repo_finder_outcome = repo_finder.find_repo( + parsed_purl, all_package_registries=all_package_registries + ) return Analyzer.AnalysisTarget( parsed_purl=parsed_purl, @@ -1011,20 +1024,38 @@ def _determine_ci_services(self, analyze_ctx: AnalyzeContext, git_service: BaseG ) ) - def _determine_package_registries(self, analyze_ctx: AnalyzeContext) -> None: + def _populate_package_registry_info(self) -> list[PackageRegistryInfo]: + """Add all possible package registries to the analysis context.""" + package_registries = [] + for package_registry in PACKAGE_REGISTRIES: + for build_tool in BUILD_TOOLS: + build_tool_name = build_tool.name + if build_tool_name not in package_registry.build_tool_names: + continue + package_registries.append( + PackageRegistryInfo( + build_tool_name=build_tool_name, + package_registry=package_registry, + ) + ) + return package_registries + + def _determine_package_registries( + self, analyze_ctx: AnalyzeContext, all_package_registries: list[PackageRegistryInfo] + ) -> None: """Determine the package registries used by the software component based on its build tools.""" build_tools = ( analyze_ctx.dynamic_data["build_spec"]["tools"] or analyze_ctx.dynamic_data["build_spec"]["purl_tools"] ) - for package_registry in PACKAGE_REGISTRIES: - for build_tool in build_tools: - if package_registry.is_detected(build_tool.name): - analyze_ctx.dynamic_data["package_registries"].append( - PackageRegistryInfo( - build_tool=build_tool, - package_registry=package_registry, - ) - ) + 
build_tool_names = {build_tool.name for build_tool in build_tools} + relevant_package_registries = [] + for package_registry in all_package_registries: + if package_registry.build_tool_name not in build_tool_names: + continue + relevant_package_registries.append(package_registry) + + # Assign the updated list of registries. + analyze_ctx.dynamic_data["package_registries"] = relevant_package_registries def _verify_repository_link(self, parsed_purl: PackageURL, analyze_ctx: AnalyzeContext) -> None: """Verify whether the claimed repository links back to the artifact.""" diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 80439bb79..26ae8937e 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -29,8 +29,6 @@ from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer from macaron.slsa_analyzer.analyze_context import AnalyzeContext -from macaron.slsa_analyzer.build_tool.pip import Pip -from macaron.slsa_analyzer.build_tool.poetry import Poetry from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService @@ -260,14 +258,23 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: match package_registry_info_entry: # Currently, only PyPI packages are supported. case PackageRegistryInfo( - build_tool=Pip() | Poetry(), + build_tool_name="pip" | "poetry", package_registry=PyPIRegistry() as pypi_registry, ) as pypi_registry_info: - # Create an AssetLocator object for the PyPI package JSON object. 
- pypi_package_json = PyPIPackageJsonAsset( - component=ctx.component, pypi_registry=pypi_registry, package_json={} + # Retrieve the pre-existing AssetLocator object for the PyPI package JSON object, if it exists. + pypi_package_json = next( + (asset for asset in pypi_registry_info.metadata if isinstance(asset, PyPIPackageJsonAsset)), + None, ) + if not pypi_package_json: + # Create an AssetLocator object for the PyPI package JSON object. + pypi_package_json = PyPIPackageJsonAsset( + component_name=ctx.component.name, + component_version=ctx.component.version, + pypi_registry=pypi_registry, + package_json={}, + ) pypi_registry_info.metadata.append(pypi_package_json) diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py index 7fbbf4258..9e71fc595 100644 --- a/src/macaron/slsa_analyzer/package_registry/package_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py @@ -44,9 +44,6 @@ def is_detected(self, build_tool_name: str) -> bool: ``True`` if the repo under analysis can be published to this package registry, based on the given build tool. 
""" - print() - print(f"{build_tool_name} in {self.build_tool_names} ?") - print() if not self.enabled: return False return build_tool_name in self.build_tool_names diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index f8316dc8f..77d2c8b8a 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -17,7 +17,6 @@ from requests import RequestException from macaron.config.defaults import defaults -from macaron.database.table_definitions import Component from macaron.errors import ConfigurationError, InvalidHTTPResponseError from macaron.json_tools import json_extract from macaron.malware_analyzer.datetime_parser import parse_datetime @@ -341,8 +340,11 @@ def get_maintainer_join_date(self, username: str) -> datetime | None: class PyPIPackageJsonAsset: """The package JSON hosted on the PyPI registry.""" - #: The target pypi software component. - component: Component + #: The target pypi software component name. + component_name: str + + #: The target pypi software component version. + component_version: str | None #: The pypi registry. pypi_registry: PyPIRegistry @@ -372,7 +374,7 @@ def url(self) -> str: ------- str """ - json_endpoint = f"pypi/{self.component.name}/json" + json_endpoint = f"pypi/{self.component_name}/json" return urllib.parse.urljoin(self.pypi_registry.registry_url, json_endpoint) def download(self, dest: str) -> bool: # pylint: disable=unused-argument @@ -434,8 +436,8 @@ def get_sourcecode_url(self) -> str | None: The URL of the source distribution. """ urls: list | None = None - if self.component.version: - urls = json_extract(self.package_json, ["releases", self.component.version], list) + if self.component_version: + urls = json_extract(self.package_json, ["releases", self.component_version], list) else: # Get the latest version. 
urls = json_extract(self.package_json, ["urls"], list) diff --git a/src/macaron/slsa_analyzer/specs/package_registry_spec.py b/src/macaron/slsa_analyzer/specs/package_registry_spec.py index e28d9c6d8..ecd91d2b8 100644 --- a/src/macaron/slsa_analyzer/specs/package_registry_spec.py +++ b/src/macaron/slsa_analyzer/specs/package_registry_spec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. @@ -7,7 +7,6 @@ from dataclasses import dataclass, field from macaron.slsa_analyzer.asset import AssetLocator -from macaron.slsa_analyzer.build_tool import BaseBuildTool from macaron.slsa_analyzer.package_registry import PackageRegistry from macaron.slsa_analyzer.provenance.provenance import DownloadedProvenanceData @@ -17,7 +16,7 @@ class PackageRegistryInfo: """This class contains data for one package registry that is matched against a repository.""" #: The build tool matched against the repository. - build_tool: BaseBuildTool + build_tool_name: str #: The package registry matched against the repository. This is dependent on the build tool detected. package_registry: PackageRegistry #: The provenances matched against the current repo. 
From ae950efea01af3d08958d003889684c1a5330e9e Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Thu, 13 Feb 2025 15:24:22 +1000 Subject: [PATCH 08/17] chore: update tests Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- .../pypi_heuristics/metadata/wheel_absence.py | 2 +- .../malware_analyzer/pypi/test_closer_release_join_date.py | 5 ++++- tests/malware_analyzer/pypi/test_wheel_absence.py | 7 ++++--- .../checks/test_detect_malicious_metadata_check.py | 5 ++--- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py index 2a8217353..3a3033e22 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py @@ -61,7 +61,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes logger.debug(error_msg) raise HeuristicAnalyzerValueError(error_msg) - version = pypi_package_json.component.version + version = pypi_package_json.component_version if version is None: # check latest release version version = pypi_package_json.get_latest_version() diff --git a/tests/malware_analyzer/pypi/test_closer_release_join_date.py b/tests/malware_analyzer/pypi/test_closer_release_join_date.py index 4ed1a9b24..309574a21 100644 --- a/tests/malware_analyzer/pypi/test_closer_release_join_date.py +++ b/tests/malware_analyzer/pypi/test_closer_release_join_date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""Tests for closer release join date heuristic.""" @@ -17,6 +17,7 @@ def test_analyze_pass(pypi_package_json: MagicMock) -> None: pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = ["maintainer1", "maintainer2"] pypi_package_json.pypi_registry.get_maintainer_join_date.side_effect = [datetime(2018, 1, 1), datetime(2019, 1, 1)] pypi_package_json.get_latest_release_upload_time.return_value = "2022-06-20T12:00:00" + pypi_package_json.component_name = "mock1" # Call the method. result, detail_info = analyzer.analyze(pypi_package_json) @@ -35,6 +36,7 @@ def test_analyze_process(pypi_package_json: MagicMock) -> None: pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = ["maintainer1"] pypi_package_json.pypi_registry.get_maintainer_join_date.side_effect = [datetime(2022, 6, 18)] pypi_package_json.get_latest_release_upload_time.return_value = "2022-06-20T12:00:00" + pypi_package_json.component_name = "mock1" # Call the method. result, detail_info = analyzer.analyze(pypi_package_json) @@ -52,6 +54,7 @@ def test_analyze_skip(pypi_package_json: MagicMock) -> None: # Set up mock return values. pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = None pypi_package_json.get_latest_release_upload_time.return_value = "2022-06-20T12:00:00" + pypi_package_json.component_name = "mock1" # Call the method. 
result, detail_info = analyzer.analyze(pypi_package_json) diff --git a/tests/malware_analyzer/pypi/test_wheel_absence.py b/tests/malware_analyzer/pypi/test_wheel_absence.py index a2eebd554..3cfccfbe7 100644 --- a/tests/malware_analyzer/pypi/test_wheel_absence.py +++ b/tests/malware_analyzer/pypi/test_wheel_absence.py @@ -67,10 +67,11 @@ def test_analyze_tar_present(mock_send_head_http_raw: MagicMock, pypi_package_js pypi_package_json.get_releases.return_value = release pypi_package_json.get_latest_version.return_value = version - pypi_package_json.component.version = None + pypi_package_json.component_version = None pypi_package_json.package_json = {"info": {"name": "ttttttttest_nester"}} pypi_package_json.pypi_registry.inspector_url_scheme = "https" pypi_package_json.pypi_registry.inspector_url_netloc = "inspector.pypi.io" + mock_send_head_http_raw.return_value = MagicMock() # assume valid URL for testing purposes expected_detail_info = { @@ -126,7 +127,7 @@ def test_analyze_whl_present(mock_send_head_http_raw: MagicMock, pypi_package_js } pypi_package_json.get_releases.return_value = release - pypi_package_json.component.version = version + pypi_package_json.component_version = version pypi_package_json.package_json = {"info": {"name": "ttttttttest_nester"}} pypi_package_json.pypi_registry.inspector_url_scheme = "https" pypi_package_json.pypi_registry.inspector_url_netloc = "inspector.pypi.io" @@ -214,7 +215,7 @@ def test_analyze_both_present(mock_send_head_http_raw: MagicMock, pypi_package_j } pypi_package_json.get_releases.return_value = release - pypi_package_json.component.version = version + pypi_package_json.component_version = version pypi_package_json.package_json = {"info": {"name": "ttttttttest_nester"}} pypi_package_json.pypi_registry.inspector_url_scheme = "https" pypi_package_json.pypi_registry.inspector_url_netloc = "inspector.pypi.io" diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py 
b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py index c6ecb044d..8f15c636a 100644 --- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py +++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py @@ -12,7 +12,6 @@ from pytest_httpserver import HTTPServer from macaron.config.defaults import load_defaults -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.checks.detect_malicious_metadata_check import DetectMaliciousMetadataCheck from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIRegistry @@ -35,7 +34,7 @@ ], ) def test_detect_malicious_metadata( - httpserver: HTTPServer, tmp_path: Path, pip_tool: BaseBuildTool, macaron_path: Path, purl: str, expected: str + httpserver: HTTPServer, tmp_path: Path, macaron_path: Path, purl: str, expected: str ) -> None: """Test that the check handles repositories correctly.""" check = DetectMaliciousMetadataCheck() @@ -43,7 +42,7 @@ def test_detect_malicious_metadata( # Set up the context object with PyPIRegistry instance. ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl) pypi_registry = PyPIRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(pip_tool, pypi_registry)] + ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", pypi_registry)] # Set up responses of PyPI endpoints using the httpserver plugin. 
with open(os.path.join(RESOURCE_PATH, "pypi_files", "zlibxjson.html"), encoding="utf8") as page: From bc03bbb99e4aa2a3cc3591bf858b91a4be737b76 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Thu, 13 Feb 2025 16:04:15 +1000 Subject: [PATCH 09/17] chore: add purl type to build tool in registry info Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- src/macaron/slsa_analyzer/analyzer.py | 1 + .../checks/detect_malicious_metadata_check.py | 1 + .../checks/infer_artifact_pipeline_check.py | 2 +- .../slsa_analyzer/specs/package_registry_spec.py | 4 +++- .../test_detect_malicious_metadata_check.py | 2 +- .../checks/test_repo_verification_check.py | 16 +++++++++++----- 6 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index e9bece9ca..de881bab7 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -1035,6 +1035,7 @@ def _populate_package_registry_info(self) -> list[PackageRegistryInfo]: package_registries.append( PackageRegistryInfo( build_tool_name=build_tool_name, + build_tool_purl_type=build_tool.purl_type, package_registry=package_registry, ) ) diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 26ae8937e..86c567762 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -259,6 +259,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: # Currently, only PyPI packages are supported. 
case PackageRegistryInfo( build_tool_name="pip" | "poetry", + build_tool_purl_type="pypi", package_registry=PyPIRegistry() as pypi_registry, ) as pypi_registry_info: diff --git a/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py b/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py index 96f83cefc..c02fa8380 100644 --- a/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py +++ b/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py @@ -123,7 +123,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: # Look for the artifact in the corresponding registry and find the publish timestamp. artifact_published_date = None for registry_info in ctx.dynamic_data["package_registries"]: - if registry_info.build_tool.purl_type == ctx.component.type: + if registry_info.build_tool_purl_type == ctx.component.type: try: artifact_published_date = registry_info.package_registry.find_publish_timestamp(ctx.component.purl) break diff --git a/src/macaron/slsa_analyzer/specs/package_registry_spec.py b/src/macaron/slsa_analyzer/specs/package_registry_spec.py index ecd91d2b8..84b2a69e7 100644 --- a/src/macaron/slsa_analyzer/specs/package_registry_spec.py +++ b/src/macaron/slsa_analyzer/specs/package_registry_spec.py @@ -15,8 +15,10 @@ class PackageRegistryInfo: """This class contains data for one package registry that is matched against a repository.""" - #: The build tool matched against the repository. + #: The name of the build tool matched against the repository. build_tool_name: str + #: The purl type of the build tool matched against the repository. + build_tool_purl_type: str #: The package registry matched against the repository. This is dependent on the build tool detected. package_registry: PackageRegistry #: The provenances matched against the current repo. 
diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py index 8f15c636a..c4251ff66 100644 --- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py +++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py @@ -42,7 +42,7 @@ def test_detect_malicious_metadata( # Set up the context object with PyPIRegistry instance. ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl) pypi_registry = PyPIRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", pypi_registry)] + ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", "pypi", pypi_registry)] # Set up responses of PyPI endpoints using the httpserver plugin. with open(os.path.join(RESOURCE_PATH, "pypi_files", "zlibxjson.html"), encoding="utf8") as page: diff --git a/tests/slsa_analyzer/checks/test_repo_verification_check.py b/tests/slsa_analyzer/checks/test_repo_verification_check.py index f0f3dd923..dcc15af43 100644 --- a/tests/slsa_analyzer/checks/test_repo_verification_check.py +++ b/tests/slsa_analyzer/checks/test_repo_verification_check.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""Module to test the repository verification check.""" @@ -23,7 +23,9 @@ def test_repo_verification_pass(maven_tool: BaseBuildTool, macaron_path: Path) - ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:maven/test/test") maven_registry = MavenCentralRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(maven_tool, maven_registry)] + ctx.dynamic_data["package_registries"] = [ + PackageRegistryInfo(maven_tool.name, maven_tool.purl_type, maven_registry) + ] ctx.dynamic_data["repo_verification"] = [ RepositoryVerificationResult( status=RepositoryVerificationStatus.PASSED, @@ -41,7 +43,9 @@ def test_repo_verification_fail(maven_tool: BaseBuildTool, macaron_path: Path) - ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:maven/test/test") maven_registry = MavenCentralRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(maven_tool, maven_registry)] + ctx.dynamic_data["package_registries"] = [ + PackageRegistryInfo(maven_tool.name, maven_tool.purl_type, maven_registry) + ] ctx.dynamic_data["repo_verification"] = [ RepositoryVerificationResult( status=RepositoryVerificationStatus.FAILED, @@ -59,7 +63,9 @@ def test_check_unknown_for_unknown_repo_verification(maven_tool: BaseBuildTool, ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:maven/test/test") maven_registry = MavenCentralRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(maven_tool, maven_registry)] + ctx.dynamic_data["package_registries"] = [ + PackageRegistryInfo(maven_tool.name, maven_tool.purl_type, maven_registry) + ] ctx.dynamic_data["repo_verification"] = [ RepositoryVerificationResult( status=RepositoryVerificationStatus.UNKNOWN, @@ -77,6 +83,6 @@ def test_check_unknown_for_unsupported_build_tools(pip_tool: BaseBuildTool, maca ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:pypi/test/test") pypi_registry = PyPIRegistry() - 
ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(pip_tool, pypi_registry)] + ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(pip_tool.name, pip_tool.purl_type, pypi_registry)] assert check.run_check(ctx).result_type == CheckResultType.UNKNOWN From 0c1c9f09484bad2763f011fbf1a4a4e9539750a6 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Mon, 10 Mar 2025 13:41:34 +1000 Subject: [PATCH 10/17] chore: add repository info for source code heuristic; minor fixes Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- .../pypi_heuristics/metadata/source_code_repo.py | 2 +- src/macaron/slsa_analyzer/analyzer.py | 6 +++--- .../slsa_analyzer/checks/detect_malicious_metadata_check.py | 2 ++ src/macaron/slsa_analyzer/package_registry/pypi_registry.py | 3 +++ 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py index 8d8c9619d..708301807 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py @@ -41,6 +41,6 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes The result and related information collected during the analysis. 
""" # If a sourcecode repo exists, then this will have already been validated - if not pypi_package_json.component.repository: + if not pypi_package_json.has_repository: return HeuristicResult.FAIL, {} return HeuristicResult.PASS, {} diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index de881bab7..8ae5ff3b2 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -472,7 +472,7 @@ def run_single( logger.info("With PURL: %s", component.purl) logger.info("=====================================") - analyze_ctx = self.get_analyze_ctx(component) + analyze_ctx = self.create_analyze_ctx(component) analyze_ctx.dynamic_data["expectation"] = self.expectations.get_expectation_for_target( analyze_ctx.component.purl.split("@")[0] ) @@ -917,8 +917,8 @@ def to_analysis_target( "Cannot determine the analysis target: PURL and repository path are missing." ) - def get_analyze_ctx(self, component: Component) -> AnalyzeContext: - """Return the analyze context for a target component. + def create_analyze_ctx(self, component: Component) -> AnalyzeContext: + """Create and return an analysis context for the passed component. 
Parameters ---------- diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 86c567762..05444bec3 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -222,6 +222,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: # First check if this package is a known malware data = {"package": {"purl": ctx.component.purl}} + package_exists = False try: package_exists = bool(DepsDevService.get_package_info(ctx.component.purl)) except APIAccessError as error: @@ -273,6 +274,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: pypi_package_json = PyPIPackageJsonAsset( component_name=ctx.component.name, component_version=ctx.component.version, + has_repository=ctx.component.repository is not None, pypi_registry=pypi_registry, package_json={}, ) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 77d2c8b8a..20f75db08 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -346,6 +346,9 @@ class PyPIPackageJsonAsset: #: The target pypi software component version. component_version: str | None + #: Whether the component of this asset has a related repository. + has_repository: bool + #: The pypi registry. 
pypi_registry: PyPIRegistry From f3a5419dcb94c1ec3047c9ba8ec3d4701036bd21 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Mon, 10 Mar 2025 13:46:39 +1000 Subject: [PATCH 11/17] chore: fix test Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- .../malware_analyzer/pypi/test_source_code_repo.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/malware_analyzer/pypi/test_source_code_repo.py b/tests/malware_analyzer/pypi/test_source_code_repo.py index 668c80865..3cc9db15d 100644 --- a/tests/malware_analyzer/pypi/test_source_code_repo.py +++ b/tests/malware_analyzer/pypi/test_source_code_repo.py @@ -14,19 +14,13 @@ @pytest.mark.parametrize( ("repository", "expected_result"), [ - pytest.param(None, HeuristicResult.FAIL, id="test_no_repo"), - pytest.param( - MagicMock(), - HeuristicResult.PASS, - id="test_valid_repo", - ), + pytest.param(False, HeuristicResult.FAIL, id="test_no_repo"), + pytest.param(True, HeuristicResult.PASS, id="test_valid_repo"), ], ) -def test_repo_existence( - pypi_package_json: MagicMock, repository: MagicMock | None, expected_result: HeuristicResult -) -> None: +def test_repo_existence(pypi_package_json: MagicMock, repository: bool, expected_result: HeuristicResult) -> None: """Test if the source code repo exists.""" - pypi_package_json.component.repository = repository + pypi_package_json.has_repository = repository analyzer = SourceCodeRepoAnalyzer() result, _ = analyzer.analyze(pypi_package_json) assert result == expected_result From e73235c0840db938c8bf74cd222f17fc6046c6c6 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Mon, 10 Mar 2025 14:20:13 +1000 Subject: [PATCH 12/17] chore: minor fix Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- src/macaron/repo_finder/repo_finder_pypi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/macaron/repo_finder/repo_finder_pypi.py 
b/src/macaron/repo_finder/repo_finder_pypi.py index 537e3297d..93af5395e 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -37,7 +37,7 @@ def find_repo( return "", RepoFinderInfo.PYPI_NO_REGISTRY pypi_registry.load_defaults() - pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, pypi_registry, {}) + pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}) if not pypi_asset.download(dest=""): return "", RepoFinderInfo.PYPI_HTTP_ERROR @@ -80,6 +80,7 @@ def find_repo( fragment=parsed_url.fragment, ).geturl() logger.debug("Found repository URL from PyPI: %s", fixed_url) + pypi_asset.has_repository = True return fixed_url, RepoFinderInfo.FOUND_FROM_PYPI return "", RepoFinderInfo.PYPI_NO_URLS From 79566935411ba8c04613a72d49ebd1a7ab32dc6a Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Wed, 19 Mar 2025 11:04:34 +1000 Subject: [PATCH 13/17] chore: add integration test for find-source command Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- .../repo_finder_pypi_find_source/test.yaml | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 tests/integration/cases/repo_finder_pypi_find_source/test.yaml diff --git a/tests/integration/cases/repo_finder_pypi_find_source/test.yaml b/tests/integration/cases/repo_finder_pypi_find_source/test.yaml new file mode 100644 index 000000000..690658908 --- /dev/null +++ b/tests/integration/cases/repo_finder_pypi_find_source/test.yaml @@ -0,0 +1,22 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Finding the source of a PyPI PURL that is not correctly found by deps.dev and must be sought on the package registry directly. 
+ +tags: +- macaron-python-package + +steps: +- name: Run macaron analyze + kind: find-source + options: + command_args: + - -purl + - pkg:pypi/torch@2.6.0 +- name: Validate the produced report + kind: validate_schema + options: + kind: json_schema + schema: find_source_json_report + result: output/reports/pypi/torch/torch.source.json From 599d261cfc083cdb6b2f56ff09c9fdab4a221a03 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Tue, 18 Mar 2025 16:51:31 +1000 Subject: [PATCH 14/17] feat: add GitHub attestation discovery Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- src/macaron/artifact/local_artifact.py | 57 ++++++++- src/macaron/slsa_analyzer/analyzer.py | 117 ++++++++++++++++-- .../slsa_analyzer/git_service/api_client.py | 21 +++- .../maven_central_registry.py | 72 +++++++++++ .../slsa_analyzer/provenance/loader.py | 8 +- .../cases/github_maven_attestation/policy.dl | 10 ++ .../cases/github_maven_attestation/test.yaml | 22 ++++ .../github_maven_attestation_local/policy.dl | 10 ++ .../github_maven_attestation_local/test.yaml | 28 +++++ 9 files changed, 328 insertions(+), 17 deletions(-) create mode 100644 tests/integration/cases/github_maven_attestation/policy.dl create mode 100644 tests/integration/cases/github_maven_attestation/test.yaml create mode 100644 tests/integration/cases/github_maven_attestation_local/policy.dl create mode 100644 tests/integration/cases/github_maven_attestation_local/test.yaml diff --git a/src/macaron/artifact/local_artifact.py b/src/macaron/artifact/local_artifact.py index ed37c335a..582799824 100644 --- a/src/macaron/artifact/local_artifact.py +++ b/src/macaron/artifact/local_artifact.py @@ -1,16 +1,21 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module declares types and utilities for handling local artifacts.""" import fnmatch import glob +import hashlib +import logging import os from packageurl import PackageURL from macaron.artifact.maven import construct_maven_repository_path from macaron.errors import LocalArtifactFinderError +from macaron.slsa_analyzer.package_registry import MavenCentralRegistry + +logger: logging.Logger = logging.getLogger(__name__) def construct_local_artifact_dirs_glob_pattern_maven_purl(maven_purl: PackageURL) -> list[str] | None: @@ -247,3 +252,53 @@ def get_local_artifact_dirs( ) raise LocalArtifactFinderError(f"Unsupported PURL type {purl_type}") + + +def get_local_artifact_hash(purl: PackageURL, artifact_dirs: list[str], hash_algorithm_name: str) -> str | None: + """Compute the hash of the local artifact. + + Parameters + ---------- + purl: PackageURL + The PURL of the artifact being sought. + artifact_dirs: list[str] + The possible locations of the artifact. + hash_algorithm_name: str + The hash algorithm to use. + + Returns + ------- + str | None + The hash, or None if not found. 
+ """ + if not artifact_dirs: + logger.debug("No artifact directories provided.") + return None + + if not purl.version: + logger.debug("PURL is missing version.") + return None + + artifact_target = None + if purl.type == "maven": + artifact_target = MavenCentralRegistry.get_artifact_file_name(purl) + + if not artifact_target: + logger.debug("PURL type not supported: %s", purl.type) + return None + + for artifact_dir in artifact_dirs: + full_path = os.path.join(artifact_dir, artifact_target) + if not os.path.exists(full_path): + continue + + with open(full_path, "rb") as file: + try: + hash_result = hashlib.file_digest(file, hash_algorithm_name) + except ValueError as error: + logger.debug("Error while hashing file: %s", error) + continue + + return hash_result.hexdigest() + + return None diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index 8ae5ff3b2..a8fb88830 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -4,11 +4,14 @@ """This module handles the cloning and analyzing a Git repo.""" import glob +import hashlib +import json import logging import os import re import sys import tempfile +import urllib.parse from collections.abc import Mapping from datetime import datetime, timezone from pathlib import Path @@ -20,7 +23,10 @@ from sqlalchemy.orm import Session from macaron import __version__ -from macaron.artifact.local_artifact import get_local_artifact_dirs +from macaron.artifact.local_artifact import ( + get_local_artifact_dirs, + get_local_artifact_hash, +) from macaron.config.global_config import global_config from macaron.config.target_config import Configuration from macaron.database.database_manager import DatabaseManager, get_db_manager, get_db_session @@ -41,6 +47,7 @@ ProvenanceError, PURLNotFoundError, ) +from macaron.json_tools import json_extract from macaron.output_reporter.reporter import FileReporter from macaron.output_reporter.results import Record, 
Report, SCMStatus from macaron.provenance import provenance_verifier @@ -66,12 +73,14 @@ from macaron.slsa_analyzer.checks import * # pylint: disable=wildcard-import,unused-wildcard-import # noqa: F401,F403 from macaron.slsa_analyzer.ci_service import CI_SERVICES from macaron.slsa_analyzer.database_store import store_analyze_context_to_db -from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService +from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService, GitHub from macaron.slsa_analyzer.git_service.base_git_service import NoneGitService from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR -from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES +from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, MavenCentralRegistry from macaron.slsa_analyzer.provenance.expectations.expectation_registry import ExpectationRegistry from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV01Payload +from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError +from macaron.slsa_analyzer.provenance.loader import load_provenance_payload from macaron.slsa_analyzer.provenance.slsa import SLSAProvenanceData from macaron.slsa_analyzer.registry import registry from macaron.slsa_analyzer.specs.ci_spec import CIInfo @@ -403,6 +412,17 @@ def run_single( status=SCMStatus.ANALYSIS_FAILED, ) + local_artifact_dirs = None + if parsed_purl and parsed_purl.type in self.local_artifact_repo_mapper: + local_artifact_repo_path = self.local_artifact_repo_mapper[parsed_purl.type] + try: + local_artifact_dirs = get_local_artifact_dirs( + purl=parsed_purl, + local_artifact_repo_path=local_artifact_repo_path, + ) + except LocalArtifactFinderError as error: + logger.debug(error) + # Prepare the repo. 
git_obj = None commit_finder_outcome = CommitFinderInfo.NOT_USED @@ -480,6 +500,37 @@ def run_single( git_service = self._determine_git_service(analyze_ctx) self._determine_ci_services(analyze_ctx, git_service) self._determine_build_tools(analyze_ctx, git_service) + + # Try to find an attestation from GitHub, if applicable. + if parsed_purl and not provenance_payload and analysis_target.repo_path and isinstance(git_service, GitHub): + # Try to discover GitHub attestation for the target software component. + url = None + try: + url = urllib.parse.urlparse(analysis_target.repo_path) + except TypeError as error: + logger.debug("Failed to parse repository path as URL: %s", error) + if url and url.hostname == "github.com": + artifact_hash = self.get_artifact_hash(parsed_purl, local_artifact_dirs, hashlib.sha256()) + if artifact_hash: + git_attestation_dict = git_service.api_client.get_attestation( + analyze_ctx.component.repository.full_name, artifact_hash + ) + if git_attestation_dict: + git_attestation_list = json_extract(git_attestation_dict, ["attestations"], list) + if git_attestation_list: + git_attestation = git_attestation_list[0] + + with tempfile.TemporaryDirectory() as temp_dir: + attestation_file = os.path.join(temp_dir, "attestation") + with open(attestation_file, "w", encoding="UTF-8") as file: + json.dump(git_attestation, file) + + try: + payload = load_provenance_payload(attestation_file) + provenance_payload = payload + except LoadIntotoAttestationError as error: + logger.debug("Failed to load provenance payload: %s", error) + if parsed_purl is not None: self._verify_repository_link(parsed_purl, analyze_ctx) self._determine_package_registries(analyze_ctx, all_package_registries) @@ -541,16 +592,8 @@ def run_single( analyze_ctx.dynamic_data["validate_malware"] = validate_malware - if parsed_purl and parsed_purl.type in self.local_artifact_repo_mapper: - local_artifact_repo_path = self.local_artifact_repo_mapper[parsed_purl.type] - try: - 
local_artifact_dirs = get_local_artifact_dirs( - purl=parsed_purl, - local_artifact_repo_path=local_artifact_repo_path, - ) - analyze_ctx.dynamic_data["local_artifact_paths"].extend(local_artifact_dirs) - except LocalArtifactFinderError as error: - logger.debug(error) + if local_artifact_dirs: + analyze_ctx.dynamic_data["local_artifact_paths"].extend(local_artifact_dirs) analyze_ctx.check_results = registry.scan(analyze_ctx) @@ -939,6 +982,54 @@ def create_analyze_ctx(self, component: Component) -> AnalyzeContext: return analyze_ctx + def get_artifact_hash( + self, purl: PackageURL, cached_artifacts: list[str] | None, hash_algorithm: Any + ) -> str | None: + """Get the hash of the artifact found from the passed PURL using local or remote files. + + Parameters + ---------- + purl: PackageURL + The PURL of the artifact. + cached_artifacts: list[str] | None + The list of local files that match the PURL. + hash_algorithm: Any + The hash algorithm to use. + + Returns + ------- + str | None + The hash of the artifact, or None if not found. + """ + if cached_artifacts: + # Try to get the hash from a local file. + artifact_hash = get_local_artifact_hash(purl, cached_artifacts, hash_algorithm.name) + + if artifact_hash: + return artifact_hash + + # Download the artifact. 
+ if purl.type == "maven": + maven_registry = next( + ( + package_registry + for package_registry in PACKAGE_REGISTRIES + if isinstance(package_registry, MavenCentralRegistry) + ), + None, + ) + if not maven_registry: + return None + + return maven_registry.get_artifact_hash(purl, hash_algorithm) + + if purl.type == "pypi": + # TODO implement + return None + + logger.debug("Purl type '%s' not yet supported for GitHub attestation discovery.", purl.type) + return None + def _determine_git_service(self, analyze_ctx: AnalyzeContext) -> BaseGitService: """Determine the Git service used by the software component.""" remote_path = analyze_ctx.component.repository.remote_path if analyze_ctx.component.repository else None diff --git a/src/macaron/slsa_analyzer/git_service/api_client.py b/src/macaron/slsa_analyzer/git_service/api_client.py index 8e987e6ca..681a1f4e0 100644 --- a/src/macaron/slsa_analyzer/git_service/api_client.py +++ b/src/macaron/slsa_analyzer/git_service/api_client.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """The module provides API clients for VCS services, such as GitHub.""" @@ -659,6 +659,25 @@ def download_asset(self, url: str, download_path: str) -> bool: return True + def get_attestation(self, full_name: str, artifact_hash: str) -> dict: + """Download and return the attestation associated with the passed artifact hash, if any. + + Parameters + ---------- + full_name : str + The full name of the repo. + artifact_hash: str + The SHA256 hash of an artifact. + + Returns + ------- + dict + The attestation data, or an empty dict if not found. 
+ """ + url = f"{GhAPIClient._REPO_END_POINT}/{full_name}/attestations/sha256:{artifact_hash}" + response_data = send_get_http(url, self.headers) + return response_data or {} + def get_default_gh_client(access_token: str) -> GhAPIClient: """Return a GhAPIClient instance with default values. diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py index 131051b66..593c15b88 100644 --- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py @@ -6,10 +6,13 @@ import logging import urllib.parse from datetime import datetime, timezone +from typing import Any import requests from packageurl import PackageURL +from requests import RequestException +from macaron.artifact.maven import construct_maven_repository_path from macaron.config.defaults import defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry @@ -236,3 +239,72 @@ def find_publish_timestamp(self, purl: str) -> datetime: raise InvalidHTTPResponseError(f"The timestamp returned by {url} is invalid") from error raise InvalidHTTPResponseError(f"Invalid response from Maven central for {url}.") + + @staticmethod + def get_artifact_file_name(purl: PackageURL) -> str | None: + """Return the artifact file name of the passed PURL based on the Maven registry standard. + + Parameters + ---------- + purl: PackageURL + The PURL of the artifact. + + Returns + ------- + str | None + The artifact file name, or None if invalid. + """ + if not purl.version: + return None + + return purl.name + "-" + purl.version + ".jar" + + def get_artifact_hash(self, purl: PackageURL, hash_algorithm: Any) -> str | None: + """Return the hash of the artifact found by the passed purl relevant to the registry's URL. 
+ + Parameters + ---------- + purl: PackageURL + The purl of the artifact. + hash_algorithm: Any + The hash algorithm to use. + + Returns + ------- + str | None + The hash of the artifact, or None if not found. + """ + if not (purl.namespace and purl.version): + return None + + artifact_path = construct_maven_repository_path(purl.namespace, purl.name, purl.version) + file_name = MavenCentralRegistry.get_artifact_file_name(purl) + if not file_name: + return None + + artifact_url = self.registry_url + "/" + artifact_path + "/" + file_name + logger.debug("Search for artifact using URL: %s", artifact_url) + + try: + response = requests.get(artifact_url, stream=True, timeout=40) + response.raise_for_status() + except requests.exceptions.HTTPError as http_err: + logger.debug("HTTP error occurred: %s", http_err) + return None + + if response.status_code != 200: + return None + + # Download file and compute hash as chunks are received. + try: + for chunk in response.iter_content(): + hash_algorithm.update(chunk) + except RequestException as error: + # Something went wrong with the request, abort. + logger.debug("Error while streaming target file: %s", error) + response.close() + return None + + artifact_hash: str = hash_algorithm.hexdigest() + logger.debug("Computed hash of artifact: %s", artifact_hash) + return artifact_hash diff --git a/src/macaron/slsa_analyzer/provenance/loader.py b/src/macaron/slsa_analyzer/provenance/loader.py index 65dfee1bb..d75faa726 100644 --- a/src/macaron/slsa_analyzer/provenance/loader.py +++ b/src/macaron/slsa_analyzer/provenance/loader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This module contains the loaders for SLSA provenances.""" @@ -12,7 +12,7 @@ from urllib.parse import urlparse from macaron.config.defaults import defaults -from macaron.json_tools import JsonType +from macaron.json_tools import JsonType, json_extract from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, validate_intoto_payload from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError, ValidateInTotoPayloadError from macaron.util import send_get_http_raw @@ -83,6 +83,10 @@ def _load_provenance_file_content( # Some provenances, such as Witness may not include the DSSE envelope `dsseEnvelope` # property but contain its value directly. provenance_payload = provenance.get("payload", None) + if not provenance_payload: + # GitHub Attestation. + # TODO Check if old method (above) actually works. + provenance_payload = json_extract(provenance, ["bundle", "dsseEnvelope", "payload"], str) if not provenance_payload: raise LoadIntotoAttestationError( 'Cannot find the "payload" field in the decoded provenance.', diff --git a/tests/integration/cases/github_maven_attestation/policy.dl b/tests/integration/cases/github_maven_attestation/policy.dl new file mode 100644 index 000000000..9df46219b --- /dev/null +++ b/tests/integration/cases/github_maven_attestation/policy.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_passed(component_id, "mcn_provenance_available_1"). + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22"). 
diff --git a/tests/integration/cases/github_maven_attestation/test.yaml b/tests/integration/cases/github_maven_attestation/test.yaml new file mode 100644 index 000000000..9913d930e --- /dev/null +++ b/tests/integration/cases/github_maven_attestation/test.yaml @@ -0,0 +1,22 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Discovering attestation of a Maven artifact on GitHub + +tags: +- macaron-python-package + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22 + - -rp + - https://github.com/liftwizard/liftwizard +- name: Run macaron verify-policy to verify passed/failed checks + kind: verify + options: + policy: policy.dl diff --git a/tests/integration/cases/github_maven_attestation_local/policy.dl b/tests/integration/cases/github_maven_attestation_local/policy.dl new file mode 100644 index 000000000..ff31abf90 --- /dev/null +++ b/tests/integration/cases/github_maven_attestation_local/policy.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_failed(component_id, "mcn_provenance_available_1"). + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22"). 
diff --git a/tests/integration/cases/github_maven_attestation_local/test.yaml b/tests/integration/cases/github_maven_attestation_local/test.yaml new file mode 100644 index 000000000..d66a089b2 --- /dev/null +++ b/tests/integration/cases/github_maven_attestation_local/test.yaml @@ -0,0 +1,28 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Discovering GitHub attestation of a local Maven artifact but failing because the artifact is wrong + +tags: +- macaron-python-package + +steps: +- name: Download artifact POM instead of the JAR + kind: shell + options: + cmd: curl --create-dirs -o ./output/.m2/repository/io/liftwizard/liftwizard-checkstyle/2.1.22/liftwizard-checkstyle-2.1.22.jar https://repo1.maven.org/maven2/io/liftwizard/liftwizard-checkstyle/2.1.22/liftwizard-checkstyle-2.1.22.pom +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:maven/io.liftwizard/liftwizard-checkstyle@2.1.22 + - -rp + - https://github.com/liftwizard/liftwizard + - --local-maven-repo + - ./output/.m2 +- name: Run macaron verify-policy to verify no provenance was found + kind: verify + options: + policy: policy.dl From 69099a837428286518ef75231ff2e13506d8faa1 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Wed, 19 Mar 2025 20:11:07 +1000 Subject: [PATCH 15/17] chore: add support for PyPI PURLs Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- src/macaron/slsa_analyzer/analyzer.py | 54 ++++++++++-- .../checks/detect_malicious_metadata_check.py | 29 +++---- .../package_registry/pypi_registry.py | 87 ++++++++++++++++++- 3 files changed, 146 insertions(+), 24 deletions(-) diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index a8fb88830..21cbd3ad0 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ 
b/src/macaron/slsa_analyzer/analyzer.py @@ -76,7 +76,8 @@ from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService, GitHub from macaron.slsa_analyzer.git_service.base_git_service import NoneGitService from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR -from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, MavenCentralRegistry +from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, MavenCentralRegistry, PyPIRegistry +from macaron.slsa_analyzer.package_registry.pypi_registry import find_or_create_pypi_asset from macaron.slsa_analyzer.provenance.expectations.expectation_registry import ExpectationRegistry from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV01Payload from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError @@ -510,7 +511,9 @@ def run_single( except TypeError as error: logger.debug("Failed to parse repository path as URL: %s", error) if url and url.hostname == "github.com": - artifact_hash = self.get_artifact_hash(parsed_purl, local_artifact_dirs, hashlib.sha256()) + artifact_hash = self.get_artifact_hash( + parsed_purl, local_artifact_dirs, hashlib.sha256(), all_package_registries + ) if artifact_hash: git_attestation_dict = git_service.api_client.get_attestation( analyze_ctx.component.repository.full_name, artifact_hash @@ -983,7 +986,11 @@ def create_analyze_ctx(self, component: Component) -> AnalyzeContext: return analyze_ctx def get_artifact_hash( - self, purl: PackageURL, cached_artifacts: list[str] | None, hash_algorithm: Any + self, + purl: PackageURL, + cached_artifacts: list[str] | None, + hash_algorithm: Any, + all_package_registries: list[PackageRegistryInfo], ) -> str | None: """Get the hash of the artifact found from the passed PURL using local or remote files. @@ -995,6 +1002,8 @@ def get_artifact_hash( The list of local files that match the PURL. hash_algorithm: Any The hash algorithm to use. 
+ all_package_registries: list[PackageRegistryInfo] + The list of package registry information. Returns ------- @@ -1024,8 +1033,43 @@ def get_artifact_hash( return maven_registry.get_artifact_hash(purl, hash_algorithm) if purl.type == "pypi": - # TODO implement - return None + pypi_registry = next( + ( + package_registry + for package_registry in PACKAGE_REGISTRIES + if isinstance(package_registry, PyPIRegistry) + ), + None, + ) + if not pypi_registry: + logger.debug("Missing registry for PyPI") + return None + + registry_info = next( + ( + info + for info in all_package_registries + if info.package_registry == pypi_registry and info.build_tool_name in {"pip", "poetry"} + ), + None, + ) + if not registry_info: + logger.debug("Missing registry information for PyPI") + return None + + pypi_asset = find_or_create_pypi_asset(purl.name, purl.version, registry_info) + if not pypi_asset: + return None + + pypi_asset.has_repository = True + if not pypi_asset.download(""): + return None + + source_url = pypi_asset.get_sourcecode_url("bdist_wheel") + if not source_url: + return None + + return pypi_registry.get_artifact_hash(source_url, hash_algorithm) logger.debug("Purl type '%s' not yet supported for GitHub attestation discovery.", purl.type) return None diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 05444bec3..4e58b7cab 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -32,7 +32,11 @@ from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService -from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry +from 
macaron.slsa_analyzer.package_registry.pypi_registry import ( + PyPIPackageJsonAsset, + PyPIRegistry, + find_or_create_pypi_asset, +) from macaron.slsa_analyzer.registry import registry from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo from macaron.util import send_post_http_raw @@ -261,23 +265,16 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: case PackageRegistryInfo( build_tool_name="pip" | "poetry", build_tool_purl_type="pypi", - package_registry=PyPIRegistry() as pypi_registry, + package_registry=PyPIRegistry(), ) as pypi_registry_info: - - # Retrieve the pre-existing AssetLocator object for the PyPI package JSON object, if it exists. - pypi_package_json = next( - (asset for asset in pypi_registry_info.metadata if isinstance(asset, PyPIPackageJsonAsset)), - None, + # Retrieve the pre-existing asset, or create a new one. + pypi_package_json = find_or_create_pypi_asset( + ctx.component.name, ctx.component.version, pypi_registry_info ) - if not pypi_package_json: - # Create an AssetLocator object for the PyPI package JSON object. 
- pypi_package_json = PyPIPackageJsonAsset( - component_name=ctx.component.name, - component_version=ctx.component.version, - has_repository=ctx.component.repository is not None, - pypi_registry=pypi_registry, - package_json={}, - ) + if pypi_package_json is None: + return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN) + + pypi_package_json.has_repository = ctx.component.repository is not None pypi_registry_info.metadata.append(pypi_package_json) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 20f75db08..35d6e9c41 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -11,6 +11,7 @@ import zipfile from dataclasses import dataclass from datetime import datetime +from typing import Any import requests from bs4 import BeautifulSoup, Tag @@ -21,6 +22,7 @@ from macaron.json_tools import json_extract from macaron.malware_analyzer.datetime_parser import parse_datetime from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry +from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo from macaron.util import send_get_http_raw logger: logging.Logger = logging.getLogger(__name__) @@ -231,6 +233,45 @@ def fetch_sourcecode(self, src_url: str) -> dict[str, str] | None: logger.debug("Successfully fetch the source code from PyPI") return py_files_content + def get_artifact_hash(self, artifact_url: str, hash_algorithm: Any) -> str | None: + """Return the hash of the artifact found at the passed URL. + + Parameters + ---------- + artifact_url + The URL of the artifact. + hash_algorithm: Any + The hash algorithm to use. + + Returns + ------- + str | None + The hash of the artifact, or None if not found. 
+ """ + try: + response = requests.get(artifact_url, stream=True, timeout=40) + response.raise_for_status() + except requests.exceptions.HTTPError as http_err: + logger.debug("HTTP error occurred: %s", http_err) + return None + + if response.status_code != 200: + logger.debug("Invalid response: %s", response.status_code) + return None + + try: + for chunk in response.iter_content(): + hash_algorithm.update(chunk) + except RequestException as error: + # Something went wrong with the request, abort. + logger.debug("Error while streaming source file: %s", error) + response.close() + return None + + artifact_hash: str = hash_algorithm.hexdigest() + logger.debug("Computed artifact hash: %s", artifact_hash) + return artifact_hash + def get_package_page(self, package_name: str) -> str | None: """Implement custom API to get package main page. @@ -430,15 +471,19 @@ def get_latest_version(self) -> str | None: """ return json_extract(self.package_json, ["info", "version"], str) - def get_sourcecode_url(self) -> str | None: + def get_sourcecode_url(self, package_type: str = "sdist") -> str | None: """Get the url of the source distribution. + Parameters + ---------- + package_type: str + The package type to retrieve the URL of. + Returns ------- str | None The URL of the source distribution. """ - urls: list | None = None if self.component_version: urls = json_extract(self.package_json, ["releases", self.component_version], list) else: @@ -447,7 +492,7 @@ def get_sourcecode_url(self) -> str | None: if not urls: return None for distribution in urls: - if distribution.get("packagetype") != "sdist": + if distribution.get("packagetype") != package_type: continue # We intentionally check if the url is None and use empty string if that's the case. 
source_url: str = distribution.get("url") or "" @@ -497,3 +542,39 @@ def get_sourcecode(self) -> dict[str, str] | None: source_code: dict[str, str] | None = self.pypi_registry.fetch_sourcecode(url) return source_code return None + + +def find_or_create_pypi_asset( + asset_name: str, asset_version: str | None, pypi_registry_info: PackageRegistryInfo +) -> PyPIPackageJsonAsset | None: + """Find the asset in the provided package registry information, or create it. + + Parameters + ---------- + asset_name: str + The name of the asset. + asset_version: str | None + The version of the asset. + pypi_registry_info: + The package registry information. + + Returns + ------- + PyPIPackageJsonAsset | None + The asset, or None if not found. + """ + pypi_package_json = next( + (asset for asset in pypi_registry_info.metadata if isinstance(asset, PyPIPackageJsonAsset)), + None, + ) + if pypi_package_json: + return pypi_package_json + + package_registry = pypi_registry_info.package_registry + if not isinstance(package_registry, PyPIRegistry): + logger.debug("Failed to create PyPIPackageJson asset.") + return None + + asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {}) + pypi_registry_info.metadata.append(asset) + return asset From 167b34a5a9e699d0ac636c8d3557c54ad02ebeaf Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Mon, 31 Mar 2025 10:31:46 +1000 Subject: [PATCH 16/17] chore: add support for sha256 hashes in maven Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- .../package_registry/maven_central_registry.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py index 593c15b88..2ad0cbf1e 100644 --- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py +++ 
b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py @@ -282,8 +282,15 @@ def get_artifact_hash(self, purl: PackageURL, hash_algorithm: Any) -> str | None if not file_name: return None + # Maven supports but does not require a sha256 hash of uploaded artifacts. Check that first. artifact_url = self.registry_url + "/" + artifact_path + "/" + file_name - logger.debug("Search for artifact using URL: %s", artifact_url) + sha256_url = artifact_url + ".sha256" + logger.debug("Search for artifact hash using URL: %s", [sha256_url, artifact_url]) + + response = send_get_http_raw(sha256_url, {}) + if response and response.text: + logger.debug("Found hash of artifact: %s", response.text) + return response.text try: response = requests.get(artifact_url, stream=True, timeout=40) From ea49398afcef30c6ce7b3f84969723c44fbe346b Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith <benselwynsmith@googlemail.com> Date: Mon, 31 Mar 2025 11:09:50 +1000 Subject: [PATCH 17/17] chore: add pypi sha256 support Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com> --- src/macaron/slsa_analyzer/analyzer.py | 4 ++++ .../package_registry/pypi_registry.py | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index 21cbd3ad0..a4698866e 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -1065,6 +1065,10 @@ def get_artifact_hash( if not pypi_asset.download(""): return None + artifact_hash = pypi_asset.get_sha256() + if artifact_hash: + return artifact_hash + source_url = pypi_asset.get_sourcecode_url("bdist_wheel") if not source_url: return None diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 35d6e9c41..0852d554c 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ 
b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -543,6 +543,30 @@ def get_sourcecode(self) -> dict[str, str] | None: return source_code return None + def get_sha256(self) -> str | None: + """Get the sha256 hash of the artifact from its payload. + + The hash is taken from the first distribution file listed for the requested version, + or from the first entry of "urls" when no version is set. + + Returns + ------- + str | None + The sha256 hash of the artifact, or None if not found. + """ + if not self.package_json and not self.download(""): + return None + + if not self.component_version: + artifact_hash = json_extract(self.package_json, ["urls", 0, "digests", "sha256"], str) + else: + # "releases" maps each version to a list of files; take the first before reading "digests". + artifact_hash = json_extract( + self.package_json, ["releases", self.component_version, 0, "digests", "sha256"], str + ) + logger.debug("Found sha256 hash: %s", artifact_hash) + return artifact_hash + def find_or_create_pypi_asset( asset_name: str, asset_version: str | None, pypi_registry_info: PackageRegistryInfo