Skip to content

feat(heuristics): add three analyzers to detect dependency confusion and distinguish from stub packages #1117

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/macaron/malware_analyzer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,22 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b
- **Description**: Checks if the package name is suspiciously similar to any package name in a predefined list of popular packages. The similarity check incorporates the Jaro-Winkler distance and considers keyboard layout proximity to identify potential typosquatting.
- **Rule**: Return `HeuristicResult.FAIL` if the similarity ratio between the package name and any popular package name meets or exceeds a defined threshold; otherwise, return `HeuristicResult.PASS`.
- **Dependency**: None.

11. **Minimal Content**
- **Description**: Checks if the package has a small number of files.
- **Rule**: Return `HeuristicResult.FAIL` if the number of files is strictly less than FILES_THRESHOLD; otherwise, return `HeuristicResult.PASS`.
- **Dependency**: None.

12. **Unsecure Description**
- **Description**: Checks if the package description is unsecure, i.e., it lacks descriptive keywords that indicate it is a stub package.
- **Rule**: Return `HeuristicResult.FAIL` if no descriptive keyword is found in the package description or summary; otherwise, return `HeuristicResult.PASS`.
- **Dependency**: None.

13. **Unknown Organization**
- **Description**: Checks if the package is from a known organization.
- **Rule**: Return `HeuristicResult.FAIL` if no organization from the trusted organizations file is found in the package metadata; otherwise, return `HeuristicResult.PASS`.
- **Dependency**: None.

### Source Code Analysis with Semgrep
**PyPI Source Code Analyzer**
- **Description**: Uses Semgrep, with default rules written in `src/macaron/resources/pypi_malware_rules` and custom rules available by supplying a path to `custom_semgrep_rules` in `defaults.ini`, to scan the package `.tar` source code.
Expand Down
6 changes: 6 additions & 0 deletions src/macaron/malware_analyzer/pypi_heuristics/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ class Heuristics(str, Enum):
#: Indicates that the package source code contains suspicious code patterns.
SUSPICIOUS_PATTERNS = "suspicious_patterns"

#: Indicates that the package has minimal content.
MINIMAL_CONTENT = "minimal_content"

#: Indicates that the package's description is unsecure, such as not having any descriptive keywords.
UNSECURE_DESCRIPTION = "unsecure_description"


class HeuristicResult(str, Enum):
"""Result type indicating the outcome of a heuristic."""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This analyzer checks if a PyPI package has minimal content."""

import logging
import os

from macaron.errors import SourceCodeError
from macaron.json_tools import JsonType
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset

logger: logging.Logger = logging.getLogger(__name__)


class MinimalContentAnalyzer(BaseHeuristicAnalyzer):
    """Heuristic analyzer that flags packages whose source code contains few files."""

    # Minimum number of files a package must contain for this heuristic to pass.
    FILES_THRESHOLD = 50

    def __init__(self) -> None:
        super().__init__(name="minimal_content_analyzer", heuristic=Heuristics.MINIMAL_CONTENT, depends_on=None)

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Count the files in the downloaded source code and compare the total to the threshold.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            ``HeuristicResult.PASS`` when at least ``FILES_THRESHOLD`` files are found,
            ``HeuristicResult.FAIL`` otherwise, together with an explanatory message.

        Raises
        ------
        SourceCodeError
            If ``download_sourcecode`` returns a falsy value.
        """
        if not pypi_package_json.download_sourcecode():
            error_msg = "No source code files have been downloaded"
            logger.debug(error_msg)
            raise SourceCodeError(error_msg)

        # Accumulate the number of file entries reported for every directory in the tree.
        total_files = 0
        for _root, _dirs, filenames in os.walk(pypi_package_json.package_sourcecode_path):
            total_files += len(filenames)

        if total_files < self.FILES_THRESHOLD:
            return HeuristicResult.FAIL, {"message": "Not enough files found"}

        return HeuristicResult.PASS, {"message": "Package has sufficient content"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This analyzer checks if a PyPI package has unsecure description."""

import logging
import re

from macaron.errors import HeuristicAnalyzerValueError
from macaron.json_tools import JsonType, json_extract
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset

logger: logging.Logger = logging.getLogger(__name__)


class UnsecureDescriptionAnalyzer(BaseHeuristicAnalyzer):
    """Heuristic analyzer that inspects the package description and summary for stub-like keywords."""

    # Case-insensitive, whole-word keywords; a match anywhere in the text makes the heuristic pass.
    SECURE_DESCRIPTION_REGEX = re.compile(
        r"\b(?:internal|private|stub|placeholder|dependency confusion|security|namespace protection|reserved)\b",
        re.IGNORECASE,
    )

    def __init__(self) -> None:
        super().__init__(
            name="unsecure_description_analyzer",
            heuristic=Heuristics.UNSECURE_DESCRIPTION,
            depends_on=None,
        )

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Search the package description and summary for any of the secure keywords.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            ``HeuristicResult.PASS`` when a keyword is found, ``HeuristicResult.FAIL`` otherwise,
            together with an explanatory message.

        Raises
        ------
        HeuristicAnalyzerValueError
            If the package JSON has no (or an empty) ``info`` entry.
        """
        metadata = pypi_package_json.package_json
        if not metadata.get("info", {}):
            error_msg = "No package info found in metadata"
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        description = json_extract(metadata, ["info", "description"], str)
        summary = json_extract(metadata, ["info", "summary"], str)

        # NOTE(review): if json_extract yields None for a missing field, the literal text "None"
        # is interpolated here; it does not match any keyword, but confirm this is intended.
        searchable_text = f"{description} {summary}"

        if self.SECURE_DESCRIPTION_REGEX.search(searchable_text) is None:
            return HeuristicResult.FAIL, {"message": "Package description is unsecure"}

        return HeuristicResult.PASS, {"message": "Package description is secure"}
4 changes: 2 additions & 2 deletions src/macaron/slsa_analyzer/build_tool/gradle.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This module contains the Gradle class which inherits BaseBuildTool.
Expand Down Expand Up @@ -122,7 +122,7 @@ def get_dep_analyzer(self) -> CycloneDxGradle:
raise DependencyAnalyzerError("No default dependency analyzer is found.")
if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_gradle")):
raise DependencyAnalyzerError(
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.",
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.",
)

tool_name, tool_version = tuple(
Expand Down
4 changes: 2 additions & 2 deletions src/macaron/slsa_analyzer/build_tool/maven.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This module contains the Maven class which inherits BaseBuildTool.
Expand Down Expand Up @@ -116,7 +116,7 @@ def get_dep_analyzer(self) -> CycloneDxMaven:
raise DependencyAnalyzerError("No default dependency analyzer is found.")
if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_maven")):
raise DependencyAnalyzerError(
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_maven')} is not valid.",
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_maven')} is not valid.",
)

tool_name, tool_version = tuple(
Expand Down
4 changes: 2 additions & 2 deletions src/macaron/slsa_analyzer/build_tool/pip.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This module contains the Pip class which inherits BaseBuildTool.
Expand Down Expand Up @@ -88,7 +88,7 @@ def get_dep_analyzer(self) -> DependencyAnalyzer:
tool_name = "cyclonedx_py"
if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"):
raise DependencyAnalyzerError(
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.",
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.",
)
return CycloneDxPython(
resources_path=global_config.resources_path,
Expand Down
4 changes: 2 additions & 2 deletions src/macaron/slsa_analyzer/build_tool/poetry.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This module contains the Poetry class which inherits BaseBuildTool.
Expand Down Expand Up @@ -126,7 +126,7 @@ def get_dep_analyzer(self) -> DependencyAnalyzer:
tool_name = "cyclonedx_py"
if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"):
raise DependencyAnalyzerError(
f"Dependency analyzer {defaults.get('dependency.resolver','dep_tool_gradle')} is not valid.",
f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.",
)
return CycloneDxPython(
resources_path=global_config.resources_path,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@
from macaron.malware_analyzer.pypi_heuristics.metadata.closer_release_join_date import CloserReleaseJoinDateAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.minimal_content import MinimalContentAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.typosquatting_presence import TyposquattingPresenceAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.unsecure_description import UnsecureDescriptionAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer
from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer
from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer
Expand Down Expand Up @@ -358,6 +360,8 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
WheelAbsenceAnalyzer,
AnomalousVersionAnalyzer,
TyposquattingPresenceAnalyzer,
UnsecureDescriptionAnalyzer,
MinimalContentAnalyzer,
]

# name used to query the result of all problog rules, so it can be accessed outside the model.
Expand Down Expand Up @@ -411,6 +415,12 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
{Confidence.HIGH.value}::trigger(malware_high_confidence_4) :-
quickUndetailed, forceSetup, failed({Heuristics.TYPOSQUATTING_PRESENCE.value}).

% Package released with dependency confusion .
{Confidence.HIGH.value}::trigger(malware_high_confidence_5) :-
passed({Heuristics.MINIMAL_CONTENT.value}),
failed({Heuristics.ANOMALOUS_VERSION.value}),
failed({Heuristics.UNSECURE_DESCRIPTION.value}).

% Package released recently with little detail, with multiple releases as a trust marker, but frequent and with
% the same code.
{Confidence.MEDIUM.value}::trigger(malware_medium_confidence_1) :-
Expand All @@ -423,7 +433,8 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
{Confidence.MEDIUM.value}::trigger(malware_medium_confidence_2) :-
quickUndetailed,
failed({Heuristics.ONE_RELEASE.value}),
failed({Heuristics.ANOMALOUS_VERSION.value}).
failed({Heuristics.ANOMALOUS_VERSION.value}),
failed({Heuristics.UNSECURE_DESCRIPTION.value}).

% ----- Evaluation -----

Expand All @@ -432,6 +443,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
{problog_result_access} :- trigger(malware_high_confidence_2).
{problog_result_access} :- trigger(malware_high_confidence_3).
{problog_result_access} :- trigger(malware_high_confidence_4).
{problog_result_access} :- trigger(malware_high_confidence_5).
{problog_result_access} :- trigger(malware_medium_confidence_2).
{problog_result_access} :- trigger(malware_medium_confidence_1).
query({problog_result_access}).
Expand Down
107 changes: 107 additions & 0 deletions tests/malware_analyzer/pypi/test_minimal_content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""Tests for the MinimalContentAnalyzer heuristic."""

from unittest.mock import MagicMock, patch

import pytest

from macaron.errors import SourceCodeError
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
from macaron.malware_analyzer.pypi_heuristics.metadata.minimal_content import MinimalContentAnalyzer


@pytest.fixture(name="analyzer")
def analyzer_() -> MinimalContentAnalyzer:
    """Provide a fresh MinimalContentAnalyzer for each test."""
    instance = MinimalContentAnalyzer()
    return instance


def test_analyze_sufficient_files_pass(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """Check that a package with more files than the threshold passes."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"

    # A single directory holding 60 files stands in for the downloaded source tree.
    fake_tree = [("root", [], [f"file{i}.py" for i in range(60)])]
    with patch("os.walk", return_value=fake_tree):
        outcome = analyzer.analyze(pypi_package_json)

    assert outcome == (HeuristicResult.PASS, {"message": "Package has sufficient content"})
    pypi_package_json.download_sourcecode.assert_called_once()


def test_analyze_exactly_threshold_files_pass(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """Test the analyzer passes when the package has exactly the threshold number of files."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"

    # Derive the count from the analyzer constant so this keeps testing the boundary
    # even if FILES_THRESHOLD is changed later.
    threshold = analyzer.FILES_THRESHOLD
    with patch("os.walk") as mock_walk:
        mock_walk.return_value = [("root", [], [f"file{i}.py" for i in range(threshold)])]
        result, info = analyzer.analyze(pypi_package_json)

    assert result == HeuristicResult.PASS
    assert info == {"message": "Package has sufficient content"}


def test_analyze_insufficient_files_fail(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """Check that a package containing a single file fails."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"

    single_file_tree = [("root", [], ["file1.py"])]
    with patch("os.walk", return_value=single_file_tree):
        outcome = analyzer.analyze(pypi_package_json)

    assert outcome == (HeuristicResult.FAIL, {"message": "Not enough files found"})


def test_analyze_no_files_fail(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """Check that a package whose source tree holds no files fails."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"

    empty_tree = [("root", [], [])]
    with patch("os.walk", return_value=empty_tree):
        outcome = analyzer.analyze(pypi_package_json)

    assert outcome == (HeuristicResult.FAIL, {"message": "Not enough files found"})


def test_analyze_download_failed_raises_error(analyzer: MinimalContentAnalyzer, pypi_package_json: MagicMock) -> None:
    """Check that a failed source code download surfaces as a SourceCodeError."""
    pypi_package_json.download_sourcecode.return_value = False

    # ``match`` searches the string form of the raised exception for the expected message.
    with pytest.raises(SourceCodeError, match="No source code files have been downloaded"):
        analyzer.analyze(pypi_package_json)

    pypi_package_json.download_sourcecode.assert_called_once()


@pytest.mark.parametrize(
    ("file_count", "expected_result"),
    [
        (0, HeuristicResult.FAIL),
        (1, HeuristicResult.FAIL),
        (2, HeuristicResult.FAIL),
        # Boundary cases: one below the threshold must fail, exactly the threshold must pass.
        (MinimalContentAnalyzer.FILES_THRESHOLD - 1, HeuristicResult.FAIL),
        (MinimalContentAnalyzer.FILES_THRESHOLD, HeuristicResult.PASS),
        (55, HeuristicResult.PASS),
        (70, HeuristicResult.PASS),
    ],
)
def test_analyze_various_file_counts(
    analyzer: MinimalContentAnalyzer,
    pypi_package_json: MagicMock,
    file_count: int,
    expected_result: HeuristicResult,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Test the analyzer with various file counts, including both sides of the threshold."""
    pypi_package_json.download_sourcecode.return_value = True
    pypi_package_json.package_sourcecode_path = "/fake/path"
    files = [f"file{i}.py" for i in range(file_count)]
    # Replace os.walk so the analyzer sees one directory holding ``file_count`` files.
    mock_walk = MagicMock(return_value=[("root", [], files)])
    monkeypatch.setattr("os.walk", mock_walk)

    result, _ = analyzer.analyze(pypi_package_json)

    assert result == expected_result
Loading
Loading