Skip to content

feat(dependency): Improve Python site-packages scanning #4823

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cve_bin_tool/checkers/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ class PythonChecker(Checker):
r"Fatal Python error: unable to decode the command line argument",
r"Internal error in the Python interpreter",
r"CPython",
r"Python package: ",
r"dist-info",
r"egg-info",
]
FILENAME_PATTERNS = [r"python"]
VERSION_PATTERNS = [
Expand All @@ -25,5 +28,7 @@ class PythonChecker(Checker):
r"([23]+\.[0-9]+\.[0-9]+)\r?\nPython %s",
r"([23]+\.[0-9]+\.[0-9]+)\r?\n%\.80s \(%\.80s\) %\.80s",
r"tags/v([23]+\.[0-9]+\.[0-9]+)\r?\n",
r"Python\s+([23]+\.[0-9]+\.[0-9]+)",
r"__version__\s*=\s*['\"]((?:[23]+\.[0-9]+\.[0-9]+))['\"]",
]
VENDOR_PRODUCT = [("python_software_foundation", "python"), ("python", "python")]
54 changes: 54 additions & 0 deletions cve_bin_tool/dependency_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
class DependencyGraph:
    """Directed graph of package names and their direct dependencies.

    All names are stored lower-cased, so lookups are case-insensitive.
    """

    def __init__(self):
        # {package_name: [dependency_names]}, every name lower-cased.
        self.graph = {}

    def add_package(self, package, dependencies):
        """Add *package* with its direct *dependencies*, replacing any earlier entry."""
        self.graph[package.lower()] = [dep.lower() for dep in dependencies]

    def resolve_dependencies(self, root_package):
        """
        Return *root_package* and everything it depends on, dependencies first.

        The result is a post-order listing of an iterative depth-first search
        (no recursion): every package appears exactly once and, unless a
        dependency cycle makes that impossible, after all of its own
        dependencies.  Names in the result are lower-cased.  A package that
        was never added is treated as having no dependencies.
        """
        finished = set()  # fully processed, already in ``ordered``
        on_path = set()  # expanded but not finished: the active DFS path
        ordered = []
        # Each entry is (node, expanded_flag); the flag marks the second
        # visit, when all dependencies of the node have been handled.
        stack = [(root_package.lower(), False)]

        while stack:
            current, expanded = stack.pop()

            if expanded:
                on_path.discard(current)
                finished.add(current)
                ordered.append(current)
                continue

            # The same node may be queued more than once (shared dependency);
            # only the first visit expands it.
            if current in finished or current in on_path:
                continue

            on_path.add(current)
            stack.append((current, True))

            # Only nodes on the active path are cycle edges.  A node that is
            # merely queued as a pending sibling must still be pushed, or it
            # would be emitted after the package that depends on it.
            for dep in reversed(self.graph.get(current, ())):
                if dep not in finished and dep not in on_path:
                    stack.append((dep, False))
        return ordered


# Example usage:
if __name__ == "__main__":
    # Sample graph for a quick manual check: each key lists its direct dependencies.
    sample_graph = {
        "packageA": ["packageB", "packageC"],
        "packageB": ["packageD"],
        "packageC": ["packageD", "packageE"],
        "packageD": [],
        "packageE": [],
    }

    dg = DependencyGraph()
    for package_name, package_deps in sample_graph.items():
        dg.add_package(package_name, package_deps)

    # Resolve everything packageA needs and show the resulting order.
    dependencies = dg.resolve_dependencies("packageA")
    print("Resolved order:", dependencies)
80 changes: 72 additions & 8 deletions cve_bin_tool/parsers/python.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: GPL-3.0-or-later

import importlib.metadata as importlib_metadata
import json
import re
import subprocess
from re import MULTILINE, compile, search

from packaging.requirements import Requirement
from packaging.version import parse as parse_version

from cve_bin_tool.parsers import Parser
Expand Down Expand Up @@ -140,24 +142,86 @@ def generate_purl(self, product, vendor="", qualifier={}, subpath=None):

return purl

def _get_installed_packages(self):
    """
    Return the installed distributions found by importlib.metadata.

    The result maps each distribution's ``Name`` metadata value (original
    case) to its distribution object.  Distributions whose name is missing,
    empty or unreadable are skipped individually, so a single broken
    installation does not discard every other package.  If the enumeration
    itself fails, whatever was collected so far is returned.
    """
    packages = {}
    try:
        for dist in importlib_metadata.distributions():
            try:
                name = dist.metadata["Name"]
            except Exception as e:
                self.logger.debug(f"Error getting installed packages: {e}")
                continue
            # Callers lower-case the keys, so a None/empty name must never
            # become a key.
            if not name:
                continue
            packages[name] = dist  # Keep original case
    except Exception as e:
        self.logger.debug(f"Error getting installed packages: {e}")
    return packages

def _parse_dependencies(self, dist):
    """
    Return the names of the requirements of *dist* that apply here.

    Every entry of ``dist.requires`` is parsed as a requirement; it is kept
    when it has no environment marker or when its marker evaluates to true.
    Entries that fail to parse or evaluate are logged at debug level and
    skipped.  Names are returned in their original case.
    """
    applicable = []

    for req_str in dist.requires or []:
        try:
            parsed = Requirement(req_str)
            marker = parsed.marker
            if marker is not None and not marker.evaluate():
                # Marker does not match the running environment.
                continue
            applicable.append(parsed.name)  # Keep original case
        except Exception as e:
            self.logger.debug(f"Error parsing requirement {req_str}: {e}")

    return applicable

def run_checker(self, filename):
    """
    Yield vendor/product information for a Python package and its dependencies.

    This generator runs only for python packages; there are no actual
    checkers.  The package name and version are read from the first three
    lines of the METADATA/PKG-INFO text of *filename*, and the package
    itself is looked up exactly once.  When a distribution with the same
    (PEP 503 normalised) name is installed in the environment running the
    scan, every applicable dependency that is installed as well is looked
    up too, each at most once.

    NOTE(review): dependency versions are taken from the distributions
    installed in the scanning environment, not from the scanned tree --
    confirm that this is the intended source.
    """
    self.filename = filename
    lines = parse_strings(self.filename)
    lines = "\n".join(lines.splitlines()[:3])

    def canonical(name):
        # PEP 503 normalisation: runs of "-", "_" and "." are equivalent and
        # case is ignored, so "Typing_Extensions" == "typing-extensions".
        return re.sub(r"[-_.]+", "-", name).lower()

    try:
        product = search(compile(r"^Name: (.+)$", MULTILINE), lines).group(1)
        version = search(compile(r"^Version: (.+)$", MULTILINE), lines).group(1)

        # Process the main package whether or not it is installed in the
        # scanning environment; the scanned file itself names it.
        purl = self.generate_purl(product)  # Original name for PURL
        vendor = self.get_vendor(purl, product, version)
        if vendor is not None:
            yield from vendor

        # Build one normalised lookup table instead of scanning every
        # installed package for each name; the first entry wins on clashes.
        installed_packages = {}
        for name, installed_dist in self._get_installed_packages().items():
            if isinstance(name, str) and name:
                installed_packages.setdefault(canonical(name), installed_dist)

        product_key = canonical(product)
        dist = installed_packages.get(product_key)
        if dist is not None:
            # Seeded with the main package so a self-reference or a repeated
            # requirement is not reported a second time.
            reported = {product_key}
            for dep_name in self._parse_dependencies(dist):
                dep_key = canonical(dep_name)
                dep_dist = installed_packages.get(dep_key)
                if dep_dist is None or dep_key in reported:
                    continue
                reported.add(dep_key)
                dep_purl = self.generate_purl(dep_name)
                dep_version = dep_dist.version
                dep_vendor = self.get_vendor(dep_purl, dep_name, dep_version)
                if dep_vendor is not None:
                    yield from dep_vendor

    # There are packages with a METADATA file in them containing different data from what the tool expects
    except AttributeError:
        self.logger.debug(f"{filename} is an invalid METADATA/PKG-INFO")
    self.logger.debug(f"Done scanning file: {filename}")
15 changes: 15 additions & 0 deletions cve_bin_tool/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import fnmatch
import os
import platform
import re
import sys
from enum import Enum
Expand Down Expand Up @@ -608,6 +609,20 @@ def windows_fixup(filename):
return filename.replace(":", "_").replace("\\", "_")


def get_environment_context():
    """
    Describe the running interpreter and platform for marker evaluation.

    Returns a dictionary with the keys ``python_version`` ("major.minor"),
    ``sys_platform``, ``os_name``, ``platform_machine``, ``platform_system``
    and ``platform_release``.
    """
    version = sys.version_info
    context = dict(
        python_version=f"{version.major}.{version.minor}",
        sys_platform=sys.platform,
        os_name=os.name,
        platform_machine=platform.machine(),
        platform_system=platform.system(),
        platform_release=platform.release(),
    )
    return context


def strip_path(path_element: str, scanned_dir: str) -> str:
    """Return *path_element* relative to *scanned_dir*, prefixed with its own drive and root."""
    parsed = Path(path_element)
    relative = os.path.relpath(path_element, scanned_dir)
    return "".join((parsed.drive, parsed.root, relative))
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ defusedxml
distro
filetype>=1.2.0
gsutil
importlib_metadata>=3.6; python_version < "3.10"
importlib_metadata>=4.0; python_version < "3.10"
importlib_resources; python_version < "3.9"
jinja2>=2.11.3
jsonschema>=3.0.2
Expand Down
69 changes: 69 additions & 0 deletions test/test_dependency_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright (C) 2022 Intel Corporation
# SPDX-License-Identifier: GPL-3.0-or-later

from cve_bin_tool.dependency_graph import DependencyGraph


def test_dependency_order():
    """Dependencies must be listed before the packages that require them."""
    graph = DependencyGraph()
    for package, requires in (
        ("A", ["B", "C"]),
        ("B", ["D"]),
        ("C", ["D", "E"]),
        ("D", []),
        ("E", []),
    ):
        graph.add_package(package, requires)

    resolved = graph.resolve_dependencies("A")

    # Position of every package in the resolved order (names are lower-cased).
    position = {name: resolved.index(name) for name in ("a", "b", "c", "d", "e")}

    # D is needed by both B and C, so it precedes them.
    assert position["d"] < position["b"]
    assert position["d"] < position["c"]

    # E is needed by C.
    assert position["e"] < position["c"]

    # A needs B and C, so it follows both.
    assert position["b"] < position["a"]
    assert position["c"] < position["a"]


def test_cycle_handling():
    """A dependency cycle must terminate and list each package once."""
    graph = DependencyGraph()
    # Cycle A -> B -> C -> A
    for package, dependency in zip("ABC", "BCA"):
        graph.add_package(package, [dependency])

    resolved = graph.resolve_dependencies("A")

    # Exactly three entries, none repeated, none missing.
    assert len(resolved) == 3
    assert len(set(resolved)) == 3
    assert all(pkg in resolved for pkg in ["a", "b", "c"])


def test_deep_dependency_chain():
    """A linear chain resolves from its leaf up to the root."""
    graph = DependencyGraph()
    # Chain A -> B -> C -> D -> E -> F, where F has no dependencies.
    chain = ["A", "B", "C", "D", "E", "F"]
    for package, dependency in zip(chain, chain[1:]):
        graph.add_package(package, [dependency])
    graph.add_package("F", [])

    resolved = graph.resolve_dependencies("A")
    first, last = resolved[0], resolved[-1]

    # The leaf comes first, the root last.
    assert first == "f"
    assert last == "a"
34 changes: 34 additions & 0 deletions test/test_dependency_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (C) 2022 Intel Corporation
# SPDX-License-Identifier: GPL-3.0-or-later

import pytest
from packaging.requirements import Requirement


# Stand-alone copy of the parser's dependency filtering, used only by this test module.
def parse_dependencies(metadata):
    """
    Return lower-cased names of the ``Requires-Dist`` entries that apply.

    An entry applies when it has no environment marker or when its marker
    evaluates to true.  A missing ``Requires-Dist`` key yields an empty list.
    """
    requirements = (Requirement(spec) for spec in metadata.get("Requires-Dist", []))
    return [
        requirement.name.lower()
        for requirement in requirements
        if not requirement.marker or requirement.marker.evaluate()
    ]


@pytest.fixture
def metadata_with_markers():
    """Metadata with one matching marker, one non-matching marker and one bare requirement."""
    requirements = [
        "packageA; python_version >= '3.0'",  # Included (assuming current python is >=3.0)
        "packageB; python_version < '2.0'",  # Excluded
        "packageC",  # Always included
    ]
    return {"Requires-Dist": requirements}


def test_parse_dependencies(metadata_with_markers):
    """Only requirements whose markers match (or that have none) are returned."""
    resolved = set(parse_dependencies(metadata_with_markers))
    assert {"packagea", "packagec"} <= resolved
    assert "packageb" not in resolved
65 changes: 65 additions & 0 deletions test/test_python_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import importlib

import pytest

# Assume our parser is imported as follows:
from cve_bin_tool.parsers.python import PythonParser


class DummyLogger:
    """Logger stand-in whose methods accept a message and discard it."""

    def debug(self, msg):
        """Ignore a debug message."""
        return None

    def error(self, msg):
        """Ignore an error message."""
        return None


class DummyDB:
    """Empty placeholder passed to the parser where a database object is expected."""


@pytest.fixture
def parser():
    """Provide a PythonParser wired to the dummy database and dummy logger."""
    cve_db = DummyDB()
    logger = DummyLogger()
    return PythonParser(cve_db, logger)


def test_get_installed_packages(parser, monkeypatch):
    """Every distribution reported by importlib.metadata is keyed by its name."""

    class DummyDist:
        """Minimal distribution exposing ``metadata`` and ``requires``."""

        def __init__(self, name, version, requires=None):
            self.metadata = {"Name": name, "Version": version}
            self.requires = requires or []

    dummy_dists = [
        DummyDist("packageA", "1.0"),
        DummyDist("packageB", "2.0", requires=["packageC; python_version >= '3.6'"]),
        DummyDist("packageC", "3.0"),
    ]

    # Replace distributions() on importlib.metadata itself.
    monkeypatch.setattr(importlib.metadata, "distributions", lambda: dummy_dists)

    packages = parser._get_installed_packages()
    for expected_name in ("packageA", "packageB", "packageC"):
        assert expected_name in packages


def test_parse_dependencies(parser):
    """Markers decide whether a requirement is reported as a dependency."""

    class DummyDist:
        """Distribution stub exposing only ``requires``."""

        def __init__(self, requires):
            self.requires = requires

    def dependencies_of(requirement):
        # Parse a distribution that declares exactly this one requirement.
        return parser._parse_dependencies(DummyDist(requires=[requirement]))

    # No marker: always a dependency.
    assert "packageD" in dependencies_of("packageD>=1.0")

    # Marker that is true for the running interpreter.
    assert "packageE" in dependencies_of("packageE; python_version >= '3.0'")

    # Marker that is false for the running interpreter.
    assert "packageF" not in dependencies_of("packageF; python_version < '2.0'")
Loading
Loading