33
33
class PyPISourcecodeAnalyzer (BaseHeuristicAnalyzer ):
34
34
"""This class is used to analyze the source code of Python PyPI packages. This analyzer is a work in progress.
35
35
36
- This analyzer works in two phases. In the first phase, it will perform a pattern-based scan of all python files
37
- in the source code, looking for suspicious patterns defined by the YAML file in defaults.ini. By default, this
38
- will include suspicious package imports, suspicious hardcoded constants, and suspicious function calls. If this
39
- scan does not find any suspicious activity, the analysis will stop and the package will be marked as benign
40
- by this analyzer. If the scan does find suspicious activity, the analyzer will move on to the second phase.
41
-
42
- In the second phase, the analyzer will perform dataflow analysis. This will track the flow of suspicious constants
43
- and the results of suspicious function calls to where they are used, to determine if they are used in a malicious
44
- manner. Suspicious activity includes data exfiltration, code execution, remote connections, operating system and
45
- process manipulation, and encoded and obfuscated patterns. The types of activity, and their severity and quantity,
46
- will then determine the probability of the package being malicious.
47
-
48
- Currently, this analyzer only supports the first phase, and will return simply boolean results on the maliciousness
49
- of the package.
36
+ Currently the analyzer performs textual pattern matching and dataflow analysis using the open-source features of
37
+ Semgrep. Semgrep open-source taint tracking can only operate within a single locale; this is a known limitation. Default
38
+ rules are stored in 'macaron/resources/pypi_malware_rules' as semgrep .yaml rule files. A user may add additional
39
+ rules from a directory that they specify in the 'defaults.ini' configuration file.
50
40
"""
51
41
52
42
def __init__ (self , resources_path : str | None = None ) -> None :
43
+ """
44
+ Initialise the source code analyzer and load default and custom semgrep rulesets.
45
+
46
+ Parameters
47
+ ----------
48
+ resources_path: str | None
49
+ The path to the resources directory which must contain a 'pypi_malware_rules' directory of
50
+ semgrep rules. If None is provided, then this is loaded from the global config resources path.
51
+ Defaults to None.
52
+
53
+ Raises
54
+ ------
55
+ ConfigurationError
56
+ If the default rule path is invalid, the heuristic.pypi entry is not present, or the semgrep
57
+ validation of the custom rule path fails.
58
+ """
53
59
super ().__init__ (
54
60
name = "anomalous_version_analyzer" ,
55
61
heuristic = Heuristics .SUSPICIOUS_PATTERNS ,
@@ -65,6 +71,12 @@ def _load_defaults(self, resources_path: str) -> tuple[str, str | None]:
65
71
66
72
Semgrep validation is run on the custom rules provided by the user.
67
73
74
+ Parameters
75
+ ----------
76
+ resources_path: str
77
+ The path to the resources directory which must contain a 'pypi_malware_rules' directory of
78
+ semgrep rules.
79
+
68
80
Returns
69
81
-------
70
82
tuple[str, str | None]
@@ -140,7 +152,8 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
140
152
if there is no source code available.
141
153
"""
142
154
analysis_result : defaultdict = defaultdict (list )
143
- semgrep_commands : list [str ] = ["semgrep" , "scan" ]
155
+ # Only run semgrep open-source features, and disable 'nosemgrep' comment handling so that such comments in the scanned code cannot suppress our findings
156
+ semgrep_commands : list [str ] = ["semgrep" , "scan" , "--oss-only" , "--disable-nosem" ]
144
157
result : HeuristicResult = HeuristicResult .PASS
145
158
146
159
source_code_path = pypi_package_json .package_sourcecode_path
0 commit comments