refactor: more efficient file traversing for semgrep rule IDs

art1f1c3R · art1f1c3R · commit f00e63d470e4 · 2025-06-02T15:28:29.000+10:00
Signed-off-by: Carl Flottmann <carl.flottmann@oracle.com>
diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py
@@ -185,35 +185,40 @@ def _extract_rule_ids(self, path: str, target_files: set[str]) -> set[str]:
             If any Semgrep rule file could not be safely loaded, or if their format was not in the expected Semgrep
             format, or if there were any files in 'target_files' not found when searching in 'path'.
         """
-        path_tree = glob.glob(os.path.join(path, "**", "*"), recursive=True)
-        all_file_names = {os.path.basename(file) for file in path_tree if os.path.isfile(file)}
-        if not target_files.issubset(all_file_names):
-            error_msg = f"The following semgrep files were not found in {path}: {target_files - all_file_names}"
+        # We keep a record of any file paths we couldn't find to provide a more useful error message, rather than raising
+        # an error on the first missing file we see.
+        missing_files: list[str] = []
+        target_file_paths: list[str] = []
+        rule_ids: set[str] = set()
+
+        for target_file in target_files:
+            file_paths = glob.glob(os.path.join(path, "**", target_file), recursive=True)
+            if not file_paths:
+                missing_files.append(target_file)
+            target_file_paths.extend(file_paths)
+
+        if missing_files:
+            error_msg = f"The following semgrep files were not found in {path}: {missing_files}"
             logger.debug(error_msg)
             raise ConfigurationError(error_msg)
 
-        rule_ids = set()
-        for root, _, files in os.walk(path):
-            files_found = set.intersection(target_files, set(files))
-            for filename in files_found:
-                semgrep_ruleset_file = os.path.join(root, filename)
-
-                try:
-                    with open(semgrep_ruleset_file, encoding="utf-8") as file:
-                        semgrep_ruleset: dict[str, list] = yaml.safe_load(file.read())
-                except yaml.YAMLError as yaml_error:
-                    error_msg = f"Unable to open semgrep rule file {semgrep_ruleset_file}: {yaml_error}."
-                    logger.debug(error_msg)
-                    raise ConfigurationError(error_msg) from yaml_error
-
-                # should be a top-level key "rules", and then a list of rules (dictionaries) with "id" entries
-                try:
-                    for semgrep_rule in semgrep_ruleset["rules"]:
-                        rule_ids.add(semgrep_rule["id"])
-                except (KeyError, TypeError) as format_error:
-                    error_msg = f"Invalid semgrep rule format for {semgrep_ruleset_file}: {format_error}."
-                    logger.debug(error_msg)
-                    raise ConfigurationError(error_msg) from format_error
+        for file_path in target_file_paths:
+            try:
+                with open(file_path, encoding="utf-8") as file:
+                    semgrep_ruleset: dict[str, list] = yaml.safe_load(file.read())
+            except yaml.YAMLError as yaml_error:
+                error_msg = f"Unable to open semgrep rule file {file_path}: {yaml_error}."
+                logger.debug(error_msg)
+                raise ConfigurationError(error_msg) from yaml_error
+
+            # should be a top-level key "rules", and then a list of rules (dictionaries) with "id" entries
+            try:
+                for semgrep_rule in semgrep_ruleset["rules"]:
+                    rule_ids.add(semgrep_rule["id"])
+            except (KeyError, TypeError) as format_error:
+                error_msg = f"Invalid semgrep rule format for {file_path}: {format_error}."
+                logger.debug(error_msg)
+                raise ConfigurationError(error_msg) from format_error
 
         return rule_ids
 
@@ -306,7 +311,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
             # e.g. rule_id = src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute, which comes from
             # the rule ID 'obfuscation_decode-and-execute' inside 'obfuscation.yaml'.
             if rule_id.split(".")[-1] in self.disabled_rule_ids:
-                if rule_id not in self.disabled_rule_ids:
+                if rule_id not in disabled_results:
                     disabled_results[rule_id] = {"message": message, "detections": []}
                 disabled_results[rule_id]["detections"].append({"file": file, "start": start, "end": end})