@@ -185,35 +185,40 @@ def _extract_rule_ids(self, path: str, target_files: set[str]) -> set[str]:
185185 If any Semgrep rule file could not be safely loaded, or if their format was not in the expected Semgrep
186186 format, or if there were any files in 'target_files' not found when searching in 'path'.
187187 """
188- path_tree = glob .glob (os .path .join (path , "**" , "*" ), recursive = True )
189- all_file_names = {os .path .basename (file ) for file in path_tree if os .path .isfile (file )}
190- if not target_files .issubset (all_file_names ):
191- error_msg = f"The following semgrep files were not found in { path } : { target_files - all_file_names } "
188+ # We keep a record of any file paths we couldn't find to provide a more useful error message, rather than raising
189+ # an error on the first missing file we see.
190+ missing_files : list [str ] = []
191+ target_file_paths : list [str ] = []
192+ rule_ids : set [str ] = set ()
193+
194+ for target_file in target_files :
195+ file_paths = glob .glob (os .path .join (path , "**" , target_file ), recursive = True )
196+ if not file_paths :
197+ missing_files .append (target_file )
198+ target_file_paths .extend (file_paths )
199+
200+ if missing_files :
201+ error_msg = f"The following semgrep files were not found in { path } : { missing_files } "
192202 logger .debug (error_msg )
193203 raise ConfigurationError (error_msg )
194204
195- rule_ids = set ()
196- for root , _ , files in os .walk (path ):
197- files_found = set .intersection (target_files , set (files ))
198- for filename in files_found :
199- semgrep_ruleset_file = os .path .join (root , filename )
200-
201- try :
202- with open (semgrep_ruleset_file , encoding = "utf-8" ) as file :
203- semgrep_ruleset : dict [str , list ] = yaml .safe_load (file .read ())
204- except yaml .YAMLError as yaml_error :
205- error_msg = f"Unable to open semgrep rule file { semgrep_ruleset_file } : { yaml_error } ."
206- logger .debug (error_msg )
207- raise ConfigurationError (error_msg ) from yaml_error
208-
209- # should be a top-level key "rules", and then a list of rules (dictionaries) with "id" entries
210- try :
211- for semgrep_rule in semgrep_ruleset ["rules" ]:
212- rule_ids .add (semgrep_rule ["id" ])
213- except (KeyError , TypeError ) as format_error :
214- error_msg = f"Invalid semgrep rule format for { semgrep_ruleset_file } : { format_error } ."
215- logger .debug (error_msg )
216- raise ConfigurationError (error_msg ) from format_error
205+ for file_path in target_file_paths :
206+ try :
207+ with open (file_path , encoding = "utf-8" ) as file :
208+ semgrep_ruleset : dict [str , list ] = yaml .safe_load (file .read ())
209+ except yaml .YAMLError as yaml_error :
210+ error_msg = f"Unable to open semgrep rule file { file_path } : { yaml_error } ."
211+ logger .debug (error_msg )
212+ raise ConfigurationError (error_msg ) from yaml_error
213+
214+ # should be a top-level key "rules", and then a list of rules (dictionaries) with "id" entries
215+ try :
216+ for semgrep_rule in semgrep_ruleset ["rules" ]:
217+ rule_ids .add (semgrep_rule ["id" ])
218+ except (KeyError , TypeError ) as format_error :
219+ error_msg = f"Invalid semgrep rule format for { file_path } : { format_error } ."
220+ logger .debug (error_msg )
221+ raise ConfigurationError (error_msg ) from format_error
217222
218223 return rule_ids
219224
@@ -306,7 +311,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
306311 # e.g. rule_id = src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute, which comes from
307312 # the rule ID 'obfuscation_decode-and-execute' inside 'obfuscation.yaml'.
308313 if rule_id .split ("." )[- 1 ] in self .disabled_rule_ids :
309- if rule_id not in self . disabled_rule_ids :
314+ if rule_id not in disabled_results :
310315 disabled_results [rule_id ] = {"message" : message , "detections" : []}
311316 disabled_results [rule_id ]["detections" ].append ({"file" : file , "start" : start , "end" : end })
312317
0 commit comments