This repository was archived by the owner on Apr 22, 2025. It is now read-only.

fixes after deploy 03.25 #169

Merged on Mar 6, 2025 (17 commits)
sequence_processing_pipeline/FastQCJob.py (31 changes: 12 additions & 19 deletions)
@@ -58,30 +58,23 @@ def _get_commands(self):
         """
         results = []
 
-        if self.is_amplicon:
-            # skip this step for amplicon runs since raw and processed are the
-            # same file.
-            project_names = []
-        else:
-            # gather the parameters for processing all relevant raw fastq
-            # files.
-            params, project_names = self._scan_fastq_files(True)
-
-            for fwd_file_path, rev_file_path, output_path in params:
-                command = ['fastqc', '--noextract', '-t', str(self.nthreads),
-                           fwd_file_path, rev_file_path, '-o', output_path]
-                results.append(' '.join(command))
-
-        # next, do the same for the trimmed/filtered fastq files.
-        params, additional_project_names = self._scan_fastq_files(False)
-
+        # gather the parameters for processing all relevant raw fastq
+        # files.
+        params, project_names = self._scan_fastq_files(True)
         for fwd_file_path, rev_file_path, output_path in params:
             command = ['fastqc', '--noextract', '-t', str(self.nthreads),
                        fwd_file_path, rev_file_path, '-o', output_path]
             results.append(' '.join(command))
 
-        # remove duplicate project names from the list
-        project_names = list(set(project_names + additional_project_names))
+        if not self.is_amplicon:
+            # next, do the same for the trimmed/filtered fastq files.
+            params, additional_project_names = self._scan_fastq_files(False)
+            for fwd_file_path, rev_file_path, output_path in params:
+                command = ['fastqc', '--noextract', '-t', str(self.nthreads),
+                           fwd_file_path, rev_file_path, '-o', output_path]
+                results.append(' '.join(command))
+            # remove duplicate project names from the list
+            project_names = list(set(project_names + additional_project_names))
 
         return results, project_names
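The net effect: FastQC commands for the raw fastq files are now generated for every run type, while the second pass over trimmed/filtered files (and the project-name merge) only happens for non-amplicon runs. A minimal sketch of the resulting behavior, with hypothetical paths and a hypothetical thread count standing in for the real _scan_fastq_files() output:

# Sketch only: raw_pairs/filtered_pairs stand in for _scan_fastq_files().
def build_commands(raw_pairs, filtered_pairs, is_amplicon, nthreads=4):
    results = []
    # raw files are always QC'd, amplicon or not
    for fwd, rev, out in raw_pairs:
        results.append(' '.join(['fastqc', '--noextract', '-t',
                                 str(nthreads), fwd, rev, '-o', out]))
    if not is_amplicon:
        # for amplicon runs, raw and processed are the same file,
        # so this second pass is skipped entirely
        for fwd, rev, out in filtered_pairs:
            results.append(' '.join(['fastqc', '--noextract', '-t',
                                     str(nthreads), fwd, rev, '-o', out]))
    return results

raw = [('raw/s1_R1.fastq.gz', 'raw/s1_R2.fastq.gz', 'out/raw')]
filt = [('filt/s1_R1.fastq.gz', 'filt/s1_R2.fastq.gz', 'out/filt')]
print(len(build_commands(raw, filt, is_amplicon=True)))   # 1 command
print(len(build_commands(raw, filt, is_amplicon=False)))  # 2 commands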

sequence_processing_pipeline/GenPrepFileJob.py (2 changes: 1 addition & 1 deletion)
@@ -52,7 +52,7 @@ def __init__(self, run_dir, convert_job_path, qc_job_path, output_path,
 
         if isdir(self.reports_path):
             copytree(self.reports_path, reports_dir)
-        else:
+        elif not self.is_amplicon:
             # assume self.reports_path is a file.
             makedirs(reports_dir)
             copy(self.reports_path, reports_dir)
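With else: replaced by elif not self.is_amplicon:, the branch now has three outcomes instead of two. A standalone sketch of the patched logic (the helper name is hypothetical; in the source this lives in __init__):

from os import makedirs
from os.path import isdir
from shutil import copy, copytree

def stage_reports(reports_path, reports_dir, is_amplicon):
    # hypothetical helper mirroring the patched branch
    if isdir(reports_path):
        # a directory of reports: copy the whole tree
        copytree(reports_path, reports_dir)
    elif not is_amplicon:
        # assume reports_path is a single file and copy it
        makedirs(reports_dir)
        copy(reports_path, reports_dir)
    # amplicon runs without a reports directory: nothing is copied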
sequence_processing_pipeline/Job.py (6 changes: 6 additions & 0 deletions)
@@ -92,6 +92,8 @@ def __init__(self, root_dir, output_path, job_name, executable_paths,
         self.is_test = True if [
             x for x in stack() if 'unittest' in x.filename] else False
 
+        self.audit_folders = None
+
         # For each executable in the list, get its filename and use _which()
         # to see if it can be found. Directly pass an optional list of modules
         # to load before-hand, so that the binary can be found.
@@ -475,6 +477,10 @@ def audit(self, sample_ids):
         for root, dirs, files in walk(self.output_path):
             if 'zero_files' in root:
                 continue
+            if self.audit_folders is not None:
+                # let's check that any of the audit_folders is in root
+                if not [f for f in self.audit_folders if f in root]:
+                    continue
             files_found += [join(root, x) for x in files if
                             x.endswith(self.suffix)]
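When a subclass populates audit_folders, audit() now only counts files whose path contains one of those folder names; with the default of None the behavior is unchanged. A self-contained sketch of the filter, using hypothetical walk results in place of a real output tree:

from os.path import join

# hypothetical (root, dirs, files) triples, as os.walk() would yield them
walked = [('out/Proj1/filtered_sequences', [], ['s1.trimmed.fastq.gz']),
          ('out/Proj1/only-adapter-filtered', [], ['s1.fastq.gz']),
          ('out/Proj1/zero_files', [], ['s2.fastq.gz'])]

audit_folders = ['filtered_sequences']
suffix = 'fastq.gz'

files_found = []
for root, dirs, files in walked:
    if 'zero_files' in root:
        continue
    if audit_folders is not None:
        # keep this root only if at least one audit folder appears in it
        if not [f for f in audit_folders if f in root]:
            continue
    files_found += [join(root, x) for x in files if x.endswith(suffix)]

print(files_found)  # ['out/Proj1/filtered_sequences/s1.trimmed.fastq.gz']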

sequence_processing_pipeline/MultiQCJob.py (8 changes: 4 additions & 4 deletions)
@@ -116,7 +116,7 @@ def _get_failed_indexes(self, job_id):
         # the command used for this job
         completed_indexes = [int(cf.split('_')[-1]) for cf in completed_files]
 
-        all_indexes = list(range(1, len(self.commands) + 1))
+        all_indexes = list(range(1, len(self.array_cmds) + 1))
         failed_indexes = sorted(set(all_indexes) - set(completed_indexes))
 
         # generate log-file here instead of in run() where it can be
@@ -180,12 +180,12 @@ def _get_commands(self):
     def _generate_job_script(self):
         template = self.jinja_env.get_template("multiqc_job.sh")
 
-        array_cmds = self._get_commands()
+        self.array_cmds = self._get_commands()
 
         job_name = f'{self.qiita_job_id}_{self.job_name}'
         details_file_name = f'{self.job_name}.array-details'
         array_details = join(self.output_path, details_file_name)
-        array_params = "1-%d%%%d" % (len(array_cmds), self.pool_size)
+        array_params = "1-%d%%%d" % (len(self.array_cmds), self.pool_size)
         modules_to_load = ' '.join(self.modules_to_load)
 
         with open(self.job_script_path, mode="w", encoding="utf-8") as f:
@@ -202,7 +202,7 @@ def _generate_job_script(self):
 
         # save the .details file as well
         with open(array_details, 'w') as f:
-            f.write('\n'.join(array_cmds) + '\n')
+            f.write('\n'.join(self.array_cmds) + '\n')
 
         return self.job_script_path
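Promoting the local array_cmds to self.array_cmds is what lets _get_failed_indexes() above size the job array correctly: both methods now read the same list, instead of _get_failed_indexes() depending on a separate self.commands attribute. A trimmed-down sketch of the shared state (class body and values are hypothetical):

class MultiQCJobSketch:
    # hypothetical stand-in trimmed to the two methods that share state
    def _get_commands(self):
        return ['multiqc ... Proj1', 'multiqc ... Proj2']  # placeholders

    def _generate_job_script(self):
        self.array_cmds = self._get_commands()  # persisted on the instance
        return '1-%d%%%d' % (len(self.array_cmds), 8)  # pool_size of 8

    def _get_failed_indexes(self, completed_indexes):
        # every array index should have produced a completed-file marker
        all_indexes = list(range(1, len(self.array_cmds) + 1))
        return sorted(set(all_indexes) - set(completed_indexes))

job = MultiQCJobSketch()
print(job._generate_job_script())     # 1-2%8
print(job._get_failed_indexes([1]))   # [2]: index 2 never finished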

sequence_processing_pipeline/NuQCJob.py (1 change: 1 addition & 0 deletions)
@@ -80,6 +80,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path,
         self.gres_value = gres_value
         self.pmls_path = pmls_path
         self.additional_fastq_tags = additional_fastq_tags
+        self.audit_folders = ['filtered_sequences']
 
         # for projects that use sequence_processing_pipeline as a dependency,
         # jinja_env must be set to sequence_processing_pipeline's root path,
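Setting audit_folders here ties into the new filter added to Job.audit() above: NuQCJob's audit now only counts outputs under filtered_sequences, so files in other directories of the output tree no longer contribute matches (see the audit sketch in the Job.py section).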