generate_sequence_counts

antgonza · antgonza · commit 551bfd18edd1 · 2025-09-03T08:11:51.000-06:00
diff --git a/src/qp_klp/Protocol.py b/src/qp_klp/Protocol.py
@@ -432,38 +432,29 @@ def convert_raw_to_fastq(self):
         return failed_samples
 
     def generate_sequence_counts(self):
-        # config = self.pipeline.get_software_configuration('tell-seq')
-
-        # files_to_count_path = join(self.pipeline.output_path,
-        #                            'files_to_count.txt')
-
-        # with open(files_to_count_path, 'w') as f:
-        #     for root, _, files in walk(self.raw_fastq_files_path):
-        #         for _file in files:
-        #             if determine_orientation(_file) in ['R1', 'R2']:
-        #                 print(join(root, _file), file=f)
-
-        # job = SeqCountsJob(self.pipeline.run_dir,
-        #                    self.pipeline.output_path,
-        #                    config['queue'],
-        #                    config['nodes'],
-        #                    config['wallclock_time_in_minutes'],
-        #                    config['normcount_mem_limit'],
-        #                    config['modules_to_load'],
-        #                    self.master_qiita_job_id,
-        #                    config['job_max_array_length'],
-        #                    files_to_count_path,
-        #                    self.pipeline.get_sample_sheet_path(),
-        #                    cores_per_task=config['tellread_cores'])
-
-        # if 'SeqCountsJob' not in self.skip_steps:
-        #     job.run(callback=self.job_callback)
-
-        # if successful, set self.reports_path
+        # for other instances of generate_sequence_counts in other objects
+        # the sequence counting needs to be done; however, for PacBio we
+        # have already done it and just need to merge the results.
+        gz_files = glob(f'{self.raw_fastq_files_path}/*/*.fastq.gz')
+        data, missing_files = [], []
+
+        for gzf in gz_files:
+            cf = gzf.replace('.fastq.gz', '.counts.txt')
+            sn = basename(cf).replace('.counts.txt', '')
+            if not exists(cf):
+                missing_files.append(sn)
+                continue
+            with open(cf, 'r') as fh:
+                counts = fh.read().strip()
+            data.append({'SampleID': sn, '# Reads': counts})
+
+        if missing_files:
+            raise ValueError(f'Missing count files: {missing_files}')
+
+        df = pd.DataFrame(data)
         self.reports_path = join(self.pipeline.output_path,
                                  'SeqCounts.csv')
-        open(self.reports_path, 'w').write(
-            'SampleID,# Reads\nA1,100')
+        df.to_csv(self.reports_path, index=False)
 
     def integrate_results(self):
         pass
diff --git a/tests/data/configuration_profiles/pacbio_metagenomic.json b/tests/data/configuration_profiles/pacbio_metagenomic.json
@@ -8,23 +8,20 @@
         "nprocs": 16,
         "queue": "qiita",
         "wallclock_time_in_minutes": 216,
-        "modules_to_load": [
-          "bclconvert_3.7.5"
-        ],
-        "executable_path": "bcl-convert",
-        "per_process_memory_limit": "10gb"
+        "modules_to_load": [],
+        "executable_path": "",
+        "per_process_memory_limit": "1gb"
       },
       "nu-qc": {
         "nodes": 1,
         "cpus_per_task": 8,
         "queue": "qiita",
         "wallclock_time_in_minutes": 240,
-        "minimap2_databases": ["/databases/minimap2/db_1.mmi", "/databases/minimap2/db_2.mmi"],
-        "modules_to_load": [
-          "fastp_0.20.1",
-          "samtools_1.12",
-          "minimap2_2.18"
+        "minimap2_databases": [
+          "/databases/minimap2/db_1.mmi",
+          "/databases/minimap2/db_2.mmi"
         ],
+        "modules_to_load": ["fastp_0.20.1", "samtools_1.12", "minimap2_2.18"],
         "fastp_executable_path": "fastp",
         "minimap2_executable_path": "minimap2",
         "samtools_executable_path": "samtools",
@@ -49,9 +46,7 @@
         "queue": "qiita",
         "nthreads": 16,
         "wallclock_time_in_minutes": 60,
-        "modules_to_load": [
-          "fastqc_0.11.5"
-        ],
+        "modules_to_load": ["fastqc_0.11.5"],
         "fastqc_executable_path": "fastqc",
         "multiqc_executable_path": "multiqc",
         "multiqc_config_file_path": "sequence_processing_pipeline/multiqc-bclconvert-config.yaml",