Merge pull request #48 from antgonza/add-artifact-prep-method

charles-cowart · web-flow · commit 690362abaf17 · 2023-01-11T13:38:43.000-08:00
add artifact_and_preparation_files method
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 Qiita Client
 ============
 
-[![Build Status](https://travis-ci.org/qiita-spots/qiita_client.png?branch=master)](https://travis-ci.org/qiita-spots/qiita_client)
+[![Build Status](https://github.com/qiita-spots/qiita_client/actions/workflows/qiita-ci.yml/badge.svg)](https://github.com/qiita-spots/qiita_client/actions/workflows/qiita-ci.yml)
 
 Qiita (canonically pronounced *cheetah*) is an analysis environment for microbiome (and other "comparative -omics") datasets.
 
diff --git a/qiita_client/qiita_client.py b/qiita_client/qiita_client.py
@@ -9,8 +9,11 @@
 import time
 import requests
 import threading
+import pandas as pd
 from json import dumps
 from random import randint
+from itertools import zip_longest
+from os.path import basename
 
 from .exceptions import (QiitaClientError, NotFoundError, BadRequestError,
                          ForbiddenError)
@@ -23,6 +26,7 @@
 MAX_RETRIES = 3
 MIN_TIME_SLEEP = 180
 MAX_TIME_SLEEP = 360
+BLANK_FILE_THRESHOLD = 100
 
 
 class ArtifactInfo(object):
@@ -556,3 +560,110 @@ def complete_job(self, job_id, success, error_msg=None,
                                              artifacts_info=artifacts_info))
         # Create the URL where we have to post the results
         self.post("/qiita_db/jobs/%s/complete/" % job_id, data=json_payload)
+
+    def artifact_and_preparation_files(self, artifact_id,
+                                       ignore_small_files=True):
+        """Gets the artifact and preparation files from a given artifact_id
+
+        Parameters
+        ----------
+        artifact_id : int
+            The artifact id
+        ignore_small_files : bool
+            Whether to skip samples with a file under BLANK_FILE_THRESHOLD
+            in size or keep all (only applies to per_sample_FASTQ artifacts)
+
+        Returns
+        -------
+        dict
+            files available in the artifact
+        pandas.DataFrame
+            the prep information file for that artifact
+
+        Raises
+        ------
+        RuntimeError
+            - If the artifact belongs to an analysis
+
+        """
+        artifact_info = self.get("/qiita_db/artifacts/%s/" % artifact_id)
+
+        if artifact_info['analysis'] is not None:
+            raise RuntimeError(
+                f'Artifact {artifact_id} is an analysis artifact, this method '
+                'is meant to work with artifacts linked to a preparation.')
+
+        prep_info = self.get('/qiita_db/prep_template/%s/'
+                             % artifact_info['prep_information'][0])
+        prep_info = pd.read_csv(prep_info['prep-file'], sep='\t', dtype=str)
+        if artifact_info['type'] == 'per_sample_FASTQ':
+            files, prep_info = self._process_files_per_sample_fastq(
+                artifact_info['files'], prep_info, ignore_small_files)
+        else:
+            files = {k: [vv['filepath'] for vv in v]
+                     for k, v in artifact_info['files'].items()}
+
+        return files, prep_info
+
+    def _process_files_per_sample_fastq(self, files, prep_info,
+                                        ignore_small_files):
+        "Pair per_sample_FASTQ fwd/rev files with prep samples via run_prefix"
+
+        fwds = sorted(files['raw_forward_seqs'], key=lambda x: x['filepath'])
+        revs = []
+        if 'raw_reverse_seqs' in files:
+            revs = sorted(
+                files['raw_reverse_seqs'], key=lambda x: x['filepath'])
+            if len(fwds) != len(revs):
+                raise ValueError(f'The fwd ({len(fwds)}) and rev ({len(revs)})'
+                                 ' files should be of the same length')
+
+        run_prefixes = prep_info['run_prefix'].to_dict()
+
+        # make pairings
+        sample_names = dict()
+        used_prefixes = []
+        for i, (fwd, rev) in enumerate(zip_longest(fwds, revs)):
+            fwd_fn = basename(fwd['filepath'])
+            file_smaller_than_min = fwd['size'] < BLANK_FILE_THRESHOLD
+
+            # iterate over run prefixes and make sure only one matches
+            run_prefix = None
+            sample_name = None
+            for sn, rp in run_prefixes.items():
+                if fwd_fn.startswith(rp) and run_prefix is None:
+                    run_prefix = rp
+                    sample_name = sn
+                elif fwd_fn.startswith(rp) and run_prefix is not None:
+                    raise ValueError(
+                        f'Multiple run prefixes match this fwd read: {fwd_fn}')
+
+            if run_prefix is None:
+                raise ValueError(
+                    f'No run prefix matching this fwd read: {fwd_fn}')
+            if run_prefix in used_prefixes:
+                raise ValueError(
+                    f'Run prefix matches multiple fwd reads: {run_prefix}')
+            used_prefixes.append(run_prefix)
+
+            if rev is not None:
+                # if we have reverse reads, make sure the matching pair also
+                # matches the run prefix:
+                rev_fn = basename(rev['filepath'])
+                if not file_smaller_than_min:
+                    file_smaller_than_min = rev['size'] < BLANK_FILE_THRESHOLD
+                if not rev_fn.startswith(run_prefix):
+                    raise ValueError(
+                        'Reverse read does not match run prefix. run_prefix: '
+                        f'{run_prefix}; files: {fwd_fn} / {rev_fn}')
+
+            used_prefixes.append(run_prefix)
+
+            if ignore_small_files and file_smaller_than_min:
+                continue
+
+            sample_names[sample_name] = (fwd, rev)
+
+        prep_info = prep_info.filter(items=sample_names.keys(), axis=0)
+
+        return sample_names, prep_info
diff --git a/qiita_client/tests/test_qiita_client.py b/qiita_client/tests/test_qiita_client.py
@@ -11,6 +11,7 @@
 from os.path import basename, exists
 from tempfile import mkstemp
 from json import dumps
+import pandas as pd
 
 from qiita_client.qiita_client import (QiitaClient, _format_payload,
                                        ArtifactInfo)
@@ -133,11 +134,17 @@ def test_get(self):
         # Files contain the full path, which it is hard to test, so get only
         # the basename of the files
         obs_files = obs.pop('files')
-        for k in obs_files:
-            obs_files[k] = [basename(v) for v in obs_files[k]]
+        obs_files = {
+            k: [{'filepath': basename(vv['filepath']),
+                 'size': vv['size']} for vv in v]
+            for k, v in obs_files.items()}
         exp_files = {
-            'raw_barcodes': ['1_s_G1_L001_sequences_barcodes.fastq.gz'],
-            'raw_forward_seqs': ['1_s_G1_L001_sequences.fastq.gz']}
+            'raw_barcodes': [
+                {'filepath': '1_s_G1_L001_sequences_barcodes.fastq.gz',
+                 'size': 58}],
+            'raw_forward_seqs': [
+                {'filepath': '1_s_G1_L001_sequences.fastq.gz',
+                 'size': 58}]}
 
         self.assertEqual(obs, exp)
         self.assertEqual(obs_files, exp_files)
@@ -204,6 +211,7 @@ def test_get_job_info(self):
         job_id = "3c9991ab-6c14-4368-a48c-841e8837a79c"
         obs = self.tester.get_job_info(job_id)
         exp = {'command': 'Pick closed-reference OTUs',
+               'msg': '',
                'status': 'success',
                'parameters': {'input_data': 2,
                               'reference': 1,
@@ -250,6 +258,86 @@ def test_complete_job(self):
         obs = self.tester.complete_job(job_id, True, artifacts_info=ainfo)
         self.assertIsNone(obs)
 
+    def test_artifact_and_preparation_files(self):
+
+        # check success
+        fobs, prep_info = self.tester.artifact_and_preparation_files(1)
+        # just leaving filenames as the folders are dynamic and a pain to test
+        fobs = {k: [basename(vv) for vv in v] for k, v in fobs.items()}
+        fexp = {'raw_forward_seqs': ['1_s_G1_L001_sequences.fastq.gz'],
+                'raw_barcodes': ['1_s_G1_L001_sequences_barcodes.fastq.gz']}
+        self.assertEqual(fobs, fexp)
+        self.assertEqual(prep_info.shape, (27, 22))
+
+        # check failure
+        with self.assertRaisesRegex(RuntimeError, 'Artifact 8 is an analysis '
+                                    'artifact, this method is meant to work '
+                                    'with artifacts linked to a preparation.'):
+            self.tester.artifact_and_preparation_files(8)
+
+        # test _process_files_per_sample_fastq
+        # both fwd/rev
+        files = {
+            'raw_forward_seqs': [
+                {'filepath': '/X/file_3_R1.fastq.gz', 'size': 101},
+                {'filepath': '/X/file_1_R1.fastq.gz', 'size': 99},
+                {'filepath': '/X/file_2_R1.fastq.gz', 'size': 101}],
+            'raw_reverse_seqs': [
+                {'filepath': '/X/file_2_R2.fastq.gz', 'size': 101},
+                {'filepath': '/X/file_1_R2.fastq.gz', 'size': 101},
+                {'filepath': '/X/file_3_R2.fastq.gz', 'size': 101}]}
+        prep_info = pd.DataFrame.from_dict({
+            'run_prefix': {"sample.1": 'file_1',
+                           "sample.2": 'file_2',
+                           "sample.3": 'file_3'}}, dtype=str)
+        prep_info.index.name = 'sample_name'
+        fobs, piobs = self.tester._process_files_per_sample_fastq(
+            files, prep_info, False)
+        fexp = {
+            'sample.1': ({'filepath': '/X/file_1_R1.fastq.gz', 'size': 99},
+                         {'filepath': '/X/file_1_R2.fastq.gz', 'size': 101}),
+            'sample.2': ({'filepath': '/X/file_2_R1.fastq.gz', 'size': 101},
+                         {'filepath': '/X/file_2_R2.fastq.gz', 'size': 101}),
+            'sample.3': ({'filepath': '/X/file_3_R1.fastq.gz', 'size': 101},
+                         {'filepath': '/X/file_3_R2.fastq.gz', 'size': 101})}
+        self.assertEqual(fobs, fexp)
+        self.assertEqual(piobs.shape, (3, 1))
+
+        fobs, piobs = self.tester._process_files_per_sample_fastq(
+            files, prep_info, True)
+        del fexp['sample.1']
+        self.assertEqual(fobs, fexp)
+        self.assertEqual(piobs.shape, (2, 1))
+
+        # just fwd
+        files = {
+            'raw_forward_seqs': [
+                {'filepath': '/X/file_3_R1.fastq.gz', 'size': 101},
+                {'filepath': '/X/file_1_R1.fastq.gz', 'size': 99},
+                {'filepath': '/X/file_2_R1.fastq.gz', 'size': 101}]}
+        prep_info = pd.DataFrame.from_dict({
+            'run_prefix': {"sample.1": 'file_1',
+                           "sample.2": 'file_2',
+                           "sample.3": 'file_3'}}, dtype=str)
+        prep_info.index.name = 'sample_name'
+        fobs, piobs = self.tester._process_files_per_sample_fastq(
+            files, prep_info, False)
+        fexp = {
+            'sample.1': ({'filepath': '/X/file_1_R1.fastq.gz', 'size': 99},
+                         None),
+            'sample.2': ({'filepath': '/X/file_2_R1.fastq.gz', 'size': 101},
+                         None),
+            'sample.3': ({'filepath': '/X/file_3_R1.fastq.gz', 'size': 101},
+                         None)}
+        self.assertEqual(fobs, fexp)
+        self.assertEqual(piobs.shape, (3, 1))
+
+        fobs, piobs = self.tester._process_files_per_sample_fastq(
+            files, prep_info, True)
+        del fexp['sample.1']
+        self.assertEqual(fobs, fexp)
+        self.assertEqual(piobs.shape, (2, 1))
+
 
 if __name__ == '__main__':
     main()