Skip to content

Commit 690362a

Browse files
Merge pull request #48 from antgonza/add-artifact-prep-method
add artifact_and_preparation_files method
2 parents 50adab5 + b558499 commit 690362a

File tree

3 files changed

+204
-5
lines changed

3 files changed

+204
-5
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Qiita Client
22
============
33

4-
[![Build Status](https://travis-ci.org/qiita-spots/qiita_client.png?branch=master)](https://travis-ci.org/qiita-spots/qiita_client)
4+
[![Build Status](https://github.com/qiita-spots/qiita_client/actions/workflows/qiita-ci.yml/badge.svg)](https://github.com/qiita-spots/qiita_client/actions/workflows/qiita-ci.yml)
55

66
Qiita (canonically pronounced *cheetah*) is an analysis environment for microbiome (and other "comparative -omics") datasets.
77

qiita_client/qiita_client.py

+111
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,11 @@
99
import time
1010
import requests
1111
import threading
12+
import pandas as pd
1213
from json import dumps
1314
from random import randint
15+
from itertools import zip_longest
16+
from os.path import basename
1417

1518
from .exceptions import (QiitaClientError, NotFoundError, BadRequestError,
1619
ForbiddenError)
@@ -23,6 +26,7 @@
2326
MAX_RETRIES = 3
2427
MIN_TIME_SLEEP = 180
2528
MAX_TIME_SLEEP = 360
29+
BLANK_FILE_THRESHOLD = 100
2630

2731

2832
class ArtifactInfo(object):
@@ -556,3 +560,110 @@ def complete_job(self, job_id, success, error_msg=None,
556560
artifacts_info=artifacts_info))
557561
# Create the URL where we have to post the results
558562
self.post("/qiita_db/jobs/%s/complete/" % job_id, data=json_payload)
563+
564+
def artifact_and_preparation_files(self, artifact_id,
                                   ignore_small_files=True):
    """Gets the artifact and preparation files from a given artifact_id

    Parameters
    ----------
    artifact_id : int
        The artifact id
    ignore_small_files : bool
        Whether to ignore small files or retrieve all of them (only applies
        to per_sample_FASTQ artifacts)

    Returns
    -------
    dict
        files available in the artifact. For per_sample_FASTQ artifacts
        this is the mapping built by `_process_files_per_sample_fastq`;
        for any other artifact type it is {file type: [filepath, ...]}
    pandas.DataFrame
        the prep information file for that artifact, read with every
        column as str. For per_sample_FASTQ artifacts it is the frame
        returned by `_process_files_per_sample_fastq`

    Raises
    ------
    RuntimeError
        - If the artifact belongs to an analysis
        - If the artifact is not linked to any preparation
    """
    artifact_info = self.get("/qiita_db/artifacts/%s/" % artifact_id)

    if artifact_info['analysis'] is not None:
        raise RuntimeError(
            f'Artifact {artifact_id} is an analysis artifact, this method '
            'is meant to work with artifacts linked to a preparation.')

    # Fail with an explicit message instead of a bare IndexError/TypeError
    # when there is no preparation to read
    prep_ids = artifact_info['prep_information']
    if not prep_ids:
        raise RuntimeError(
            f'Artifact {artifact_id} is not linked to any preparation.')

    # only the first linked preparation is used
    prep_info = self.get('/qiita_db/prep_template/%s/' % prep_ids[0])
    # NOTE(review): no index column is set here, so the helper below sees
    # the default positional index rather than sample names - confirm this
    # is what callers expect for per_sample_FASTQ artifacts
    prep_info = pd.read_csv(prep_info['prep-file'], sep='\t', dtype=str)

    if artifact_info['type'] == 'per_sample_FASTQ':
        files, prep_info = self._process_files_per_sample_fastq(
            artifact_info['files'], prep_info, ignore_small_files)
    else:
        # keep only the path of each file, dropping the other file details
        files = {k: [vv['filepath'] for vv in v]
                 for k, v in artifact_info['files'].items()}

    return files, prep_info
607+
608+
def _process_files_per_sample_fastq(self, files, prep_info,
                                    ignore_small_files):
    """Pairs per_sample_FASTQ files with the rows of their preparation

    Forward (and, if present, reverse) files are sorted by filepath and
    paired by position. Each pair is then assigned to the single row of
    `prep_info` whose `run_prefix` is a prefix of the forward filename.

    Parameters
    ----------
    files : dict
        Must contain 'raw_forward_seqs' and may contain 'raw_reverse_seqs';
        each is a list of dicts with (at least) 'filepath' and 'size'
    prep_info : pandas.DataFrame
        The prep information; must have a 'run_prefix' column. Its index
        values are used as the keys of the returned files dict
    ignore_small_files : bool
        If True, pairs where the forward or the reverse file is smaller
        than BLANK_FILE_THRESHOLD are left out of the results

    Returns
    -------
    dict
        {prep_info index value: (fwd file dict, rev file dict or None)}
    pandas.DataFrame
        `prep_info` restricted to the rows that are keys of the dict

    Raises
    ------
    ValueError
        - If the number of forward and reverse files differ
        - If a forward file matches none or more than one run prefix
        - If a run prefix matches more than one forward file
        - If a reverse file does not start with the run prefix of its
          forward file

    Notes
    -----
    Rows whose `run_prefix` is missing (NaN) or empty are ignored when
    matching: a NaN cannot be passed to `str.startswith` and an empty
    prefix would match every file.
    """
    fwds = sorted(files['raw_forward_seqs'], key=lambda x: x['filepath'])
    revs = []
    if 'raw_reverse_seqs' in files:
        revs = sorted(
            files['raw_reverse_seqs'], key=lambda x: x['filepath'])
        if len(fwds) != len(revs):
            raise ValueError(f'The fwd ({len(fwds)}) and rev ({len(revs)})'
                             ' files should be of the same length')

    # keep only the prefixes that can actually be matched (see Notes)
    run_prefixes = {
        sn: rp for sn, rp in prep_info['run_prefix'].to_dict().items()
        if isinstance(rp, str) and rp}

    # make pairings
    sample_names = dict()
    used_prefixes = set()
    # when there are no reverse reads, zip_longest pads rev with None
    for fwd, rev in zip_longest(fwds, revs):
        fwd_fn = basename(fwd['filepath'])
        file_smaller_than_min = fwd['size'] < BLANK_FILE_THRESHOLD

        # exactly one run prefix must match the forward read
        matches = [(sn, rp) for sn, rp in run_prefixes.items()
                   if fwd_fn.startswith(rp)]
        if len(matches) > 1:
            raise ValueError(
                f'Multiple run prefixes match this fwd read: {fwd_fn}')
        if not matches:
            raise ValueError(
                f'No run prefix matching this fwd read: {fwd_fn}')
        sample_name, run_prefix = matches[0]

        # and each run prefix can only be claimed by one forward read
        if run_prefix in used_prefixes:
            raise ValueError(
                f'Run prefix matches multiple fwd reads: {run_prefix}')
        used_prefixes.add(run_prefix)

        if rev is not None:
            # if we have reverse reads, make sure the matching pair also
            # matches the run prefix:
            rev_fn = basename(rev['filepath'])
            if not file_smaller_than_min:
                file_smaller_than_min = rev['size'] < BLANK_FILE_THRESHOLD
            if not rev_fn.startswith(run_prefix):
                raise ValueError(
                    'Reverse read does not match run prefix. run_prefix: '
                    f'{run_prefix}; files: {fwd_fn} / {rev_fn}')

        # small files are validated above but not reported
        if ignore_small_files and file_smaller_than_min:
            continue

        sample_names[sample_name] = (fwd, rev)

    prep_info = prep_info.filter(items=list(sample_names), axis=0)

    return sample_names, prep_info

qiita_client/tests/test_qiita_client.py

+92-4
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from os.path import basename, exists
1212
from tempfile import mkstemp
1313
from json import dumps
14+
import pandas as pd
1415

1516
from qiita_client.qiita_client import (QiitaClient, _format_payload,
1617
ArtifactInfo)
@@ -133,11 +134,17 @@ def test_get(self):
133134
# Files contain the full path, which it is hard to test, so get only
134135
# the basename of the files
135136
obs_files = obs.pop('files')
136-
for k in obs_files:
137-
obs_files[k] = [basename(v) for v in obs_files[k]]
137+
obs_files = {
138+
k: [{'filepath': basename(vv['filepath']),
139+
'size': vv['size']} for vv in v]
140+
for k, v in obs_files.items()}
138141
exp_files = {
139-
'raw_barcodes': ['1_s_G1_L001_sequences_barcodes.fastq.gz'],
140-
'raw_forward_seqs': ['1_s_G1_L001_sequences.fastq.gz']}
142+
'raw_barcodes': [
143+
{'filepath': '1_s_G1_L001_sequences_barcodes.fastq.gz',
144+
'size': 58}],
145+
'raw_forward_seqs': [
146+
{'filepath': '1_s_G1_L001_sequences.fastq.gz',
147+
'size': 58}]}
141148

142149
self.assertEqual(obs, exp)
143150
self.assertEqual(obs_files, exp_files)
@@ -204,6 +211,7 @@ def test_get_job_info(self):
204211
job_id = "3c9991ab-6c14-4368-a48c-841e8837a79c"
205212
obs = self.tester.get_job_info(job_id)
206213
exp = {'command': 'Pick closed-reference OTUs',
214+
'msg': '',
207215
'status': 'success',
208216
'parameters': {'input_data': 2,
209217
'reference': 1,
@@ -250,6 +258,86 @@ def test_complete_job(self):
250258
obs = self.tester.complete_job(job_id, True, artifacts_info=ainfo)
251259
self.assertIsNone(obs)
252260

261+
def test_artifact_and_preparation_files(self):
    """Checks artifact_and_preparation_files and its per-sample helper"""
    client = self.tester

    # check success
    obs_files, obs_prep = client.artifact_and_preparation_files(1)
    # the folders are dynamic, so only the file names are compared
    obs_files = {ftype: [basename(fp) for fp in fps]
                 for ftype, fps in obs_files.items()}
    exp_files = {
        'raw_forward_seqs': ['1_s_G1_L001_sequences.fastq.gz'],
        'raw_barcodes': ['1_s_G1_L001_sequences_barcodes.fastq.gz']}
    self.assertEqual(obs_files, exp_files)
    self.assertEqual(obs_prep.shape, (27, 22))

    # check failure
    exp_msg = ('Artifact 8 is an analysis '
               'artifact, this method is meant to work '
               'with artifacts linked to a preparation.')
    with self.assertRaisesRegex(RuntimeError, exp_msg):
        client.artifact_and_preparation_files(8)

    # test _process_files_per_sample_fastq
    # file_1's forward read (99) is the only file below the 100 threshold
    fwd_1 = {'filepath': '/X/file_1_R1.fastq.gz', 'size': 99}
    fwd_2 = {'filepath': '/X/file_2_R1.fastq.gz', 'size': 101}
    fwd_3 = {'filepath': '/X/file_3_R1.fastq.gz', 'size': 101}
    rev_1 = {'filepath': '/X/file_1_R2.fastq.gz', 'size': 101}
    rev_2 = {'filepath': '/X/file_2_R2.fastq.gz', 'size': 101}
    rev_3 = {'filepath': '/X/file_3_R2.fastq.gz', 'size': 101}

    def make_prep():
        # three samples, one run prefix each, indexed by sample name
        prep = pd.DataFrame.from_dict({
            'run_prefix': {"sample.1": 'file_1',
                           "sample.2": 'file_2',
                           "sample.3": 'file_3'}}, dtype=str)
        prep.index.name = 'sample_name'
        return prep

    # both fwd/rev; the inputs are deliberately given out of order
    files = {'raw_forward_seqs': [fwd_3, fwd_1, fwd_2],
             'raw_reverse_seqs': [rev_2, rev_1, rev_3]}
    prep_info = make_prep()

    obs_files, obs_prep = client._process_files_per_sample_fastq(
        files, prep_info, False)
    exp_files = {'sample.1': (fwd_1, rev_1),
                 'sample.2': (fwd_2, rev_2),
                 'sample.3': (fwd_3, rev_3)}
    self.assertEqual(obs_files, exp_files)
    self.assertEqual(obs_prep.shape, (3, 1))

    # ignoring small files drops sample.1
    obs_files, obs_prep = client._process_files_per_sample_fastq(
        files, prep_info, True)
    exp_files.pop('sample.1')
    self.assertEqual(obs_files, exp_files)
    self.assertEqual(obs_prep.shape, (2, 1))

    # just fwd; the reverse slot of every pair is None
    files = {'raw_forward_seqs': [fwd_3, fwd_1, fwd_2]}
    prep_info = make_prep()

    obs_files, obs_prep = client._process_files_per_sample_fastq(
        files, prep_info, False)
    exp_files = {'sample.1': (fwd_1, None),
                 'sample.2': (fwd_2, None),
                 'sample.3': (fwd_3, None)}
    self.assertEqual(obs_files, exp_files)
    self.assertEqual(obs_prep.shape, (3, 1))

    obs_files, obs_prep = client._process_files_per_sample_fastq(
        files, prep_info, True)
    exp_files.pop('sample.1')
    self.assertEqual(obs_files, exp_files)
    self.assertEqual(obs_prep.shape, (2, 1))
253341

254342
if __name__ == '__main__':
255343
main()

0 commit comments

Comments
 (0)