From a9d0979dc811e67baab11c5621e6a70623f3cba5 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 4 Jun 2021 10:10:30 -0600 Subject: [PATCH 1/4] add script --- scripts/qiita-load-qebil-downloads | 211 +++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100755 scripts/qiita-load-qebil-downloads diff --git a/scripts/qiita-load-qebil-downloads b/scripts/qiita-load-qebil-downloads new file mode 100755 index 000000000..d42e8615c --- /dev/null +++ b/scripts/qiita-load-qebil-downloads @@ -0,0 +1,211 @@ +#!/usr/bin/env python +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ----------------------------------------------------------------------------- + +from time import sleep +from glob import glob +from os.path import isdir, basename, join +from shutil import copyfile + +from qiita_db.study import Study +from qiita_db.artifact import Artifact +from qiita_db.commands import ( + load_study_from_cmd, load_sample_template_from_cmd, + load_prep_template_from_cmd) +from qiita_db.util import get_data_types, get_mountpoint + + +SLEEP_TIME = 10 +EBIDIR = '/panfs/panfs1.ucsd.edu/panscratch/qiita/qebil/vertebrates/' +data_types = set([x.replace(' ', '_') for x in get_data_types()]) + +for folder in glob(f'{EBIDIR}/*'): + warnings = [] + extra_notes = dict() + if not isdir(folder): + print(f'Ignoring: {folder}') + continue + # note necessry but nice for debugging + print(f'\n\n\n+++> Processing {folder}, you have {SLEEP_TIME} ' + 'seconds to ctrl-c') + sleep(10) + + files = glob(f'{folder}/*') + files_used = [] + qebil_status_fp = [f for f in files if f.endswith('qebil_status')][0] + with open(qebil_status_fp, 'r') as fp: + qebil_status = fp.readlines()[0] + if 'complete' not in qebil_status: + print(f'Skipping {qebil_status_fp}, not ready: {qebil_status}') + continue + files_used.append(qebil_status_fp) + + title_fp = [f for f in files if f.endswith('_study_title.txt')][0] + files_used.append(title_fp) + config_fp = [f for f in files if f.endswith('_study_config.txt')][0] + files_used.append(config_fp) + sample_fp = [f for f in files if f.endswith('_sample_info.tsv')][0] + files_used.append(sample_fp) + + with open(title_fp, 'r') as fp: + title = fp.readlines()[0] + + if Study.exists(title): + print(f'======> {folder}: {title} already loaded') + continue + + with open(config_fp, 'r') as fp: + study = load_study_from_cmd('qiita.help@gmail.com', title, fp) + + study.autoloaded = True + study.ebi_study_accession = study.info['study_alias'].split(';')[0] + st = study.sample_template + sample_info = load_sample_template_from_cmd(sample_fp, study.id) + st.ebi_sample_accessions = st.get_category('secondary_sample_accession') + st.biosample_accessions = st.get_category('sample_accession') + + preps = dict() + for f in files: + if '_prep_info_' not in f: + if (f.endswith('.log') or f.endswith('.EBI_metadata.tsv') or + f.endswith('.QIIME_mapping_file.tsv')): + files_used.append(f) + continue + + if 'MISSING' in f or 'TOOMANYREADS' in f: + warnings.append(f'Skipping: {f}') + if 'MISSING' in f and 'MISSING' not in extra_notes: + extra_notes['MISSING'] = ( + 'One or more of the fastq files for your study were ' + 'unavailable for download from EBI/ENA or the downloaded ' + 'files were found to contain corrupt data and were ' + 'excluded from our automatic association and processing. ' + 'A list of the affected samples and their corresponding ' + 'EBI/ENA ftp links can be found in the .MISSING ' + 'preparation information files in the Uploads section of ' + 'this page. If you would like to attempt to manually ' + 'download and/or correct the fastq files, please visit ' + 'the linked EBI/ENA project page in the Study details and ' + 'follow our instructions for manually associating and processing the ' + 'files.') + elif 'TOOMANYREADS' not in extra_notes: + extra_notes['TOOMANYREADS'] = ( + 'One or more of the fastq files for your study were found ' + 'to contain more read files than indicated by the single ' + 'or paired-end read technology that EBI/ENA indicated was ' + 'used for processing the sample. This is most likely the ' + 'case for studies where index reads have been included in ' + 'a separate file as part of the upload, however our ' + 'automated system is unable to readily distinguish this. ' + 'A list of the affected samples and their corresponding ' + 'EBI/ENA ftp links can be found in the .TOOMANYREADS ' + 'preparation information files in the Uploads section of ' + 'this page. If you would like to attempt to have these ' + 'samples processed, please visit the linked EBI/ENA ' + 'project page in the Study details and either a) follow ' + 'our instructions for manually associating and processing the ' + 'files. or b) email Qiita Help to indicate that the ' + 'study should be processed with the assumption that the ' + 'first file associated with a samples is an index read ' + 'file.') + continue + added = False + for dt in data_types: + if f'{dt}' in f: + if dt not in preps: + preps[dt] = [] + preps[dt].append(f) + added = True + files_used.append(f) + break + if not added: + warnings.append(f'Not supported: {f}') + + if not preps: + warnings.append('No valid preparations found') + + for dt, ptfps in preps.items(): + dt = dt.replace('_', ' ') + print(f'==> Processing {dt}') + for ptfp in ptfps: + print(f' {ptfp}') + files_used.append(ptfp) + pt = load_prep_template_from_cmd(ptfp, study.id, dt) + pt.ebi_experiment_accessions = pt.get_category( + 'experiment_accession') + pt.ebi_run_accessions = pt.get_category('run_accession') + + library_layout = set(pt.get_category('library_layout').values()) + + run_prefixes = pt.get_category('run_prefix').values() + + if len(run_prefixes) != len(set(run_prefixes)): + warnings.append( + f'Run prefixes are not unique; prep-id: {pt.id}') + continue + + filepaths = [] + for rp in run_prefixes: + matches = sorted([f for f in files if rp in f]) + if library_layout == {'PAIRED'}: + if len(matches) != 2: + warnings.append(f"{pt.id}: {rp} doesn't match PAIRED " + "library layout") + continue + filepaths.append((matches[0], 1)) + filepaths.append((matches[1], 2)) + elif library_layout == {'SINGLE'}: + if len(matches) != 1: + warnings.append(f"{pt.id}: {rp} doesn't match SINGLE " + "library layout") + continue + filepaths.append((matches[0], 1)) + else: + warnings.append('Unknown library layout: ' + f'{library_layout}; prep-id: {pt.id}') + files_used.extend([x for x, _ in filepaths]) + + lfp = len(filepaths) + lrp = len(run_prefixes) + if library_layout == {'PAIRED'} and lfp != lrp*2: + warnings.append('Not a valid number of files/run_prefixes ' + f'({lfp}/{lrp}) for "PAIRED"; prep-id: ' + f'{pt.id}') + continue + elif library_layout == {'SINGLE'} and lfp != lrp: + warnings.append('Not a valid number of files/run_prefixes ' + f'({lfp}/{lrp}) for "SINGLE"; prep-id: ' + f'{pt.id}') + continue + + artifact = Artifact.create(filepaths, 'per_sample_FASTQ', + prep_template=pt, move_files=False) + notes = '' + if warnings: + notes = 'Warnings:
    %s
\n' % ''.join( + [f'
  • {x}
  • ' for x in warnings]) + missing_files = [x for x in set(files) - set(files_used)] + if missing_files: + uploads_fp = join(get_mountpoint("uploads")[0][1], str(study.id)) + notes = f'{notes}Extra files:' + if extra_notes: + notes = f'{notes}Extra Notes:' % ''.join( + [f'
  • {x}
  • ' for x in extra_notes.values()]) + + if notes: + study.notes = notes From 5e92eedd36768e0bf44496df0863a7eee4f107f6 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 4 Jun 2021 12:00:11 -0600 Subject: [PATCH 2/4] addressing @wasade comments --- scripts/qiita-load-qebil-downloads | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/qiita-load-qebil-downloads b/scripts/qiita-load-qebil-downloads index d42e8615c..3c9a67bd9 100755 --- a/scripts/qiita-load-qebil-downloads +++ b/scripts/qiita-load-qebil-downloads @@ -30,9 +30,10 @@ for folder in glob(f'{EBIDIR}/*'): if not isdir(folder): print(f'Ignoring: {folder}') continue - # note necessry but nice for debugging print(f'\n\n\n+++> Processing {folder}, you have {SLEEP_TIME} ' 'seconds to ctrl-c') + # Note: this sleep is not necessary but nice for debugging so we have time + # to ctrl-c sleep(10) files = glob(f'{folder}/*') @@ -68,6 +69,8 @@ for folder in glob(f'{EBIDIR}/*'): sample_info = load_sample_template_from_cmd(sample_fp, study.id) st.ebi_sample_accessions = st.get_category('secondary_sample_accession') st.biosample_accessions = st.get_category('sample_accession') + # ToDo: in the future we should check that these accessions do not + # exist in the system - we need to decide what to do with these. preps = dict() for f in files: @@ -157,7 +160,8 @@ for folder in glob(f'{EBIDIR}/*'): filepaths = [] for rp in run_prefixes: - matches = sorted([f for f in files if rp in f]) + matches = sorted([f for f in files + if basename(f).startswith(rp)]) if library_layout == {'PAIRED'}: if len(matches) != 2: warnings.append(f"{pt.id}: {rp} doesn't match PAIRED " From 03b26041c3974a90e51a7974a917af0ba558c09c Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Thu, 17 Jun 2021 14:55:33 -0600 Subject: [PATCH 3/4] minor changes --- qiita_pet/handlers/base_handlers.py | 6 +++++- qiita_ware/private_plugin.py | 3 --- scripts/qiita-load-qebil-downloads | 26 +++++++++++++++++++------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/qiita_pet/handlers/base_handlers.py b/qiita_pet/handlers/base_handlers.py index 2890ad157..4639ab15d 100644 --- a/qiita_pet/handlers/base_handlers.py +++ b/qiita_pet/handlers/base_handlers.py @@ -56,7 +56,11 @@ def write_error(self, status_code, **kwargs): request_info = ''.join(["%s: %s\n" % (k, req_dict[k]) for k in req_dict.keys() if k != 'files']) - error = str(exc_info[1]).split(':', 1)[1] + error = str(exc_info[1]).split(':', 1) + if len(error) > 1: + error = error[1] + else: + error = error[0] # render error page self.render('error.html', status_code=status_code, is_admin=is_admin, diff --git a/qiita_ware/private_plugin.py b/qiita_ware/private_plugin.py index 0b74625e6..c3e9aeb6a 100644 --- a/qiita_ware/private_plugin.py +++ b/qiita_ware/private_plugin.py @@ -402,9 +402,6 @@ def list_remote_files(job): job._set_error(traceback.format_exception(*exc_info())) else: job._set_status('success') - finally: - # making sure to always delete the key so Qiita never keeps it - remove(private_key) def download_remote_files(job): diff --git a/scripts/qiita-load-qebil-downloads b/scripts/qiita-load-qebil-downloads index 3c9a67bd9..d43fb3623 100755 --- a/scripts/qiita-load-qebil-downloads +++ b/scripts/qiita-load-qebil-downloads @@ -12,19 +12,24 @@ from glob import glob from os.path import isdir, basename, join from shutil import copyfile +from qiita_db.user import User from qiita_db.study import Study from qiita_db.artifact import Artifact from qiita_db.commands import ( load_study_from_cmd, load_sample_template_from_cmd, load_prep_template_from_cmd) -from qiita_db.util import get_data_types, get_mountpoint +from qiita_db.util import get_data_types, get_mountpoint, create_nested_path SLEEP_TIME = 10 EBIDIR = '/panfs/panfs1.ucsd.edu/panscratch/qiita/qebil/vertebrates/' data_types = set([x.replace(' ', '_') for x in get_data_types()]) -for folder in glob(f'{EBIDIR}/*'): +folders = glob(f'{EBIDIR}/*') +shared_with_emails = ['sjsong@eng.ucsd.edu'] +shared_with = [User(x) for x in shared_with_emails] + +for folder in folders: warnings = [] extra_notes = dict() if not isdir(folder): @@ -65,10 +70,11 @@ for folder in glob(f'{EBIDIR}/*'): study.autoloaded = True study.ebi_study_accession = study.info['study_alias'].split(';')[0] - st = study.sample_template sample_info = load_sample_template_from_cmd(sample_fp, study.id) - st.ebi_sample_accessions = st.get_category('secondary_sample_accession') - st.biosample_accessions = st.get_category('sample_accession') + sample_info.ebi_sample_accessions = sample_info.get_category( + 'secondary_sample_accession') + sample_info.biosample_accessions = sample_info.get_category( + 'sample_accession') # ToDo: in the future we should check that these accessions do not # exist in the system - we need to decide what to do with these. @@ -202,10 +208,13 @@ for folder in glob(f'{EBIDIR}/*'): missing_files = [x for x in set(files) - set(files_used)] if missing_files: uploads_fp = join(get_mountpoint("uploads")[0][1], str(study.id)) + create_nested_path(uploads_fp) notes = f'{notes}Extra files:
      ' + for mf in missing_files: - copyfile(mf, uploads_fp) - notes = f'{notes}
    • %s
    • ' % basename(mf) + bn = basename(mf) + copyfile(mf, join(uploads_fp, bn)) + notes = f'{notes}
    • {bn}
    • ' notes = f'{notes}
    ' if extra_notes: notes = f'{notes}Extra Notes:
      %s
    ' % ''.join( @@ -213,3 +222,6 @@ for folder in glob(f'{EBIDIR}/*'): if notes: study.notes = notes + + for x in shared_with: + study.share(x) From 87fecb4d41b3274abc12f988ace067173b310c3e Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 13 Aug 2021 06:47:19 -0600 Subject: [PATCH 4/4] making a function --- scripts/qiita-load-qebil-downloads | 55 +++++++++++++++++++----------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/scripts/qiita-load-qebil-downloads b/scripts/qiita-load-qebil-downloads index d43fb3623..d5a13d1ab 100755 --- a/scripts/qiita-load-qebil-downloads +++ b/scripts/qiita-load-qebil-downloads @@ -12,7 +12,6 @@ from glob import glob from os.path import isdir, basename, join from shutil import copyfile -from qiita_db.user import User from qiita_db.study import Study from qiita_db.artifact import Artifact from qiita_db.commands import ( @@ -21,25 +20,11 @@ from qiita_db.commands import ( from qiita_db.util import get_data_types, get_mountpoint, create_nested_path -SLEEP_TIME = 10 -EBIDIR = '/panfs/panfs1.ucsd.edu/panscratch/qiita/qebil/vertebrates/' -data_types = set([x.replace(' ', '_') for x in get_data_types()]) +def load_qebil_study(folder, shared_with): + data_types = set([x.replace(' ', '_') for x in get_data_types()]) -folders = glob(f'{EBIDIR}/*') -shared_with_emails = ['sjsong@eng.ucsd.edu'] -shared_with = [User(x) for x in shared_with_emails] - -for folder in folders: warnings = [] extra_notes = dict() - if not isdir(folder): - print(f'Ignoring: {folder}') - continue - print(f'\n\n\n+++> Processing {folder}, you have {SLEEP_TIME} ' - 'seconds to ctrl-c') - # Note: this sleep is not necessary but nice for debugging so we have time - # to ctrl-c - sleep(10) files = glob(f'{folder}/*') files_used = [] @@ -48,7 +33,7 @@ for folder in folders: qebil_status = fp.readlines()[0] if 'complete' not in qebil_status: print(f'Skipping {qebil_status_fp}, not ready: {qebil_status}') - continue + return files_used.append(qebil_status_fp) title_fp = [f for f in files if f.endswith('_study_title.txt')][0] @@ -63,11 +48,18 @@ for folder in folders: if Study.exists(title): print(f'======> {folder}: {title} already loaded') - continue + return with open(config_fp, 'r') as fp: study = load_study_from_cmd('qiita.help@gmail.com', title, fp) + print('===================') + print('===================') + print('===================') + print(f'study {study.id} created') + print('===================') + print('===================') + study.autoloaded = True study.ebi_study_accession = study.info['study_alias'].split(';')[0] sample_info = load_sample_template_from_cmd(sample_fp, study.id) @@ -201,6 +193,11 @@ for folder in folders: artifact = Artifact.create(filepaths, 'per_sample_FASTQ', prep_template=pt, move_files=False) + print(" ") + print(" ") + print(f" artifact {artifact.id} was created for {pt.id}") + print(" ") + print(" ") notes = '' if warnings: notes = 'Warnings:
      %s
    \n' % ''.join( @@ -225,3 +222,23 @@ for folder in folders: for x in shared_with: study.share(x) + + +# data is a list [str, [list of Users]] +data = [ + # ["folder filepath", [list of Users to add as shared_with]] +] + +for folder, shared_with in data: + SLEEP_TIME = 10 + + if not isdir(folder): + print(f'Ignoring: {folder}') + continue + print(f'\n\n\n+++> Processing {folder}, you have {SLEEP_TIME} ' + 'seconds to ctrl-c') + # Note: this sleep is not necessary but nice for debugging so we have time + # to ctrl-c + sleep(10) + + load_qebil_study(folder, shared_with)