From ad95a9d3c613a23982d120287afd99a8d9006edd Mon Sep 17 00:00:00 2001 From: yy1533 Date: Mon, 7 May 2018 10:58:30 -0400 Subject: [PATCH 1/8] :dog: add in-line comment --- celseq2/celseq2.py | 25 ++++-- celseq2/workflow/celseq2_beta.snakemake | 101 +++++++++++++++--------- 2 files changed, 81 insertions(+), 45 deletions(-) diff --git a/celseq2/celseq2.py b/celseq2/celseq2.py index 44331b5..0af8aae 100644 --- a/celseq2/celseq2.py +++ b/celseq2/celseq2.py @@ -29,6 +29,15 @@ * -n * --notemp (--nt) +Name of rules to request outputs: + * all (default) + * TAG_FASTQ + * ANNOTATION + * ALIGNMENT + * COUNT_MATRIX + * QC_COUNT_MATRIX + * CELSEQ2_TO_ST (only available for ST data) + Refs: - https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html - https://bitbucket.org/snakemake/snakemake/src/e11a57fe1f62f3f56c815d95d82871811dae81b3/snakemake/__init__.py?at=master&fileviewer=file-view-default#__init__.py-580:1127 @@ -36,34 +45,38 @@ def get_argument_parser(): - desc = ('CEL-Seq2: A Python Package for Processing CEL-Seq2 RNA-Seq Data.') + desc = ('celseq2: A Python Package for Processing CEL-Seq2 RNA-Seq Data.') parser = argparse.ArgumentParser(description=desc, add_help=True) parser.add_argument( "target", nargs="*", default=None, - help="Targets to build. May be rules or files.") + help="Targets to build. May be rules or files.", + choices=[None, 'all', 'TAG_FASTQ', 'ANNOTATION', 'ALIGNMENT', + 'COUNT_MATRIX', 'QC_COUNT_MATRIX', 'CELSEQ2_TO_ST']) parser.add_argument( "--config-file", metavar="FILE", required=True, - help=("Specify details of CEL-Seq2 and gneome information.")) + help=("Configurations of the details of CEL-Seq2 " + " and running environment.")) parser.add_argument( "--experiment-table", metavar="FILE", required=True, - help=("Space/Tab separated file specifying experiment design.")) + help=("Space/Tab separated file specifying the R1/R2 reads " + "and the experiment design.")) parser.add_argument( "--output-dir", metavar="DIRECTORY", required=True, - help=("All results are saved with here as root directory.")) + help=("All results are saved here as root directory.")) parser.add_argument( "--reverse-stranded", "--rs", action="store_true", default=False, - help="Read has to be mapped to the opposite strand as the feature") + help="Reads have to be mapped to the opposite strand of the feature.") parser.add_argument( "--celseq2-to-st", "--st", diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake index 79dec53..9e4c589 100644 --- a/celseq2/workflow/celseq2_beta.snakemake +++ b/celseq2/workflow/celseq2_beta.snakemake @@ -1,5 +1,3 @@ -###################################################################### -## Dependencies ## from celseq2.helper import join_path, base_name, dir_name, print_logger from celseq2.helper import rmfolder, mkfolder, is_nonempty_file from celseq2.helper import cook_sample_sheet, popen_communicate @@ -18,16 +16,18 @@ import tempfile import shutil import os - -## Inforamtion ## +''' +Part-1 Reading Config +''' +# Inforamtion # '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2' DIR_PROJ = config.get('output_dir', None) -## Sample Sheet ## +# Sample Sheet SAMPLE_TABLE_FPATH = config.get('experiment_table', None) SAMPLE_TABLE = cook_sample_sheet(SAMPLE_TABLE_FPATH) # '' -## CEL-seq2 Tech Setting ## +# CEL-seq2 Tech Setting # '/ifs/data/yanailab/refs/barcodes/barcodes_cel-seq_umis96.tab' BC_INDEX_FPATH = config.get('BC_INDEX_FPATH', None) BC_IDs_DEFAULT = config.get('BC_IDs_DEFAULT', None) # '1-96' @@ -37,7 +37,7 @@ BC_START_POSITION = config.get('BC_START_POSITION', None) UMI_LENGTH = config.get('UMI_LENGTH', None) # 6 BC_LENGTH = config.get('BC_LENGTH', None) # 6 -## Tools ## +# Tools # '/ifs/data/yanailab/refs/danio_rerio/danRer10_87/genome/Danio_rerio.GRCz10.dna.toplevel' BOWTIE2_INDEX_PREFIX = config.get('BOWTIE2_INDEX_PREFIX', None) BOWTIE2 = config.get('BOWTIE2', None) # '/local/apps/bowtie2/2.3.1/bowtie2' @@ -45,7 +45,7 @@ STAR_INDEX_DIR = config.get('STAR_INDEX_DIR', None) STAR = config.get('STAR', None) ALIGNER_EXTRA_PARAMETERS = config.get('ALIGNER_EXTRA_PARAMETERS', '') -## Annotations ## +# Annotations # '/ifs/data/yanailab/refs/danio_rerio/danRer10_87/gtf/Danio_rerio.GRCz10.87.gtf.gz' GFF = config.get('GFF', None) FEATURE_ID = config.get('FEATURE_ID', 'gene_name') @@ -53,35 +53,29 @@ FEATURE_CONTENT = config.get('FEATURE_CONTENT', 'exon') GENE_BIOTYPE = config.get('GENE_BIOTYPE', None) if not GENE_BIOTYPE: GENE_BIOTYPE = (); -## Demultiplexing ## +# Demultiplexing FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10 CUT_LENGTH = config.get('CUT_LENGTH', None) # 35 SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False) # False -## Alignment ## +# Alignment ALIGNER = config.get('ALIGNER', None) # 'bowtie2', 'star' assert (ALIGNER), 'Error: Specify aligner.' assert (ALIGNER in ['bowtie2', 'star']), 'Error: Unknown aligner.' -## UMI Count ## +# UMI Count ALN_QUAL_MIN = config.get('ALN_QUAL_MIN', None) # 0 STRANDED = config.get('stranded', 'yes') - -## Running Parameters ## -# bc_ids_used=config.get('bc_ids_used', None) # '1,2,3,4-5' +# Running Parameters num_threads = config.get('num_threads', 16) # 5 verbose = config.get('verbose', True) # True RUN_CELSEQ2_TO_ST = config.get('run_celseq2_to_st', False) KEEP_INTERMEDIATE = config.get('keep_intermediate', False) -####################### -## Pipeline reserved ## -####################### +# Pipeline reserved variables item_names = list(SAMPLE_TABLE.index) -# expand('{r1_fpath}', r1_fpath=SAMPLE_TABLE['R1'].values) sample_list = SAMPLE_TABLE['SAMPLE_NAME'].values -# expand('{r2_fpath}', r2_fpath=SAMPLE_TABLE['R2'].values) R1 = SAMPLE_TABLE['R1'].values R2 = SAMPLE_TABLE['R2'].values bc_used = SAMPLE_TABLE['CELL_BARCODES_INDEX'].values @@ -120,14 +114,15 @@ SUBDIRS = [SUBDIR_INPUT, # "_low_map_qual", '_multimapped', "_uniquemapped", # "_no_feature", "_ambiguous", # "_total"] -## Dev ## - -##################### -## Snakemake rules ## -##################### +''' +Part-2: Snakemake rules +''' workdir: DIR_PROJ +''' +Default task named "all" to request all outputs. +''' if RUN_CELSEQ2_TO_ST: rule all: input: @@ -166,6 +161,9 @@ else: rmfolder(SUBDIR_UMI_CNT); rmfolder(SUBDIR_UMI_SET); +''' +Subtask named "COUNT_MATRIX" to request the UMIs matrices outputs +''' rule COUNT_MATRIX: input: csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'), @@ -185,6 +183,9 @@ rule COUNT_MATRIX: print_logger('UMI-count matrix is saved at {}'.format(input.csv)) +''' +Subtask named "QC_COUNT_MATRIX" to request the QCs plots of UMIs matrices +''' rule QC_COUNT_MATRIX: input: html = expand(join_path(DIR_PROJ, SUBDIR_QC_EXPR, @@ -192,8 +193,9 @@ rule QC_COUNT_MATRIX: expid=list(set(sample_list))), message: 'Finished QC UMI-count matrix.' - -# join_path(DIR_PROJ, SUBDIR_QC_EXPR, '{expid}', 'QC_ST.html') +''' +Subtask named "CELSEQ2_TO_ST" to request spatial-transcriptomics data format. +''' if RUN_CELSEQ2_TO_ST: rule CELSEQ2_TO_ST: input: @@ -254,7 +256,10 @@ if RUN_CELSEQ2_TO_ST: # touch('_done_report') -# Combo-demultiplexing +# Pipeline Step 1a: combo_demultiplexing +# Inputs: R1 (barcode) and R2 (mRNA) +# Output: A list of FASTQs extracted from R2 per cells and they are tagged with +# cell barcode and UMI sequence gained from R1. rule combo_demultiplexing: input: SAMPLE_TABLE_FPATH, output: @@ -305,13 +310,18 @@ rule combo_demultiplexing: p.join() shell('touch _done_combodemultiplex') + +''' +Subtask named "TAG_FASTQ" to request intermediate files: mRNA.fastq tagged +with cell id and UMI seq gained from barcode.fastq +''' rule TAG_FASTQ: input: expand(join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemID}', 'TAGGED.bigfastq'), itemID=item_names), message: "Finished Tagging FASTQs." - +# Pipeline Step 1b: Merge tagged FASTQs rule tag_fastq: input: fq = dynamic(join_path(DIR_PROJ, SUBDIR_FASTQ, @@ -332,7 +342,7 @@ rule tag_fastq: print_logger('Tagged FQ: {}'.format(itemid_tag_fq)) -## Alignment ## +# Pipeline Step 2a: Alignment rule ALIGNMENT: input: expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', ALIGNER+'.bigsam'), @@ -401,7 +411,7 @@ if ALIGNER == 'star': shell('mv {starsam} {output.sam} ') shell('mv {starlog} {output.log} ') - +# Pipeline Step 2b: Combo-demultiplex the SAM file rule combo_demultiplexing_sam: input: sam = expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, @@ -436,7 +446,11 @@ rule combo_demultiplexing_sam: shell('touch _done_combodemultiplex_sam') -## HT-seq Count UMI ## +# Pipeline Step 3a: cook ht-seq recognized feature object before counting UMIs +# Input: GTF/GFF file +# Output: a pickle file saving a tuple (features, exported_genes) where: +# - features: HTSeq.GenomicArrayOfSets(). *All* exons locations ~ genes. +# - exported_genes: a sorted list. Gene id / gene name (default all genes). rule COOK_ANNOTATION: input: SAMPLE_TABLE_FPATH, @@ -463,7 +477,7 @@ rule COOK_ANNOTATION: if params.gene_type: print_logger('Types of reported gene: {}.'.format( ', '.join(params.gene_type))) - gene_df = get_genes(GFF, valid_biotypes = set(params.gene_type)) + gene_df = get_genes(GFF, valid_biotypes=set(params.gene_type)) exported_genes = sorted(gene_df['name'].values) print_logger('Number of reported genes: {}.'.format( len(exported_genes))) @@ -472,11 +486,17 @@ rule COOK_ANNOTATION: shell('touch {output.anno_csv}') with open(output.anno_pkl, 'wb') as fh: - pickle.dump( (features, exported_genes), fh) + pickle.dump((features, exported_genes), fh) shell('touch {output.flag} ') - +# Pipeline Step 3b: Count UMIs +# Inputs: +# - annotation object +# - SAM per cell +# Outputs: two pickle files per cell +# - umicnt: dict(str: set(str)), i.e., dict(gene ~ set(UMI_sequence)) +# - umiset: Counter(str: int), i.e., Counter(gene ~ "number of UMIs") rule count_umi: input: gff = rules.COOK_ANNOTATION.output.anno_pkl, @@ -499,7 +519,9 @@ rule count_umi: pickle.dump(umi_set, open(output.umiset, 'wb')) -# - regular umi-count using *_umicnt.pkl -> umi_count_matrix replicates/lanes per plate +# Pipeline Step 4a: Merge UMIs of cells to UMI matrix of item +# Input: a list of umicnt pickle files of cells per item +# Output: UMI-count matric file (csv & hdf) per item rule summarize_umi_matrix_per_item: input: gff = rules.COOK_ANNOTATION.output.anno_pkl, @@ -524,6 +546,7 @@ rule summarize_umi_matrix_per_item: item_id = base_name(dir_name(f)) # item-1 item_expr_matrix[item_id][bc_name] = pickle.load(open(f, 'rb')) + # export to csv/hdf for item_id, expr_dict in item_expr_matrix.items(): exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME'] # E1 @@ -537,12 +560,11 @@ rule summarize_umi_matrix_per_item: expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR, exp_id, item_id, 'expr.h5'), 'table') - -# - merge umi-count using *_umiset.pkl -> correct umi count per experiment/plate +# Pipeline Step 4b: Merge UMIs of cells to UMI matrix per experiment +# Input: a list of umiset pickle files of cells per experiment +# Output: UMI-count matric file (csv & hdf) per experiment rule summarize_umi_matrix_per_experiment: input: - # gff = join_path(DIR_PROJ, SUBDIR_ANNO, - # base_name(GFF) + '.pickle'), gff = rules.COOK_ANNOTATION.output.anno_pkl, umiset = dynamic(join_path(DIR_PROJ, SUBDIR_UMI_SET, '{itemID}', '{bcID}.pkl')), @@ -578,6 +600,7 @@ rule summarize_umi_matrix_per_experiment: y2 = umiset_stream.get(x, set()) exp_expr_matrix[exp_id][bc_name][x] = y1 | y2 + # export to csv/hdf for exp_id, expr_dict in exp_expr_matrix.items(): for bc, cnt in expr_dict.items(): cnt = _flatten_umi_set(cnt) From f11f7835e0746fb4deb205f3f0e0f1fcdc48c0d8 Mon Sep 17 00:00:00 2001 From: yy1533 Date: Tue, 8 May 2018 16:26:21 -0400 Subject: [PATCH 2/8] :beers: plotly graph on demultiplexing fastqs --- celseq2/demultiplex.py | 59 ++++++++++++++- celseq2/workflow/celseq2_beta.snakemake | 95 +++++++++++++++++-------- 2 files changed, 123 insertions(+), 31 deletions(-) diff --git a/celseq2/demultiplex.py b/celseq2/demultiplex.py index dbd7538..2e09e1c 100644 --- a/celseq2/demultiplex.py +++ b/celseq2/demultiplex.py @@ -5,7 +5,11 @@ import argparse from celseq2.helper import filehandle_fastq_gz, print_logger -from celseq2.helper import join_path, mkfolder +from celseq2.helper import join_path, mkfolder, base_name + +import plotly.graph_objs as go +from plotly.offline import plot +import pandas as pd def str2int(s): @@ -212,6 +216,59 @@ def write_demultiplexing(stats, dict_bc_id2seq, stats_fpath): stats['total'] / stats['total'] * 100)) +def plotly_demultiplexing_stats(fpaths=[], saveto='', fnames=[]): + ''' + Save a plotly box graph with a list of demultiplexing stats files + + Parameters + ---------- + fpaths : list + A list of file paths + saveto : str + File path to save the html file as the plotly box graph + fnames : list + A list of strings to label each ``fpaths`` + + Returns + ------- + bool + True if saving successfully, False otherwise + ''' + + if not fnames: + fnames = [base_name(f) for f in fpaths] + if len(fnames) != len(fpaths): + fnames = [base_name(f) for f in fpaths] + + num_reads_data = [] + for i in range(len(fpaths)): + f = fpaths[i] + fname = fnames[i] + + stats = pd.read_csv(f, index_col=0) + cell_stats = stats.iloc[:-5, :] + # tail 5 lines are fixed as the overall stats + overall_stats = stats.iloc[-5:, :] + num_reads_data.append( + go.Box( + y=cell_stats['Reads(#)'], + name='{} (#Saved={}/#Total={})'.format( + fname, + overall_stats.loc['saved', 'Reads(#)'], + overall_stats.loc['total', 'Reads(#)']))) + + layout = go.Layout( + xaxis=dict(showticklabels=False), + title='Number of reads saved per BC per item') + fig = go.Figure(data=num_reads_data, layout=layout) + try: + plot(fig, filename=saveto, auto_open=False) + return(True) + except Exception as e: + print(e, flush=True) + return(False) + + def main(): parser = argparse.ArgumentParser(add_help=True) parser.add_argument('read1_fpath', type=str) diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake index 9e4c589..f6fcade 100644 --- a/celseq2/workflow/celseq2_beta.snakemake +++ b/celseq2/workflow/celseq2_beta.snakemake @@ -3,7 +3,8 @@ from celseq2.helper import rmfolder, mkfolder, is_nonempty_file from celseq2.helper import cook_sample_sheet, popen_communicate from celseq2.prepare_annotation_model import cook_anno_model from celseq2.count_umi import count_umi, _flatten_umi_set -from celseq2.parse_log import parse_bowtie2_report, parse_star_report, merge_reports +# from celseq2.parse_log import parse_bowtie2_report, parse_star_report, merge_reports +from celseq2.demultiplex import plotly_demultiplexing_stats import pandas as pd import glob import pickle @@ -51,12 +52,13 @@ GFF = config.get('GFF', None) FEATURE_ID = config.get('FEATURE_ID', 'gene_name') FEATURE_CONTENT = config.get('FEATURE_CONTENT', 'exon') GENE_BIOTYPE = config.get('GENE_BIOTYPE', None) -if not GENE_BIOTYPE: GENE_BIOTYPE = (); +if not GENE_BIOTYPE: + GENE_BIOTYPE = () # Demultiplexing FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None) # 10 CUT_LENGTH = config.get('CUT_LENGTH', None) # 35 -SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False) # False +SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False) # False # Alignment ALIGNER = config.get('ALIGNER', None) # 'bowtie2', 'star' assert (ALIGNER), 'Error: Specify aligner.' @@ -136,11 +138,13 @@ if RUN_CELSEQ2_TO_ST: shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB)) if not KEEP_INTERMEDIATE: print_logger('Cleaning... ') - rmfolder(SUBDIR_FASTQ); + rmfolder(SUBDIR_FASTQ) # rmfolder(SUBDIR_ANNO); - rmfolder(SUBDIR_ALIGN); rmfolder(SUBDIR_ALIGN_ITEM); - rmfolder(SUBDIR_LOG); - rmfolder(SUBDIR_UMI_CNT); rmfolder(SUBDIR_UMI_SET); + rmfolder(SUBDIR_ALIGN) + rmfolder(SUBDIR_ALIGN_ITEM) + rmfolder(SUBDIR_LOG) + rmfolder(SUBDIR_UMI_CNT) + rmfolder(SUBDIR_UMI_SET) else: rule all: @@ -154,11 +158,13 @@ else: shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB)) if not KEEP_INTERMEDIATE: print_logger('Cleaning... ') - rmfolder(SUBDIR_FASTQ); + rmfolder(SUBDIR_FASTQ) # rmfolder(SUBDIR_ANNO); - rmfolder(SUBDIR_ALIGN); rmfolder(SUBDIR_ALIGN_ITEM); - rmfolder(SUBDIR_LOG); - rmfolder(SUBDIR_UMI_CNT); rmfolder(SUBDIR_UMI_SET); + rmfolder(SUBDIR_ALIGN) + rmfolder(SUBDIR_ALIGN_ITEM) + rmfolder(SUBDIR_LOG) + rmfolder(SUBDIR_UMI_CNT) + rmfolder(SUBDIR_UMI_SET) ''' @@ -236,7 +242,8 @@ if RUN_CELSEQ2_TO_ST: input: tsv = join_path(DIR_PROJ, SUBDIR_ST, '{expid}', 'ST.tsv'), output: - html = join_path(DIR_PROJ, SUBDIR_QC_EXPR, '{expid}', 'QC_ST.html'), + html = join_path(DIR_PROJ, SUBDIR_QC_EXPR, + '{expid}', 'QC_ST.html'), params: expid = '{expid}', run: @@ -258,8 +265,9 @@ if RUN_CELSEQ2_TO_ST: # Pipeline Step 1a: combo_demultiplexing # Inputs: R1 (barcode) and R2 (mRNA) -# Output: A list of FASTQs extracted from R2 per cells and they are tagged with -# cell barcode and UMI sequence gained from R1. +# Output: +# - A list of FASTQs extracted from R2 per cells and they are tagged with cell +# barcode and UMI sequence gained from R1. rule combo_demultiplexing: input: SAMPLE_TABLE_FPATH, output: @@ -345,17 +353,18 @@ rule tag_fastq: # Pipeline Step 2a: Alignment rule ALIGNMENT: input: - expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', ALIGNER+'.bigsam'), + expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', ALIGNER + '.bigsam'), itemID=item_names), message: "Finished Alignment." if ALIGNER == 'bowtie2': rule align_bowtie2: input: - fq = join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemID}', 'TAGGED.bigfastq'), + fq = join_path(DIR_PROJ, SUBDIR_FASTQ, + '{itemID}', 'TAGGED.bigfastq'), output: sam = join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, - '{itemID}', ALIGNER+'.bigsam'), + '{itemID}', ALIGNER + '.bigsam'), params: threads = num_threads, aligner_extra_parameters = ALIGNER_EXTRA_PARAMETERS, @@ -376,10 +385,11 @@ if ALIGNER == 'bowtie2': if ALIGNER == 'star': rule align_star: input: - fq = join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemID}', 'TAGGED.bigfastq'), + fq = join_path(DIR_PROJ, SUBDIR_FASTQ, + '{itemID}', 'TAGGED.bigfastq'), output: sam = join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, - '{itemID}', ALIGNER+'.bigsam'), + '{itemID}', ALIGNER + '.bigsam'), log = join_path(DIR_PROJ, SUBDIR_LOG, '{itemID}', 'Align-STAR.log'), # starsam = join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', '.star', @@ -415,10 +425,11 @@ if ALIGNER == 'star': rule combo_demultiplexing_sam: input: sam = expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, - '{itemID}', ALIGNER+'.bigsam'), - itemID = item_names), + '{itemID}', ALIGNER + '.bigsam'), + itemID=item_names), output: - sam = dynamic(join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemID}', '{bcID}.sam')), + sam = dynamic(join_path(DIR_PROJ, SUBDIR_ALIGN, + '{itemID}', '{bcID}.sam')), params: jobs = len(item_names), claim_used_bc = True, @@ -449,8 +460,8 @@ rule combo_demultiplexing_sam: # Pipeline Step 3a: cook ht-seq recognized feature object before counting UMIs # Input: GTF/GFF file # Output: a pickle file saving a tuple (features, exported_genes) where: -# - features: HTSeq.GenomicArrayOfSets(). *All* exons locations ~ genes. -# - exported_genes: a sorted list. Gene id / gene name (default all genes). +# - features: HTSeq.GenomicArrayOfSets(). *All* exons locations ~ set(genes). +# - exported_genes: a sorted list. Gene id/name. (Default all genes are exported). rule COOK_ANNOTATION: input: SAMPLE_TABLE_FPATH, @@ -495,17 +506,17 @@ rule COOK_ANNOTATION: # - annotation object # - SAM per cell # Outputs: two pickle files per cell -# - umicnt: dict(str: set(str)), i.e., dict(gene ~ set(UMI_sequence)) -# - umiset: Counter(str: int), i.e., Counter(gene ~ "number of UMIs") +# - umicnt: dict(str: set(str)) i.e., dict(gene ~ set(UMI_sequence)) +# - umiset: Counter(str: int) i.e., Counter(gene ~ number of UMIs) rule count_umi: input: gff = rules.COOK_ANNOTATION.output.anno_pkl, sam = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemID}', '{bcID}.sam'), output: umicnt = join_path(DIR_PROJ, SUBDIR_UMI_CNT, - '{itemID}', '{bcID}.pkl'), + '{itemID}', '{bcID}.pkl'), umiset = join_path(DIR_PROJ, SUBDIR_UMI_SET, - '{itemID}', '{bcID}.pkl'), + '{itemID}', '{bcID}.pkl'), message: 'Counting {input.sam}' run: features_f, _ = pickle.load(open(input.gff, 'rb')) @@ -538,7 +549,7 @@ rule summarize_umi_matrix_per_item: run: _, export_genes = pickle.load(open(input.gff, 'rb')) - # item -> dict(cell_bc -> Counter(umi_vector)) + # { item -> dict(cell_bc -> Counter(umi_vector)) } item_expr_matrix = defaultdict(dict) for f in input.umicnt: @@ -579,7 +590,7 @@ rule summarize_umi_matrix_per_experiment: sample_list = SAMPLE_TABLE['SAMPLE_NAME'].values - # experiment_id -> dict(cell_bc -> dict(gname -> set(umi))) + # { experiment_id -> dict(cell_bc -> dict(gname -> set(umi))) } exp_expr_matrix = {} for exp_id in list(set(sample_list)): @@ -655,6 +666,30 @@ rule qc_umi_matrix_per_experiment: # savetocsv=report_item) +rule REPORT: + input: + demultiplexing_fastq = join_path(DIR_PROJ, SUBDIR_REPORT, + 'demultiplexing_fastq.html') + +# Inputs: project/report/item-*/demultiplexing.csv +# Outputs: +# - Stats file (csv) per item. +# - Plotly box graph for all the items. +rule report_combo_demultiplexing: + output: + html = join_path(DIR_PROJ, SUBDIR_REPORT, 'demultiplexing_fastq.html') + run: + # Prepare the stats files + stats_fpaths = glob.glob(join_path(DIR_PROJ, SUBDIR_REPORT, + 'item-*', 'demultiplexing.csv')) + stats_fpaths_labels = [base_name(base_name(f)) for f in stats_fpaths] + work = plotly_demultiplexing_stats( + fpaths=stats_fpaths, + saveto=output.html, + fnames=stats_fpaths_labels) + if not work: + touch('{output.html}') + rule cleanall: message: "Remove all files under {DIR_PROJ}" run: From cfebaf2fbeab496870cee05829a1946e136fc498 Mon Sep 17 00:00:00 2001 From: yy1533 Date: Tue, 8 May 2018 16:43:52 -0400 Subject: [PATCH 3/8] :pill: plotly graph on demultiplexing fastqs --- celseq2/celseq2.py | 10 +++++++--- celseq2/workflow/celseq2_beta.snakemake | 5 +++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/celseq2/celseq2.py b/celseq2/celseq2.py index 0af8aae..4b106d5 100644 --- a/celseq2/celseq2.py +++ b/celseq2/celseq2.py @@ -43,6 +43,10 @@ - https://bitbucket.org/snakemake/snakemake/src/e11a57fe1f62f3f56c815d95d82871811dae81b3/snakemake/__init__.py?at=master&fileviewer=file-view-default#__init__.py-580:1127 ''' +task_choices = ['all', 'TAG_FASTQ', 'ANNOTATION', 'ALIGNMENT', + 'COUNT_MATRIX', 'QC_COUNT_MATRIX', 'CELSEQ2_TO_ST', + 'REPORT'] + def get_argument_parser(): desc = ('celseq2: A Python Package for Processing CEL-Seq2 RNA-Seq Data.') @@ -52,9 +56,9 @@ def get_argument_parser(): "target", nargs="*", default=None, - help="Targets to build. May be rules or files.", - choices=[None, 'all', 'TAG_FASTQ', 'ANNOTATION', 'ALIGNMENT', - 'COUNT_MATRIX', 'QC_COUNT_MATRIX', 'CELSEQ2_TO_ST']) + help=('Targets to build. ' + 'May be rules or files. ' + 'Task choices: {}').format(', '.join(task_choices))) parser.add_argument( "--config-file", metavar="FILE", diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake index f6fcade..a82c194 100644 --- a/celseq2/workflow/celseq2_beta.snakemake +++ b/celseq2/workflow/celseq2_beta.snakemake @@ -679,10 +679,11 @@ rule report_combo_demultiplexing: output: html = join_path(DIR_PROJ, SUBDIR_REPORT, 'demultiplexing_fastq.html') run: - # Prepare the stats files + # stats files [/path/item-1/d.csv, /path/item-2/d.csv] stats_fpaths = glob.glob(join_path(DIR_PROJ, SUBDIR_REPORT, 'item-*', 'demultiplexing.csv')) - stats_fpaths_labels = [base_name(base_name(f)) for f in stats_fpaths] + # labels [item-1, item-2] + stats_fpaths_labels = [base_name(dir_name(f)) for f in stats_fpaths] work = plotly_demultiplexing_stats( fpaths=stats_fpaths, saveto=output.html, From f7f3388fbdc0dabfab935e50653a35ad89d03c85 Mon Sep 17 00:00:00 2001 From: yy1533 Date: Tue, 8 May 2018 16:53:31 -0400 Subject: [PATCH 4/8] :muscle: plotly graph on demultiplexing fastqs --- celseq2/demultiplex.py | 1 + 1 file changed, 1 insertion(+) diff --git a/celseq2/demultiplex.py b/celseq2/demultiplex.py index 2e09e1c..356ec99 100644 --- a/celseq2/demultiplex.py +++ b/celseq2/demultiplex.py @@ -258,6 +258,7 @@ def plotly_demultiplexing_stats(fpaths=[], saveto='', fnames=[]): overall_stats.loc['total', 'Reads(#)']))) layout = go.Layout( + legend=dict(x=-.1, y=-.2), xaxis=dict(showticklabels=False), title='Number of reads saved per BC per item') fig = go.Figure(data=num_reads_data, layout=layout) From 0493caa721925d92fe5a3db2ac3424f8ea0e8bb2 Mon Sep 17 00:00:00 2001 From: yy1533 Date: Wed, 9 May 2018 11:52:33 -0400 Subject: [PATCH 5/8] :pill: forgot to claim the value is median --- celseq2/qc.py | 4 +++- celseq2/version.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/celseq2/qc.py b/celseq2/qc.py index 5475ff2..08355c9 100644 --- a/celseq2/qc.py +++ b/celseq2/qc.py @@ -218,7 +218,8 @@ def plotly_qc_st(fpath, saveto, sep='\t', name=''): mask_by=ST_qc.total_num_UMIs, hover_text=ST_qc.total_num_UMIs.astype('str'), colorscale='Viridis', - mask_title='#Total UMIs {})'.format(ST_qc.total_num_UMIs.median())) + mask_title=('#Total UMIs ' + '(median={})').format(ST_qc.total_num_UMIs.median())) # 3/3 plotly_ST_mt = plotly_scatter( x=ST_qc.Row, y=ST_qc.Col, @@ -239,6 +240,7 @@ def plotly_qc_st(fpath, saveto, sep='\t', name=''): fig['layout'].update(height=600, width=1900, title=name) fig.layout.showlegend = False + # Manually change the locations of other two color bars to proper places fig.data[0].marker.colorbar.x = 0.28 fig.data[1].marker.colorbar.x = 0.64 diff --git a/celseq2/version.py b/celseq2/version.py index 45869b6..ed7d50e 100644 --- a/celseq2/version.py +++ b/celseq2/version.py @@ -1 +1 @@ -__version__ = '0.5.2' +__version__ = '0.5.3' From 5f9bc60610b313d953279e0f2b3cc7ca23411d42 Mon Sep 17 00:00:00 2001 From: yy1533 Date: Wed, 9 May 2018 17:04:54 -0400 Subject: [PATCH 6/8] :beers: plotly graph on the alignment stats --- celseq2/count_umi.py | 62 +++++++++++++++++++++ celseq2/demultiplex.py | 2 +- celseq2/workflow/celseq2_beta.snakemake | 73 +++++++++++++++++++++++-- 3 files changed, 130 insertions(+), 7 deletions(-) diff --git a/celseq2/count_umi.py b/celseq2/count_umi.py index db71581..3d77739 100644 --- a/celseq2/count_umi.py +++ b/celseq2/count_umi.py @@ -8,6 +8,10 @@ import pickle import argparse from collections import defaultdict, Counter +import plotly.graph_objs as go +from plotly.offline import plot +import pandas as pd +from celseq2.helper import base_name def invert_strand(iv): @@ -102,6 +106,64 @@ def _flatten_umi_set(umi_set): # pass +def plotly_alignment_stats(fpaths=[], saveto='', fnames=[]): + ''' + Save a plotly box graph with a list of alignment stats files + + Parameters + ---------- + fpaths : list + A list of file paths + saveto : str + File path to save the html file as the plotly box graph + fnames : list + A list of strings to label each ``fpaths`` + + Returns + ------- + bool + True if saving successfully, False otherwise + ''' + if not fnames: + fnames = [base_name(f) for f in fpaths] + if len(fnames) != len(fpaths): + fnames = [base_name(f) for f in fpaths] + trace_data = [] + # aln_diagnose_item = ["_unmapped", + # "_low_map_qual", '_multimapped', "_uniquemapped", + # "_no_feature", "_ambiguous", + # "_total"] + for i in range(len(fpaths)): + f = fpaths[i] + fname = fnames[i] + + stats = pd.read_csv(f, index_col=0) + + mapped = stats.loc['_multimapped', :] + stats.loc['_uniquemapped', :] + rate_mapped = mapped / stats.loc['_total', :] + + overall_mapped = mapped.sum() + overall_total = stats.loc['_total', :].sum() + + stats.fillna(value=0, inplace=True) + trace_data.append( + go.Box( + y=rate_mapped, + name='{} (#Mapped={}/#Total={})'.format( + fname, overall_mapped, overall_total))) + + layout = go.Layout( + xaxis=dict(showticklabels=False), + title='Mapped/Total alignments per BC per item') + fig = go.Figure(data=trace_data, layout=layout) + try: + plot(fig, filename=saveto, auto_open=False) + return(True) + except Exception as e: + print(e, flush=True) + return(False) + + def main(): parser = argparse.ArgumentParser(add_help=True) parser.add_argument('--sam_fpath', type=str, metavar='FILENAME', diff --git a/celseq2/demultiplex.py b/celseq2/demultiplex.py index 356ec99..c7be5e3 100644 --- a/celseq2/demultiplex.py +++ b/celseq2/demultiplex.py @@ -258,7 +258,7 @@ def plotly_demultiplexing_stats(fpaths=[], saveto='', fnames=[]): overall_stats.loc['total', 'Reads(#)']))) layout = go.Layout( - legend=dict(x=-.1, y=-.2), + # legend=dict(x=-.1, y=-.2), xaxis=dict(showticklabels=False), title='Number of reads saved per BC per item') fig = go.Figure(data=num_reads_data, layout=layout) diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake index a82c194..c93ad1f 100644 --- a/celseq2/workflow/celseq2_beta.snakemake +++ b/celseq2/workflow/celseq2_beta.snakemake @@ -96,6 +96,7 @@ SUBDIR_ALIGN = 'small_sam' SUBDIR_ALIGN_ITEM = 'item_sam' SUBDIR_UMI_CNT = 'small_umi_count' SUBDIR_UMI_SET = 'small_umi_set' +SUBDIR_ALN_STATS = 'small_aln_stats' SUBDIR_EXPR = 'expr' SUBDIR_ST = 'ST' SUBDIR_LOG = 'small_log' @@ -106,16 +107,12 @@ SUBDIR_QC_EXPR = 'qc_expr' SUBDIRS = [SUBDIR_INPUT, SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_ALIGN_ITEM, - SUBDIR_UMI_CNT, SUBDIR_UMI_SET, + SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_ALN_STATS, SUBDIR_EXPR, SUBDIR_REPORT, SUBDIR_QC_EXPR, SUBDIR_LOG, SUBDIR_QSUB, SUBDIR_ANNO ] -# aln_diagnose_item = ["_unmapped", -# "_low_map_qual", '_multimapped', "_uniquemapped", -# "_no_feature", "_ambiguous", -# "_total"] ''' Part-2: Snakemake rules @@ -145,6 +142,7 @@ if RUN_CELSEQ2_TO_ST: rmfolder(SUBDIR_LOG) rmfolder(SUBDIR_UMI_CNT) rmfolder(SUBDIR_UMI_SET) + rmfolder(SUBDIR_ALN_STATS) else: rule all: @@ -165,6 +163,7 @@ else: rmfolder(SUBDIR_LOG) rmfolder(SUBDIR_UMI_CNT) rmfolder(SUBDIR_UMI_SET) + rmfolder(SUBDIR_ALN_STATS) ''' @@ -517,6 +516,8 @@ rule count_umi: '{itemID}', '{bcID}.pkl'), umiset = join_path(DIR_PROJ, SUBDIR_UMI_SET, '{itemID}', '{bcID}.pkl'), + alncnt = join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER, + '{itemID}', '{bcID}.pkl'), message: 'Counting {input.sam}' run: features_f, _ = pickle.load(open(input.gff, 'rb')) @@ -528,6 +529,7 @@ rule count_umi: dumpto=None) pickle.dump(umi_cnt, open(output.umicnt, 'wb')) pickle.dump(umi_set, open(output.umiset, 'wb')) + pickle.dump(aln_cnt, open(output.alncnt, 'wb')) # Pipeline Step 4a: Merge UMIs of cells to UMI matrix of item @@ -637,6 +639,46 @@ rule qc_umi_matrix_per_experiment: cmd += '--sep {}'.format(',') shell(cmd) + +rule summarize_aln_stats_per_item: + input: + alncnt = dynamic(join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER, + '{itemID}', '{bcID}.pkl')), + output: + aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT, + '{itemID}', + 'alignment-' + ALIGNER + '.csv'), + itemID=item_names), + run: + aln_diagnose_item = ["_unmapped", + "_low_map_qual", '_multimapped', "_uniquemapped", + "_no_feature", "_ambiguous", + "_total"] + + # { item -> dict(cell_bc -> Counter(stats)) } + item_stats = defaultdict(dict) + + for f in input.alncnt: + bc_name = base_name(f) # BC-1-xxx + item_id = base_name(dir_name(f)) # item-1 + item_stats[item_id][bc_name] = pickle.load(open(f, 'rb')) + + # export to csv + for item_id, aln_dict in item_stats.items(): + exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME'] # E1 + + for bc, cnt in aln_dict.items(): + aln_dict[bc] = pd.Series([cnt[x] for x in aln_diagnose_item], + index=aln_diagnose_item) + + aln_stats_df = pd.DataFrame( + aln_dict, index=aln_diagnose_item).fillna(0) + + aln_stats_df.to_csv(join_path(DIR_PROJ, SUBDIR_REPORT, + item_id, + 'alignment-' + ALIGNER + '.csv')) + + # rule report_alignment_log: # input: # df = dynamic(join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER, @@ -669,7 +711,9 @@ rule qc_umi_matrix_per_experiment: rule REPORT: input: demultiplexing_fastq = join_path(DIR_PROJ, SUBDIR_REPORT, - 'demultiplexing_fastq.html') + 'demultiplexing_fastq.html'), + alignment_stats = join_path(DIR_PROJ, SUBDIR_REPORT, + 'alignment-{}.html'.format(ALIGNER)), # Inputs: project/report/item-*/demultiplexing.csv # Outputs: @@ -691,6 +735,23 @@ rule report_combo_demultiplexing: if not work: touch('{output.html}') +rule report_alignment_stats: + input: + aln_item = rules.summarize_aln_stats_per_item.output.aln_item, + output: + html = join_path(DIR_PROJ, SUBDIR_REPORT, + 'alignment-{}.html'.format(ALIGNER)), + run: + from celseq2.count_umi import plotly_alignment_stats + stats_fpaths_labels = [base_name(dir_name(f)) for f in input.aln_item] + work = plotly_alignment_stats( + fpaths=input.aln_item, + saveto=output.html, + fnames=stats_fpaths_labels) + if not work: + touch('{output.html}') + + rule cleanall: message: "Remove all files under {DIR_PROJ}" run: From 805d6d053767360f7db2733eaae75d5c0b9caf05 Mon Sep 17 00:00:00 2001 From: yy1533 Date: Thu, 10 May 2018 15:14:48 -0400 Subject: [PATCH 7/8] :beer: cell name has bc id attached; :beer: merge plotly for alignment.html to the pipeline --- celseq2/workflow/celseq2_beta.snakemake | 63 ++++++++++++++++++++----- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake index c93ad1f..7690b12 100644 --- a/celseq2/workflow/celseq2_beta.snakemake +++ b/celseq2/workflow/celseq2_beta.snakemake @@ -127,6 +127,7 @@ if RUN_CELSEQ2_TO_ST: input: '_done_UMI', '_done_ST', + '_done_report', output: touch('_DONE'), run: @@ -148,6 +149,7 @@ else: rule all: input: '_done_UMI', + '_done_report', output: touch('_DONE'), run: @@ -560,14 +562,26 @@ rule summarize_umi_matrix_per_item: item_expr_matrix[item_id][bc_name] = pickle.load(open(f, 'rb')) # export to csv/hdf + dict_bc_id = pd.read_csv(BC_INDEX_FPATH, + sep='\t', index_col=BC_SEQ_COLUMN) + all_bc_seq = dict_bc_id.index.values + dict_bc_id = {seq: seq_id + 1 for seq_id, seq in enumerate(all_bc_seq)} for item_id, expr_dict in item_expr_matrix.items(): exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME'] # E1 for bc, cnt in expr_dict.items(): expr_dict[bc] = pd.Series([cnt[x] for x in export_genes], index=export_genes) - - expr_df = pd.DataFrame(expr_dict, index=export_genes).fillna(0) + cnames_ordered = sorted( + expr_dict.keys(), + key=lambda xx: dict_bc_id.get(xx, float('Inf'))) + expr_df = pd.DataFrame( + expr_dict, + index=export_genes, + columns=cnames_ordered) + expr_df.fillna(0, inplace=True) + expr_df.columns = ['BC-{}-{}'.format( + dict_bc_id.get(xx, 0), xx) for xx in cnames_ordered] expr_df.to_csv(join_path(DIR_PROJ, SUBDIR_EXPR, exp_id, item_id, 'expr.csv')) expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR, @@ -599,7 +613,7 @@ rule summarize_umi_matrix_per_experiment: exp_expr_matrix[exp_id] = defaultdict(dict) for f in input.umiset: - bc_name = base_name(f) # BC-1-xxx + bc_name = base_name(f) # xxx item_id = base_name(dir_name(f)) # item-1 exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME'] @@ -614,12 +628,26 @@ rule summarize_umi_matrix_per_experiment: exp_expr_matrix[exp_id][bc_name][x] = y1 | y2 # export to csv/hdf + dict_bc_id = pd.read_csv(BC_INDEX_FPATH, + sep='\t', index_col=BC_SEQ_COLUMN) + all_bc_seq = dict_bc_id.index.values + dict_bc_id = {seq: seq_id + 1 for seq_id, seq in enumerate(all_bc_seq)} for exp_id, expr_dict in exp_expr_matrix.items(): for bc, cnt in expr_dict.items(): cnt = _flatten_umi_set(cnt) expr_dict[bc] = pd.Series([cnt[x] for x in export_genes], index=export_genes) - expr_df = pd.DataFrame(expr_dict, index=export_genes).fillna(0) + cnames_ordered = sorted( + expr_dict.keys(), + key=lambda xx: dict_bc_id.get(xx, float('Inf'))) + expr_df = pd.DataFrame( + expr_dict, + index=export_genes, + columns=cnames_ordered) + expr_df.fillna(0, inplace=True) + expr_df.columns = ['BC-{}-{}'.format( + dict_bc_id.get(xx, 0), xx) for xx in cnames_ordered] + expr_df.to_csv(join_path(DIR_PROJ, SUBDIR_EXPR, exp_id, 'expr.csv')) expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR, @@ -642,23 +670,23 @@ rule qc_umi_matrix_per_experiment: rule summarize_aln_stats_per_item: input: - alncnt = dynamic(join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER, - '{itemID}', '{bcID}.pkl')), + # alncnt = dynamic(join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER, + # '{itemID}', '{bcID}.pkl')), + '_done_UMI', output: - aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT, - '{itemID}', + aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemName}', 'alignment-' + ALIGNER + '.csv'), - itemID=item_names), + itemName=item_names), run: aln_diagnose_item = ["_unmapped", "_low_map_qual", '_multimapped', "_uniquemapped", "_no_feature", "_ambiguous", "_total"] - # { item -> dict(cell_bc -> Counter(stats)) } item_stats = defaultdict(dict) - - for f in input.alncnt: + alncnt_files = glob.glob(join_path( + DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER, 'item-*', '*.pkl')) + for f in alncnt_files: # input.alncnt: bc_name = base_name(f) # BC-1-xxx item_id = base_name(dir_name(f)) # item-1 item_stats[item_id][bc_name] = pickle.load(open(f, 'rb')) @@ -714,12 +742,18 @@ rule REPORT: 'demultiplexing_fastq.html'), alignment_stats = join_path(DIR_PROJ, SUBDIR_REPORT, 'alignment-{}.html'.format(ALIGNER)), + output: + flag = '_done_report' + run: + shell('touch {output.flag}') # Inputs: project/report/item-*/demultiplexing.csv # Outputs: # - Stats file (csv) per item. # - Plotly box graph for all the items. rule report_combo_demultiplexing: + input: + '_done_UMI', output: html = join_path(DIR_PROJ, SUBDIR_REPORT, 'demultiplexing_fastq.html') run: @@ -737,7 +771,10 @@ rule report_combo_demultiplexing: rule report_alignment_stats: input: - aln_item = rules.summarize_aln_stats_per_item.output.aln_item, + aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT, + '{itemName}', + 'alignment-' + ALIGNER + '.csv'), + itemName=item_names), output: html = join_path(DIR_PROJ, SUBDIR_REPORT, 'alignment-{}.html'.format(ALIGNER)), From 457245e1d660f4cb2e9a7aee9f29ede77481e8d2 Mon Sep 17 00:00:00 2001 From: yy1533 Date: Thu, 10 May 2018 15:49:43 -0400 Subject: [PATCH 8/8] :dog: add in-line comments --- celseq2/workflow/celseq2_beta.snakemake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake index 7690b12..fffb5c6 100644 --- a/celseq2/workflow/celseq2_beta.snakemake +++ b/celseq2/workflow/celseq2_beta.snakemake @@ -534,7 +534,7 @@ rule count_umi: pickle.dump(aln_cnt, open(output.alncnt, 'wb')) -# Pipeline Step 4a: Merge UMIs of cells to UMI matrix of item +# Pipeline Step 4a (deprecated) : Merge UMIs of cells to UMI matrix of item # Input: a list of umicnt pickle files of cells per item # Output: UMI-count matric file (csv & hdf) per item rule summarize_umi_matrix_per_item: @@ -706,7 +706,7 @@ rule summarize_aln_stats_per_item: item_id, 'alignment-' + ALIGNER + '.csv')) - +# (deprecated) # rule report_alignment_log: # input: # df = dynamic(join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER,