diff --git a/celseq2/celseq2.py b/celseq2/celseq2.py
index 44331b5..4b106d5 100644
--- a/celseq2/celseq2.py
+++ b/celseq2/celseq2.py
@@ -29,41 +29,58 @@
     * -n
     * --notemp (--nt)
 
+Name of rules to request outputs:
+    * all (default)
+    * TAG_FASTQ
+    * ANNOTATION
+    * ALIGNMENT
+    * COUNT_MATRIX
+    * QC_COUNT_MATRIX
+    * CELSEQ2_TO_ST (only available for ST data)
+    * REPORT
+
 Refs:
 - https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html
 - https://bitbucket.org/snakemake/snakemake/src/e11a57fe1f62f3f56c815d95d82871811dae81b3/snakemake/__init__.py?at=master&fileviewer=file-view-default#__init__.py-580:1127
 '''
+task_choices = ['all', 'TAG_FASTQ', 'ANNOTATION', 'ALIGNMENT',
+                'COUNT_MATRIX', 'QC_COUNT_MATRIX', 'CELSEQ2_TO_ST',
+                'REPORT']
+
 
 def get_argument_parser():
-    desc = ('CEL-Seq2: A Python Package for Processing CEL-Seq2 RNA-Seq Data.')
+    desc = ('celseq2: A Python Package for Processing CEL-Seq2 RNA-Seq Data.')
     parser = argparse.ArgumentParser(description=desc, add_help=True)
 
     parser.add_argument(
         "target",
         nargs="*",
         default=None,
-        help="Targets to build. May be rules or files.")
+        help=('Targets to build. '
+              'May be rules or files. '
+              'Task choices: {}').format(', '.join(task_choices)))
     parser.add_argument(
         "--config-file",
         metavar="FILE",
         required=True,
-        help=("Specify details of CEL-Seq2 and gneome information."))
+        help=("Configuration of the CEL-Seq2 details "
+              "and the running environment."))
     parser.add_argument(
         "--experiment-table",
         metavar="FILE",
         required=True,
-        help=("Space/Tab separated file specifying experiment design."))
+        help=("Space/Tab separated file specifying the R1/R2 reads "
+              "and the experiment design."))
     parser.add_argument(
         "--output-dir",
         metavar="DIRECTORY",
         required=True,
-        help=("All results are saved with here as root directory."))
+        help=("All results are saved under this root directory."))
     parser.add_argument(
         "--reverse-stranded", "--rs",
         action="store_true", default=False,
-        help="Read has to be mapped to the opposite strand as the feature")
+        help="Reads have to be mapped to the opposite strand of the feature.")
     parser.add_argument(
         "--celseq2-to-st", "--st",
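# A minimal sketch (not part of this patch) of how a named task such as
# REPORT reaches Snakemake: the positional `target` arguments are forwarded
# as rule names to the snakemake() API referenced in the docstring above.
# Paths here are hypothetical placeholders; the exact keyword names assume
# the Snakemake API version linked in the Refs.
from snakemake import snakemake

success = snakemake(
    snakefile='celseq2/workflow/celseq2_beta.snakemake',
    configfile='/path/to/config.yaml',   # from --config-file
    targets=['REPORT'],                  # from the positional task choice
    printshellcmds=True)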
diff --git a/celseq2/count_umi.py b/celseq2/count_umi.py
index db71581..3d77739 100644
--- a/celseq2/count_umi.py
+++ b/celseq2/count_umi.py
@@ -8,6 +8,10 @@
 import pickle
 import argparse
 from collections import defaultdict, Counter
+import plotly.graph_objs as go
+from plotly.offline import plot
+import pandas as pd
+from celseq2.helper import base_name
 
 
 def invert_strand(iv):
@@ -102,6 +106,64 @@ def _flatten_umi_set(umi_set):
     #     pass
 
 
+def plotly_alignment_stats(fpaths=[], saveto='', fnames=[]):
+    '''
+    Save a plotly box graph with a list of alignment stats files.
+
+    Parameters
+    ----------
+    fpaths : list
+        A list of file paths.
+    saveto : str
+        File path of the output HTML file with the plotly box graph.
+    fnames : list
+        A list of strings to label each of ``fpaths``.
+
+    Returns
+    -------
+    bool
+        True if saved successfully, False otherwise.
+    '''
+    if not fnames:
+        fnames = [base_name(f) for f in fpaths]
+    if len(fnames) != len(fpaths):
+        fnames = [base_name(f) for f in fpaths]
+    trace_data = []
+    # aln_diagnose_item = ["_unmapped",
+    #                      "_low_map_qual", '_multimapped', "_uniquemapped",
+    #                      "_no_feature", "_ambiguous",
+    #                      "_total"]
+    for i in range(len(fpaths)):
+        f = fpaths[i]
+        fname = fnames[i]
+
+        stats = pd.read_csv(f, index_col=0)
+
+        mapped = stats.loc['_multimapped', :] + stats.loc['_uniquemapped', :]
+        rate_mapped = mapped / stats.loc['_total', :]
+
+        overall_mapped = mapped.sum()
+        overall_total = stats.loc['_total', :].sum()
+
+        stats.fillna(value=0, inplace=True)
+        trace_data.append(
+            go.Box(
+                y=rate_mapped,
+                name='{} (#Mapped={}/#Total={})'.format(
+                    fname, overall_mapped, overall_total)))
+
+    layout = go.Layout(
+        xaxis=dict(showticklabels=False),
+        title='Mapped/Total alignments per BC per item')
+    fig = go.Figure(data=trace_data, layout=layout)
+    try:
+        plot(fig, filename=saveto, auto_open=False)
+        return True
+    except Exception as e:
+        print(e, flush=True)
+        return False
+
+
 def main():
     parser = argparse.ArgumentParser(add_help=True)
     parser.add_argument('--sam_fpath', type=str, metavar='FILENAME',
diff --git a/celseq2/demultiplex.py b/celseq2/demultiplex.py
index dbd7538..c7be5e3 100644
--- a/celseq2/demultiplex.py
+++ b/celseq2/demultiplex.py
@@ -5,7 +5,11 @@
 import argparse
 
 from celseq2.helper import filehandle_fastq_gz, print_logger
-from celseq2.helper import join_path, mkfolder
+from celseq2.helper import join_path, mkfolder, base_name
+
+import plotly.graph_objs as go
+from plotly.offline import plot
+import pandas as pd
 
 
 def str2int(s):
@@ -212,6 +216,60 @@ def write_demultiplexing(stats, dict_bc_id2seq, stats_fpath):
             stats['total'] / stats['total'] * 100))
 
 
+def plotly_demultiplexing_stats(fpaths=[], saveto='', fnames=[]):
+    '''
+    Save a plotly box graph with a list of demultiplexing stats files.
+
+    Parameters
+    ----------
+    fpaths : list
+        A list of file paths.
+    saveto : str
+        File path of the output HTML file with the plotly box graph.
+    fnames : list
+        A list of strings to label each of ``fpaths``.
+
+    Returns
+    -------
+    bool
+        True if saved successfully, False otherwise.
+    '''
+
+    if not fnames:
+        fnames = [base_name(f) for f in fpaths]
+    if len(fnames) != len(fpaths):
+        fnames = [base_name(f) for f in fpaths]
+
+    num_reads_data = []
+    for i in range(len(fpaths)):
+        f = fpaths[i]
+        fname = fnames[i]
+
+        stats = pd.read_csv(f, index_col=0)
+        cell_stats = stats.iloc[:-5, :]
+        # the last 5 rows are fixed as the overall stats
+        overall_stats = stats.iloc[-5:, :]
+        num_reads_data.append(
+            go.Box(
+                y=cell_stats['Reads(#)'],
+                name='{} (#Saved={}/#Total={})'.format(
+                    fname,
+                    overall_stats.loc['saved', 'Reads(#)'],
+                    overall_stats.loc['total', 'Reads(#)'])))
+
+    layout = go.Layout(
+        # legend=dict(x=-.1, y=-.2),
+        xaxis=dict(showticklabels=False),
+        title='Number of reads saved per BC per item')
+    fig = go.Figure(data=num_reads_data, layout=layout)
+    try:
+        plot(fig, filename=saveto, auto_open=False)
+        return True
+    except Exception as e:
+        print(e, flush=True)
+        return False
+
+
 def main():
     parser = argparse.ArgumentParser(add_help=True)
     parser.add_argument('read1_fpath', type=str)
diff --git a/celseq2/qc.py b/celseq2/qc.py
index 5475ff2..08355c9 100644
--- a/celseq2/qc.py
+++ b/celseq2/qc.py
@@ -218,7 +218,8 @@ def plotly_qc_st(fpath, saveto, sep='\t', name=''):
         mask_by=ST_qc.total_num_UMIs,
         hover_text=ST_qc.total_num_UMIs.astype('str'),
         colorscale='Viridis',
-        mask_title='#Total UMIs {})'.format(ST_qc.total_num_UMIs.median()))
+        mask_title=('#Total UMIs '
+                    '(median={})').format(ST_qc.total_num_UMIs.median()))
     # 3/3
     plotly_ST_mt = plotly_scatter(
         x=ST_qc.Row, y=ST_qc.Col,
@@ -239,6 +240,7 @@ def plotly_qc_st(fpath, saveto, sep='\t', name=''):
 
     fig['layout'].update(height=600, width=1900, title=name)
     fig.layout.showlegend = False
+    # Manually move the other two color bars to proper positions
    fig.data[0].marker.colorbar.x = 0.28
    fig.data[1].marker.colorbar.x = 0.64
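# A usage sketch (file paths hypothetical) of the two new plotting helpers
# added above: both take a list of per-item stats CSVs plus matching labels,
# and return True only when the HTML report was written successfully.
from celseq2.demultiplex import plotly_demultiplexing_stats
from celseq2.count_umi import plotly_alignment_stats

ok = plotly_demultiplexing_stats(
    fpaths=['report/item-1/demultiplexing.csv',
            'report/item-2/demultiplexing.csv'],
    saveto='report/demultiplexing_fastq.html',
    fnames=['item-1', 'item-2'])

ok = ok and plotly_alignment_stats(
    fpaths=['report/item-1/alignment-bowtie2.csv',
            'report/item-2/alignment-bowtie2.csv'],
    saveto='report/alignment-bowtie2.html',
    fnames=['item-1', 'item-2'])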
diff --git a/celseq2/version.py b/celseq2/version.py
index 45869b6..ed7d50e 100644
--- a/celseq2/version.py
+++ b/celseq2/version.py
@@ -1 +1 @@
-__version__ = '0.5.2'
+__version__ = '0.5.3'
diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake
index 79dec53..fffb5c6 100644
--- a/celseq2/workflow/celseq2_beta.snakemake
+++ b/celseq2/workflow/celseq2_beta.snakemake
@@ -1,11 +1,10 @@
-######################################################################
-## Dependencies ##
 from celseq2.helper import join_path, base_name, dir_name, print_logger
 from celseq2.helper import rmfolder, mkfolder, is_nonempty_file
 from celseq2.helper import cook_sample_sheet, popen_communicate
 from celseq2.prepare_annotation_model import cook_anno_model
 from celseq2.count_umi import count_umi, _flatten_umi_set
-from celseq2.parse_log import parse_bowtie2_report, parse_star_report, merge_reports
+# from celseq2.parse_log import parse_bowtie2_report, parse_star_report, merge_reports
+from celseq2.demultiplex import plotly_demultiplexing_stats
 import pandas as pd
 import glob
 import pickle
@@ -18,16 +17,18 @@
 import tempfile
 import shutil
 import os
 
-
-## Information ##
+'''
+Part-1: Reading Config
+'''
+# Information
 # '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2'
 DIR_PROJ = config.get('output_dir', None)
 
-## Sample Sheet ##
+# Sample Sheet
 SAMPLE_TABLE_FPATH = config.get('experiment_table', None)
 SAMPLE_TABLE = cook_sample_sheet(SAMPLE_TABLE_FPATH)  # ''
 
-## CEL-seq2 Tech Setting ##
+# CEL-seq2 Tech Setting
 # '/ifs/data/yanailab/refs/barcodes/barcodes_cel-seq_umis96.tab'
 BC_INDEX_FPATH = config.get('BC_INDEX_FPATH', None)
 BC_IDs_DEFAULT = config.get('BC_IDs_DEFAULT', None)  # '1-96'
@@ -37,7 +38,7 @@
 BC_START_POSITION = config.get('BC_START_POSITION', None)
 UMI_LENGTH = config.get('UMI_LENGTH', None)  # 6
 BC_LENGTH = config.get('BC_LENGTH', None)  # 6
 
-## Tools ##
+# Tools
 # '/ifs/data/yanailab/refs/danio_rerio/danRer10_87/genome/Danio_rerio.GRCz10.dna.toplevel'
 BOWTIE2_INDEX_PREFIX = config.get('BOWTIE2_INDEX_PREFIX', None)
 BOWTIE2 = config.get('BOWTIE2', None)  # '/local/apps/bowtie2/2.3.1/bowtie2'
@@ -45,43 +46,38 @@
 STAR_INDEX_DIR = config.get('STAR_INDEX_DIR', None)
 STAR = config.get('STAR', None)
 ALIGNER_EXTRA_PARAMETERS = config.get('ALIGNER_EXTRA_PARAMETERS', '')
 
-## Annotations ##
+# Annotations
 # '/ifs/data/yanailab/refs/danio_rerio/danRer10_87/gtf/Danio_rerio.GRCz10.87.gtf.gz'
 GFF = config.get('GFF', None)
 FEATURE_ID = config.get('FEATURE_ID', 'gene_name')
 FEATURE_CONTENT = config.get('FEATURE_CONTENT', 'exon')
 GENE_BIOTYPE = config.get('GENE_BIOTYPE', None)
-if not GENE_BIOTYPE: GENE_BIOTYPE = ();
+if not GENE_BIOTYPE:
+    GENE_BIOTYPE = ()
 
-## Demultiplexing ##
+# Demultiplexing
 FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None)  # 10
 CUT_LENGTH = config.get('CUT_LENGTH', None)  # 35
-SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False) # False
+SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False)  # False
 
-## Alignment ##
+# Alignment
 ALIGNER = config.get('ALIGNER', None)  # 'bowtie2', 'star'
 assert (ALIGNER), 'Error: Specify aligner.'
 assert (ALIGNER in ['bowtie2', 'star']), 'Error: Unknown aligner.'
 
-## UMI Count ##
+# UMI Count
 ALN_QUAL_MIN = config.get('ALN_QUAL_MIN', None)  # 0
 STRANDED = config.get('stranded', 'yes')
 
-
-## Running Parameters ##
-# bc_ids_used=config.get('bc_ids_used', None)  # '1,2,3,4-5'
+# Running Parameters
 num_threads = config.get('num_threads', 16)  # 5
 verbose = config.get('verbose', True)  # True
 RUN_CELSEQ2_TO_ST = config.get('run_celseq2_to_st', False)
 KEEP_INTERMEDIATE = config.get('keep_intermediate', False)
 
-#######################
-## Pipeline reserved ##
-#######################
+# Pipeline reserved variables
 item_names = list(SAMPLE_TABLE.index)
-# expand('{r1_fpath}', r1_fpath=SAMPLE_TABLE['R1'].values)
 sample_list = SAMPLE_TABLE['SAMPLE_NAME'].values
-# expand('{r2_fpath}', r2_fpath=SAMPLE_TABLE['R2'].values)
 R1 = SAMPLE_TABLE['R1'].values
 R2 = SAMPLE_TABLE['R2'].values
 bc_used = SAMPLE_TABLE['CELL_BARCODES_INDEX'].values
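# For orientation only: a minimal config covering the keys read above, with
# hypothetical values. In practice this lives in the YAML file passed via
# --config-file and arrives here as Snakemake's `config` dict.
config = {
    'output_dir': '/path/to/project',
    'experiment_table': '/path/to/experiment_table.txt',
    'BC_INDEX_FPATH': '/refs/barcodes/barcodes_cel-seq_umis96.tab',
    'BC_IDs_DEFAULT': '1-96',
    'UMI_LENGTH': 6,
    'BC_LENGTH': 6,
    'ALIGNER': 'bowtie2',
    'BOWTIE2_INDEX_PREFIX': '/refs/genome/bowtie2_index_prefix',
    'BOWTIE2': '/usr/bin/bowtie2',
    'GFF': '/refs/gtf/annotation.gtf.gz',
    'num_threads': 16,
    'run_celseq2_to_st': False,
    'keep_intermediate': False,
}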
@@ -100,6 +96,7 @@
 SUBDIR_ALIGN = 'small_sam'
 SUBDIR_ALIGN_ITEM = 'item_sam'
 SUBDIR_UMI_CNT = 'small_umi_count'
 SUBDIR_UMI_SET = 'small_umi_set'
+SUBDIR_ALN_STATS = 'small_aln_stats'
 SUBDIR_EXPR = 'expr'
 SUBDIR_ST = 'ST'
 SUBDIR_LOG = 'small_log'
@@ -110,29 +107,27 @@
 SUBDIR_QC_EXPR = 'qc_expr'
 
 SUBDIRS = [SUBDIR_INPUT,
            SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_ALIGN_ITEM,
-           SUBDIR_UMI_CNT, SUBDIR_UMI_SET,
+           SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_ALN_STATS,
            SUBDIR_EXPR,
            SUBDIR_REPORT, SUBDIR_QC_EXPR,
            SUBDIR_LOG, SUBDIR_QSUB, SUBDIR_ANNO
            ]
-# aln_diagnose_item = ["_unmapped",
-#                      "_low_map_qual", '_multimapped', "_uniquemapped",
-#                      "_no_feature", "_ambiguous",
-#                      "_total"]
-## Dev ##
-
-#####################
-## Snakemake rules ##
-#####################
+'''
+Part-2: Snakemake rules
+'''
 workdir: DIR_PROJ
 
+'''
+Default task named "all" to request all outputs.
+'''
 if RUN_CELSEQ2_TO_ST:
     rule all:
         input:
             '_done_UMI',
             '_done_ST',
+            '_done_report',
         output:
             touch('_DONE'),
         run:
@@ -141,16 +136,20 @@
             shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
             if not KEEP_INTERMEDIATE:
                 print_logger('Cleaning... ')
-                rmfolder(SUBDIR_FASTQ);
+                rmfolder(SUBDIR_FASTQ)
                 # rmfolder(SUBDIR_ANNO);
-                rmfolder(SUBDIR_ALIGN); rmfolder(SUBDIR_ALIGN_ITEM);
-                rmfolder(SUBDIR_LOG);
-                rmfolder(SUBDIR_UMI_CNT); rmfolder(SUBDIR_UMI_SET);
+                rmfolder(SUBDIR_ALIGN)
+                rmfolder(SUBDIR_ALIGN_ITEM)
+                rmfolder(SUBDIR_LOG)
+                rmfolder(SUBDIR_UMI_CNT)
+                rmfolder(SUBDIR_UMI_SET)
+                rmfolder(SUBDIR_ALN_STATS)
 
 else:
     rule all:
         input:
             '_done_UMI',
+            '_done_report',
         output:
             touch('_DONE'),
         run:
@@ -159,13 +158,19 @@
             shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
             if not KEEP_INTERMEDIATE:
                 print_logger('Cleaning... ')
-                rmfolder(SUBDIR_FASTQ);
+                rmfolder(SUBDIR_FASTQ)
                 # rmfolder(SUBDIR_ANNO);
-                rmfolder(SUBDIR_ALIGN); rmfolder(SUBDIR_ALIGN_ITEM);
-                rmfolder(SUBDIR_LOG);
-                rmfolder(SUBDIR_UMI_CNT); rmfolder(SUBDIR_UMI_SET);
+                rmfolder(SUBDIR_ALIGN)
+                rmfolder(SUBDIR_ALIGN_ITEM)
+                rmfolder(SUBDIR_LOG)
+                rmfolder(SUBDIR_UMI_CNT)
+                rmfolder(SUBDIR_UMI_SET)
+                rmfolder(SUBDIR_ALN_STATS)
 
 
+'''
+Subtask named "COUNT_MATRIX" to request the UMI count matrix outputs.
+'''
 rule COUNT_MATRIX:
     input:
         csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
@@ -185,6 +190,9 @@
         print_logger('UMI-count matrix is saved at {}'.format(input.csv))
 
 
+'''
+Subtask named "QC_COUNT_MATRIX" to request the QC plots of the UMI count matrices.
+'''
 rule QC_COUNT_MATRIX:
     input:
         html = expand(join_path(DIR_PROJ, SUBDIR_QC_EXPR,
@@ -192,8 +200,9 @@
                                 '{expid}', 'QC.html'),
                       expid=list(set(sample_list))),
     message: 'Finished QC UMI-count matrix.'
-
-# join_path(DIR_PROJ, SUBDIR_QC_EXPR, '{expid}', 'QC_ST.html')
+'''
+Subtask named "CELSEQ2_TO_ST" to request the spatial-transcriptomics data format.
+'''
 if RUN_CELSEQ2_TO_ST:
     rule CELSEQ2_TO_ST:
         input:
@@ -234,7 +243,8 @@
         input:
             tsv = join_path(DIR_PROJ, SUBDIR_ST, '{expid}', 'ST.tsv'),
         output:
-            html = join_path(DIR_PROJ, SUBDIR_QC_EXPR, '{expid}', 'QC_ST.html'),
+            html = join_path(DIR_PROJ, SUBDIR_QC_EXPR,
+                             '{expid}', 'QC_ST.html'),
         params:
             expid = '{expid}',
         run:
@@ -254,7 +264,11 @@
             # touch('_done_report')
 
 
-# Combo-demultiplexing
+# Pipeline Step 1a: combo_demultiplexing
+# Inputs: R1 (barcode) and R2 (mRNA)
+# Output:
+# - per-cell FASTQs extracted from R2, with each read tagged with the cell
+#   barcode and UMI sequence taken from R1
 rule combo_demultiplexing:
     input: SAMPLE_TABLE_FPATH,
     output:
@@ -305,13 +319,18 @@
             p.join()
         shell('touch _done_combodemultiplex')
 
+
+'''
+Subtask named "TAG_FASTQ" to request the intermediate files: the mRNA FASTQ
+tagged with the cell id and UMI sequence taken from the barcode FASTQ.
+'''
 rule TAG_FASTQ:
     input:
         expand(join_path(DIR_PROJ, SUBDIR_FASTQ,
                          '{itemID}', 'TAGGED.bigfastq'),
                itemID=item_names),
     message: "Finished Tagging FASTQs."
 
-
+# Pipeline Step 1b: Merge tagged FASTQs
 rule tag_fastq:
     input:
         fq = dynamic(join_path(DIR_PROJ, SUBDIR_FASTQ,
@@ -332,20 +351,21 @@
             print_logger('Tagged FQ: {}'.format(itemid_tag_fq))
 
 
-## Alignment ##
+# Pipeline Step 2a: Alignment
 rule ALIGNMENT:
     input:
-        expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', ALIGNER+'.bigsam'),
+        expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', ALIGNER + '.bigsam'),
                itemID=item_names),
     message: "Finished Alignment."
 
 if ALIGNER == 'bowtie2':
     rule align_bowtie2:
         input:
-            fq = join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemID}', 'TAGGED.bigfastq'),
+            fq = join_path(DIR_PROJ, SUBDIR_FASTQ,
+                           '{itemID}', 'TAGGED.bigfastq'),
         output:
             sam = join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM,
-                            '{itemID}', ALIGNER+'.bigsam'),
+                            '{itemID}', ALIGNER + '.bigsam'),
        params:
            threads = num_threads,
            aligner_extra_parameters = ALIGNER_EXTRA_PARAMETERS,
@@ -366,10 +386,11 @@
 if ALIGNER == 'star':
     rule align_star:
         input:
-            fq = join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemID}', 'TAGGED.bigfastq'),
+            fq = join_path(DIR_PROJ, SUBDIR_FASTQ,
+                           '{itemID}', 'TAGGED.bigfastq'),
         output:
             sam = join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM,
-                            '{itemID}', ALIGNER+'.bigsam'),
+                            '{itemID}', ALIGNER + '.bigsam'),
             log = join_path(DIR_PROJ, SUBDIR_LOG, '{itemID}',
                             'Align-STAR.log'),
             # starsam = join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', '.star',
@@ -401,14 +422,15 @@
             shell('mv {starsam} {output.sam} ')
             shell('mv {starlog} {output.log} ')
 
-
+# Pipeline Step 2b: Combo-demultiplex the SAM file
 rule combo_demultiplexing_sam:
     input:
         sam = expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM,
-                               '{itemID}', ALIGNER+'.bigsam'),
-                     itemID = item_names),
+                               '{itemID}', ALIGNER + '.bigsam'),
+                     itemID=item_names),
     output:
-        sam = dynamic(join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemID}', '{bcID}.sam')),
+        sam = dynamic(join_path(DIR_PROJ, SUBDIR_ALIGN,
+                                '{itemID}', '{bcID}.sam')),
     params:
         jobs = len(item_names),
         claim_used_bc = True,
@@ -436,7 +458,11 @@
        shell('touch _done_combodemultiplex_sam')
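# For orientation only: the shape of the experiment table consumed by the
# demultiplexing rules above, as cooked by cook_sample_sheet() — one row per
# item, indexed by item name. All values here are hypothetical.
import pandas as pd

SAMPLE_TABLE = pd.DataFrame(
    {'SAMPLE_NAME': ['E1', 'E1'],
     'R1': ['/path/to/item1_R1.fastq.gz', '/path/to/item2_R1.fastq.gz'],
     'R2': ['/path/to/item1_R2.fastq.gz', '/path/to/item2_R2.fastq.gz'],
     'CELL_BARCODES_INDEX': ['1-48', '49-96']},
    index=['item-1', 'item-2'])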
 
-## HT-seq Count UMI ##
+# Pipeline Step 3a: cook the HTSeq-recognized feature object before counting UMIs
+# Input: GTF/GFF file.
+# Output: a pickle file saving a tuple (features, exported_genes) where:
+# - features: HTSeq.GenomicArrayOfSets() mapping *all* exon locations ~ set(genes).
+# - exported_genes: a sorted list of gene ids/names (by default all genes are exported).
 rule COOK_ANNOTATION:
     input:
         SAMPLE_TABLE_FPATH,
@@ -463,7 +489,7 @@
             if params.gene_type:
                 print_logger('Types of reported gene: {}.'.format(
                     ', '.join(params.gene_type)))
-                gene_df = get_genes(GFF, valid_biotypes = set(params.gene_type))
+                gene_df = get_genes(GFF, valid_biotypes=set(params.gene_type))
                 exported_genes = sorted(gene_df['name'].values)
                 print_logger('Number of reported genes: {}.'.format(
                     len(exported_genes)))
@@ -472,20 +498,28 @@
             shell('touch {output.anno_csv}')
 
         with open(output.anno_pkl, 'wb') as fh:
-            pickle.dump( (features, exported_genes), fh)
+            pickle.dump((features, exported_genes), fh)
         shell('touch {output.flag} ')
 
 
+# Pipeline Step 3b: Count UMIs
+# Inputs:
+# - the annotation object
+# - one SAM file per cell
+# Outputs: two pickle files per cell
+# - umicnt: Counter(str: int), i.e., Counter(gene ~ number of UMIs)
+# - umiset: dict(str: set(str)), i.e., dict(gene ~ set(UMI_sequence))
 rule count_umi:
     input:
         gff = rules.COOK_ANNOTATION.output.anno_pkl,
         sam = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemID}', '{bcID}.sam'),
     output:
         umicnt = join_path(DIR_PROJ, SUBDIR_UMI_CNT,
-                          '{itemID}', '{bcID}.pkl'),
+                           '{itemID}', '{bcID}.pkl'),
         umiset = join_path(DIR_PROJ, SUBDIR_UMI_SET,
-                          '{itemID}', '{bcID}.pkl'),
+                           '{itemID}', '{bcID}.pkl'),
+        alncnt = join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER,
+                           '{itemID}', '{bcID}.pkl'),
     message: 'Counting {input.sam}'
     run:
         features_f, _ = pickle.load(open(input.gff, 'rb'))
@@ -497,9 +531,12 @@
                                      dumpto=None)
        pickle.dump(umi_cnt, open(output.umicnt, 'wb'))
        pickle.dump(umi_set, open(output.umiset, 'wb'))
+        pickle.dump(aln_cnt, open(output.alncnt, 'wb'))
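# Shapes of the two per-cell pickles written above, with illustrative values
# only (gene names and UMI sequences are hypothetical):
from collections import Counter

umi_cnt = Counter({'GeneA': 3, 'GeneB': 1})          # gene -> number of UMIs
umi_set = {'GeneA': {'ACGTAA', 'TTGCAC', 'GGACTT'},  # gene -> set of UMI seqs
           'GeneB': {'CATGCA'}}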
 
-# - regular umi-count using *_umicnt.pkl -> umi_count_matrix replicates/lanes per plate
+# Pipeline Step 4a (deprecated): Merge per-cell UMIs into the UMI matrix of an item
+# Input: a list of umicnt pickle files of the cells of one item
+# Output: UMI-count matrix file (csv & hdf) per item
 rule summarize_umi_matrix_per_item:
     input:
         gff = rules.COOK_ANNOTATION.output.anno_pkl,
@@ -516,7 +553,7 @@
     run:
         _, export_genes = pickle.load(open(input.gff, 'rb'))
 
-        # item -> dict(cell_bc -> Counter(umi_vector))
+        # { item -> dict(cell_bc -> Counter(umi_vector)) }
         item_expr_matrix = defaultdict(dict)
 
         for f in input.umicnt:
             bc_name = base_name(f)
             item_id = base_name(dir_name(f))  # item-1
             item_expr_matrix[item_id][bc_name] = pickle.load(open(f, 'rb'))
 
+        # export to csv/hdf
+        dict_bc_id = pd.read_csv(BC_INDEX_FPATH,
+                                 sep='\t', index_col=BC_SEQ_COLUMN)
+        all_bc_seq = dict_bc_id.index.values
+        dict_bc_id = {seq: seq_id + 1 for seq_id, seq in enumerate(all_bc_seq)}
         for item_id, expr_dict in item_expr_matrix.items():
             exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME']  # E1
 
             for bc, cnt in expr_dict.items():
                 expr_dict[bc] = pd.Series([cnt[x] for x in export_genes],
                                           index=export_genes)
-
-            expr_df = pd.DataFrame(expr_dict, index=export_genes).fillna(0)
+            cnames_ordered = sorted(
+                expr_dict.keys(),
+                key=lambda xx: dict_bc_id.get(xx, float('Inf')))
+            expr_df = pd.DataFrame(
+                expr_dict,
+                index=export_genes,
+                columns=cnames_ordered)
+            expr_df.fillna(0, inplace=True)
+            expr_df.columns = ['BC-{}-{}'.format(
+                dict_bc_id.get(xx, 0), xx) for xx in cnames_ordered]
             expr_df.to_csv(join_path(DIR_PROJ, SUBDIR_EXPR,
                                      exp_id, item_id, 'expr.csv'))
             expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR,
                                      exp_id, item_id, 'expr.h5'), 'table')
 
 
-# - merge umi-count using *_umiset.pkl -> correct umi count per experiment/plate
+# Pipeline Step 4b: Merge per-cell UMIs into the UMI matrix of an experiment
+# Input: a list of umiset pickle files of the cells of one experiment
+# Output: UMI-count matrix file (csv & hdf) per experiment
 rule summarize_umi_matrix_per_experiment:
     input:
-        # gff = join_path(DIR_PROJ, SUBDIR_ANNO,
-        #                 base_name(GFF) + '.pickle'),
         gff = rules.COOK_ANNOTATION.output.anno_pkl,
         umiset = dynamic(join_path(DIR_PROJ, SUBDIR_UMI_SET,
                                    '{itemID}', '{bcID}.pkl')),
@@ -557,14 +606,14 @@
 
         sample_list = SAMPLE_TABLE['SAMPLE_NAME'].values
 
-        # experiment_id -> dict(cell_bc -> dict(gname -> set(umi)))
+        # { experiment_id -> dict(cell_bc -> dict(gname -> set(umi))) }
         exp_expr_matrix = {}
 
         for exp_id in list(set(sample_list)):
             exp_expr_matrix[exp_id] = defaultdict(dict)
 
         for f in input.umiset:
-            bc_name = base_name(f)  # BC-1-xxx
+            bc_name = base_name(f)  # xxx
             item_id = base_name(dir_name(f))  # item-1
             exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME']
 
@@ -578,12 +627,27 @@
                 y2 = umiset_stream.get(x, set())
                 exp_expr_matrix[exp_id][bc_name][x] = y1 | y2
 
+        # export to csv/hdf
+        dict_bc_id = pd.read_csv(BC_INDEX_FPATH,
+                                 sep='\t', index_col=BC_SEQ_COLUMN)
+        all_bc_seq = dict_bc_id.index.values
+        dict_bc_id = {seq: seq_id + 1 for seq_id, seq in enumerate(all_bc_seq)}
         for exp_id, expr_dict in exp_expr_matrix.items():
             for bc, cnt in expr_dict.items():
                 cnt = _flatten_umi_set(cnt)
                 expr_dict[bc] = pd.Series([cnt[x] for x in export_genes],
                                           index=export_genes)
-            expr_df = pd.DataFrame(expr_dict, index=export_genes).fillna(0)
+            cnames_ordered = sorted(
+                expr_dict.keys(),
+                key=lambda xx: dict_bc_id.get(xx, float('Inf')))
+            expr_df = pd.DataFrame(
+                expr_dict,
+                index=export_genes,
+                columns=cnames_ordered)
+            expr_df.fillna(0, inplace=True)
+            expr_df.columns = ['BC-{}-{}'.format(
+                dict_bc_id.get(xx, 0), xx) for xx in cnames_ordered]
+
             expr_df.to_csv(join_path(DIR_PROJ, SUBDIR_EXPR,
                                      exp_id, 'expr.csv'))
             expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR,
@@ -603,6 +667,46 @@ rule qc_umi_matrix_per_experiment:
             cmd += '--sep {}'.format(',')
         shell(cmd)
 
+
+rule summarize_aln_stats_per_item:
+    input:
+        # alncnt = dynamic(join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER,
+        #                            '{itemID}', '{bcID}.pkl')),
+        '_done_UMI',
+    output:
+        aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemName}',
+                                    'alignment-' + ALIGNER + '.csv'),
+                          itemName=item_names),
+    run:
+        aln_diagnose_item = ["_unmapped",
+                             "_low_map_qual", '_multimapped', "_uniquemapped",
+                             "_no_feature", "_ambiguous",
+                             "_total"]
+        # { item -> dict(cell_bc -> Counter(stats)) }
+        item_stats = defaultdict(dict)
+        alncnt_files = glob.glob(join_path(
+            DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER, 'item-*', '*.pkl'))
+        for f in alncnt_files:  # input.alncnt:
+            bc_name = base_name(f)  # BC-1-xxx
+            item_id = base_name(dir_name(f))  # item-1
+            item_stats[item_id][bc_name] = pickle.load(open(f, 'rb'))
+
+        # export to csv
+        for item_id, aln_dict in item_stats.items():
+            exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME']  # E1
+
+            for bc, cnt in aln_dict.items():
+                aln_dict[bc] = pd.Series([cnt[x] for x in aln_diagnose_item],
+                                         index=aln_diagnose_item)
+
+            aln_stats_df = pd.DataFrame(
+                aln_dict, index=aln_diagnose_item).fillna(0)
+
+            aln_stats_df.to_csv(join_path(DIR_PROJ, SUBDIR_REPORT,
+                                          item_id,
+                                          'alignment-' + ALIGNER + '.csv'))
+
+# (deprecated)
 # rule report_alignment_log:
 #     input:
 #         df = dynamic(join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER,
@@ -632,6 +736,59 @@ rule qc_umi_matrix_per_experiment:
 #         savetocsv=report_item)
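# How the new column scheme in the two summarize rules above orders and labels
# barcodes (sequences hypothetical): columns are sorted by their ID in
# BC_INDEX_FPATH and labeled 'BC-<id>-<seq>'; sequences missing from the index
# sort last and get id 0.
dict_bc_id = {'AGACTC': 1, 'AGCTAG': 2}
cnames_ordered = sorted(['AGCTAG', 'AGACTC'],
                        key=lambda xx: dict_bc_id.get(xx, float('Inf')))
labels = ['BC-{}-{}'.format(dict_bc_id.get(xx, 0), xx) for xx in cnames_ordered]
assert labels == ['BC-1-AGACTC', 'BC-2-AGCTAG']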
 
+rule REPORT:
+    input:
+        demultiplexing_fastq = join_path(DIR_PROJ, SUBDIR_REPORT,
+                                         'demultiplexing_fastq.html'),
+        alignment_stats = join_path(DIR_PROJ, SUBDIR_REPORT,
+                                    'alignment-{}.html'.format(ALIGNER)),
+    output:
+        flag = '_done_report'
+    run:
+        shell('touch {output.flag}')
+
+# Inputs: project/report/item-*/demultiplexing.csv
+# Outputs:
+# - one stats file (csv) per item
+# - one plotly box graph over all the items
+rule report_combo_demultiplexing:
+    input:
+        '_done_UMI',
+    output:
+        html = join_path(DIR_PROJ, SUBDIR_REPORT, 'demultiplexing_fastq.html')
+    run:
+        # stats files [/path/item-1/d.csv, /path/item-2/d.csv]
+        stats_fpaths = glob.glob(join_path(DIR_PROJ, SUBDIR_REPORT,
+                                           'item-*', 'demultiplexing.csv'))
+        # labels [item-1, item-2]
+        stats_fpaths_labels = [base_name(dir_name(f)) for f in stats_fpaths]
+        work = plotly_demultiplexing_stats(
+            fpaths=stats_fpaths,
+            saveto=output.html,
+            fnames=stats_fpaths_labels)
+        if not work:
+            shell('touch {output.html}')
+
+rule report_alignment_stats:
+    input:
+        aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT,
+                                    '{itemName}',
+                                    'alignment-' + ALIGNER + '.csv'),
+                          itemName=item_names),
+    output:
+        html = join_path(DIR_PROJ, SUBDIR_REPORT,
+                         'alignment-{}.html'.format(ALIGNER)),
+    run:
+        from celseq2.count_umi import plotly_alignment_stats
+        stats_fpaths_labels = [base_name(dir_name(f)) for f in input.aln_item]
+        work = plotly_alignment_stats(
+            fpaths=input.aln_item,
+            saveto=output.html,
+            fnames=stats_fpaths_labels)
+        if not work:
+            shell('touch {output.html}')
+
+
 rule cleanall:
     message: "Remove all files under {DIR_PROJ}"
     run: