From ad95a9d3c613a23982d120287afd99a8d9006edd Mon Sep 17 00:00:00 2001
From: yy1533 <yun.yan@nyumc.org>
Date: Mon, 7 May 2018 10:58:30 -0400
Subject: [PATCH 1/8] :dog: add in-line comment

---
 celseq2/celseq2.py                      |  25 ++++--
 celseq2/workflow/celseq2_beta.snakemake | 101 +++++++++++++++---------
 2 files changed, 81 insertions(+), 45 deletions(-)

diff --git a/celseq2/celseq2.py b/celseq2/celseq2.py
index 44331b5..0af8aae 100644
--- a/celseq2/celseq2.py
+++ b/celseq2/celseq2.py
@@ -29,6 +29,15 @@
     * -n
     * --notemp (--nt)
 
+Name of rules to request outputs:
+    * all (default)
+    * TAG_FASTQ
+    * ANNOTATION
+    * ALIGNMENT
+    * COUNT_MATRIX
+    * QC_COUNT_MATRIX
+    * CELSEQ2_TO_ST (only available for ST data)
+
 Refs:
     - https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html
     - https://bitbucket.org/snakemake/snakemake/src/e11a57fe1f62f3f56c815d95d82871811dae81b3/snakemake/__init__.py?at=master&fileviewer=file-view-default#__init__.py-580:1127
@@ -36,34 +45,38 @@
 
 
 def get_argument_parser():
-    desc = ('CEL-Seq2: A Python Package for Processing CEL-Seq2 RNA-Seq Data.')
+    desc = ('celseq2: A Python Package for Processing CEL-Seq2 RNA-Seq Data.')
     parser = argparse.ArgumentParser(description=desc, add_help=True)
 
     parser.add_argument(
         "target",
         nargs="*",
         default=None,
-        help="Targets to build. May be rules or files.")
+        help="Targets to build. May be rules or files.",
+        choices=[None, 'all', 'TAG_FASTQ', 'ANNOTATION', 'ALIGNMENT',
+                 'COUNT_MATRIX', 'QC_COUNT_MATRIX', 'CELSEQ2_TO_ST'])
     parser.add_argument(
         "--config-file",
         metavar="FILE",
         required=True,
-        help=("Specify details of CEL-Seq2 and gneome information."))
+        help=("Configurations of the details of CEL-Seq2 "
+              " and running environment."))
     parser.add_argument(
         "--experiment-table",
         metavar="FILE",
         required=True,
-        help=("Space/Tab separated file specifying experiment design."))
+        help=("Space/Tab separated file specifying the R1/R2 reads "
+              "and the experiment design."))
     parser.add_argument(
         "--output-dir",
         metavar="DIRECTORY",
         required=True,
-        help=("All results are saved with here as root directory."))
+        help=("All results are saved here as root directory."))
 
     parser.add_argument(
         "--reverse-stranded", "--rs",
         action="store_true", default=False,
-        help="Read has to be mapped to the opposite strand as the feature")
+        help="Reads have to be mapped to the opposite strand of the feature.")
 
     parser.add_argument(
         "--celseq2-to-st", "--st",
diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake
index 79dec53..9e4c589 100644
--- a/celseq2/workflow/celseq2_beta.snakemake
+++ b/celseq2/workflow/celseq2_beta.snakemake
@@ -1,5 +1,3 @@
-######################################################################
-## Dependencies ##
 from celseq2.helper import join_path, base_name, dir_name, print_logger
 from celseq2.helper import rmfolder, mkfolder, is_nonempty_file
 from celseq2.helper import cook_sample_sheet, popen_communicate
@@ -18,16 +16,18 @@ import tempfile
 import shutil
 import os
 
-
-## Inforamtion ##
+'''
+Part-1 Reading Config
+'''
+# Inforamtion
 # '/ifs/home/yy1533/Lab/cel-seq-pipe/demo/celseq2'
 DIR_PROJ = config.get('output_dir', None)
 
-## Sample Sheet ##
+# Sample Sheet
 SAMPLE_TABLE_FPATH = config.get('experiment_table', None)
 SAMPLE_TABLE = cook_sample_sheet(SAMPLE_TABLE_FPATH)  # ''
 
-## CEL-seq2 Tech Setting ##
+# CEL-seq2 Tech Setting
 # '/ifs/data/yanailab/refs/barcodes/barcodes_cel-seq_umis96.tab'
 BC_INDEX_FPATH = config.get('BC_INDEX_FPATH', None)
 BC_IDs_DEFAULT = config.get('BC_IDs_DEFAULT', None)  # '1-96'
@@ -37,7 +37,7 @@ BC_START_POSITION = config.get('BC_START_POSITION', None)
 UMI_LENGTH = config.get('UMI_LENGTH', None)  # 6
 BC_LENGTH = config.get('BC_LENGTH', None)  # 6
 
-## Tools ##
+# Tools
 # '/ifs/data/yanailab/refs/danio_rerio/danRer10_87/genome/Danio_rerio.GRCz10.dna.toplevel'
 BOWTIE2_INDEX_PREFIX = config.get('BOWTIE2_INDEX_PREFIX', None)
 BOWTIE2 = config.get('BOWTIE2', None)  # '/local/apps/bowtie2/2.3.1/bowtie2'
@@ -45,7 +45,7 @@ STAR_INDEX_DIR = config.get('STAR_INDEX_DIR', None)
 STAR = config.get('STAR', None)
 ALIGNER_EXTRA_PARAMETERS = config.get('ALIGNER_EXTRA_PARAMETERS', '')
 
-## Annotations ##
+# Annotations
 # '/ifs/data/yanailab/refs/danio_rerio/danRer10_87/gtf/Danio_rerio.GRCz10.87.gtf.gz'
 GFF = config.get('GFF', None)
 FEATURE_ID = config.get('FEATURE_ID', 'gene_name')
@@ -53,35 +53,29 @@ FEATURE_CONTENT = config.get('FEATURE_CONTENT', 'exon')
 GENE_BIOTYPE = config.get('GENE_BIOTYPE', None)
 if not GENE_BIOTYPE: GENE_BIOTYPE = ();
 
-## Demultiplexing ##
+# Demultiplexing
 FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None)  # 10
 CUT_LENGTH = config.get('CUT_LENGTH', None)  # 35
 SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False) # False
-## Alignment ##
+# Alignment
 ALIGNER = config.get('ALIGNER', None)  # 'bowtie2', 'star'
 assert (ALIGNER), 'Error: Specify aligner.'
 assert (ALIGNER in ['bowtie2', 'star']), 'Error: Unknown aligner.'
 
-## UMI Count ##
+# UMI Count
 ALN_QUAL_MIN = config.get('ALN_QUAL_MIN', None)  # 0
 STRANDED = config.get('stranded', 'yes')
 
-
-## Running Parameters ##
-# bc_ids_used=config.get('bc_ids_used', None)  # '1,2,3,4-5'
+# Running Parameters
 num_threads = config.get('num_threads', 16)  # 5
 verbose = config.get('verbose', True)  # True
 
 RUN_CELSEQ2_TO_ST = config.get('run_celseq2_to_st', False)
 KEEP_INTERMEDIATE = config.get('keep_intermediate', False)
 
-#######################
-## Pipeline reserved ##
-#######################
+# Pipeline reserved variables
 item_names = list(SAMPLE_TABLE.index)
-# expand('{r1_fpath}', r1_fpath=SAMPLE_TABLE['R1'].values)
 sample_list = SAMPLE_TABLE['SAMPLE_NAME'].values
-# expand('{r2_fpath}', r2_fpath=SAMPLE_TABLE['R2'].values)
 R1 = SAMPLE_TABLE['R1'].values
 R2 = SAMPLE_TABLE['R2'].values
 bc_used = SAMPLE_TABLE['CELL_BARCODES_INDEX'].values
@@ -120,14 +114,15 @@ SUBDIRS = [SUBDIR_INPUT,
 #                      "_low_map_qual", '_multimapped', "_uniquemapped",
 #                      "_no_feature", "_ambiguous",
 #                      "_total"]
-## Dev ##
-
-#####################
-## Snakemake rules ##
-#####################
 
+'''
+Part-2: Snakemake rules
+'''
 workdir: DIR_PROJ
 
+'''
+Default task named "all" to request all outputs.
+'''
 if RUN_CELSEQ2_TO_ST:
     rule all:
         input:
@@ -166,6 +161,9 @@ else:
                 rmfolder(SUBDIR_UMI_CNT); rmfolder(SUBDIR_UMI_SET);
 
 
+'''
+Subtask named "COUNT_MATRIX" to request the UMIs matrices outputs
+'''
 rule COUNT_MATRIX:
     input:
         csv = expand(join_path(DIR_PROJ, SUBDIR_EXPR, '{expid}', 'expr.csv'),
@@ -185,6 +183,9 @@ rule COUNT_MATRIX:
 
         print_logger('UMI-count matrix is saved at {}'.format(input.csv))
 
+'''
+Subtask named "QC_COUNT_MATRIX" to request the QCs plots of UMIs matrices
+'''
 rule QC_COUNT_MATRIX:
     input:
         html = expand(join_path(DIR_PROJ, SUBDIR_QC_EXPR,
@@ -192,8 +193,9 @@ rule QC_COUNT_MATRIX:
                       expid=list(set(sample_list))),
     message: 'Finished QC UMI-count matrix.'
 
-
-# join_path(DIR_PROJ, SUBDIR_QC_EXPR, '{expid}', 'QC_ST.html')
+'''
+Subtask named "CELSEQ2_TO_ST" to request spatial-transcriptomics data format.
+'''
 if RUN_CELSEQ2_TO_ST:
     rule CELSEQ2_TO_ST:
         input:
@@ -254,7 +256,10 @@ if RUN_CELSEQ2_TO_ST:
 #         touch('_done_report')
 
 
-# Combo-demultiplexing
+# Pipeline Step 1a: combo_demultiplexing
+# Inputs: R1 (barcode) and R2 (mRNA)
+# Output: A list of FASTQs extracted from R2 per cells and they are tagged with
+# cell barcode and UMI sequence gained from R1.
 rule combo_demultiplexing:
     input: SAMPLE_TABLE_FPATH,
     output:
@@ -305,13 +310,18 @@ rule combo_demultiplexing:
         p.join()
         shell('touch _done_combodemultiplex')
 
+
+'''
+Subtask named "TAG_FASTQ" to request intermediate files: mRNA.fastq tagged
+with cell id and UMI seq gained from barcode.fastq
+'''
 rule TAG_FASTQ:
     input:
         expand(join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemID}', 'TAGGED.bigfastq'),
                itemID=item_names),
     message: "Finished Tagging FASTQs."
 
-
+# Pipeline Step 1b: Merge tagged FASTQs
 rule tag_fastq:
     input:
         fq = dynamic(join_path(DIR_PROJ, SUBDIR_FASTQ,
@@ -332,7 +342,7 @@ rule tag_fastq:
             print_logger('Tagged FQ: {}'.format(itemid_tag_fq))
 
 
-## Alignment ##
+# Pipeline Step 2a: Alignment
 rule ALIGNMENT:
     input:
         expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', ALIGNER+'.bigsam'),
@@ -401,7 +411,7 @@ if ALIGNER == 'star':
             shell('mv {starsam} {output.sam} ')
             shell('mv {starlog} {output.log} ')
 
-
+# Pipeline Step 2b: Combo-demultiplex the SAM file
 rule combo_demultiplexing_sam:
     input:
         sam = expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM,
@@ -436,7 +446,11 @@ rule combo_demultiplexing_sam:
         shell('touch _done_combodemultiplex_sam')
 
 
-## HT-seq Count UMI ##
+# Pipeline Step 3a: cook ht-seq recognized feature object before counting UMIs
+# Input: GTF/GFF file
+# Output: a pickle file saving a tuple (features, exported_genes) where:
+#     - features: HTSeq.GenomicArrayOfSets(). *All* exons locations ~ genes.
+#     - exported_genes: a sorted list. Gene id / gene name (default all genes).
 rule COOK_ANNOTATION:
     input:
         SAMPLE_TABLE_FPATH,
@@ -463,7 +477,7 @@ rule COOK_ANNOTATION:
         if params.gene_type:
             print_logger('Types of reported gene: {}.'.format(
                 ', '.join(params.gene_type)))
-            gene_df = get_genes(GFF, valid_biotypes = set(params.gene_type))
+            gene_df = get_genes(GFF, valid_biotypes=set(params.gene_type))
             exported_genes = sorted(gene_df['name'].values)
             print_logger('Number of reported genes: {}.'.format(
                 len(exported_genes)))
@@ -472,11 +486,17 @@ rule COOK_ANNOTATION:
             shell('touch {output.anno_csv}')
 
         with open(output.anno_pkl, 'wb') as fh:
-            pickle.dump( (features, exported_genes), fh)
+            pickle.dump((features, exported_genes), fh)
 
         shell('touch {output.flag} ')
 
-
+# Pipeline Step 3b: Count UMIs
+# Inputs:
+#   - annotation object
+#   - SAM per cell
+# Outputs: two pickle files per cell
+#   - umicnt: dict(str: set(str)), i.e., dict(gene ~ set(UMI_sequence))
+#   - umiset: Counter(str: int), i.e., Counter(gene ~ "number of UMIs")
 rule count_umi:
     input:
         gff = rules.COOK_ANNOTATION.output.anno_pkl,
@@ -499,7 +519,9 @@ rule count_umi:
         pickle.dump(umi_set, open(output.umiset, 'wb'))
 
 
-# - regular umi-count using *_umicnt.pkl -> umi_count_matrix replicates/lanes per plate
+# Pipeline Step 4a: Merge UMIs of cells to UMI matrix of item
+# Input: a list of umicnt pickle files of cells per item
+# Output: UMI-count matric file (csv & hdf) per item
 rule summarize_umi_matrix_per_item:
     input:
         gff = rules.COOK_ANNOTATION.output.anno_pkl,
@@ -524,6 +546,7 @@ rule summarize_umi_matrix_per_item:
             item_id = base_name(dir_name(f))  # item-1
             item_expr_matrix[item_id][bc_name] = pickle.load(open(f, 'rb'))
 
+        # export to csv/hdf
         for item_id, expr_dict in item_expr_matrix.items():
             exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME']  # E1
 
@@ -537,12 +560,11 @@ rule summarize_umi_matrix_per_item:
             expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR,
                                      exp_id, item_id, 'expr.h5'), 'table')
 
-
-# - merge umi-count using *_umiset.pkl -> correct umi count per experiment/plate
+# Pipeline Step 4b: Merge UMIs of cells to UMI matrix per experiment
+# Input: a list of umiset pickle files of cells per experiment
+# Output: UMI-count matric file (csv & hdf) per experiment
 rule summarize_umi_matrix_per_experiment:
     input:
-        # gff = join_path(DIR_PROJ, SUBDIR_ANNO,
-        #                 base_name(GFF) + '.pickle'),
         gff = rules.COOK_ANNOTATION.output.anno_pkl,
         umiset = dynamic(join_path(DIR_PROJ, SUBDIR_UMI_SET,
                                    '{itemID}', '{bcID}.pkl')),
@@ -578,6 +600,7 @@ rule summarize_umi_matrix_per_experiment:
                 y2 = umiset_stream.get(x, set())
                 exp_expr_matrix[exp_id][bc_name][x] = y1 | y2
 
+        # export to csv/hdf
         for exp_id, expr_dict in exp_expr_matrix.items():
             for bc, cnt in expr_dict.items():
                 cnt = _flatten_umi_set(cnt)

From f11f7835e0746fb4deb205f3f0e0f1fcdc48c0d8 Mon Sep 17 00:00:00 2001
From: yy1533 <yun.yan@nyumc.org>
Date: Tue, 8 May 2018 16:26:21 -0400
Subject: [PATCH 2/8] :beers: plotly graph on demultiplexing fastqs

---
 celseq2/demultiplex.py                  | 59 ++++++++++++++-
 celseq2/workflow/celseq2_beta.snakemake | 95 +++++++++++++++++--------
 2 files changed, 123 insertions(+), 31 deletions(-)

diff --git a/celseq2/demultiplex.py b/celseq2/demultiplex.py
index dbd7538..2e09e1c 100644
--- a/celseq2/demultiplex.py
+++ b/celseq2/demultiplex.py
@@ -5,7 +5,11 @@
 import argparse
 
 from celseq2.helper import filehandle_fastq_gz, print_logger
-from celseq2.helper import join_path, mkfolder
+from celseq2.helper import join_path, mkfolder, base_name
+
+import plotly.graph_objs as go
+from plotly.offline import plot
+import pandas as pd
 
 
 def str2int(s):
@@ -212,6 +216,59 @@ def write_demultiplexing(stats, dict_bc_id2seq, stats_fpath):
                                     stats['total'] / stats['total'] * 100))
 
 
+def plotly_demultiplexing_stats(fpaths=[], saveto='', fnames=[]):
+    '''
+    Save a plotly box graph with a list of demultiplexing stats files
+
+    Parameters
+    ----------
+    fpaths : list
+        A list of file paths
+    saveto : str
+        File path to save the html file as the plotly box graph
+    fnames : list
+        A list of strings to label each ``fpaths``
+
+    Returns
+    -------
+    bool
+        True if saving successfully, False otherwise
+    '''
+
+    if not fnames:
+        fnames = [base_name(f) for f in fpaths]
+    if len(fnames) != len(fpaths):
+        fnames = [base_name(f) for f in fpaths]
+
+    num_reads_data = []
+    for i in range(len(fpaths)):
+        f = fpaths[i]
+        fname = fnames[i]
+
+        stats = pd.read_csv(f, index_col=0)
+        cell_stats = stats.iloc[:-5, :]
+        # tail 5 lines are fixed as the overall stats
+        overall_stats = stats.iloc[-5:, :]
+        num_reads_data.append(
+            go.Box(
+                y=cell_stats['Reads(#)'],
+                name='{} (#Saved={}/#Total={})'.format(
+                    fname,
+                    overall_stats.loc['saved', 'Reads(#)'],
+                    overall_stats.loc['total', 'Reads(#)'])))
+
+    layout = go.Layout(
+        xaxis=dict(showticklabels=False),
+        title='Number of reads saved per BC per item')
+    fig = go.Figure(data=num_reads_data, layout=layout)
+    try:
+        plot(fig, filename=saveto, auto_open=False)
+        return(True)
+    except Exception as e:
+        print(e, flush=True)
+        return(False)
+
+
 def main():
     parser = argparse.ArgumentParser(add_help=True)
     parser.add_argument('read1_fpath', type=str)
diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake
index 9e4c589..f6fcade 100644
--- a/celseq2/workflow/celseq2_beta.snakemake
+++ b/celseq2/workflow/celseq2_beta.snakemake
@@ -3,7 +3,8 @@ from celseq2.helper import rmfolder, mkfolder, is_nonempty_file
 from celseq2.helper import cook_sample_sheet, popen_communicate
 from celseq2.prepare_annotation_model import cook_anno_model
 from celseq2.count_umi import count_umi, _flatten_umi_set
-from celseq2.parse_log import parse_bowtie2_report, parse_star_report, merge_reports
+# from celseq2.parse_log import parse_bowtie2_report, parse_star_report, merge_reports
+from celseq2.demultiplex import plotly_demultiplexing_stats
 import pandas as pd
 import glob
 import pickle
@@ -51,12 +52,13 @@ GFF = config.get('GFF', None)
 FEATURE_ID = config.get('FEATURE_ID', 'gene_name')
 FEATURE_CONTENT = config.get('FEATURE_CONTENT', 'exon')
 GENE_BIOTYPE = config.get('GENE_BIOTYPE', None)
-if not GENE_BIOTYPE: GENE_BIOTYPE = ();
+if not GENE_BIOTYPE:
+    GENE_BIOTYPE = ()
 
 # Demultiplexing
 FASTQ_QUAL_MIN_OF_BC = config.get('FASTQ_QUAL_MIN_OF_BC', None)  # 10
 CUT_LENGTH = config.get('CUT_LENGTH', None)  # 35
-SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False) # False
+SAVE_UNKNOWN_BC_FASTQ = config.get('SAVE_UNKNOWN_BC_FASTQ', False)  # False
 # Alignment
 ALIGNER = config.get('ALIGNER', None)  # 'bowtie2', 'star'
 assert (ALIGNER), 'Error: Specify aligner.'
@@ -136,11 +138,13 @@ if RUN_CELSEQ2_TO_ST:
                 shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
             if not KEEP_INTERMEDIATE:
                 print_logger('Cleaning... ')
-                rmfolder(SUBDIR_FASTQ);
+                rmfolder(SUBDIR_FASTQ)
                 # rmfolder(SUBDIR_ANNO);
-                rmfolder(SUBDIR_ALIGN); rmfolder(SUBDIR_ALIGN_ITEM);
-                rmfolder(SUBDIR_LOG);
-                rmfolder(SUBDIR_UMI_CNT); rmfolder(SUBDIR_UMI_SET);
+                rmfolder(SUBDIR_ALIGN)
+                rmfolder(SUBDIR_ALIGN_ITEM)
+                rmfolder(SUBDIR_LOG)
+                rmfolder(SUBDIR_UMI_CNT)
+                rmfolder(SUBDIR_UMI_SET)
 
 else:
     rule all:
@@ -154,11 +158,13 @@ else:
                 shell('mv -f celseq2_job*.sh* {}'.format(SUBDIR_QSUB))
             if not KEEP_INTERMEDIATE:
                 print_logger('Cleaning... ')
-                rmfolder(SUBDIR_FASTQ);
+                rmfolder(SUBDIR_FASTQ)
                 # rmfolder(SUBDIR_ANNO);
-                rmfolder(SUBDIR_ALIGN); rmfolder(SUBDIR_ALIGN_ITEM);
-                rmfolder(SUBDIR_LOG);
-                rmfolder(SUBDIR_UMI_CNT); rmfolder(SUBDIR_UMI_SET);
+                rmfolder(SUBDIR_ALIGN)
+                rmfolder(SUBDIR_ALIGN_ITEM)
+                rmfolder(SUBDIR_LOG)
+                rmfolder(SUBDIR_UMI_CNT)
+                rmfolder(SUBDIR_UMI_SET)
 
 
 '''
@@ -236,7 +242,8 @@ if RUN_CELSEQ2_TO_ST:
         input:
             tsv = join_path(DIR_PROJ, SUBDIR_ST, '{expid}', 'ST.tsv'),
         output:
-            html = join_path(DIR_PROJ, SUBDIR_QC_EXPR, '{expid}', 'QC_ST.html'),
+            html = join_path(DIR_PROJ, SUBDIR_QC_EXPR,
+                             '{expid}', 'QC_ST.html'),
         params:
             expid = '{expid}',
         run:
@@ -258,8 +265,9 @@ if RUN_CELSEQ2_TO_ST:
 
 # Pipeline Step 1a: combo_demultiplexing
 # Inputs: R1 (barcode) and R2 (mRNA)
-# Output: A list of FASTQs extracted from R2 per cells and they are tagged with
-# cell barcode and UMI sequence gained from R1.
+# Output:
+#   - A list of FASTQs extracted from R2 per cells and they are tagged with cell
+#     barcode and UMI sequence gained from R1.
 rule combo_demultiplexing:
     input: SAMPLE_TABLE_FPATH,
     output:
@@ -345,17 +353,18 @@ rule tag_fastq:
 # Pipeline Step 2a: Alignment
 rule ALIGNMENT:
     input:
-        expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', ALIGNER+'.bigsam'),
+        expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', ALIGNER + '.bigsam'),
                itemID=item_names),
     message: "Finished Alignment."
 
 if ALIGNER == 'bowtie2':
     rule align_bowtie2:
         input:
-            fq = join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemID}', 'TAGGED.bigfastq'),
+            fq = join_path(DIR_PROJ, SUBDIR_FASTQ,
+                           '{itemID}', 'TAGGED.bigfastq'),
         output:
             sam = join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM,
-                            '{itemID}', ALIGNER+'.bigsam'),
+                            '{itemID}', ALIGNER + '.bigsam'),
         params:
             threads = num_threads,
             aligner_extra_parameters = ALIGNER_EXTRA_PARAMETERS,
@@ -376,10 +385,11 @@ if ALIGNER == 'bowtie2':
 if ALIGNER == 'star':
     rule align_star:
         input:
-            fq = join_path(DIR_PROJ, SUBDIR_FASTQ, '{itemID}', 'TAGGED.bigfastq'),
+            fq = join_path(DIR_PROJ, SUBDIR_FASTQ,
+                           '{itemID}', 'TAGGED.bigfastq'),
         output:
             sam = join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM,
-                            '{itemID}', ALIGNER+'.bigsam'),
+                            '{itemID}', ALIGNER + '.bigsam'),
             log = join_path(DIR_PROJ, SUBDIR_LOG,
                             '{itemID}', 'Align-STAR.log'),
             # starsam = join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM, '{itemID}', '.star',
@@ -415,10 +425,11 @@ if ALIGNER == 'star':
 rule combo_demultiplexing_sam:
     input:
         sam = expand(join_path(DIR_PROJ, SUBDIR_ALIGN_ITEM,
-                               '{itemID}', ALIGNER+'.bigsam'),
-                     itemID = item_names),
+                               '{itemID}', ALIGNER + '.bigsam'),
+                     itemID=item_names),
     output:
-        sam = dynamic(join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemID}', '{bcID}.sam')),
+        sam = dynamic(join_path(DIR_PROJ, SUBDIR_ALIGN,
+                                '{itemID}', '{bcID}.sam')),
     params:
         jobs = len(item_names),
         claim_used_bc = True,
@@ -449,8 +460,8 @@ rule combo_demultiplexing_sam:
 # Pipeline Step 3a: cook ht-seq recognized feature object before counting UMIs
 # Input: GTF/GFF file
 # Output: a pickle file saving a tuple (features, exported_genes) where:
-#     - features: HTSeq.GenomicArrayOfSets(). *All* exons locations ~ genes.
-#     - exported_genes: a sorted list. Gene id / gene name (default all genes).
+#     - features: HTSeq.GenomicArrayOfSets(). *All* exons locations ~ set(genes).
+#     - exported_genes: a sorted list. Gene id/name. (Default all genes are exported).
 rule COOK_ANNOTATION:
     input:
         SAMPLE_TABLE_FPATH,
@@ -495,17 +506,17 @@ rule COOK_ANNOTATION:
 #   - annotation object
 #   - SAM per cell
 # Outputs: two pickle files per cell
-#   - umicnt: dict(str: set(str)), i.e., dict(gene ~ set(UMI_sequence))
-#   - umiset: Counter(str: int), i.e., Counter(gene ~ "number of UMIs")
+#   - umicnt: dict(str: set(str)) i.e., dict(gene ~ set(UMI_sequence))
+#   - umiset: Counter(str: int) i.e., Counter(gene ~ number of UMIs)
 rule count_umi:
     input:
         gff = rules.COOK_ANNOTATION.output.anno_pkl,
         sam = join_path(DIR_PROJ, SUBDIR_ALIGN, '{itemID}', '{bcID}.sam'),
     output:
         umicnt = join_path(DIR_PROJ, SUBDIR_UMI_CNT,
-                                '{itemID}', '{bcID}.pkl'),
+                           '{itemID}', '{bcID}.pkl'),
         umiset = join_path(DIR_PROJ, SUBDIR_UMI_SET,
-                                '{itemID}', '{bcID}.pkl'),
+                           '{itemID}', '{bcID}.pkl'),
     message: 'Counting {input.sam}'
     run:
         features_f, _ = pickle.load(open(input.gff, 'rb'))
@@ -538,7 +549,7 @@ rule summarize_umi_matrix_per_item:
     run:
         _, export_genes = pickle.load(open(input.gff, 'rb'))
 
-        # item -> dict(cell_bc -> Counter(umi_vector))
+        # { item -> dict(cell_bc -> Counter(umi_vector)) }
         item_expr_matrix = defaultdict(dict)
 
         for f in input.umicnt:
@@ -579,7 +590,7 @@ rule summarize_umi_matrix_per_experiment:
 
         sample_list = SAMPLE_TABLE['SAMPLE_NAME'].values
 
-        # experiment_id -> dict(cell_bc -> dict(gname -> set(umi)))
+        # { experiment_id -> dict(cell_bc -> dict(gname -> set(umi))) }
         exp_expr_matrix = {}
 
         for exp_id in list(set(sample_list)):
@@ -655,6 +666,30 @@ rule qc_umi_matrix_per_experiment:
 #                               savetocsv=report_item)
 
 
+rule REPORT:
+    input:
+        demultiplexing_fastq = join_path(DIR_PROJ, SUBDIR_REPORT,
+                                         'demultiplexing_fastq.html')
+
+# Inputs: project/report/item-*/demultiplexing.csv
+# Outputs:
+#   - Stats file (csv) per item.
+#   - Plotly box graph for all the items.
+rule report_combo_demultiplexing:
+    output:
+        html = join_path(DIR_PROJ, SUBDIR_REPORT, 'demultiplexing_fastq.html')
+    run:
+        # Prepare the stats files
+        stats_fpaths = glob.glob(join_path(DIR_PROJ, SUBDIR_REPORT,
+                                           'item-*', 'demultiplexing.csv'))
+        stats_fpaths_labels = [base_name(base_name(f)) for f in stats_fpaths]
+        work = plotly_demultiplexing_stats(
+            fpaths=stats_fpaths,
+            saveto=output.html,
+            fnames=stats_fpaths_labels)
+        if not work:
+            touch('{output.html}')
+
 rule cleanall:
     message: "Remove all files under {DIR_PROJ}"
     run:

From cfebaf2fbeab496870cee05829a1946e136fc498 Mon Sep 17 00:00:00 2001
From: yy1533 <yun.yan@nyumc.org>
Date: Tue, 8 May 2018 16:43:52 -0400
Subject: [PATCH 3/8] :pill: plotly graph on demultiplexing fastqs

---
 celseq2/celseq2.py                      | 10 +++++++---
 celseq2/workflow/celseq2_beta.snakemake |  5 +++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/celseq2/celseq2.py b/celseq2/celseq2.py
index 0af8aae..4b106d5 100644
--- a/celseq2/celseq2.py
+++ b/celseq2/celseq2.py
@@ -43,6 +43,10 @@
     - https://bitbucket.org/snakemake/snakemake/src/e11a57fe1f62f3f56c815d95d82871811dae81b3/snakemake/__init__.py?at=master&fileviewer=file-view-default#__init__.py-580:1127
 '''
 
+task_choices = ['all', 'TAG_FASTQ', 'ANNOTATION', 'ALIGNMENT',
+                'COUNT_MATRIX', 'QC_COUNT_MATRIX', 'CELSEQ2_TO_ST',
+                'REPORT']
+
 
 def get_argument_parser():
     desc = ('celseq2: A Python Package for Processing CEL-Seq2 RNA-Seq Data.')
@@ -52,9 +56,9 @@ def get_argument_parser():
         "target",
         nargs="*",
         default=None,
-        help="Targets to build. May be rules or files.",
-        choices=[None, 'all', 'TAG_FASTQ', 'ANNOTATION', 'ALIGNMENT',
-                 'COUNT_MATRIX', 'QC_COUNT_MATRIX', 'CELSEQ2_TO_ST'])
+        help=('Targets to build. '
+              'May be rules or files. '
+              'Task choices: {}').format(', '.join(task_choices)))
     parser.add_argument(
         "--config-file",
         metavar="FILE",
diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake
index f6fcade..a82c194 100644
--- a/celseq2/workflow/celseq2_beta.snakemake
+++ b/celseq2/workflow/celseq2_beta.snakemake
@@ -679,10 +679,11 @@ rule report_combo_demultiplexing:
     output:
         html = join_path(DIR_PROJ, SUBDIR_REPORT, 'demultiplexing_fastq.html')
     run:
-        # Prepare the stats files
+        # stats files [/path/item-1/d.csv, /path/item-2/d.csv]
         stats_fpaths = glob.glob(join_path(DIR_PROJ, SUBDIR_REPORT,
                                            'item-*', 'demultiplexing.csv'))
-        stats_fpaths_labels = [base_name(base_name(f)) for f in stats_fpaths]
+        # labels [item-1, item-2]
+        stats_fpaths_labels = [base_name(dir_name(f)) for f in stats_fpaths]
         work = plotly_demultiplexing_stats(
             fpaths=stats_fpaths,
             saveto=output.html,

From f7f3388fbdc0dabfab935e50653a35ad89d03c85 Mon Sep 17 00:00:00 2001
From: yy1533 <yun.yan@nyumc.org>
Date: Tue, 8 May 2018 16:53:31 -0400
Subject: [PATCH 4/8] :muscle: plotly graph on demultiplexing fastqs

---
 celseq2/demultiplex.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/celseq2/demultiplex.py b/celseq2/demultiplex.py
index 2e09e1c..356ec99 100644
--- a/celseq2/demultiplex.py
+++ b/celseq2/demultiplex.py
@@ -258,6 +258,7 @@ def plotly_demultiplexing_stats(fpaths=[], saveto='', fnames=[]):
                     overall_stats.loc['total', 'Reads(#)'])))
 
     layout = go.Layout(
+        legend=dict(x=-.1, y=-.2),
         xaxis=dict(showticklabels=False),
         title='Number of reads saved per BC per item')
     fig = go.Figure(data=num_reads_data, layout=layout)

From 0493caa721925d92fe5a3db2ac3424f8ea0e8bb2 Mon Sep 17 00:00:00 2001
From: yy1533 <yun.yan@nyumc.org>
Date: Wed, 9 May 2018 11:52:33 -0400
Subject: [PATCH 5/8] :pill: forgot to claim the value is median

---
 celseq2/qc.py      | 4 +++-
 celseq2/version.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/celseq2/qc.py b/celseq2/qc.py
index 5475ff2..08355c9 100644
--- a/celseq2/qc.py
+++ b/celseq2/qc.py
@@ -218,7 +218,8 @@ def plotly_qc_st(fpath, saveto, sep='\t', name=''):
         mask_by=ST_qc.total_num_UMIs,
         hover_text=ST_qc.total_num_UMIs.astype('str'),
         colorscale='Viridis',
-        mask_title='#Total UMIs {})'.format(ST_qc.total_num_UMIs.median()))
+        mask_title=('#Total UMIs '
+                    '(median={})').format(ST_qc.total_num_UMIs.median()))
     # 3/3
     plotly_ST_mt = plotly_scatter(
         x=ST_qc.Row, y=ST_qc.Col,
@@ -239,6 +240,7 @@ def plotly_qc_st(fpath, saveto, sep='\t', name=''):
     fig['layout'].update(height=600, width=1900, title=name)
 
     fig.layout.showlegend = False
+    # Manually change the locations of other two color bars to proper places
     fig.data[0].marker.colorbar.x = 0.28
     fig.data[1].marker.colorbar.x = 0.64
 
diff --git a/celseq2/version.py b/celseq2/version.py
index 45869b6..ed7d50e 100644
--- a/celseq2/version.py
+++ b/celseq2/version.py
@@ -1 +1 @@
-__version__ = '0.5.2'
+__version__ = '0.5.3'

From 5f9bc60610b313d953279e0f2b3cc7ca23411d42 Mon Sep 17 00:00:00 2001
From: yy1533 <yun.yan@nyumc.org>
Date: Wed, 9 May 2018 17:04:54 -0400
Subject: [PATCH 6/8] :beers: plotly graph on the alignment stats

---
 celseq2/count_umi.py                    | 62 +++++++++++++++++++++
 celseq2/demultiplex.py                  |  2 +-
 celseq2/workflow/celseq2_beta.snakemake | 73 +++++++++++++++++++++++--
 3 files changed, 130 insertions(+), 7 deletions(-)

diff --git a/celseq2/count_umi.py b/celseq2/count_umi.py
index db71581..3d77739 100644
--- a/celseq2/count_umi.py
+++ b/celseq2/count_umi.py
@@ -8,6 +8,10 @@
 import pickle
 import argparse
 from collections import defaultdict, Counter
+import plotly.graph_objs as go
+from plotly.offline import plot
+import pandas as pd
+from celseq2.helper import base_name
 
 
 def invert_strand(iv):
@@ -102,6 +106,64 @@ def _flatten_umi_set(umi_set):
 #     pass
 
 
+def plotly_alignment_stats(fpaths=[], saveto='', fnames=[]):
+    '''
+    Save a plotly box graph with a list of alignment stats files
+
+    Parameters
+    ----------
+    fpaths : list
+        A list of file paths
+    saveto : str
+        File path to save the html file as the plotly box graph
+    fnames : list
+        A list of strings to label each ``fpaths``
+
+    Returns
+    -------
+    bool
+        True if saving successfully, False otherwise
+    '''
+    if not fnames:
+        fnames = [base_name(f) for f in fpaths]
+    if len(fnames) != len(fpaths):
+        fnames = [base_name(f) for f in fpaths]
+    trace_data = []
+    # aln_diagnose_item = ["_unmapped",
+    #                      "_low_map_qual", '_multimapped', "_uniquemapped",
+    #                      "_no_feature", "_ambiguous",
+    #                      "_total"]
+    for i in range(len(fpaths)):
+        f = fpaths[i]
+        fname = fnames[i]
+
+        stats = pd.read_csv(f, index_col=0)
+
+        mapped = stats.loc['_multimapped', :] + stats.loc['_uniquemapped', :]
+        rate_mapped = mapped / stats.loc['_total', :]
+
+        overall_mapped = mapped.sum()
+        overall_total = stats.loc['_total', :].sum()
+
+        stats.fillna(value=0, inplace=True)
+        trace_data.append(
+            go.Box(
+                y=rate_mapped,
+                name='{} (#Mapped={}/#Total={})'.format(
+                    fname, overall_mapped, overall_total)))
+
+    layout = go.Layout(
+        xaxis=dict(showticklabels=False),
+        title='Mapped/Total alignments per BC per item')
+    fig = go.Figure(data=trace_data, layout=layout)
+    try:
+        plot(fig, filename=saveto, auto_open=False)
+        return(True)
+    except Exception as e:
+        print(e, flush=True)
+        return(False)
+
+
 def main():
     parser = argparse.ArgumentParser(add_help=True)
     parser.add_argument('--sam_fpath', type=str, metavar='FILENAME',
diff --git a/celseq2/demultiplex.py b/celseq2/demultiplex.py
index 356ec99..c7be5e3 100644
--- a/celseq2/demultiplex.py
+++ b/celseq2/demultiplex.py
@@ -258,7 +258,7 @@ def plotly_demultiplexing_stats(fpaths=[], saveto='', fnames=[]):
                     overall_stats.loc['total', 'Reads(#)'])))
 
     layout = go.Layout(
-        legend=dict(x=-.1, y=-.2),
+        # legend=dict(x=-.1, y=-.2),
         xaxis=dict(showticklabels=False),
         title='Number of reads saved per BC per item')
     fig = go.Figure(data=num_reads_data, layout=layout)
diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake
index a82c194..c93ad1f 100644
--- a/celseq2/workflow/celseq2_beta.snakemake
+++ b/celseq2/workflow/celseq2_beta.snakemake
@@ -96,6 +96,7 @@ SUBDIR_ALIGN = 'small_sam'
 SUBDIR_ALIGN_ITEM = 'item_sam'
 SUBDIR_UMI_CNT = 'small_umi_count'
 SUBDIR_UMI_SET = 'small_umi_set'
+SUBDIR_ALN_STATS = 'small_aln_stats'
 SUBDIR_EXPR = 'expr'
 SUBDIR_ST = 'ST'
 SUBDIR_LOG = 'small_log'
@@ -106,16 +107,12 @@ SUBDIR_QC_EXPR = 'qc_expr'
 
 SUBDIRS = [SUBDIR_INPUT,
            SUBDIR_FASTQ, SUBDIR_ALIGN, SUBDIR_ALIGN_ITEM,
-           SUBDIR_UMI_CNT, SUBDIR_UMI_SET,
+           SUBDIR_UMI_CNT, SUBDIR_UMI_SET, SUBDIR_ALN_STATS,
            SUBDIR_EXPR,
            SUBDIR_REPORT, SUBDIR_QC_EXPR,
            SUBDIR_LOG, SUBDIR_QSUB, SUBDIR_ANNO
            ]
 
-# aln_diagnose_item = ["_unmapped",
-#                      "_low_map_qual", '_multimapped', "_uniquemapped",
-#                      "_no_feature", "_ambiguous",
-#                      "_total"]
 
 '''
 Part-2: Snakemake rules
@@ -145,6 +142,7 @@ if RUN_CELSEQ2_TO_ST:
                 rmfolder(SUBDIR_LOG)
                 rmfolder(SUBDIR_UMI_CNT)
                 rmfolder(SUBDIR_UMI_SET)
+                rmfolder(SUBDIR_ALN_STATS)
 
 else:
     rule all:
@@ -165,6 +163,7 @@ else:
                 rmfolder(SUBDIR_LOG)
                 rmfolder(SUBDIR_UMI_CNT)
                 rmfolder(SUBDIR_UMI_SET)
+                rmfolder(SUBDIR_ALN_STATS)
 
 
 '''
@@ -517,6 +516,8 @@ rule count_umi:
                            '{itemID}', '{bcID}.pkl'),
         umiset = join_path(DIR_PROJ, SUBDIR_UMI_SET,
                            '{itemID}', '{bcID}.pkl'),
+        alncnt = join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER,
+                           '{itemID}', '{bcID}.pkl'),
     message: 'Counting {input.sam}'
     run:
         features_f, _ = pickle.load(open(input.gff, 'rb'))
@@ -528,6 +529,7 @@ rule count_umi:
                                               dumpto=None)
         pickle.dump(umi_cnt, open(output.umicnt, 'wb'))
         pickle.dump(umi_set, open(output.umiset, 'wb'))
+        pickle.dump(aln_cnt, open(output.alncnt, 'wb'))
 
 
 # Pipeline Step 4a: Merge UMIs of cells to UMI matrix of item
@@ -637,6 +639,46 @@ rule qc_umi_matrix_per_experiment:
         cmd += '--sep {}'.format(',')
         shell(cmd)
 
+
+rule summarize_aln_stats_per_item:
+    input:
+        alncnt = dynamic(join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER,
+                                   '{itemID}', '{bcID}.pkl')),
+    output:
+        aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT,
+                                    '{itemID}',
+                                    'alignment-' + ALIGNER + '.csv'),
+                          itemID=item_names),
+    run:
+        aln_diagnose_item = ["_unmapped",
+                             "_low_map_qual", '_multimapped', "_uniquemapped",
+                             "_no_feature", "_ambiguous",
+                             "_total"]
+
+        # { item -> dict(cell_bc -> Counter(stats)) }
+        item_stats = defaultdict(dict)
+
+        for f in input.alncnt:
+            bc_name = base_name(f)  # BC-1-xxx
+            item_id = base_name(dir_name(f))  # item-1
+            item_stats[item_id][bc_name] = pickle.load(open(f, 'rb'))
+
+        # export to csv
+        for item_id, aln_dict in item_stats.items():
+            exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME']  # E1
+
+            for bc, cnt in aln_dict.items():
+                aln_dict[bc] = pd.Series([cnt[x] for x in aln_diagnose_item],
+                                         index=aln_diagnose_item)
+
+            aln_stats_df = pd.DataFrame(
+                aln_dict, index=aln_diagnose_item).fillna(0)
+
+            aln_stats_df.to_csv(join_path(DIR_PROJ, SUBDIR_REPORT,
+                                          item_id,
+                                          'alignment-' + ALIGNER + '.csv'))
+
+
 # rule report_alignment_log:
 #     input:
 #         df = dynamic(join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER,
@@ -669,7 +711,9 @@ rule qc_umi_matrix_per_experiment:
 rule REPORT:
     input:
         demultiplexing_fastq = join_path(DIR_PROJ, SUBDIR_REPORT,
-                                         'demultiplexing_fastq.html')
+                                         'demultiplexing_fastq.html'),
+        alignment_stats = join_path(DIR_PROJ, SUBDIR_REPORT,
+                                    'alignment-{}.html'.format(ALIGNER)),
 
 # Inputs: project/report/item-*/demultiplexing.csv
 # Outputs:
@@ -691,6 +735,23 @@ rule report_combo_demultiplexing:
         if not work:
             touch('{output.html}')
 
+rule report_alignment_stats:
+    input:
+        aln_item = rules.summarize_aln_stats_per_item.output.aln_item,
+    output:
+        html = join_path(DIR_PROJ, SUBDIR_REPORT,
+                         'alignment-{}.html'.format(ALIGNER)),
+    run:
+        from celseq2.count_umi import plotly_alignment_stats
+        stats_fpaths_labels = [base_name(dir_name(f)) for f in input.aln_item]
+        work = plotly_alignment_stats(
+            fpaths=input.aln_item,
+            saveto=output.html,
+            fnames=stats_fpaths_labels)
+        if not work:
+            touch('{output.html}')
+
+
 rule cleanall:
     message: "Remove all files under {DIR_PROJ}"
     run:

From 805d6d053767360f7db2733eaae75d5c0b9caf05 Mon Sep 17 00:00:00 2001
From: yy1533 <yun.yan@nyumc.org>
Date: Thu, 10 May 2018 15:14:48 -0400
Subject: [PATCH 7/8] :beer: cell name has bc id attached; :beer: merge plotly
 for alignment.html to the pipeline

---
 celseq2/workflow/celseq2_beta.snakemake | 63 ++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 13 deletions(-)

diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake
index c93ad1f..7690b12 100644
--- a/celseq2/workflow/celseq2_beta.snakemake
+++ b/celseq2/workflow/celseq2_beta.snakemake
@@ -127,6 +127,7 @@ if RUN_CELSEQ2_TO_ST:
         input:
             '_done_UMI',
             '_done_ST',
+            '_done_report',
         output:
             touch('_DONE'),
         run:
@@ -148,6 +149,7 @@ else:
     rule all:
         input:
             '_done_UMI',
+            '_done_report',
         output:
             touch('_DONE'),
         run:
@@ -560,14 +562,26 @@ rule summarize_umi_matrix_per_item:
             item_expr_matrix[item_id][bc_name] = pickle.load(open(f, 'rb'))
 
         # export to csv/hdf
+        dict_bc_id = pd.read_csv(BC_INDEX_FPATH,
+                                 sep='\t', index_col=BC_SEQ_COLUMN)
+        all_bc_seq = dict_bc_id.index.values
+        dict_bc_id = {seq: seq_id + 1 for seq_id, seq in enumerate(all_bc_seq)}
         for item_id, expr_dict in item_expr_matrix.items():
             exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME']  # E1
 
             for bc, cnt in expr_dict.items():
                 expr_dict[bc] = pd.Series([cnt[x] for x in export_genes],
                                           index=export_genes)
-
-            expr_df = pd.DataFrame(expr_dict, index=export_genes).fillna(0)
+            cnames_ordered = sorted(
+                expr_dict.keys(),
+                key=lambda xx: dict_bc_id.get(xx, float('Inf')))
+            expr_df = pd.DataFrame(
+                expr_dict,
+                index=export_genes,
+                columns=cnames_ordered)
+            expr_df.fillna(0, inplace=True)
+            expr_df.columns = ['BC-{}-{}'.format(
+                dict_bc_id.get(xx, 0), xx) for xx in cnames_ordered]
             expr_df.to_csv(join_path(DIR_PROJ, SUBDIR_EXPR,
                                      exp_id, item_id, 'expr.csv'))
             expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR,
@@ -599,7 +613,7 @@ rule summarize_umi_matrix_per_experiment:
             exp_expr_matrix[exp_id] = defaultdict(dict)
 
         for f in input.umiset:
-            bc_name = base_name(f)  # BC-1-xxx
+            bc_name = base_name(f)  # xxx
             item_id = base_name(dir_name(f))  # item-1
             exp_id = SAMPLE_TABLE.loc[item_id, 'SAMPLE_NAME']
 
@@ -614,12 +628,26 @@ rule summarize_umi_matrix_per_experiment:
                 exp_expr_matrix[exp_id][bc_name][x] = y1 | y2
 
         # export to csv/hdf
+        dict_bc_id = pd.read_csv(BC_INDEX_FPATH,
+                                 sep='\t', index_col=BC_SEQ_COLUMN)
+        all_bc_seq = dict_bc_id.index.values
+        dict_bc_id = {seq: seq_id + 1 for seq_id, seq in enumerate(all_bc_seq)}
         for exp_id, expr_dict in exp_expr_matrix.items():
             for bc, cnt in expr_dict.items():
                 cnt = _flatten_umi_set(cnt)
                 expr_dict[bc] = pd.Series([cnt[x] for x in export_genes],
                                           index=export_genes)
-            expr_df = pd.DataFrame(expr_dict, index=export_genes).fillna(0)
+            cnames_ordered = sorted(
+                expr_dict.keys(),
+                key=lambda xx: dict_bc_id.get(xx, float('Inf')))
+            expr_df = pd.DataFrame(
+                expr_dict,
+                index=export_genes,
+                columns=cnames_ordered)
+            expr_df.fillna(0, inplace=True)
+            expr_df.columns = ['BC-{}-{}'.format(
+                dict_bc_id.get(xx, 0), xx) for xx in cnames_ordered]
+
             expr_df.to_csv(join_path(DIR_PROJ, SUBDIR_EXPR,
                                      exp_id, 'expr.csv'))
             expr_df.to_hdf(join_path(DIR_PROJ, SUBDIR_EXPR,
@@ -642,23 +670,23 @@ rule qc_umi_matrix_per_experiment:
 
 rule summarize_aln_stats_per_item:
     input:
-        alncnt = dynamic(join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER,
-                                   '{itemID}', '{bcID}.pkl')),
+        # alncnt = dynamic(join_path(DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER,
+        #                            '{itemID}', '{bcID}.pkl')),
+        '_done_UMI',
     output:
-        aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT,
-                                    '{itemID}',
+        aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT, '{itemName}',
                                     'alignment-' + ALIGNER + '.csv'),
-                          itemID=item_names),
+                          itemName=item_names),
     run:
         aln_diagnose_item = ["_unmapped",
                              "_low_map_qual", '_multimapped', "_uniquemapped",
                              "_no_feature", "_ambiguous",
                              "_total"]
-
         # { item -> dict(cell_bc -> Counter(stats)) }
         item_stats = defaultdict(dict)
-
-        for f in input.alncnt:
+        alncnt_files = glob.glob(join_path(
+            DIR_PROJ, SUBDIR_ALN_STATS, ALIGNER, 'item-*', '*.pkl'))
+        for f in alncnt_files:  # input.alncnt:
             bc_name = base_name(f)  # BC-1-xxx
             item_id = base_name(dir_name(f))  # item-1
             item_stats[item_id][bc_name] = pickle.load(open(f, 'rb'))
@@ -714,12 +742,18 @@ rule REPORT:
                                          'demultiplexing_fastq.html'),
         alignment_stats = join_path(DIR_PROJ, SUBDIR_REPORT,
                                     'alignment-{}.html'.format(ALIGNER)),
+    output:
+        flag = '_done_report'
+    run:
+        shell('touch {output.flag}')
 
 # Inputs: project/report/item-*/demultiplexing.csv
 # Outputs:
 #   - Stats file (csv) per item.
 #   - Plotly box graph for all the items.
 rule report_combo_demultiplexing:
+    input:
+        '_done_UMI',
     output:
         html = join_path(DIR_PROJ, SUBDIR_REPORT, 'demultiplexing_fastq.html')
     run:
@@ -737,7 +771,10 @@ rule report_combo_demultiplexing:
 
 rule report_alignment_stats:
     input:
-        aln_item = rules.summarize_aln_stats_per_item.output.aln_item,
+        aln_item = expand(join_path(DIR_PROJ, SUBDIR_REPORT,
+                                    '{itemName}',
+                                    'alignment-' + ALIGNER + '.csv'),
+                          itemName=item_names),
     output:
         html = join_path(DIR_PROJ, SUBDIR_REPORT,
                          'alignment-{}.html'.format(ALIGNER)),

From 457245e1d660f4cb2e9a7aee9f29ede77481e8d2 Mon Sep 17 00:00:00 2001
From: yy1533 <yun.yan@nyumc.org>
Date: Thu, 10 May 2018 15:49:43 -0400
Subject: [PATCH 8/8] :dog: add in-line comments

---
 celseq2/workflow/celseq2_beta.snakemake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/celseq2/workflow/celseq2_beta.snakemake b/celseq2/workflow/celseq2_beta.snakemake
index 7690b12..fffb5c6 100644
--- a/celseq2/workflow/celseq2_beta.snakemake
+++ b/celseq2/workflow/celseq2_beta.snakemake
@@ -534,7 +534,7 @@ rule count_umi:
         pickle.dump(aln_cnt, open(output.alncnt, 'wb'))
 
 
-# Pipeline Step 4a: Merge UMIs of cells to UMI matrix of item
+# Pipeline Step 4a (deprecated) : Merge UMIs of cells to UMI matrix of item
 # Input: a list of umicnt pickle files of cells per item
 # Output: UMI-count matric file (csv & hdf) per item
 rule summarize_umi_matrix_per_item:
@@ -706,7 +706,7 @@ rule summarize_aln_stats_per_item:
                                           item_id,
                                           'alignment-' + ALIGNER + '.csv'))
 
-
+# (deprecated)
 # rule report_alignment_log:
 #     input:
 #         df = dynamic(join_path(DIR_PROJ, SUBDIR_LOG, '{itemid}', ALIGNER,