diff --git a/README.md b/README.md
index 5df679f..649da5d 100644
--- a/README.md
+++ b/README.md
@@ -1,78 +1,21 @@
 # RNAseq
 The Gresham Lab RNAseq pipeline
 
-Version 1.0 - 10/18/2017
+Version 2.0 - 10/31/2017
 
 Usage: The intent of this pipeline is to standardize RNA-seq alignments 
-for the Gresham lab and to make the most of the HPC environment.
+for the Gresham lab and to make the most of the HPC environment. v2.0 
+intorduced a python wrapper to handle the generation of the SBATCH bash
+file. 
 
-## To Use:
+Please use the '-h' or '-help' commands to access details about 
+command line arguments and examples command lines.
 
-A. Necessary steps: 
-	
-	1. Define the SBATCH array size (line 2):
-		#SBATCH --array=0-N
-		Purpose: SBATCH needs to launch a number of runs equal 
-		to the number of fastq files that need to be processed. This 
-		command dictates the number of runs to launch. 
-		Modify: Change the variable 'N' to the number of fastq files
-		contained in the DIR directory. Note the zero indexing, if you have 
-		two fastq files, N = 1 and the command is #SBATCH --array=0-1.
-	
-	2. Define run configuration (line 26-28):
-		a. The name of the analysis (line 26):
-		analysisName="N"
-		Purpose: Name defines output folder name
-		b. Is the library Paired End (line 27):
-		isPE=N
-		Purpose: Paired end and single end libraries require distinct
-		operations, this value define the operations applied to your library. 
-		Modify: If paired end (PE) set 'N' to 1, if single end (SE) set 'N' to 0.
-		c. Does the library have UMIs (line 28):
-		isUMI=N
-		Purpose: UMI labelled libraries require an additional step to
-		to demultiplex, this values dictates whether or not that occurs. 
-		Modify: If UMIs were used in library prep, set 'N' to 1, else 
-		set 'N' to 0.
+	python RNAseq_pipeline.py -h    
 
-	3. Define directory containing fastq files to process (line 34):
-		DIR='N'
-		Purpose: Directory conatins fastq files to process.
-		Modify: Set directory 'N' to the location of your files.
-		NOTE: Current sbatch capacities have an upper limit of runs per 
-		individual user. If you encouter an 'excessive runs' error you may 
-		need to split your files into two seperate directories.
+----------------------------------------------------------------------
+Please report all issues and bugs to the GreshamLab github page:
+https://github.com/GreshamLab/RNAseq/issues
 
-	4. Define compression state (line 56 - 57):
-		#FILES=($(ls $DIR/*fastq.gz | perl -pe 's/^.+n0\d_(.+)\.fastq\.gz/$1/g' | sort | uniq))
-		FILES=($(ls $DIR/*fastq | perl -pe 's/^.+n0\d_(.+)\.fastq/$1/g' | sort | uniq))
-		Purpose: Command collects the filenames of files with a defined 
-		file extension, either fastq.gz or fastq.
-		Modify: If the files have the gz extension (ie. *.fastq.gz),
-		uncomment line 56 and comment line 57. If the files do not have the 
-		gz extension (ie. *.fastq), comment line 56 and uncomment line 57.
-	
-B. Optional Modifications:
-	
-	1. Reference Genome (line 35):
-		hisat_ref='N'
-		Purpose: Defines genome to align against.
-		Modify: Set 'N' to point to a valid genome. The default genome
-		has spike in sequences (BAC, CEL, ERCC) already present.
-	
-	2. Genome Annotation (line 37):
-		gff='N'
-		Purpose: Defines annotation file for alignment.
-		Modify: Set 'N' to point to a valid 'gff' file. NOTE: Older 
-		annotation formats (eg. 'gtf', 'gff2') are not supported. 
 
-
-##
-#	Please submit issues to:
-#		https://github.com/GreshamLab/RNAseq/issues
-#	--------------------------------------------------- 
-##	
-
-## Features for future versions:
-	1. General interface python wrapper.
-		
+MIT License Copyright (c) 2017 David Gresham
diff --git a/RNAseq_pipeline.py b/RNAseq_pipeline.py
new file mode 100644
index 0000000..a61b145
--- /dev/null
+++ b/RNAseq_pipeline.py
@@ -0,0 +1,431 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 26 16:36:19 2017
+
+This script acts as a wrapper to generate a bash file with certain parameters 
+based on user input. By defualt SBATCH is automatically launched using the 
+generated command file.
+
+@author: Pieter Spealman. email pspealman@nyu.edu
+"""
+
+### import section. sys handles user IO, subprocess makes system calls.
+import sys
+import subprocess
+
+### defaults section. Sets default parameters for wrapper.
+input_dir = ''
+output_dir = ''
+isPE = 0
+isUMI = 0
+file_number = 1
+file_ext='fastq'
+
+genome_name='/scratch/work/cgsb/genomes/In_house/Fungi/Saccharomyces_cerevisiae/Gresham/GCF_000146045.2_R64_GreshamLabSpikes/GCF_000146045.2_R64_GreshamLabSpikes_genomic_hisat_index/genome'
+gff_name='/scratch/work/cgsb/genomes/In_house/Fungi/Saccharomyces_cerevisiae/Gresham/GCF_000146045.2_R64_GreshamLabSpikes/GCF_000146045.2_R64_GreshamLabSpikes_genomic.gff'
+polyG = False
+blank_lines = False
+no_launch = False
+
+#function handles help commands
+def help_dialog():
+    print('\n')
+    print('#=============================================================#')
+    print('\tThis script takes input to generate a standard shell script')
+    print('that aligns RNAseq data and performs QC. The default is assumed')
+    print('to be single end (unpaired), fastq files, without UMIs.\n')
+    print('#=============================================================#')
+    print('Usage:')
+    print('python RNAseq_pipeline.py [-pe] [-umi] [-gzip] -n file_number -i input_directory -o output_directory\n')
+    print('\t\tCommands:')
+    print('-n : Number of files (.fastq or fastq.gzip), present in directory.')
+    print('\t this number is used to launch arrays. Begin counting at one.')
+    print('-i : Input directory which holds the files to be processed')
+    print('-o : Output directory will hold the aligned files and logs.\n')
+    print('\t\tOptional Commands:')
+    print('-pe : Paired end read library. Default is unpaired reads.')
+    print('-umi : UMI adapter library. UMItools will be run on processed reads.')
+    print('\t\tTurned off by defualt.')
+    print('-gzip : FASTQ files have been compressed with gzip. By defualt files')
+    print('\t\tare asumed to be uncompressed. Other compression formats not supported\n')
+    print('\t\tExtended optional commands.')
+    print('[-polyG]')
+    print("-polyG : Nextera's two color chemistry reads the absence of signal as 'G'")
+    print("\t\tas such strings of 'GGG GGG' are artificially inflated. This option removes")
+    print("\t\tthose cases which have 'GGG GGG' UMI labels. Usage implies -umi.")
+    print('[-blank_lines]')
+    print('-blank_lines : Older seqeuncing files conatin an erroneous new line')
+    print('\t\tcharacter, while this bug has been fixed, this option allows for')
+    print('\t\tbackwards compatibility.')
+    print('[-gff gff_name]')
+    print("-gff gff_name: 'gff_name' specifies the custom gff reference file to use.")
+    print('[-genome genome_name]')
+    print("-genome genome_name: genome_name specifies the custom reference genome to use.")
+    print('[-no_launch]')
+    print('-no_launch : This option does not invoke SBATCH after generating the')
+    print('\t\tcommand file.')
+    print('')    
+    print('test se umi:\n\t python RNAseq_pipeline.py -umi -gzip -n 2 -i /scratch/cgsb/gencore/out/Gresham/2015-04-24_C5726ACXX/umi/8 -o test_SE_UMI')
+    print('test pe no umi:\n\t python RNAseq_pipeline.py -pe -gzip -n 2 -i /scratch/cgsb/gresham/LABSHARE/Scripts/Pipelines/testdata_PE -o test_PE_noUMI')
+    sys.exit('')    
+
+### user input section
+# parse sys.argv for help requests
+if ('-h' in sys.argv) or ('-help' in sys.argv) or (len(sys.argv)<1) or ('-man' in sys.argv) or ('?' in sys.argv):
+    help_dialog()
+
+# parse sys.argv for commands and options
+for arg in range(len(sys.argv)):
+    if sys.argv[arg] == '-n': 
+        file_number = int(sys.argv[arg+1])
+    if sys.argv[arg] == '-i':
+        input_dir = sys.argv[arg+1]
+    if sys.argv[arg] == '-o':
+        output_dir = sys.argv[arg+1]
+        
+    if (sys.argv[arg] == '-pe') or (sys.argv[arg] == '--paired_end') :
+        isPE = 1
+    if (sys.argv[arg] == '-umi') or (sys.argv[arg] == '--umi') :
+        isUMI = 1
+    if (sys.argv[arg] == '-gzip'):
+        file_ext='fastq.gz'
+        
+    if sys.argv[arg] == '-gff':
+        gff_name = sys.argv[arg+1]
+    if sys.argv[arg] == '-genome':
+        genome_name = sys.argv[arg+1]
+    if sys.argv[arg] == '-no_launch':
+        no_launch = True
+    if sys.argv[arg] == '-polyG':
+        isUMI = 1
+        polyG = True
+    if sys.argv[arg] == '-blank_lines':
+        blank_lines = True
+
+# eval input for errors
+if file_number > 1:
+    array_start = 0
+    array_stop = file_number-1       
+
+if input_dir =='':
+    print("Input directory error. Please specify a input directory using the '-i' argument")
+    help_dialog()
+    
+if output_dir =='':
+    print("Output directory error. Please specify an output directory using the '-o' argument")
+    help_dialog()
+    
+outfile=open('rnaseq-array-pipeline.sh','w')
+analysisName = output_dir
+
+# function generates module load section with difference based on UMI 
+def module_load():
+    outline=('module purge\n'+
+    'module load cutadapt/intel/1.12\n'+
+    'module load hisat2/intel/2.0.5\n'+
+    'module load samtools/intel/1.3.1\n'+
+    'module load r/intel/3.3.2\n'+
+    'module load qualimap/2.2.1\n'+
+    'module load gffread/intel/0.9.8c\n'+
+    'module load biopython/intel/1.70\n')
+    outfile.write(outline)
+    
+    if isUMI==0:
+        outline =('module load trim_galore/0.4.4\n\n')
+        outfile.write(outline)
+
+    if isUMI==1:
+        outline=('module load fastqc/0.11.5\n'+
+        'module load umi_tools/0.4.2\n\n')
+        outfile.write(outline)
+
+# function generates setup section with difference based on array_size (n) and file_ext (-gzip)
+# This function is always called.
+def write_setup_section():
+    
+    outline='#!/bin/bash\n'
+    outfile.write(outline)
+
+    if array_stop == 0:
+        outline = ('#SBATCH --array=%s ##specify number of samples\n')%(array_start)
+        outfile.write(outline)   
+
+    if array_stop != 0:
+        outline = ('#SBATCH --array=%s-%s ##specify number of samples\n')%(array_start,array_stop)
+        outfile.write(outline)
+    
+    outline = ('#SBATCH --output=%A_%a.o\n'+
+    '#SBATCH --error=%A_%a.e\n'+
+    '#SBATCH --job-name=%a_rnaseq-array-pipeline\n'+
+    '#SBATCH --nodes=1\n'+
+    '#SBATCH --ntasks-per-node=1\n'+
+    '#SBATCH --mem=60G\n'+
+    '#SBATCH --time=04:00:00\n'+
+    '#SBATCH --mail-user=${USER}@nyu.edu\n'+
+    '#SBATCH --mail-type=BEGIN,END,FAIL\n\n')
+    outfile.write(outline)
+    
+    module_load()
+
+    outline = ('#Configuration\n'+
+    'analysisName=%s\n'+
+    'isPE=%s #0 if SE\n'+
+    'isUMI=%s #1 if UMIs used\n\n')%(analysisName,isPE,isUMI)
+    outfile.write(outline)
+    
+    DIR=input_dir
+    hisat_ref=genome_name
+    gff=gff_name
+    
+    outline=('DIR=%s\n'+
+    'hisat_ref=%s\n'+
+    'gff=%s\n\n')%(DIR,hisat_ref,gff)
+    outfile.write(outline)
+    
+    outline=('#Make a local directory and out-of-stream resource copies of this script for records\n'+
+    'mkdir -p ${analysisName}\n'+
+    'mkdir -p ${analysisName}/logs\n'+
+    'cp rnaseq-array-pipeline.sh "${analysisName}/logs/rnaseq-array-pipeline.$(date +%F_%R).sh"\n'+
+    'cd ${analysisName}\n'+
+    '\necho "Generating the gtf file from the gff, using gffread"\n'+
+    'time gffread -E ${gff} -T -o- > temp.gtf\n'+
+    'gtf1="temp.gtf"\n\n')
+    outfile.write(outline)
+    
+    if file_ext=='fastq.gz':
+        outline=("FILES=($(ls $DIR/*fastq.gz | perl -pe 's/^.+n0\d_(.+)\.fastq\.gz/$1/g' | sort | uniq))\n")
+    
+    if file_ext=='fastq':
+        outline=("FILES=($(ls $DIR/*fastq | perl -pe 's/^.+n0\d_(.+)\.fastq/$1/g' | sort | uniq))\n\n")
+    
+    outfile.write(outline)
+    
+    outline=('## $ID will hold the sample_id\n'+
+    'ID=${FILES[$SLURM_ARRAY_TASK_ID]}\n\n')
+    outfile.write(outline)
+    
+    outline=('echo "This is the FILE Im running"\n'+
+    'echo ${FILES}\n'+
+    'echo "This is its ID"\n'+
+    'echo ${ID}\n'+
+    'echo "This is the ARRAY number Ive been assigned"\n'+
+    'echo ${SLURM_ARRAY_TASK_ID}\n'+
+    'echo "Im making this directory and jumping into it: ${ID}"\n'+
+    'mkdir $ID\n'+
+    'cd $ID\n'+
+    '\n#Update gtf variable as it only exists once and is in analysis directory\n'+
+    'gtf="../"${gtf1}\n'+
+    'echo "I think my GTF is at ${gtf}"\n\n')    
+    outfile.write(outline)
+
+# function generates umi_tools section, only called with -umi option
+def umi_tools_section():
+    
+    outline = ('\necho "You want me to dedup? Okay, deduping with umi_tools, then sorting and deleting the unsorted bam."\n'+
+    'time umi_tools dedup --umi-separator=: --method=directional --output-stats=${ID}.stats -I ${ID}.bam -S ${ID}.dedup.bam -L ${ID}.dedup.log\n'+
+    'time samtools sort -o ${ID}.sorted.dedup.bam ${ID}.dedup.bam\n'+
+    'rm ${ID}.dedup.bam\n\n')
+    outfile.write(outline)
+    
+    outline = ('##Determine counts\n'+                                                                                                                                      
+    'echo "There were "$( samtools view ${ID}.bam | wc -l )" counts before umi-deduplication"\n'+
+    'echo "There are "$( samtools view ${ID}.sorted.dedup.bam | wc -l )" counts after umi-deduplication"\n\n')
+    outfile.write(outline)
+    
+    outline=('\necho "Deleting a lot of intermediate files, indexing."\n'+
+    'rm ${ID}.bam\n'+
+    'rm ${ID}.bam.bai\n'+
+    'mv ${ID}.sorted.dedup.bam ${ID}.bam\n'+
+    'time samtools index ${ID}.bam\n\n')
+    outfile.write(outline)
+
+# function generates body, only called with -pe and no -umi
+# pe invokes trim_galore as no umi adapters are needed, 
+# trim_galore autoidentifies Nextera adapters for trimming. 
+def paired_end_no_umi():
+    
+    outline=('## full paths to paired mate files\n'+
+    'INPUT_1=$(ls $DIR/*n01_$ID.%s)\n'+
+    'INPUT_2=$(ls $DIR/*n02_$ID.%s)\n\n')%(file_ext,file_ext)
+    outfile.write(outline)
+
+    if blank_lines:
+            outline = ("awk 'NF' $INPUT_1 > INPUT_1.fastq\n"+
+            "INPUT_1=$(ls INPUT_1.fastq)\n"+
+            "awk 'NF' $INPUT_2 > INPUT_2.fastq\n"+
+            "INPUT_2=$(ls INPUT_2.fastq)\n\n")
+            outfile.write(outline)
+    
+    outline=('\necho "Now Im trimming the reads for adapters, quality?"\n'+
+    'time trim_galore --paired --fastqc $INPUT_1 $INPUT_2\n'+
+    'time hisat2 -p ${SLURM_CPUS_ON_NODE} -q -x ${hisat_ref} -1 *n01_${ID}_val_1.fq.gz -2 *n02_${ID}_val_2.fq.gz -S ${ID}.sam\n\n')
+    outfile.write(outline)
+        
+    outline=('\necho "Okay, now I sort and index that bam"\n'+
+    'time samtools sort -o ${ID}.bam ${ID}.sam\n'+
+    'time samtools index ${ID}.bam\n'+
+    '\necho "And delete that file, and fastqz files as well."\n'+
+    'rm ${ID}.sam\n'+
+    'rm *n01_${ID}_val_1.fq.gz\nrm *n02_${ID}_val_2.fq.gz\n\n')       
+    outfile.write(outline)
+
+    outline=('\necho "Now doing qualimaps"\n'+
+    'unset DISPLAY\n'+
+    'time qualimap bamqc -bam ${ID}.bam -gff ${gff} -c -outdir ${ID}_stats -outfile ${ID}\n'+
+    'time qualimap rnaseq -bam ${ID}.bam -gtf ${gtf} -pe -outdir rnaseq_qc_results -outfile ${ID}\n\n')
+    outfile.write(outline)
+
+# function generates body, only called with -pe and -umi
+# because we are using umi we need to identify adapters, for this we call 'scrape_adapter'
+# and pass the results to cutadapt. After alignment we call umi_tools.
+def paired_end_with_umi():
+    
+    outline=('## full paths to paired mate files\n'+
+    'INPUT_1=$(ls $DIR/*n01_$ID.%s)\n'+
+    'INPUT_2=$(ls $DIR/*n02_$ID.%s)\n\n')%(file_ext,file_ext)
+    outfile.write(outline)
+    
+    if blank_lines:
+            outline = ("awk 'NF' $INPUT_1 > INPUT_1.fastq\n"+
+            "INPUT_1=$(ls INPUT_1.fastq)\n"+
+            "awk 'NF' $INPUT_2 > INPUT_2.fastq\n"+
+            "INPUT_2=$(ls INPUT_2.fastq)\n\n")
+            outfile.write(outline)
+            
+    if polyG:
+        outline = ('time python ../../scrape_adapter.py -polyG -i $INPUT_1 -o Read1_filter_polyG.fastq\n'+
+        'time python ../../scrape_adapter.py -polyG -i $INPUT_2 -o Read2_filter_polyG.fastq\n'+
+       'echo "I assume youre using the trumiseq adapters, so trimming"\n'+
+        'time python ../../scrape_adapter.py -u -i Read1_filter_polyG.fastq -o Read1_Adapter.fa\n'+
+        'time python ../../scrape_adapter.py -p -i Read2_filter_polyG.fastq -o Read2_Adapter.fa\n'+
+        'time cutadapt -a file:Read1_Adapter.fa -A file:Read2_Adapter.fa -o ${ID}_1.trim -p ${ID}_2.trim $INPUT_1 $INPUT_2\n'+
+        'time fastqc -o ${ID}_postTrimming -f fastq ${ID}_1.trim ${ID}_2.trim\n'+
+        'time hisat2 -p ${SLURM_CPUS_ON_NODE} -q -x ${hisat_ref} -1 ${ID}_1.trim -2 ${ID}_2.trim -S $ID.sam\n\n')
+        outfile.write(outline)
+            
+    if not polyG:
+        outline = ('echo "I assume youre using the trumiseq adapters, so trimming"\n'+
+        'time python ../../scrape_adapter.py -u -i $INPUT_1 -o Read1_Adapter.fa\n'+
+        'time python ../../scrape_adapter.py -p -i $INPUT_2 -o Read2_Adapter.fa\n'+
+        'time cutadapt -a file:Read1_Adapter.fa -A file:Read2_Adapter.fa -o ${ID}_1.trim -p ${ID}_2.trim $INPUT_1 $INPUT_2\n'+
+        'time fastqc -o ${ID}_postTrimming -f fastq ${ID}_1.trim ${ID}_2.trim\n'+
+        'time hisat2 -p ${SLURM_CPUS_ON_NODE} -q -x ${hisat_ref} -1 ${ID}_1.trim -2 ${ID}_2.trim -S $ID.sam\n\n')
+        outfile.write(outline)
+
+    outline = ('\necho "Okay, now I sort and index that bam"\n'+
+    'time samtools sort -o ${ID}.bam ${ID}.sam\n'+
+    'time samtools index ${ID}.bam)\n'+
+    '\necho "And delete that file, and fastqz files as well."\n'+
+    'rm ${ID}.sam\n'+
+    'rm ${ID}_1.trim\n'+
+    'rm ${ID}_2.trim\n\n')
+    outfile.write(outline)
+    
+    umi_tools_section()
+
+# function generates body, only called with no -pe and no -umi (default)
+# invokes trim_galore as no umi adapters are needed, 
+# trim_galore autoidentifies Nextera adapters for trimming.   
+def single_end_no_umi():
+    
+    outline = ('INPUT_1=$(ls $DIR/*n01_$ID.%s)\n\n')%(file_ext)
+    outfile.write(outline)
+    
+    if blank_lines:
+        outline = ("awk 'NF' $INPUT_1 > INPUT_1.fastq\n"+
+        "INPUT_1=$(ls INPUT_1.fastq)\n\n")
+        outfile.write(outline)
+    
+    outline = ('\necho "Trimgaloring that ${INPUT_1}, then aligning"\n'+
+    'time trim_galore --fastqc $INPUT_1\n'+
+    'time hisat2 -q -x ${hisat_ref} -1 $INPUT_1 $ID.sam\n'+
+    '\necho "sort and index"\n'+
+    'time samtools sort -o ${ID}.bam ${ID}.sam\n'+
+    'time samtools index ${ID}.bam\n'+
+    '#Clean up .sam and .fastq files\n'+
+    'rm ${ID}.sam\n\n')
+    outfile.write(outline)
+
+# function generates body, only called with no -pe and -umi
+# because we are using umi we need to identify adapters, for this we call 'scrape_adapter'
+# and pass the results to cutadapt. After alignment we call umi_tools.
+def single_end_with_umi():
+    
+    outline = ('INPUT_1=$(ls $DIR/*n01_$ID.%s)\n\n')%(file_ext)
+    outfile.write(outline)
+
+    if blank_lines:
+        outline = ("awk 'NF' $INPUT_1 > INPUT_1.fastq\n"+
+        "INPUT_1=$(ls INPUT_1.fastq)\n\n")
+        outfile.write(outline)    
+    
+    if polyG:
+        outline = ('time python ../../scrape_adapter.py -polyG -i $INPUT_1 -o Read1_filter_polyG.fastq\n'+
+        '\necho "Trimming that ${INPUT_1} for adapters, quality? aligning"\n'+
+        '\necho ${ID}_1.trim\n'+
+        'time python ../../scrape_adapter.py -u -i Read1_filter_polyG.fastq -o Read1_Adapter.fa\n'+
+        'time cutadapt -a file:Read1_Adapter.fa -o ${ID}_1.trim $INPUT_1\n'+
+        'time hisat2 -q -x ${hisat_ref} -U ${ID}_1.trim -S $ID.sam\n\n')
+        outfile.write(outline)
+        
+    if not polyG:
+        outline = ('\necho "Trimming that ${INPUT_1} for adapters, quality? aligning"\n'+
+        '\necho ${ID}_1.trim\n'+
+        'time python ../../scrape_adapter.py -u -i $INPUT_1 -o Read1_Adapter.fa\n'+
+        'time cutadapt -a file:Read1_Adapter.fa -o ${ID}_1.trim $INPUT_1\n'+
+        'time hisat2 -q -x ${hisat_ref} -U ${ID}_1.trim -S $ID.sam\n\n')
+        outfile.write(outline)
+
+    outline = ('echo "sort and index"\n'+
+    'time samtools sort -o ${ID}.bam ${ID}.sam\n'+
+    'time samtools index ${ID}.bam\n'+
+    '#Clean up .sam and .fastq files\n\n'+
+    'rm ${ID}.sam\n')
+    outfile.write(outline)
+    
+    umi_tools_section()    
+
+# function generates rsubread section. This is always called.
+def rsubread():
+    outline=('############################\n# Generate table of counts #\n############################\n#Call Rsubread\n'+
+    '\necho "and doing rsubread"\n'+
+    'time Rscript --vanilla ../../feature_counts.R ${gtf1} ${isPE} > ../logs/rsubread.log\n'+
+    'time python ../../feature_filter.py -feature CDS -gff ${gff} -i ../Counts_All_Features.txt -o ../Counts_CDS_Features.txt\n'+
+    '\necho "Completed now moving the STDOUT and STDERR logs from sbatch into the above directory."\n\n')
+    outfile.write(outline)
+
+# function generates clean-up section. This is always called.
+def clean_up():
+    outline=('#################################################\n# Move output and error files to logs directory #\n'+
+    '#################################################\n'+
+    'echo $(pwd)\echon $(ls)\n'+
+    'mv ../../${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.o ../logs\n'+
+    'mv ../../${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}.e ../logs\n'+
+    '\nexit 0;')
+    outfile.write(outline)
+    
+### Function call section. Takes the parameters together to decide output generated.
+write_setup_section()
+
+if isPE==1: 
+    if isUMI==0:
+        paired_end_no_umi()
+    if isUMI==1:
+        paired_end_with_umi()
+
+if isPE==0:
+    if isUMI==0:
+        single_end_no_umi()
+    if isUMI==1:
+        single_end_with_umi()
+    
+rsubread()
+clean_up()
+outfile.close()
+
+if not no_launch:
+    output = subprocess.check_output(['sbatch rnaseq-array-pipeline.sh'],stderr=subprocess.STDOUT,shell=True)
+    print(output)
+
+if no_launch:
+    print('-no_launch option suppressed SBATCH launch. Command file generated.')
diff --git a/feature_counts.R b/feature_counts.R
index 9eb268b..c3f4b64 100644
--- a/feature_counts.R
+++ b/feature_counts.R
@@ -13,7 +13,8 @@ setwd('..')
 bam_files=list.files(pattern = ("\\.bam$"),recursive=TRUE)
 
 #outputfile name: change "exp_counts.txt" to outputfilename of choice
-output_file="Counts.txt"
+#output_file="Counts.txt"
+output_file="Counts_All_Features.txt"
 
 #use featureCounts to count reads to count transcripts per gene
 exp_counts<-featureCounts(files=bam_files,annot.ext=args[1],useMetaFeatures=T,isGTFAnnotationFile=T,GTF.attrType="gene_name",strandSpecific=2,isPairedEnd=paired)
diff --git a/feature_filter.py b/feature_filter.py
new file mode 100644
index 0000000..d59658c
--- /dev/null
+++ b/feature_filter.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Oct 31 12:10:31 2017
+ sub file. Filters mrna
+@author: pieter
+"""
+import sys
+
+for arg in range(len(sys.argv)):
+    if sys.argv[arg] == '-feature':
+        feature_name = sys.argv[arg+1]
+        feature_name = feature_name.strip()
+        
+    if sys.argv[arg] == '-gff':
+        gff_name = sys.argv[arg+1]
+    if sys.argv[arg] == '-i':
+        infile_name = sys.argv[arg+1]
+    if sys.argv[arg] == '-o':
+        outfile_name = sys.argv[arg+1]
+
+    #test line
+    #python feature_filter.py -feature CDS -gff /scratch/work/cgsb/genomes/In_house/Fungi/Saccharomyces_cerevisiae/Gresham/GCF_000146045.2_R64_GreshamLabSpikes/GCF_000146045.2_R64_GreshamLabSpikes_genomic.gff -i test_PE_noUMI/Counts.txt -o CDS_feature_counts.txt
+
+gff_file = open(gff_name)
+feature_list=[]
+all_features=[]
+
+for line in gff_file:
+    if line[0]!='#':
+        line=line.strip()
+        
+        check_feature = line.split('\t')[2]
+        check_feature = check_feature.strip()
+        
+        if check_feature not in all_features:
+            all_features.append(check_feature)
+        
+        if check_feature==feature_name:
+            deets = line.split('\t')[8]
+
+            if 'gene=' in deets:
+                check_feature=deets.split('gene=')[1].split(';')[0]
+            else:
+                check_feature=deets.split('ID=')[1]
+                check_feature=deets.split(';')[0]
+            if len(check_feature) > 0:
+                feature_list.append(check_feature)
+                               
+gff_file.close()
+
+if len(feature_list)<10:
+    print('Only '+str(len(feature_list))+' features found. Try one of these features, instead:\n')
+    print(all_features)
+
+infile = open(infile_name)
+outfile = open(outfile_name,'w')
+
+for line in infile:
+    if (line[0:4] == 'BAC-') or (line[0:4] == 'CEL-') or (line[0:5] == 'ERCC-') or ('.bam' in line):
+        outfile.write(line)
+    else:
+        check_feature = line.split('\t')[0]
+        if check_feature in feature_list:
+            outfile.write(line)
+
+infile.close()
+outfile.close()
diff --git a/scrape_adapter.py b/scrape_adapter.py
index f708a93..4f9d4ac 100644
--- a/scrape_adapter.py
+++ b/scrape_adapter.py
@@ -16,11 +16,17 @@
 #set defaults
 paired_arg = '-u'
 adapter_list=[]
+polyG = False
 
 #collect arguments
 if (sys.argv[1] == '-h') or (sys.argv[1] == '-help'):
     print('scrape_adapter is designed to be used with fastq files.\nThese can be either in uncompressed of gzipped (".gz") formats.')
-    print('Invoke as:\n\tpython scrape_adapter.py -u [-p for paired end] -i fastq_file_name -o output_file_name')
+    print('Invoke as:\n\tpython scrape_adapter.py [-polyG] [-p] -i fastq_file_name -o output_file_name')
+    print('-------------------------------------------------------------------')
+    print('Optional Commands:')
+    print('-p : paired end reads')
+    print('-polyG : Alternative Runmode. Removes all reads with a polyG UMI.')
+    print('\t\tNo adapters are counted when using -polyG run mode.')
     print('')
     exit()
 
@@ -31,6 +37,8 @@
         infile_name = sys.argv[int(arg+1)]
     if sys.argv[arg] == '-o':
         outfile_name = sys.argv[int(arg+1)]
+    if sys.argv[arg] == '-polyG':
+        polyG = True
 
 #test to see if input is gzipped
 #if it is use zcat to make temp uncompressed file
@@ -49,11 +57,50 @@
 if not is_gzipped:
     infile_o = (infile_name)
 
+# search for and remove instances where UMI is 'GGGGGG'
+def remove_polyG(infile_o):
+    infile = open(infile_o)
+    
+    
+    filter_polyG = 0
+    polyG_ct = 0
+    
+    for line in infile:
+
+        if filter_polyG != 0:
+            if filter_polyG == 3:
+                filter_polyG = 0
+                print(line)
+                print(filter_polyG)
+            if (filter_polyG > 0) and (filter_polyG < 3) :
+                filter_polyG+=1
+                print(line)
+                print(filter_polyG)
+                
+        if filter_polyG == 0:
+            if (line[0]=='@'):
+                s_line = line.strip()
+                adapter = s_line.rsplit(':',1)[1]
+                if ('GGGGGG' in adapter):
+                    filter_polyG = 1
+                    polyG_ct+=1
+                    print(line)
+                    print(filter_polyG)
+                    
+                if ('GGGGGG' not in adapter):
+                    outfile.write(line)
+                    
+            if (line[0]!='@') and (filter_polyG == 0):
+                    outfile.write(line)
+                
+
+    infile.close()
+    print('PolyG count: '+str(polyG_ct))
+
 # parse file keeping only highest frequency adapter
 def scrape_file(infile_o):
     infile = open(infile_o)
     
-    
     adapter_dict = {}
     high_score=0
     winner=''
@@ -106,19 +153,24 @@ def write_outadapter(adapter_list,runmode):
             outfile.write(outline)
 
 #actual command statements
-if paired_arg=='-u':
-    adapter_list = scrape_file(infile_o)
+if not polyG:
+    if paired_arg=='-u':
+        adapter_list = scrape_file(infile_o)
+        
+        if adapter_list:
+            outfile = open(outfile_name,'w')
+            write_outadapter(adapter_list,'fwd')
     
-    if adapter_list:
-        outfile = open(outfile_name,'w')
-        write_outadapter(adapter_list,'fwd')
+    if paired_arg=='-p':
+        adapter_list = scrape_file(infile_o)
+            
+        if adapter_list:
+            outfile = open(outfile_name,'w')
+            write_outadapter(adapter_list,'rev')
 
-if paired_arg=='-p':
-    adapter_list = scrape_file(infile_o)
-        
-    if adapter_list:
-        outfile = open(outfile_name,'w')
-        write_outadapter(adapter_list,'rev')
+if polyG:
+   outfile = open(outfile_name,'w')
+   remove_polyG(infile_o)
 
 #clean up
 outfile.close()